Using eBPF to measure CPU mode switch overhead incured by making system call - linux

As title, but the measurement result is unreasonable. Let me describe the current status.
I'm using syscall getuid as measurement target, I started by measureing the complete overhead with two clock_gettime bounded around, then measure the entry (what SYSCALL instruction does before executing the actual getuid code) and leaving overhead saparately (with eBPF program hook onto the entry and leaving point).
The result for the complete overhead is ~65ns, and regarding to the entry and leaving overhead, it's ~77ns and ~70ns respectively.
It's obvious that my measurement has some additional overhead except the typical overhead. However, it's weird that since clock_gettime is a vDSO syscall, it should barely have noticeable overhead. And BPF, which is a lightweight instrumental tool (JIT-ed and etc.) these day in Linux, shouldn't have noticeable overhead too.
Is there anyone have idea what additional overhead my measurement incurs?
Following is my measurement code:
userland (measuring the return-from-kernel overhead):
#define _GNU_SOURCE
#include <bpf.h>
#include <libbpf.h>
#include <stdlib.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <string.h>
#include <asm/errno.h>
#include <linux/if_link.h>
#include <errno.h>
#include <sys/resource.h>
#include <unistd.h>
#include <asm/unistd.h>
#include <time.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sched.h>
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
#define TEST_CNT 1000000
#define BPF_FILE_NAME "mkern.o"
#define BPF_MAP_NAME "msys"
static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
int cpu, int group_fd,
unsigned long flags)
{
attr->size = sizeof(*attr);
return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}
static int attach_kprobe(int prog_fd)
{
int err, fd, id;
char buf[32];
struct perf_event_attr attr = {};
err = system("echo 'r:kp_sys_batch __x64_sys_getuid' > /sys/kernel/debug/tracing/kprobe_events");
if (err < 0) {
fprintf(stderr, "Failed to create kprobe, error '%s'\n", strerror(errno));
return -1;
}
fd = open("/sys/kernel/debug/tracing/events/kprobes/kp_sys_batch/id", O_RDONLY, 0);
if (fd < 0) {
fprintf(stderr, "Failed to open event %s\n", "sys_batch");
return -1;
}
err = read(fd, buf, sizeof(buf));
if (err < 0 || err >= sizeof(buf)) {
fprintf(stderr, "read from '%s' failed '%s'\n", "sys_batch", strerror(errno));
return -1;
}
close(fd);
buf[err] = 0;
id = atoi(buf);
attr.config = id;
attr.type = PERF_TYPE_TRACEPOINT;
attr.sample_type = PERF_SAMPLE_RAW;
attr.sample_period = 1;
attr.wakeup_events = 1;
fd = sys_perf_event_open(&attr, 0/*this process*/, -1/*any cpu*/, -1/*group leader*/, 0);
if (fd < 0) {
perror("sys_perf_event_open");
fprintf(stderr, "Failed to open perf_event (id: %llu)\n", attr.config);
return -1;
}
err = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
if (err < 0) {
fprintf(stderr, "ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
strerror(errno));
return -1;
}
err = ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
if (err < 0) {
fprintf(stderr, "ioctl PERF_EVENT_IOC_SET_BPF failed: %s\n",
strerror(errno));
return -1;
}
return 0;
}
static void maxi_memlock_rlimit(void)
{
struct rlimit rlim_new = {
.rlim_cur = RLIM_INFINITY,
.rlim_max = RLIM_INFINITY,
};
if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) {
fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n");
exit(-1);
}
}
static int find_map_fd(struct bpf_object *bpf_obj, const char *mapname)
{
struct bpf_map *map;
int map_fd = -1;
map = bpf_object__find_map_by_name(bpf_obj, mapname);
if (!map) {
fprintf(stderr, "Failed finding map by name: %s\n", mapname);
exit(-1);
}
map_fd = bpf_map__fd(map);
return map_fd;
}
int main(int argc, char **argv)
{
int bpf_map_fd;
int bpf_prog_fd = -1;
int err;
int key = 0;
struct timespec tp;
struct bpf_object *bpf_obj;
struct reals map;
struct bpf_prog_load_attr xattr = {
.prog_type = BPF_PROG_TYPE_KPROBE,
.file = BPF_FILE_NAME,
};
maxi_memlock_rlimit();
err = bpf_prog_load_xattr(&xattr, &bpf_obj, &bpf_prog_fd);
if (err) {
fprintf(stderr, "Failed loading bpf object file\n");
exit(-1);
}
if (attach_kprobe(bpf_prog_fd)) {
fprintf(stderr, "Failed attaching kprobe\n");
exit(-1);
}
bpf_map_fd = find_map_fd(bpf_obj, BPF_MAP_NAME);
if (find_map_fd < 0) {
fprintf(stderr, "Failed finding map fd\n");
exit(-1);
}
/* warm up */
for (int i = 0; i < TEST_CNT; i++) {
syscall(__NR_getuid); /* dummy call */
clock_gettime(CLOCK_MONOTONIC, &tp);
if (unlikely(bpf_map_lookup_elem(bpf_map_fd, &key, &map))) {
fprintf(stderr, "Failed to lookup map element\n");
perror("lookup");
exit(-1);
}
}
uint64_t delta = 0;
for (int i = 0; i < TEST_CNT; i++) {
syscall(__NR_getuid); /* dummy call */
clock_gettime(CLOCK_MONOTONIC, &tp);
if (unlikely(bpf_map_lookup_elem(bpf_map_fd, &key, &map))) {
fprintf(stderr, "Failed to lookup map element\n");
perror("lookup");
exit(-1);
}
delta += (1000000000 * tp.tv_sec + tp.tv_nsec) - map.ts;
}
printf("avg: %fns\n", (double) delta / TEST_CNT);
return 0;
}
user land (measuring the enter-kernel overhead, almost same as the above, except what I pointed out):
err = system("echo 'p:kp_sys_batch sys_batch' > /sys/kernel/debug/tracing/kprobe_events");
...
clock_gettime(CLOCK_MONOTONIC, &tp);
syscall(__NR_getuid); /* dummy call */
...
delta += map.ts - (1000000000 * tp.tv_sec + tp.tv_nsec);
kernel land:
SEC("getuid")
int kp_sys_batch(struct pt_regs *ctx)
{
__u32 i = 0;
struct reals *r;
r = bpf_map_lookup_elem(&reals, &i);
if (!r)
return 1;
r->ts = bpf_ktime_get_ns();
return 0;
}
Except the additional overhead I mentioned above, inside the return-from-kernel measurement code, if the echo 'r:kp_sys_batch sys_batch' is changed to echo 'p:kp_sys_batch sys_batch' (which means that the measurement would take the syscall execution overhead into account), the result would be ~48ns, this means that the result includes overhead of syscall execution and return-from-kernel. Any idea why this could be only ~48ns?
Thanks!

Related

QProcess outputs with delay threads' stdout/stderr of multithreaded application

My platform:
Ubuntu 17.10 32-bit (Vbox VM)
Qt Creator 3.5.1 (opensource)
Qt 5.5.1 (GCC 4.9.1 20140922 (Red Hat 4.9.1-10), 32 bit
I am trying to invoke a multithreaded program (with arguments) using QProcess.start().
My program runs fine on terminal, i.e. periodically prints on stdout.
Using a TextEdit to log the stdout/stderr of the program I have connected QProcess readyReadStandardOutput/Error signals.
The stdout/stderr that comes from the main thread of the program is correctly shown on the TextEdit, the rest of the output (the one from all the other threads) is not shown.
EDIT
On the main thread an HTTP server is listening.
If a HTTP request is performed by browser at the url "127.0.0.1:32001" (port 32001 is hard coded in the QT code), when the HTTP request is received the program appends the HTTP packet and all the pending output from the other thread (moduleThread) to the TextEdit, so it could be a problem of flushing.
main.c
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <pthread.h>
#include <semaphore.h>
#include <sys/time.h>
#include <sys/signal.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "http_server.h"
static pthread_t moduleThr;
static pthread_attr_t moduleThread_attr;
static bool one_second_elapsed;
static sem_t oneSecondSem;
void *moduleThread(void *arg0)
{
bool one_second_elapsed_local;
while (1)
{
sem_wait(&oneSecondSem);
one_second_elapsed_local = one_second_elapsed;
one_second_elapsed = false;
sem_post(&oneSecondSem);
if (one_second_elapsed_local)
fprintf(stdout, "Hello world each second!\r\n");
usleep(50000);
}
}
static void oneSecElapsed(int signum)
{
sem_wait(&oneSecondSem);
one_second_elapsed = true;
sem_post(&oneSecondSem);
}
static void TIMER_1sec_init(void)
{
struct sigaction sa;
struct itimerval timer;
/* Install timer_handler as the signal handler for SIGALRM. */
memset (&sa, 0, sizeof (sa));
sa.sa_handler = &oneSecElapsed;
sigaction (SIGALRM, &sa, NULL);
/* Configure the timer to expire after 1 sec... */
timer.it_value.tv_sec = 1;
timer.it_value.tv_usec = 0;
/* ... and every 1 sec after that... */
timer.it_interval.tv_sec = 1;
timer.it_interval.tv_usec = 0;
/* Start a virtual timer. It counts down whenever this process is
executing. */
setitimer (ITIMER_REAL, &timer, NULL);
}
/*
* ======== main ========
*/
int main(int argc, char *argv[])
{
const char *tcpport;
if (argc > 2)
{
fprintf(stdout, "id = %s\r\n", argv[1]);
fprintf(stdout, "tcpport = %s\r\n", argv[2]);
tcpport = argv[2];
}
else
exit(-1);
sem_init(&oneSecondSem, 0, 1);
one_second_elapsed = false;
/* Create thread quadro manager */
pthread_attr_init(&moduleThread_attr);
pthread_attr_setstacksize(&moduleThread_attr, 2048);
pthread_create(&moduleThr, &moduleThread_attr, moduleThread, NULL);
TIMER_1sec_init();
HTTPSERVER_init(tcpport);
/* should never return */
return (0);
}
http_server.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h> /* memset() */
#include <stdbool.h>
#include <signal.h>
#include <sys/socket.h>
#include <sys/signal.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <netdb.h>
#include "module.h"
typedef struct { char *name, *value; } header_t;
static header_t reqhdr[17] = { {"\0", "\0"} };
//#define PORT "32001" /* Port to listen on */
#define BACKLOG 10 /* Passed to listen() */
static char *buf;
static char *method; // "GET" or "POST"
static char *uri; // "/index.html" things before '?'
static char *qs; // "a=1&b=2" things after '?'
static char *prot; // "HTTP/1.1"
static char *payload; // for POST
static int payload_size;
// get request header
char *request_header(const char* name)
{
header_t *h = reqhdr;
while(h->name) {
if (strcmp(h->name, name) == 0) return h->value;
h++;
}
return NULL;
}
void handle(int newsock)
{
//static int count = 0;
/* recv(), send(), close() */
int rcvd;
buf = malloc(65535);
rcvd=recv(newsock, buf, 65535, 0);
if (rcvd<0) // receive error
fprintf(stderr,("recv() error\n"));
else if (rcvd==0) // receive socket closed
fprintf(stderr,"Client disconnected unexpectedly.\n");
else // message received
{
buf[rcvd] = '\0';
//fputs(buf, stdout);
//fprintf(stdout, "count = %d\n", count);
method = strtok(buf, " \t\r\n");
uri = strtok(NULL, " \t");
prot = strtok(NULL, " \t\r\n");
fprintf(stderr, "\x1b[32m + [%s] %s\x1b[0m\n", method, uri);
if ((qs = strchr(uri, '?')))
{
*qs++ = '\0'; //split URI
} else {
qs = uri - 1; //use an empty string
}
header_t *h = reqhdr;
char *t = (char *)reqhdr;
char *t2;
while(h < reqhdr+16) {
char *k,*v;
k = strtok(NULL, "\r\n: \t");
if (!k)
break;
v = strtok(NULL, "\r\n");
while(*v && *v==' ')
v++;
h->name = k;
h->value = v;
h++;
fprintf(stderr, "[H] %s: %s\n", k, v);
t = v + 1 + strlen(v);
if (t[1] == '\r' && t[2] == '\n')
break;
}
t+=2;
t++; // now the *t shall be the beginning of user payload
t2 = request_header("content-length"); // and the related header if there is
payload = t;
payload_size = t2 ? atol(t2) : (rcvd-(t-buf));
fprintf(stdout, "-- payload len = %d >", payload_size);
fputs(payload, stdout);
fprintf(stdout, "<\r\n");
if (strcmp(method, "GET") == 0)
{
fprintf(stdout, "\nit's a GET!\r\n");
}
else if (strcmp(method, "POST") == 0)
{
fprintf(stdout, "\nit's a POST!\r\n");
}
sprintf(buf, "HTTP/1.1 200 OK\r\n\r\n");
send(newsock, buf, strlen(buf), 0);
close(newsock);
free(buf);
}
}
int HTTPSERVER_init(const char *tcpport)
{
int sock;
struct addrinfo hints, *res;
int reuseaddr = 1; /* True */
sigset_t sigset, oldset;
sigfillset(&sigset);
pthread_sigmask(SIG_BLOCK, &sigset, &oldset);
/* Get the address info */
memset(&hints, 0, sizeof hints);
hints.ai_family = AF_INET;
hints.ai_socktype = SOCK_STREAM;
if (getaddrinfo(NULL, tcpport, &hints, &res) != 0) {
perror("getaddrinfo");
return 1;
}
/* Create the socket */
sock = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
if (sock == -1) {
perror("socket");
return 1;
}
/* Enable the socket to reuse the address */
if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(int)) == -1) {
perror("setsockopt");
return 1;
}
/* Bind to the address */
if (bind(sock, res->ai_addr, res->ai_addrlen) == -1) {
perror("bind");
return 1;
}
/* Listen */
if (listen(sock, BACKLOG) == -1) {
perror("listen");
return 1;
}
freeaddrinfo(res);
/* Main loop */
while (1) {
printf("waiting on accept\n");
fflush(stdout);
socklen_t size = sizeof(struct sockaddr_in);
struct sockaddr_in their_addr;
int newsock = accept(sock, (struct sockaddr*)&their_addr, &size);
if (newsock == -1) {
perror("accept");
}
else {
printf("Got a connection from %s on port %d\n",
inet_ntoa(their_addr.sin_addr), htons(their_addr.sin_port));
handle(newsock);
}
}
close(sock);
return 0;
}
As pointed out by #eyllanesc, the pending strings "Hello world each second!\r\n" come out eventually after several seconds.
The moduleThread shoud be a thread and not a different process: this is verified by commadn "ps aux" that shows only "../program/testApp test 32001".
Could any of you kindly give me a hint on what is going on here please?
Thanks for your time,
Francesco

Time the connect syscall

I would like to see how much time it takes for connect syscall. I get the code for a simple TCP client. However, the program will wait for the server to respond after connect. How can I make it return right after syscall or using some other ways to time the syscall time?
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <arpa/inet.h>
int main(int argc, char *argv[]) {
int sockfd = 0, n = 0;
char recvBuff[1024];
struct sockaddr_in serv_addr;
if(argc != 2) {
printf("\n Usage: %s <ip of server> \n",argv[0]);
return 1;
}
memset(recvBuff, '0',sizeof(recvBuff));
if((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
printf("\n Error : Could not create socket \n");
return 1;
}
memset(&serv_addr, '0', sizeof(serv_addr));
serv_addr.sin_family = AF_INET;
serv_addr.sin_port = htons(5000);
if(inet_pton(AF_INET, argv[1], &serv_addr.sin_addr)<=0) {
printf("\n inet_pton error occured\n");
return 1;
}
if( connect(sockfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
printf("\n Error : Connect Failed \n");
return 1;
}
printf("\nhello\n");
while ( (n = read(sockfd, recvBuff, sizeof(recvBuff)-1)) > 0) {
recvBuff[n] = 0;
if(fputs(recvBuff, stdout) == EOF) {
printf("\n Error : Fputs error\n");
}
}
if(n < 0)
{
printf("\n Read error \n");
}
return 0;
}
However, the program will wait for the server to respond after connect.
Yes, and it does so here:
while ( (n = read(sockfd, recvBuff, sizeof(recvBuff)-1)) > 0) {
recvBuff[n] = 0;
if(fputs(recvBuff, stdout) == EOF) {
printf("\n Error : Fputs error\n");
}
}
How can I make it return right after syscall
Err, remove the receive loop above?

write_proc is not invoked when written from userspace

I am trying to understand procfs for communication between userspace and kernel module. My module has basic two functions for procfs write_proc, driver_mmap.
I call multiple times write_proc by calling fputs("123456789",fd). where fd is file descriptor to procfs entry in /proc directory. But I don't see write_proc called multiple time. Code is attached by here.
<code>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <sound/core.h>
#include <sound/initval.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/proc_fs.h>
#include <asm/uaccess.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/mm.h> /* mmap related stuff */
#define BUF_SIZE 64 * 1024
int *MmapBuffer;
int Factor = 1;
static int write_proc(struct file *filp, int *buf, size_t count, loff_t *offp)
{
int rc,i;
printk("in Write \n");
for (i = 1; i <= 16*1024 ; i++)
MmapBuffer[i-1] = (i+1)*Factor;
Factor++;
return count;
}
static int driver_mmap(struct file *file, struct vm_area_struct *vma)
{
int ret;
vma->vm_flags |= VM_LOCKED|VM_SHARED;
ret = remap_pfn_range(vma, vma->vm_start,
virt_to_phys(MmapBuffer) >> PAGE_SHIFT,
vma->vm_end-vma->vm_start, vma->vm_page_prot);
if(ret != 0)
printk("MMAP Failed \n");
SetPageReserved(virt_to_page(MmapBuffer));
printk("MMAP Succeeded \n");
return 0;
}
// file operations
struct file_operations proc_fops =
{
.write = write_proc,
.mmap = driver_mmap,
};
// init module
int init_module_test(void)
{
printk("<1>Hello world\n");
MmapBuffer = kzalloc(BUF_SIZE,__GFP_COLD|GFP_DMA);
if(MmapBuffer == NULL)
printk("Kzalloc failed. reduce buffer size \n");
proc_create ("Test_fs",0,NULL, &proc_fops);
return 0;
}
// exit module
void cleanup_module_test(void)
{
kfree(MmapBuffer);
remove_proc_entry ("Test_fs", NULL);
printk("Goodbye world\n");
}
module_init(init_module_test);
module_exit(cleanup_module_test);
MODULE_LICENSE("GPL");
</code>
Application code
<code>
#include<stdio.h>
#include<stdlib.h>
#include<sys/mman.h>
#include<errno.h>
#include <fcntl.h>
int main(void)
{
int fd;
int i,j;
int *msg ;
printf("Allocation started \n ");
msg=(int*)malloc(64*1024);
if(msg == NULL)
printf("Allocation failed \n");
//unsigned int *addr;
printf("Starting opening \n ");
if((fd=open("/proc/Test_fs", O_RDONLY ))<0)
{
printf("File not opened ");
}
printf("Starting mapping \n ");
msg = mmap(NULL, 64*1024, PROT_READ, MAP_SHARED , fd, 0);
printf("done from module \n ");
if(msg == MAP_FAILED)
{
printf("MAP failed and error is %s", strerror(errno));
return 0;
}
close(fd);
printf("Successful mapping");
FILE *f;
f=fopen("/proc/Test_fs", "wr");
if(!f)
{
printf("File not opened ");
}
for (j = 0; j < 10 ; j++)
{
if(fputs("1234567890,",f) <= 0)
printf("write failed, ");
for (i = 0; i < 16*1024 ; i++)
printf("%d, ", msg[i]);
printf("\n \n done \n \n ");
}
fclose(f);
return 0;
}
</code>

Proper implementation of an inter process communication (IPC)

Is the following a proper implementation of an inter-process communication?
#include <stdio.h>
#include <fcntl.h>
#include <sys/poll.h>
int main(int argc, char** argv) {
if (argc > 1) {
//Sending side
struct stat buffer;
if (stat("/tmp/PROCAtoPROCB", &buffer) != 0)
mkfifo("/tmp/PROCAtoPROCB", (mode_t)0600);
int fdFIFO = open("/tmp/PROCAtoPROCB", O_WRONLY | O_NONBLOCK);
if (fdFIFO > 0) {
write(fdFIFO, (void *)argv[1], sizeof(argv[1]));
close(fdFIFO);
}
} else {
//Receiving side
int fdFIFO = -1;
struct stat buffer;
if (stat("/tmp/PROCAtoPROCB", &buffer) != 0)
mkfifo("/tmp/PROCAtoPROCB", (mode_t)0600);
while (1) {
struct pollfd pollfds[1];
if (fdFIFO == -1)
fdFIFO = open("/tmp/PROCAtoPROCB", O_RDONLY | O_NONBLOCK);
pollfds[0].fd = fdFIFO;
pollfds[0].events = POLLIN;
poll(pollfds, 1, -1);
if (pollfds[0].revents & POLLIN) {
char buf[1024];
read(fdFIFO, &buf, 1024);
close(fdFIFO);
fdFIFO = -1;
printf("Other process says %s\n", buf);
}
printf("End of loop\n");
}
}
return 0;
}
It seems to be working but I'm wondering if there could be a race condition leading to hanging. One constraint is that both processes need to be started independently and in any order.
Some stress tests showed no problem so the implementation seems OK if somebody wants to reuse the code.

Net Link Linux User code bind socket call always fail for multicast group Id (non zero value)

Hi am trying to implement net link user code and kernel code every thing works fine for unicast (src_addr.nl_groups = 0;). For mulicast, user code bind call always fails for non zero src_addr.nl_groups value. Really am not sure what value to put for multicast and how to proceed further. I checked the usage of netlink_broadcast in kernel source tree, so I put the same group Id value (RTMGRP_LINK) here. For unicast I found good number of help in internet but for multicast I don't think so . So Please help me to proceed further.
Error am getting is:
bind: No such file or directory
./a.out: can't bind socket (3)and err : -1: No such file or directory
#include <sys/socket.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <stdio.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#define NETLINK_TEST 28
#define GROUP_IB 1
#define MAX_PAYLOAD 1024
struct sockaddr_nl src_addr, dst_addr;
struct nlmsghdr *nlh = NULL;
struct msghdr msg;
struct iovec iov;
int sock_fd;
int main(int argc, char ** argv)
{
int err;
sock_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_TEST);
if (sock_fd<0) {
char s[BUFSIZ];
sprintf( s, "%s: can't assign fd for socket", argv[0] );
perror(s);
return -1;
}
memset(&src_addr, 0, sizeof(src_addr));
src_addr.nl_family = AF_NETLINK;
src_addr.nl_pid = getpid();
src_addr.nl_groups = 0; // Unicast
//src_addr.nl_groups = RTMGRP_LINK; /* Multicast, bind call always fails for non zero values */
err = bind(sock_fd, (struct sockaddr*)&src_addr, sizeof(src_addr));
perror("bind");
if (err<0) {
char s[BUFSIZ];
sprintf( s, "%s: can't bind socket (%d)and err : %d", argv[0], sock_fd,err );
perror(s);
return -1;
}
memset(&dst_addr, 0, sizeof(dst_addr));
nlh = (struct nlhmsghdr *) malloc(NLMSG_SPACE(MAX_PAYLOAD));
memset(nlh, 0, NLMSG_SPACE(MAX_PAYLOAD));
iov.iov_base = (void *)nlh;
iov.iov_len = NLMSG_SPACE(MAX_PAYLOAD);
msg.msg_name = (void *)&dst_addr;
msg.msg_namelen = sizeof(dst_addr);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
printf("pid : %d\n Waiting for messages from kernel...\n",getpid());
recvmsg(sock_fd, &msg, 0);
printf("Message : %s\n", NLMSG_DATA(nlh));
close(sock_fd);
return 0;
}
Netlink socket binds are sensitive to what USER you are- I've seen them reliably fail if you are not running the program in question as 'root', at least on RedHat 6.
Try running as root 1st, before changing your logic. If you get the same failure as you do in normal operation, then you know it isn't (necessarily) a permissions issue.
The issue is
sock_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_TEST);
Does you kernel module define the NETLINK_TEST family? your own family might must be supported at kernel module and it should post the message in the proper group using nlmsg_multicast()
RTMGRP_LINK is group defined in NETLINK_ROUTE.
This sample code is example for multicast
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <unistd.h>
#define MYPROTO NETLINK_USERSOCK
#define MYMGRP 21
int open_netlink(void)
{
int sock;
struct sockaddr_nl addr;
int group = MYMGRP;
sock = socket(AF_NETLINK, SOCK_RAW, MYPROTO);
if (sock < 0) {
printf("sock < 0.\n");
return sock;
}
memset((void *) &addr, 0, sizeof(addr));
addr.nl_family = AF_NETLINK;
addr.nl_pid = getpid();
/* addr.nl_groups = MYMGRP; */
if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
printf("bind < 0.\n");
return -1;
}
if (setsockopt(sock, 270, NETLINK_ADD_MEMBERSHIP, &group, sizeof(group)) < 0) {
printf("setsockopt < 0\n");
return -1;
}
return sock;
}
void read_event(int sock)
{
struct sockaddr_nl nladdr;
struct msghdr msg;
struct iovec iov;
char buffer[65536];
int ret;
iov.iov_base = (void *) buffer;
iov.iov_len = sizeof(buffer);
msg.msg_name = (void *) &(nladdr);
msg.msg_namelen = sizeof(nladdr);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
printf("Ok, listening.\n");
ret = recvmsg(sock, &msg, 0);
if (ret < 0)
printf("ret < 0.\n");
else
printf("Received message payload: %s\n", NLMSG_DATA((struct nlmsghdr *) &buffer));
}
int main(int argc, char *argv[])
{
int nls;
nls = open_netlink();
if (nls < 0)
return nls;
while (1)
read_event(nls);
return 0;
}
kernel module:
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netlink.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
#define MYPROTO NETLINK_USERSOCK
#define MYGRP 21
static struct sock *nl_sk = NULL;
static void send_to_user(void)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
char *msg = "Hello from kernel";
int msg_size = strlen(msg) + 1;
int res;
pr_info("Creating skb.\n");
skb = nlmsg_new(NLMSG_ALIGN(msg_size + 1), GFP_KERNEL);
if (!skb) {
pr_err("Allocation failure.\n");
return;
}
nlh = nlmsg_put(skb, 0, 1, NLMSG_DONE, msg_size + 1, 0);
strcpy(nlmsg_data(nlh), msg);
pr_info("Sending skb.\n");
res = nlmsg_multicast(nl_sk, skb, 0, MYGRP, GFP_KERNEL);
if (res < 0)
pr_info("nlmsg_multicast() error: %d\n", res);
else
pr_info("Success.\n");
}
static int __init hello_init(void)
{
pr_info("Inserting hello module.\n");
nl_sk = netlink_kernel_create(&init_net, MYPROTO, NULL);
if (!nl_sk) {
pr_err("Error creating socket.\n");
return -10;
}
send_to_user();
netlink_kernel_release(nl_sk);
return 0;
}
static void __exit hello_exit(void)
{
pr_info("Exiting hello module.\n");
}
module_init(hello_init);
module_exit(hello_exit);
MODULE_LICENSE("GPL");

Resources