Linux Netlink Socket Communication Crashes VM - linux

I have written a kernel module and userspace program such that the kernel module sends netlink multicast messages, and the userspace program reads these messages and prints them out. The kernel module and userspace program are available here (https://github.com/akshayknarayan/netlink-test) and replicated below. The code was adapted from this post: Multicast from kernel to user space via Netlink in C
If line 69 of the userspace program (the call to usleep) is commented out, then everything works; once the kernel module is loaded, it repeatedly multicasts messages and the userspace program prints them out.
However, if line 69 of the userspace program is uncommented, within a second of loading the kernel module, my VM hangs and becomes unresponsive.
Why is this the case? How can I prevent the kernel from hanging?
Linux ubuntu-xenial 4.4.0-75-generic #96-Ubuntu SMP Thu Apr 20 09:56:33 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux
Userspace program:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <unistd.h>
/* Multicast group, consistent in both kernel prog and user prog. */
#define MYMGRP 22
int nl_open(void) {
int sock;
struct sockaddr_nl addr;
int group = MYMGRP;
sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_USERSOCK);
if (sock < 0) {
printf("sock < 0.\n");
return sock;
}
memset((void *) &addr, 0, sizeof(addr));
addr.nl_family = AF_NETLINK;
addr.nl_pid = getpid();
if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
printf("bind < 0.\n");
return -1;
}
if (setsockopt(sock, 270, NETLINK_ADD_MEMBERSHIP, &group, sizeof(group)) < 0) {
printf("setsockopt < 0\n");
return -1;
}
return sock;
}
void nl_recv(int sock) {
struct sockaddr_nl nladdr;
struct msghdr msg;
struct iovec iov;
char buffer[65536];
int ret;
iov.iov_base = (void *) buffer;
iov.iov_len = sizeof(buffer);
msg.msg_name = (void *) &(nladdr);
msg.msg_namelen = sizeof(nladdr);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
ret = recvmsg(sock, &msg, 0);
if (ret < 0)
printf("ret < 0.\n");
else
printf("Received message payload: %s\n", (char*) NLMSG_DATA((struct nlmsghdr *) &buffer));
}
int main(int argc, char *argv[]) {
int nls;
nls = nl_open();
if (nls < 0)
return nls;
while (1) {
nl_recv(nls);
usleep(5000);
}
return 0;
}
Kernel module:
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/sock.h>
#define MYMGRP 22
struct sock *nl_sk = NULL;
static struct timer_list timer;
void nl_send_msg(unsigned long data) {
struct sk_buff *skb_out;
struct nlmsghdr *nlh;
int res;
char *msg = "hello from kernel!\n";
int msg_size = strlen(msg);
skb_out = nlmsg_new(
NLMSG_ALIGN(msg_size), // #payload: size of the message payload
GFP_KERNEL // #flags: the type of memory to allocate.
);
if (!skb_out) {
printk(KERN_ERR "Failed to allocate new skb\n");
return;
}
nlh = nlmsg_put(
skb_out, // #skb: socket buffer to store message in
0, // #portid: netlink PORTID of requesting application
0, // #seq: sequence number of message
NLMSG_DONE, // #type: message type
msg_size, // #payload: length of message payload
0 // #flags: message flags
);
memcpy(nlmsg_data(nlh), msg, msg_size+1);
res = nlmsg_multicast(
nl_sk, // #sk: netlink socket to spread messages to
skb_out, // #skb: netlink message as socket buffer
0, // #portid: own netlink portid to avoid sending to yourself
MYMGRP, // #group: multicast group id
GFP_KERNEL // #flags: allocation flags
);
if (res < 0) {
printk(KERN_INFO "Error while sending to user: %d\n", res);
} else {
mod_timer(&timer, jiffies + msecs_to_jiffies(1));
}
}
static int __init nl_init(void) {
struct netlink_kernel_cfg cfg = {};
printk(KERN_INFO "init NL\n");
nl_sk = netlink_kernel_create(&init_net, NETLINK_USERSOCK, &cfg);
if (!nl_sk) {
printk(KERN_ALERT "Error creating socket.\n");
return -10;
}
init_timer(&timer);
timer.function = nl_send_msg;
timer.expires = jiffies + 1000;
timer.data = 0;
add_timer(&timer);
nl_send_msg(0);
return 0;
}
static void __exit nl_exit(void) {
printk(KERN_INFO "exit NL\n");
del_timer_sync(&timer);
netlink_kernel_release(nl_sk);
}
module_init(nl_init);
module_exit(nl_exit);
MODULE_LICENSE("GPL");

For posterity: I believe the problem was the allocation in nlmsg_new, which should not occur inside an interrupt handler (the timer handler, nl_send_msg), as explained here.
Without the sleep, I believe nlmsg_new does not need to sleep when allocating, so the requirement that an interrupt handler not sleep is not violated. However, if the userspace process lags behind the kernel, it is possible for the kernel to sleep during the allocation, causing a hang.

Related

Using eBPF to measure CPU mode switch overhead incured by making system call

As title, but the measurement result is unreasonable. Let me describe the current status.
I'm using syscall getuid as measurement target, I started by measureing the complete overhead with two clock_gettime bounded around, then measure the entry (what SYSCALL instruction does before executing the actual getuid code) and leaving overhead saparately (with eBPF program hook onto the entry and leaving point).
The result for the complete overhead is ~65ns, and regarding to the entry and leaving overhead, it's ~77ns and ~70ns respectively.
It's obvious that my measurement has some additional overhead except the typical overhead. However, it's weird that since clock_gettime is a vDSO syscall, it should barely have noticeable overhead. And BPF, which is a lightweight instrumental tool (JIT-ed and etc.) these day in Linux, shouldn't have noticeable overhead too.
Is there anyone have idea what additional overhead my measurement incurs?
Following is my measurement code:
userland (measuring the return-from-kernel overhead):
#define _GNU_SOURCE
#include <bpf.h>
#include <libbpf.h>
#include <stdlib.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <string.h>
#include <asm/errno.h>
#include <linux/if_link.h>
#include <errno.h>
#include <sys/resource.h>
#include <unistd.h>
#include <asm/unistd.h>
#include <time.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sched.h>
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
#define TEST_CNT 1000000
#define BPF_FILE_NAME "mkern.o"
#define BPF_MAP_NAME "msys"
static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
int cpu, int group_fd,
unsigned long flags)
{
attr->size = sizeof(*attr);
return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}
static int attach_kprobe(int prog_fd)
{
int err, fd, id;
char buf[32];
struct perf_event_attr attr = {};
err = system("echo 'r:kp_sys_batch __x64_sys_getuid' > /sys/kernel/debug/tracing/kprobe_events");
if (err < 0) {
fprintf(stderr, "Failed to create kprobe, error '%s'\n", strerror(errno));
return -1;
}
fd = open("/sys/kernel/debug/tracing/events/kprobes/kp_sys_batch/id", O_RDONLY, 0);
if (fd < 0) {
fprintf(stderr, "Failed to open event %s\n", "sys_batch");
return -1;
}
err = read(fd, buf, sizeof(buf));
if (err < 0 || err >= sizeof(buf)) {
fprintf(stderr, "read from '%s' failed '%s'\n", "sys_batch", strerror(errno));
return -1;
}
close(fd);
buf[err] = 0;
id = atoi(buf);
attr.config = id;
attr.type = PERF_TYPE_TRACEPOINT;
attr.sample_type = PERF_SAMPLE_RAW;
attr.sample_period = 1;
attr.wakeup_events = 1;
fd = sys_perf_event_open(&attr, 0/*this process*/, -1/*any cpu*/, -1/*group leader*/, 0);
if (fd < 0) {
perror("sys_perf_event_open");
fprintf(stderr, "Failed to open perf_event (id: %llu)\n", attr.config);
return -1;
}
err = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
if (err < 0) {
fprintf(stderr, "ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
strerror(errno));
return -1;
}
err = ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
if (err < 0) {
fprintf(stderr, "ioctl PERF_EVENT_IOC_SET_BPF failed: %s\n",
strerror(errno));
return -1;
}
return 0;
}
static void maxi_memlock_rlimit(void)
{
struct rlimit rlim_new = {
.rlim_cur = RLIM_INFINITY,
.rlim_max = RLIM_INFINITY,
};
if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) {
fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n");
exit(-1);
}
}
static int find_map_fd(struct bpf_object *bpf_obj, const char *mapname)
{
struct bpf_map *map;
int map_fd = -1;
map = bpf_object__find_map_by_name(bpf_obj, mapname);
if (!map) {
fprintf(stderr, "Failed finding map by name: %s\n", mapname);
exit(-1);
}
map_fd = bpf_map__fd(map);
return map_fd;
}
int main(int argc, char **argv)
{
int bpf_map_fd;
int bpf_prog_fd = -1;
int err;
int key = 0;
struct timespec tp;
struct bpf_object *bpf_obj;
struct reals map;
struct bpf_prog_load_attr xattr = {
.prog_type = BPF_PROG_TYPE_KPROBE,
.file = BPF_FILE_NAME,
};
maxi_memlock_rlimit();
err = bpf_prog_load_xattr(&xattr, &bpf_obj, &bpf_prog_fd);
if (err) {
fprintf(stderr, "Failed loading bpf object file\n");
exit(-1);
}
if (attach_kprobe(bpf_prog_fd)) {
fprintf(stderr, "Failed attaching kprobe\n");
exit(-1);
}
bpf_map_fd = find_map_fd(bpf_obj, BPF_MAP_NAME);
if (find_map_fd < 0) {
fprintf(stderr, "Failed finding map fd\n");
exit(-1);
}
/* warm up */
for (int i = 0; i < TEST_CNT; i++) {
syscall(__NR_getuid); /* dummy call */
clock_gettime(CLOCK_MONOTONIC, &tp);
if (unlikely(bpf_map_lookup_elem(bpf_map_fd, &key, &map))) {
fprintf(stderr, "Failed to lookup map element\n");
perror("lookup");
exit(-1);
}
}
uint64_t delta = 0;
for (int i = 0; i < TEST_CNT; i++) {
syscall(__NR_getuid); /* dummy call */
clock_gettime(CLOCK_MONOTONIC, &tp);
if (unlikely(bpf_map_lookup_elem(bpf_map_fd, &key, &map))) {
fprintf(stderr, "Failed to lookup map element\n");
perror("lookup");
exit(-1);
}
delta += (1000000000 * tp.tv_sec + tp.tv_nsec) - map.ts;
}
printf("avg: %fns\n", (double) delta / TEST_CNT);
return 0;
}
user land (measuring the enter-kernel overhead, almost same as the above, except what I pointed out):
err = system("echo 'p:kp_sys_batch sys_batch' > /sys/kernel/debug/tracing/kprobe_events");
...
clock_gettime(CLOCK_MONOTONIC, &tp);
syscall(__NR_getuid); /* dummy call */
...
delta += map.ts - (1000000000 * tp.tv_sec + tp.tv_nsec);
kernel land:
SEC("getuid")
int kp_sys_batch(struct pt_regs *ctx)
{
__u32 i = 0;
struct reals *r;
r = bpf_map_lookup_elem(&reals, &i);
if (!r)
return 1;
r->ts = bpf_ktime_get_ns();
return 0;
}
Except the additional overhead I mentioned above, inside the return-from-kernel measurement code, if the echo 'r:kp_sys_batch sys_batch' is changed to echo 'p:kp_sys_batch sys_batch' (which means that the measurement would take the syscall execution overhead into account), the result would be ~48ns, this means that the result includes overhead of syscall execution and return-from-kernel. Any idea why this could be only ~48ns?
Thanks!

QProcess outputs with delay threads' stdout/stderr of multithreaded application

My platform:
Ubuntu 17.10 32-bit (Vbox VM)
Qt Creator 3.5.1 (opensource)
Qt 5.5.1 (GCC 4.9.1 20140922 (Red Hat 4.9.1-10), 32 bit
I am trying to invoke a multithreaded program (with arguments) using QProcess.start().
My program runs fine on terminal, i.e. periodically prints on stdout.
Using a TextEdit to log the stdout/stderr of the program I have connected QProcess readyReadStandardOutput/Error signals.
The stdout/stderr that comes from the main thread of the program is correctly shown on the TextEdit, the rest of the output (the one from all the other threads) is not shown.
EDIT
On the main thread an HTTP server is listening.
If a HTTP request is performed by browser at the url "127.0.0.1:32001" (port 32001 is hard coded in the QT code), when the HTTP request is received the program appends the HTTP packet and all the pending output from the other thread (moduleThread) to the TextEdit, so it could be a problem of flushing.
main.c
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <pthread.h>
#include <semaphore.h>
#include <sys/time.h>
#include <sys/signal.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "http_server.h"
static pthread_t moduleThr;
static pthread_attr_t moduleThread_attr;
static bool one_second_elapsed;
static sem_t oneSecondSem;
void *moduleThread(void *arg0)
{
bool one_second_elapsed_local;
while (1)
{
sem_wait(&oneSecondSem);
one_second_elapsed_local = one_second_elapsed;
one_second_elapsed = false;
sem_post(&oneSecondSem);
if (one_second_elapsed_local)
fprintf(stdout, "Hello world each second!\r\n");
usleep(50000);
}
}
static void oneSecElapsed(int signum)
{
sem_wait(&oneSecondSem);
one_second_elapsed = true;
sem_post(&oneSecondSem);
}
static void TIMER_1sec_init(void)
{
struct sigaction sa;
struct itimerval timer;
/* Install timer_handler as the signal handler for SIGALRM. */
memset (&sa, 0, sizeof (sa));
sa.sa_handler = &oneSecElapsed;
sigaction (SIGALRM, &sa, NULL);
/* Configure the timer to expire after 1 sec... */
timer.it_value.tv_sec = 1;
timer.it_value.tv_usec = 0;
/* ... and every 1 sec after that... */
timer.it_interval.tv_sec = 1;
timer.it_interval.tv_usec = 0;
/* Start a virtual timer. It counts down whenever this process is
executing. */
setitimer (ITIMER_REAL, &timer, NULL);
}
/*
* ======== main ========
*/
int main(int argc, char *argv[])
{
const char *tcpport;
if (argc > 2)
{
fprintf(stdout, "id = %s\r\n", argv[1]);
fprintf(stdout, "tcpport = %s\r\n", argv[2]);
tcpport = argv[2];
}
else
exit(-1);
sem_init(&oneSecondSem, 0, 1);
one_second_elapsed = false;
/* Create thread quadro manager */
pthread_attr_init(&moduleThread_attr);
pthread_attr_setstacksize(&moduleThread_attr, 2048);
pthread_create(&moduleThr, &moduleThread_attr, moduleThread, NULL);
TIMER_1sec_init();
HTTPSERVER_init(tcpport);
/* should never return */
return (0);
}
http_server.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h> /* memset() */
#include <stdbool.h>
#include <signal.h>
#include <sys/socket.h>
#include <sys/signal.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <netdb.h>
#include "module.h"
typedef struct { char *name, *value; } header_t;
static header_t reqhdr[17] = { {"\0", "\0"} };
//#define PORT "32001" /* Port to listen on */
#define BACKLOG 10 /* Passed to listen() */
static char *buf;
static char *method; // "GET" or "POST"
static char *uri; // "/index.html" things before '?'
static char *qs; // "a=1&b=2" things after '?'
static char *prot; // "HTTP/1.1"
static char *payload; // for POST
static int payload_size;
// get request header
char *request_header(const char* name)
{
header_t *h = reqhdr;
while(h->name) {
if (strcmp(h->name, name) == 0) return h->value;
h++;
}
return NULL;
}
void handle(int newsock)
{
//static int count = 0;
/* recv(), send(), close() */
int rcvd;
buf = malloc(65535);
rcvd=recv(newsock, buf, 65535, 0);
if (rcvd<0) // receive error
fprintf(stderr,("recv() error\n"));
else if (rcvd==0) // receive socket closed
fprintf(stderr,"Client disconnected unexpectedly.\n");
else // message received
{
buf[rcvd] = '\0';
//fputs(buf, stdout);
//fprintf(stdout, "count = %d\n", count);
method = strtok(buf, " \t\r\n");
uri = strtok(NULL, " \t");
prot = strtok(NULL, " \t\r\n");
fprintf(stderr, "\x1b[32m + [%s] %s\x1b[0m\n", method, uri);
if ((qs = strchr(uri, '?')))
{
*qs++ = '\0'; //split URI
} else {
qs = uri - 1; //use an empty string
}
header_t *h = reqhdr;
char *t = (char *)reqhdr;
char *t2;
while(h < reqhdr+16) {
char *k,*v;
k = strtok(NULL, "\r\n: \t");
if (!k)
break;
v = strtok(NULL, "\r\n");
while(*v && *v==' ')
v++;
h->name = k;
h->value = v;
h++;
fprintf(stderr, "[H] %s: %s\n", k, v);
t = v + 1 + strlen(v);
if (t[1] == '\r' && t[2] == '\n')
break;
}
t+=2;
t++; // now the *t shall be the beginning of user payload
t2 = request_header("content-length"); // and the related header if there is
payload = t;
payload_size = t2 ? atol(t2) : (rcvd-(t-buf));
fprintf(stdout, "-- payload len = %d >", payload_size);
fputs(payload, stdout);
fprintf(stdout, "<\r\n");
if (strcmp(method, "GET") == 0)
{
fprintf(stdout, "\nit's a GET!\r\n");
}
else if (strcmp(method, "POST") == 0)
{
fprintf(stdout, "\nit's a POST!\r\n");
}
sprintf(buf, "HTTP/1.1 200 OK\r\n\r\n");
send(newsock, buf, strlen(buf), 0);
close(newsock);
free(buf);
}
}
int HTTPSERVER_init(const char *tcpport)
{
int sock;
struct addrinfo hints, *res;
int reuseaddr = 1; /* True */
sigset_t sigset, oldset;
sigfillset(&sigset);
pthread_sigmask(SIG_BLOCK, &sigset, &oldset);
/* Get the address info */
memset(&hints, 0, sizeof hints);
hints.ai_family = AF_INET;
hints.ai_socktype = SOCK_STREAM;
if (getaddrinfo(NULL, tcpport, &hints, &res) != 0) {
perror("getaddrinfo");
return 1;
}
/* Create the socket */
sock = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
if (sock == -1) {
perror("socket");
return 1;
}
/* Enable the socket to reuse the address */
if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(int)) == -1) {
perror("setsockopt");
return 1;
}
/* Bind to the address */
if (bind(sock, res->ai_addr, res->ai_addrlen) == -1) {
perror("bind");
return 1;
}
/* Listen */
if (listen(sock, BACKLOG) == -1) {
perror("listen");
return 1;
}
freeaddrinfo(res);
/* Main loop */
while (1) {
printf("waiting on accept\n");
fflush(stdout);
socklen_t size = sizeof(struct sockaddr_in);
struct sockaddr_in their_addr;
int newsock = accept(sock, (struct sockaddr*)&their_addr, &size);
if (newsock == -1) {
perror("accept");
}
else {
printf("Got a connection from %s on port %d\n",
inet_ntoa(their_addr.sin_addr), htons(their_addr.sin_port));
handle(newsock);
}
}
close(sock);
return 0;
}
As pointed out by #eyllanesc, the pending strings "Hello world each second!\r\n" come out eventually after several seconds.
The moduleThread shoud be a thread and not a different process: this is verified by commadn "ps aux" that shows only "../program/testApp test 32001".
Could any of you kindly give me a hint on what is going on here please?
Thanks for your time,
Francesco

How to use socat proxy to intercept Unix domain socket sending ancillary data?

I have two programs, a server and a client. The server opens a file, writes data to it, and then send its file descriptor to the client over a unix domain socket. Everything works fine untill I introduce a socat proxy in between.
socat -x -v UNIX-LISTEN:/tmp/unixSockSendFe,mode=775,reuseaddr,fork UNIX-CONNECT:/tmp/unixSockSendFd
Explanation
The server listens on /tmp/unixSockSendFd, socat connects to it(UNIX-CONNECT:/tmp/unixSockSendFd), and creates another Unix domain socket(UNIX-LISTEN:/tmp/unixSockSendFe,mode=775,reuseaddr,fork), on which the client connects. Any communication between the client and server gets relayed through socat, which prints the bytes sent in their binary (-x option), and ascii (-v option) form.
If I don't use socat, and client directly connects to server(on /tmp/unixSockSendFd socket), everything works fine, but when socat is used as a proxy, the client crashes with a segmentation fault.
Server
/*Server code - sendfd.c*/
#include <sys/socket.h>
#include <sys/un.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <errno.h>
char *socket_path = "/tmp/unixSockSendFd";
char *file="/tmp/abcd.txt" ;/*file whose fd is to be sent*/
int sendfd(int sock, int fd);
int recvfd(int s);
char data[]="sahil\0";
int main(int argc, char *argv[]) {
struct sockaddr_un addr;
char buf[100];
buf[0]='\n';
int fd,rc,confd;
int fd_to_send;
int temp,len;
temp=1;
fd_to_send=open(file,O_TRUNC|O_RDWR|O_CREAT,S_IRWXU|S_IRWXG|S_IRWXO);
if(fd_to_send==-1)
{
perror("file open error");
return -1;
}
if (argc > 1) socket_path=argv[1];
if ( (fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
perror("socket error");
exit(-1);
}
memset(&addr, 0, sizeof(addr));
addr.sun_family = AF_UNIX;
if (*socket_path == '\0') {
*addr.sun_path = '\0';
strncpy(addr.sun_path+1, socket_path+1, sizeof(addr.sun_path)-2);
} else {
strncpy(addr.sun_path, socket_path, sizeof(addr.sun_path)-1);
}
unlink(socket_path);
if(bind(fd,(struct sockaddr*)&addr,sizeof(addr))==-1){
perror("bind error");
return -1;
}
/*Writing data to file before sending fd*/
len=write(fd_to_send,data,(int)strlen(data));
fsync(fd_to_send);
printf("(len=%d)data written in file(content between ## marks) ##%s##\n",len,data);
listen(fd,1);
for(;;){
confd=accept(fd,NULL,NULL);
if(confd==-1)
{
perror("accept error");
continue;
}
else{
printf("new client connected ... sending fd ... \n");
sendfd(confd,fd_to_send);
close(confd);
}
}
return 0;
}
int sendfd(int sock, int fd)
{
struct msghdr hdr;
struct iovec data;
char cmsgbuf[CMSG_SPACE(sizeof(int))];
char dummy = '*';
data.iov_base = &dummy;
data.iov_len = sizeof(dummy);
memset(&hdr, 0, sizeof(hdr));
hdr.msg_name = NULL;
hdr.msg_namelen = 0;
hdr.msg_iov = &data;
hdr.msg_iovlen = 1;
hdr.msg_flags = 0;
hdr.msg_control = cmsgbuf;
hdr.msg_controllen = CMSG_LEN(sizeof(int));
struct cmsghdr* cmsg = CMSG_FIRSTHDR(&hdr);
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
*(int*)CMSG_DATA(cmsg) = fd;
//memcpy((CMSG_DATA(cmsg)), &fd, sizeof(fd)); -- from ivshmem server code - this too works instead of previous line
int n = sendmsg(sock, &hdr, 0);
if(n == -1)
printf("sendmsg() failed: %s (socket fd = %d)\n", strerror(errno), sock);
return n;
}
int recvfd(int s)
{
int n;
int fd;
char buf[1];
struct iovec iov;
struct msghdr msg;
struct cmsghdr *cmsg;
char cms[CMSG_SPACE(sizeof(int))];
iov.iov_base = buf;
iov.iov_len = 1;
memset(&msg, 0, sizeof msg);
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = (caddr_t)cms;
msg.msg_controllen = sizeof cms;
if((n=recvmsg(s, &msg, 0)) < 0)
return -1;
if(n == 0){
perror("unexpected EOF");
return -1;
}
cmsg = CMSG_FIRSTHDR(&msg);
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
return fd;
}
Client
/*Client code - recvfd.c*/
#include <sys/socket.h>
#include <sys/un.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <errno.h>
char *socket_path = "/tmp/unixSockSendFe";
int sendfd(int sock, int fd);
int recvfd(int s);
int fd_received;
int main(int argc, char *argv[]) {
struct sockaddr_un addr;
char buf[100];
buf[0]='\n';
int fd,rc,confd;
int temp,len;
temp=1;
if (argc > 1) socket_path=argv[1];
if ( (fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
perror("socket error");
exit(-1);
}
memset(&addr, 0, sizeof(addr));
addr.sun_family = AF_UNIX;
if (*socket_path == '\0') {
*addr.sun_path = '\0';
strncpy(addr.sun_path+1, socket_path+1, sizeof(addr.sun_path)-2);
} else {
strncpy(addr.sun_path, socket_path, sizeof(addr.sun_path)-1);
}
if(connect(fd,(struct sockaddr*)&addr,sizeof(addr))==-1)
{
perror("connect error");
exit(-1);
}
fd_received=recvfd(fd);
lseek(fd_received,0,SEEK_SET);
len=read(fd_received,buf,5);
if(len<0)
{
perror("read error");
}
printf("(fd_received=%d,len=%d) first %d characters read from the file whoes fd was received(content within ##) ##%.*s##\n",fd_received,len,5,5,buf);
return 0;
}
int sendfd(int sock, int fd)
{
struct msghdr hdr;
struct iovec data;
char cmsgbuf[CMSG_SPACE(sizeof(int))];
char dummy = '*';
data.iov_base = &dummy;
data.iov_len = sizeof(dummy);
memset(&hdr, 0, sizeof(hdr));
hdr.msg_name = NULL;
hdr.msg_namelen = 0;
hdr.msg_iov = &data;
hdr.msg_iovlen = 1;
hdr.msg_flags = 0;
hdr.msg_control = cmsgbuf;
hdr.msg_controllen = CMSG_LEN(sizeof(int));
struct cmsghdr* cmsg = CMSG_FIRSTHDR(&hdr);
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
*(int*)CMSG_DATA(cmsg) = fd;
int n = sendmsg(sock, &hdr, 0);
if(n == -1)
printf("sendmsg() failed: %s (socket fd = %d)\n", strerror(errno), sock);
return n;
}
int recvfd(int s)
{
int n;
int fd;
char buf[1];
struct iovec iov;
struct msghdr msg;
struct cmsghdr *cmsg;
char cms[CMSG_SPACE(sizeof(int))];
iov.iov_base = buf;
iov.iov_len = 1;
memset(&msg, 0, sizeof msg);
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = (caddr_t)cms;
msg.msg_controllen = sizeof cms;
if((n=recvmsg(s, &msg, 0)) < 0)
return -1;
if(n == 0){
perror("unexpected EOF");
return -1;
}
cmsg = CMSG_FIRSTHDR(&msg);
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
return fd;
}
On running client (recvfd) I get segmentation fault.
./recvfd
[1] 6104 segmentation fault (core dumped) ./recvfd
Here are lines from running gdb with coredump
Core was generated by `./recvfd'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0 0x0000000000400cf9 in recvfd (s=3) at recvfd.c:146
146 memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
Here is the core dump - Link.
I want to sniff the communication happening between the two processes when the file descriptor is being sent. I am not able to figure out why the client crashes when run with socat, but doesn't when run without it.
Update 1
While using socat to sniff communication happening between two processes of a well established open source project (ivshmem - used for sharing memory between running virtual machines, also a part of Intel DPDK, Link), I observed the following.
None of the processes crash on using socat
When socat is used, the file descriptor is not properly sent, and does not get added to the recipient process.
When socat is not used, and the two processes are connected directly, the file descriptor gets sent properly, and gets added to the recipient process.

Net Link Linux User code bind socket call always fail for multicast group Id (non zero value)

Hi am trying to implement net link user code and kernel code every thing works fine for unicast (src_addr.nl_groups = 0;). For mulicast, user code bind call always fails for non zero src_addr.nl_groups value. Really am not sure what value to put for multicast and how to proceed further. I checked the usage of netlink_broadcast in kernel source tree, so I put the same group Id value (RTMGRP_LINK) here. For unicast I found good number of help in internet but for multicast I don't think so . So Please help me to proceed further.
Error am getting is:
bind: No such file or directory
./a.out: can't bind socket (3)and err : -1: No such file or directory
#include <sys/socket.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <stdio.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#define NETLINK_TEST 28
#define GROUP_IB 1
#define MAX_PAYLOAD 1024
struct sockaddr_nl src_addr, dst_addr;
struct nlmsghdr *nlh = NULL;
struct msghdr msg;
struct iovec iov;
int sock_fd;
int main(int argc, char ** argv)
{
int err;
sock_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_TEST);
if (sock_fd<0) {
char s[BUFSIZ];
sprintf( s, "%s: can't assign fd for socket", argv[0] );
perror(s);
return -1;
}
memset(&src_addr, 0, sizeof(src_addr));
src_addr.nl_family = AF_NETLINK;
src_addr.nl_pid = getpid();
src_addr.nl_groups = 0; // Unicast
//src_addr.nl_groups = RTMGRP_LINK; /* Multicast, bind call always fails for non zero values */
err = bind(sock_fd, (struct sockaddr*)&src_addr, sizeof(src_addr));
perror("bind");
if (err<0) {
char s[BUFSIZ];
sprintf( s, "%s: can't bind socket (%d)and err : %d", argv[0], sock_fd,err );
perror(s);
return -1;
}
memset(&dst_addr, 0, sizeof(dst_addr));
nlh = (struct nlhmsghdr *) malloc(NLMSG_SPACE(MAX_PAYLOAD));
memset(nlh, 0, NLMSG_SPACE(MAX_PAYLOAD));
iov.iov_base = (void *)nlh;
iov.iov_len = NLMSG_SPACE(MAX_PAYLOAD);
msg.msg_name = (void *)&dst_addr;
msg.msg_namelen = sizeof(dst_addr);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
printf("pid : %d\n Waiting for messages from kernel...\n",getpid());
recvmsg(sock_fd, &msg, 0);
printf("Message : %s\n", NLMSG_DATA(nlh));
close(sock_fd);
return 0;
}
Netlink socket binds are sensitive to what USER you are- I've seen them reliably fail if you are not running the program in question as 'root', at least on RedHat 6.
Try running as root 1st, before changing your logic. If you get the same failure as you do in normal operation, then you know it isn't (necessarily) a permissions issue.
The issue is
sock_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_TEST);
Does you kernel module define the NETLINK_TEST family? your own family might must be supported at kernel module and it should post the message in the proper group using nlmsg_multicast()
RTMGRP_LINK is group defined in NETLINK_ROUTE.
This sample code is example for multicast
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <unistd.h>
#define MYPROTO NETLINK_USERSOCK
#define MYMGRP 21
int open_netlink(void)
{
int sock;
struct sockaddr_nl addr;
int group = MYMGRP;
sock = socket(AF_NETLINK, SOCK_RAW, MYPROTO);
if (sock < 0) {
printf("sock < 0.\n");
return sock;
}
memset((void *) &addr, 0, sizeof(addr));
addr.nl_family = AF_NETLINK;
addr.nl_pid = getpid();
/* addr.nl_groups = MYMGRP; */
if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
printf("bind < 0.\n");
return -1;
}
if (setsockopt(sock, 270, NETLINK_ADD_MEMBERSHIP, &group, sizeof(group)) < 0) {
printf("setsockopt < 0\n");
return -1;
}
return sock;
}
void read_event(int sock)
{
struct sockaddr_nl nladdr;
struct msghdr msg;
struct iovec iov;
char buffer[65536];
int ret;
iov.iov_base = (void *) buffer;
iov.iov_len = sizeof(buffer);
msg.msg_name = (void *) &(nladdr);
msg.msg_namelen = sizeof(nladdr);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
printf("Ok, listening.\n");
ret = recvmsg(sock, &msg, 0);
if (ret < 0)
printf("ret < 0.\n");
else
printf("Received message payload: %s\n", NLMSG_DATA((struct nlmsghdr *) &buffer));
}
int main(int argc, char *argv[])
{
int nls;
nls = open_netlink();
if (nls < 0)
return nls;
while (1)
read_event(nls);
return 0;
}
kernel module:
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netlink.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
#define MYPROTO NETLINK_USERSOCK
#define MYGRP 21
static struct sock *nl_sk = NULL;
static void send_to_user(void)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
char *msg = "Hello from kernel";
int msg_size = strlen(msg) + 1;
int res;
pr_info("Creating skb.\n");
skb = nlmsg_new(NLMSG_ALIGN(msg_size + 1), GFP_KERNEL);
if (!skb) {
pr_err("Allocation failure.\n");
return;
}
nlh = nlmsg_put(skb, 0, 1, NLMSG_DONE, msg_size + 1, 0);
strcpy(nlmsg_data(nlh), msg);
pr_info("Sending skb.\n");
res = nlmsg_multicast(nl_sk, skb, 0, MYGRP, GFP_KERNEL);
if (res < 0)
pr_info("nlmsg_multicast() error: %d\n", res);
else
pr_info("Success.\n");
}
static int __init hello_init(void)
{
pr_info("Inserting hello module.\n");
nl_sk = netlink_kernel_create(&init_net, MYPROTO, NULL);
if (!nl_sk) {
pr_err("Error creating socket.\n");
return -10;
}
send_to_user();
netlink_kernel_release(nl_sk);
return 0;
}
static void __exit hello_exit(void)
{
pr_info("Exiting hello module.\n");
}
module_init(hello_init);
module_exit(hello_exit);
MODULE_LICENSE("GPL");

create SOCK_RAW socket just for sending data without any recvform()

If I create a socket whose type is SOCK_RAW only to send some data without receiving any data, is there any problem when kernel continue to receive network packets and copy its datagram to somebuffer (of application?). In other words, after the somebuffer is filled what will happened? error or ignore?
I don't know how to prevent kernel from delivering the copy of datagram to my application.
Reference http://sock-raw.org/papers/sock_raw 0x4 raw_input
After the IP layer processes
a new incoming IP datagram, it calls ip_local_deliver_finish() kernel function
which is responsibe for calling a registered transport protocol handler by
inspecting the protocol field of the IP header (remember from above). However
before it delivers the datagram to the handler, it checks every time if an
application has created a raw socket with the same protocol number. If there
is one or more such applications, it makes a copy of the datagram and delivers
it to them as well.
You can use shutdown(2) in order to shutdown reception part of the socket.
See shutdown man page
EDIT : I found that shutdown only works on connected (ie TCP) sockets.
With Raw socket, there are 2 possibilities :
Receive data into a temporary buffer (with recv) and discard them (perhaps in an other thread)
If I remember well, when the socket buffer is full, incoming data are automatically discarded (and data in the buffer aren't modified), so you can set the socket reception buffer size to 0 (and increase it later if needed).
Here's how to set reception buffer size to 0 :
int opt = 0;
setsockopt(sock_fd, SOL_SOCKET, SO_RCVBUF, &opt, sizeof(opt));
TEST
/**
* #file raw_print_pkt.c
* #brief
* #author Airead Fan <fgh1987168#gmail.com>
* #date 2012/08/22 12:35:22
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <sys/ioctl.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
int main(int argc, char *argv[])
{
int s;
ssize_t rn; /* receive number */
struct sockaddr_in saddr;
char packet[4096];
int count;
if ((s = socket(AF_INET, SOCK_RAW, IPPROTO_TCP)) < 0) {
perror("error:");
exit(EXIT_FAILURE);
}
memset(packet, 0, sizeof(packet));
socklen_t *len = (socklen_t *)sizeof(saddr);
int fromlen = sizeof(saddr);
int opt = 0;
count = 0;
while(1) {
if ((rn = recvfrom(s, (char *)&packet, sizeof(packet), 0,
(struct sockaddr *)&saddr, &fromlen)) < 0)
perror("packet receive error:");
if (rn == 0) {
printf("the peer has performed an orderly shutdown\n");
break;
}
printf("[%d] rn = %lu \n", count++, rn);
if (count == 16) {
if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &opt, sizeof(opt)) < 0) {
perror("setsocketopt failed");
} else {
fprintf(stdout, "setsocketopt successful\n");
}
// int shutdown(int sockfd, int how);
/* if (shutdown(s, SHUT_RD) < 0) {
* perror("shutdown failed");
* } */
}
}
return 0;
}
TEST 2 (same includes):
int main(int argc, char *argv[])
{
int s;
ssize_t rn; /* receive number */
char packet[4096];
int count;
if ((s = socket(AF_INET, SOCK_RAW, IPPROTO_TCP)) < 0) {
perror("error:");
exit(EXIT_FAILURE);
}
memset(packet, 0, sizeof(packet));
int opt = 0;
count = 0;
//Set recv buffer size
if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &opt, sizeof(opt)) < 0) {
perror("setsocketopt failed");
} else {
fprintf(stdout, "setsocketopt successful\n");
}
//10 seconds countdown
int i = 10;
while(i > 0)
{
printf("\r%d ", i);
fflush(stdout);
i--;
sleep(1);
}
printf("\n");
while(1) {
if ((rn = recv(s, (char *)&packet, sizeof(packet), 0)) <= 0)
perror("packet receive error:");
printf("[%d] rn = %lu \n", count++, rn);
}
return 0;
}
Here's how to proceed with test 2 :
First of all, set the buffer size to 4096 (or bigger if you have a lot of traffic on your network). Compile and launch. During the 10 seconds before starting receiving data, send a lot of data to the socket. After the 10 seconds, the program will receive everything you sent during the countdown.
After that, set the buffer size to 0. Proceed as previously. After the 10 seconds, the program won't receive the data you sent during the countdown. But if you send data while it's in recvfrom, it will read them normally.
I don't really understand what you want! if you want just to inject some packets, it's simple:
#include<netinet/tcp.h> /* TCP header */
#include<netinet/ip.h> /* IP header */
/* Checksum compute function */
/* source : http://www.winpcap.org/pipermail/winpcap-users/2007-July/001984.html */
unsigned short checksum(unsigned short *buffer, int size)
{
unsigned long cksum=0;
while(size >1)
{
cksum+=*buffer++;
size -=sizeof(unsigned short);
}
if(size)
cksum += *(UCHAR*)buffer;
cksum = (cksum >> 16) + (cksum & 0xffff);
cksum += (cksum >>16);
return (unsigned short)(~cksum);
}
int main (int argc, char **argv)
{
char packet_buffer[BUFFER_SIZE];
struct sockaddr_in sin;
struct iphdr *ip_header; /* IP header */
struct tcphdr *tcp_header; /* TCP header */
int flag = 1;
/* Creating RAW socket */
int raw_socket = socket (PF_INET, SOCK_RAW, IPPROTO_TCP);
ip_header = (struct iphdr *) packet_buffer;
tcp_header = (struct tcphdr *) (packet_buffer + sizeof (struct ip));
sin.sin_family = AF_INET;
sin.sin_port = htons(PORT_NUMBER);
sin.sin_addr.s_addr = inet_addr (IP_ADDRESS);
/* Zeroing the bbuffer */
memset (packet_buffer, 0, BUFFER_SIZE);
/* Construct your IP Header */
ip_header->ihl = 5;
ip_header->version = 4;
ip_header->tos = 0;
ip_header->tot_len = sizeof (struct ip) + sizeof (struct tcphdr);
ip_header->id = htonl(CHOOSE_PACKET_ID);
ip_header->frag_off = 0;
ip_header->ttl = 255;
ip_header->protocol = 6; /* TCP. Change to 17 if you want UDP */
ip_header->check = 0;
ip_header->saddr = inet_addr (SOURCE_IP_ADDRESS_TO_SPOOF);
ip_header->daddr = sin.sin_addr.s_addr;
/* Construct your TCP Header */
tcp_header->source = htons (SOURCE);
tcp_header->dest = htons(DEST);
tcp_header->seq = random();
tcp_header->ack_seq = 0;
tcp_header->doff = 0;
tcp_header->syn = 1;
tcp_header->window = htonl(65535);
tcp_header->check = 0;
tcp_header->urg_ptr = 0;
/* IP Checksum */
ip_header->check = checksum((unsigned short *) packet_buffer, ip_header->tot_len >> 1);
if (setsockopt(raw_socket, IPPROTO_IP, IP_HDRINCL, &flag, sizeof(flag)) < 0)
{
/* ERROR handling */
}
while (1)
{
/* Send the packet */
if (sendto(raw_socket, packet_buffer, ip_header->tot_len, 0, (struct sockaddr *) &sin, sizeof (sin)) < 0)
{
/* ERROR handling */
}
/* The rest of your need */
}
return 0;
}

Resources