Shared variable between different executables in Linux - linux

What I want to do is to create a globally shared variable to be accessed by different processes. I want the child process to be replaced by an existing executable.
UPDATE: I think this is the solution. The code is borrowed from here. But since every process needs at least one I/O operation to mmap the file, is there any faster approach?
mycode.h
static void* addr; //static
app1.cc
include
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <sys/stat.h>
int main(void)
{
size_t length = 1024 * 1024;
off_t offset = 0;
int prot = (PROT_READ| PROT_WRITE);
int flags = MAP_SHARED;
int fd = -1;
fd = open("./jim.mymemory", O_RDWR| O_CREAT, S_IRUSR| S_IWUSR );
if (fd == 0) {
int myerr = errno;
printf("ERROR: open failed (errno %d %s)\n", myerr, strerror(myerr));
return EXIT_FAILURE;
}
addr = mmap(NULL, length, prot, flags, fd, offset);
if (addr == 0) {
int myerr = errno;
printf("ERROR (child): mmap failed (errno %d %s)\n", myerr,
strerror(myerr));
}
*((int *) addr)=5;
if (munmap(addr, length) == -1) {
int myerr = errno;
printf("ERROR (child): munmap failed (errno %d %s)\n", myerr,
strerror(myerr));
}
return 0;
}
mycode.cc
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include "mycode.h"
int main(void) {
size_t length = 1024 * 1024;
off_t offset = 0;
int prot = (PROT_READ| PROT_WRITE);
int flags = MAP_SHARED;
int fd = -1;
pid_t pid;
fd = open("./jim.mymemory", O_RDWR| O_CREAT, S_IRUSR| S_IWUSR );
if (fd == 0) {
int myerr = errno;
printf("ERROR: open failed (errno %d %s)\n", myerr, strerror(myerr));
return EXIT_FAILURE;
}
if (lseek(fd, length - 1, SEEK_SET) == -1) {
int myerr = errno;
printf("ERROR: lseek failed (errno %d %s)\n", myerr, strerror(myerr));
return EXIT_FAILURE;
}
write(fd, "", 1);
if ((pid = fork()) == 0) { // child
/*Child process*/
printf("INFO (child): start \n");
execv("./app1", NULL); // **app1**
printf("INFO (child): done \n");
msync(addr,sizeof(int),MS_SYNC|MS_INVALIDATE); // can be commented out, since we wait in the parent process
} else {
/*Parent process*/
unsigned int readval = 0;
addr = mmap(NULL, length, prot, flags, fd, offset);
if (addr == 0) {
int myerr = errno;
printf("ERROR (parent): mmap failed (errno %d %s)\n", myerr,
strerror(myerr));
}
printf("INFO (parent): start read\n");
wait(NULL);
readval = *((int *) addr);
printf("val: %d \n", readval);
printf("INFO (parent): done read\n");
if (munmap(addr, length) == -1) {
int myerr = errno;
printf("ERROR (parent): munmap failed (errno %d %s)\n", myerr,
strerror(myerr));
}
}
if (close(fd) == -1) {
int myerr = errno;
printf("ERROR: close failed (errno %d %s)\n", myerr, strerror(myerr));
}
unlink ("./jim.mymemory");
return EXIT_SUCCESS;
}
Any help is appreciated.

The execve will drop all mappings in the kernel, so this technique will not work. What you can do instead is open a file (as in Vaughn's suggestion) and pass the descriptor to the child process. Open file descriptors are unchanged across an exec. Then you can map it in the child. Alternatively, investigate APIs like shm_open()/shm_unlink() which will manage a global file mapping such that other processes can use it, not just a child.
But basically: you have to mmap() in the child, you can't pass anything in your address space to the child in Unix.

Related

Using eBPF to measure CPU mode switch overhead incured by making system call

As title, but the measurement result is unreasonable. Let me describe the current status.
I'm using syscall getuid as measurement target, I started by measureing the complete overhead with two clock_gettime bounded around, then measure the entry (what SYSCALL instruction does before executing the actual getuid code) and leaving overhead saparately (with eBPF program hook onto the entry and leaving point).
The result for the complete overhead is ~65ns, and regarding to the entry and leaving overhead, it's ~77ns and ~70ns respectively.
It's obvious that my measurement has some additional overhead except the typical overhead. However, it's weird that since clock_gettime is a vDSO syscall, it should barely have noticeable overhead. And BPF, which is a lightweight instrumental tool (JIT-ed and etc.) these day in Linux, shouldn't have noticeable overhead too.
Is there anyone have idea what additional overhead my measurement incurs?
Following is my measurement code:
userland (measuring the return-from-kernel overhead):
#define _GNU_SOURCE
#include <bpf.h>
#include <libbpf.h>
#include <stdlib.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <string.h>
#include <asm/errno.h>
#include <linux/if_link.h>
#include <errno.h>
#include <sys/resource.h>
#include <unistd.h>
#include <asm/unistd.h>
#include <time.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sched.h>
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
#define TEST_CNT 1000000
#define BPF_FILE_NAME "mkern.o"
#define BPF_MAP_NAME "msys"
static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
int cpu, int group_fd,
unsigned long flags)
{
attr->size = sizeof(*attr);
return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}
static int attach_kprobe(int prog_fd)
{
int err, fd, id;
char buf[32];
struct perf_event_attr attr = {};
err = system("echo 'r:kp_sys_batch __x64_sys_getuid' > /sys/kernel/debug/tracing/kprobe_events");
if (err < 0) {
fprintf(stderr, "Failed to create kprobe, error '%s'\n", strerror(errno));
return -1;
}
fd = open("/sys/kernel/debug/tracing/events/kprobes/kp_sys_batch/id", O_RDONLY, 0);
if (fd < 0) {
fprintf(stderr, "Failed to open event %s\n", "sys_batch");
return -1;
}
err = read(fd, buf, sizeof(buf));
if (err < 0 || err >= sizeof(buf)) {
fprintf(stderr, "read from '%s' failed '%s'\n", "sys_batch", strerror(errno));
return -1;
}
close(fd);
buf[err] = 0;
id = atoi(buf);
attr.config = id;
attr.type = PERF_TYPE_TRACEPOINT;
attr.sample_type = PERF_SAMPLE_RAW;
attr.sample_period = 1;
attr.wakeup_events = 1;
fd = sys_perf_event_open(&attr, 0/*this process*/, -1/*any cpu*/, -1/*group leader*/, 0);
if (fd < 0) {
perror("sys_perf_event_open");
fprintf(stderr, "Failed to open perf_event (id: %llu)\n", attr.config);
return -1;
}
err = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
if (err < 0) {
fprintf(stderr, "ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
strerror(errno));
return -1;
}
err = ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
if (err < 0) {
fprintf(stderr, "ioctl PERF_EVENT_IOC_SET_BPF failed: %s\n",
strerror(errno));
return -1;
}
return 0;
}
static void maxi_memlock_rlimit(void)
{
struct rlimit rlim_new = {
.rlim_cur = RLIM_INFINITY,
.rlim_max = RLIM_INFINITY,
};
if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) {
fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n");
exit(-1);
}
}
static int find_map_fd(struct bpf_object *bpf_obj, const char *mapname)
{
struct bpf_map *map;
int map_fd = -1;
map = bpf_object__find_map_by_name(bpf_obj, mapname);
if (!map) {
fprintf(stderr, "Failed finding map by name: %s\n", mapname);
exit(-1);
}
map_fd = bpf_map__fd(map);
return map_fd;
}
int main(int argc, char **argv)
{
int bpf_map_fd;
int bpf_prog_fd = -1;
int err;
int key = 0;
struct timespec tp;
struct bpf_object *bpf_obj;
struct reals map;
struct bpf_prog_load_attr xattr = {
.prog_type = BPF_PROG_TYPE_KPROBE,
.file = BPF_FILE_NAME,
};
maxi_memlock_rlimit();
err = bpf_prog_load_xattr(&xattr, &bpf_obj, &bpf_prog_fd);
if (err) {
fprintf(stderr, "Failed loading bpf object file\n");
exit(-1);
}
if (attach_kprobe(bpf_prog_fd)) {
fprintf(stderr, "Failed attaching kprobe\n");
exit(-1);
}
bpf_map_fd = find_map_fd(bpf_obj, BPF_MAP_NAME);
if (find_map_fd < 0) {
fprintf(stderr, "Failed finding map fd\n");
exit(-1);
}
/* warm up */
for (int i = 0; i < TEST_CNT; i++) {
syscall(__NR_getuid); /* dummy call */
clock_gettime(CLOCK_MONOTONIC, &tp);
if (unlikely(bpf_map_lookup_elem(bpf_map_fd, &key, &map))) {
fprintf(stderr, "Failed to lookup map element\n");
perror("lookup");
exit(-1);
}
}
uint64_t delta = 0;
for (int i = 0; i < TEST_CNT; i++) {
syscall(__NR_getuid); /* dummy call */
clock_gettime(CLOCK_MONOTONIC, &tp);
if (unlikely(bpf_map_lookup_elem(bpf_map_fd, &key, &map))) {
fprintf(stderr, "Failed to lookup map element\n");
perror("lookup");
exit(-1);
}
delta += (1000000000 * tp.tv_sec + tp.tv_nsec) - map.ts;
}
printf("avg: %fns\n", (double) delta / TEST_CNT);
return 0;
}
user land (measuring the enter-kernel overhead, almost same as the above, except what I pointed out):
err = system("echo 'p:kp_sys_batch sys_batch' > /sys/kernel/debug/tracing/kprobe_events");
...
clock_gettime(CLOCK_MONOTONIC, &tp);
syscall(__NR_getuid); /* dummy call */
...
delta += map.ts - (1000000000 * tp.tv_sec + tp.tv_nsec);
kernel land:
SEC("getuid")
int kp_sys_batch(struct pt_regs *ctx)
{
__u32 i = 0;
struct reals *r;
r = bpf_map_lookup_elem(&reals, &i);
if (!r)
return 1;
r->ts = bpf_ktime_get_ns();
return 0;
}
Except the additional overhead I mentioned above, inside the return-from-kernel measurement code, if the echo 'r:kp_sys_batch sys_batch' is changed to echo 'p:kp_sys_batch sys_batch' (which means that the measurement would take the syscall execution overhead into account), the result would be ~48ns, this means that the result includes overhead of syscall execution and return-from-kernel. Any idea why this could be only ~48ns?
Thanks!

How to change TCP Congestion Control algorithm using setsockopt() call from C++ code

Is it possible to change TCP congestion control algorithm from Cubic to Reno or vice versa using setsockopt call from C++ code in linux?
I am looking for an example code of doing so.
You can use the TCP_CONGESTION socket option to get or set the congestion control algorithm for a socket to one of the values listed in /proc/sys/net/ipv4/tcp_allowed_congestion_control or to any one of the values in /proc/sys/net/ipv4/tcp_available_congestion_control if your process is privileged.
C example:
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
int main(int argc, char **argv)
{
char buf[256];
socklen_t len;
int sock = socket(AF_INET, SOCK_STREAM, 0);
if (sock == -1)
{
perror("socket");
return -1;
}
len = sizeof(buf);
if (getsockopt(sock, IPPROTO_TCP, TCP_CONGESTION, buf, &len) != 0)
{
perror("getsockopt");
return -1;
}
printf("Current: %s\n", buf);
strcpy(buf, "reno");
len = strlen(buf);
if (setsockopt(sock, IPPROTO_TCP, TCP_CONGESTION, buf, len) != 0)
{
perror("setsockopt");
return -1;
}
len = sizeof(buf);
if (getsockopt(sock, IPPROTO_TCP, TCP_CONGESTION, buf, &len) != 0)
{
perror("getsockopt");
return -1;
}
printf("New: %s\n", buf);
close(sock);
return 0;
}
For me outputs:
Current: cubic
New: reno

Time the connect syscall

I would like to see how much time it takes for connect syscall. I get the code for a simple TCP client. However, the program will wait for the server to respond after connect. How can I make it return right after syscall or using some other ways to time the syscall time?
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <arpa/inet.h>
int main(int argc, char *argv[]) {
int sockfd = 0, n = 0;
char recvBuff[1024];
struct sockaddr_in serv_addr;
if(argc != 2) {
printf("\n Usage: %s <ip of server> \n",argv[0]);
return 1;
}
memset(recvBuff, '0',sizeof(recvBuff));
if((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
printf("\n Error : Could not create socket \n");
return 1;
}
memset(&serv_addr, '0', sizeof(serv_addr));
serv_addr.sin_family = AF_INET;
serv_addr.sin_port = htons(5000);
if(inet_pton(AF_INET, argv[1], &serv_addr.sin_addr)<=0) {
printf("\n inet_pton error occured\n");
return 1;
}
if( connect(sockfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
printf("\n Error : Connect Failed \n");
return 1;
}
printf("\nhello\n");
while ( (n = read(sockfd, recvBuff, sizeof(recvBuff)-1)) > 0) {
recvBuff[n] = 0;
if(fputs(recvBuff, stdout) == EOF) {
printf("\n Error : Fputs error\n");
}
}
if(n < 0)
{
printf("\n Read error \n");
}
return 0;
}
However, the program will wait for the server to respond after connect.
Yes, and it does so here:
while ( (n = read(sockfd, recvBuff, sizeof(recvBuff)-1)) > 0) {
recvBuff[n] = 0;
if(fputs(recvBuff, stdout) == EOF) {
printf("\n Error : Fputs error\n");
}
}
How can I make it return right after syscall
Err, remove the receive loop above?

Join network namespace from inside user namespace

Root creates a network namespace testns, and clone with CLONE_NEWUSER flag to get the child inside an user namespace. Then the child tries to join testns by calling setns, which shows the permission denied. Is there any alternative to let the child inside an user namespace to join a network namespace?
Just wrote up this following test: (Before running, I created testns by calling sudo ip netns add testns)
#define _GNU_SOURCE
#include <sched.h>
#include <stdlib.h>
#include <signal.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#define STACK_SIZE (1024 * 1024)
static char child_stack[STACK_SIZE];
static void update_map(char *mapping, char *map_file) {
int fd, j;
size_t map_len;
map_len = strlen(mapping);
for (j = 0; j < map_len; j++)
if (mapping[j] == ',')
mapping[j] = '\n';
fd = open(map_file, O_RDWR);
if (fd == -1) {
fprintf(stderr, "open %s: %s\n", map_file, strerror(errno));
exit(EXIT_FAILURE);
}
if (write(fd, mapping, map_len) != map_len) {
fprintf(stderr, "write %s: %s\n", map_file, strerror(errno));
exit(EXIT_FAILURE);
}
close(fd);
}
static void proc_setgroups_write(pid_t child_pid, char *str) {
char setgroups_path[PATH_MAX];
int fd;
snprintf(setgroups_path, PATH_MAX, "/proc/%ld/setgroups",
(long) child_pid);
fd = open(setgroups_path, O_RDWR);
if (fd == -1) {
if (errno != ENOENT)
fprintf(stderr, "ERROR: open %s: %s\n", setgroups_path,
strerror(errno));
return;
}
if (write(fd, str, strlen(str)) == -1)
fprintf(stderr, "ERROR: write %s: %s\n", setgroups_path,
strerror(errno));
close(fd);
}
static void update_userns(pid_t pid, char *uidMap, char *gidMap) {
char map_path[PATH_MAX];
snprintf(map_path, PATH_MAX, "/proc/%ld/uid_map", (long) pid);
update_map(uidMap, map_path);
proc_setgroups_write(pid, "deny");
snprintf(map_path, PATH_MAX, "/proc/%ld/gid_map", (long) pid);
update_map(gidMap, map_path);
}
static int join_netns(char *netns_name) {
char netns_path[256];
snprintf(netns_path, sizeof(netns_path), "/var/run/netns/%s", netns_name);
int fd = open(netns_path, O_RDONLY);
if (fd == -1) {
fprintf(stderr, "open netns path failed: %s\n", strerror(errno));
exit(EXIT_FAILURE);
}
if (setns(fd, CLONE_NEWNET) == -1) {
fprintf(stderr, "set netns failed: %s\n", strerror(errno));
exit(EXIT_FAILURE);
}
return 0;
}
static int child(void* arg) {
// Sleep so that userns has the correct mapping.
sleep(1);
return join_netns("testns");
}
int main(int argc, char *argv[]) {
uid_t uid = getuid();
char mapping[10];
snprintf(mapping, 10, "0 %d 1", uid);
int pid1 = clone(child, child_stack + STACK_SIZE, CLONE_NEWUSER | SIGCHLD, NULL);
if (pid1 == -1) {
perror("Clone");
exit(EXIT_FAILURE);
}
update_userns(pid1, mapping, mapping);
if (waitpid(pid1, NULL, 0) == -1) {
perror("waitpid");
exit(EXIT_FAILURE);
}
}
The kernel will do a security check that the userns "owner"/creator of the network namespace matches before allowing a join to an existing network namespace.
You can definitely join a network namespace once inside a user namespace, but that initial network namespace must be created with the same user namespace. In container runtimes like the runc tool, you can validate this by starting a simple container in a user namespace with a network namespace created, and then start a second container with references to the first container's user and network namespace paths. I demo'd this using runc at a previous DockerCon; you can see me sharing the user namespace and network namespace in this segment starting at 41:24

write_proc is not invoked when written from userspace

I am trying to understand procfs for communication between userspace and kernel module. My module has basic two functions for procfs write_proc, driver_mmap.
I call multiple times write_proc by calling fputs("123456789",fd). where fd is file descriptor to procfs entry in /proc directory. But I don't see write_proc called multiple time. Code is attached by here.
<code>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <sound/core.h>
#include <sound/initval.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/proc_fs.h>
#include <asm/uaccess.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/mm.h> /* mmap related stuff */
#define BUF_SIZE 64 * 1024
int *MmapBuffer;
int Factor = 1;
static int write_proc(struct file *filp, int *buf, size_t count, loff_t *offp)
{
int rc,i;
printk("in Write \n");
for (i = 1; i <= 16*1024 ; i++)
MmapBuffer[i-1] = (i+1)*Factor;
Factor++;
return count;
}
static int driver_mmap(struct file *file, struct vm_area_struct *vma)
{
int ret;
vma->vm_flags |= VM_LOCKED|VM_SHARED;
ret = remap_pfn_range(vma, vma->vm_start,
virt_to_phys(MmapBuffer) >> PAGE_SHIFT,
vma->vm_end-vma->vm_start, vma->vm_page_prot);
if(ret != 0)
printk("MMAP Failed \n");
SetPageReserved(virt_to_page(MmapBuffer));
printk("MMAP Succeeded \n");
return 0;
}
// file operations
struct file_operations proc_fops =
{
.write = write_proc,
.mmap = driver_mmap,
};
// init module
int init_module_test(void)
{
printk("<1>Hello world\n");
MmapBuffer = kzalloc(BUF_SIZE,__GFP_COLD|GFP_DMA);
if(MmapBuffer == NULL)
printk("Kzalloc failed. reduce buffer size \n");
proc_create ("Test_fs",0,NULL, &proc_fops);
return 0;
}
// exit module
void cleanup_module_test(void)
{
kfree(MmapBuffer);
remove_proc_entry ("Test_fs", NULL);
printk("Goodbye world\n");
}
module_init(init_module_test);
module_exit(cleanup_module_test);
MODULE_LICENSE("GPL");
</code>
Application code
<code>
#include<stdio.h>
#include<stdlib.h>
#include<sys/mman.h>
#include<errno.h>
#include <fcntl.h>
int main(void)
{
int fd;
int i,j;
int *msg ;
printf("Allocation started \n ");
msg=(int*)malloc(64*1024);
if(msg == NULL)
printf("Allocation failed \n");
//unsigned int *addr;
printf("Starting opening \n ");
if((fd=open("/proc/Test_fs", O_RDONLY ))<0)
{
printf("File not opened ");
}
printf("Starting mapping \n ");
msg = mmap(NULL, 64*1024, PROT_READ, MAP_SHARED , fd, 0);
printf("done from module \n ");
if(msg == MAP_FAILED)
{
printf("MAP failed and error is %s", strerror(errno));
return 0;
}
close(fd);
printf("Successful mapping");
FILE *f;
f=fopen("/proc/Test_fs", "wr");
if(!f)
{
printf("File not opened ");
}
for (j = 0; j < 10 ; j++)
{
if(fputs("1234567890,",f) <= 0)
printf("write failed, ");
for (i = 0; i < 16*1024 ; i++)
printf("%d, ", msg[i]);
printf("\n \n done \n \n ");
}
fclose(f);
return 0;
}
</code>

Resources