I am trying to simulate failure of write() system call.
I have read that return value -1 (in EAX) indicates error in system call and errno gives the exact reason for failure.
I am trying to intercept system call write() return -1 in EAX register and set the "errno" to some error value.
puts() internally uses write(), which is system call number 4.
If i do perror("Error:") in the child, it should show the error corresponding to "errno" which i would like to set.
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sys/reg.h> /* For constants ORIG_EAX etc */
#include <stdio.h>
#include <sys/user.h>
#include <errno.h>
int main()
{ pid_t child;
int status;
long orig_eax,eax,params[3];
int ret_val=-1,insyscall=0;
struct user_regs_struct regs;
child = fork();
if(child == 0) {
ptrace(PTRACE_TRACEME, 0, NULL, NULL);
execl("/home/kashi/Documents/2nd_Sem/MyPrgms/ptrace/ramana/write", "write", NULL);
//execl("/bin/ls","ls",NULL);
}
else {
while(1)
{
wait(&status);
if(WIFEXITED(status))
break;
orig_eax = ptrace(PTRACE_PEEKUSER,
child, 4 * ORIG_EAX, NULL);
if(orig_eax == 4)
{
ptrace( PTRACE_GETREGS, child, 0, ®s );
printf("Write called with %ld, %ld %ld\n",regs.ebx, regs.ecx,regs.edx);
if(insyscall == 0)
{
/* Syscall entry */
printf("In %d\n",insyscall);
insyscall = 1;
}
else
{
/* Syscall exit */
regs.orig_eax=-1;
**errno=11; //This errno should be set in the child process, how to do it?**
ptrace( PTRACE_SETREGS, child, 0, ®s );
eax = ptrace(PTRACE_PEEKUSER, child, 4 * ORIG_EAX, NULL);
printf("Write returned with %ld\n", eax);
insyscall = 0;
}
}
//printf("The child made a "
// "system call %ld\n", regs.orig_eax);
ptrace(PTRACE_SYSCALL, child, NULL, NULL);
}
}
return 0;
}
The tracee program (testwrite.c) is:
#include<stdio.h>
#include<unistd.h>
#include<sys/ptrace.h>
#include<signal.h>
#include<errno.h>
//int display(char *p);
int main()
{
printf("Hi Kashi\n");
perror("Error: ");
return 0;
}
output:
[kashi#localhost ramana]$ ./test.sh
In 0
Hi Kashi
Write returned with -1
In 0
**Error: : Success**
Write returned with -1
The perror("Error:") displays text message corresponding to "errno". I am not able to set this "errno" in child process, how can i do it?
Related
As title, but the measurement result is unreasonable. Let me describe the current status.
I'm using syscall getuid as measurement target, I started by measureing the complete overhead with two clock_gettime bounded around, then measure the entry (what SYSCALL instruction does before executing the actual getuid code) and leaving overhead saparately (with eBPF program hook onto the entry and leaving point).
The result for the complete overhead is ~65ns, and regarding to the entry and leaving overhead, it's ~77ns and ~70ns respectively.
It's obvious that my measurement has some additional overhead except the typical overhead. However, it's weird that since clock_gettime is a vDSO syscall, it should barely have noticeable overhead. And BPF, which is a lightweight instrumental tool (JIT-ed and etc.) these day in Linux, shouldn't have noticeable overhead too.
Is there anyone have idea what additional overhead my measurement incurs?
Following is my measurement code:
userland (measuring the return-from-kernel overhead):
#define _GNU_SOURCE
#include <bpf.h>
#include <libbpf.h>
#include <stdlib.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <string.h>
#include <asm/errno.h>
#include <linux/if_link.h>
#include <errno.h>
#include <sys/resource.h>
#include <unistd.h>
#include <asm/unistd.h>
#include <time.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sched.h>
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
#define TEST_CNT 1000000
#define BPF_FILE_NAME "mkern.o"
#define BPF_MAP_NAME "msys"
static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
int cpu, int group_fd,
unsigned long flags)
{
attr->size = sizeof(*attr);
return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}
static int attach_kprobe(int prog_fd)
{
int err, fd, id;
char buf[32];
struct perf_event_attr attr = {};
err = system("echo 'r:kp_sys_batch __x64_sys_getuid' > /sys/kernel/debug/tracing/kprobe_events");
if (err < 0) {
fprintf(stderr, "Failed to create kprobe, error '%s'\n", strerror(errno));
return -1;
}
fd = open("/sys/kernel/debug/tracing/events/kprobes/kp_sys_batch/id", O_RDONLY, 0);
if (fd < 0) {
fprintf(stderr, "Failed to open event %s\n", "sys_batch");
return -1;
}
err = read(fd, buf, sizeof(buf));
if (err < 0 || err >= sizeof(buf)) {
fprintf(stderr, "read from '%s' failed '%s'\n", "sys_batch", strerror(errno));
return -1;
}
close(fd);
buf[err] = 0;
id = atoi(buf);
attr.config = id;
attr.type = PERF_TYPE_TRACEPOINT;
attr.sample_type = PERF_SAMPLE_RAW;
attr.sample_period = 1;
attr.wakeup_events = 1;
fd = sys_perf_event_open(&attr, 0/*this process*/, -1/*any cpu*/, -1/*group leader*/, 0);
if (fd < 0) {
perror("sys_perf_event_open");
fprintf(stderr, "Failed to open perf_event (id: %llu)\n", attr.config);
return -1;
}
err = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
if (err < 0) {
fprintf(stderr, "ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
strerror(errno));
return -1;
}
err = ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
if (err < 0) {
fprintf(stderr, "ioctl PERF_EVENT_IOC_SET_BPF failed: %s\n",
strerror(errno));
return -1;
}
return 0;
}
static void maxi_memlock_rlimit(void)
{
struct rlimit rlim_new = {
.rlim_cur = RLIM_INFINITY,
.rlim_max = RLIM_INFINITY,
};
if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) {
fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n");
exit(-1);
}
}
static int find_map_fd(struct bpf_object *bpf_obj, const char *mapname)
{
struct bpf_map *map;
int map_fd = -1;
map = bpf_object__find_map_by_name(bpf_obj, mapname);
if (!map) {
fprintf(stderr, "Failed finding map by name: %s\n", mapname);
exit(-1);
}
map_fd = bpf_map__fd(map);
return map_fd;
}
int main(int argc, char **argv)
{
int bpf_map_fd;
int bpf_prog_fd = -1;
int err;
int key = 0;
struct timespec tp;
struct bpf_object *bpf_obj;
struct reals map;
struct bpf_prog_load_attr xattr = {
.prog_type = BPF_PROG_TYPE_KPROBE,
.file = BPF_FILE_NAME,
};
maxi_memlock_rlimit();
err = bpf_prog_load_xattr(&xattr, &bpf_obj, &bpf_prog_fd);
if (err) {
fprintf(stderr, "Failed loading bpf object file\n");
exit(-1);
}
if (attach_kprobe(bpf_prog_fd)) {
fprintf(stderr, "Failed attaching kprobe\n");
exit(-1);
}
bpf_map_fd = find_map_fd(bpf_obj, BPF_MAP_NAME);
if (find_map_fd < 0) {
fprintf(stderr, "Failed finding map fd\n");
exit(-1);
}
/* warm up */
for (int i = 0; i < TEST_CNT; i++) {
syscall(__NR_getuid); /* dummy call */
clock_gettime(CLOCK_MONOTONIC, &tp);
if (unlikely(bpf_map_lookup_elem(bpf_map_fd, &key, &map))) {
fprintf(stderr, "Failed to lookup map element\n");
perror("lookup");
exit(-1);
}
}
uint64_t delta = 0;
for (int i = 0; i < TEST_CNT; i++) {
syscall(__NR_getuid); /* dummy call */
clock_gettime(CLOCK_MONOTONIC, &tp);
if (unlikely(bpf_map_lookup_elem(bpf_map_fd, &key, &map))) {
fprintf(stderr, "Failed to lookup map element\n");
perror("lookup");
exit(-1);
}
delta += (1000000000 * tp.tv_sec + tp.tv_nsec) - map.ts;
}
printf("avg: %fns\n", (double) delta / TEST_CNT);
return 0;
}
user land (measuring the enter-kernel overhead, almost same as the above, except what I pointed out):
err = system("echo 'p:kp_sys_batch sys_batch' > /sys/kernel/debug/tracing/kprobe_events");
...
clock_gettime(CLOCK_MONOTONIC, &tp);
syscall(__NR_getuid); /* dummy call */
...
delta += map.ts - (1000000000 * tp.tv_sec + tp.tv_nsec);
kernel land:
SEC("getuid")
int kp_sys_batch(struct pt_regs *ctx)
{
__u32 i = 0;
struct reals *r;
r = bpf_map_lookup_elem(&reals, &i);
if (!r)
return 1;
r->ts = bpf_ktime_get_ns();
return 0;
}
Except the additional overhead I mentioned above, inside the return-from-kernel measurement code, if the echo 'r:kp_sys_batch sys_batch' is changed to echo 'p:kp_sys_batch sys_batch' (which means that the measurement would take the syscall execution overhead into account), the result would be ~48ns, this means that the result includes overhead of syscall execution and return-from-kernel. Any idea why this could be only ~48ns?
Thanks!
There is a possibility that i found a bug in linux kernel. Let's consider application that write to /proc/self/loginuid from main thread and one auxiliary thread. The code is below:
#include <stdio.h>
#include <pthread.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
void write_loginuid(char *str)
{
int fd;
printf("%s\n", str);
fd = open("/proc/self/loginuid", O_RDWR);
if (fd < 0) {
perror(str);
return;
}
if (write(fd, "0", 2) != 2) {
printf("write\n");
perror(str);
}
close(fd);
}
void *thread_function(void *arg)
{
fprintf(stderr, "Hello from thread! my pid = %u, tid = %u, parent pid = %u\n", getpid(), syscall(SYS_gettid), getppid());
write_loginuid("thread");
return NULL;
}
int main()
{
pthread_t thread;
pthread_create(&thread, NULL, thread_function, NULL);
write_loginuid("main process");
fprintf(stderr, "test my pid = %u, tid = %u, parent pid = %u\n", getpid(), syscall(SYS_gettid), getppid());
pthread_join(thread, NULL);
return 0;
}
After executing this application we get:
main process
test my pid = 3487, tid = 3487, parent pid = 3283
Hello from thread! my pid = 3487, tid = 3488, parent pid = 3283
thread
write
thread: Operation not permitted
That tells us the thread write failed by -EPERM.
Looking at the kernel file fs/proc/base.c and function proc_loginuid_write() we see at the beginning check:
static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
struct inode * inode = file_inode(file);
uid_t loginuid;
kuid_t kloginuid;
int rv;
/* this is the probably buggy check */
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
rcu_read_unlock();
return -EPERM;
}
rcu_read_unlock();
So, looking at the code above we see that only for exact PID (checked by me with printks) we pass through.Thread doesn't satisfy the condition, because compared pids differs.
So my question is: is this a bug ? Why to not allow thread's of particular process to change the loginuid? I encountered this in login application that spawned another thread for PAM login.
Whether this is bug or not i written a fix that extends writing permission to this file by threads:
rcu_read_lock();
/*
* I changed the condition that it checks now the tgid as returned in sys_getpid()
* rather than task_struct pointers
*/
if (task_tgid_vnr(current) != task_tgid_vnr(pid_task(proc_pid(inode), PIDTYPE_PID))) {
rcu_read_unlock();
return -EPERM;
}
rcu_read_unlock();
What do you think about it? Does it affects security?
When the following C program is executed, and SIGUSR1 is sent to the running process repeatedly, the pclose() call will sometimes return 13. 13 corresponds to SIGPIPE on my system.
Why does this happen?
I am using while true; do kill -SIGUSR1 <process-id>; done to send SIGUSR1 to the program. The program is executed on Ubuntu 14.04.
#include <pthread.h>
#include <signal.h>
#include <unistd.h>
#include <stdio.h>
void handler(int i) {}
void* task(void*)
{
FILE *s;
char b [BUFSIZ];
while (1) {
if ((s = popen("echo hello", "r")) == NULL) {
printf("popen() failed\n");
}
while (fgets(b, BUFSIZ, s) != NULL) ;
if (int r = pclose(s)) {
printf("pclose() failed (%d)\n", r);
}
}
return 0;
}
int main(int argc, char **argv)
{
struct sigaction action;
action.sa_handler = handler;
sigemptyset(&action.sa_mask);
action.sa_flags = 0;
sigaction(SIGUSR1, &action, NULL);
pthread_t tid;
pthread_create(&tid, 0, task, NULL);
pthread_join(tid, NULL);
}
This happens when fgets gets interrupted by the signal. The program doesn't read the pipe to the end and closes it. The other program then SIGPIPEs.
The correct pipe reading operation is:
do {
while (fgets(b, BUFSIZ, s) != NULL) ;
} while (errno == EINTR);
Parent receives SIGPIPE sending chars to aborted child process through FIFO pipe.
I am trying to avoid this, using select() function. In the attached sample code,
select() retruns OK even after the child at the other end of pipe having been terminated.
Tested in
RedHat EL5 (Linux 2.6.18-194.32.1.el5)
GNU C Library stable release version 2.5
Any help appreciated. Thnak you.
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <sys/stat.h>
#include <unistd.h>
static void sigpipe_fct();
main()
{
struct stat st;
int i, fd_out, fd_in, child;
char buf[1024];
#define p_out "/tmp/pout"
signal(SIGPIPE, sigpipe_fct);
if (stat(p_out, &st) != 0) {
mknod(p_out, S_IFIFO, 0);
chmod(p_out, 0666);
}
/* start receiving process */
if ((child = fork()) == 0) {
if ((fd_in = open(p_out, O_RDONLY)) < 0) {
perror(p_out);
exit(1);
}
while(1) {
i = read(fd_in, buf, sizeof(buf));
fprintf(stderr, "child %d read %.*s\n", getpid(), i, buf);
lseek(fd_in, 0, 0);
}
}
else {
fprintf(stderr,
"reading from %s - exec \"kill -9 %d\" to test\n", p_out, child);
if ((fd_out = open(p_out, O_WRONLY + O_NDELAY)) < 0) { /* output */
perror(p_out);
exit(1);
}
while(1) {
if (SelectChkWrite(fd_out) == fd_out) {
fprintf(stderr, "SelectChkWrite() success write abc\n");
write(fd_out, "abc", 3);
}
else
fprintf(stderr, "SelectChkWrite() failed\n");
sleep(3);
}
}
}
static void sigpipe_fct()
{
fprintf(stderr, "SIGPIPE received\n");
exit(-1);
}
SelectChkWrite(ch)
int ch;
{
#include <sys/select.h>
fd_set writefds;
int i;
FD_ZERO(&writefds);
FD_SET (ch, &writefds);
i = select(ch + 1, NULL, &writefds, NULL, NULL);
if (i == -1)
return(-1);
else if (FD_ISSET(ch, &writefds))
return(ch);
else
return(-1);
}
From the Linux select(3) man page:
A descriptor shall be considered ready for writing when a call to an
output function with O_NONBLOCK clear would not block, whether or not
the function would transfer data successfully.
When the pipe is closed, it won't block, so it is considered "ready" by select.
BTW, having #include <sys/select.h> inside your SelectChkWrite() function is extremely bad form.
Although select() and poll() are both in the POSIX standard, select() is much older and more limited than poll(). In general, I recommend people use poll() by default and only use select() if they have a good reason. (See here for one example.)
I am trying to ptrace a vsftpd server process on linux to be able to get control whenever vsftpd process makes a system call. I start the vsftpd process and pass this process id as command line to the following program which traces vsftpd.
however, when I run the following program it just hangs and does not print anything.Can anyone point out what could be wrong? Thanks a lot for your help!!
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <linux/user.h>
#include <sys/syscall.h> /* For SYS_write etc */
#include<sys/reg.h>
int main(int argc,char* argv[])
{ pid_t child;
long orig_eax, eax;
long params[3];
int status;
int insyscall = 0;
child = atoi(argv[1]);
ptrace(PTRACE_ATTACH,child,NULL,NULL);
while(1) {
wait(&status);
if(WIFEXITED(status))
break;
orig_eax = ptrace(PTRACE_PEEKUSER,
child, 4 * ORIG_EAX, NULL);
if(orig_eax == __NR_clone || orig_eax == __NR_open || orig_eax == __NR_write)
{
if(insyscall == 0) {
/* Syscall entry */
insyscall = 1;
params[0] = ptrace(PTRACE_PEEKUSER,
child, 4 * EBX,
NULL);
params[1] = ptrace(PTRACE_PEEKUSER,
child, 4 * ECX,
NULL);
params[2] = ptrace(PTRACE_PEEKUSER,
child, 4 * EDX,
NULL);
if(orig_eax == __NR_clone)
{
printf("\nClone");
}
else if(orig_eax == __NR_open)
printf("\nOpen");
else if(orig_eax == __NR_write)
printf("\nWrite");
printf(" called with "
"%ld, %ld, %ld\n",
params[0], params[1],
params[2]);
}
else { /* Syscall exit */
eax = ptrace(PTRACE_PEEKUSER,
child, 4 * EAX, NULL);
printf("Returned "
"with %ld\n", eax);
insyscall = 0;
}
}
ptrace(PTRACE_SYSCALL,
child, NULL, NULL);
}
return 0;
}
You need to have the privilege to trace VSFTPD. Run this as root. To test, put the result of ptrace(PTRACE_ATTACH,child,NULL,NULL); into a variable and print it, ie.
long result = ptrace(PTRACE_ATTACH,child,NULL,NULL);
printf("%ld",result);
On my system if result == -1, I do not have permission. If result == 0, I do.