Why STRACE shows EAGAIN (Resource temporarily unavailable) - linux

Following is the sequence I am getting
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 7
setsockopt(7, SOL_TCP, TCP_NODELAY, [1], 4) = 0
setsockopt(7, SOL_SOCKET, SO_SNDBUF, [32120], 4) = 0
getsockopt(7, SOL_SOCKET, SO_SNDBUF, [30064835312], [4]) = 0
setsockopt(7, SOL_SOCKET, SO_SNDBUF, [64240], 4) = 0
getsockopt(7, SOL_SOCKET, SO_SNDBUF, [30064899552], [4]) = 0
stat("/etc/localtime", {st_dev=makedev(8, 1), st_ino=229001, st_mode=S_IFREG|0644, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096, st_blocks=8, st_size=265, st_atime=2013/07/15-06:30:03, st_mtime=2012/06/25-23:46:43, st_ctime=2012/06/25-23:46:43}) = 0
write(1, "[info 2013/07/16 05:53:24.622210"..., 114) = 114
setsockopt(7, SOL_SOCKET, SO_RCVBUF, [32120], 4) = 0
getsockopt(7, SOL_SOCKET, SO_RCVBUF, [30064835312], [4]) = 0
setsockopt(7, SOL_SOCKET, SO_RCVBUF, [64240], 4) = 0
getsockopt(7, SOL_SOCKET, SO_RCVBUF, [30064899552], [4]) = 0
fcntl(7, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(7, F_SETFL, O_RDWR|O_NONBLOCK) = 0
rt_sigaction(SIGPIPE, {SIG_IGN, [PIPE], SA_RESTORER|SA_RESTART, 0x33b3632920}, {SIG_DFL, [], 0}, 8) = 0
fcntl(7, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
fcntl(7, F_SETFL, O_RDWR|O_NONBLOCK) = 0
connect(7, {sa_family=AF_INET, sin_port=htons(50505), sin_addr=inet_addr("1.2.3.4")}, 16) = -1 EINPROGRESS (Operation now in progress)
poll([{fd=7, events=POLLIN|POLLOUT}], 1, 59000) = 1 ([{fd=7, revents=POLLOUT}])
fcntl(7, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
fcntl(7, F_SETFL, O_RDWR) = 0
getsockname(7, {sa_family=AF_INET, sin_port=htons(33220), sin_addr=inet_addr("10.112.204.215")}, [16]) = 0
fcntl(7, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(7, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(7, F_SETFL, O_RDWR|O_NONBLOCK) = 0
write(7, "d\23;\177\377\330\357\1&W\1\\\4\np\314\327\0\0\0\2W\0\rpnq-gst-"..., 103) = 103
fcntl(7, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
fcntl(7, F_SETFL, O_RDWR) = 0
fcntl(7, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(7, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(7, F_SETFL, O_RDWR|O_NONBLOCK) = 0
read(7, 0x9d9f90, 1) = -1 EAGAIN (Resource temporarily unavailable)
Why is this read being called? I assumed that poll should wake up only when there is data to read.

poll woke up with revents = POLLOUT, which means that the socket is ready to write, not ready to read. The code is apparently not checking this flag, and trying to read anyway.
This might be intentional. Even though poll didn't say the socket is ready to read, it might have become ready while it was writing. So it calls read just in case something has shown up. If not, it will go back into poll to wait again. This allows it to process incoming data more quickly, since it can get it in one call rather than two.

Related

"Device or resource busy" while adding vlan using vconfig

It has been two days and I still have no clue why I am getting the "vconfig: ioctl error for add: Device or resource busy" error while configuring a VLAN.
Checklist:
Port is up
Port is not part of any bridge.
Below is the strace of the command, but it does not give me many pointers for this problem:
# strace vconfig add port00 100
execve("/usr/sbin/vconfig", ["vconfig", "add", "port00", "100"], 0xffcb640c /* 7 vars */) = 0
readlinkat(AT_FDCWD, "/proc/self/exe", "/usr/bin/busybox", 4096) = 16
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|1<<MAP_HUGE_SHIFT, -1, 0) = 0xf77b0000
open("/lib//libc.so.0", O_RDONLY) = 7
fstat(7, {st_mode=S_IFREG|0775, st_size=653404, ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|1<<MAP_HUGE_SHIFT, -1, 0) = 0xf77af000
read(7, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0P\1\1\0004\0\0\0"..., 4096) = 4096
mmap2(NULL, 753664, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xf76f7000
mmap2(0xf76f7000, 647324, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED, 7, 0) = 0xf76f7000
mmap2(0xf7796000, 4990, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 7, 0x9e000) = 0xf7796000
mmap2(0xf7798000, 91824, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0xf7798000
close(7) = 0
munmap(0xf77af000, 4096) = 0
stat("/lib/ld-uClibc.so.0", {st_mode=S_IFREG|0775, st_size=25388, ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|1<<MAP_HUGE_SHIFT, -1, 0) = 0xf77af000
set_thread_area({entry_number=-1, base_addr=0xf77af690, limit=0x0fffff, seg_32bit=1, contents=0, read_exec_only=0, limit_in_pages=1, seg_not_present=0, useable=1}) = 0 (entry_number=12)
gettimeofday({tv_sec=1595835665, tv_usec=909210}, NULL) = 0
mprotect(0x80c4000, 4096, PROT_READ) = 0
mprotect(0xf7796000, 4096, PROT_READ) = 0
mprotect(0xf77b7000, 4096, PROT_READ) = 0
set_tid_address(0xf77af6f8) = 16103
set_robust_list(0xf77af6fc, 12) = 0
rt_sigaction(SIGRTMIN, {sa_handler=0xf775abdb, sa_mask=[], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0xf7708eed}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {sa_handler=0xf775aaf8, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART|SA_SIGINFO, sa_restorer=0xf7708eed}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
ugetrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM_INFINITY}) = 0
uname({sysname="Linux", nodename="EAGLE-ECE555F60000", ...}) = 0
ioctl(0, TCGETS, {B115200 opost isig icanon echo ...}) = 0
ioctl(1, TCGETS, {B115200 opost isig icanon echo ...}) = 0
getuid32() = 0
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 7
ioctl(7, SIOCSIFVLAN, 0xffb0ceb4) = -1 EBUSY (Device or resource busy)
write(2, "vconfig: ioctl error for add: De"..., 54vconfig: ioctl error for add: Device or resource busy
) = 54
exit_group(1) = ?
+++ exited with 1 +++
#
vconfig is a virtual ethernet configuration program. TCGETS (in the strace output) is a tty (teletype terminal) configuration request - i.e. serial port settings (hence the B115200, baud rate 115200, in your strace). In other words, you are trying to configure a serial port as a vlan port. Are you sure you want to be doing that? What is the device that port00 refers to?

Can't listen on IPv4 and IPv6 together (address already in use)

I don't think the precise code is important. Instead, I'll give the strace output:
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 5
fcntl(5, F_SETFL, O_WRONLY|O_NONBLOCK) = 0
epoll_ctl(3, EPOLL_CTL_ADD, 5, {EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, {u32=209357450, u64=94373525752458}}) = 0
setsockopt(5, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
bind(5, {sa_family=AF_INET, sin_port=htons(31337), sin_addr=inet_addr("0.0.0.0")}, 16) = 0
listen(5, 10) = 0
accept(5, 0x7f2a6aade440, [110]) = -1 EAGAIN (Resource temporarily unavailable)
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2265, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2265, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2265, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2265, ...}) = 0
write(1, "\33[1;30m09:44:34.625\33[0m\342\224\202\33[36m0"..., 130) = 130
socket(AF_INET6, SOCK_STREAM, IPPROTO_IP) = 6
fcntl(6, F_SETFL, O_WRONLY|O_NONBLOCK) = 0
epoll_ctl(3, EPOLL_CTL_ADD, 6, {EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, {u32=209357482, u64=94373525752490}}) = 0
setsockopt(6, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
bind(6, {sa_family=AF_INET6, sin6_port=htons(31337), inet_pton(AF_INET6, "::", &sin6_addr), sin6_flowinfo=htonl(0), sin6_scope_id=0}, 28) = -1 EADDRINUSE (Address already in use)
We can see that socket 5 (IPv4) successfully bound to port any:31337, but when I try to bind socket 6 (IPv6), it fails with EADDRINUSE.
You can also see that I did set SO_REUSEADDR on both sockets, so I believe this problem should not have happened.
$ lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 17.10
Release: 17.10
Codename: artful
You don't need to listen on both IPv4 and IPv6, as listening for IPv6 connections is sufficient. Incoming IPv4 connections will be handled by a socket listening for IPv6 connections. The client addresses might then show up in a format like ::FFFF:192.168.1.1.

strace reports bad file descriptor

Trying to debug my program which doesn't return to the bash prompt, I used strace and gave it the PID. The program is a binary file and I don't have the source code. According to the strace, there is a -1 EBADF (Bad file descriptor). However, I don't know which file is problematic.
As you can see below the strace exits, so running lsof -p <PID> has no result.
read(5, "80\0\0\0\00078", 8) = 8
read(5, "prf-exit\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 80) = 80
rt_sigprocmask(SIG_BLOCK, NULL, [], 8) = 0
fstat(1, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2af316b0f000
write(1, "\n", 1) = 1
read(5, "\0\0\0\0", 4) = 4
write(5, "\0\0\0\0", 4) = 4
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 9
setsockopt(9, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
setsockopt(9, SOL_TCP, TCP_NODELAY, [1], 4) = 0
setsockopt(9, SOL_SOCKET, SO_SNDBUF, [65536], 4) = 0
setsockopt(9, SOL_SOCKET, SO_RCVBUF, [65536], 4) = 0
fcntl(9, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(9, F_SETFL, O_RDWR) = 0
connect(9, {sa_family=AF_INET, sin_port=htons(45323), sin_addr=inet_addr("10.10.10.251")}, 16) = 0
write(9, "12345\0", 6) = 6
write(9, "15 NORMAL_EXITING\0", 19) = 19
read(9, "\0", 1) = 1
close(9) = 0
futex(0x2af31686d9d0, FUTEX_WAIT, 29590, NULL) = 0
futex(0x2af31666c9d0, FUTEX_WAIT, 29589, NULL) = 0
close(6) = 0
close(7) = 0
read(5, "\0\0\0\0", 4) = 4
write(5, "\0\0\0\0", 4) = 4
read(5, "\0\0\0\0", 4) = 4
write(5, "\0\0\0\0", 4) = 4
close(5) = 0
close(5) = -1 EBADF (Bad file descriptor)
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
connect(5, {sa_family=AF_INET, sin_port=htons(49986), sin_addr=inet_addr("172.20.54.10")}, 16) = 0
setsockopt(5, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
setsockopt(5, SOL_TCP, TCP_NODELAY, [1], 4) = 0
write(5, "\35\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\1\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0"..., 64) = 64
close(5) = 0
close(4) = 0
exit_group(0) = ?
Process 29588 detached
[root@compute-0-3 ~]# lsof -p 29588
[root@compute-0-3 ~]#
How can I search for the missing/erroneous file?
You're closing the same file descriptor twice:
close(5) = 0
close(5) = -1 EBADF (Bad file descriptor)
EBADF happens when a file descriptor number isn't mapped to a file. Thus, by definition, there is no file that's problematic.
Frankly, this isn't a bug, and certainly isn't the bug you're looking for. It's entirely common behavior to try to close FDs even if you're not sure they were open to begin with -- it's more efficient to do so than to try to check for whether a file descriptor is still open via other means and close it only conditionally.
Closing a file descriptor more than once is a very bad idea if your program is multi-threaded, because after you close it for the first time, another thread can call open() and be given the same file descriptor (but referring to a different file); at which point calling close for a second time in your thread will close the other thread's file. Always use something like:
close(x);
x = -1;
to make sure you don't inadvertently close a reused descriptor.

Program termination does not return back to bash

Recently, I asked a question but got downvotes because people thought it was not clear. However, I have found a hint which needs some digging...
There is a command line program called fluent. On a Rocks cluster, when I run it on the front-end and enter exit, it returns to the command prompt as expected.
5991 nodes, binary.
5991 node flags, binary.
Done.
> exit
mahmood@cluster:~$
However, when I run the same command (the application is on /export/, which is an NFS drive) on the compute node via ssh, it doesn't return to the command prompt.
5991 nodes, binary.
5991 node flags, binary.
Done.
> exit
^C^C^Z
[1]+ Stopped /share/apps/fluent/bin/fluent 3d -g -t4 -i elbow.journal
mahmood@compute-0-3:~$ pkill fluent*
mahmood@compute-0-3:~$ fg
/share/apps/fluent/bin/fluent 3d -g -t4 -i elbow.journal
Terminated
As suggested, I tried strace and attached it to the process multiple times, since the application runs on multiple cores. In one attempt, the application returned to the terminal. I noticed that in the last lines of the strace, there is a difference in the outcome of futex.
In the correct execution, I see:
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 12
setsockopt(12, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
setsockopt(12, SOL_TCP, TCP_NODELAY, [1], 4) = 0
setsockopt(12, SOL_SOCKET, SO_SNDBUF, [65536], 4) = 0
setsockopt(12, SOL_SOCKET, SO_RCVBUF, [65536], 4) = 0
fcntl(12, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(12, F_SETFL, O_RDWR) = 0
connect(12, {sa_family=AF_INET, sin_port=htons(45470), sin_addr=inet_addr("10.10.10.251")}, 16) = 0
write(12, "12345\0", 6) = 6
write(12, "15 NORMAL_EXITING\0", 19) = 19
read(12, "\0", 1) = 1
close(12) = 0
futex(0x2b66afe5d9d0, FUTEX_WAIT, 12432, NULL) = 0
futex(0x2b66afc5c9d0, FUTEX_WAIT, 12427, NULL) = 0
close(6) = 0
close(7) = 0
close(8) = 0
close(9) = 0
close(10) = 0
shmdt(0x2b66af7d8000) = 0
shmdt(0x2b66b0018000) = 0
shmdt(0x2b66af3a8000) = 0
shmdt(0x2b66af638000) = 0
shmdt(0x2b66af758000) = 0
shmdt(0x2b66aff78000) = 0
shmdt(0x2b66af6d8000) = 0
shmdt(0x2b66afed8000) = 0
close(4) = 0
close(5) = 0
exit_group(0) = ?
Process 12420 detached
and in the buggy run, I see:
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 9
setsockopt(9, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
setsockopt(9, SOL_TCP, TCP_NODELAY, [1], 4) = 0
setsockopt(9, SOL_SOCKET, SO_SNDBUF, [65536], 4) = 0
setsockopt(9, SOL_SOCKET, SO_RCVBUF, [65536], 4) = 0
fcntl(9, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(9, F_SETFL, O_RDWR) = 0
connect(9, {sa_family=AF_INET, sin_port=htons(50825), sin_addr=inet_addr("10.10.10.251")}, 16) = 0
write(9, "12345\0", 6) = 6
write(9, "15 NORMAL_EXITING\0", 19) = 19
read(9, "\0", 1) = 1
close(9) = 0
futex(0x2b74f03659d0, FUTEX_WAIT, 13135, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x2b74f01649d0, FUTEX_WAIT, 13132, NULL) = 0
close(6) = 0
close(7) = 0
shmdt(0x2b74efce0000) = 0
shmdt(0x2b74f03e0000) = 0
shmdt(0x2b74efbe0000) = 0
shmdt(0x2b74f0480000) = 0
shmdt(0x2b74ef8b0000) = 0
shmdt(0x2b74efb40000) = 0
shmdt(0x2b74efc60000) = 0
shmdt(0x2b74f0520000) = 0
close(4) = 0
close(5) = 0
exit_group(0) = ?
Process 13129 detached
As you can see, although both of them say exit_group(0), the latter says a resource is temporarily unavailable.
Any thought on that?

executing service (openoffice headless) in chroot is slow

I created an ubuntu chroot environment (using debootstrap) and started openoffice as a service to convert files using unoconv. It works, but conversions that take less than 1s outside the chroot jail take about 20s inside it. Stracing it, I can see that this extra time is spent in some socket operations whose timeout expires, the same operations that work well without a chroot environment. However, similar operations work fine.
Any suggestion?
starting server (chrooted):
soffice -invisible -headless -nologo -nodefault "-accept=socket,host=localhost,port=8301;urp;StarOffice.ComponentContext" -env:UserInstallation=ootest2
calling it (chrooted too):
strace unoconv -f pdf -p 8300 simple_test.docx
timeout trace:
socket(PF_INET, SOCK_DGRAM, IPPROTO_IP) = 4
connect(4, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
fcntl(4, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
poll([{fd=4, events=POLLOUT}], 1, 0) = 1 ([{fd=4, revents=POLLOUT}])
sendto(4, "M#\1\0\0\1\0\0\0\0\0\0\tlocalhost\6(none)\10mo"..., 47, MSG_NOSIGNAL, NULL, 0) = 47
poll([{fd=4, events=POLLIN}], 1, 5000) = 0 (Timeout)
close(4) = 0
similar trace without timeout (same unoconv operation):
socket(PF_INET, SOCK_DGRAM, IPPROTO_IP) = 4
connect(4, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
fcntl(4, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
poll([{fd=4, events=POLLOUT}], 1, 0) = 1 ([{fd=4, revents=POLLOUT}])
sendto(4, "M#\1\0\0\1\0\0\0\0\0\0\tlocalhost\6(none)\10mo"..., 47, MSG_NOSIGNAL, NULL, 0) = 47
poll([{fd=4, events=POLLIN}], 1, 5000) = 1 ([{fd=4, revents=POLLIN}])
ioctl(4, FIONREAD, [47]) = 0
recvfrom(4, "M#\201\202\0\1\0\0\0\0\0\0\tlocalhost\6(none)\10mo"..., 1024, 0, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("127.0.0.1")}, [16]) = 47
close(4) = 0
chroot details:
using dchroot to use it as a normal user
mounted dev, proc, etc as (fstab):
/tmp /var/local/chrootest/tmp
/dev /var/local/chrootest/dev
/sys /var/local/chrootest/sys
proc-chroot /var/local/chrootest/proc
devpts-chroot /var/local/chrootest/dev/pts
unoconv does some wack DNS lookups. I added this to my /etc/hosts:
127.0.0.1 localhost localhost.(none)
And it made things significantly better.

Resources