How to exit child fork process correctly if error encountered when connected to oracle database using perl - linux

I have recently been using perl (v5.10.1) on a Linux system to connect to a database and perform some tasks.
To do this more efficiently I have been using fork() to be able to perform the tasks in parallel. Whilst doing this I have noticed some problems if the child exits with some sort of error (killed by kill command, dies etc.)
I have searched the forum for possible explanations but have not found anything related to using fork() while connected to a database.
Below is my initial program structure. My actual code is more complex but this simplified code illustrates the idea.
use strict;
use warnings;
use utf8;
use APR::UUID ;
use DBI ;
use DBD::Oracle ;
use Data::Dumper;
$ENV{'ORACLE_HOME'} = "/home/data/ora11g2" ;
$ENV{'NLS_LANG'} = "french_france.AL32UTF8" ;
$ENV{'LANG'} = "fr_FR.utf-8" ;
my $IDJOB = APR::UUID->new->format ;
my $DB="DB_val";
my $SRV="SRV_val";
my %attr = (
PrintError => 1,
RaiseError => 0
);
my %attr_CHILD = (
PrintError => 1,
RaiseError => 0
);
my $db = DBI->connect("dbi:Oracle:$SRV/$DB", "user", "pword", \%attr ) or die "impossible de se connecter à $SRV / $DB";
$db->{AutoCommit} = 0 ;
$db->{InactiveDestroy} = 1; # This needs to be set to 1 if any parallel processing will be used.
# Otherwise database is disconnected in parent after children have finished.
my $Crash_Error_String='';
my #Res1;
eval{#Res1=Mainsub($db)};
#
$Crash_Error_String=$# unless #Res1 ;
$Res1[0] = 501 unless #Res1 ;
print "ERROR code:" . $Res1[0] . " (Error string:$Crash_Error_String)\n";
$db->commit if defined($db) ;
$db->disconnect if defined($db) ;
#
#
#
sub Mainsub{
my $db=shift;
#
my $Program_Termination_Code=0;
my #Results=(0,0,0,0);
my $Processes_To_Use_After_Calc=4;
my $fh1PR_E_filename_STEM="/tmp/error_file_Test_parallel_rows_" . $IDJOB . "_";
my $forked = 0;
my $err = 0;
my #child_pids_rows;
my #child_ispawns_rows;
my $start = time;
for my $ispawn (1 .. $Processes_To_Use_After_Calc){
my $ispawn_XML=$ispawn-1;
my $child_pid = fork();
if(!defined $child_pid){
$err++
} else {
push #child_pids_rows,$child_pid;
push #child_ispawns_rows,$ispawn;
}
if(defined $child_pid && $child_pid > 0) {
## Parent
$forked++;
} elsif(defined $child_pid){
my $db_child;
my $fh1PR_E_filename=$fh1PR_E_filename_STEM . $ispawn . ".err";
#$SIG{__DIE__} = $SIG{TERM} = $SIG{INT} = sub {
# my $ERROR_Val=$!;
# open(my $fh1PR_E, '>:encoding(UTF-8)', $fh1PR_E_filename);
# print $fh1PR_E "Caught an errorsignal: $ERROR_Val (child $ispawn)";
# close $fh1PR_E;
# $db_child->commit unless $db_child->{AutoCommit};
# $db_child->disconnect if defined($db_child);
# exit;
#};
my $ERROR_Code_child=0;
$db_child = DBI->connect("dbi:Oracle:$SRV/$DB", "user", "pword", \%attr_CHILD ) or die "impossible de se connecter à $SRV / $DB";
$db_child->{AutoCommit} = 0 ;
$db_child->commit unless $db_child->{AutoCommit} ;
#
#
#
#my $ased=4/0 if $ispawn==2 || $ispawn==1;
$db_child->commit unless $db_child->{AutoCommit} ;
$db_child->disconnect if defined($db_child) ;
exit;
} else {
## unable to fork
$err++;
}
}
my $Total_Children_Errors=0;
my $Total_Children_Exited=0;
my $Error_Messages="";
while (scalar #child_pids_rows) {
my $pid = $child_pids_rows[0];
my $ispawn=$child_ispawns_rows[0];
my $kid = waitpid $pid, 0;
my $ERROR_Count=0;
if($kid > 0){
my ($rc, $sig, $core) = ($? >> 8, $? & 127, $? & 128);
if ($core){
$ERROR_Count++;
$Total_Children_Errors++;
$Error_Messages eq "" ? $Error_Messages="$pid dumped core" : $Error_Messages=$Error_Messages . "\n" . "$pid dumped core";
} elsif($sig == 9){
$ERROR_Count++;
$Total_Children_Errors++;
$Error_Messages eq "" ? $Error_Messages="$pid was murdered!" : $Error_Messages=$Error_Messages . "\n" . "$pid was murdered!";
} else {
print "$pid returned $rc";
print ($sig?" after receiving signal $sig":"\n");
my $fname=$fh1PR_E_filename_STEM . $ispawn . ".err";
if(-f "$fname"){
$Total_Children_Errors++;
$ERROR_Count++;
if($Error_Messages eq ""){
$Error_Messages="Error found in parallel row process $ispawn (see file " . $fh1PR_E_filename_STEM . $ispawn . ".err for details)";
} else {
$Error_Messages=$Error_Messages . "\n" . "Error found in parallel row process $ispawn (see file " .
$fh1PR_E_filename_STEM . $ispawn . ".err for details)";
}
}
}
} else {
$ERROR_Count++;
$Total_Children_Errors++;
$Error_Messages eq "" ? $Error_Messages="$pid... um... disappeared..." : $Error_Messages=$Error_Messages . "\n" . "$pid... um... disappeared...";
}
$Total_Children_Exited++;
if($ERROR_Count==0){
print "Child $pid exited successfully (" . eval($forked-$Total_Children_Exited) . " of " . $forked . " Children left)\n";
} else {
print "Child $pid exited with ERROR! (" . eval($forked-$Total_Children_Exited) . " of " . $forked . " Children left)\n";
}
shift #child_pids_rows;
shift #child_ispawns_rows;
}
#print "Total child errors:$Total_Children_Errors\n";
if($Total_Children_Errors>0){
$Program_Termination_Code=915;
print $Error_Messages . "\n";
#Results=($Program_Termination_Code,0,0);
goto END101;
} else {
if($err > 0){
$Program_Termination_Code=919;
#Results=($Program_Termination_Code,0,0);
goto END101;
} else {
print "ALL Child processes terminated correctly (Parallel Rows)!\n";
}
}
END101:
return #Results;
}
Running this code produces the output:
27713 returned 0
Child 27713 exited successfully (3 of 4 Children left)
27714 returned 0
Child 27714 exited successfully (2 of 4 Children left)
27715 returned 0
Child 27715 exited successfully (1 of 4 Children left)
27716 returned 0
Child 27716 exited successfully (0 of 4 Children left)
ALL Child processes terminated correctly (Parallel Rows)!
ERROR code:0 (Error string:)
So far no problems. However, now I introduce a deliberate division by zero error in the child process by uncommenting the line (see original code above)
my $ased=4/0 if $ispawn==2 || $ispawn==1;
Now I get the output
ERROR code:501 (Error string:Illegal division by zero at /home/public/AGO/testcode/BArt_F/perl/DB_forking_with_errors_test_code_1.pl line 83.)
ERROR code:501 (Error string:Illegal division by zero at /home/public/AGO/testcode/BArt_F/perl/DB_forking_with_errors_test_code_1.pl line 83.)
30744 returned 0
Child 30744 exited successfully (3 of 4 Children left)
30745 returned 0
Child 30745 exited successfully (2 of 4 Children left)
30746 returned 0
Child 30746 exited successfully (1 of 4 Children left)
30747 returned 0
Child 30747 exited successfully (0 of 4 Children left)
ALL Child processes terminated correctly (Parallel Rows)!
ERROR code:0 (Error string:)
DBD::Oracle::db commit failed: ORA-03113: fin de fichier sur canal de communication
ID de processus : 22739
ID de session : 1, Numéro de série : 54727 (DBD ERROR: OCITransCommit) at /home/public/AGO/testcode/BArt_F/perl/DB_forking_with_errors_test_code_1.pl line 35.
Here I have lost the connection to the database in the parent and the code does not terminate correctly!
Finally, to sort this out, I uncomment the code in the child process (see original code above):
$SIG{__DIE__} = $SIG{TERM} = $SIG{INT} = sub {
my $ERROR_Val=$!;
open(my $fh1PR_E, '>:encoding(UTF-8)', $fh1PR_E_filename);
print $fh1PR_E "Caught an errorsignal: $ERROR_Val (child $ispawn)";
close $fh1PR_E;
$db_child->commit unless $db_child->{AutoCommit};
$db_child->disconnect if defined($db_child);
exit;
};
Now running the code I get:
946 returned 0
Child 946 exited with ERROR! (3 of 4 Children left)
947 returned 0
Child 947 exited with ERROR! (2 of 4 Children left)
948 returned 0
Child 948 exited successfully (1 of 4 Children left)
949 returned 0
Child 949 exited successfully (0 of 4 Children left)
Error found in parallel row process 1 (see file /tmp/error_file_Test_parallel_rows_53a6e838-def0-11eb-b482-8f8e0f0aecb2_1.err for details)
Error found in parallel row process 2 (see file /tmp/error_file_Test_parallel_rows_53a6e838-def0-11eb-b482-8f8e0f0aecb2_2.err for details)
ERROR code:915 (Error string:)
Now the error is trapped and the parent exits correctly.
All of this seems fine until I have read (https://www.perlmonks.org/?node_id=1173708) that I should not use
$SIG{__DIE__}
However I cannot find any alternative method that allows my parent program to exit correctly if any of the child processes die.
Could anyone tell me if there is an alternative method to using
$SIG{__DIE__}

Related

ssh port forwarding ("ssh -fNL") doesn't work via expect spawn to automatically provide password

I know that to do port forwarding, the command is ssh -L. I also use other options to decorate it. So for example, a final full command may look like this ssh -fCNL *:10000:127.0.0.1:10001 127.0.0.1. And everything just works after entering password.
Then, because there is not only one port need to be forwarded, I decide to leave the job to shell script and use expect(tcl) to provide passwords(all the same).
Although without a deep understanding of expect, I managed to write the code with the help of Internet. The script succeeds spawning ssh and provides correct password. But I end up finding there is no such process when I try to check using ps -ef | grep ssh and netstat -anp | grep 10000.
I give -v option to ssh and the output seems to be fine.
So where is the problem? I have searched through Internet but most of questions are not about port forwarding. I'm not sure whether it is proper to use expect while I just want to let script automatically provide password.
Here the script.
#!/bin/sh
# Port Forwarding
# set -x
## function definition
connection ()
{
ps -ef | grep -v grep | grep ssh | grep $1 | grep $2 > /dev/null
if [ $? -eq 0 ] ; then
echo "forward $1 -> $2 done"
exit 0
fi
# ssh-keygen -f "$HOME/.ssh/known_hosts" -R "127.0.0.1"
/usr/bin/expect << EOF
set timeout 30
spawn /usr/bin/ssh -v -fCNL *:$1:127.0.0.1:$2 127.0.0.1
expect {
"yes/no" {send "yes\r" ; exp_continue}
"password:" {send "1234567\r" ; exp_continue}
eof
}
catch wait result
exit [lindex \$result 3]
EOF
echo "expect ssh return $?"
echo "forward $1 -> $2 done"
}
## check expect available
which expect > /dev/null
if [ $? -ne 0 ] ; then
echo "command expect not available"
exit 1
fi
login_port="10000"
forward_port="10001"
## check whether the number of elements is equal
login_port_num=$(echo ${login_port} | wc -w)
forward_port_num=$(echo ${forward_port} | wc -w)
if [ ${login_port_num} -ne ${forward_port_num} ] ; then
echo "The numbers of login ports and forward ports are not equal"
exit 1
fi
port_num=${login_port_num}
## provide pair of arguments to ssh main function
index=1
while [ ${index} -le ${port_num} ] ; do
login_p=$(echo ${login_port} | awk '{print $'$index'}')
forward_p=$(echo ${forward_port} | awk '{print $'$index'}')
connection ${login_p} ${forward_p}
index=$((index + 1))
done
Here the output from script
spawn /usr/bin/ssh -v -fCNL *:10000:127.0.0.1:10001 127.0.0.1
OpenSSH_7.2p2 Ubuntu-4ubuntu2.10, OpenSSL 1.0.2g 1 Mar 2016
...
debug1: Next authentication method: password
wang#127.0.0.1's password:
debug1: Enabling compression at level 6.
debug1: Authentication succeeded (password).
Authenticated to 127.0.0.1 ([127.0.0.1]:22).
debug1: Local connections to *:10000 forwarded to remote address 127.0.0.1:10001
debug1: Local forwarding listening on 0.0.0.0 port 10000.
debug1: channel 0: new [port listener]
debug1: Local forwarding listening on :: port 10000.
debug1: channel 1: new [port listener]
debug1: Requesting no-more-sessions#openssh.com
debug1: forking to background
expect ssh return 0
forward 10000 -> 10001 done
This should work for you:
spawn -ignore SIGHUP ssh -f ...
UPDATE:
Another workaround is:
spawn bash -c "ssh -f ...; sleep 1"
UPDATE 2 (a bit explanation):
ssh -f calls daemon() to make itself a daemon. See ssh.c in the souce code:
/* Do fork() after authentication. Used by "ssh -f" */
static void
fork_postauth(void)
{
if (need_controlpersist_detach)
control_persist_detach();
debug("forking to background");
fork_after_authentication_flag = 0;
if (daemon(1, 1) == -1)
fatal("daemon() failed: %.200s", strerror(errno));
}
daemon() is implemented like this:
int
daemon(int nochdir, int noclose)
{
int fd;
switch (fork()) {
case -1:
return (-1);
case 0:
break;
default:
_exit(0);
}
if (setsid() == -1)
return (-1);
if (!nochdir)
(void)chdir("/");
if (!noclose && (fd = open(_PATH_DEVNULL, O_RDWR, 0)) != -1) {
(void)dup2(fd, STDIN_FILENO);
(void)dup2(fd, STDOUT_FILENO);
(void)dup2(fd, STDERR_FILENO);
if (fd > 2)
(void)close (fd);
}
return (0);
}
There's a race condition (not sure if its the correct term for here) between _exit() in the parent process and setsid() in the child process. Here _exit() would always complete first since "the function _exit() terminates the calling process immediately" and setsid() is much more heavy weight. So when the parent process exits, setsid() is not effective yet and the child process is still in the same session as the parent process. According to the apue book (I'm referring to the 2005 edition, Chapter 10: Signals), SIGHUP "is also generated if the session leader terminates. In this case, the signal is sent to each process in the foreground process group."
In brief:
Expect allocates a pty and runs ssh on the pty. Here, ssh would be running in a new session and be the session leader.
ssh -f calls daemon(). The parent process (session leader) calls _exit(). At this time, the child process is still in the session so it'll get SIGHUP whose default behavior is to terminate the process.
How the workarounds works:
The nohup way (spawn -ignore SIGHUP) is to explicitly ask the process to ignore SIGHUP so it'll not be terminated.
For bash -c 'sshh -f ...; sleep 1', bash would be the session leader and sleep 1 in the end prevents the session leader from exiting too soon. So after sleep 1, the child ssh process's setsid() has already done and child ssh is already in a new process session.
UPDATE 3:
You can compile ssh with the following modification (in ssh.c) and verify:
static int
my_daemon(int nochdir, int noclose)
{
int fd;
switch (fork()) {
case -1:
return (-1);
case 0:
break;
default:
// wait a while for child's setsid() to complete
sleep(1);
// ^^^^^^^^
_exit(0);
}
if (setsid() == -1)
return (-1);
if (!nochdir)
(void)chdir("/");
if (!noclose && (fd = open(_PATH_DEVNULL, O_RDWR, 0)) != -1) {
(void)dup2(fd, STDIN_FILENO);
(void)dup2(fd, STDOUT_FILENO);
(void)dup2(fd, STDERR_FILENO);
if (fd > 2)
(void)close (fd);
}
return (0);
}
/* Do fork() after authentication. Used by "ssh -f" */
static void
fork_postauth(void)
{
if (need_controlpersist_detach)
control_persist_detach();
debug("forking to background");
fork_after_authentication_flag = 0;
if (my_daemon(1, 1) == -1)
// ^^^^^^^^^
fatal("my_daemon() failed: %.200s", strerror(errno));
}

Run command in golang and detach it from process

Problem:
I'm writing program in golang on linux that needs to execute long running process so that:
I redirect stdout of running process to file.
I control the user of process.
Process doesn't die when my program exits.
The process doesn't become a zombie when it crashes.
I get PID of running process.
I'm running my program with root permissions.
Attempted solution:
func Run(pathToBin string, args []string, uid uint32, stdLogFile *os.File) (int, error) {
cmd := exec.Command(pathToBin, args...)
cmd.SysProcAttr = &syscall.SysProcAttr{
Credential: &syscall.Credential{
Uid: uid,
},
}
cmd.Stdout = stdLogFile
if err := cmd.Start(); err != nil {
return -1, err
}
go func() {
cmd.Wait() //Wait is necessary so cmd doesn't become a zombie
}()
return cmd.Process.Pid, nil
}
This solution seems to satisfy almost all of my requirements except that when I send SIGTERM/SIGKILL to my program the underlying process crashes. In fact I want my background process to be as separate as possible: it has different parent pid, group pid etc. from my program. I want to run it as daemon.
Other solutions on stackoverflow suggested to use cmd.Process.Release() for similar use cases, but it doesn't seem to work.
Solutions which are not applicable in my case:
I have no control over code of process I'm running. My solution has to work for any process.
I can't use external commands to run it, just pure go. So using systemd or something similar is not applicable.
I can in fact use library that is easily importable using import from github etc.
TLDR;
Just use https://github.com/hashicorp/go-reap
There is a great Russian expression which reads "don't try to give birth to a bicycle" and it means don't reinvent the wheel and keep it simple. I think it applies here. If I were you, I'd reconsider using one of:
https://github.com/krallin/tini
https://busybox.net/
https://software.clapper.org/daemonize/
https://wiki.gentoo.org/wiki/OpenRC
https://www.freedesktop.org/wiki/Software/systemd/
This issue has already been solved ;)
Your question is imprecise or you are asking for non-standard features.
In fact I want my background process to be as separate as possible: it has different parent pid, group pid etc. from my program. I want to run it as daemon.
That is not how process inheritance works. You can not have process A start Process B and somehow change the parent of B to C. To the best of my knowledge this is not possible in Linux.
In other words, if process A (pid 55) starts process B (100), then B must have parent pid 55.
The only way to avoid that is have something else start the B process such as atd, crond, or something else - which is not what you are asking for.
If parent 55 dies, then PID 1 will be the parent of 100, not some arbitrary process.
Your statement "it has different parent pid" does not makes sense.
I want to run it as daemon.
That's excellent. However, in a GNU / Linux system, all daemons have a parent pid and those parents have a parent pid going all the way up to pid 1, strictly according to the parent -> child rule.
when I send SIGTERM/SIGKILL to my program the underlying process crashes.
I can not reproduce that behavior. See case8 and case7 from the proof-of-concept repo.
make case8
export NOSIGN=1; make build case7
unset NOSIGN; make build case7
$ make case8
{ sleep 6 && killall -s KILL zignal; } &
./bin/ctrl-c &
sleep 2; killall -s TERM ctrl-c
kill with:
{ pidof ctrl-c; pidof signal ; } | xargs -r -t kill -9
main() 2476074
bashed 2476083 (2476081)
bashed 2476084 (2476081)
bashed 2476085 (2476081)
zignal 2476088 (2476090)
go main() got 23 urgent I/O condition
go main() got 23 urgent I/O condition
zignal 2476098 (2476097)
go main() got 23 urgent I/O condition
zignal 2476108 (2476099)
main() wait...
p 2476088
p 2476098
p 2476108
p 2476088
go main() got 15 terminated
sleep 1; killall -s TERM ctrl-c
p 2476098
p 2476108
p 2476088
go main() got 15 terminated
sleep 1; killall -s TERM ctrl-c
p 2476098
p 2476108
p 2476088
Bash c 2476085 EXITs ELAPSED 4
go main() got 17 child exited
go main() got 23 urgent I/O condition
main() children done: 1 %!s(<nil>)
main() wait...
go main() got 15 terminated
go main() got 23 urgent I/O condition
sleep 1; killall -s KILL ctrl-c
p 2476098
p 2476108
p 2476088
balmora: ~/src/my/go/doodles/sub-process [main]
$ p 2476098
p 2476108
Bash _ 2476083 EXITs ELAPSED 6
Bash q 2476084 EXITs ELAPSED 8
The bash processes keep running after the parent is killed.
killall -s KILL ctrl-c;
All 3 "zignal" sub-processes are running until killed by
killall -s KILL zignal;
In both cases the sub-processes continue to run despite main process being signaled with TERM, HUP, INT. This behavior is different in a shell environment because of convenience reasons. See the related questions about signals. This particular answer illustrates a key difference for SIGINT. Note that SIGSTOP and SIGKILL cannot be caught by an application.
It was necessary to clarify the above before proceeding with the other parts of the question.
So far you have already solved the following:
redirect stdout of sub-process to a file
set owner UID of sub-process
sub-process survives death of parent (my program exits)
the PID of sub-process can be seen by the main program
The next one depends on whether the children are "attached" to a shell or not
sub-process survives the parent being killed
The last one is hard to reproduce, but I have heard about this problem in the docker world, so the rest of this answer is focused on addressing this issue.
sub-process survives if the parent crashes and does not become a zombie
As you have noted, the Cmd.Wait() is necessary to avoid creating zombies. After some experimentation I was able to consistency produce zombies in a docker environment using an intentionally simple replacement for /bin/sh. This "shell" implemented in go will only run a single command and not much else in terms of reaping children. You can study the code over at github.
The zombie solution
the simple wrapper which causes zombies
package main
func main() {
Sh()
}
The reaper wrapper
package main
import (
"fmt"
"sync"
"github.com/fatih/color"
"github.com/hashicorp/go-reap"
)
func main() {
if reap.IsSupported() {
done := make(chan struct{})
var reapLock sync.RWMutex
pids := make(reap.PidCh, 1)
errors := make(reap.ErrorCh, 1)
go reap.ReapChildren(pids, errors, done, &reapLock)
go report(pids, errors, done)
Sh()
close(done)
} else {
fmt.Println("Sorry, go-reap isn't supported on your platform.")
}
}
func report(pids reap.PidCh, errors reap.ErrorCh, done chan struct{}) {
sprintf := color.New(color.FgWhite, color.Bold).SprintfFunc()
for ;; {
select {
case pid := <-pids:
println(sprintf("raeper pid %d", pid))
case err := <-errors:
println(sprintf("raeper er %s", err))
case <-done:
return
}
}
}
The init / sh (pid 1) process which runs other commands
package main
import (
"os"
"os/exec"
"strings"
"time"
"github.com/google/shlex"
"github.com/tox2ik/go-poc-reaper/fn"
)
func Sh() {
args := os.Args[1:]
script := args[0:0]
if len(args) >= 1 {
if args[0] == "-c" {
script = args[1:]
}
}
if len(script) == 0 {
fn.CyanBold("cmd: expecting sh -c 'foobar'")
os.Exit(111)
}
var cmd *exec.Cmd
parts, _ := shlex.Split(strings.Join(script, " "))
if len(parts) >= 2 {
cmd = fn.Merge(exec.Command(parts[0], parts[1:]...), nil)
}
if len(parts) == 1 {
cmd = fn.Merge(exec.Command(parts[0]), nil)
}
if fn.IfEnv("HANG") {
fn.CyanBold("cmd: %v\n start", parts)
ex := cmd.Start()
if ex != nil {
fn.CyanBold("cmd %v err: %s", parts, ex)
}
go func() {
time.Sleep(time.Millisecond * 100)
errw := cmd.Wait()
if errw != nil {
fn.CyanBold("cmd %v err: %s", parts, errw)
} else {
fn.CyanBold("cmd %v all done.", parts)
}
}()
fn.CyanBold("cmd: %v\n dispatched, hanging forever (i.e. to keep docker running)", parts)
for {
time.Sleep(time.Millisecond * time.Duration(fn.EnvInt("HANG", 2888)))
fn.SystemCyan("/bin/ps", "-e", "-o", "stat,comm,user,etime,pid,ppid")
}
} else {
if fn.IfEnv("NOWAIT") {
ex := cmd.Start()
if ex != nil {
fn.CyanBold("cmd %v start err: %s", parts, ex)
}
} else {
ex := cmd.Run()
if ex != nil {
fn.CyanBold("cmd %v run err: %s", parts, ex)
}
}
fn.CyanBold("cmd %v\n dispatched, exit docker.", parts)
}
}
The Dockerfile
FROM scratch
# for sh.go
ENV HANG ""
# for sub-process.go
ENV ABORT ""
ENV CRASH ""
ENV KILL ""
# for ctrl-c.go, signal.go
ENV NOSIGN ""
COPY bin/sh /bin/sh ## <---- wrapped or simple /bin/sh or "init"
COPY bin/sub-process /bin/sub-process
COPY bin/zleep /bin/zleep
COPY bin/fork-if /bin/fork-if
COPY --from=busybox:latest /bin/find /bin/find
COPY --from=busybox:latest /bin/ls /bin/ls
COPY --from=busybox:latest /bin/ps /bin/ps
COPY --from=busybox:latest /bin/killall /bin/killall
Remaining code / setup can be seen here:
https://github.com/tox2ik/go-poc-reaper
Case 5 (simple /bin/sh)
The gist of it is we start two sub-processes from go, using the "parent" sub-process binary. The first child is zleep and the second fork-if. The second one starts a "daemon" that runs a forever-loop in addition to a few short-lived threads. After a while, we kill the sub-procss parent, forcing sh to take over the parenting for these children.
Since this simple implementation of sh does not know how to deal with abandoned children, the children become zombies.
This is standard behavior. To avoid this, init systems are usually responsible for cleaning up any such children.
Check out this repo and run the cases:
$ make prep build
$ make prep build2
The first one will use the simple /bin/sh in the docker container, and the socond one will use the same code wrapped in a reaper.
With zombies:
$ make prep build case5
(…)
main() Daemon away! 16 (/bin/zleep)
main() Daemon away! 22 (/bin/fork-if)
(…)
main() CRASH imminent
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x0 pc=0x49e45c]
goroutine 1 [running]:
main.main()
/home/jaroslav/src/my/go/doodles/sub-process/sub-process.go:137 +0xfc
cmd [/bin/sub-process /log/case5 3 /bin/zleep 111 2 -- /dev/stderr 3 /bin/fork-if --] err: exit status 2
Child '1' done
thread done
STAT COMMAND USER ELAPSED PID PPID
R sh 0 0:02 1 0
S zleep 3 0:02 16 1
Z fork-if 3 0:02 22 1
R fork-child-A 3 0:02 25 1
R fork-child-B 3 0:02 26 25
S fork-child-C 3 0:02 27 26
S fork-daemon 3 0:02 28 27
R ps 0 0:01 30 1
Child '2' done
thread done
daemon
(…)
STAT COMMAND USER ELAPSED PID PPID
R sh 0 0:04 1 0
Z zleep 3 0:04 16 1
Z fork-if 3 0:04 22 1
Z fork-child-A 3 0:04 25 1
R fork-child-B 3 0:04 26 1
S fork-child-C 3 0:04 27 26
S fork-daemon 3 0:04 28 27
R ps 0 0:01 33 1
(…)
With reaper:
$ make -C ~/src/my/go/doodles/sub-process case5
(…)
main() CRASH imminent
(…)
Child '1' done
thread done
raeper pid 24
STAT COMMAND USER ELAPSED PID PPID
S sh 0 0:02 1 0
S zleep 3 0:01 18 1
R fork-child-A 3 0:01 27 1
R fork-child-B 3 0:01 28 27
S fork-child-C 3 0:01 30 28
S fork-daemon 3 0:01 31 30
R ps 0 0:01 32 1
Child '2' done
thread done
raeper pid 27
daemon
STAT COMMAND USER ELAPSED PID PPID
S sh 0 0:03 1 0
S zleep 3 0:02 18 1
R fork-child-B 3 0:02 28 1
S fork-child-C 3 0:02 30 28
S fork-daemon 3 0:02 31 30
R ps 0 0:01 33 1
STAT COMMAND USER ELAPSED PID PPID
S sh 0 0:03 1 0
S zleep 3 0:02 18 1
R fork-child-B 3 0:02 28 1
S fork-child-C 3 0:02 30 28
S fork-daemon 3 0:02 31 30
R ps 0 0:01 34 1
raeper pid 18
daemon
STAT COMMAND USER ELAPSED PID PPID
S sh 0 0:04 1 0
R fork-child-B 3 0:03 28 1
S fork-child-C 3 0:03 30 28
S fork-daemon 3 0:03 31 30
R ps 0 0:01 35 1
(…)
Here is a picture of the same output, which may be less confusing to read.
Zombies
Reaper
How to run the cases in the poc repo
Get the code
git clone https://github.com/tox2ik/go-poc-reaper.git
One terminal:
make tail-cases
Another terminal
make prep
make build
or make build2
make case0 case1
...
Related questions:
go
How to create a daemon process in Golang?
How to start a Go program as a daemon in Ubuntu?
how to keep subprocess running after program exit in golang?
Prevent Ctrl+C from interrupting exec.Command in Golang
signals
https://unix.stackexchange.com/questions/386999/what-terminal-related-signals-are-sent-to-the-child-processes-of-the-shell-direc
https://unix.stackexchange.com/questions/6332/what-causes-various-signals-to-be-sent
https://en.wikipedia.org/wiki/Signal_(IPC)#List_of_signals
Related discussions:
https://github.com/golang/go/issues/227
https://blog.phusion.nl/2015/01/20/docker-and-the-pid-1-zombie-reaping-problem/
Relevant projects:
http://software.clapper.org/daemonize/ (what I would use)
https://github.com/hashicorp/go-reap (if you must have run go on pid 1)
https://github.com/sevlyar/go-daemon (mimics posix fork)
Relevant prose:
A zombie process is a process whose execution is completed but it still has an entry in the process table. Zombie processes usually occur for child processes, as the parent process still needs to read its child’s exit status. Once this is done using the wait system call, the zombie process is eliminated from the process table. This is known as reaping the zombie process.
from https://www.tutorialspoint.com/what-is-zombie-process-in-linux

Who runs first in fork, with contradicting results

I have this simple test:
int main() {
int res = fork();
if (res == 0) { // child
printf("Son running now, pid = %d\n", getpid());
}
else { // parent
printf("Parent running now, pid = %d\n", getpid());
wait(NULL);
}
return 0;
}
When I run it a hundred times, i.e. run this command,
for ((i=0;i<100;i++)); do echo ${i}:; ./test; done
I get:
0:
Parent running now, pid = 1775
Son running now, pid = 1776
1:
Parent running now, pid = 1777
Son running now, pid = 1778
2:
Parent running now, pid = 1779
Son running now, pid = 1780
and so on; whereas when I first write to a file and then read the file, i.e. run this command,
for ((i=0;i<100;i++)); do echo ${i}:; ./test; done > forout
cat forout
I get it flipped! That is,
0:
Son running now, pid = 1776
Parent running now, pid = 1775
1:
Son running now, pid = 1778
Parent running now, pid = 1777
2:
Son running now, pid = 1780
Parent running now, pid = 1779
I know about the scheduler. What does this result not mean, in terms of who runs first after forking?
The forking function, do_fork() (at kernel/fork.c) ends with setting the need_resched flag to 1, with the comment by kernel developers saying, "let the child process run first."
I guessed that this has something to do with the buffers that the printf writes to.
Also, is it true to say that the input redirection (>) writes everything to a buffer first and only then copies to the file? And even so, why would this change the order of the prints?
Note: I am running the test on a single-core virtual machine with a Linux kernel v2.4.14.
Thank you for your time.
When you redirect, glibc detects that stdout is not tty turns on output buffering for efficiency. The buffer is therefore not written until the process exits. You can see this with e.g.:
int main() {
printf("hello world\n");
sleep(60);
}
When you run it interactively, it prints "hello world" and waits. When you redirect to a file, you will see that nothing is written for 60 seconds:
$ ./foo > file & tail -f file
(no output for 60 seconds)
Since your parent process waits for the child, it will necessarily always exit last, and therefore flush its output last.

gdb stops in a command file if there is an error. How to continue despite the error?

I my real gdb script while analyzing a core file I try to dereference a pointer and get "Error in sourced command file: Cannot access memory at address " and then my gdb script stops. What I want is just to go on executing my gdb script without stopping. Is it possible?
This is a test program and a test gdb script that demonstrates my problem. In this situation the pointer has NULL value but in a real situation the pointer will like have not null invalid value.
This is test C program:
#include <stdio.h>
struct my_struct {
int v1;
int v2;
};
int main()
{
my_struct *p;
printf("%d %d\n", p->v1, p->v2);
return 0;
}
This is a test gdb script:
>cat analyze.gdb
p p->v1
q
And this is demonstration of the problem (what I want from gdb here is to get this error message and then go process quit command):
>gdb -silent a.out ./core.22384 -x ./analyze.gdb
Reading symbols from /a.out...done.
[New Thread 22384]
Core was generated by `./a.out'.
Program terminated with signal 11, Segmentation fault.
#0 0x0000000000400598 in main () at main.cpp:11
11 printf("%d %d\n", p->v1, p->v2);
./analyze.gdb:1: Error in sourced command file:
Cannot access memory at address 0x0
Missing separate debuginfos, use: debuginfo-install glibc-2.12-1.80.el6.x86_64
Update
Thanks to Tom. This is a gdb script that handles this problem:
>cat ./analyze.v2.gdb
python
def my_ignore_errors(arg):
try:
gdb.execute("print \"" + "Executing command: " + arg + "\"")
gdb.execute (arg)
except:
gdb.execute("print \"" + "ERROR: " + arg + "\"")
pass
my_ignore_errors("p p")
my_ignore_errors("p p->v1")
gdb.execute("quit")
This is how it works:
>gdb -silent ./a.out -x ./analyze.v2.gdb -c ./core.15045
Reading symbols from /import/home/a.out...done.
[New Thread 15045]
Core was generated by `./a.out'.
Program terminated with signal 11, Segmentation fault.
#0 0x0000000000400598 in main () at main.cpp:11
11 printf("%d %d\n", p->v1, p->v2);
$1 = "Executing command: p p"
$2 = (my_struct *) 0x0
$3 = "Executing command: p p->v1"
$4 = "ERROR: p p->v1"
$5 = "Executing command: quit"
gdb's command language doesn't have a way to ignore an error when processing a command.
This is easily done, though, if your gdb was built with the Python extension. Search for the "ignore-errors" script. With that, you can:
(gdb) ignore-errors print *foo
... and any errors from print will be shown but not abort the rest of your script.
You can also do this:
gdb a.out < analyze.v2.gdb
This will execute the commands in analyze.v2.gdb line by line, even if an error occurs.
If you just want to exit if any error occurs, you can use the -batch gdb option:
Run in batch mode. Exit with status 0 after processing all the command
files specified with ‘-x’ (and all commands from initialization files,
if not inhibited with ‘-n’). Exit with nonzero status if an error
occurs in executing the GDB commands in the command files. [...]

Capture both exit status and output from a system call in R

I've been playing a bit with system() and system2() for fun, and it struck me that I can save either the output or the exit status in an object. A toy example:
X <- system("ping google.com",intern=TRUE)
gives me the output, whereas
X <- system2("ping", "google.com")
gives me the exit status (1 in this case, google doesn't take ping). If I want both the output and the exit status, I have to do 2 system calls, which seems a bit overkill. How can I get both with using only one system call?
EDIT : I'd like to have both in the console, if possible without going over a temporary file by using stdout="somefile.ext" in the system2 call and subsequently reading it in.
As of R 2.15, system2 will give the return value as an attribute when stdout and/or stderr are TRUE. This makes it easy to get the text output and return value.
In this example, ret ends up being a string with an attribute "status":
> ret <- system2("ls","xx", stdout=TRUE, stderr=TRUE)
Warning message:
running command ''ls' xx 2>&1' had status 1
> ret
[1] "ls: xx: No such file or directory"
attr(,"status")
[1] 1
> attr(ret, "status")
[1] 1
I am a bit confused by your description of system2, because it has stdout and stderr arguments. So it is able to return both exit status, stdout and stderr.
> out <- tryCatch(ex <- system2("ls","xx", stdout=TRUE, stderr=TRUE), warning=function(w){w})
> out
<simpleWarning: running command ''ls' xx 2>&1' had status 2>
> ex
[1] "ls: cannot access xx: No such file or directory"
> out <- tryCatch(ex <- system2("ls","-l", stdout=TRUE, stderr=TRUE), warning=function(w){w})
> out
[listing snipped]
> ex
[listing snipped]
I suggest using this function here:
robust.system <- function (cmd) {
stderrFile = tempfile(pattern="R_robust.system_stderr", fileext=as.character(Sys.getpid()))
stdoutFile = tempfile(pattern="R_robust.system_stdout", fileext=as.character(Sys.getpid()))
retval = list()
retval$exitStatus = system(paste0(cmd, " 2> ", shQuote(stderrFile), " > ", shQuote(stdoutFile)))
retval$stdout = readLines(stdoutFile)
retval$stderr = readLines(stderrFile)
unlink(c(stdoutFile, stderrFile))
return(retval)
}
This will only work on a Unix-like shell that accepts > and 2> notations, and the cmd argument should not redirect output itself. But it does the trick:
> robust.system("ls -la")
$exitStatus
[1] 0
$stdout
[1] "total 160"
[2] "drwxr-xr-x 14 asieira staff 476 10 Jun 18:18 ."
[3] "drwxr-xr-x 12 asieira staff 408 9 Jun 20:13 .."
[4] "-rw-r--r-- 1 asieira staff 6736 5 Jun 19:32 .Rapp.history"
[5] "-rw-r--r-- 1 asieira staff 19109 11 Jun 20:44 .Rhistory"
$stderr
character(0)

Resources