Commit 44dcf3c6 authored by Mike Hibler's avatar Mike Hibler

Fix small race conditions introduced when fixing the larger race condition.

Since SIGCHLD could be delayed due to blocking in progmode(), need to
be ready for those when they finally do happen (i.e., after the pid in
question may have been cleaned up or the global progpid var reset).

Also tweak the "did it fail immediately" timeout to be a bit longer.
Empirically, it now handles unresponsive IPMI SOL sessions better.
parent a1ca76b6
/* /*
* Copyright (c) 2000-2013 University of Utah and the Flux Group. * Copyright (c) 2000-2014 University of Utah and the Flux Group.
* *
* {{{EMULAB-LICENSE * {{{EMULAB-LICENSE
* *
...@@ -102,7 +102,7 @@ void dolog(int level, char *format, ...); ...@@ -102,7 +102,7 @@ void dolog(int level, char *format, ...);
int val2speed(int val); int val2speed(int val);
void rawmode(char *devname, int speed); void rawmode(char *devname, int speed);
int netmode(); int netmode();
int progmode(); int progmode(int isrestart);
void writepid(void); void writepid(void);
void createkey(void); void createkey(void);
int handshake(void); int handshake(void);
...@@ -671,7 +671,7 @@ main(int argc, char **argv) ...@@ -671,7 +671,7 @@ main(int argc, char **argv)
die("Could not establish connection to %s", Devname); die("Could not establish connection to %s", Devname);
} }
else if (programmode) { else if (programmode) {
if (progmode() != 0) if (progmode(0) != 0)
die("Could not start program %s", Devname); die("Could not start program %s", Devname);
} }
else else
...@@ -1067,7 +1067,7 @@ capture(void) ...@@ -1067,7 +1067,7 @@ capture(void)
else { else {
warning("sub-program died;" warning("sub-program died;"
" attempting to restart"); " attempting to restart");
while (progmode() != 0) { while (progmode(1) != 0) {
usleep(5000000); usleep(5000000);
} }
} }
...@@ -1350,17 +1350,43 @@ deadchild(int sig) ...@@ -1350,17 +1350,43 @@ deadchild(int sig)
#ifdef USESOCKETS #ifdef USESOCKETS
int status, rval; int status, rval;
/*
* We may be receiving a delayed signal after being blocked
* in progmode(). Make sure there really is a child process.
*/
if (progpid < 0) {
/* XXX sanity check */
if ((rval = waitpid(-1, &status, WNOHANG)) > 0)
dolog(LOG_NOTICE,
"waitpid found unexpected child %d (0x%x)\n",
rval, status);
return;
}
rval = waitpid(progpid, &status, WNOHANG); rval = waitpid(progpid, &status, WNOHANG);
if (rval < 0) { if (rval < 0) {
die("waitpid(%d): %s", progpid, geterr(errno)); warning("waitpid(%d): %s", progpid, geterr(errno));
progpid = -1;
return;
} }
/*
* Huh, something must have died, so do a wait and find it.
*/
if (rval == 0) { if (rval == 0) {
dolog(LOG_NOTICE, "waitpid returned zero"); dolog(LOG_NOTICE, "waitpid returned zero, doing general wait");
do {
rval = waitpid(-1, &status, WNOHANG);
if (rval > 0)
dolog(LOG_NOTICE, " pid %d: status=0x%x\n",
rval, status);
} while (rval > 0);
progpid = -1;
return; return;
} }
if (rval != progpid) { if (rval != progpid) {
die("waitpid(%d): waitpid returned some other pid", progpid); die("waitpid(%d): waitpid returned some other pid", progpid);
} }
progpid = -1;
dolog(LOG_NOTICE, "child died"); dolog(LOG_NOTICE, "child died");
#endif /* USESOCKETS */ #endif /* USESOCKETS */
} }
...@@ -1679,10 +1705,16 @@ netmode() ...@@ -1679,10 +1705,16 @@ netmode()
* The console line is really a pipe connected to a program we start. * The console line is really a pipe connected to a program we start.
*/ */
int int
progmode() progmode(int isrestart)
{ {
int pipefds[2]; int pipefds[2];
sigset_t mask; sigset_t mask;
int rv = -1;
/* token attempt to clean up previous child */
if (isrestart && progpid > 0) {
deadchild(SIGCHLD);
}
if (socketpair(AF_UNIX, SOCK_STREAM, 0, pipefds) < 0) { if (socketpair(AF_UNIX, SOCK_STREAM, 0, pipefds) < 0) {
warning("socketpair(): %s", geterr(errno)); warning("socketpair(): %s", geterr(errno));
...@@ -1695,10 +1727,9 @@ progmode() ...@@ -1695,10 +1727,9 @@ progmode()
if ((progpid = fork()) < 0) { if ((progpid = fork()) < 0) {
warning("fork(): %s", geterr(errno)); warning("fork(): %s", geterr(errno));
sigprocmask(SIG_UNBLOCK, &mask, 0);
close(pipefds[0]); close(pipefds[0]);
close(pipefds[1]); close(pipefds[1]);
return -1; goto err;
} }
if (progpid) { if (progpid) {
int status; int status;
...@@ -1707,7 +1738,7 @@ progmode() ...@@ -1707,7 +1738,7 @@ progmode()
* Wait a short time and see if it dies right off the bat. * Wait a short time and see if it dies right off the bat.
* Otherwise we will get into a fast respawn loop. * Otherwise we will get into a fast respawn loop.
*/ */
usleep(1000000); usleep(2000000);
if (waitpid(progpid, &status, WNOHANG) == progpid) { if (waitpid(progpid, &status, WNOHANG) == progpid) {
char buf[256]; char buf[256];
int cc; int cc;
...@@ -1719,10 +1750,11 @@ progmode() ...@@ -1719,10 +1750,11 @@ progmode()
buf[cc] = '\0'; buf[cc] = '\0';
warning("%s ...", buf); warning("%s ...", buf);
} }
sigprocmask(SIG_UNBLOCK, &mask, 0);
close(pipefds[0]); close(pipefds[0]);
close(pipefds[1]); close(pipefds[1]);
return -1; /* don't confuse deadchild */
progpid = -1;
goto err;
} }
close(pipefds[1]); close(pipefds[1]);
devfd = pipefds[0]; devfd = pipefds[0];
...@@ -1731,9 +1763,9 @@ progmode() ...@@ -1731,9 +1763,9 @@ progmode()
warning("%s: fcntl(O_NONBLOCK): %s", Devname, warning("%s: fcntl(O_NONBLOCK): %s", Devname,
geterr(errno)); geterr(errno));
close(devfd); close(devfd);
sigprocmask(SIG_UNBLOCK, &mask, 0); goto err;
return -1;
} }
rv = 0;
} }
else { else {
int i, max; int i, max;
...@@ -1757,11 +1789,13 @@ progmode() ...@@ -1757,11 +1789,13 @@ progmode()
for (i = 3; i < max; i++) for (i = 3; i < max; i++)
(void) close(i); (void) close(i);
sigprocmask(SIG_UNBLOCK, &mask, 0);
execvp(programargv[1], &programargv[1]); execvp(programargv[1], &programargv[1]);
exit(666); exit(666);
} }
err:
sigprocmask(SIG_UNBLOCK, &mask, 0); sigprocmask(SIG_UNBLOCK, &mask, 0);
return 0; return rv;
} }
#endif #endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment