Commit 5f413b47 authored by Mike Hibler's avatar Mike Hibler
Browse files

First crack at surviving down planetlab nodes. If the master barrier sync
node sits in the stub or monitor barrier sync for more than the SYNCTIMO
timeout value in common-env.sh, it will send a HUP to syncd which will
knock all the other nodes out of their barrier sync.  If that happens,
all nodes will print a warning message and continue.

All nodes wait for both a stub sync and a monitor sync, so if one plab node
is down, they will time out on both barrier syncs.  Race conditions?  Sure.
If for example everyone times out on the stub barrier due to a slow node,
and then that node reaches the barrier, it will hang there while everyone
else waits on the monitor barrier.  When the latter times out, it will
kick the slow node out of the stub sync and it will then proceed to hang
in the monitor sync until the experiment is stopped.  Got that?

As an aside, it would be nice if the initializer of a barrier could specify
a timeout value, and return a special error code to everyone if it timed out,
but that would require an incompatible change to the sync protocol.
parent 4c5005da
......@@ -68,6 +68,7 @@ set plabnodes {}
# Put the path to your tarball in this variable
#
set pelab_tar "/proj/tbres/CHANGEME.tar.gz"
#set pelab_tar "/proj/tbres/mike/pelab-bin.tar.gz"
#
......@@ -88,7 +89,6 @@ set control_bw "100Mbps"
#
# These are the initial conditions for the 'elabc' cloud, the Emulab side of
# a pelab experiment
# NOTE: Currently ignored!
#
set ecloud_delay "0ms"
set ecloud_bw "100Mbps"
......
......@@ -42,7 +42,10 @@ SH=/bin/sh
SUDO="${BIN_PATH}/sudo"
MKDIR="/bin/mkdir"
CHMOD="/bin/chmod"
SYNC="/usr/local/etc/emulab/emulab-sync"
SYNCTIMO=120
if [ "$UNAME" == "Linux" ]; then
GREP="/bin/grep"
elif [ "$UNAME" == "FreeBSD" ]; then
......@@ -192,11 +195,15 @@ barrier_wait()
if [ "$MASTER" == "1" ]; then
# I know, this looks backwards. But it's right
$SYNC -n $BARRIER
_rval=$?
else
WAITERS=`expr $PEERS - 1`
echo "Waiting for $WAITERS clients"
$SYNC -n $BARRIER -i $WAITERS
echo "Waiting up to $SYNCTIMO seconds for $WAITERS clients"
sync_timeout $SYNCTIMO $SYNC -n $BARRIER -i $WAITERS
_rval=$?
fi
return $_rval
}
#
......@@ -211,4 +218,41 @@ log_output_background()
echo $!
}
#
# Run a command (normally $SYNC ...) under a watchdog.  If the command
# doesn't return within the indicated timeout period, HUP the syncserver
# to force everyone out of a barrier.
#
# Usage:   sync_timeout <seconds> <command> [args ...]
# Globals: AS_ROOT (read; prefix used to run kill), and TIMO, SYNCDPID,
#          CMDPID, DOGPID, RVAL (written)
# Returns: the status reported by `wait' for the command.
#
# The watchdog is only armed when /var/run/syncd.pid is readable and
# non-empty; otherwise the command simply runs to completion.
#
sync_timeout()
{
    TIMO=$1
    shift

    if [ -r /var/run/syncd.pid ]; then
        SYNCDPID=$(cat /var/run/syncd.pid)
    else
        SYNCDPID=""
    fi

    # fire off the command; "$@" hands every argument over exactly as the
    # caller supplied it (no re-splitting on whitespace, no glob expansion)
    "$@" & CMDPID=$!

    # and a watchdog
    if [ -n "$SYNCDPID" ]; then
        # $AS_ROOT is deliberately unquoted: it may be empty or multi-word
        (sleep "$TIMO"; echo '*** HUPing syncd'; $AS_ROOT kill -HUP "$SYNCDPID") & DOGPID=$!
    fi

    # wait for the command to finish or be terminated
    wait "$CMDPID"
    RVAL=$?

    # nuke the watchdog; it may already have fired and exited, so a failed
    # kill is expected and ignored
    if [ -n "$SYNCDPID" ]; then
        kill "$DOGPID" >/dev/null 2>&1
    fi

    # and return the result
    return $RVAL
}
fi # End of header guard
......@@ -8,7 +8,10 @@ ARGS=$*
# Wait for all of the stubs to start
#
echo "Waiting for stubs to become ready";
barrier_wait "stub";
barrier_wait "stub"; _rval=$?
if [ $_rval -ne 0 ]; then
echo "*** WARNING: not all stubs started ($_rval)"
fi
#
# Potential race condition here? The monitor cannot connect to the
......@@ -34,7 +37,10 @@ sleep 1
# Wait for all the monitors to come up
#
echo "Waiting for monitors to become ready";
barrier_wait "monitor";
barrier_wait "monitor"; _rval=$?
if [ $_rval -ne 0 ]; then
echo "*** WARNING: not all monitors started ($_rval)"
fi
echo "Running!";
......
......@@ -22,13 +22,19 @@ sleep 1
# Wait for all of the stubs to start
#
echo "Waiting for stubs to become ready";
barrier_wait "stub";
barrier_wait "stub"; _rval=$?
if [ $_rval -ne 0 ]; then
echo "*** WARNING: not all stubs started ($_rval)"
fi
#
# Wait for all the monitors to come up
#
echo "Waiting for monitors to become ready";
barrier_wait "monitor";
barrier_wait "monitor"; _rval=$?
if [ $_rval -ne 0 ]; then
echo "*** WARNING: not all monitors started ($_rval)"
fi
echo "Running!";
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment