From 08ce72b617d5b3813e480b932e119694e4476ba7 Mon Sep 17 00:00:00 2001 From: Leigh B Stoller Date: Tue, 1 Dec 2015 17:45:15 -0700 Subject: [PATCH] Add an "interruptible" option to TBScriptLock(). When set, each time through the loop we look to see if signals are pending, and if so we return early with an error. The caller (libvnode_xen) can use this to avoid really long waits, when the server has said to stop what it's doing. For example, a vnode setup is waiting for an image lock, but the server comes along and says to stop setting up. Previously, we would wait for the lock; now we return early. This is to help with cancellation where it is nice if the server can stop a CreateSliver() in its tracks, when it is safe to do so. --- clientside/tmcc/common/libtestbed.pm | 37 +++++++++- clientside/tmcc/common/mkvnode.pl | 67 +++++++++--------- clientside/tmcc/linux/xen/libvnode_xen.pm | 85 ++++++++++++++++------- 3 files changed, 130 insertions(+), 59 deletions(-) diff --git a/clientside/tmcc/common/libtestbed.pm b/clientside/tmcc/common/libtestbed.pm index e319e7c03..7f07b3bb6 100644 --- a/clientside/tmcc/common/libtestbed.pm +++ b/clientside/tmcc/common/libtestbed.pm @@ -34,7 +34,8 @@ use Exporter; TBSCRIPTLOCK_OKAY TBSCRIPTLOCK_TIMEDOUT TBSCRIPTLOCK_IGNORE TBSCRIPTLOCK_FAILED TBSCRIPTLOCK_GLOBALWAIT TBSCRIPTLOCK_SHAREDLOCK TBSCRIPTLOCK_NONBLOCKING - TBSCRIPTLOCK_WOULDBLOCK + TBSCRIPTLOCK_WOULDBLOCK TBSCRIPTLOCK_INTERRUPTED + TBSCRIPTLOCK_INTERRUPTIBLE TBTimeStamp TBTimeStampWithDate TBBackGround ReOpenLog ); @@ -44,6 +45,7 @@ use English; use Fcntl ':flock'; use IO::Handle; use Time::HiRes qw(gettimeofday); +use POSIX qw(:signal_h); # # Turn off line buffering on output @@ -267,10 +269,12 @@ sub TBSCRIPTLOCK_OKAY() { 0; } sub TBSCRIPTLOCK_TIMEDOUT() { 1; } sub TBSCRIPTLOCK_IGNORE() { 2; } sub TBSCRIPTLOCK_WOULDBLOCK() { 4; } +sub TBSCRIPTLOCK_INTERRUPTED() { 8; } sub TBSCRIPTLOCK_FAILED() { -1; } sub TBSCRIPTLOCK_GLOBALWAIT() { 0x01; } sub TBSCRIPTLOCK_SHAREDLOCK() { 0x10; 
} sub TBSCRIPTLOCK_NONBLOCKING() { 0x20; } +sub TBSCRIPTLOCK_INTERRUPTIBLE(){ 0x40; } # # There are two kinds of serialization. @@ -288,6 +292,7 @@ sub TBScriptLock($;$$$) local *LOCK; my $global = 0; my $shared = 0; + my $interruptible = 0; if (!defined($waittime)) { $waittime = 30; @@ -299,6 +304,8 @@ sub TBScriptLock($;$$$) if (defined($flags) && ($flags & TBSCRIPTLOCK_GLOBALWAIT())); $shared = 1 if (defined($flags) && ($flags & TBSCRIPTLOCK_SHAREDLOCK())); + $interruptible = 1 + if (defined($flags) && ($flags & TBSCRIPTLOCK_INTERRUPTIBLE())); $lockname = "/var/tmp/testbed_${token}_lockfile"; my $oldmask = umask(0000); @@ -310,6 +317,20 @@ sub TBScriptLock($;$$$) } umask($oldmask); + my $checkforinterrupt = sub { + my $sigset = POSIX::SigSet->new; + sigpending($sigset); + + # XXX Why isn't SIGRTMIN and SIGRTMAX defined in the POSIX module. + for (my $i = 1; $i < 50; $i++) { + if ($sigset->ismember($i)) { + print "checkForInterrupt: Signal $i is pending\n"; + return 1; + } + } + return 0; + }; + if (! $global) { # # A plain old lock. @@ -329,6 +350,10 @@ sub TBScriptLock($;$$$) return TBSCRIPTLOCK_TIMEDOUT(); } sleep(1); + if ($interruptible && &$checkforinterrupt()) { + print STDERR "ScriptLock interrupted by signal!\n"; + return TBSCRIPTLOCK_INTERRUPTED(); + } } # Okay, got the lock. Save the handle. We need it below. if (defined($lockhandle_ref)) { @@ -373,9 +398,13 @@ sub TBScriptLock($;$$$) return TBSCRIPTLOCK_TIMEDOUT(); } sleep(1); + if ($interruptible && &$checkforinterrupt()) { + print STDERR "ScriptLock interrupted by signal!\n"; + return TBSCRIPTLOCK_INTERRUPTED(); + } } - $count = 0; + my $count = 0; # # If we did not get the lock, wait for the process that did to finish. 
# @@ -397,6 +426,10 @@ sub TBScriptLock($;$$$) return TBSCRIPTLOCK_TIMEDOUT(); } sleep(1); + if ($interruptible && &$checkforinterrupt()) { + print STDERR "ScriptLock interrupted by signal!\n"; + return TBSCRIPTLOCK_INTERRUPTED(); + } } } } diff --git a/clientside/tmcc/common/mkvnode.pl b/clientside/tmcc/common/mkvnode.pl index 5675bd759..8f39bdf64 100755 --- a/clientside/tmcc/common/mkvnode.pl +++ b/clientside/tmcc/common/mkvnode.pl @@ -755,41 +755,44 @@ if (defined(VNCONFIG('SSHDPORT')) && VNCONFIG('SSHDPORT') ne "" && # it running in its new context. Still, lets protect it with a timer # since it might get hung up inside and we do not want to get stuck here. # -my $childpid = fork(); -if ($childpid) { - my $timedout = 0; - local $SIG{ALRM} = sub { kill("TERM", $childpid); $timedout = 1; }; - alarm 180 - if (!$ISXENVM); - waitpid($childpid, 0); - alarm 0 - if (!$ISXENVM); +if (!$ISXENVM) { + my $childpid = fork(); + if ($childpid) { + my $timedout = 0; + local $SIG{ALRM} = sub { kill("TERM", $childpid); $timedout = 1; }; + alarm 180; + waitpid($childpid, 0); + alarm 0; - # - # If failure then cleanup. - # - if ($? || $timedout) { - MyFatal("$vnodeid container startup ". - ($timedout ? "timed out." : "failed.")); + # + # If failure then cleanup. + # + if ($? || $timedout) { + MyFatal("$vnodeid container startup ". + ($timedout ? "timed out." : "failed.")); + } } -} -else { - # - # We want to call this as clean as possible. - # - $SIG{TERM} = 'DEFAULT'; - $SIG{INT} = 'DEFAULT'; - $SIG{USR1} = 'DEFAULT'; - $SIG{USR2} = 'DEFAULT'; - $SIG{HUP} = 'DEFAULT'; - POSIX::setsid(); - - if ($libops{$vmtype}{"vnodeBoot"}->($vnodeid, $vmid, - \%vnconfig, $vnstate->{'private'})) { - print STDERR "*** ERROR: vnodeBoot failed\n"; - exit(1); + else { + # + # We want to call this as clean as possible. 
+ # + $SIG{TERM} = 'DEFAULT'; + $SIG{INT} = 'DEFAULT'; + $SIG{USR1} = 'DEFAULT'; + $SIG{USR2} = 'DEFAULT'; + $SIG{HUP} = 'DEFAULT'; + POSIX::setsid(); + + if ($libops{$vmtype}{"vnodeBoot"}->($vnodeid, $vmid, + \%vnconfig, $vnstate->{'private'})){ + print STDERR "*** ERROR: vnodeBoot failed\n"; + exit(1); + } + exit(0); } - exit(0); +} +elsif (safeLibOp('vnodeBoot', 1, 1)) { + MyFatal("$vnodeid container startup failed."); } if (safeLibOp('vnodePostConfig', 1, 1)) { MyFatal("vnodePostConfig failed"); diff --git a/clientside/tmcc/linux/xen/libvnode_xen.pm b/clientside/tmcc/linux/xen/libvnode_xen.pm index a00cad450..db2da8ea4 100644 --- a/clientside/tmcc/linux/xen/libvnode_xen.pm +++ b/clientside/tmcc/linux/xen/libvnode_xen.pm @@ -84,6 +84,7 @@ use File::Basename; use File::Path; use File::Copy; use File::Temp; +use POSIX qw(:signal_h); # Pull in libvnode BEGIN { require "/etc/emulab/paths.pm"; import emulabpaths; } @@ -295,6 +296,10 @@ my $VIFROUTING = ((-e "$ETCDIR/xenvifrouting") ? 1 : 0); my $TMCD_PORT = 7777; +# Number of concurrent containers set up in parallel. We bump this up +# a bit down in doingThinLVM(). 
+my $MAXCONCURRENT = 3; + # # Information about the running Xen hypervisor # @@ -336,6 +341,7 @@ sub LookupRouteTable($); sub FreeRouteTable($); sub downloadOneImage($$$); sub captureRunning($); +sub checkForInterrupt(); sub getXenInfo() { @@ -782,8 +788,9 @@ sub rootPreConfigNetwork($$$$) TBDebugTimeStamp("rootPreConfigNetwork: grabbing global lock $GLOBAL_CONF_LOCK") if ($lockdebug); - if (TBScriptLock($GLOBAL_CONF_LOCK, 0, 900) != TBSCRIPTLOCK_OKAY()) { - print STDERR "Could not get the global lock after a long time!\n"; + if (TBScriptLock($GLOBAL_CONF_LOCK, + TBSCRIPTLOCK_INTERRUPTIBLE(), 900) != TBSCRIPTLOCK_OKAY()){ + print STDERR "Could not get the global lock!\n"; return -1; } TBDebugTimeStamp(" got global lock") @@ -863,9 +870,10 @@ sub vnodeCreate($$$$) my $imagelockname = ImageLockName($imagename); TBDebugTimeStamp("grabbing image lock $imagelockname shared") if ($lockdebug); - if (TBScriptLock($imagelockname, TBSCRIPTLOCK_SHAREDLOCK(), 1800) - != TBSCRIPTLOCK_OKAY()) { - fatal("Could not get $imagelockname lock after a long time!"); + if (TBScriptLock($imagelockname, + TBSCRIPTLOCK_INTERRUPTIBLE()|TBSCRIPTLOCK_SHAREDLOCK(), + 1800) != TBSCRIPTLOCK_OKAY()) { + fatal("Could not get $imagelockname lock!"); } TBDebugTimeStamp(" got image lock") if ($lockdebug); @@ -899,10 +907,9 @@ sub vnodeCreate($$$$) TBScriptUnlock(); TBDebugTimeStamp("grabbing image lock $imagelockname exclusive") if ($lockdebug); - if (TBScriptLock($imagelockname, undef, 1800) + if (TBScriptLock($imagelockname, TBSCRIPTLOCK_INTERRUPTIBLE(), 1800) != TBSCRIPTLOCK_OKAY()) { - fatal("Could not get $imagelockname write lock ". 
- "after a long time!"); + fatal("Could not get $imagelockname write lock!"); } TBDebugTimeStamp(" got image lock") if ($lockdebug); @@ -918,7 +925,9 @@ sub vnodeCreate($$$$) TBScriptUnlock(); TBDebugTimeStamp("grabbing image lock $imagelockname shared") if ($lockdebug); - if (TBScriptLock($imagelockname, TBSCRIPTLOCK_SHAREDLOCK(), 1800) + if (TBScriptLock($imagelockname, + TBSCRIPTLOCK_INTERRUPTIBLE()| + TBSCRIPTLOCK_SHAREDLOCK(), 1800) != TBSCRIPTLOCK_OKAY()) { fatal("Could not get $imagelockname lock back ". "after a long time!"); @@ -2099,8 +2108,9 @@ sub vnodePreConfigExpNetwork($$$$) # TBDebugTimeStamp("vnodePreConfigExpNetwork: grabbing global lock $GLOBAL_CONF_LOCK") if ($lockdebug); - if (TBScriptLock($GLOBAL_CONF_LOCK, 0, 900) != TBSCRIPTLOCK_OKAY()) { - print STDERR "Could not get the global lock after a long time!\n"; + if (TBScriptLock($GLOBAL_CONF_LOCK, TBSCRIPTLOCK_INTERRUPTIBLE(), 900) + != TBSCRIPTLOCK_OKAY()) { + print STDERR "Could not get the global lock!\n"; return -1; } TBDebugTimeStamp(" got global lock") @@ -2375,6 +2385,8 @@ sub vnodeBoot($$$$) return 0; } $countdown--; + last + if (checkForInterrupt()); } # # Tear it down and try again. Use vnodeHalt cause it protects @@ -2392,6 +2404,8 @@ sub vnodeBoot($$$$) TBDebugTimeStamp("Container not gone yet"); } TBDebugTimeStamp("Container is gone ($i)!"); + last + if (checkForInterrupt()); } return -1; } @@ -3131,7 +3145,8 @@ sub grabGoldenLock($) TBDebugTimeStamp("grabbing gimage lock $token") if ($lockdebug); - if (TBScriptLock($token, undef, 900, \$lockref) == TBSCRIPTLOCK_OKAY()) { + if (TBScriptLock($token, TBSCRIPTLOCK_INTERRUPTIBLE(), + 900, \$lockref) == TBSCRIPTLOCK_OKAY()) { TBDebugTimeStamp(" got gimage lock") if ($lockdebug); return $lockref; @@ -3283,10 +3298,10 @@ sub createImageDisk($$$$) # And back to a shared lock. 
TBDebugTimeStamp("grabbing image lock $imagelockname shared") if ($lockdebug); - if (TBScriptLock($imagelockname, TBSCRIPTLOCK_SHAREDLOCK(), 1800) - != TBSCRIPTLOCK_OKAY()) { - print STDERR "Could not get $imagelockname lock back ". - "after a long time!\n"; + if (TBScriptLock($imagelockname, + TBSCRIPTLOCK_INTERRUPTIBLE()|TBSCRIPTLOCK_SHAREDLOCK(), + 1800) != TBSCRIPTLOCK_OKAY()) { + print STDERR "Could not get $imagelockname lock back!\n"; return -1; } TBDebugTimeStamp(" got image lock") @@ -3321,9 +3336,9 @@ sub downloadOneImage($$$) TBDebugTimeStamp("grabbing image lock $imagelockname exclusive") if ($lockdebug); - if (TBScriptLock($imagelockname, undef, 1800) != TBSCRIPTLOCK_OKAY()) { - print STDERR "Could not get $imagelockname write lock". - "after a long time!\n"; + if (TBScriptLock($imagelockname, TBSCRIPTLOCK_INTERRUPTIBLE(), 1800) + != TBSCRIPTLOCK_OKAY()) { + print STDERR "Could not get $imagelockname write lock!\n"; return -1; } TBDebugTimeStamp(" got image lock") @@ -4230,8 +4245,9 @@ sub createExpBridges($$$) # TBDebugTimeStamp("createExpBridges: grabbing global lock $GLOBAL_CONF_LOCK") if ($lockdebug); - if (TBScriptLock($GLOBAL_CONF_LOCK, 0, 1800) != TBSCRIPTLOCK_OKAY()) { - print STDERR "Could not get the global lock after a long time!\n"; + if (TBScriptLock($GLOBAL_CONF_LOCK, TBSCRIPTLOCK_INTERRUPTIBLE(), + 1800) != TBSCRIPTLOCK_OKAY()) { + print STDERR "Could not get the global lock!\n"; return -1; } TBDebugTimeStamp(" got global lock") @@ -4720,7 +4736,7 @@ sub doingThinLVM() $usethin = 0; return 0; } - + $MAXCONCURRENT = 5; return 1; } @@ -4959,7 +4975,8 @@ sub AllocateIFBs($$$) TBDebugTimeStamp("AllocateIFBs: grabbing global lock $GLOBAL_CONF_LOCK") if ($lockdebug); - if (TBScriptLock($GLOBAL_CONF_LOCK, 0, 1800) != TBSCRIPTLOCK_OKAY()) { + if (TBScriptLock($GLOBAL_CONF_LOCK, TBSCRIPTLOCK_INTERRUPTIBLE(), + 1800) != TBSCRIPTLOCK_OKAY()) { print STDERR "Could not get the global lock after a long time!\n"; return -1; } @@ -5407,16 +5424,30 @@ 
sub RunWithLock($$) return $status; } +sub checkForInterrupt() +{ + my $sigset = POSIX::SigSet->new; + sigpending($sigset); + + # XXX Why isn't SIGRTMIN and SIGRTMAX defined in the POSIX module. + for (my $i = 1; $i < 50; $i++) { + if ($sigset->ismember($i)) { + print "checkForInterrupt: Signal $i is pending\n"; + return 1; + } + } + return 0; +} + # # We need to control how many simultaneous creates happen at once. # -my $MAXCONCURRENT = 3; my $createvnode_lockref; sub CreateVnodeLock() { my $tries = 1000; - + while ($tries) { for (my $i = 0; $i < $MAXCONCURRENT; $i++) { my $token = "createvnode_${i}"; @@ -5435,7 +5466,11 @@ sub CreateVnodeLock() } print "Still trying to get the create lock at " . time() . "\n" if (($tries % 60) == 0); + return -1 + if (checkForInterrupt()); sleep(4); + return -1 + if (checkForInterrupt()); $tries--; } TBDebugTimeStamp("Could not get the createvnode lock after a long time!"); -- GitLab