#!/usr/bin/perl -wT # # EMULAB-COPYRIGHT # Copyright (c) 2000-2005 University of Utah and the Flux Group. # All rights reserved. # use English; use Getopt::Std; use POSIX qw(isatty setsid); # # This gets invoked from the Web interface. # Swap an experiment in, swap it out, restart or modify. # sub usage() { print(STDERR "Usage: swapexp [-q] [-b | -w] [-i | -a | -f] [-r] [-e]\n". " <-s in | out | restart | modify | pause>\n". " []\n". "switches and arguments:\n". "-w - wait for non-batchmode experiment swap/modify\n". "-q - be less chatty\n". "-r - reboot nodes when doing a modify experiment\n". "-e - restart event scheduler when doing a modify experiment\n". "-s - Operation to perform; one of those listed above\n". " - The project the experiment belongs to\n". " - The experiment name (id)\n". " - Optional NS file to parse for experiment modify\n"); exit(-1); } sub fatal($); my $optlist = "biafres:wq"; # # Exit codes are important; they tell the web page what has happened so # it can say something useful to the user. Fatal errors are mostly done # with die(), but expected errors use this routine. At some point we will # use the DB to communicate the actual error. # # $status < 0 - Fatal error. Something went wrong we did not expect. # $status = 0 - Termination is proceeding in the background. Notified later. # $status > 0 - Expected error. User not allowed for some reason. # sub ExitWithStatus($$) { my ($status, $message) = @_; if ($status < 0) { die("*** $0:\n". " $message\n"); } else { print STDERR "$message\n"; } exit($status); } # # Configure variables # my $TB = "@prefix@"; my $TBOPS = "@TBOPSEMAIL@"; my $TBLOGS = "@TBLOGSEMAIL@"; my $TBINFO = "$TB/expinfo"; my $TBDOCBASE = "@TBDOCBASE@"; my $TBBASE = "@TBBASE@"; my $CONTROL = "@USERNODE@"; # # Testbed Support libraries # use lib "@prefix@/lib"; use libdb; use libtestbed; use libtblog; use libArchive; # Be careful not to exit on transient error; 0 means infinite retry. $libdb::DBQUERY_MAXTRIES = 0; # For the END block below. my $cleaning = 0; my $justexit = 1; my $signaled = 0; my $tbdir = "$TB/bin"; my $tbdata = "tbdata"; my $checkquota = "$TB/sbin/checkquota"; my $batch = 0; my $idleswap = 0; my $autoswap = 0; my $force = 0; my $reboot = 0; my $waitmode = 0; my $quiet = 0; my $eventsys_restart = 0; my $errorstat= -1; my $modifyHosed = 0; my $modifySwapped = 0; my $inout; my $logname; my $dbuid; my $user_name; my $user_email; my @allnodes; my @row; my $action; my $tag; my $nextswapstate; my $termswapstate; my $isadmin = 0; # # Untaint the path # $ENV{'PATH'} = "/bin:/usr/bin:$TB/libexec/vis"; delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'}; # # Turn off line buffering on output # $| = 1; # # Set umask for start/swap. We want other members in the project to be # able to swap/end experiments, so the log and intermediate files need # to be 664 since some are opened for append. # umask(0002); # # Parse command arguments. Once we return from getopts, all that should # left are the required arguments. # %options = (); if (! getopts($optlist, \%options)) { usage(); } if (defined($options{"i"})) { $idleswap = 1; } if (defined($options{"w"})) { $waitmode = 1; } if (defined($options{"a"})) { $autoswap = 1; } if (defined($options{"f"})) { $force = 1; } if (defined($options{"b"})) { $batch = 1; } if (defined($options{"r"})) { $reboot = 1; } if (defined($options{"e"})) { $eventsys_restart = 1; } if (defined($options{"q"})) { $quiet = 1; } if (defined($options{"s"})) { $inout = $options{"s"}; if ($inout ne "out" && $inout ne "in" && $inout ne "restart" && $inout ne "pause" && $inout ne "modify") { usage(); } } else { usage(); } usage() if (($waitmode && $batch) || ($inout ne "modify" && @ARGV != 2) || (($waitmode || $batch) && ($idleswap || $autoswap || $force))); if ($eventsys_restart && $inout ne "modify") { print STDOUT "Usage: swapexp: -e (eventsys_restart) can be used ". "only with -s modify\n"; usage(); } my $pid = $ARGV[0]; my $eid = $ARGV[1]; # # Untaint the arguments. # if ($pid =~ /^([-\w\.]+)$/) { $pid = $1; } else { die("Tainted argument $pid!\n"); } if ($eid =~ /^([-\w\.]+)$/) { $eid = $1; } else { die("Tainted argument $eid!\n"); } my $repfile = "$eid.report"; my $workdir = TBExptWorkDir($pid, $eid); my $userdir = TBExptUserDir($pid, $eid); my $tempnsfile; my $modnsfile; my $nsfile; if ($inout eq "modify" && @ARGV > 2) { $tempnsfile = $ARGV[2]; # # Untaint nsfile argument; Allow slash. # if ($tempnsfile =~ /^([-\w\.\/]+)$/) { $tempnsfile = $1; } else { die("Tainted nsfile name: $tempnsfile\n"); } # # Called from ops interactively. Make sure NS file in /proj or /users. # # Use realpath to resolve any symlinks. # my $translated = `realpath $tempnsfile`; if ($translated =~ /^([-\w\.\/]+)$/) { $tempnsfile = $1; } else { die("Tainted nsfile returned by realpath: $translated\n"); } # # The file must reside in /proj, /groups, or /users. Since this script # runs as the caller, regular file permission checks ensure its a file # the user is allowed to use. /tmp/$guid-$nsref.nsfile also allowed # since this script is invoked directly from web interface, which generates # a name that should not be guessable, so as long as it looks to be in # proper format, we accept it. # if (! ($tempnsfile =~ /^\/tmp\/[-\w]+-\d+\.nsfile/) && ! ($tempnsfile =~ /^\/var\/tmp\/php\w+/) && ! ($tempnsfile =~ /^\/proj/) && ! ($tempnsfile =~ /^\/groups/) && ! ($tempnsfile =~ /^\/users/)) { die("$tempnsfile does not resolve to an appropriate directory!\n"); } if (! -f $tempnsfile || -z $tempnsfile || ! -r $tempnsfile) { die("*** $0:\n". " $tempnsfile does not look like an NS file!\n"); } $nsfile = "$eid.ns"; $modnsfile = "${eid}-modify.ns"; } # # Verify user and get his DB uid. # if (! UNIX2DBUID($UID, \$dbuid)) { die("*** $0:\n". " You do not exist in the Emulab Database.\n"); } # # Get email info for user. # if (! UserDBInfo($dbuid, \$user_name, \$user_email)) { die("*** $0:\n". " Cannot determine your name and email address.\n"); } $isadmin = TBAdmin($UID); # # Set error reporting info # tblog_set_info($pid,$eid,$UID); # # Verify that this person can muck with the experiment. # Note that any script down the line has to do an admin check also. # if ($UID && !$isadmin && !TBExptAccessCheck($dbuid, $pid, $eid, TB_EXPT_DESTROY)) { die("*** $0:\n". " You do not have permission to swap or modify this experiment!\n"); } # Must do this before lock tables! # idleswap is in minutes, threshold is in hours $idleswap_time = 60 * TBGetSiteVar("idle/threshold"); # # In wait mode, block interrupt until we spin off the background process. # if ($waitmode) { $SIG{TERM} = 'IGNORE'; $SIG{QUIT} = 'IGNORE'; $SIG{INT} = 'IGNORE'; } # # Check for overquota; we deal with it below, cause of the batch system. # my $overquota = system("$checkquota $dbuid"); if ($overquota) { print STDERR "*** $0:\n". " You are over your disk quota on $CONTROL; please cleanup!\n"; } # # Temp fix; Disallow swapmod to firewalled experiments. This will come # out later. # my $firewalled = TBExptFirewall($pid, $eid); # # We have to protect against trying to end an experiment that is currently # in the process of being terminated. We use a "wrapper" state (actually # a timestamp so we can say when termination was requested) since # terminating consists of a couple of different experiment states down inside # the tb scripts. # DBQueryFatal("lock tables experiments write"); $query_result = DBQueryFatal("SELECT * FROM experiments WHERE eid='$eid' and pid='$pid'"); if (! $query_result->numrows) { die("*** $0:\n". " No such experiment $pid/$eid exists!\n"); } my %hashrow = $query_result->fetchhash(); my $expt_head_login = $hashrow{'expt_head_uid'}; my $last_swap_uid = $hashrow{'expt_swap_uid'}; my $estate = $hashrow{'state'}; my $batchstate = $hashrow{'batchstate'}; my $expt_path = $hashrow{'path'}; my $expt_locked = $hashrow{'expt_locked'}; my $isbatchexpt = $hashrow{'batchmode'}; my $canceled = $hashrow{'canceled'}; my $swappablebit= $hashrow{'swappable'}; my $idleswapbit = $hashrow{'idleswap'}; my $autoswapbit = $hashrow{'autoswap'}; my $swappablestr= ( $swappablebit ? "Yes" : "No" ); my $idleswapstr = ( $idleswapbit ? "Yes" : "No" ); my $autoswapstr = ( $autoswapbit ? "Yes" : "No" ); my $noswap = $hashrow{'noswap_reason'}; my $noidleswap = $hashrow{'noidleswap_reason'}; my $idleswaptime= $hashrow{'idleswap_timeout'} / 60.0; my $autoswaptime= $hashrow{'autoswap_timeout'} / 60.0; my $rendering = $hashrow{'prerender_pid'}; my $elabinelab = $hashrow{'elab_in_elab'}; my $lockdown = $hashrow{'lockdown'}; if ($inout ne "out") { # I'm going to update this below, so fix the value before I use it. $idleswap_time = min($idleswaptime * 60, $idleswap_time); $idleswaptime = $idleswap_time / 60.0; } my $swapsettings = "Idle-Swap: $idleswapstr". ($idleswapbit ? ", at $idleswaptime hours\n" : " (Reason: $noidleswap)\n"). "Auto-Swap: $autoswapstr". ($autoswapbit ? ", at $autoswaptime hours\n" : "\n"); if (! chdir($workdir)) { die("*** $0:\n". " Could not chdir to $workdir: $!\n"); } # # This script is called from the batch daemon. # if ($batch) { # # Sanity Check. If called from the daemon, must already be locked, # must be a batch experiment, and must be in proper state for the # operation requested. # die("*** $0:\n". " Experiment $pid/$eid is supposed to be a batch experiment!\n") if (!$isbatchexpt); die("*** $0:\n". " Batch experiment $pid/$eid should be locked!\n") if (!defined($expt_locked) || $batchstate ne BATCHSTATE_LOCKED()); die("*** $0:\n". " Batch experiment $pid/$eid is locked down; cannot be swapped!\n") if ($lockdown); if ($inout eq "in") { die("*** $0:\n". " Batch experiment $pid/$eid is not in the proper state!\n". " Currently $estate, but should be QUEUED.\n") if ($estate ne EXPTSTATE_QUEUED); die("*** $0:\n". " Batch experiment $pid/$eid has been canceled! Aborting.\n") if ($canceled); # Do not allow it to swap in. What about swapout? die("*** $0:\n". " Batch experiment cannot swap in when over quota! Aborting.\n") if ($overquota); } elsif ($inout eq "out") { die("*** $0:\n". " Batch experiment $pid/$eid is not in the proper state!\n". " Currently $estate, but should be ACTIVE.\n") if ($estate ne EXPTSTATE_ACTIVE); } else { die("*** $0:\n". " Improper request from batch daemon for $pid/$eid!\n"); } } else { if ($isbatchexpt) { # # User is requesting that a batch either be injected or paused. # Sanity check the state, but otherwise let the batch daemon # handle it. # ExitWithStatus(1, "Batch experiment $pid/$eid is still canceling!") if ($canceled); ExitWithStatus(1, "Batch experiment $pid/$eid is locked down!") if ($lockdown); if ($inout eq "in") { ExitWithStatus(1, "Batch experiment $pid/$eid must be SWAPPED to\n". "QUEUE. Currently $estate.") if ($estate ne EXPTSTATE_SWAPPED); ExitWithStatus(1, "Batch experiment $pid/$eid cannot swap in when ". "over quota!\n") if ($overquota); SetExpState($pid, $eid, EXPTSTATE_QUEUED); } elsif ($inout eq "out") { ExitWithStatus(1, "Batch experiment $pid/$eid must be ACTIVE or\n". "ACTIVATING to swap out. Currently $estate.") if ($estate ne EXPTSTATE_ACTIVE && $estate ne EXPTSTATE_ACTIVATING); # # Since the batch daemon has control, all we can do is set # the cancel bit. # TBSetCancelFlag($pid, $eid, EXPTCANCEL_SWAP); } elsif ($inout eq "pause") { ExitWithStatus(1, "Batch experiment $pid/$eid must be QUEUED to\n". "DEQUEUE. Currently $estate.") if ($estate ne EXPTSTATE_QUEUED); # # XXX. The batch daemon might already have the experiment, but # not have shipped it off to startexp. Change the state # anyway. The error will be noticed later when startexp dies, # and the batch daemon gets the error back. This sucks. # SetExpState($pid, $eid, EXPTSTATE_SWAPPED); } elsif ($inout eq "modify") { ExitWithStatus(1, "Batch experiment $pid/$eid must be SWAPPED or\n". "ACTIVE to modify. Currently $estate.") if (($estate ne EXPTSTATE_SWAPPED && $estate ne EXPTSTATE_ACTIVATING) || $batchstate ne BATCHSTATE_UNLOCKED()); ExitWithStatus(1, "Cannot modify an active firewalled experiment (yet).") if ($firewalled && $estate ne EXPTSTATE_SWAPPED && !$isadmin); ExitWithStatus(1, "Cannot modify an active ElabInElab experiment (yet).") if ($elabinelab && $estate ne EXPTSTATE_SWAPPED && !$isadmin); ExitWithStatus(1, "Cannot modify batch experiment $pid/$eid when ". "over quota!\n") if ($overquota); # # Otherwise, proceed with the modify. The experiment will be # locked below, and so it cannot be injected or otherwise messed # with since its state is going to be changed before we unlock # the experiments table. The batch daemon will leave it alone # until the modify is done. If the modify fails and cannot recover # it is going to get swapped out; that is okay since the batch # daemon does not keep state internally. # goto doit; } else { die("*** $0:\n", " Operation $inout not allowed on a batch experiment!\n"); } ExitWithStatus(0, "Batch experiment $pid/$eid state has been changed.\n"); doit: } else { # # If the cancel flag is set, then user must wait for that to # clear before we can do anything else. # ExitWithStatus(1, "Experiment $pid/$eid has its cancel flag set!.\n". "You must wait for that to clear before you can swap\n". "or modify the experiment.\n") if ($canceled); ExitWithStatus(1, "Experiment $pid/$eid is locked down; cannot swap!\n") if ($lockdown); # # Check the state for the various operations. # if (!$force) { SWITCH: for ($inout) { /^in$/i && do { if ($estate ne EXPTSTATE_SWAPPED()) { ExitWithStatus(1, "Experiment $pid/$eid is not swapped out!"); } ExitWithStatus(1, "Experiment $pid/$eid cannot swap in when ". "over quota!\n") if ($overquota); last SWITCH; }; /^out$/i && do { if ($estate ne EXPTSTATE_ACTIVE() && $estate ne EXPTSTATE_PANICED() && $estate ne EXPTSTATE_ACTIVATING()) { ExitWithStatus(1, "Experiment $pid/$eid is not swapped in ". "or activating!\n"); } # # Must be an admin person to swap out an experiment that # has had its panic button pressed. # if ($estate eq EXPTSTATE_PANICED() && !$isadmin) { ExitWithStatus(1, "Experiment $pid/$eid had its panic ". "button pressed!\n". "Only a testbed administrator can swap ". "this experiment out."); } if ($estate eq EXPTSTATE_ACTIVATING()) { # # All we can do is set the cancel flag and hope that # it gets noticed. We do not wait. # TBSetCancelFlag($pid, $eid, EXPTCANCEL_SWAP); ExitWithStatus(0, "Experiment $pid/$eid swapin has been ". "marked for cancelation.\n". "You will receive email when the original ". "swap request has finished."); } last SWITCH; }; /^restart$/i && do { if ($estate ne EXPTSTATE_ACTIVE()) { ExitWithStatus(1, "Experiment $pid/$eid is not swapped in!"); } last SWITCH; }; /^modify$/i && do { if ($estate ne EXPTSTATE_ACTIVE() && $estate ne EXPTSTATE_SWAPPED()) { ExitWithStatus(1, "Experiment $pid/$eid must be ACTIVE or\n". "SWAPPED to modify!\n"); } ExitWithStatus(1, "Cannot modify an active firewalled experiment (yet).") if ($firewalled && $estate ne EXPTSTATE_SWAPPED && !$isadmin); ExitWithStatus(1, "Cannot modify an active ElabInElab experiment (yet).") if ($elabinelab && $estate ne EXPTSTATE_SWAPPED && !$isadmin); ExitWithStatus(1, "Experiment $pid/$eid cannot be modified ". "when over quota!\n") if ($overquota); last SWITCH; }; die("*** $0:\n". " Missing state check for action: $action\n"); } } } } # # Determine the temporary and next state for experiment. If the experiment # is a batch experiment, then the next state is actually handled by the # batch daemon, but we still have to deal with the temporary state. # SWITCH: for ($inout) { /^in$/i && do { $nextswapstate = EXPTSTATE_ACTIVATING(); last SWITCH; }; /^out$/i && do { $nextswapstate = EXPTSTATE_SWAPPING(); last SWITCH; }; /^restart$/i && do { $nextswapstate = EXPTSTATE_RESTARTING(); last SWITCH; }; /^modify$/i && do { $nextswapstate = (($estate eq EXPTSTATE_SWAPPED()) ? EXPTSTATE_MODIFY_PARSE() : EXPTSTATE_MODIFY_REPARSE()); last SWITCH; }; die("*** $0:\n". " Missing state check for action: $action\n"); } # Update idleswap_timeout to whatever the current value is. if ($inout ne "out") { DBQueryFatal("update experiments set idleswap_timeout='$idleswap_time' ". "where eid='$eid' and pid='$pid'"); } # # On a failure, we go back to this swapstate. Might be modified below. # $termswapstate = $estate; # Lock the record, set the nextstate, and unlock the table. TBLockExp($pid, $eid, $nextswapstate) or die("*** $0:\n". "Failed to set experiment state to $nextswapstate\n"); # # At this point, we need to force a cleanup no matter how we exit. # See the END block below. # $justexit = 0; DBQueryFatal("unlock tables"); # # XXX - At this point a failure is going to leave things in an # inconsistent state. Be sure to call fatal() only since we are # going into the background, and we have to send email since no # one is going to see printed error messages (output goes into the # log file, which will be sent along in the email). # if ($inout eq "in") { $action = "swapped in"; $tag = "swapin"; } if ($inout eq "out") { $action = "swapped out"; $tag = "swapout"; } if ($inout eq "restart") { $action = "restarted"; } if ($inout eq "modify") { $action = "modified"; $tag = "swapmod"; } # # Get email address of the experiment head, which may be different than # the person who is actually terminating the experiment, since its polite # to let the original creator know whats going on. # my $expt_head_name; my $expt_head_email; if (! UserDBInfo($expt_head_login, \$expt_head_name, \$expt_head_email)) { print STDERR "*** WARNING: ". "Could not determine name/email for $expt_head_login.\n"; $expt_head_name = "TBOPS"; $expt_head_email = $TBOPS; } # # Before going to background, we have to copy out the NS file! # if ($inout eq "modify" && defined($modnsfile)) { unlink($modnsfile); if (system("/bin/cp", "$tempnsfile", "$modnsfile")) { fatal("Could not copy $tempnsfile to $modnsfile"); } chmod(0664, "$modnsfile"); } # # If not in batch mode, go into the background. Parent exits. # if (! $batch) { $logname = TBExptCreateLogFile($pid, $eid, "swapexp"); TBExptSetLogFile($pid, $eid, $logname); TBExptOpenLogFile($pid, $eid); if (my $childpid = TBBackGround($logname)) { # # Parent exits normally, unless in waitmode. We have to set # justexit to make sure the END block below does not run. # $justexit = 1; if (!$waitmode) { print("Experiment $pid/$eid is now being $action.\n". "You will be notified via email when the this is done.\n") if (! $quiet); exit(0); } print("Waiting for experiment $eid to finish its swap${action}\n") if (! $quiet); if (isatty(STDIN) && !$quiet) { print("You may type ^C at anytime; you will be notified via email.". "\n". "You will not actually interrupt the experiment itself.\n"); } # Give child a chance to run. select(undef, undef, undef, 0.25); # # Reset signal handlers. User can now kill this process, without # stopping the child. # $SIG{TERM} = 'DEFAULT'; $SIG{INT} = 'DEFAULT'; $SIG{QUIT} = 'DEFAULT'; # # Wait until child exits or until user gets bored and types ^C. # waitpid($childpid, 0); print("Done. Exited with status: $?\n") if (! $quiet); exit($? >> 8); } TBdbfork(); } # # When in waitmode, must put ourselves in another process group so that # an interrupt to the parent will not have any effect on the backend. # if ($waitmode) { POSIX::setsid(); } # # We need to catch TERM cause sometimes shit happens and we have to kill # an experiment swap that is hung or otherwise scrogged. Rather then # trying to kill off the children one by one, lets arrange to catch it # here and send a killpg to the children. This is not to be done lightly, # cause it can leave things worse then they were before! # sub handler ($) { my ($signame) = @_; $SIG{TERM} = 'IGNORE'; my $pgrp = getpgrp(0); kill('TERM', -$pgrp); sleep(1); $signaled = 1; fatal("Caught SIG${signame}! Killing experiment setup ..."); } $SIG{TERM} = \&handler; $SIG{QUIT} = 'DEFAULT'; # # Gather stats; start clock ticking # if ($inout eq "in") { GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPIN, 0, TBDB_STATS_FLAGS_START); } elsif ($inout eq "out") { GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPOUT, 0, TBDB_STATS_FLAGS_START); } elsif ($inout eq "modify") { GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPMODIFY, 0, TBDB_STATS_FLAGS_START); } # # Remove old report file since its contents are going to be invalid. # if ($inout ne "restart" && -e $repfile) { unlink("$repfile"); } # # Sanity check states in case someone changes something. # if ($inout eq "out") { my $optarg = (($force || $idleswap) ? "-force" : ""); print STDOUT "Running 'tbswap out $optarg $pid $eid'\n"; if (system("$tbdir/tbswap out $optarg $pid $eid") != 0) { $errorstat = $? >> 8; fatal("tbswap out failed!"); } SetExpState($pid, $eid, EXPTSTATE_SWAPPED) or fatal("Failed to set experiment state to " . EXPTSTATE_SWAPPED()); TBExptClearPanicBit($pid, $eid); } elsif ($inout eq "in") { GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPIN, 0, TBDB_STATS_FLAGS_PRESWAPIN); # Set the swapper now so that nodes use the proper uid. If the swapin # fails, we leave the swapper as is, since its harmless and informative. TBExptSetSwapUID($pid, $eid, $dbuid); print STDOUT "Running 'tbswap in $pid $eid'\n"; if (system("$tbdir/tbswap in $pid $eid") != 0) { $errorstat = $? >> 8; fatal("tbswap in failed!"); } SetExpState($pid, $eid, EXPTSTATE_ACTIVE) or fatal("Failed to set experiment state to " . EXPTSTATE_ACTIVE()); system("$tbdir/tbreport -b $pid $eid 2>&1 > $repfile"); } elsif ($inout eq "modify") { my $modifyError; # # Prepare the Archive for the swapmod, in case we have to "roll back". # if (libArchive::TBExperimentArchivePreSwapMod($pid, $eid) < 0) { fatal("Failed to do a preswapmod on the experiment archive!"); } GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPMODIFY, 0, TBDB_STATS_FLAGS_PREMODIFY); # Gather up some firewall state for later comparison. if (GatherFWinfo() < 0) { fatal("Could not gather firewall info; cannot safely continue!"); } print "Backing up old experiment state ... " . TBTimeStamp() . "\n"; if (TBExptBackupVirtualState($pid, $eid)) { fatal("Could not backup experiment state; cannot safely continue!"); } # # Rerun tbprerun if modifying, but only if new NS file provided. # Yep, we allow reswap without changing the NS file. For Shashi and SIM. # Note that tbprerun kills the renderer if its running. # if (defined($modnsfile)) { print STDOUT "Running 'tbprerun $pid $eid $modnsfile'\n"; if (system("$tbdir/tbprerun $pid $eid $modnsfile") != 0) { print STDOUT "Modify Error: tbprerun failed.\n"; FWHOSED: print STDOUT "Recovering experiment state...\n"; if (TBExptRemoveVirtualState($pid, $eid) || TBExptRestoreVirtualState($pid, $eid)) { $modifyHosed = 1; fatal("Experiment state could not be restored!"); # Never returns; } # # If the renderer was running when we started the swapmod, then we # want to restart it. If it was stopped, then the renderer info # was captured with the rest of the virtual state (restored above). # system("prerender -t $pid $eid") if ($rendering); fatal("Update aborted; old virtual state restored."); # Never returns; } # # Okay, whenever a new NS file is presented, we need to do some # checks on the firewall to make sure the user is not trying to # do something "unsafe". # if (CheckFWinfo($estate) != 0) { # All the stuff for recovering is right above, so go there. goto FWHOSED; } } # # Our next state depends on whether the experiment was active or swapped. # if ($estate eq EXPTSTATE_SWAPPED) { SetExpState($pid, $eid, EXPTSTATE_SWAPPED); } else { SetExpState($pid, $eid, EXPTSTATE_MODIFY_RESWAP); # Set the swapper now so that nodes use the proper uid. If the # swapin fails, we need to reset the swapper back so that he # is charged appropriately. TBExptSetSwapUID($pid, $eid, $dbuid); my $optarg = ""; # # For elabinelab experiments; ignore reboot/eventsys_restart, # and force noreconfig; none of it will work or make sense. # if ($elabinelab) { $optarg = "-noreconfig"; } else { $optarg = ($reboot ? "-reboot" : ""); $optarg .= ($eventsys_restart ? " -eventsys_restart" : ""); } print STDOUT "Running 'tbswap update $optarg $pid $eid'\n"; if (system("$tbdir/tbswap update $optarg $pid $eid") == 0) { # # Success. Set the state back to active cause thats where it # started. # SetExpState($pid, $eid, EXPTSTATE_ACTIVE); $estate = EXPTSTATE_ACTIVE; } else { $modifyError = $errorstat = $? >> 8; print STDOUT "Modify Error: tbswap update failed.\n"; # # tbswap either restored the experiment to the way it was, # or it swapped it out completely. In either case, it has # also restored the virtual state. # # Icky. Magic return code that says tbswap swapped it out. # We do not want tbswap to muck with states anymore, so # need to know what it did. At some point we should clean # up the exit reporting! Anyway, fatal() needs to know the # the right state to go back to (no longer ACTIVE). # if ($errorstat & 0x40) { $estate = EXPTSTATE_SWAPPED; $termswapstate = EXPTSTATE_SWAPPED; $modifySwapped = 1; # Old accounting info. TBSetExpSwapTime($pid, $eid); $modifyError = "Update aborted; experiment swapped out."; } else { $modifyError = "Update aborted; old state restored."; # Reset the swapper since the experiment is still running. TBExptSetSwapUID($pid, $eid, $last_swap_uid); } } } # # We need to rerender only if the NS file was changed (ran tbprerun), # If the swapmod succeeded, then unconditionally run the renderer. If # swap failed, then need to run the renderer only if we stopped one in # progress. # if (defined($modnsfile)) { system("prerender -t $pid $eid") if (!defined($modifyError) || $rendering); } # # Swapmod failed ... # fatal($modifyError) if (defined($modifyError)); # # Move the temporary ns file to its real name. # if (defined($modnsfile)) { unlink($nsfile); if (system("/bin/mv", "$modnsfile", "$nsfile")) { fatal("Could not mv $modnsfile to $nsfile"); } } TBExptClearBackupState($pid, $eid); system("$tbdir/tbreport -b $pid $eid 2>&1 > $repfile"); } else { # $inout eq "restart" assumed. print STDOUT "Running 'tbrestart $pid $eid'\n"; if (system("$tbdir/tbrestart $pid $eid") != 0) { fatal("tbrestart failed!"); } SetExpState($pid, $eid, EXPTSTATE_ACTIVE); } # # Try to copy off the files for testbed information gathering. # TBSaveExpLogFiles($pid, $eid); # # Make a copy of the work dir in the user visible space so the user # can see the log files. This overwrites existing files of course, # but thats okay. # system("cp -Rfp $workdir/ $userdir/tbdata/"); # # The archive gets different treatment when doing a swapmod. # if ($inout eq "modify") { # Get the new NS file into the new swapdir. if (defined($nsfile)) { system("cp -p $workdir/$nsfile $userdir/$nsfile"); if (libArchive::TBExperimentArchiveAddFile($pid, $eid, "$userdir/$nsfile") < 0) { fatal("Failed to add $userdir/$nsfile to the archive!"); } } print "Doing a commit on the previous experiment archive ...\n"; libArchive::TBExperimentArchiveSwapModCommit($pid, $eid) == 0 or fatal("Failed to commit experiment archive!"); } # # Do a SavePoint on the experiment files. # if (libArchive::TBExperimentArchiveSavePoint($pid, $eid, $tag) < 0) { fatal("Failed to do a savepoint on the experiment archive!"); } # Commit the archive after swapout if ($inout eq "out") { print "Doing a commit on the experiment archive ...\n"; libArchive::TBCommitExperimentArchive($pid, $eid, $tag) == 0 or fatal("Failed to commit experiment archive!"); } # # Gather stats. # if ($inout eq "in") { GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPIN, 0); } elsif ($inout eq "out") { GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPOUT, 0, ($idleswap ? TBDB_STATS_FLAGS_IDLESWAP() : 0)); } elsif ($inout eq "modify") { GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPMODIFY, 0); } # Old accounting info. TBSetExpSwapTime($pid, $eid); # # Set the swapper on swapout, only after stats have been gathered, and # *only* on success (since on failure nodes might still be held). # See above for swapin/swapmod. # if ($inout eq "out") { TBExptSetSwapUID($pid, $eid, $dbuid); } # # In batch mode, just exit without sending email or unlocking. The # batch daemon will take care of that. # if ($batch) { exit(0); } # # Clear the log file so the web page stops spewing. # if (defined($logname)) { TBExptCloseLogFile($pid, $eid); } # # Must unlock before exit. # TBUnLockExp($pid, $eid); # # Since the swap completed, clear the cancel flag. This must be done # after we change the experiment state (above). # TBSetCancelFlag($pid, $eid, EXPTCANCEL_CLEAR); print "Swap Success!\n"; # # Send email notification to user. # my $message = "Experiment $eid in project $pid has been "; if ($inout eq "out" && ($idleswap || $autoswap || $force) ) { $message .= "forcibly swapped out by\nEmulab"; if ($idleswap) { $message .= " because it was idle for too long (Idle-Swap).\n". "(See also the Idle-Swap info in \n". "$TBDOCBASE/docwrapper.php3?docname=swapping.html )\n"; } elsif ($autoswap) { $message .= " because it exceeded its Maximum Duration.\n". "(See also the Max. Duration info in \n". "$TBDOCBASE/docwrapper.php3?docname=swapping.html )\n"; } elsif ($force) { $message .= ". (See also our Node Usage Policies in \n". "$TBDOCBASE/docwrapper.php3?docname=swapping.html )\n"; } } else { $message .= "$action.\n"; } if ($inout eq "in") { # Add the swap settings... $message .="\nCurrent swap settings:\n$swapsettings"; } $message .= "\n". "Appended below is the output. If you have any questions or comments,\n" . "please include the output in your message to $TBOPS\n"; SENDMAIL("$user_name <$user_email>", "Experiment $pid/$eid \u$action", $message, ($idleswap ? $TBOPS : "$user_name <$user_email>"), "Cc: $expt_head_name <$expt_head_email>\n". "Bcc: $TBLOGS", (($inout eq "restart") ? ($logname) : (($repfile, $logname), (defined($nsfile) ? ($nsfile) : ())))); exit(0); sub cleanup() { # # Gather stats. # if ($inout eq "in") { GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPIN, $errorstat); } elsif ($inout eq "out") { GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPOUT, $errorstat); } elsif ($inout eq "modify") { # # If the modify fails, and the experiment is swapped out, then # insert a record for that since that is different then a modify # that fails, but results in the experiment being restored the # way it was. # GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPMODIFY, $errorstat); if ($modifySwapped) { GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPOUT, 0); } } if ($inout eq "modify") { # # Clear backup state since not needed anymore; experiment is toast. # TBExptClearBackupState($pid, $eid); # # Rollback the archive. # libArchive::TBExperimentArchiveRollBack($pid, $eid); } # # If hosed, we entirely terminate the experiment. # if ($modifyHosed) { my $stat = 0; # # Note: $estate is indeed still set appropriately! # if ($estate eq EXPTSTATE_ACTIVE) { my $optarg = "-force"; print "Running 'tbswap out $optarg $pid $eid'\n"; if (system("$tbdir/tbswap out $optarg $pid $eid") != 0) { print "tbswap out failed!\n"; $stat = $? >> 8; } SetExpState($pid, $eid, EXPTSTATE_SWAPPED); GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPOUT, $stat); } $stat = 0; print "Running 'tbend -force $pid $eid'\n"; if (system("$tbdir/tbend -force $pid $eid") != 0) { print "tbend failed!\n"; $stat = $? >> 8; } # # Okay, we are going to destroy the experiment below. # GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_TERMINATE, 0); # Must override since we are so badly hosed. $termswapstate = EXPTSTATE_TERMINATED; } # Set proper state, which is typically the way we came in. SetExpState($pid, $eid, $termswapstate); # Copy over the log files so the user can see them. system("/bin/cp -Rfp $workdir/ $userdir/tbdata"); # # In batch mode, exit without sending the email or unlocking. The # batch daemon will take care of that. # if ($batch) { return; } # # Clear the log file so the web page stops spewing. # if (defined($logname)) { TBExptCloseLogFile($pid, $eid); } # Unlock and reset state to its terminal value. TBUnLockExp($pid, $eid); # # Clear the cancel flag now that the operation is complete. Must be done # after we change the experiment state (above). # TBSetCancelFlag($pid, $eid, EXPTCANCEL_CLEAR); # # Send a message to the testbed list. Append the logfile. # SENDMAIL("$user_name <$user_email>", "Swap ${inout} Failure: $pid/$eid", "Please look at the log below to see what happened. If the error\n". "resulted from a lack of free nodes, you can use this web page to\n". "get a summary of free nodes:\n\n". " ${TBBASE}/nodecontrol_list.php3\n". "\n". "Please do not try again until you see enough nodes free. Or, you\n". "can use the batch system to swap your experiment in when enough\n". "nodes are free:\n\n". " ${TBDOCBASE}/tutorial/docwrapper.php3?docname=tutorial.html". "#BatchMode\n", ($idleswap ? $TBOPS : "$user_name <$user_email>"), "Cc: $expt_head_name <$expt_head_email>\n". "Cc: $TBOPS", (($logname), (defined($modnsfile) ? ($modnsfile) : ()))); if ($modifyHosed) { # # Copy off the workdir to the user directory, Then back up both of # them for post-mortem debugging. # system("/bin/cp -Rfp $workdir/ $userdir/tbdata"); system("/bin/rm -rf ${workdir}-failed"); system("/bin/mv -f $workdir ${workdir}-failed"); system("/bin/rm -rf ${userdir}-failed"); system("/bin/mv -f $userdir ${userdir}-failed"); TBExptDestroy($pid, $eid); } return; } # # Some firewall related stuff. There are special rules governing the # modification of a firewalled experiment, and this is the easiest place # to deal with it. We need to compare the virtual firewall info before # and after the parse, and disallow some changes. Maybe move this someplace # else at some point. # my $wasfirewalled; my $fwname; my $fwtype; my $fwstyle; sub GatherFWinfo() { $wasfirewalled = 0; my $query_result = DBQueryWarn("select fwname,type,style from virt_firewalls ". "where pid='$pid' and eid='$eid'"); return -1 if (!$query_result); # Not firewalled. return 0 if (!$query_result->numrows); $wasfirewalled = 1; ($fwname,$fwtype,$fwstyle) = $query_result->fetchrow_array(); return 0; } sub CheckFWinfo($) { my ($curstate) = @_; my $msg = ""; my $nowfirewalled = 0; my %fwstyle_mapping = ("open" => 0, "basic" => 1, "closed" => 2, "emulab" => 3); my $query_result = DBQueryWarn("select fwname,type,style from virt_firewalls ". "where pid='$pid' and eid='$eid'"); return -1 if (!$query_result); $nowfirewalled = $query_result->numrows; # Do nothing if not firewalled before or after! goto okay if (!$wasfirewalled && !$nowfirewalled); # Experiment cannot go from firewalled to not firewalled, in either the # swapped or active state. if ($wasfirewalled && !$nowfirewalled) { if (! $isadmin) { $msg = "Not allowed to turn off firewalling!"; goto noway; } if ($curstate eq EXPTSTATE_ACTIVE()) { $msg = "Not allowed to turn off firewalling while active!"; goto noway; } goto okay; } # # Make sure there is at least one other node besides the firewalled node. # Rob sez we could eat too many VLANs if we allowed this, so only admin # users, not mere users. # if (!$isadmin && $nowfirewalled) { my $virt_result = DBQueryWarn("select vname from virt_nodes ". "where pid='$pid' and eid='$eid'"); return -1 if (!$virt_result); if (! ($virt_result->numrows > $query_result->numrows)) { $msg = "Must have at least one firewalled node!"; goto noway; } } # Experiment cannot go from not firewalled to firewalled while in the # active state. We will allow this later. if (!$wasfirewalled && $nowfirewalled) { if ($curstate eq EXPTSTATE_ACTIVE()) { $msg = "Not allowed to turn on firewalling while active!"; goto noway; } goto okay; } my ($new_fwname,$new_fwtype,$new_fwstyle) = $query_result->fetchrow_array(); # Not allowed to change the name of the firewall while active. if ($curstate eq EXPTSTATE_ACTIVE() && $fwname ne $new_fwname) { $msg = "Not allowed to change the name of the firewall!"; goto noway; } # Not allowed to change the type of the firewall at all yet. if ($fwtype ne $new_fwtype) { $msg = "Not allowed to change the type of the firewall!"; goto noway; } # Dealing with the style is harder. First off, while active we do not # allow the style to be changed. if ($curstate eq EXPTSTATE_ACTIVE() && $fwstyle ne $new_fwstyle) { $msg = "Not allowed to change the style (level) of the firewall!"; goto noway; } # Okay, while experiment is swapped, can only go from less firewalled # to more firewalled. if ($curstate eq EXPTSTATE_SWAPPED() && $fwstyle ne $new_fwstyle) { if (!exists($fwstyle_mapping{$new_fwstyle})) { $msg = "Unknown firewall style (level): '$new_fwstyle'!"; goto noway; } if (!exists($fwstyle_mapping{$fwstyle})) { $msg = "Unknown firewall style (level): '$fwstyle'!"; goto noway; } if ($fwstyle_mapping{$new_fwstyle} < $fwstyle_mapping{$fwstyle}) { $msg = "Not allowed to reduce the firewall level!"; goto noway; } } okay: return 0; noway: print STDOUT "*** $msg\n"; return 1; } # # We need this END block to make sure that we clean up after a fatal # exit in the library. This is problematic, cause we could be exiting # cause the mysql server has gone whacky again. # sub fatal($) { my($mesg) = $_[0]; print "*** $0:\n"; print " $mesg\n"; print "Cleaning up and exiting with status $errorstat ...\n"; # # This exit will drop into the END block below. # exit($errorstat); } END { # Normal exit, nothing to do. if (!$? || $justexit) { return; } my $saved_exitcode = $?; tblog_find_error() if $?; if ($cleaning) { # # We are screwed; a recursive error. Someone will have to clean # up by hand. # SENDMAIL(TBOPS, "Swap ${inout} Failure: $pid/$eid", "Recursive error in cleanup! This is very bad."); $? = $saved_exitcode; return; } $cleaning = 1; cleanup(); $? = $saved_exitcode; }