Commit 7308f458 authored by Chad Barb

Robust Experiment Modify -and-

Various Other changes to get Expt Modify ready for prime time.

 - If assign fails on a modify, experiment will
   be restored to old state, *not* swapped out.

 - Reboot option has been improved to reboot all
   nodes as part of os_setup, not in separate
   step.

 - Different assign error codes result in different
   retry behavior for assign_wrapper
   (Follows Rob's change to assign, which makes it
    pass back a special code for non-retriable faults)

 - The '64' bit in the assign_wrapper exit code indicates to tbswap
   that the db/phys state had not been mucked with before
   the exit occurred
   (ergo, '65' and '1' are the common return codes,
    though the old 4, 8, 16, 32 are still there for when assign fails.)

 - (tbswap still returns codes from assign wrapper)

 - Added 5 sec pause between assign attempts.

 - Cleaned up tbswap code.

 - Physical state backup/restore removed from tbprerun,
   put into swapexp.

 - Interfaces table now getting cleaned up correctly
   (Mike noticed problem)

 - Changed menu display in showexp to show
   the "modify" menu option for swapped out experiments
   (like it used to.)

 - A couple of other changes.

Note:
 Still admin-only, but I plan to change that soon.

To do:
 - Erase expt backups in /tmp after using them.
 - Re-viz failed experiments.
parent d1697743
This diff is collapsed.
......@@ -403,7 +403,7 @@ sub fatal()
# Clear the logfile so the webpage stops.
TBExptClearLogFile($pid, $eid);
#
# Send a message to the testbed list.
#
......
......@@ -10,12 +10,13 @@ use English;
use Getopt::Std;
#
# This gets invoked from the Web interface.
# This gets invoked from the Web interface.
# Swap an experiment in, swap it out, restart or modify.
#
sub usage()
{
print STDOUT "Usage: swapexp [-i] [-r] <-s in | out | restart | modify> ".
print STDOUT "Usage: swapexp [-i] [-r] <-s in | out | restart | modify> " .
"<pid> <eid> [<nsfile>]\n";
exit(-1);
}
......@@ -40,7 +41,7 @@ my $tbdir = "$TB/bin/";
my $tbdata = "tbdata";
my $batch = 0;
my $idleswap = 0;
my $reboot = 0;
my $reboot = 0;
my $errorstat= -1;
my $inout;
......@@ -87,9 +88,9 @@ if (defined($options{"r"})) {
if (defined($options{"s"})) {
$inout = $options{"s"};
if ($inout ne "out" &&
$inout ne "in" &&
$inout ne "restart" &&
if ($inout ne "out" &&
$inout ne "in" &&
$inout ne "restart" &&
$inout ne "modify") {
usage();
}
......@@ -106,7 +107,7 @@ my $eid = $ARGV[1];
my $nsfile;
if ($inout eq "modify") {
if ($inout eq "modify") {
$nsfile = $ARGV[2];
#
......@@ -169,7 +170,7 @@ if ($UID && !TBAdmin($UID) &&
# in the process of being terminated. We use a "wrapper" state (actually
# a timestamp so we can say when termination was requested) since
# terminating consists of a couple of different experiment states down inside
# the tb scripts.
# the tb scripts.
#
DBQueryFatal("lock tables experiments write");
......@@ -194,7 +195,7 @@ if (! chdir($workdir)) {
if (defined($hashrow{'expt_locked'})) {
$val = $hashrow{'expt_locked'};
die("*** $0:\n".
" It appears that $pid/$eid went into transition at $val.\n".
" You will be notified via email when experiment transition ".
......@@ -286,7 +287,7 @@ if (! $batch) {
$logname = TBExptCreateLogFile($pid, $eid, "swapexp");
TBExptSetLogFile($pid, $eid, $logname);
TBExptOpenLogFile($pid, $eid);
if (TBBackGround($logname)) {
#
# Parent exits normally
......@@ -304,15 +305,6 @@ if ($inout ne "restart" && -e $repfile) {
unlink("$repfile");
}
#
# Rerun tbprerun if modifying.
#
if ($inout eq "modify") {
if (system("$tbdir/tbprerun -m $pid $eid $nsfile") != 0) {
fatal("tbprerun failed!\n");
}
}
#
# Sanity check states in case someone changes something.
#
......@@ -322,7 +314,7 @@ if ($inout eq "out") {
$errorstat = $? >> 8;
fatal("tbswap out failed!\n");
}
$estate = ExpState($pid,$eid);
if ($estate ne EXPTSTATE_SWAPPED) {
fatal("Experiment is in the wrong state: $estate\n");
......@@ -334,38 +326,72 @@ elsif ($inout eq "in") {
$errorstat = $? >> 8;
fatal("tbswap in failed!\n");
}
$estate = ExpState($pid,$eid);
if ($estate ne EXPTSTATE_ACTIVE) {
fatal("Experiment is in the wrong state: $estate\n");
}
system("$tbdir/tbreport -b $pid $eid 2>&1 > $repfile");
}
}
elsif ($inout eq "modify") {
my $modifyError = "";
print "Backing up old experiment state ... " . TBTimeStamp() . "\n";
if (TBExptBackupVirtualState($pid, $eid, $$)) {
fatal("*** $0:\n".
" Could not backup experiment state; cannot safely continue!\n");
}
TBExptRemoveVirtualState($pid, $eid);
#
# Rerun tbprerun if modifying.
#
if (system("$tbdir/tbprerun -m $pid $eid $nsfile") != 0) {
$modifyError = "tbprerun failed!";
}
#
# If experiment is currently swapped out, no need to do an update
# after modifying it.
#
if ($estate eq EXPTSTATE_ACTIVE) {
if (! $modifyError && $estate eq EXPTSTATE_ACTIVE) {
print STDOUT "Running 'tbswap update' with arguments: $pid $eid\n";
if (system("$tbdir/tbswap update $pid $eid") != 0) {
$errorstat = $? >> 8;
fatal("tbswap update failed!\n");
my $rebootSwitch = "";
if ($reboot) {
$rebootSwitch = "-reboot";
}
$estate = ExpState($pid, $eid);
if ($estate ne EXPTSTATE_ACTIVE) {
fatal("Experiment is in the wrong state: $estate\n");
if (system("$tbdir/tbswap update $rebootSwitch $pid $eid") != 0) {
$errorstat = $? >> 8;
$modifyError = "tbswap update failed!";
}
system("$tbdir/tbreport -b $pid $eid 2>&1 > $repfile");
if ($reboot) {
if (system("$tbdir/node_reboot -e $pid,$eid") != 0) {
fatal("node reboot failed!\n");
if (! $modifyError) {
$estate = ExpState($pid, $eid);
if ($estate ne EXPTSTATE_ACTIVE) {
fatal("Experiment is in the wrong state: $estate!");
}
}
if (! $modifyError) {
system("$tbdir/tbreport -b $pid $eid 2>&1 > $repfile");
}
}
if ($modifyError) {
print STDERR "*** $0:\n".
" $modifyError\n";
print STDOUT "Recovering experiment state...\n";
TBExptRemoveVirtualState($pid, $eid );
if (0 == TBExptRestoreVirtualState($pid, $eid, $$)) {
fatal("*** Update aborted; old state restored.\n");
} else {
# Set state to NEW so experiment will get wiped.
SetExpState($pid, $eid, EXPTSTATE_NEW);
fatal("*** Experiment state could not be restored!\n");
}
}
}
else { # $inout eq "restart" assumed.
......@@ -471,7 +497,7 @@ exit 0;
sub fatal($)
{
my($mesg) = $_[0];
print STDOUT $mesg;
#
......@@ -499,7 +525,7 @@ sub fatal($)
$hosed = 1;
}
if ($hosed) {
if ($hosed) {
#
# Note: $estate is still set to the state which the experiment was in
# when we began.
......@@ -510,7 +536,7 @@ sub fatal($)
print "tbswap out failed!\n";
}
}
print "Running tbend with arguments: -force $pid $eid\n";
if (system("$tbdir/tbend -force $pid $eid") != 0) {
print "tbend failed!\n";
......@@ -527,7 +553,7 @@ sub fatal($)
system("/bin/cp -Rfp $workdir/ $userdir/tbdata");
#
# In batch mode, exit without sending the email.
# In batch mode, exit without sending the email.
#
if ($batch) {
TBUnLockExp($pid, $eid);
......@@ -535,7 +561,7 @@ sub fatal($)
}
#
# Clear the log file so the web page stops spewing.
# Clear the log file so the web page stops spewing.
#
if (defined($logname)) {
TBExptCloseLogFile($pid, $eid);
......@@ -562,9 +588,11 @@ sub fatal($)
system("/bin/mv -f $workdir ${workdir}-failed");
system("/bin/rm -rf ${userdir}-failed");
system("/bin/mv -f $userdir ${userdir}-failed");
TBExptDestroy($pid, $eid);
TBExptDestroy($pid, $eid);
}
exit(-1);
}
......@@ -97,42 +97,22 @@ if (! $modify) {
}
#
# Cleanup if something goes wrong.
#
# Cleanup if something goes wrong.
#
sub cleanup {
if ($modify) {
print STDERR "Recovering from errors.\n";
print "Restoring old experiment state ... " . TBTimeStamp() . "\n";
if (0 == TBExptRestoreVirtualState($pid, $eid, $$)) {
print "Restoration done! " . TBTimeStamp() . "\n";
return;
} else {
print "Restoration failed... aborting! " . TBTimeStamp() . "\n";
# Fall through to full cleanup...
}
}
print STDERR "Cleaning up after errors.\n";
print "Removing experiment state ... " . TBTimeStamp() . "\n";
TBExptRemoveVirtualState($pid, $eid );
print "Removal done! " . TBTimeStamp() . "\n";
#
# If modify fails, and recovery fails,
# going to "NEW" tells swapexp to fully terminate the experiment.
#
SetExpState($pid, $eid, EXPTSTATE_NEW);
system("prerender -r $pid $eid");
}
if ($modify) {
print "Backing up old experiment state ... " . TBTimeStamp() . "\n";
if (TBExptBackupVirtualState($pid, $eid, $$)) {
die("*** $0:\n".
" Could not backup experiment state!\n");
if (! $modify) {
print STDERR "Cleaning up after errors.\n";
print "Removing experiment state ... " . TBTimeStamp() . "\n";
TBExptRemoveVirtualState($pid, $eid );
print "Removal done! " . TBTimeStamp() . "\n";
#
# If modify fails,
# going to "NEW" tells swapexp to fully terminate the experiment.
#
SetExpState($pid, $eid, EXPTSTATE_NEW);
system("prerender -r $pid $eid");
}
TBExptRemoveVirtualState($pid, $eid);
print "Backup done! " . TBTimeStamp() . "\n";
}
# This setups virt_nodes, virt_names including all IP address calculation
......
This diff is collapsed.
......@@ -111,17 +111,22 @@ if (TBExptAccessCheck($uid, $exp_pid, $exp_eid, $TB_EXPT_MODIFY)) {
}
if (ISADMIN($uid)) {
if (strcmp($expstate, $TB_EXPTSTATE_ACTIVE) == 0) {
if (strcmp($expstate, $TB_EXPTSTATE_ACTIVE) == 0 ||
strcmp($expstate, $TB_EXPTSTATE_SWAPPED) == 0) {
SUBMENUSECTION("Beta-Test Options");
WRITESUBMENUBUTTON("Restart this Experiment",
"swapexp.php3?inout=restart&pid=$exp_pid".
"&eid=$exp_eid");
if (strcmp($expstate, $TB_EXPTSTATE_ACTIVE) == 0) {
WRITESUBMENUBUTTON("Restart this Experiment",
"swapexp.php3?inout=restart&pid=$exp_pid".
"&eid=$exp_eid");
}
WRITESUBMENUBUTTON("Modify this Experiment",
"modifyexp.php3?pid=$exp_pid&eid=$exp_eid");
}
if (strcmp($expstate, $TB_EXPTSTATE_ACTIVE) == 0) {
SUBMENUSECTION("Admin Options");
WRITESUBMENUBUTTON("Send a Swap Request",
......@@ -131,7 +136,7 @@ if (ISADMIN($uid)) {
WRITESUBMENUBUTTON("Force Swap Out (Idle-Swap)",
"swapexp.php3?inout=out&force=1".
"&pid=$exp_pid&eid=$exp_eid");
SUBMENUSECTIONEND();
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment