Commit 7308f458 authored by Chad Barb's avatar Chad Barb
Browse files

Robust Experiment Modify -and-

Various Other changes to get Expt Modify ready for prime time.

 - If assign fails on a modify, experiment will
   be restored to old state, *not* swapped out.

 - Reboot option has been improved to reboot all
   nodes as part of os_setup, not in separate
   step.

 - Different assign error codes result in different
   retry behavior for assign_wrapper
   (Follow's Rob's change to assign to make it
    pass back special code for non-retriable faults)

 - '64' bit in assign_wrapper exit code indicates to tbswap
   that db/phys state hadn't been mucked with before
   the exit occurred
   (ergo, '65' and '1' are the common return codes,
    though the old 4,8,16,32 are still there for assign failing.)

 - (tbswap still returns codes from assign wrapper)

 - Added 5 sec pause between assign attempts.

 - Cleaned up tbswap code.

 - Physical state backup/restore removed from tbprerun,
   put into swapexp.

 - Interfaces table now getting cleaned up correctly
   (Mike noticed problem)

 - Changed menu display in showexp to show
   the "modify" menu option for swapped out experiments
   (like it used to.)

 - A couple other changes.

Note:
 Still admin-only, but I plan to change that soon.

To do:
 - Erase expt backups in /tmp after using them.
 - Re-viz failed experiments.
parent d1697743
......@@ -15,8 +15,22 @@
# Syntax: assign_wrapper <pid> <eid>
#
# Return code:
#
# 0 - success
# 1+ - error (Add other values:)
# 4 - bandwidth violation
# 8 - linkusers violation
# 16 - desires violation
# 32 - unassigned
# 64 - Set to indicate 'recoverability'
# (E.g., no db or physical state was modified
# by the time the error occurred.)
#
# Caveats:
# The support for direct and interswitch links has not been testbed much.
# The support for direct and interswitch links has not been tested much.
# Settings
# delaythresh is the maximum delay in ms above which a delay node is needed.
......@@ -78,6 +92,14 @@ sub usage {
exit(-1);
}
sub fatal($$) {
my $exitcode = shift;
my $message = shift;
$message =~ s/\n$//;
print STDERR "$message\n";
exit($exitcode);
}
my $verbose = 0;
my $updating = 0;
......@@ -549,8 +571,8 @@ while (($name,$weight,$types) = $result->fetchrow_array) {
# Open the TOP file
$topfile = "$eid.top";
open(TOPFILE,"> $topfile") || do {
die("*** $0:\n".
" Could not open $topfile.\n");
fatal(65,"*** $0:\n".
" Could not open $topfile.\n");
};
# Print out vtypes
......@@ -611,8 +633,8 @@ foreach $lan (keys(%lans)) {
# Hmm, no emulated lans for now.
#
if ($emulated && @members > 2) {
die("*** $0:\n".
" Emulated lans ($lan) not allowed yet! Only links.\n");
fatal(65,"*** $0:\n".
" Emulated lans ($lan) not allowed yet! Only links.\n");
}
if ($simnodes > 0 && $realnodes == 0 && $sharks == 0) {
$trivial_ok = 1;
......@@ -846,8 +868,8 @@ foreach $lan (keys(%lans)) {
# " m.pid!='$DEADPID' or m.eid!='$DEADEID'))");
if( scalar(@simnodelist) > 0 ) {
open(AVAIL,"$TBROOT/sbin/avail type=pc rand |")
or die "*** $0:\n".
" avail failed\n";
or fatal(65, "*** $0:\n".
" avail failed\n");
my $num = 0;
while (<AVAIL>) {
......@@ -860,8 +882,7 @@ if( scalar(@simnodelist) > 0 ) {
close(AVAIL);
if( $num == 0 ) {
print STDERR "$0: *** Insufficient PCs available.\n";
exit(2);
fatal(65, "$0: *** Insufficient PCs available.\n");
}
foreach $simnode (@simnodelist) {
......@@ -951,8 +972,7 @@ while (1) {
$numnodes = $numnodes_result->numrows;
if ($numnodes < $minimum_nodes) {
print STDERR "$0: *** Insufficient nodes available.\n";
exit(2);
fatal(65, "$0: *** Insufficient nodes available.\n");
}
TBDebugTimeStamp("assign started");
......@@ -962,31 +982,40 @@ while (1) {
$cmdargs = "-p $cmdargs"
if ($virtcount);
print "assign $cmdargs\n";
open(ASSIGNFP,"assign $cmdargs | tee assign.log |");
if (-1 == system "assign $cmdargs > assign.log") {
fatal(65, "*** Couldn't run assign!");
}
$violations = 0;
$score = -1;
$assignexitcode = $? >> 8;
# read output
# Header
printdb "Reading assign results.\n";
while (<ASSIGNFP>) {
chop;
/No physical nodes of type (.+)$/ && do {
$score=-2;
print $_ . "\n";
};
/^With ([0-9]+) violations$/ && do {
$violations = $1;
last;
};
/^[ \t]+BEST SCORE: [ \t]+([0-9]+(\.[0-9]+)?)/ && ($score=$1);
}
if ($score == -2) {
# Type error
print "Giving up.\n";
exit(2);
open(ASSIGNFP, "assign.log")
or fatal(65, "*** Couldn't open assign logfile!");
if ($assignexitcode == 0)
{
# read output
# Header
printdb "Reading assign results.\n";
while (<ASSIGNFP>) {
chop;
/No physical nodes of type (.+)$/ && do {
$score=-2;
print $_ . "\n";
};
/^With ([0-9]+) violations$/ && do {
$violations = $1;
last;
};
/^[ \t]+BEST SCORE: [ \t]+([0-9]+(\.[0-9]+)?)/ && ($score=$1);
}
if ($score == -2) {
# Type error
fatal(65, "Giving up.\n" );
}
printdb "Found score $score, violations $violations.\n";
}
printdb "Found score $score, violations $violations.\n";
# We don't bother reading anything else if violations occured.
if (($violations == 0) && ($score != -1)) {
......@@ -1046,7 +1075,7 @@ while (1) {
# read Edges
# By convention, in plinks, the delay node is always the second
# entry.
while (<ASSIGNFP> !~ /^Edges:/) {}
while (<ASSIGNFP> !~ /^Edges:/) { }
printdb "Edges:\n";
EDGEWHILE: while (<ASSIGNFP>) {
/^End Edges$/ && last EDGEWHILE;
......@@ -1063,8 +1092,8 @@ while (1) {
last SWITCH1;
};
/^direct$/ && do {
die("*** $0:\n".
" Unsupported link type: direct.\n");
fatal(65, "*** $0:\n".
" Unsupported link type: direct.\n");
};
/^trivial$/ && do {
# we don't have plinks for trivial links
......@@ -1083,7 +1112,7 @@ while (1) {
}
} else {
# spit out up to nodes
print "ASSIGN FAILED:\n";
print "ASSIGN FAILED: \n";
while (<ASSIGNFP>) {
if (/link_users:\s*(\d+)$/) {
$linkusers = $1;
......@@ -1103,9 +1132,10 @@ while (1) {
close(ASSIGNFP);
TBDebugTimeStamp("assign finished");
TBDebugTimeStamp("reserving started");
# Reserve resources
if (!$fail) {
TBDebugTimeStamp("reserving started");
if (system("nalloc $pid $eid " . join(" ", keys(%toreserve)))) {
print "Failed to reserve nodes. Trying again.\n";
} else {
......@@ -1145,11 +1175,11 @@ while (1) {
}
# Check for exit
if ($currentrun >= $maxrun) {
$exitcode = 1;
if ($assignexitcode == 2 || $currentrun >= $maxrun) {
$exitcode = 65;
if ($bandwidth > 0) {
$exitcode += 4;
}
}
if ($linkusers > 0) {
$exitcode += 8;
}
......@@ -1159,10 +1189,18 @@ while (1) {
if ($unassigned > 0) {
$exitcode += 32;
}
print "*** $0:\n".
" Reached run limit. Giving up. Exitcode: $exitcode.\n";
exit($exitcode);
if ($assignexitcode == 2) {
fatal($exitcode, "*** $0:\n".
" Unretriable error. Giving up.\n");
} else {
fatal($exitcode, "*** $0:\n".
" Reached run limit. Giving up.\n");
}
}
print "Waiting 5 seconds and trying again...\n";
sleep(5);
$currentrun++;
}
......@@ -1199,8 +1237,8 @@ if (scalar(keys(%isremotenode))) {
if ($doremote) {
print "Running 'wanassign -d $pid $eid'\n";
open(WANFP,"wanassign -d $pid $eid 2>&1 | tee wanassign.log |") or
die("*** $0:\n".
" Failed to start wanassign: $!\n");
fatal(65, "*** $0:\n".
" Failed to start wanassign: $!\n");
printdb "Reading wanassign results.\n";
while (<WANFP>) {
......@@ -1215,13 +1253,13 @@ if (scalar(keys(%isremotenode))) {
# Skip other output. Usually its debugging output.
}
close(WANFP) or
die("*** $0:\n".
" wanassign: " . $? ? "exited with status: $?.\n" :
fatal(65, "*** $0:\n".
" wanassign: " . $? ? "exited with status: $?.\n" :
"error closing pipe: $!\n");
if (!$success) {
die("*** $0:\n".
" wanassign could not find a solution!\n");
fatal(65,"*** $0:\n".
" wanassign could not find a solution!\n");
}
foreach my $virtual (keys(%v2vmap)) {
my $physical = $v2vmap{$virtual};
......@@ -1243,6 +1281,38 @@ if (scalar(keys(%isremotenode))) {
}
}
#
# Recoverability ends.
# (All fatal() calls from this point do not have
# the recoverable '64' bit set.)
#
#
# For update, wipe old interfaces in DB (normally done by nfree.)
# These will get rebuilt soon.
#
if ($updating) {
foreach $node (keys(%alreadyAllocated)) {
my $result = DBQueryFatal("select nt.control_net ".
"from nodes as n ".
"left join node_types as nt ".
"on nt.type=n.type ".
"where n.node_id='$node'");
my ($control) = $result->fetchrow_array();
my $pred = "1";
if (defined $control) {
$pred = "(card != '$control')";
}
DBQueryFatal("update interfaces set IP='' " .
"where node_id='$node' and $pred");
}
}
#
# VIRTNODES HACK: Local virtnodes have to be mapped now. This is a little
# hokey in that the virtnodes just need to be allocated from the pool that
......@@ -1260,8 +1330,8 @@ foreach my $pnode (keys(%virtnodes)) {
my $num = @vlist;
open(AVAIL,"$TBROOT/sbin/avail virtonly=$pnode rand limit=$num |")
or die("*** $0:\n".
" avail failed\n");
or fatal(1, "*** $0:\n".
" avail failed\n");
while (<AVAIL>) {
next
......@@ -1275,9 +1345,8 @@ foreach my $pnode (keys(%virtnodes)) {
close(AVAIL);
if (scalar(@vlist) != scalar(@plist)) {
print STDERR "*** $0:\n".
"Could not map some virtual nodes on $pnode\n";
exit(2);
fatal(1, "*** $0:\n".
"Could not map some virtual nodes on $pnode\n");
}
#
......@@ -1286,8 +1355,8 @@ foreach my $pnode (keys(%virtnodes)) {
#
print "Reserving @plist ...\n";
if (system("nalloc $pid $eid @plist")) {
die("*** $0:\n".
"Failed to reserve @plist (on $pnode)\n");
fatal(1, "*** $0:\n".
"Failed to reserve @plist (on $pnode)\n");
}
while (@plist) {
my $physical = pop(@plist);
......@@ -2030,8 +2099,8 @@ foreach $lan (keys(%rnodelans)) {
my @members = @{$tunnels{$lan}};
if (@members != 2) {
die("*** $0:\n".
" Too many members for a tunnel!\n");
fatal(1, "*** $0:\n".
" Too many members for a tunnel!\n");
}
my $server = $lantotunnelserver{$lan};
my $ipport = $rnodetotunnelport{$server};
......@@ -2073,8 +2142,8 @@ foreach $lan (keys(%rnodelans)) {
# physical node.
#
if (! TBControlNetIP($peervvnode, \$ip)) {
die("*** $0:\n".
" No Control Network IP for $peervvnode!\n");
fatal(1, "*** $0:\n".
" No Control Network IP for $peervvnode!\n");
}
printdb " vnode:$vnode port:$ipport isserver:$isserver peer_ip:$ip ".
......@@ -2288,8 +2357,8 @@ sub InitPnode($pnode, $vnode)
#
elsif (! ($osid = TBOSID($pid, $osname)) &&
! ($osid = TBOSID(TB_OPSPID, $osname))) {
die("*** $0:\n".
" Invalid OS $osname in project $pid!\n");
fatal(1, "*** $0:\n".
" Invalid OS $osname in project $pid!\n");
}
DBQueryFatal("UPDATE nodes set def_boot_cmd_line='$cmdline'," .
" startstatus='none'," .
......@@ -2449,8 +2518,8 @@ sub nextipportnum($) {
my $port = $portnext{$pnode};
if ($port >= $porthigh{$pnode}) {
die("*** $0:\n".
" No more dynamic ports available for $pnode!\n");
fatal(1, "*** $0:\n".
" No more dynamic ports available for $pnode!\n");
}
$portnext{$pnode}++;
return $port;
......
......@@ -403,7 +403,7 @@ sub fatal()
# Clear the logfile so the webpage stops.
TBExptClearLogFile($pid, $eid);
#
# Send a message to the testbed list.
#
......
......@@ -10,12 +10,13 @@ use English;
use Getopt::Std;
#
# This gets invoked from the Web interface.
# This gets invoked from the Web interface.
# Swap an experiment in, swap it out, restart or modify.
#
sub usage()
{
print STDOUT "Usage: swapexp [-i] [-r] <-s in | out | restart | modify> ".
print STDOUT "Usage: swapexp [-i] [-r] <-s in | out | restart | modify> " .
"<pid> <eid> [<nsfile>]\n";
exit(-1);
}
......@@ -40,7 +41,7 @@ my $tbdir = "$TB/bin/";
my $tbdata = "tbdata";
my $batch = 0;
my $idleswap = 0;
my $reboot = 0;
my $reboot = 0;
my $errorstat= -1;
my $inout;
......@@ -87,9 +88,9 @@ if (defined($options{"r"})) {
if (defined($options{"s"})) {
$inout = $options{"s"};
if ($inout ne "out" &&
$inout ne "in" &&
$inout ne "restart" &&
if ($inout ne "out" &&
$inout ne "in" &&
$inout ne "restart" &&
$inout ne "modify") {
usage();
}
......@@ -106,7 +107,7 @@ my $eid = $ARGV[1];
my $nsfile;
if ($inout eq "modify") {
if ($inout eq "modify") {
$nsfile = $ARGV[2];
#
......@@ -169,7 +170,7 @@ if ($UID && !TBAdmin($UID) &&
# in the process of being terminated. We use a "wrapper" state (actually
# a timestamp so we can say when termination was requested) since
# terminating consists of a couple of different experiment states down inside
# the tb scripts.
# the tb scripts.
#
DBQueryFatal("lock tables experiments write");
......@@ -194,7 +195,7 @@ if (! chdir($workdir)) {
if (defined($hashrow{'expt_locked'})) {
$val = $hashrow{'expt_locked'};
die("*** $0:\n".
" It appears that $pid/$eid went into transition at $val.\n".
" You will be notified via email when experiment transition ".
......@@ -286,7 +287,7 @@ if (! $batch) {
$logname = TBExptCreateLogFile($pid, $eid, "swapexp");
TBExptSetLogFile($pid, $eid, $logname);
TBExptOpenLogFile($pid, $eid);
if (TBBackGround($logname)) {
#
# Parent exits normally
......@@ -304,15 +305,6 @@ if ($inout ne "restart" && -e $repfile) {
unlink("$repfile");
}
#
# Rerun tbprerun if modifying.
#
if ($inout eq "modify") {
if (system("$tbdir/tbprerun -m $pid $eid $nsfile") != 0) {
fatal("tbprerun failed!\n");
}
}
#
# Sanity check states in case someone changes something.
#
......@@ -322,7 +314,7 @@ if ($inout eq "out") {
$errorstat = $? >> 8;
fatal("tbswap out failed!\n");
}
$estate = ExpState($pid,$eid);
if ($estate ne EXPTSTATE_SWAPPED) {
fatal("Experiment is in the wrong state: $estate\n");
......@@ -334,38 +326,72 @@ elsif ($inout eq "in") {
$errorstat = $? >> 8;
fatal("tbswap in failed!\n");
}
$estate = ExpState($pid,$eid);
if ($estate ne EXPTSTATE_ACTIVE) {
fatal("Experiment is in the wrong state: $estate\n");
}
system("$tbdir/tbreport -b $pid $eid 2>&1 > $repfile");
}
}
elsif ($inout eq "modify") {
my $modifyError = "";
print "Backing up old experiment state ... " . TBTimeStamp() . "\n";
if (TBExptBackupVirtualState($pid, $eid, $$)) {
fatal("*** $0:\n".
" Could not backup experiment state; cannot safely continue!\n");
}
TBExptRemoveVirtualState($pid, $eid);
#
# Rerun tbprerun if modifying.
#
if (system("$tbdir/tbprerun -m $pid $eid $nsfile") != 0) {
$modifyError = "tbprerun failed!";
}
#
# If experiment is currently swapped out, no need to do an update
# after modifying it.
#
if ($estate eq EXPTSTATE_ACTIVE) {
if (! $modifyError && $estate eq EXPTSTATE_ACTIVE) {
print STDOUT "Running 'tbswap update' with arguments: $pid $eid\n";
if (system("$tbdir/tbswap update $pid $eid") != 0) {
$errorstat = $? >> 8;
fatal("tbswap update failed!\n");
my $rebootSwitch = "";
if ($reboot) {
$rebootSwitch = "-reboot";
}
$estate = ExpState($pid, $eid);
if ($estate ne EXPTSTATE_ACTIVE) {
fatal("Experiment is in the wrong state: $estate\n");
if (system("$tbdir/tbswap update $rebootSwitch $pid $eid") != 0) {
$errorstat = $? >> 8;
$modifyError = "tbswap update failed!";
}
system("$tbdir/tbreport -b $pid $eid 2>&1 > $repfile");
if ($reboot) {
if (system("$tbdir/node_reboot -e $pid,$eid") != 0) {
fatal("node reboot failed!\n");
if (! $modifyError) {
$estate = ExpState($pid, $eid);
if ($estate ne EXPTSTATE_ACTIVE) {
fatal("Experiment is in the wrong state: $estate!");
}
}
if (! $modifyError) {
system("$tbdir/tbreport -b $pid $eid 2>&1 > $repfile");
}
}
if ($modifyError) {
print STDERR "*** $0:\n".
" $modifyError\n";
print STDOUT "Recovering experiment state...\n";
TBExptRemoveVirtualState($pid, $eid );
if (0 == TBExptRestoreVirtualState($pid, $eid, $$)) {
fatal("*** Update aborted; old state restored.\n");
} else {
# Set state to NEW so experiment will get wiped.
SetExpState($pid, $eid, EXPTSTATE_NEW);
fatal("*** Experiment state could not be restored!\n");
}
}
}
else { # $inout eq "restart" assumed.
......@@ -471,7 +497,7 @@ exit 0;
sub fatal($)
{
my($mesg) = $_[0];
print STDOUT $mesg;
#
......@@ -499,7 +525,7 @@ sub fatal($)
$hosed = 1;
}
if ($hosed) {
if ($hosed) {
#
# Note: $estate is still set to the state which the experiment was in
# when we began.
......@@ -510,7 +536,7 @@ sub fatal($)
print "tbswap out failed!\n";
}
}
print "Running tbend with arguments: -force $pid $eid\n";
if (system("$tbdir/tbend -force $pid $eid") != 0) {
print "tbend failed!\n";
......@@ -527,7 +553,7 @@ sub fatal($)
system("/bin/cp -Rfp $workdir/ $userdir/tbdata");
#
# In batch mode, exit without sending the email.
# In batch mode, exit without sending the email.
#
if ($batch) {
TBUnLockExp($pid, $eid);
......@@ -535,7 +561,7 @@ sub fatal($)
}
#
# Clear the log file so the web page stops spewing.
# Clear the log file so the web page stops spewing.
#
if (defined($logname)) {
TBExptCloseLogFile($pid, $eid);
......@@ -562,9 +588,11 @@ sub fatal($)
system("/bin/mv -f $workdir ${workdir}-failed");
system("/bin/rm -rf ${userdir}-failed");
system("/bin/mv -f $userdir ${userdir}-failed");
TBExptDestroy($pid, $eid);
TBExptDestroy($pid, $eid);
}
exit(-1);
}
......@@ -97,42 +97,22 @@ if (! $modify) {
}
#
# Cleanup if something goes wrong.
#
# Cleanup if something goes wrong.
#
sub cleanup {
if ($modify) {
print STDERR "Recovering from errors.\n";
print "Restoring old experiment state ... " . TBTimeStamp() . "\n";
if (0 == TBExptRestoreVirtualState($pid, $eid, $$)) {