Commit d651dd42 authored by Leigh Stoller's avatar Leigh Stoller

New "restart" or perhaps better if named "replay" mode to swapexp.

Attempts to replay an experiment by rebooting all the nodes, clearing
the various startup bits (ready, startstatus, bootstatus, portstats),
and then restarting the event system. I am dubious that this is a
workable solution because of the asynchronous nature of the testbed
(nodes happily cruise from TBRESET to ISUP and beyond without
stopping), and so it's hard to truly replicate the initial lack of
state that a freshly swapped-in experiment has. Still, people
requested it and I cheerfully provided it 'cause that's what I do;
service with a smile and not a whit of complaint. Is anyone reading
this?
parent b485a466
......@@ -12,13 +12,9 @@ use Getopt::Std;
#
# This gets invoked from the Web interface. Swap an experiment in or out.
#
# Note about exit value. -1 means error. 0 means backgrounded. 1 means
# somthing else. The web page uses this to decide what kind of message
# to give the user.
#
sub usage()
{
print STDOUT "Usage: swapexp <-s in | out> <pid> <eid>\n";
print STDOUT "Usage: swapexp <-s in | out | restart> <pid> <eid>\n";
exit(-1);
}
my $optlist = "s:";
......@@ -48,6 +44,7 @@ my $dbuid;
my $user_name;
my $user_email;
my @row;
my $action;
#
# Untaint the path
......@@ -80,13 +77,10 @@ if (@ARGV != 2) {
}
my $pid = $ARGV[0];
my $eid = $ARGV[1];
if (defined($options{"b"})) {
$batch = $options{"b"};
}
if (defined($options{"s"})) {
$inout = $options{"s"};
if ($inout ne "out" && $inout ne "in") {
if ($inout ne "out" && $inout ne "in" && $inout ne "restart") {
usage();
}
}
......@@ -136,7 +130,7 @@ if (! UserDBInfo($dbuid, \$user_name, \$user_email)) {
if ($UID && !TBAdmin($UID) &&
!TBExptAccessCheck($dbuid, $pid, $eid, TB_EXPT_DESTROY)) {
die("*** $0:\n".
" You do not have permission to end this experiment!\n");
" You do not have permission to swap this experiment!\n");
}
#
......@@ -206,6 +200,10 @@ if ($inout eq "out" && $estate eq EXPTSTATE_SWAPPED) {
die("*** $0:\n".
" It appears that experiment $pid/$eid is already swapped out!");
}
if ($inout eq "restart" && $estate ne EXPTSTATE_ACTIVE) {
die("*** $0:\n".
" It appears that experiment $pid/$eid is not active!");
}
#
# Set the timestamp now, and unlock the experiments table.
......@@ -220,6 +218,16 @@ DBQueryFatal("unlock tables");
# inconsistent state.
#
if ($inout eq "in") {
$action = "swapped in";
}
if ($inout eq "out") {
$action = "swapped out";
}
if ($inout eq "restart") {
$action = "restarted";
}
#
# Get email address of the experiment head, which may be different than
# the person who is actually terminating the experiment, since its polite
......@@ -247,10 +255,8 @@ if (! $batch) {
#
# Parent exits normally
#
print STDOUT
"Experiment $pid/$eid is now swapping $inout.\n".
"You will be notified via email when the experiment has ".
"finished swapping.\n";
print "Experiment $pid/$eid is now being $action.\n".
"You will be notified via email when the this is done.\n";
exit(0);
}
}
......@@ -258,7 +264,7 @@ if (! $batch) {
#
# Remove old report file since its contents are going to be invalid.
#
if (-e $repfile) {
if ($inout ne "restart" && -e $repfile) {
unlink("$repfile");
}
......@@ -276,7 +282,7 @@ if ($inout eq "out") {
fatal("Experiment is in the wrong state: $estate\n");
}
}
else {
elsif ($inout eq "in") {
print STDOUT "Running tbswapin with arguments: $pid $eid\n";
if (system("$tbdir/tbswapin $pid $eid") != 0) {
fatal("tbswapin failed!\n");
......@@ -289,6 +295,12 @@ else {
system("$tbdir/tbreport -b $pid $eid 2>&1 > $repfile");
}
else {
print STDOUT "Running tbrestart with arguments: $pid $eid\n";
if (system("$tbdir/tbrestart $pid $eid") != 0) {
fatal("tbrestart failed!\n");
}
}
#
# Try to copy off the files for testbed information gathering.
......@@ -307,7 +319,7 @@ system("cp -Rfp $workdir/ $userdir/tbdata");
#
TBUnLockExp($pid, $eid);
print "Swap Success\n";
print "Done!\n";
#
# In batch mode, just exit without sending email.
......@@ -327,18 +339,17 @@ if (defined($logname)) {
# Send email notification to user.
#
my $message =
"Experiment `$eid' in project `$pid' has been swapped $inout.\n\n" .
"Appended below is the output of the experiment swap${inout}. If you\n" .
"have any questions or comments, please include the output below\n" .
"in your message to $TBOPS\n";
"Experiment `$eid' in project `$pid' has been $action.\n\n" .
"Appended below is the output. If you have any questions or comments,\n" .
"please include the output in your message to $TBOPS\n";
SENDMAIL("$user_name <$user_email>",
"Experiment $pid/$eid Swapped $inout",
"Experiment $pid/$eid $action",
$message,
"$user_name <$user_email>",
"Cc: $expt_head_name <$expt_head_email>\n".
"Bcc: $TBLOGS",
($repfile, $logname));
(($inout eq "restart") ? ($logname) : ($repfile, $logname)));
exit 0;
......@@ -376,7 +387,7 @@ sub fatal($)
# Send a message to the testbed list. Append the logfile.
#
SENDMAIL("$user_name <$user_email>",
"Swap${inout} Failure: $pid/$eid",
"Swap ${inout} Failure: $pid/$eid",
$mesg,
"$user_name <$user_email>",
"Cc: $expt_head_name <$expt_head_email>\n".
......
#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2002 University of Utah and the Flux Group.
# All rights reserved.
#
# Experiment restart (replay): stop the event system, clear the port
# counters, reboot every node while clearing its ready bits, startup status
# and boot status, then start the event system again.
use English;
use Getopt::Std;
#
# Restart (replay) an active experiment in place, without swapping it out.
# swapexp runs this script when it is given "-s restart".
#
#
# Print the command line synopsis on STDOUT and terminate the script with
# exit status -1. Does not return.
#
sub usage()
{
    my $synopsis = "Usage: tbrestart <pid> <eid>\n";

    print STDOUT $synopsis;
    exit(-1);
}
# getopts() specification. Empty: this script accepts no option switches.
my $optlist = "";

#
# Configure variables
#
my $TB     = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $TBOPS  = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";

#
# When true, the stop/start of the experiment's event system is skipped.
#
# NOTE(review): this variable used to be tested without ever being declared
# or assigned, so it was always undef and the event system was always
# stopped and started. Declaring it as 0 keeps exactly that behavior.
# Presumably it is meant to come from a configure substitution like the
# variables above -- confirm and wire it up if so.
#
my $DISABLE_EVENTS = 0;
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
#
# Script-wide state.
#
my $nodereboot = "$TB/bin/node_reboot"; # Reboot helper, run once per node.
my $waitstart  = time;                  # Timestamp given to TBNodeStateWait.
my $failed     = 0;                     # Nodes that did not reach ISUP.
my $state;                              # Experiment state from ExpState.

#
# Untaint the environment: install a fixed search path, and drop the
# variables that can change how a shell behaves.
#
$ENV{'PATH'} = join(":",
		    "/bin", "/usr/bin", "/sbin", "/usr/sbin",
		    "$TB/libexec", "$TB/sbin", "$TB/bin");
foreach my $unsafe ('IFS', 'CDPATH', 'ENV', 'BASH_ENV') {
    delete $ENV{$unsafe};
}

#
# Flush the selected output handle after every write, so that progress
# messages appear as soon as they are printed.
#
$| = 1;
#
# Command line handling. getopts() consumes any option switches; what must
# be left afterwards is exactly two positional arguments, the project id
# and the experiment id. Anything else prints the usage message and exits.
#
%options = ();
usage()
    unless (getopts($optlist, \%options));
usage()
    unless (scalar(@ARGV) == 2);

my ($pid, $eid) = @ARGV;
# Nodes currently allocated to the experiment; rebooted one by one below.
my @nodes = ExpNodes($pid, $eid);

print "Beginning restart for $pid/$eid. " . TBTimeStamp() . "\n";
TBDebugTimeStamp("tbrestart started");

#
# Must be an active experiment to restart! A false return from ExpState
# is reported as "no such experiment"; any state other than
# EXPTSTATE_ACTIVE is refused.
#
if (! ($state = ExpState($pid, $eid))) {
    die("*** $0:\n".
	" No such experiment $pid/$eid\n");
}
if ($state ne EXPTSTATE_ACTIVE) {
    die("*** $0:\n".
	" Experiment must be active to be restarted!\n");
}
#
# Stop the event system. Failure here is fatal: nothing has been changed
# yet, so the experiment is left as it was.
#
if (!$DISABLE_EVENTS) {
    print "Stopping the event system.\n";
    TBDebugTimeStamp("eventsys_control started");

    # List form of system(): $pid and $eid come straight from the command
    # line, so hand them over as discrete arguments instead of pasting
    # them into a string that a shell might interpret.
    if (system("eventsys_control", "stop", $pid, $eid)) {
	die("*** $0:\n".
	    " Failed to stop the event system.\n");
    }
    TBDebugTimeStamp("eventsys_control finished");
}
#
# Clearing the portstat counters seems like a good idea.
#
print "Clearing port counters.\n";
TBDebugTimeStamp("portstats started");

# List form of system(): $pid and $eid come straight from the command line,
# so pass them as discrete arguments rather than through a shell string.
if (system("portstats", "-z", "-a", "-q", $pid, $eid)) {
    #
    # This is a non-fatal error; warn and carry on with the restart.
    #
    print STDERR "*** WARNING: Failed to clear port counters.\n";
}
TBDebugTimeStamp("portstats finished");
#
# Reboot the nodes. We reboot each one in turn, instead of as a group.
# Why? Because we need to know when the node is down so that we can
# clear/reset its state in the DB. We have no idea what the node is doing
# at this point. This is terribly imperfect of course, since there are no
# guarantees, especially since the events are async (a tbreset and isup
# could be in the event queue for a node). The ready bits present the worst
# problem.
#
# NOTE(review): if anything below dies, the event system stopped earlier
# is not started again by this script -- confirm that is acceptable.
#
print "Rebooting all nodes\n";
TBDebugTimeStamp("node reboot started");

foreach my $node ( @nodes ) {
    # List form of system(): the node name is passed as a single argument
    # and never goes through a shell.
    if (system($nodereboot, $node)) {
	die("*** $0:\n".
	    " Failed to reboot node $node!\n");
    }
    # Clears various things including ready bits.
    TBNodeBootReset($node);
}

#
# Wait for every node to report ISUP, and record the outcome as its boot
# status. A false return from TBNodeStateWait is what counts as success
# here.
#
# NOTE(review): $waitstart was sampled when the script started. Assuming
# TBNodeStateWait measures its (60*6) second limit from that timestamp, the
# time already spent stopping events and rebooting nodes serially counts
# against it -- confirm against libdb.
#
print STDOUT "Waiting for nodes to come up ...\n";

foreach my $node ( sort(@nodes) ) {
    if (! TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $waitstart, (60*6))) {
	print STDOUT "$node is alive and well\n";
	SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
	next;
    }
    SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
    $failed++;
}
TBDebugTimeStamp("node reboot finished");

# Give up only after all nodes have been checked, so every node gets a
# boot status recorded.
if ($failed) {
    die("*** $0:\n".
	" $failed nodes failed to reboot properly! \n");
}
#
# Start the event system. This is only reached once every node has come
# back up.
#
if (!$DISABLE_EVENTS) {
    print "Starting the event system.\n";
    TBDebugTimeStamp("eventsys_control started");

    # List form of system(): $pid and $eid come straight from the command
    # line, so pass them as discrete arguments rather than through a shell
    # string.
    if (system("eventsys_control", "start", $pid, $eid)) {
	die("*** $0:\n".
	    " Failed to start the event system.\n");
    }
    TBDebugTimeStamp("eventsys_control finished");
}

print "Restart finished. " . TBTimeStamp() . "\n";
TBDebugTimeStamp("tbrestart finished");
exit(0);
......@@ -43,6 +43,7 @@
<li> <a href="#UTT-7">Are the nodes in my experiment backed up
(filesaved)?</a>
<li> <a href="#UTT-Swapping">What is Swapping?</a>
<li> <a href="#UTT-Restart">What is Experiment Restart?</a>
<li> <a href="#UTT-8">How can I get switch statistics (such as packet
counts) for my experiment?</a>
<li> <a href="#UTT-Naming">What names should I use to refer to the
......@@ -603,6 +604,23 @@
as well.
</p>
<li><a NAME="UTT-Restart"></a>
<h3>What is Experiment Restart?</h3>
<p>
Experiment restart (or perhaps more aptly, replay) allows you to
rerun your experiment from scratch, but without the added expense
of a swapin and swapout. In other words, the nodes that are
currently allocated to your experiment are all rebooted, and the
experiment startup state is cleared. This includes the
<a href="#SWS-6">ready bits</a>, the boot status in the web page,
and the <a href="#SWS-4">startup command status</a>. In addition,
the event scheduler for the experiment is restarted, and your event
sequence is replayed again. Note that your rpms and tarfiles are
<b>not</b> installed again. Replay is obviously faster than
swapout/swapin, and has the added benefit that you will not run
the risk of not being able to swapin for lack of available nodes.
</p>
<li><a NAME="UTT-8"></a>
<h3>How can I get switch statistics (such as packet counts) for my
experiment?</h3>
......
......@@ -9,7 +9,7 @@ include("defs.php3");
#
# Standard Testbed Header
#
PAGEHEADER("Swap an Experiment");
PAGEHEADER("Swap/Restart an Experiment");
#
# Only known and logged in users can end experiments.
......@@ -22,22 +22,33 @@ LOGGEDINORDIE($uid);
#
if (!isset($pid) ||
strcmp($pid, "") == 0) {
USERERROR("The project ID was not provided!", 1);
USERERROR("The project ID was not provided!", 1);
}
if (!isset($eid) ||
strcmp($eid, "") == 0) {
USERERROR("The experiment ID was not provided!", 1);
USERERROR("The experiment ID was not provided!", 1);
}
if (!isset($inout) ||
(strcmp($inout, "in") && strcmp($inout, "out"))) {
USERERROR("The swap direction must be either in or out!", 1);
(strcmp($inout, "in") && strcmp($inout, "out") &&
strcmp($inout, "restart"))) {
USERERROR("The argument must be either in, out, or restart!", 1);
}
$exp_eid = $eid;
$exp_pid = $pid;
if (!strcmp($inout, "in")) {
$action = "swapin";
}
elseif (!strcmp($inout, "out")) {
$action = "swapout";
}
elseif (!strcmp($inout, "restart")) {
$action = "restart";
}
#
# Check to make sure thats this is a valid PID/EID tuple.
#
......@@ -45,8 +56,8 @@ $query_result =
DBQueryFatal("SELECT * FROM experiments WHERE ".
"eid='$exp_eid' and pid='$exp_pid'");
if (mysql_num_rows($query_result) == 0) {
USERERROR("The experiment $exp_eid is not a valid experiment ".
"in project $exp_pid.", 1);
USERERROR("The experiment $exp_eid is not a valid experiment ".
"in project $exp_pid.", 1);
}
$row = mysql_fetch_array($query_result);
$exp_gid = $row[gid];
......@@ -69,7 +80,7 @@ if ($expt_locked) {
# Verify permissions.
#
if (! TBExptAccessCheck($uid, $exp_pid, $exp_eid, $TB_EXPT_MODIFY)) {
USERERROR("You do not have permission to swap experiment $exp_eid!", 1);
USERERROR("You do not have permission for $exp_eid!", 1);
}
#
......@@ -80,7 +91,7 @@ if (! TBExptAccessCheck($uid, $exp_pid, $exp_eid, $TB_EXPT_MODIFY)) {
#
if ($canceled) {
echo "<center><h2><br>
Experiment swap$inout canceled!
Experiment $action canceled!
</h2></center>\n";
PAGEFOOTER();
......@@ -89,7 +100,7 @@ if ($canceled) {
if (!$confirmed) {
echo "<center><h2><br>
Are you sure you want to swap$inout experiment '$exp_eid?'
Are you sure you want to $action experiment '$exp_eid?'
</h2>\n";
echo "<form action='swapexp.php3?inout=$inout&pid=$exp_pid&eid=$exp_eid'
......@@ -97,9 +108,17 @@ if (!$confirmed) {
echo "<b><input type=submit name=confirmed value=Confirm></b>\n";
echo "<b><input type=submit name=canceled value=Cancel></b>\n";
echo "</form>\n";
echo "<p>
<a href='$TBDOCBASE/faq.php3#UTT-Swapping'>
(Information on experiment swapping)</a>\n";
if (!strcmp($inout, "restart")) {
echo "<p>
<a href='$TBDOCBASE/faq.php3#UTT-Restart'>
(Information on experiment restart)</a>\n";
}
else {
echo "<p>
<a href='$TBDOCBASE/faq.php3#UTT-Swapping'>
(Information on experiment swapping)</a>\n";
}
echo "</center>\n";
PAGEFOOTER();
......@@ -119,7 +138,7 @@ TBGroupUnixInfo($exp_pid, $exp_gid, $unix_gid, $unix_name);
# tbstopit <pid> <eid>
#
echo "<center><br>";
echo "<h2>Starting experiment swap$inout. Please wait a moment ...
echo "<h2>Starting experiment $action. Please wait a moment ...
</h2></center>";
flush();
......@@ -141,7 +160,7 @@ $result = exec("$TBSUEXEC_PATH $uid $unix_gid ".
if ($retval) {
echo "<br><br><h2>
Swap Failure($retval): Output as follows:
$action failure($retval): Output as follows:
</h2>
<br>
<XMP>\n";
......@@ -167,15 +186,15 @@ if ($retval == 0) {
echo "Experiment
<a href='showexp.php3?pid=$exp_pid&eid=$exp_eid'>$exp_eid</a>
in project <A href='showproject.php3?pid=$exp_pid'>$exp_pid</A>
is swapping $inout.
has started its $action.
<br><br>
You will be notified via email when the experiment has finished
swapping. This typically takes $howlong minutes, depending on the
You will be notified via email when the operation is complete.
This typically takes $howlong minutes, depending on the
number of nodes in the experiment.
If you do not receive email notification within a reasonable amount
of time, please contact $TBMAILADDR.
<br><br>
While you are waiting, you can watch the log of experiment swap
While you are waiting, you can watch the log
in <a target=_blank href=spewlogfile.php3?pid=$exp_pid&eid=$exp_eid>
realtime</a>.\n";
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment