Commit 32b16d6e authored by Leigh B. Stoller's avatar Leigh B. Stoller
Browse files

A set of fixes and updates to batchmode, which had suffered a fair
degree of bit rot. Updated the create batch web page to do
local/remote NS file spec. Switched to libdb interface in the batch
mode scripts. Removed estimated pc/shark slots since we don't support
that anymore, and because the new parser returns proper exit status
when not enough nodes are available. The DB will be updated at that
point, and we can incorporate that info (have not done so yet).
Incorporated node boot status into isexpdone(), since a failed node
will never report startcommand exit status. Clean up email code so
that email goes to user/tblogs/tbops in the proper situations.
parent e439b07b
......@@ -25,6 +25,13 @@ my $DBNAME = "@TBDBNAME@";
my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
#
# Ug, exit value from startexp when not enough nodes.
#
......@@ -56,12 +63,6 @@ my $user_email = "$TBOPS";
#
$| = 1;
#
# Testbed Support library
#
push(@INC, "$TB/lib");
require libtestbed;
#
# Untaint the path
#
......@@ -88,17 +89,11 @@ if (! $debug) {
daemonize();
}
#
# Set up for querying the database.
#
use Mysql;
my $DB = Mysql->connect("localhost", $DBNAME, "script", "none");
#
# Loop, looking for batch experiments that want to run.
#
while (1) {
my($count, $i);
my($count, $i, $query_result, $pending_result, $running_result);
my(%row, %pending_row);
#
......@@ -117,7 +112,7 @@ while (1) {
# through in a "least recently attempted" manner.
#
$query_result =
DBquery("lock tables batch_experiments write");
DBQuery("lock tables batch_experiments write");
if (! $query_result) {
print "DB Error locking tables. Waiting a bit ...\n";
sleep(10);
......@@ -125,25 +120,25 @@ while (1) {
}
$pending_result =
DBquery("SELECT * FROM batch_experiments ".
DBQuery("SELECT * FROM batch_experiments ".
"WHERE status='new' and canceled=0 and (attempts=0 or ".
"((UNIX_TIMESTAMP() - UNIX_TIMESTAMP(started) > (60 * 10)))) ".
"ORDER BY started LIMIT 1");
$running_result =
DBquery("SELECT * FROM batch_experiments ".
DBQuery("SELECT * FROM batch_experiments ".
"WHERE status='running' ORDER BY started");
if (!$pending_result || !$running_result) {
print "DB Error getting batch info. Waiting a bit ...\n";
DBquery("unlock tables");
DBQuery("unlock tables");
sleep(10);
next;
}
if (!$pending_result->numrows && !$running_result->numrows) {
DBquery("unlock tables");
DBQuery("unlock tables");
sleep(10);
next;
}
......@@ -162,17 +157,17 @@ while (1) {
my $now = `date '+20%y-%m-%d %H:%M:%S'`;
$query_result =
DBquery("update batch_experiments set status='configuring', ".
DBQuery("update batch_experiments set status='configuring', ".
"started='$now' where eid='$eid' and pid='$pid'");
if (! $query_result) {
print "DB error setting batch $pid/$eid to configuring.\n";
DBquery("unlock tables");
DBQuery("unlock tables");
sleep(10);
next;
}
}
DBquery("unlock tables");
DBQueryWarn("unlock tables");
#
# Okay, first we check the status of running batch mode experiments
......@@ -185,10 +180,7 @@ while (1) {
# loop instead of in the child that started the experiment, it's so that
# we fire up again and look for them in the event that paper goes down.
#
$count = $running_result->numrows;
for ($i = 0; $i < $count; $i++) {
%row = $running_result->fetchhash();
while (%row = $running_result->fetchhash()) {
my $canceled = $row{'canceled'};
if ($canceled) {
dosomething("cancel", %row);
......@@ -216,7 +208,7 @@ sub dosomething($$)
{
my($dowhat) = shift;
my(%exphash) = @_;
my($uid, $gid, $row);
my($uid, $gid, $row, $query_result);
# Global vars
$eid = $exphash{'eid'};
......@@ -259,13 +251,6 @@ sub dosomething($$)
}
openlog($logname);
#
# Form a new connection to the DB since we are in the child. Not sure
# if this happens as a result of the fork, but lets be sure.
#
undef($DB);
$DB = Mysql->connect("localhost", $DBNAME, "script", "none");
my $creator = $exphash{'creator_uid'};
my $longname = $exphash{'name'};
......@@ -277,11 +262,10 @@ sub dosomething($$)
# Get some user information.
#
$query_result =
$DB->query("SELECT usr_name,usr_email from users ".
"WHERE uid='$creator'");
DBQueryFatal("SELECT usr_name,usr_email from users ".
"WHERE uid='$creator'");
if (! $query_result ||
$query_result->numrows != 1) {
if ($query_result->numrows != 1) {
fatal("DB Error getting user information for uid $creator\n");
}
@row = $query_result->fetchrow_array();
......@@ -325,7 +309,7 @@ sub dosomething($$)
sub startexp($)
{
my(%exphash) = @_;
my($exit_status, $running);
my($exit_status, $running, $query_result);
my $creator = $exphash{'creator_uid'};
my $longname = $exphash{'name'};
......@@ -336,14 +320,11 @@ sub startexp($)
#
my $rightnow = `date '+20%y-%m-%d %H:%M:%S'`;
$query_result =
DBquery("insert into experiments ".
"(eid, pid, expt_created, expt_name, ".
"expt_head_uid, expt_start, state, batchmode) ".
"VALUES ('$eid', '$pid', '$rightnow', '$longname', ".
"'$creator', '$rightnow', 'new', 1)");
if (! $query_result) {
fatal("DB error inserting experiment record. Quitting ...\n");
}
DBQueryFatal("insert into experiments ".
"(eid, pid, expt_created, expt_name, ".
"expt_head_uid, expt_start, state, batchmode) ".
"VALUES ('$eid', '$pid', '$rightnow', '$longname', ".
"'$creator', '$rightnow', 'new', 1)");
#
# Try to start the experiment. If it fails, the experiment is gone.
......@@ -360,8 +341,8 @@ sub startexp($)
# we can pick up the cancelation later.
#
$query_result =
DBquery("select canceled from batch_experiments ".
"where eid='$eid' and pid='$pid'");
DBQueryWarn("select canceled from batch_experiments ".
"where eid='$eid' and pid='$pid'");
if ($query_result) {
@row = $query_result->fetchrow_array();
......@@ -388,8 +369,8 @@ sub startexp($)
# XXX - What if this update fails?
#
$query_result =
DBquery("update batch_experiments set status='new', ".
"attempts=attempts+1 where eid='$eid' and pid='$pid'");
DBQueryWarn("update batch_experiments set status='new', ".
"attempts=attempts+1 where eid='$eid' and pid='$pid'");
$attempts++;
if (($exit_status == $TOOFEWNODES && $attempts >= 9 &&
......@@ -397,9 +378,10 @@ sub startexp($)
(($exit_status != $TOOFEWNODES) && ($attempts % 5) == 0) ||
($attempts == 0)) {
fatal("Could not configure Batch Mode experiment $pid/$eid\n".
"There have been $attempts attempts made to start this ".
"batch\n");
email_status("Could not configure Batch Mode experiment ".
"$pid/$eid\n".
"There have been $attempts attempts made to start ".
"this batch\n");
}
exit($exit_status);
}
......@@ -410,8 +392,8 @@ sub startexp($)
# XXX - What if this update fails?
#
$query_result =
DBquery("update batch_experiments set status='running' ".
"where eid='$eid' and pid='$pid'");
DBQueryWarn("update batch_experiments set status='running' ".
"where eid='$eid' and pid='$pid'");
email_status("Batch Mode experiment $pid/$eid is now running!\n".
"Please consult the Web interface to see how it is doing\n");
......@@ -430,7 +412,8 @@ sub endexp($)
my(%exphash) = @_;
system("$endexp -b $pid $eid");
DBquery("DELETE from batch_experiments WHERE eid='$eid' and pid='$pid'");
DBQueryWarn("DELETE from batch_experiments ".
"WHERE eid='$eid' and pid='$pid'");
email_status("Batch Mode experiment $pid/$eid has finished!\n");
system("rm -rf $dirname");
......@@ -452,8 +435,9 @@ sub cancelexp($$)
system("$endexp -b $pid $eid");
}
DBquery("DELETE from batch_experiments WHERE eid='$eid' and pid='$pid'");
notify_user("Your Batch Mode experiment has been canceled. You may now\n".
DBQueryWarn("DELETE from batch_experiments ".
"WHERE eid='$eid' and pid='$pid'");
donotify("Your Batch Mode experiment has been canceled. You may now\n".
"reuse the experiment name\n", "Canceled", 0);
system("rm -rf $dirname");
......@@ -470,7 +454,7 @@ sub cancelexp($$)
sub isexpdone($)
{
my(%exphash) = @_;
my($row, $done, $i);
my($query_result, @row);
# Global vars
$eid = $exphash{'eid'};
......@@ -482,37 +466,30 @@ sub isexpdone($)
# Look to see if any nodes yet to report status. If so, spin again.
#
$query_result =
DBquery("SELECT startstatus FROM nodes LEFT JOIN reserved ".
"ON nodes.node_id=reserved.node_id ".
"WHERE reserved.eid='$eid' and reserved.pid='$pid'");
DBQueryWarn("SELECT startstatus,bootstatus FROM nodes ".
"LEFT JOIN reserved ON nodes.node_id=reserved.node_id ".
"WHERE reserved.eid='$eid' and reserved.pid='$pid'");
if (! $query_result) {
return 0;
}
$done = 1;
for ($i = 0; $i < $query_result->numrows; $i++) {
@row = $query_result->fetchrow_array();
if ($row[0] eq "none") {
$done = 0;
#
# Well, right now a node is considered finished up if its boot failed,
# or if it has reported start command status. The idea being that if
# the boot failed, then its status will never be reported anyway, and
# we might as well consider the node done (else the experiment would
# never end).
#
while (@row = $query_result->fetchrow_array()) {
if ($row[1] eq NODEBOOTSTATUS_FAILED) {
next;
}
if ($row[0] eq NODESTARTSTATUS_NOSTATUS) {
return 0;
}
}
return $done;
}
sub DBquery($)
{
my($query) = $_[0];
my($result);
$result = $DB->query($query);
if (! $result) {
print "DB Query failed: $query\n";
}
return $result;
return 1;
}
#
......@@ -539,41 +516,56 @@ sub openlog($)
return 0;
}
#
# A fatal error is something that the user does not need to know about.
# Caused by a breakdown in the TB system. Generally speaking, once the
# experiment is running, this should not be used.
#
sub fatal($)
{
my($mesg) = $_[0];
notify_user($mesg, "Failure", 1);
donotify($mesg, "Failure", 1);
exit(-1);
}
#
# Something the user cares about.
#
sub email_status($)
{
my($mesg) = $_[0];
notify_user($mesg, "Status", 0);
donotify($mesg, "Status", 0);
}
sub notify_user($$$)
sub donotify($$$)
{
my($mesg, $subtext, $iserr) = @_;
my($subject, $from, $to, $cc);
my($subject, $from, $to, $hdrs);
my $MAIL;
print STDOUT "$mesg\n";
$subject = "TESTBED: Batch Mode Experiment $subtext $pid/$eid";
$from = $TBOPS;
$to = "$user_name <$user_email>";
$hdrs = "Reply-To: $TBOPS";
#
# An error goes just to Testbed Operations. Normal status messages go
# to the user and to the Testbed Logs address.
#
if ($iserr) {
$cc = "Cc: $TBOPS";
$to = "$TBOPS";
}
else {
$cc = "Bcc: $TBLOGS";
$to = "$user_name <$user_email>";
$hdrs = "Bcc: $TBLOGS\n".
"$hdrs";
}
if (! ($MAIL = OPENMAIL($to, $subject, $from, $cc))) {
if (! ($MAIL = OPENMAIL($to, $subject, $from, $hdrs))) {
die("Cannot start mail program!");
}
......
......@@ -20,8 +20,16 @@ my $optlist = "";
my $TB = "@prefix@";
my $DBNAME = "@TBDBNAME@";
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
my $tbbindir = "$TB/bin/";
my $batchdir = "$TB/batch";
my $parser = "/usr/testbed/libexec/ns2ir/parse.tcl";
my $projroot = "/proj";
my $dirname;
......@@ -33,14 +41,10 @@ $| = 1;
#
# Untaint the path
#
$ENV{'PATH'} = "/bin:/usr/bin:$TB/libexec:$TB/libexec/ir".
":$TB/libexec/ns2ir:$TB/sbin:$TB/bin";
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
$TBIRLIB = "$TB/lib/ir";
push(@INC,$TBIRLIB);
require libir;
#
# Parse command arguments. Once we return from getopts, all that should
# be left are the required arguments.
......@@ -65,12 +69,6 @@ else {
fatal("Tainted argument $tempfile");
}
#
# Set up for querying the database.
#
use Mysql;
my $DB = Mysql->connect("localhost", $DBNAME, "script", "none");
#
# Parse the batchfile.
#
......@@ -91,7 +89,6 @@ if (!defined($eid) || !defined($pid) || !defined($longname) ||
fatal("Batchfile is incomplete!");
}
$nsfile = "$eid.ns";
$irfile = "$eid.ir";
#
# Create a subdir in the batch directory to work in.
......@@ -99,10 +96,10 @@ $irfile = "$eid.ir";
$dirname = "$batchdir/$pid-$eid";
mkdir($dirname, 0775) or
fatal("Could not mkdir $dirname");
fatal("Could not mkdir $dirname: $!");
chdir($dirname) or
fatal("Could not chdir to $dirname");
fatal("Could not chdir to $dirname: $!");
#
# Copy in the batch file. Web script is responsible for removing the
......@@ -118,8 +115,7 @@ if (system("/bin/cp", "$tempfile", "batchfile")) {
# First off, get some user information.
#
$query_result =
DBquery("SELECT uid,usr_name,usr_email from users ".
"WHERE unix_uid='$EUID'");
DBQuery("SELECT uid from users WHERE unix_uid='$EUID'");
if ($query_result->numrows < 1) {
fatal("Go Away! You do not exist in the Emulab Database.");
......@@ -127,14 +123,12 @@ if ($query_result->numrows < 1) {
@row = $query_result->fetchrow_array();
$uid = $row[0];
$user_name = $row[1];
$user_email = $row[2];
#
# Make sure UID is allowed to create experiments in this project.
#
$query_result =
DBquery("SELECT trust from proj_memb WHERE uid='$uid' and pid='$pid'");
DBQuery("SELECT trust from proj_memb WHERE uid='$uid' and pid='$pid'");
if ($query_result->numrows == 0) {
fatal("Go Away! You are not a member of project $pid!");
......@@ -152,14 +146,16 @@ if ($trust ne "local_root" &&
# The pid/eid pair has to be unique. LOCKING!
#
$query_result =
DBquery("SELECT * FROM experiments WHERE eid='$eid' and pid='$pid'");
DBQueryFatal("SELECT * FROM experiments ".
"WHERE eid='$eid' and pid='$pid'");
if ($query_result->numrows) {
fatal("Experiment $eid in project $pid already exists!");
}
$query_result =
DBquery("SELECT * FROM batch_experiments WHERE eid='$eid' and pid='$pid'");
DBQueryFatal("SELECT * FROM batch_experiments ".
"WHERE eid='$eid' and pid='$pid'");
if ($query_result->numrows) {
fatal("Batch experiment $eid in project $pid already exists!");
......@@ -172,44 +168,12 @@ if (system("/bin/cp", "$webnsfile", "$nsfile")) {
fatal("Could not copy $webnsfile to $dirname/$nsfile");
}
#
# Do a firstcut parse on the NS file, converting it to IR format. This
# operates as a syntax check on the NS file, so we can kick back bad NS
# files now instead of later. It also means we don't need the NS file after
# this.
#
# XXX This is copied from tbprerun.
#
$tbcmdfile = "tbcmds";
$id = "$pid-$eid";
# Run parse in impotent mode on the NS file. This has no effect but
# will display any errors.
if (system("parse.tcl $id $nsfile $irfile") != 0) {
if (system("$parser -n -a $nsfile") != 0) {
fatal("NS Parse failed!");
}
if (system("extract_tb $nsfile $tbcmdfile") != 0) {
fatal("NS extract_tb pass failed!");
}
if (system("postparse $tbcmdfile $irfile") != 0) {
fatal("NS postparse pass failed!");
}
#
# Figure out what resources are needed so the batch daemon can make an
# informed decision about whether to even try.
#
$pcs = 0;
$sharks = 0;
&ir_read($irfile);
foreach my $foo (split("\n", &ir_get("/topology/nodes"))) {
($node,$type) = split(' ', $foo);
if ($type eq "pc") {
$pcs++;
}
if ($type eq "sh") {
$sharks++;
}
}
#
# Gen up the creation time.
......@@ -220,11 +184,11 @@ $created = `date '+%Y:%m:%d %H:%M:%S'`;
# Insert the record. We leave this to the very last because the batch daemon
# is looking for batch experiments to run. Easy race avoidance.
#
DBquery("INSERT INTO batch_experiments ".
"(eid, pid, created, started, expires, ".
" name, creator_uid, numpcs, numsharks, status) ".
"VALUES ('$eid', '$pid', '$created', '$created', '$expires', ".
"'$longname', '$uid', $pcs, $sharks, 'new')");
DBQueryFatal("INSERT INTO batch_experiments ".
"(eid, pid, created, started, expires, ".
" name, creator_uid, status) ".
"VALUES ('$eid', '$pid', '$created', '$created', '$expires', ".
"'$longname', '$uid', 'new')");
exit 0;
......@@ -276,17 +240,3 @@ sub parse_batchfile()
close(BATCH);
return 1;
}
sub DBquery()
{
my($query) = $_[0];
my($result);
$result = $DB->query($query);
if (! $result) {
fatal("DB Error: $query");
}
return $result;
}
......@@ -19,6 +19,13 @@ my $optlist = "";
#
my $TB = "@prefix@";
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
my $tbbindir = "$TB/bin/";
my $batchdir = "$TB/batch";
my $projroot = "/proj";
......@@ -34,13 +41,6 @@ delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
#
$| = 1;
#
# Testbed Support libraries
#
push(@INC, "$TB/lib");
require libtestbed;
require libdb;
#
# Parse command arguments. Once we return from getopts, all that should
# be left are the required arguments.
......
......@@ -47,17 +47,64 @@ if (strlen($exp_id) > $TBDB_EIDLEN) {
$exp_name = addslashes($exp_name);
#
# Must provide an NS file!
#
$nonsfile = 0;
if (!isset($exp_nsfile) ||
strcmp($exp_nsfile, "") == 0 ||
strcmp($exp_nsfile, "none") == 0) {
# Not allowed to specify both a local and an upload!
#
$speclocal = 0;
$specupload = 0;
if (isset($exp_localnsfile) && strcmp($exp_localnsfile, "")) {
$speclocal = 1;
}
if (isset($exp_nsfile) && strcmp($exp_nsfile, "") &&
strcmp($exp_nsfile, "none")) {
$specupload = 1;
}
if (!$speclocal && !$specupload) {
USERERROR("You must supply either NS file name (local or remote)", 1);
}
if ($speclocal && $specupload) {
USERERROR("You may not specify both an uploaded NS file and an ".
"NS file that is located on the Emulab server", 1);
}
if (!$specupload && strcmp($exp_nsfile_name, "")) {
#
# Catch an invalid filename.
#
USERERROR("The NS file '$exp_nsfile_name' does not appear to be a ".
"valid filename. Please go back and try again.", 1);
}
if ($speclocal) {
#
# No way to tell from here if this file actually exists, since