Commit 4d420b21 authored by Leigh B. Stoller's avatar Leigh B. Stoller

Rework of the batch experiment code. Unified it with the immediate

experiment code. No longer uses another table. Rather, the experiment
record contains a couple of extra fields for the batch system. Also
combined some of the backend code (no longer a killbatch script).
Also added scriptable experiments; the batchexp program in the bin
directory can start an experiment from the command line, and in fact
is used from the web page for both batch experiments and immediate
experiments (-i option). All of the DB code that was in the web
interfaces was moved to batchexp.
parent 0fac2e7e
......@@ -1037,8 +1037,8 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/node_reboot tbsetup/webnscheck tbsetup/nscheck \
tbsetup/resetvlans tbsetup/rmacct-ctrl tbsetup/rmproj \
tbsetup/sched_reload tbsetup/sched_reserve tbsetup/reload_daemon \
tbsetup/batchexp tbsetup/killbatchexp tbsetup/batch_daemon \
tbsetup/webbatchexp tbsetup/webkillbatchexp tbsetup/webreport \
tbsetup/batchexp tbsetup/batch_daemon \
tbsetup/webbatchexp tbsetup/webreport \
tbsetup/startexp tbsetup/endexp tbsetup/webstartexp tbsetup/webendexp \
tbsetup/snmpit tbsetup/ns2ir/GNUmakefile \
tbsetup/ns2ir/parse.tcl tbsetup/ns2ir/tb_compat.tcl \
......
......@@ -159,8 +159,8 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/node_reboot tbsetup/webnscheck tbsetup/nscheck \
tbsetup/resetvlans tbsetup/rmacct-ctrl tbsetup/rmproj \
tbsetup/sched_reload tbsetup/sched_reserve tbsetup/reload_daemon \
tbsetup/batchexp tbsetup/killbatchexp tbsetup/batch_daemon \
tbsetup/webbatchexp tbsetup/webkillbatchexp tbsetup/webreport \
tbsetup/batchexp tbsetup/batch_daemon \
tbsetup/webbatchexp tbsetup/webreport \
tbsetup/startexp tbsetup/endexp tbsetup/webstartexp tbsetup/webendexp \
tbsetup/snmpit tbsetup/ns2ir/GNUmakefile \
tbsetup/ns2ir/parse.tcl tbsetup/ns2ir/tb_compat.tcl \
......
......@@ -51,7 +51,8 @@ use Exporter;
EXPTSTATE_ACTIVATING EXPTSTATE_ACTIVE EXPTSTATE_TESTING
EXPTSTATE_TERMINATING EXPTSTATE_TERMINATED
BATCHSTATE_POSTED BATCHSTATE_RUNNING BATCHSTATE_TERMINATED
BATCHSTATE_POSTED BATCHSTATE_RUNNING BATCHSTATE_TERMINATING
BATCHSTATE_ACTIVATING
TBBatchState TBSetBatchState
TBAdmin TBProjAccessCheck TBNodeAccessCheck TBOSIDAccessCheck
......
......@@ -11,7 +11,7 @@ include $(OBJDIR)/Makeconf
SUBDIRS = checkpass ns2ir
BIN_STUFF = power snmpit tbend tbswapin tbswapout tbprerun tbreport \
os_load savevlans startexp endexp batchexp killbatchexp \
os_load savevlans startexp endexp batchexp \
node_reboot nscheck node_update savelogs
# Stuff that mere users get on plastic.
......@@ -24,7 +24,7 @@ SBIN_STUFF = resetvlans console_setup.proxy sched_reload named_setup \
LIBEXEC_STUFF = mkprojdir rmproj mkacct-ctrl rmacct-ctrl \
os_setup mkexpdir console_setup webnscheck webreport \
webstartexp webendexp webbatchexp webkillbatchexp \
webstartexp webendexp webbatchexp \
assign_wrapper ptopgen webnodeupdate webgroupupdate \
webrmgroup
......
......@@ -48,6 +48,11 @@ my $projroot = "/proj";
my $debug = 0;
my $dirname;
my $BSTATE_POSTED = BATCHSTATE_POSTED;
my $BSTATE_ACTIVATING = BATCHSTATE_ACTIVATING;
my $BSTATE_RUNNING = BATCHSTATE_RUNNING;
my $BSTATE_TERMINATING = BATCHSTATE_TERMINATING;
#
# These are valid in the children, not the parent. I suppose I could use
# dynamically scoped variables, but hardly worth it.
......@@ -88,7 +93,9 @@ if (defined($options{"d"})) {
# Go to ground.
if (! $debug) {
daemonize();
if (TBBackGround($batchlog)) {
exit(0);
}
}
#
......@@ -99,14 +106,14 @@ while (1) {
my(%row, %pending_row);
#
# Need to lock the table here because of cancelation in killbatchexp.
# Need to lock the table here because of cancelation in endexp.
# See the comments in there. We need to atomically grab the next
# batch experiment we want to try, and then change its state from
# new to configuring. We want to grab just one experiment, since
# it takes a while to configure an experiment, and grabbing a bunch and
# locking them up might result in having to wait a really long time
# to cancel a batch experiment that hasn't really tried to start yet!
# That would be annoying to users, and we love our users, right?
# That would be annoying to users.
#
# So, now you're wondering what my selection criterion is? Well, it's
# damn simplistic. I set the "started" datetime field each attempt,
......@@ -114,7 +121,7 @@ while (1) {
# through in a "least recently attempted" manner.
#
$query_result =
DBQuery("lock tables batch_experiments write");
DBQuery("lock tables experiments write");
if (! $query_result) {
print "DB Error locking tables. Waiting a bit ...\n";
sleep(10);
......@@ -122,14 +129,18 @@ while (1) {
}
$pending_result =
DBQuery("SELECT * FROM batch_experiments ".
"WHERE status='new' and canceled=0 and (attempts=0 or ".
"((UNIX_TIMESTAMP() - UNIX_TIMESTAMP(started) > (60 * 10)))) ".
"ORDER BY started LIMIT 1");
DBQueryWarn("SELECT * FROM experiments ".
"WHERE batchmode=1 and canceled=0 and ".
" batchstate='$BSTATE_POSTED' and ".
" (attempts=0 or ".
" ((UNIX_TIMESTAMP() - ".
" UNIX_TIMESTAMP(expt_start) > (60 * 10)))) ".
"ORDER BY expt_start LIMIT 1");
$running_result =
DBQuery("SELECT * FROM batch_experiments ".
"WHERE status='running' ORDER BY started");
DBQuery("select * from experiments ".
"where batchmode=1 and batchstate='$BSTATE_RUNNING' ".
"ORDER BY expt_start");
if (!$pending_result || !$running_result) {
print "DB Error getting batch info. Waiting a bit ...\n";
......@@ -147,7 +158,7 @@ while (1) {
#
# If we have a pending experiment to run, set its state to configuring
# right away, while we have the tables locked. This prevents killbatchexp
# right away, while we have the tables locked. This prevents endexp
# from seeing it as something it can cancel.
#
if ($pending_result->numrows) {
......@@ -156,11 +167,11 @@ while (1) {
# Local vars!
my $eid = $pending_row{'eid'};
my $pid = $pending_row{'pid'};
my $now = DBDateTime();
$query_result =
DBQuery("update batch_experiments set status='configuring', ".
"started='$now' where eid='$eid' and pid='$pid'");
DBQuery("update experiments set expt_start=now(), ".
"batchstate='$BSTATE_ACTIVATING' ".
"where eid='$eid' and pid='$pid'");
if (! $query_result) {
print "DB error setting batch $pid/$eid to configuring.\n";
......@@ -213,9 +224,14 @@ sub dosomething($$)
my($unix_uid, $unix_gid, $row, $query_result);
# Global vars
$eid = $exphash{'eid'};
$pid = $exphash{'pid'};
$gid = $exphash{'gid'};
$eid = $exphash{'eid'};
$pid = $exphash{'pid'};
$gid = $exphash{'gid'};
$dirname = $exphash{'path'};
$nsfile = "$eid.ns";
# Locals
my $creator = $exphash{'expt_head_uid'};
print "Doing a '$dowhat' to batch experiment $pid/$eid\n";
......@@ -254,26 +270,15 @@ sub dosomething($$)
}
openlog($logname);
my $creator = $exphash{'creator_uid'};
my $longname = $exphash{'name'};
# Global vars
$dirname = "$batchdir/$pid-$eid";
$nsfile = "$dirname/$eid.ns";
#
# Get some user information.
#
$query_result =
DBQueryFatal("SELECT usr_name,usr_email from users ".
"WHERE uid='$creator'");
if ($query_result->numrows != 1) {
if (!UserDBInfo($creator, \$user_name, \$user_email)) {
fatal("DB Error getting user information for uid $creator\n");
}
@row = $query_result->fetchrow_array();
$user_name = $row[0];
$user_email = $row[1];
chdir("$dirname/tbdata") or
fatal("Could not cd into $dirname/tbdata!");
#
# Figure out the unix uid/gid that the experiment configuration is
......@@ -287,7 +292,8 @@ sub dosomething($$)
#
# Change the ownership of the log file before we flip.
#
chown($unix_uid, $unix_gid, $logname);
chown($unix_uid, $unix_gid, $logname) or
fatal("Could not chown $logname to $unix_uid/$unix_gid!");
# Flip to the user. We never flip back.
$EGID = $GID = $unix_gid;
......@@ -314,25 +320,10 @@ sub startexp($)
my(%exphash) = @_;
my($exit_status, $running, $query_result);
my $creator = $exphash{'creator_uid'};
my $longname = $exphash{'name'};
my $attempts = $exphash{'attempts'};
my $expires = $exphash{'expires'};
my $rightnow = DBDateTime();
#
# Insert an experiment record for startexp.
#
$query_result =
DBQueryFatal("insert into experiments ".
"(eid, pid, gid, expt_created, expt_name, ".
"expt_head_uid, expt_expires, state, batchmode) ".
"VALUES ('$eid', '$pid', '$gid', '$rightnow', ".
" '$longname', ".
" '$creator', '$expires', 'new', 1)");
#
# Try to start the experiment. If it fails, the experiment is gone.
# Try to start the experiment.
#
system("$startexp -b $logname -g $gid $pid $eid $nsfile");
$exit_status = $? >> 8;
......@@ -346,7 +337,7 @@ sub startexp($)
# we can pick up the cancelation later.
#
$query_result =
DBQueryWarn("select canceled from batch_experiments ".
DBQueryWarn("select canceled from experiments ".
"where eid='$eid' and pid='$pid'");
if ($query_result) {
......@@ -374,8 +365,9 @@ sub startexp($)
# XXX - What if this update fails?
#
$query_result =
DBQueryWarn("update batch_experiments set status='new', ".
"attempts=attempts+1 where eid='$eid' and pid='$pid'");
DBQueryWarn("update experiments set attempts=attempts+1, ".
" batchstate='$BSTATE_POSTED' ".
"where eid='$eid' and pid='$pid'");
$attempts++;
if (($exit_status == $TOOFEWNODES && $attempts >= 9 &&
......@@ -394,14 +386,10 @@ sub startexp($)
#
# Well, it configured! Let's set its state to running.
#
# XXX - What if this update fails?
#
$query_result =
DBQueryWarn("update batch_experiments set status='running' ".
"where eid='$eid' and pid='$pid'");
TBSetBatchState($pid, $eid, $BSTATE_RUNNING);
email_status("Batch Mode experiment $pid/$eid is now running!\n".
"Please consult the Web interface to see how it is doing\n");
"Please consult the Web interface to see how it is doing.\n");
#
# Done with this phase. Must exit.
......@@ -416,11 +404,16 @@ sub endexp($)
{
my(%exphash) = @_;
#
# Save tiplogs
#
#
# Save tiplogs
#
system("$savelogs $pid $eid");
#
# Have to set the state to terminating or else endexp will not accept it.
#
TBSetBatchState($pid, $eid, $BSTATE_TERMINATING);
system("$endexp -b $pid $eid");
my $exit_status = $? >> 8;
......@@ -430,11 +423,9 @@ sub endexp($)
#
fatal("Terminating Batch Mode experiment $pid/$eid");
}
DBQueryWarn("DELETE from batch_experiments ".
"WHERE eid='$eid' and pid='$pid'");
ExptCleanup();
email_status("Batch Mode experiment $pid/$eid has finished!\n");
system("rm -rf $dirname");
#
# Child must exit!
......@@ -450,15 +441,14 @@ sub cancelexp($$)
my($running) = shift;
my(%exphash) = @_;
TBSetBatchState($pid, $eid, $BSTATE_TERMINATING);
if ($running) {
system("$endexp -b $pid $eid");
}
DBQueryWarn("DELETE from batch_experiments ".
"WHERE eid='$eid' and pid='$pid'");
donotify("Your Batch Mode experiment has been canceled. You may now\n".
"reuse the experiment name\n", "Canceled", 0);
system("rm -rf $dirname");
ExptCleanup();
donotify("Your Batch Mode experiment has been canceled!", "Canceled", 0);
#
# Child must exit!
......@@ -511,6 +501,29 @@ sub isexpdone($)
return 1;
}
#
# Remove all trace.
#
sub ExptCleanup()
{
    #
    # Remove the current experiment's directory and its DB records.
    #
    # Takes no arguments. Works on the file-level $dirname, $pid and $eid.
    # The callers visible in this file ignore the return value. A failure
    # to remove the directory is reported with a warning and is not fatal,
    # so the DB records are still removed.
    #
    # The list form of system() hands $dirname to rm as a single argument
    # without going through a shell, so spaces or shell metacharacters in
    # the path cannot change what gets removed. An unset or empty $dirname
    # is treated as a failure instead of running rm with no operand.
    #
    if (!defined($dirname) || $dirname eq "" ||
        system("rm", "-rf", "--", $dirname)) {
        print "*** WARNING: Not able to remove experiment directory.\n";
        print "             Someone will need to do this by hand.\n";
    }

    #
    # Remove all trace from the DB.
    # NOTE(review): DBQueryWarn presumably only warns on failure; each
    # delete is attempted regardless of the previous one.
    #
    DBQueryWarn("DELETE from nsfiles ".
                "WHERE eid='$eid' and pid='$pid'");

    DBQueryWarn("DELETE from exppid_access ".
                "WHERE exp_eid='$eid' and exp_pid='$pid'");

    DBQueryWarn("DELETE from experiments ".
                "WHERE eid='$eid' and pid='$pid'");
}
#
# Start up a child, and set its descriptors talking to a log file.
# The log file already exists, created with mktemp above.
......@@ -584,56 +597,7 @@ sub donotify($$$)
"$hdrs";
}
if (! ($MAIL = OPENMAIL($to, $subject, $from, $hdrs))) {
die("Cannot start mail program!");
}
print $MAIL $mesg;
if (defined($logname) && open(IN, "$logname")) {
print $MAIL "\n\n---------\n\n";
while (<IN>) {
print $MAIL "$_";
}
close(IN);
}
if (defined($nsfile) && open(IN, "$nsfile")) {
print $MAIL "\n\n---------\n\n";
while (<IN>) {
print $MAIL "$_";
}
close(IN);
}
close($MAIL);
SENDMAIL($to, $subject, $mesg, $from, $hdrs,
($logname, $nsfile));
}
#
# Become a daemon.
#
sub daemonize()
{
    #
    # Become a daemon: fork, let the parent exit, and point the child's
    # standard descriptors away from the caller.
    #
    # Takes no arguments. Returns 0 in the child. The parent never returns,
    # it exits with status 0. Dies if the fork fails or if any descriptor
    # cannot be reopened.
    #
    my $mypid = fork();

    # fork() returns undef on failure. Without this check the original
    # process would continue as if it were the child.
    if (!defined($mypid)) {
        die("Could not fork: $!");
    }
    if ($mypid) {
        exit(0);
    }

    #
    # We have to disconnect from the caller by redirecting both STDIN and
    # STDOUT away from the pipe. Otherwise the caller will continue to wait
    # even though the parent has exited.
    #
    open(STDIN, "<", "/dev/null") or
        die("opening /dev/null for STDIN: $!");

    #
    # Open the batch log and start writing to it. Three-argument open keeps
    # the mode separate from the file name held in $batchlog.
    #
    open(STDERR, ">>", $batchlog) or die("opening $batchlog for STDERR: $!");
    open(STDOUT, ">>", $batchlog) or die("opening $batchlog for STDOUT: $!");

    return 0;
}
......@@ -7,16 +7,17 @@ use Getopt::Std;
#
sub usage()
{
# Print the one-line usage summary to STDOUT and exit with status -1.
print STDOUT "Usage: batchexp <batchfile>\n";
exit(-1);
# NOTE(review): this die() follows exit(-1) and so is never reached as
# written. It looks like two revisions of this routine shown together --
# confirm which usage message and exit path is intended.
die("Usage: batchexp [-i] [-x expires] [-E description] [-g gid] ".
"-p <pid> -e <eid> <nsfile>\n");
}
my $optlist = "";
my $optlist = "iE:d:g:x:e:p:";
#
# Configure variables
#
my $TB = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $PROJROOT = "/proj";
#
# Testbed Support libraries
......@@ -25,11 +26,14 @@ use lib "@prefix@/lib";
use libdb;
use libtestbed;
my $tbbindir = "$TB/bin/";
my $batchdir = "$TB/batch";
my $parser = "$TB/libexec/ns2ir/parse.tcl";
my $projroot = "/proj";
my $mkexpdir = "$TB/libexec/mkexpdir";
my $startexp = "$TB/bin/startexp";
my $tbdata = "tbdata";
my $immediate= 0;
my $dirname;
my $dbuid;
my @row;
#
# Turn off line buffering on output
......@@ -43,187 +47,266 @@ $| = 1;
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
my $eid;
my $pid;
my $gid;
my $description;
my $expires;
my $tempnsfile;
#
# Parse command arguments. Once we return from getopts, all that should
# be left are the required arguments.
# Verify user and get his DB uid.
#
%options = ();
if (! getopts($optlist, \%options)) {
usage();
}
if (@ARGV != 1) {
usage();
if (! UNIX2DBUID($UID, \$dbuid)) {
die("*** $0:\n".
" You do not exist in the Emulab Database!\n");
}
my $tempfile = $ARGV[0];
#
# Untaint the arguments.
# Parse command arguments.
#
# Note different taint check (allow /).
if ($tempfile =~ /^([-\@\w.\/]+)$/) {
$tempfile = $1;
ParseArgs();
#
# Sanity check them.
#
if (!defined($pid) || !defined($eid)) {
usage();
}
if (!defined($gid)) {
$gid = $pid;
}
if (defined($description)) {
$description = DBQuoteSpecial($description);
}
else {
fatal("Tainted argument $tempfile");
$description = "'Created by $dbuid'";
}
if (! defined($expires)) {
$expires = DBDateTime(60 * 60 * 24 * 30);
}
#
# Parse the batchfile.
#
my $eid;
my $pid;
my $gid;
my $dbuid;
my $longname;
my $expires;
my $webnsfile;
parse_batchfile($tempfile) or
fatal("*** Could not parse batchfile $tempfile");
$nsfile = "$eid.ns";
#
# Sanity check a few things.
# Make sure UID is allowed to create experiments in this project.
#
if (!defined($eid) || !defined($pid) || !defined($longname) ||
!defined($expires) || !defined($webnsfile)) {
fatal("*** Batchfile is incomplete!");
if (! TBProjAccessCheck($dbuid, $pid, $gid, TB_PROJECT_CREATEEXPT)) {
die("*** $0:\n".
" You do not have permission to create experiments in $pid/$gid\n");
}
$nsfile = "$eid.ns";
#
# Create a subdir in the batch directory to work in.
# Create an experiment record. The pid/eid has to be unique, so lock the
# table for the check/insert.
#
$dirname = "$batchdir/$pid-$eid";
DBQueryFatal("lock tables experiments write");
mkdir($dirname, 0775) or
fatal("*** Could not mkdir $dirname: $!");
$query_result =
DBQueryFatal("SELECT pid,eid FROM experiments ".
"WHERE eid='$eid' and pid='$pid'");
chdir($dirname) or
fatal("*** Could not chdir to $dirname: $!");
if ($query_result->numrows) {
DBQueryWarn("unlock tables");
die("*** $0:\n".
" Experiment $eid in project $pid already exists!\n");
}
#
# Copy in the batch file. Web script is responsible for removing the
# original.
# Insert the record. This reserves the pid/eid for us. If it's a batchmode
# experiment, we will update the record later so that the batch daemon
# will recognize it.
#
if (system("/bin/cp", "$tempfile", "batchfile")) {
fatal("*** Could not copy $tempfile to $dirname");
if (! DBQueryWarn("INSERT INTO experiments ".
"(eid, pid, gid, expt_created, expt_expires, ".
" expt_name, expt_head_uid, state) ".
"VALUES ('$eid', '$pid', '$gid', now(), '$expires', ".
"$description, '$dbuid', 'new')")) {
DBQueryWarn("unlock tables");
die("*** $0:\n".
" Database error inserting record for $pid/$eid!\n");
}
#
# Verify user and get his DB uid.
#
if (! UNIX2DBUID($UID, \$dbuid)) {
fatal("*** You do not exist in the Emulab Database!");
if (! DBQueryWarn("unlock tables")) {
fatal("Unexpected DB Error!");
}
#
# Make sure UID is allowed to create experiments in this project.
# Create a directory structure for the experiment.
#
if (!TBAdmin($UID) &&
!TBProjAccessCheck($dbuid, $pid, $gid, TB_PROJECT_CREATEEXPT)) {
fatal("*** You do not have permission to create experiments ".
"in project $pid";
if (system("$mkexpdir $pid $gid $eid") != 0) {
fatal("$mkexpdir failed");
}
#
# The pid/eid pair has to be unique. LOCKING!
#
# Grab that path from the DB (set by mkexpdir).
#
$query_result =
DBQueryFatal("SELECT * FROM experiments ".
"WHERE eid='$eid' and pid='$pid'");
if ($query_result->numrows) {
fatal("*** Experiment $eid in project $pid already exists!");
DBQueryWarn("select path from experiments ".
"where pid='$pid' and eid='$eid'");
if (! $query_result ||
! $query_result->numrows) {
fatal("Unexpected DB Error! Experiment $pid/$eid does not exist!");
}
@row = $query_result->fetchrow_array();
$dirname = $row[0];
$query_result =
DBQueryFatal("SELECT * FROM batch_experiments ".
"WHERE eid='$eid' and pid='$pid'");
if ($query_result->numrows) {
fatal("*** Batch experiment $eid in project $pid already exists!");
}
chdir("$dirname/$tbdata") or
fatal("Could not chdir to $dirname/$tbdata: $!");
#
# Now we can get the NS file!
#