Commit 54f289a4 authored by Leigh B. Stoller's avatar Leigh B. Stoller
Browse files

First cut at batch mode experiments.

parent e6a5db44
......@@ -43,6 +43,8 @@ install-mkdirs:
-mkdir -p $(INSTALL_TOPDIR)/log
-mkdir -p $(INSTALL_TOPDIR)/lists
-mkdir -p $(INSTALL_TOPDIR)/backup
-mkdir -p $(INSTALL_TOPDIR)/batch
-chmod 777 $(INSTALL_TOPDIR)/batch
clean: clean-subdirs
distclean: distclean-subdirs
......
......@@ -957,6 +957,8 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/os_load tbsetup/os_setup tbsetup/mkprojdir tbsetup/power \
tbsetup/resetvlans tbsetup/rmacct-ctrl tbsetup/rmproj \
tbsetup/sched_reload \
tbsetup/batchexp tbsetup/killbatchexp tbsetup/batch_daemon \
tbsetup/webbatchexp tbsetup/webkillbatchexp \
tbsetup/startexp tbsetup/endexp tbsetup/webstartexp tbsetup/webendexp \
tbsetup/ir/GNUmakefile tbsetup/ir/postassign tbsetup/snmpit \
tbsetup/ir/assign_wrapper tbsetup/ns2ir/GNUmakefile \
......
......@@ -116,6 +116,8 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/os_load tbsetup/os_setup tbsetup/mkprojdir tbsetup/power \
tbsetup/resetvlans tbsetup/rmacct-ctrl tbsetup/rmproj \
tbsetup/sched_reload \
tbsetup/batchexp tbsetup/killbatchexp tbsetup/batch_daemon \
tbsetup/webbatchexp tbsetup/webkillbatchexp \
tbsetup/startexp tbsetup/endexp tbsetup/webstartexp tbsetup/webendexp \
tbsetup/ir/GNUmakefile tbsetup/ir/postassign tbsetup/snmpit \
tbsetup/ir/assign_wrapper tbsetup/ns2ir/GNUmakefile \
......
......@@ -11,13 +11,14 @@ include $(OBJDIR)/Makeconf
SUBDIRS = checkpass ir ns2ir
BIN_STUFF = power snmpit tbend tbrun tbprerun tbreport \
os_load savevlans startexp endexp
os_load savevlans startexp endexp batchexp killbatchexp
SBIN_STUFF = resetvlans console_setup.proxy sched_reload named_setup
SBIN_STUFF = resetvlans console_setup.proxy sched_reload named_setup \
batch_daemon
LIBEXEC_STUFF = mkprojdir rmproj mkacct-ctrl rmacct-ctrl \
os_setup mkexpdir console_setup \
webstartexp webendexp
webstartexp webendexp webbatchexp webkillbatchexp
LIB_STUFF = libtbsetup.pm
......@@ -76,6 +77,8 @@ post-install:
chmod u+s $(INSTALL_BINDIR)/savevlans
chown root $(INSTALL_LIBEXECDIR)/console_setup
chmod u+s $(INSTALL_LIBEXECDIR)/console_setup
chown root $(INSTALL_SBINDIR)/batch_daemon
chmod u+s $(INSTALL_SBINDIR)/batch_daemon
#
# Control node installation (okay, plastic)
......
#!/usr/bin/perl -wT
use English;
use Getopt::Std;
#
# Create a batch experiment.
#
# BIG ASS WARNING: This works great as long as paper does not reboot!
# Needs some work if we want it to be stateless across reboots.
#
# usage: batch_daemon
#
sub usage()
{
print STDOUT "Usage: batch_daemon\n";
exit(-1);
}
my $optlist = "";
#
# Configure variables
#
my $TB = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $TBOPS = "@TBOPSEMAIL@";
my $tbbindir = "$TB/bin/";
my $batchdir = "$TB/batch";
my $startexp = "$TB/bin/startexp";
my $endexp = "$TB/bin/endexp";
my $batchlog = "$TB/log/batchlog";
my $projroot = "/proj";
my $dirname;
#
# These are valid in the children, not the parent. I suppose I could use
# dynamically scoped variables, but hardly worth it.
#
my $eid;
my $pid;
my $logname;
my $user_name = "Batch Daemon";
my $user_email = "$TBOPS";
#
# Turn off line buffering on output
#
$| = 1;
#
# Untaint the path
#
$ENV{'PATH'} = "/bin:/usr/bin:";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
usage();
}
if (@ARGV != 0) {
usage();
}
# Go to ground.
daemonize();
#
# Set up for querying the database.
#
use Mysql;
my $DB = Mysql->connect("localhost", $DBNAME, "script", "none");
#
# Loop, looking for batch experiments that want to run.
#
while (1) {
#
# Need to lock the table here because of cancelation in killbatchexp.
# See the comments in there. We need to atomically grab the next
# batch experiment we want to try, and then change its state from
# new to configuring. We want to grab just one experiment, since
# it takes a while to configure an experiment, and grabbing a bunch and
# locking them up might result in having to wait a really long time
# to cancel a batch experiment that hasn't really tried to start yet!
# Thats would ne annoying to users, and we love our users, right?
#
# So, now your'e wondering what my selection criteria is? Well, its
# damn simplistic. I set the "started" datetime field each attempt,
# and I pick the batch_experiment with the oldest time, thereby cycling
# through in a "least recently attempted" manner.
#
DBquery("lock tables batch_experiments write");
$query_result =
DBquery("SELECT * FROM batch_experiments ".
"WHERE status='new' and canceled=0 ORDER BY started LIMIT 1");
if (! $query_result->numrows) {
DBquery("unlock tables");
sleep(10);
next;
}
my %row = $query_result->fetchhash();
#
# Set the configuring flag right away so killbatchexp won't see them
# in a "new" state. Might as well set the started time to ensure that
# it goes to the end of the line.
#
# Local vars!
my $eid = $row{'eid'};
my $pid = $row{'pid'};
my $now = `date '+20%y-%m-%d %H:%M:%S'`;
DBquery("update batch_experiments set status='configuring', ".
"started='$now' where eid='$eid' and pid='$pid'");
DBquery("unlock tables");
runexp(%row);
sleep(300);
}
#
# The guts of running a single experiment.
#
sub runexp($)
{
my(%exphash) = @_;
my($uid, $gid, $row);
# Global vars
$eid = $exphash{'eid'};
$pid = $exphash{'pid'};
my $creator = $exphash{'creator_uid'};
my $longname = $exphash{'name'};
print STDOUT "Trying to start experiment $eid in project $pid\n";
#
# Start up a child to run the guts. The parent waits. If the
# experiment configures okay, the parent can return to try something
# new, while the child is going to hang out and wait for all the nodes
# to report exit status, or for the cancel bit to get set.
#
$childpid = fork();
if ($childpid) {
waitpid($childpid, 0);
my $status = $?;
return;
}
# global var
$dirname = "$batchdir/$pid-$eid";
my $nsfile = "$dirname/$eid.ns";
#
# Get some user information.
#
$query_result =
$DB->query("SELECT usr_name,usr_email from users ".
"WHERE uid='$creator'");
if (! $query_result ||
$query_result->numrows != 1) {
fatal("DB Error getting user information for uid $creator\n");
}
@row = $query_result->fetchrow_array();
$user_name = $row[0];
$user_email = $row[1];
#
# Figure out the unix uid/gid that the experiment configuration is
# going to run as.
#
(undef,undef,$uid) = getpwnam($creator) or
fatal("No such user $creator");
(undef,undef,$gid) = getgrnam($pid) or
fatal("No such group $pid");
$EGID = $GID = $gid;
$EUID = $UID = $uid;
#
# Create a temporary name for a log file and open it up.
#
$logname = `mktemp /tmp/start-batch-$pid-$eid.XXXXXX`;
# Note different taint check (allow /).
if ($logname =~ /^([-\@\w.\/]+)$/) {
$logname = $1;
} else {
die "Bad data in $logname";
}
openlog($logname);
#
# Insert an experiment record for startexp.
#
my $rightnow = `date '+20%y-%m-%d %H:%M:%S'`;
DBquery("insert into experiments ".
"(eid, pid, expt_created, expt_name, ".
"expt_head_uid, expt_start, expt_ready, batchmode) ".
"VALUES ('$eid', '$pid', '$rightnow', '$longname', ".
"'$creator', '$rightnow', 0, 1)");
#
# Try to start the experiment. If it fails, the experiment is gone.
#
system("$startexp -b $pid $eid $nsfile");
my $exit_status = $? >> 8;
my $running = 1;
if ($exit_status) {
$running = 0;
}
#
# Look for cancelation.
#
$query_result =
DBquery("select canceled from batch_experiments ".
"where eid='$eid' and pid='$pid'");
@row = $query_result->fetchrow_array();
my $canceled = $row[0];
#
# If canceled and the experiment got running, need to tear it down
# and tell the owner about it.
#
if ($canceled) {
cancel_batch($running);
exit(0);
}
#
# If the configuration failed, then send email for now. This
# part needs work. We have to reset the state to "new" so that
# it will be retried again later.
#
if (! $running) {
DBquery("update batch_experiments set status='new' ".
"where eid='$eid' and pid='$pid'");
fatal("Could not configure Batch Mode experiment $pid/$eid");
}
#
# Well, it configured! Lets set it state to running.
#
DBquery("update batch_experiments set status='running' ".
"where eid='$eid' and pid='$pid'");
email_status("Batch Mode experiment $pid/$eid is now running!\n".
"Please consult the Web interface to see how its doing\n");
#
# Now loop, periodically looking for a change in the status of the
# nodes, or for a cancelation request.
#
while (1) {
$query_result =
DBquery("select canceled from batch_experiments ".
"where eid='$eid' and pid='$pid'");
if ($query_result->numrows != 1) {
#
# Jeez, something really went wrong!
#
fatal("Batch Mode record for $pid/$eid is gone! HELP ME!");
}
@row = $query_result->fetchrow_array();
if ($row[0]) {
cancel_batch(1);
exit(0);
}
$query_result =
DBquery("SELECT startstatus FROM nodes LEFT JOIN reserved ".
"ON nodes.node_id=reserved.node_id ".
"WHERE reserved.eid='$eid' and reserved.pid='$pid'");
#
# Look to see if any nodes yet to report status. If so, spin again.
#
my $done = 1;
for ($i = 0; $i < $query_result->numrows; $i++) {
@row = $query_result->fetchrow_array();
if ($row[0] eq "none") {
$done = 0;
}
}
if ($done) {
last;
}
sleep(15);
}
#
# Yippie! Tear it down and send email. Need to look for failures
# in the teardown!
#
system("$endexp -b $pid $eid");
DBquery("DELETE from batch_experiments WHERE eid='$eid' and pid='$pid'");
system("rm -rf $dirname");
email_status("Batch Mode experiment $pid/$eid has finished!\n");
#
# Child must exit!
#
exit(0);
}
sub DBquery($)
{
my($query) = $_[0];
$query_result = $DB->query($query);
if (! $query_result) {
fatal("DB Error: $query");
}
return $query_result;
}
#
# Start up a child, and set its descriptors talking to a log file.
#
sub openlog($)
{
my($logname) = $_[0];
#
# We have to disconnect from the caller by redirecting both STDIN and
# STDOUT away from the pipe. Otherwise the caller (the web server) will
# continue to wait even though the parent has exited.
#
open(STDIN, "< /dev/null") or
fatal("opening /dev/null for STDIN: $!");
open(STDERR, ">> $logname") or
fatal("opening $logname for STDERR: $!");
open(STDOUT, ">> $logname") or
fatal("opening $logname for STDOUT: $!");
return 0;
}
sub fatal()
{
my($mesg) = $_[0];
print STDOUT "$mesg\n";
#
# Send a message to the testbed list. Append the logfile if it got
# that far.
#
open(MAIL, "| /usr/bin/mail ".
"-s \"TESTBED: Batch Mode Failure $pid/$eid\" ".
"-c $TBOPS \"$user_name <$user_email>\" >/dev/null 2>&1")
or die "Cannot start mail program: $!";
print MAIL $mesg;
if (defined($logname) && open(IN, "$logname")) {
print MAIL "\n\n---------\n\n";
while (<IN>) {
print MAIL "$_";
}
close(IN);
unlink("$logname");
}
close(MAIL);
exit(-1);
}
sub cancel_batch($)
{
my($running) = $_[0];
if ($running) {
system("$endexp -b $pid $eid");
}
DBquery("DELETE from batch_experiments WHERE eid='$eid' and pid='$pid'");
open(MAIL, "| /usr/bin/mail ".
"-s \"TESTBED: Batch Mode Cancelation $pid/$eid\" ".
"-c $TBOPS \"$user_name <$user_email>\" >/dev/null 2>&1")
or die "Cannot start mail program: $!";
print MAIL
"Your Batch Mode experiment has been canceled. You may now\n".
"reuse the experiement name\n\n";
if (defined($logname) && open(IN, "$logname")) {
print MAIL "\n\n---------\n\n";
while (<IN>) {
print MAIL "$_";
}
close(IN);
unlink("$logname");
}
close(MAIL);
#
# And kill the batch directory.
#
system("rm -rf $dirname");
}
sub email_status($)
{
my($mesg) = $_[0];
print STDOUT "$mesg\n";
open(MAIL, "| /usr/bin/mail ".
"-s \"TESTBED: Batch Mode Experiment Status $pid/$eid\" ".
"-c $TBOPS \"$user_name <$user_email>\" >/dev/null 2>&1")
or die "Cannot start mail program: $!";
print MAIL $mesg;
if (defined($logname) && open(IN, "$logname")) {
print MAIL "\n\n---------\n\n";
while (<IN>) {
print MAIL "$_";
}
close(IN);
}
close(MAIL);
}
#
# Become a daemon.
#
sub daemonize()
{
my $mypid = fork();
if ($mypid) {
exit(0);
}
#
# We have to disconnect from the caller by redirecting both STDIN and
# STDOUT away from the pipe. Otherwise the caller will continue to wait
# even though the parent has exited.
#
open(STDIN, "< /dev/null") or
die("opening /dev/null for STDIN: $!");
#
# Open the batch log and start writing to it.
#
open(STDERR, ">> $batchlog") or die("opening $batchlog for STDERR: $!");
open(STDOUT, ">> $batchlog") or die("opening $batchlog for STDOUT: $!");
return 0;
}
#!/usr/bin/perl -wT
use English;
use Getopt::Std;
#
# Create a batch experiment.
#
# usage: batchexp <batchfile>
#
sub usage()
{
print STDOUT "Usage: batchexp <batchfile>\n";
exit(-1);
}
my $optlist = "";
#
# Configure variables
#
my $TB = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $TBOPS = "@TBOPSEMAIL@";
my $tbbindir = "$TB/bin/";
my $batchdir = "$TB/batch";
my $projroot = "/proj";
my $dirname;
#
# Turn off line buffering on output
#
$| = 1;
#
# Untaint the path
#
$ENV{'PATH'} = "/bin:/usr/bin:$TB/libexec:$TB/libexec/ir".
":$TB/libexec/ns2ir:$TB/sbin:$TB/bin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
$TBIRLIB = "$TB/lib/ir";
push(@INC,$TBIRLIB);
require libir;
#
# Parse command arguments. Once we return from getopts, all that should
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
usage();
}
if (@ARGV != 1) {
usage();
}
my $tempfile = $ARGV[0];
#
# Untaint the arguments.
#
# Note different taint check (allow /).
if ($tempfile =~ /^([-\@\w.\/]+)$/) {
$tempfile = $1;
}
else {
fatal("Tainted argument $tempfile");
}
#
# Set up for querying the database.
#
use Mysql;
my $DB = Mysql->connect("localhost", $DBNAME, "script", "none");
#
# Parse the batchfile.
#
my $eid;
my $pid;
my $longname;
my $expires;
my $webnsfile;
parse_batchfile($tempfile) or
fatal("Could not parse batchfile $tempfile");