Commit 1c2de867 authored by Leigh B. Stoller

Several tweaks to last revision for batch daemon.

parent 533dc18f
......@@ -50,7 +50,6 @@ my $avail = "$TB/sbin/avail";
my $batchlog = "$TB/log/batchlog";
my $projroot = "/proj";
my $debug = 0;
my $dirname;
my $BSTATE_POSTED = BATCHSTATE_POSTED;
my $BSTATE_ACTIVATING = BATCHSTATE_ACTIVATING;
......@@ -66,6 +65,8 @@ my $pid;
my $gid;
my $logname;
my $nsfile;
my $userdir;
my $workdir;
my $user_name = "Testbed Operations";
my $user_email = "$TBOPS";
......@@ -236,7 +237,8 @@ sub dosomething($$)
$eid = $exphash{'eid'};
$pid = $exphash{'pid'};
$gid = $exphash{'gid'};
$dirname = $exphash{'path'};
$userdir = $exphash{'path'};
$workdir = TBExptWorkDir($pid, $eid);
$nsfile = "$eid.ns";
# Locals
......@@ -266,20 +268,12 @@ sub dosomething($$)
print "Child PID $childpid exited with exit status $status\n";
#
# Move the temporary log file into the experiment directory and
# change the name in the DB. This makes it available to the web
# interface later on if desired.
#
# Close the log file.
# The exp dir might be gone if the batch was killed/canceled.
#
if (-e $dirname) {
my $fname = "$dirname/tbdata/${dowhat}-batch.log";
system("cp -pf $logname $fname");
if (-e $userdir) {
TBExptCloseLogFile($pid, $eid);
TBExptSetLogFile($pid, $eid, $fname);
}
unlink($logname);
return $status;
}
openlog($logname);
......@@ -293,8 +287,8 @@ sub dosomething($$)
fatal("DB Error getting user information for uid $creator");
}
chdir("$dirname/tbdata") or
fatal("Could not cd into $dirname/tbdata!");
chdir("$workdir") or
fatal("Could not cd into $workdir!");
#
# Figure out the unix uid/gid that the experiment configuration is
......@@ -341,7 +335,7 @@ sub startexp($)
#
# Try to start the experiment.
#
system("$startexp -b $logname -g $gid $pid $eid $nsfile");
system("$startexp -b -g $gid $pid $eid $nsfile");
$exit_status = $? >> 8;
$running = 1;
if ($exit_status) {
......@@ -393,7 +387,7 @@ sub startexp($)
if ($exit_status == 1 || $exit_status == -1) {
email_status("Experiment startup has failed with a fatal error!\n".
"Batch has been removed from the system.");
ExptCleanup();
TBExptDestroy($pid, $eid);
exit($exit_status);
}
......@@ -457,16 +451,12 @@ sub endexp($)
TBSetBatchState($pid, $eid, $BSTATE_TERMINATING);
system("$endexp -b $pid $eid");
my $exit_status = $? >> 8;
if ($exit_status) {
if ($?) {
#
# TB admin is going to have to clean up.
#
fatal("Terminating Batch Mode experiment $pid/$eid");
}
ExptCleanup();
email_status("Batch Mode experiment $pid/$eid has finished!");
#
......@@ -487,9 +477,16 @@ sub cancelexp($$)
if ($running) {
system("$endexp -b $pid $eid");
if ($?) {
#
# TB admin is going to have to clean up.
#
fatal("Terminating Batch Mode experiment $pid/$eid");
}
}
else {
TBExptDestroy($pid, $eid);
}
ExptCleanup();
donotify("Your Batch Mode experiment has been canceled!", "Canceled", 0);
#
......@@ -543,29 +540,6 @@ sub isexpdone($)
return 1;
}
#
# Remove all trace.
#
sub ExptCleanup()
{
    #
    # Delete the experiment directory named by the file-level $dirname,
    # then delete the experiment's rows from the DB (keyed by the
    # file-level $eid and $pid). Takes no arguments. Best effort: a failed
    # removal only prints a warning and the DB deletes are still issued.
    #
    # The list form of system() hands the path to rm as one argument, so
    # no shell ever parses it; whitespace or metacharacters in the path
    # cannot change what gets removed. Nothing is run when no directory
    # has been recorded.
    #
    if (defined($dirname) && $dirname ne "" &&
        system("rm", "-rf", $dirname)) {
        print "*** WARNING: Not able to remove experiment directory.\n";
        print " Someone will need to do this by hand.\n";
    }

    #
    # Remove all trace from the DB.
    #
    DBQueryWarn("DELETE from nsfiles ".
                "WHERE eid='$eid' and pid='$pid'");
    DBQueryWarn("DELETE from exppid_access ".
                "WHERE exp_eid='$eid' and exp_pid='$pid'");
    DBQueryWarn("DELETE from experiments ".
                "WHERE eid='$eid' and pid='$pid'");
}
#
# Start up a child, and set its descriptors talking to a log file.
# The log file already exists, created with mktemp above.
......
......@@ -208,12 +208,6 @@ if ($immediate) {
my $farg = ($frontend ? "-f" : "");
if (system("$startexp $farg -g $gid $pid $eid $nsfile")) {
#
# Save a copy of the failed experiment directory for debugging.
#
system("/bin/rm", "-rf", "${workdir}-TBfailed");
system("/bin/mv", "-f", "${workdir}", "${workdir}-TBfailed");
fatal("Failed to start experiment $pid/$eid!");
}
exit(0);
......@@ -233,22 +227,10 @@ sub fatal($)
print STDOUT " $mesg\n";
#
# It is safe to do this because fatal is called *after* the experiment
# record has been successfully inserted. We own it! Before that, use
# the die function.
#
DBQueryWarn("DELETE from nsfiles ".
"WHERE eid='$eid' and pid='$pid'");
DBQueryWarn("DELETE from exppid_access ".
"WHERE exp_eid='$eid' and exp_pid='$pid'");
DBQueryWarn("DELETE from experiments ".
"WHERE eid='$eid' and pid='$pid'");
if (defined($workdir)) {
system("/bin/rm", "-rf", "$workdir");
}
# Clear the record and cleanup.
#
TBExptDestroy($pid, $eid);
exit(-1);
}
......
......@@ -196,7 +196,7 @@ if ($isbatchexpt && $ebatchstate ne BATCHSTATE_TERMINATING) {
#
# Cleanup Experiment state.
#
ExptCleanup();
TBExptDestroy($pid, $eid);
exit(1);
}
......@@ -307,17 +307,17 @@ TBExptClearLogFile($pid, $eid);
#
# Cleanup DB state and remove directory.
#
ExptCleanup();
print "Termination Success\n";
TBExptDestroy($pid, $eid);
#
# In batch mode, just exit without sending email.
# In batch mode, exit now.
#
if ($batch) {
exit(0);
}
print "Termination Success\n";
#
# Send email notification to user.
#
......@@ -330,7 +330,7 @@ SENDMAIL("$user_name <$user_email>",
$message,
"$user_name <$user_email>",
"Cc: $expt_head_name <$expt_head_email>\n".
"Bcc: $TBLOGS");
"Bcc: $TBLOGS", ($logname));
exit 0;
......@@ -376,37 +376,3 @@ sub fatal($)
exit(-1);
}
sub ExptCleanup()
{
    #
    # Remove the experiment's directories ($userdir and $workdir), then
    # delete its rows from the DB (keyed by $eid and $pid). Takes no
    # arguments; all four names come from the enclosing file.
    #
    # Try to remove the experiment directories. We allow for them not
    # being there because we often run the tb programs directly. We also
    # allow for not having permission, in the case that an admin type is
    # running this, in which case it won't be allowed because of directory
    # permissions. That's okay since admin types should rarely end
    # experiments in other projects.
    #
    print STDOUT "Removing experiment directories ... \n";

    foreach my $dir ($userdir, $workdir) {
        # Nothing to remove if this directory was never recorded.
        next
            if (!defined($dir) || $dir eq "");

        #
        # List-form system() passes the path to rm as a single argument,
        # bypassing the shell, so odd characters in the path cannot alter
        # the command.
        #
        if (system("/bin/rm", "-rf", $dir)) {
            print "*** WARNING: Not able to remove $dir\n";
            print " Someone will need to do this by hand.\n";
        }
    }

    #
    # Remove all trace from the DB.
    #
    DBQueryWarn("DELETE from nsfiles ".
                "WHERE eid='$eid' and pid='$pid'");
    DBQueryWarn("DELETE from exppid_access ".
                "WHERE exp_eid='$eid' and exp_pid='$pid'");
    DBQueryWarn("DELETE from experiments ".
                "WHERE eid='$eid' and pid='$pid'");
}
......@@ -18,17 +18,13 @@ use Getopt::Std;
# it into the background and send email, but just want an exit status
# returned to the batch system.
#
# XXX - The -b option takes a logfile name. This is so this script can
# save off the file in the expinfo directory. The caller (batch daemon)
# opens the file and just passes the name in. I do not like this!
#
sub usage()
{
print STDOUT
"Usage: startexp [-b logfile | -f] [-g gid] <pid> <eid> <nsfile>\n";
"Usage: startexp [-b | -f] [-g gid] <pid> <eid> <nsfile>\n";
exit(-1);
}
my $optlist = "b:g:f";
my $optlist = "bg:f";
#
# Configure variables
......@@ -94,14 +90,6 @@ my $eid = $ARGV[1];
my $nsfile= $ARGV[2];
if (defined($options{"b"})) {
$batch = 1;
$logname = $options{"b"};
if ($logname =~ /^([-\@\w.\/]+)$/) {
$logname = $1;
}
else {
die("Bad data in logfile name: $logname");
}
}
if (defined($options{"f"})) {
$frontend = 1;
......@@ -146,10 +134,6 @@ else {
die("Tainted nsfile name: $nsfile");
}
if (!defined($logname)) {
$logname = TBExptCreateLogFile($pid, $eid, "startexp");
}
my $workdir = TBExptWorkDir($pid, $eid);
my $userdir = TBExptUserDir($pid, $eid);
my $repfile = "$eid.report";
......@@ -224,6 +208,7 @@ TBLockExp($pid, $eid);
# is actually torn down.
#
if (! $batch) {
$logname = TBExptCreateLogFile($pid, $eid, "startexp");
TBExptSetLogFile($pid, $eid, $logname);
TBExptOpenLogFile($pid, $eid);
......@@ -385,6 +370,7 @@ sub fatal()
#
# In batch mode, exit. Must unlock the experiment since the record
# is kept by the batch system until it is finished or canceled.
# The batch daemon might terminate the experiment based on the error.
#
if ($batch) {
TBUnLockExp($pid, $eid);
......@@ -404,19 +390,6 @@ sub fatal()
"Cc: $TBOPS",
($logname, "assign.log", "wanassign.log", $nsfile));
#
# We have to cleanup the DB since we are disconnected from the
# wrapper script.
#
DBQueryWarn("DELETE from nsfiles ".
"WHERE eid='$eid' and pid='$pid'");
DBQueryWarn("DELETE from exppid_access ".
"WHERE exp_eid='$eid' and exp_pid='$pid'");
DBQueryWarn("DELETE from experiments ".
"WHERE eid='$eid' and pid='$pid'");
#
# Copy off the workdir to the user directory, Then back up both of
# them for post-mortem debugging.
......@@ -426,6 +399,11 @@ sub fatal()
system("/bin/mv -f $workdir ${workdir}-failed");
system("/bin/rm -rf ${userdir}-failed");
system("/bin/mv -f $userdir ${userdir}-failed");
#
# Clear the record and cleanup.
#
TBExptDestroy($pid, $eid);
exit($errorstat);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment