Commit 2025e0bd authored by Leigh Stoller's avatar Leigh Stoller

Merge the two state machines (batchstate and state) into a single

state machine (state). All of the stuff that was previously handled by
using batchstate is now embedded into the one state machine. Of
course, these mostly overlapped, so it's not that much of a change,
except that we also redid the machine, adding more states (for
example, modify phases are now explicit). To get a picture of the
actual state machine, on boss:

		stategraph -o newstates EXPTSTATE
		gv newstates.ps

Things to note:

* The "batchstate" slot of the experiments table is now used solely to
  provide a lock for the batch daemon. A secondary change will be to
  change the slot name to something more appropriate, but it can
  happen anytime after this new stuff is installed.

* I have left expt_locked for now, but another later change will be to remove
  expt_locked, and change it to active_busy or some such new state name in
  the state machine. I have removed most uses of expt_locked, except those
  that were necessary until there is a new state to replace it.

* These new changes are an implementation of the new state machine,
  but I have not done anything fancy. Most of the code is the same as
  it was before.

* I suspect that there are races with the batch daemon now, but they
  are going to be rare, and the end result is probably that a
  cancelation is delayed a little bit.
parent b1de9fb2
......@@ -63,14 +63,14 @@ use Exporter;
DBLIMIT_NSFILESIZE NODERELOADPENDING_EID
EXPTSTATE_NEW EXPTSTATE_PRERUN EXPTSTATE_SWAPPED EXPTSTATE_SWAPPING
EXPTSTATE_ACTIVATING EXPTSTATE_ACTIVE EXPTSTATE_TESTING
EXPTSTATE_TERMINATING EXPTSTATE_TERMINATED EXPTSTATE_UPDATING
EXPTSTATE_ACTIVATING EXPTSTATE_ACTIVE
EXPTSTATE_TERMINATING EXPTSTATE_TERMINATED EXPTSTATE_QUEUED
EXPTSTATE_MODIFY_PARSE EXPTSTATE_MODIFY_REPARSE EXPTSTATE_MODIFY_RESWAP
EXPTSTATE_RESTARTING
BATCHSTATE_LOCKED BATCHSTATE_UNLOCKED
EXPTCANCEL_CLEAR EXPTCANCEL_TERM EXPTCANCEL_SWAP
BATCHSTATE_POSTED BATCHSTATE_RUNNING BATCHSTATE_TERMINATING
BATCHSTATE_ACTIVATING BATCHSTATE_PAUSED
BATCHSTATE_RUNNING_LOCKED BATCHSTATE_MODIFYING
BATCHMODE_CANCELTERM BATCHMODE_CANCELSWAP BATCHMODE_CANCELCLEAR
TBBatchState TBSetBatchState TBSetBatchCancelFlag TBGetBatchCancelFlag
TBSetCancelFlag TBGetCancelFlag
TB_NODELOGTYPE_MISC TB_NODELOGTYPES TB_DEFAULT_NODELOGTYPE
......@@ -287,29 +287,28 @@ sub NODEFAILMODE_FATAL() { "fatal"; }
sub NODEFAILMODE_NONFATAL() { "nonfatal"; }
sub NODEFAILMODE_IGNORE() { "ignore"; }
# These are really "sub" states.
# Experiment states
sub EXPTSTATE_NEW() { "new"; }
sub EXPTSTATE_PRERUN() { "prerunning"; }
sub EXPTSTATE_SWAPPED() { "swapped"; }
sub EXPTSTATE_QUEUED() { "queued"; }
sub EXPTSTATE_SWAPPING() { "swapping"; }
sub EXPTSTATE_ACTIVATING() { "activating"; }
sub EXPTSTATE_ACTIVE() { "active"; }
sub EXPTSTATE_TESTING() { "testing"; }
sub EXPTSTATE_TERMINATING() { "terminating"; }
sub EXPTSTATE_TERMINATED() { "ended"; }
sub EXPTSTATE_MODIFY_PARSE() { "modify_parse"; }
sub EXPTSTATE_MODIFY_REPARSE() { "modify_reparse"; }
sub EXPTSTATE_MODIFY_RESWAP() { "modify_reswap"; }
sub EXPTSTATE_RESTARTING() { "restarting"; }
# For the batch_daemon.
sub BATCHSTATE_LOCKED() { "locked";}
sub BATCHSTATE_UNLOCKED() { "unlocked";}
# These are really experiment states (both batch *and* plain).
sub BATCHSTATE_POSTED() { "posted"; }
sub BATCHSTATE_ACTIVATING() { "activating"; }
sub BATCHSTATE_RUNNING() { "active"; }
sub BATCHSTATE_RUNNING_LOCKED() { "active_locked"; }
sub BATCHSTATE_MODIFYING() { "modifying"; }
sub BATCHSTATE_PAUSED() { "paused"; }
sub BATCHSTATE_TERMINATING() { "terminating"; }
# Cancel flags
sub BATCHMODE_CANCELCLEAR { 0 ;}
sub BATCHMODE_CANCELTERM { 1 ;}
sub BATCHMODE_CANCELSWAP { 2 ;}
sub EXPTCANCEL_CLEAR() { 0 ;}
sub EXPTCANCEL_TERM() { 1 ;}
sub EXPTCANCEL_SWAP() { 2 ;}
sub USERSTATUS_ACTIVE() { "active"; }
sub USERSTATUS_FROZEN() { "frozen"; }
......@@ -348,7 +347,7 @@ sub TB_USERINFO_MODIFYINFO() { 2; }
sub TB_USERINFO_MIN() { TB_USERINFO_READINFO; }
sub TB_USERINFO_MAX() { TB_USERINFO_MODIFYINFO; }
# Experiments (also batch experiments).
# Experiments.
sub TB_EXPT_READINFO() { 1; }
sub TB_EXPT_MODIFY() { 2; }
sub TB_EXPT_DESTROY() { 3; }
......@@ -1302,7 +1301,7 @@ sub TBLockExp($$;$)
my $query_result =
DBQueryWarn("update experiments set expt_locked=now() ".
(defined($newstate) ? ",batchstate='$newstate' " : "") .
(defined($newstate) ? ",state='$newstate' " : "") .
"where eid='$eid' and pid='$pid'");
if (! $query_result ||
......@@ -1324,7 +1323,7 @@ sub TBExpLocked($$;$)
my($pid, $eid, $curstate) = @_;
my $query_result =
DBQueryWarn("select expt_locked,batchstate from experiments ".
DBQueryWarn("select expt_locked,state from experiments ".
"where eid='$eid' and pid='$pid'");
if (! $query_result ||
......@@ -1353,7 +1352,7 @@ sub TBUnLockExp($$;$)
my $query_result =
DBQueryWarn("update experiments set expt_locked=NULL ".
(defined($newstate) ? ",batchstate='$newstate' " : "") .
(defined($newstate) ? ",state='$newstate' " : "") .
"where eid='$eid' and pid='$pid'");
if (! $query_result ||
......@@ -1364,59 +1363,13 @@ sub TBUnLockExp($$;$)
}
#
# Return BatchMode state.
# Set cancel flag,
#
# usage: TBBatchState(char *pid, char *eid)
# returns state if a valid pid/eid.
# returns 0 if an invalid pid/eid or if an error.
#
sub TBBatchState($$)
{
    my ($pid, $eid) = @_;

    # Only rows flagged as batch mode are considered.
    my $lookup = "select batchstate from experiments ".
	"where eid='$eid' and pid='$pid' and batchmode=1";
    my $result = DBQueryWarn($lookup);

    # A failed query or an empty result both report 0 to the caller.
    return 0
	unless ($result && $result->numrows != 0);

    # The first column of the fetched row is the batchstate value.
    my ($batchstate) = $result->fetchrow_array();
    return $batchstate;
}
#
# Set BatchMode state.
#
# usage: SetBatchState(char *pid, char *eid, char *state)
# returns 1 if okay.
# returns 0 if an invalid pid/eid or if an error.
#
sub TBSetBatchState($$$)
{
    my ($pid, $eid, $state) = @_;

    # Storing a batch state also sets batchmode=1 on the experiment.
    my $update = "update experiments set batchstate='$state',batchmode=1 ".
	"where eid='$eid' and pid='$pid'";
    my $result = DBQueryWarn($update);

    # 1 only when the query returned a result whose numrows is nonzero.
    return ($result && $result->numrows != 0) ? 1 : 0;
}
#
# Set BatchMode cancel flag,
#
# usage: SetBatchCancel(char *pid, char *eid, char *flag)
# usage: SetCancelFlag(char *pid, char *eid, char *flag)
# returns 1 if okay.
# returns 0 if an invalid pid/eid or if an error.
#
sub TBSetBatchCancelFlag($$$)
sub TBSetCancelFlag($$$)
{
my($pid, $eid, $flag) = @_;
......@@ -1432,13 +1385,13 @@ sub TBSetBatchCancelFlag($$$)
}
#
# Get BatchMode cancel flag,
# Get cancel flag,
#
# usage: GetBatchCancel(char *pid, char *eid, char **flag)
# usage: TBGetCancelFlag(char *pid, char *eid, char **flag)
# returns 1 if okay.
# returns 0 if an invalid pid/eid or if an error.
#
sub TBGetBatchCancelFlag($$$)
sub TBGetCancelFlag($$$)
{
my($pid, $eid, $flag) = @_;
......@@ -2409,7 +2362,7 @@ sub MarkPhysNodeDown($)
DBQueryFatal("lock tables reserved write");
DBQueryFatal("update reserved set " .
" pid='$pid',eid='$eid' ".
" pid='$pid',eid='$eid',rsrv_time=now() ".
"where node_id='$pnode'");
DBQueryFatal("unlock tables");
......
......@@ -258,18 +258,10 @@ REPLACE INTO state_timeouts VALUES ('PCVM','TBSETUP',600,'NOTIFY');
REPLACE INTO state_transitions VALUES ('ALWAYSUP','ISUP','SHUTDOWN','Reboot');
REPLACE INTO state_transitions VALUES ('ALWAYSUP','SHUTDOWN','ISUP','BootDone');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','ACTIVATING','ACTIVE','SwappedIn');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','ACTIVATING','SWAPPED','Error');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','ACTIVE','SWAPPING','SwapOut');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','PRERUN','SWAPPED','Create');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','SWAPPED','ACTIVATING','SwapIn');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','SWAPPED','TERMINATING','End');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','SWAPPING','SWAPPED','SwappedOut');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','TESTING','SWAPPING','SwapOut');
REPLACE INTO state_transitions VALUES ('MINIMAL','BOOTING','BOOTING','DHCPRetry');
REPLACE INTO state_transitions VALUES ('MINIMAL','BOOTING','ISUP','BootDone');
REPLACE INTO state_transitions VALUES ('MINIMAL','BOOTING','SHUTDOWN','Error');
REPLACE INTO state_transitions VALUES ('MINIMAL','ISUP','BOOTING','SilentReboot');
REPLACE INTO state_transitions VALUES ('MINIMAL','ISUP','BOOTING','KernelChange');
REPLACE INTO state_transitions VALUES ('MINIMAL','ISUP','SHUTDOWN','Reboot');
REPLACE INTO state_transitions VALUES ('MINIMAL','SHUTDOWN','BOOTING','DHCP');
REPLACE INTO state_transitions VALUES ('MINIMAL','SHUTDOWN','SHUTDOWN','Retry');
......@@ -365,28 +357,31 @@ REPLACE INTO state_transitions VALUES ('EXAMPLE','UNVERIFIED','NEW','Un-Approve'
REPLACE INTO state_transitions VALUES ('BATCHSTATE','ACTIVE','TERMINATING','SwapOut');
REPLACE INTO state_transitions VALUES ('BATCHSTATE','TERMINATING','SWAPPED','SwapOut');
REPLACE INTO state_transitions VALUES ('BATCHSTATE','SWAPPED','POSTED','RePost');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','ACTIVATING','ACTIVE','SwappedIn');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','ACTIVATING','SWAPPED','Error');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','POSTED','ACTIVATING','BatchRun');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','ACTIVE','SWAPPING','SwapOut');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','PRERUN','POSTED','Create');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','SWAPPED','ACTIVATING','SwapIn');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','SWAPPING','SWAPPED','SwappedOut');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','PRERUN','SWAPPED','Create');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','ACTIVATING','TESTING','Testing');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','SWAPPED','POSTED','ReBatch');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','SWAPPED','TERMINATING','End');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','NEW','ACTIVATING','');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','ACTIVATING','SWAPPED','');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','ACTIVATING','POSTED','');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','POSTED','ACTIVE','');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','SWAPPED','ACTIVE','');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','ACTIVE','SWAPPED','');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','SWAPPED','POSTED','');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','POSTED','SWAPPED','');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','SWAPPED','TERMINATED','');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','ACTIVE','ACTIVE','');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','ACTIVATING','ACTIVE','');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','MODIFY_RESWAP','SWAPPING','Nonrecover Error');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','NEW','PRERUN','Create');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','NEW','ENDED','Endexp');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','PRERUN','QUEUED','Batch');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','PRERUN','SWAPPED','Immediate');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','QUEUED','SWAPPED','Dequeue');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','QUEUED','ACTIVATING','BatchRun');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','QUEUED','TERMINATING','Endexp');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','SWAPPED','QUEUED','Queue');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','SWAPPED','ACTIVATING','SwapIn');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','ACTIVATING','ACTIVE','NoError');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','ACTIVATING','SWAPPED','Error');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','ACTIVE','SWAPPING','SwapOut');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','ACTIVE','RESTARTING','Restart');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','RESTARTING','ACTIVE','(No)Error');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','SWAPPING','SWAPPED','(No)Error');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','SWAPPED','TERMINATING','EndExp');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','TERMINATING','SWAPPED','Error');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','TERMINATING','ENDED','NoError');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','SWAPPED','MODIFY_PRERUN','Modify');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','MODIFY_PRERUN','SWAPPED','(No)Error');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','ACTIVE','MODIFY_PARSE','Modify');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','MODIFY_PARSE','MODIFY_RESWAP','NoError');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','MODIFY_PARSE','ACTIVE','Error');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','MODIFY_RESWAP','ACTIVE','(No)Error');
--
-- Dumping data for table `state_triggers`
......
......@@ -436,7 +436,7 @@ if ($plabcount && (keys(%virt_nodes) == $plabcount)) {
TBDebugTimeStamp("assign_loop started");
while (1) {
# Check cancel flag before continuing.
TBGetBatchCancelFlag($pid, $eid, \$canceled);
TBGetCancelFlag($pid, $eid, \$canceled);
fatal($WRAPPER_FAILED|$WRAPPER_FAILED_CANRECOVER,
"Cancel flag set; aborting assign run!")
if ($canceled);
......@@ -545,7 +545,7 @@ sub RunAssign ($)
}
# Check cancel flag.
TBGetBatchCancelFlag($pid, $eid, \$canceled);
TBGetCancelFlag($pid, $eid, \$canceled);
if ($canceled) {
if ((my $pgrp = getpgrp($childpid)) > 0) {
kill('TERM', -$pgrp);
......@@ -570,7 +570,7 @@ sub RunAssign ($)
}
# Check cancel flag before continuing.
TBGetBatchCancelFlag($pid, $eid, \$canceled);
TBGetCancelFlag($pid, $eid, \$canceled);
fatal($WRAPPER_FAILED|$WRAPPER_FAILED_CANRECOVER,
"Cancel flag set; aborting assign run!")
if ($canceled);
......@@ -1028,7 +1028,7 @@ foreach my $pnode (keys(%virtnodes)) {
my @ovlist = ();
# Check cancel flag before continuing.
TBGetBatchCancelFlag($pid, $eid, \$canceled);
TBGetCancelFlag($pid, $eid, \$canceled);
fatal($WRAPPER_FAILED,
"Cancel flag set; aborting assign run!")
if ($canceled);
......@@ -1158,7 +1158,7 @@ foreach my $pnode (keys(%virtnodes)) {
}
# Check cancel flag before continuing.
TBGetBatchCancelFlag($pid, $eid, \$canceled);
TBGetCancelFlag($pid, $eid, \$canceled);
fatal($WRAPPER_FAILED,
"Cancel flag set; aborting assign run!")
if ($canceled);
......
......@@ -61,11 +61,13 @@ my $batchlog = "$TB/log/batchlog";
my $projroot = "/proj";
my $debug = 0;
my $BSTATE_POSTED = BATCHSTATE_POSTED;
my $BSTATE_ACTIVATING = BATCHSTATE_ACTIVATING;
my $BSTATE_RUNNING = BATCHSTATE_RUNNING;
my $BSTATE_TERMINATING = BATCHSTATE_TERMINATING;
my $BSTATE_PAUSED = BATCHSTATE_PAUSED;
my $BSTATE_POSTED = EXPTSTATE_QUEUED;
my $BSTATE_ACTIVATING = EXPTSTATE_ACTIVATING;
my $BSTATE_RUNNING = EXPTSTATE_ACTIVE;
my $BSTATE_TERMINATING = EXPTSTATE_TERMINATING;
my $BSTATE_PAUSED = EXPTSTATE_SWAPPED;
my $BSTATE_LOCKED = BATCHSTATE_LOCKED;
my $BSTATE_UNLOCKED = BATCHSTATE_UNLOCKED;
#
# These are valid in the children, not the parent. I suppose I could use
......@@ -156,12 +158,12 @@ while (1) {
DBQueryWarn("SELECT e1.* FROM experiments as e1 ".
"left join experiments as e2 on ".
" e2.expt_head_uid=e1.expt_head_uid and ".
" e2.batchmode=1 and e2.batchstate='$BSTATE_RUNNING' and ".
" e2.batchmode=1 and e2.state='$BSTATE_RUNNING' and ".
" e1.pid=e2.pid and e1.eid!=e2.eid ".
"WHERE e2.eid is null and ".
" e1.batchmode=1 and e1.canceled=0 and ".
" e1.expt_locked is null and ".
" e1.batchstate='$BSTATE_POSTED' and ".
" e1.state='$BSTATE_POSTED' and ".
" (e1.attempts=0 or ".
" ((UNIX_TIMESTAMP() - ".
" UNIX_TIMESTAMP(e1.expt_start) > ($retry_wait)))) ".
......@@ -169,7 +171,7 @@ while (1) {
$running_result =
DBQuery("select * from experiments ".
"where batchmode=1 and batchstate='$BSTATE_RUNNING' ".
"where batchmode=1 and state='$BSTATE_RUNNING' ".
"ORDER BY expt_start LIMIT 1");
if (!$pending_result || !$running_result) {
......@@ -184,9 +186,8 @@ while (1) {
}
#
# If we have a pending experiment to run, set its state to configuring
# right away, while we have the tables locked. This prevents endexp
# from seeing it as something it can cancel.
# If we have a pending experiment to run, then lock it right away,
# while we have the tables locked.
#
if ($pending_result->numrows) {
%pending_row = $pending_result->fetchhash();
......@@ -196,8 +197,9 @@ while (1) {
my $pid = $pending_row{'pid'};
$query_result =
DBQuery("update experiments set expt_start=now(), ".
"batchstate='$BSTATE_ACTIVATING' ".
DBQuery("update experiments set ".
" expt_locked=now(), ".
" batchstate='$BSTATE_LOCKED' ".
"where eid='$eid' and pid='$pid'");
if (! $query_result) {
......@@ -218,66 +220,55 @@ while (1) {
# loop instead of in the child that started the experiment, it's so that
# we fire up again and look for them in the event that paper goes down.
#
if ($running_result->numrows) {
if (!$running_result->numrows) {
DBQueryWarn("unlock tables");
}
else {
my %running_row = $running_result->fetchhash();
my $canceled = $running_row{'canceled'};
if ($canceled) {
# Local vars!
my $eid = $running_row{'eid'};
my $pid = $running_row{'pid'};
# Local vars!
my $eid = $running_row{'eid'};
my $pid = $running_row{'pid'};
#
# Have to set the state to busy so that no one will be able
# to mess with the experiment while we deal with termination.
#
TBSetBatchState($pid, $eid, BATCHSTATE_RUNNING_LOCKED());
DBQueryWarn("unlock tables");
# Look at the cancel flag.
if ($canceled == BATCHMODE_CANCELTERM) {
dosomething("cancel", %running_row);
}
elsif ($canceled == BATCHMODE_CANCELSWAP) {
dosomething("swap", %running_row);
}
else {
print "Improper cancel flag: $canceled\n";
}
}
else {
#
# Have to set the state to busy so that no one will be able
# to mess with the experiment while trying to determine if
# the batch is done.
#
TBSetBatchState($pid, $eid, BATCHSTATE_RUNNING_LOCKED());
DBQueryWarn("unlock tables");
if (isexpdone(%running_row)) {
#
# Terminate the experiment. Set the state appropriately
# so that swapexp will accept it. It is okay to do this
# with the table unlocked since no one is allowed to mess
# with a batch experiment in the RUNNING_LOCKED state.
#
dosomething("swap", %running_row);
#
# Lock so user cannot mess with it.
#
$query_result =
DBQuery("update experiments set ".
" expt_locked=now(), ".
" batchstate='$BSTATE_LOCKED' ".
"where eid='$eid' and pid='$pid'");
DBQueryWarn("unlock tables");
if ($query_result) {
if ($canceled) {
# Look at the cancel flag.
if ($canceled == EXPTCANCEL_TERM) {
dosomething("cancel", %running_row);
}
elsif ($canceled == EXPTCANCEL_SWAP) {
dosomething("swap", %running_row);
}
else {
print "Improper cancel flag: $canceled\n";
}
}
else {
#
# Reset the state to RUNNING. It is okay to do this with
# the table unlocked since no one is allowed to mess
# with a batch experiment in the RUNNING_LOCKED state.
#
TBSetBatchState($pid, $eid, $BSTATE_RUNNING);
if (isexpdone(%running_row)) {
#
# Terminate the experiment.
#
dosomething("swap", %running_row);
}
else {
#
# Unlock.
#
TBBatchUnLockExp($pid, $eid);
}
}
}
}
else {
# no one above unlocked the tables ...
DBQueryWarn("unlock tables");
}
#
# Finally start an actual experiment!
#
......@@ -444,15 +435,15 @@ sub startexp($)
$exphash{'canceled'} = $canceled;
# Yuck: This is strictly for the benefit of swapexp() below.
$exphash{'batchstate'} = BATCHSTATE_RUNNING
$exphash{'state'} = EXPTSTATE_ACTIVE
if ($running);
if ($canceled) {
# Look at the cancel flag.
if ($canceled == BATCHMODE_CANCELTERM) {
if ($canceled == EXPTCANCEL_TERM) {
cancelexp(%exphash);
}
elsif ($canceled == BATCHMODE_CANCELSWAP) {
elsif ($canceled == EXPTCANCEL_SWAP) {
swapexp(%exphash);
}
else {
......@@ -488,8 +479,7 @@ sub startexp($)
# never going to work.
#
if ($exit_status == 1 || $exit_status == -1) {
TBSetBatchState($pid, $eid, $BSTATE_PAUSED);
TBUnLockExp($pid, $eid);
TBBatchUnLockExp($pid, $eid, EXPTSTATE_SWAPPED());
email_status("Experiment startup has failed with a fatal error!\n".
"Batch has been dequeued so that you may check it.");
......@@ -514,22 +504,14 @@ sub startexp($)
email_status($msg);
}
#
# There is some state that needs to be reset so that another
# attempt can be made.
#
TBSetBatchState($pid, $eid, $BSTATE_POSTED);
TBUnLockExp($pid, $eid);
TBBatchUnLockExp($pid, $eid, EXPTSTATE_QUEUED());
exit($exit_status);
}
#
# Well, it configured! Lets set it state to running.
# Well, it configured! We can now unlock it.
#
TBSetBatchState($pid, $eid, $BSTATE_RUNNING);
TBUnLockExp($pid, $eid);
TBBatchUnLockExp($pid, $eid);
email_status("Batch Mode experiment $pid/$eid is now running!\n".
"Please consult the Web interface to see how it is doing.");
......@@ -547,15 +529,9 @@ sub swapexp($;$)
{
my(%exphash) = @_;
my $canceled = $exphash{'canceled'};
my $running = ($exphash{'batchstate'} eq BATCHSTATE_RUNNING);
my $running = ($exphash{'state'} eq EXPTSTATE_ACTIVE);
if ($running) {
#
# Have to set the state to terminating so that swap/end exp
# will accept it.
#
TBSetBatchState($pid, $eid, $BSTATE_TERMINATING);
system("$swapexp -b -s out $pid $eid");
if ($?) {
#
......@@ -568,9 +544,8 @@ sub swapexp($;$)
# Set the state to paused to ensure that it is not run again until
# the user wants to.
#
TBSetBatchCancelFlag($pid, $eid, BATCHMODE_CANCELCLEAR);
TBSetBatchState($pid, $eid, $BSTATE_PAUSED);
TBUnLockExp($pid, $eid);
TBSetCancelFlag($pid, $eid, EXPTCANCEL_CLEAR);
TBBatchUnLockExp($pid, $eid);
if ($canceled) {
email_status("Batch Mode experiment $pid/$eid has been stopped!");
......@@ -592,11 +567,6 @@ sub cancelexp($)
{
my(%exphash) = @_;
#
# Have to set the state to terminating so that swap/end exp will accept it.
#
TBSetBatchState($pid, $eid, $BSTATE_TERMINATING);
#
# It does not matter if the experiment is running; endexp does the
# right thing.
......@@ -743,3 +713,14 @@ sub donotify($$$)
SENDMAIL($to, $subject, $mesg, $from, $hdrs,
($logname, "assign.log", $nsfile));
}
#
# Release the batch daemon's lock on an experiment, optionally moving the
# experiment to a new state in the same update.
#
# usage: TBBatchUnLockExp(char *pid, char *eid [, char *newstate])
#        returns 1 if okay.
#        returns 0 if the update query failed.
#
sub TBBatchUnLockExp($$;$)
{
    my($pid, $eid, $newstate) = @_;

    # Clear expt_locked and mark the batchstate slot unlocked. The state
    # column is only written when the caller supplies a new state.
    my $query_result =
	DBQueryWarn("update experiments set expt_locked=NULL, ".
		    " batchstate='$BSTATE_UNLOCKED' ".
		    (defined($newstate) ? ",state='$newstate' " : "") .
		    "where eid='$eid' and pid='$pid'");

    # Report a failed query instead of claiming success unconditionally;
    # an experiment left locked would otherwise go unnoticed.
    return 0
	if (! $query_result);

    return 1;
}
......@@ -92,7 +92,6 @@ my $autoswaptime = 10 * 60;
my $idleignore = 0;
my $priority = TB_EXPTPRIORITY_LOW;
my $exptstate = EXPTSTATE_NEW();
my $swapstate = BATCHSTATE_ACTIVATING();
#
# Verify user and get his DB uid.
......@@ -192,12 +191,12 @@ if (! DBQueryWarn("INSERT INTO experiments ".
"(eid, pid, gid, expt_created, expt_expires, expt_name,".
" expt_head_uid,expt_swap_uid, state, priority, swappable,".
" idleswap, idleswap_timeout, autoswap, autoswap_timeout,".
" idle_ignore, keyhash, batchstate, expt_locked, eventkey) ".
" idle_ignore, keyhash, expt_locked, eventkey) ".
"VALUES ('$eid', '$pid', '$gid', now(), '$expires', ".
"$description,'$dbuid', '$dbuid', '$exptstate', $priority, ".
"$swappable, $idleswap, '$swaptime', $autoswap, ".
"'$autoswaptime', $idleignore, '$webkey', ".
"'$swapstate', now(), '$eventkey')")) {
"now(), '$eventkey')")) {
DBQueryWarn("unlock tables");
die("*** $0:\n".
" Database error inserting record for $pid/$eid!\n");
......@@ -254,7 +253,7 @@ if ($EVENTSYS) {
# the user is forced to do a modify first (to give it a topology).
#
if (!defined($tempnsfile)) {
TBUnLockExp($pid, $eid, BATCHSTATE_PAUSED);
TBUnLockExp($pid, $eid, EXPTSTATE_NEW());
exit(0);
}
......@@ -305,44 +304,23 @@ if ($nsfile_string) {
}
#
# Check for immediate or batch experiment. If immediate, fire off the
# the startexp script to do the rest. It exits and so do we; user gets
# email later. If its a batch experiment, update the experiment record
# so that the batch daemon will see it and act.
# A batch experiment is essentially preloaded (frontend mode) and then
# dropped into the batch queue, unless the user requested only preload.
# Startexp figures all this out, and in fact this script could easily
# be merged with startexp. Note that we call startexp with the experiment
# locked, and it checks to make sure.
#
# Note that we hand off to startexp with the experiment locked and ACTIVATING.
# This is "okay" since no one else calls startexp.
#
if ($immediate) {
my $optargs = "";
$optargs .= " -f"
if ($frontend);
$optargs .= " -b"
if ($quiet);
my $optargs = "";
$optargs .= " -f"
if ($frontend);
$optargs .= " -b"
if ($quiet);
if (system("$startexp $optargs -g $gid $pid $eid $nsfile")) {
# Obey exit status protocol for web page.
$errorstat = 1;
fatal("Failed to start experiment $pid/$eid!");
}
}
else {
#
# Preload the experiment in the foreground. User sees parse errors
# right away, and the experiment is now in the system so we can look
# at it.
#
if (system("$startexp -f -b -g $gid $pid $eid $nsfile")) {
# Obey exit status protocol for web page.
$errorstat = 1;
fatal("Failed to preload batch experiment $pid/$eid!");
}
# And drop the batch into the queue unless the user was preloading
# a batch experiment.
TBSetBatchState($pid, $eid,
($frontend ? BATCHSTATE_PAUSED : BATCHSTATE_POSTED));
if (system("$startexp $optargs -g $gid $pid $eid $nsfile")) {
# Obey exit status protocol for web page.
$errorstat = 1;
fatal("Failed to start experiment $pid/$eid!");
}
exit(0);
sub fatal($)
......
......@@ -66,10 +66,14 @@ use lib "@prefix@/lib";
use libdb;
use libtestbed;
# Be careful not to exit on transient error
$libdb::DBQUERY_MAXTRIES = 30;
my $tbdir = "$TB/bin/";
my $projroot = "/proj";
my $tbdata = "tbdata";
my $batch = 0;
my $nextstate;
my $logname;
my $dbuid;
my $user_name;
......@@ -168,32 +172,51 @@ if (! $query_result->numrows) {
my %hashrow = $query_result->fetchhash();
my $expt_head_login = $hashrow{'expt_head_uid'};
my $estate = $hashrow{'state'};
my $batchstate = $hashrow{'batchstate'};
my $expt_path = $hashrow{'path'};
my $isbatchexpt = $hashrow{'batchmode'};
my $ebatchstate = $hashrow{'batchstate'};
my $cancelflag = $hashrow{'canceled'};
my $expt_locked = $hashrow{'expt_locked'};
#
# Batch experiments get a different protocol to avoid races with the
# batch daemon. We can kill the experiment directly, but only if the
# batch daemon is not currently working on it. In this case, its the
# same as killing an experiment that is not running.
# same as killing an experiment that is SWAPPED.
#
# XXX: This script is run from the batch daemon. Use the batch state
# to determine when this was invoked from the batch daemon for a valid
# teardown.
# XXX: This script is run from the batch daemon.
#
if ($isbatchexpt) {
if ($batch) {
#
# Sanity Check. If called from the daemon, must be in the proper state.
# Sanity Check. If called from the daemon, must already be locked,
# must be a batch experiment, and must be ACTIVE or SWAPPED.
#
if ($batch) {
die("*** $0:\n".
" Batch experiment $pid/$eid is not in the correct state!\n".
" Currently $ebatchstate, but should be TERMINATING\n")
if ($ebatchstate ne BATCHSTATE_TERMINATING);
}
else {
die("*** $0:\n".
" Experiment $pid/$eid is supposed to be a batch experiment!\n")
if (!$isbatchexpt);
die("*** $0:\n".
" Batch experiment $pid/$eid should be locked!\n")
if (!defined($expt_locked) ||
$batchstate ne BATCHSTATE_LOCKED());
die("*** $0:\n".
" Batch experiment $pid/$eid is not in the correct state!\n".
" Currently $estate, but should be SWAPPED,QUEUED, or ACTIVE\n")
if ($estate ne EXPTSTATE_ACTIVE &&
$estate ne EXPTSTATE_QUEUED &&
$estate ne EXPTSTATE_SWAPPED);