Commit 2025e0bd authored by Leigh B. Stoller's avatar Leigh B. Stoller

Merge the two state machines (batchstate and state) into a single

state machine (state). Everything that was previously handled by
using batchstate is now embedded into the one state machine. Of
course, these mostly overlapped, so it's not that much of a change,
except that we also redid the machine, adding more states (for
example, modify phases are now explicit). To get a picture of the
actual state machine, run the following on boss:

		stategraph -o newstates EXPTSTATE
		gv newstates.ps

Things to note:

* The "batchstate" slot of the experiments table is now used solely to
  provide a lock for the batch daemon. A secondary change will be to
  rename the slot to something more appropriate, but that can
  happen anytime after this new stuff is installed.

* I have left expt_locked for now, but a later change will remove
  expt_locked and replace it with active_busy or some such new state name in
  the state machine. I have removed most uses of expt_locked, except those
  that are necessary until there is a new state to replace it.

* These new changes are an implementation of the new state machine,
  but I have not done anything fancy. Most of the code is the same as
  it was before.

* I suspect that there are races with the batch daemon now, but they
  are going to be rare, and the end result is probably that a
  cancelation is delayed a little bit.
parent b1de9fb2
...@@ -63,14 +63,14 @@ use Exporter; ...@@ -63,14 +63,14 @@ use Exporter;
DBLIMIT_NSFILESIZE NODERELOADPENDING_EID DBLIMIT_NSFILESIZE NODERELOADPENDING_EID
EXPTSTATE_NEW EXPTSTATE_PRERUN EXPTSTATE_SWAPPED EXPTSTATE_SWAPPING EXPTSTATE_NEW EXPTSTATE_PRERUN EXPTSTATE_SWAPPED EXPTSTATE_SWAPPING
EXPTSTATE_ACTIVATING EXPTSTATE_ACTIVE EXPTSTATE_TESTING EXPTSTATE_ACTIVATING EXPTSTATE_ACTIVE
EXPTSTATE_TERMINATING EXPTSTATE_TERMINATED EXPTSTATE_UPDATING EXPTSTATE_TERMINATING EXPTSTATE_TERMINATED EXPTSTATE_QUEUED
EXPTSTATE_MODIFY_PARSE EXPTSTATE_MODIFY_REPARSE EXPTSTATE_MODIFY_RESWAP
EXPTSTATE_RESTARTING
BATCHSTATE_LOCKED BATCHSTATE_UNLOCKED
EXPTCANCEL_CLEAR EXPTCANCEL_TERM EXPTCANCEL_SWAP
BATCHSTATE_POSTED BATCHSTATE_RUNNING BATCHSTATE_TERMINATING TBSetCancelFlag TBGetCancelFlag
BATCHSTATE_ACTIVATING BATCHSTATE_PAUSED
BATCHSTATE_RUNNING_LOCKED BATCHSTATE_MODIFYING
BATCHMODE_CANCELTERM BATCHMODE_CANCELSWAP BATCHMODE_CANCELCLEAR
TBBatchState TBSetBatchState TBSetBatchCancelFlag TBGetBatchCancelFlag
TB_NODELOGTYPE_MISC TB_NODELOGTYPES TB_DEFAULT_NODELOGTYPE TB_NODELOGTYPE_MISC TB_NODELOGTYPES TB_DEFAULT_NODELOGTYPE
...@@ -287,29 +287,28 @@ sub NODEFAILMODE_FATAL() { "fatal"; } ...@@ -287,29 +287,28 @@ sub NODEFAILMODE_FATAL() { "fatal"; }
sub NODEFAILMODE_NONFATAL() { "nonfatal"; } sub NODEFAILMODE_NONFATAL() { "nonfatal"; }
sub NODEFAILMODE_IGNORE() { "ignore"; } sub NODEFAILMODE_IGNORE() { "ignore"; }
# These are really "sub" states. # Experiment states
sub EXPTSTATE_NEW() { "new"; } sub EXPTSTATE_NEW() { "new"; }
sub EXPTSTATE_PRERUN() { "prerunning"; } sub EXPTSTATE_PRERUN() { "prerunning"; }
sub EXPTSTATE_SWAPPED() { "swapped"; } sub EXPTSTATE_SWAPPED() { "swapped"; }
sub EXPTSTATE_QUEUED() { "queued"; }
sub EXPTSTATE_SWAPPING() { "swapping"; } sub EXPTSTATE_SWAPPING() { "swapping"; }
sub EXPTSTATE_ACTIVATING() { "activating"; } sub EXPTSTATE_ACTIVATING() { "activating"; }
sub EXPTSTATE_ACTIVE() { "active"; } sub EXPTSTATE_ACTIVE() { "active"; }
sub EXPTSTATE_TESTING() { "testing"; }
sub EXPTSTATE_TERMINATING() { "terminating"; } sub EXPTSTATE_TERMINATING() { "terminating"; }
sub EXPTSTATE_TERMINATED() { "ended"; } sub EXPTSTATE_TERMINATED() { "ended"; }
sub EXPTSTATE_MODIFY_PARSE() { "modify_parse"; }
sub EXPTSTATE_MODIFY_REPARSE() { "modify_reparse"; }
sub EXPTSTATE_MODIFY_RESWAP() { "modify_reswap"; }
sub EXPTSTATE_RESTARTING() { "restarting"; }
# For the batch_daemon.
sub BATCHSTATE_LOCKED() { "locked";}
sub BATCHSTATE_UNLOCKED() { "unlocked";}
# These are really experiment states (both batch *and* plain).
sub BATCHSTATE_POSTED() { "posted"; }
sub BATCHSTATE_ACTIVATING() { "activating"; }
sub BATCHSTATE_RUNNING() { "active"; }
sub BATCHSTATE_RUNNING_LOCKED() { "active_locked"; }
sub BATCHSTATE_MODIFYING() { "modifying"; }
sub BATCHSTATE_PAUSED() { "paused"; }
sub BATCHSTATE_TERMINATING() { "terminating"; }
# Cancel flags # Cancel flags
sub BATCHMODE_CANCELCLEAR { 0 ;} sub EXPTCANCEL_CLEAR() { 0 ;}
sub BATCHMODE_CANCELTERM { 1 ;} sub EXPTCANCEL_TERM() { 1 ;}
sub BATCHMODE_CANCELSWAP { 2 ;} sub EXPTCANCEL_SWAP() { 2 ;}
sub USERSTATUS_ACTIVE() { "active"; } sub USERSTATUS_ACTIVE() { "active"; }
sub USERSTATUS_FROZEN() { "frozen"; } sub USERSTATUS_FROZEN() { "frozen"; }
...@@ -348,7 +347,7 @@ sub TB_USERINFO_MODIFYINFO() { 2; } ...@@ -348,7 +347,7 @@ sub TB_USERINFO_MODIFYINFO() { 2; }
sub TB_USERINFO_MIN() { TB_USERINFO_READINFO; } sub TB_USERINFO_MIN() { TB_USERINFO_READINFO; }
sub TB_USERINFO_MAX() { TB_USERINFO_MODIFYINFO; } sub TB_USERINFO_MAX() { TB_USERINFO_MODIFYINFO; }
# Experiments (also batch experiments). # Experiments.
sub TB_EXPT_READINFO() { 1; } sub TB_EXPT_READINFO() { 1; }
sub TB_EXPT_MODIFY() { 2; } sub TB_EXPT_MODIFY() { 2; }
sub TB_EXPT_DESTROY() { 3; } sub TB_EXPT_DESTROY() { 3; }
...@@ -1302,7 +1301,7 @@ sub TBLockExp($$;$) ...@@ -1302,7 +1301,7 @@ sub TBLockExp($$;$)
my $query_result = my $query_result =
DBQueryWarn("update experiments set expt_locked=now() ". DBQueryWarn("update experiments set expt_locked=now() ".
(defined($newstate) ? ",batchstate='$newstate' " : "") . (defined($newstate) ? ",state='$newstate' " : "") .
"where eid='$eid' and pid='$pid'"); "where eid='$eid' and pid='$pid'");
if (! $query_result || if (! $query_result ||
...@@ -1324,7 +1323,7 @@ sub TBExpLocked($$;$) ...@@ -1324,7 +1323,7 @@ sub TBExpLocked($$;$)
my($pid, $eid, $curstate) = @_; my($pid, $eid, $curstate) = @_;
my $query_result = my $query_result =
DBQueryWarn("select expt_locked,batchstate from experiments ". DBQueryWarn("select expt_locked,state from experiments ".
"where eid='$eid' and pid='$pid'"); "where eid='$eid' and pid='$pid'");
if (! $query_result || if (! $query_result ||
...@@ -1353,7 +1352,7 @@ sub TBUnLockExp($$;$) ...@@ -1353,7 +1352,7 @@ sub TBUnLockExp($$;$)
my $query_result = my $query_result =
DBQueryWarn("update experiments set expt_locked=NULL ". DBQueryWarn("update experiments set expt_locked=NULL ".
(defined($newstate) ? ",batchstate='$newstate' " : "") . (defined($newstate) ? ",state='$newstate' " : "") .
"where eid='$eid' and pid='$pid'"); "where eid='$eid' and pid='$pid'");
if (! $query_result || if (! $query_result ||
...@@ -1364,59 +1363,13 @@ sub TBUnLockExp($$;$) ...@@ -1364,59 +1363,13 @@ sub TBUnLockExp($$;$)
} }
# #
# Return BatchMode state. # Set cancel flag,
# #
# usage: TBBatchState(char *pid, char *eid) # usage: SetCancelFlag(char *pid, char *eid, char *flag)
# returns state if a valid pid/eid.
# returns 0 if an invalid pid/eid or if an error.
#
sub TBBatchState($$)
{
my($pid, $eid) = @_;
my $query_result =
DBQueryWarn("select batchstate from experiments ".
"where eid='$eid' and pid='$pid' and batchmode=1");
if (! $query_result ||
$query_result->numrows == 0) {
return 0;
}
my @row = $query_result->fetchrow_array();
return $row[0];
}
#
# Set BatchMode state.
#
# usage: SetBatchState(char *pid, char *eid, char *state)
# returns 1 if okay.
# returns 0 if an invalid pid/eid or if an error.
#
sub TBSetBatchState($$$)
{
my($pid, $eid, $state) = @_;
my $query_result =
DBQueryWarn("update experiments set batchstate='$state',batchmode=1 ".
"where eid='$eid' and pid='$pid'");
if (! $query_result ||
$query_result->numrows == 0) {
return 0;
}
return 1;
}
#
# Set BatchMode cancel flag,
#
# usage: SetBatchCancel(char *pid, char *eid, char *flag)
# returns 1 if okay. # returns 1 if okay.
# returns 0 if an invalid pid/eid or if an error. # returns 0 if an invalid pid/eid or if an error.
# #
sub TBSetBatchCancelFlag($$$) sub TBSetCancelFlag($$$)
{ {
my($pid, $eid, $flag) = @_; my($pid, $eid, $flag) = @_;
...@@ -1432,13 +1385,13 @@ sub TBSetBatchCancelFlag($$$) ...@@ -1432,13 +1385,13 @@ sub TBSetBatchCancelFlag($$$)
} }
# #
# Get BatchMode cancel flag, # Get cancel flag,
# #
# usage: GetBatchCancel(char *pid, char *eid, char **flag) # usage: TBGetCancelFlag(char *pid, char *eid, char **flag)
# returns 1 if okay. # returns 1 if okay.
# returns 0 if an invalid pid/eid or if an error. # returns 0 if an invalid pid/eid or if an error.
# #
sub TBGetBatchCancelFlag($$$) sub TBGetCancelFlag($$$)
{ {
my($pid, $eid, $flag) = @_; my($pid, $eid, $flag) = @_;
...@@ -2409,7 +2362,7 @@ sub MarkPhysNodeDown($) ...@@ -2409,7 +2362,7 @@ sub MarkPhysNodeDown($)
DBQueryFatal("lock tables reserved write"); DBQueryFatal("lock tables reserved write");
DBQueryFatal("update reserved set " . DBQueryFatal("update reserved set " .
" pid='$pid',eid='$eid' ". " pid='$pid',eid='$eid',rsrv_time=now() ".
"where node_id='$pnode'"); "where node_id='$pnode'");
DBQueryFatal("unlock tables"); DBQueryFatal("unlock tables");
......
...@@ -258,18 +258,10 @@ REPLACE INTO state_timeouts VALUES ('PCVM','TBSETUP',600,'NOTIFY'); ...@@ -258,18 +258,10 @@ REPLACE INTO state_timeouts VALUES ('PCVM','TBSETUP',600,'NOTIFY');
REPLACE INTO state_transitions VALUES ('ALWAYSUP','ISUP','SHUTDOWN','Reboot'); REPLACE INTO state_transitions VALUES ('ALWAYSUP','ISUP','SHUTDOWN','Reboot');
REPLACE INTO state_transitions VALUES ('ALWAYSUP','SHUTDOWN','ISUP','BootDone'); REPLACE INTO state_transitions VALUES ('ALWAYSUP','SHUTDOWN','ISUP','BootDone');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','ACTIVATING','ACTIVE','SwappedIn');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','ACTIVATING','SWAPPED','Error');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','ACTIVE','SWAPPING','SwapOut');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','PRERUN','SWAPPED','Create');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','SWAPPED','ACTIVATING','SwapIn');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','SWAPPED','TERMINATING','End');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','SWAPPING','SWAPPED','SwappedOut');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','TESTING','SWAPPING','SwapOut');
REPLACE INTO state_transitions VALUES ('MINIMAL','BOOTING','BOOTING','DHCPRetry'); REPLACE INTO state_transitions VALUES ('MINIMAL','BOOTING','BOOTING','DHCPRetry');
REPLACE INTO state_transitions VALUES ('MINIMAL','BOOTING','ISUP','BootDone'); REPLACE INTO state_transitions VALUES ('MINIMAL','BOOTING','ISUP','BootDone');
REPLACE INTO state_transitions VALUES ('MINIMAL','BOOTING','SHUTDOWN','Error'); REPLACE INTO state_transitions VALUES ('MINIMAL','BOOTING','SHUTDOWN','Error');
REPLACE INTO state_transitions VALUES ('MINIMAL','ISUP','BOOTING','SilentReboot'); REPLACE INTO state_transitions VALUES ('MINIMAL','ISUP','BOOTING','KernelChange');
REPLACE INTO state_transitions VALUES ('MINIMAL','ISUP','SHUTDOWN','Reboot'); REPLACE INTO state_transitions VALUES ('MINIMAL','ISUP','SHUTDOWN','Reboot');
REPLACE INTO state_transitions VALUES ('MINIMAL','SHUTDOWN','BOOTING','DHCP'); REPLACE INTO state_transitions VALUES ('MINIMAL','SHUTDOWN','BOOTING','DHCP');
REPLACE INTO state_transitions VALUES ('MINIMAL','SHUTDOWN','SHUTDOWN','Retry'); REPLACE INTO state_transitions VALUES ('MINIMAL','SHUTDOWN','SHUTDOWN','Retry');
...@@ -365,28 +357,31 @@ REPLACE INTO state_transitions VALUES ('EXAMPLE','UNVERIFIED','NEW','Un-Approve' ...@@ -365,28 +357,31 @@ REPLACE INTO state_transitions VALUES ('EXAMPLE','UNVERIFIED','NEW','Un-Approve'
REPLACE INTO state_transitions VALUES ('BATCHSTATE','ACTIVE','TERMINATING','SwapOut'); REPLACE INTO state_transitions VALUES ('BATCHSTATE','ACTIVE','TERMINATING','SwapOut');
REPLACE INTO state_transitions VALUES ('BATCHSTATE','TERMINATING','SWAPPED','SwapOut'); REPLACE INTO state_transitions VALUES ('BATCHSTATE','TERMINATING','SWAPPED','SwapOut');
REPLACE INTO state_transitions VALUES ('BATCHSTATE','SWAPPED','POSTED','RePost'); REPLACE INTO state_transitions VALUES ('BATCHSTATE','SWAPPED','POSTED','RePost');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','ACTIVATING','ACTIVE','SwappedIn'); REPLACE INTO state_transitions VALUES ('EXPTSTATE','MODIFY_RESWAP','SWAPPING','Nonrecover Error');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','ACTIVATING','SWAPPED','Error'); REPLACE INTO state_transitions VALUES ('EXPTSTATE','NEW','PRERUN','Create');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','POSTED','ACTIVATING','BatchRun'); REPLACE INTO state_transitions VALUES ('EXPTSTATE','NEW','ENDED','Endexp');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','ACTIVE','SWAPPING','SwapOut'); REPLACE INTO state_transitions VALUES ('EXPTSTATE','PRERUN','QUEUED','Batch');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','PRERUN','POSTED','Create'); REPLACE INTO state_transitions VALUES ('EXPTSTATE','PRERUN','SWAPPED','Immediate');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','SWAPPED','ACTIVATING','SwapIn'); REPLACE INTO state_transitions VALUES ('EXPTSTATE','QUEUED','SWAPPED','Dequeue');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','SWAPPING','SWAPPED','SwappedOut'); REPLACE INTO state_transitions VALUES ('EXPTSTATE','QUEUED','ACTIVATING','BatchRun');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','PRERUN','SWAPPED','Create'); REPLACE INTO state_transitions VALUES ('EXPTSTATE','QUEUED','TERMINATING','Endexp');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS','ACTIVATING','TESTING','Testing'); REPLACE INTO state_transitions VALUES ('EXPTSTATE','SWAPPED','QUEUED','Queue');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','SWAPPED','POSTED','ReBatch'); REPLACE INTO state_transitions VALUES ('EXPTSTATE','SWAPPED','ACTIVATING','SwapIn');
REPLACE INTO state_transitions VALUES ('EXPTSTATUS2','SWAPPED','TERMINATING','End'); REPLACE INTO state_transitions VALUES ('EXPTSTATE','ACTIVATING','ACTIVE','NoError');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','NEW','ACTIVATING',''); REPLACE INTO state_transitions VALUES ('EXPTSTATE','ACTIVATING','SWAPPED','Error');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','ACTIVATING','SWAPPED',''); REPLACE INTO state_transitions VALUES ('EXPTSTATE','ACTIVE','SWAPPING','SwapOut');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','ACTIVATING','POSTED',''); REPLACE INTO state_transitions VALUES ('EXPTSTATE','ACTIVE','RESTARTING','Restart');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','POSTED','ACTIVE',''); REPLACE INTO state_transitions VALUES ('EXPTSTATE','RESTARTING','ACTIVE','(No)Error');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','SWAPPED','ACTIVE',''); REPLACE INTO state_transitions VALUES ('EXPTSTATE','SWAPPING','SWAPPED','(No)Error');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','ACTIVE','SWAPPED',''); REPLACE INTO state_transitions VALUES ('EXPTSTATE','SWAPPED','TERMINATING','EndExp');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','SWAPPED','POSTED',''); REPLACE INTO state_transitions VALUES ('EXPTSTATE','TERMINATING','SWAPPED','Error');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','POSTED','SWAPPED',''); REPLACE INTO state_transitions VALUES ('EXPTSTATE','TERMINATING','ENDED','NoError');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','SWAPPED','TERMINATED',''); REPLACE INTO state_transitions VALUES ('EXPTSTATE','SWAPPED','MODIFY_PRERUN','Modify');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','ACTIVE','ACTIVE',''); REPLACE INTO state_transitions VALUES ('EXPTSTATE','MODIFY_PRERUN','SWAPPED','(No)Error');
REPLACE INTO state_transitions VALUES ('EXPT_STATES','ACTIVATING','ACTIVE',''); REPLACE INTO state_transitions VALUES ('EXPTSTATE','ACTIVE','MODIFY_PARSE','Modify');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','MODIFY_PARSE','MODIFY_RESWAP','NoError');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','MODIFY_PARSE','ACTIVE','Error');
REPLACE INTO state_transitions VALUES ('EXPTSTATE','MODIFY_RESWAP','ACTIVE','(No)Error');
-- --
-- Dumping data for table `state_triggers` -- Dumping data for table `state_triggers`
......
...@@ -436,7 +436,7 @@ if ($plabcount && (keys(%virt_nodes) == $plabcount)) { ...@@ -436,7 +436,7 @@ if ($plabcount && (keys(%virt_nodes) == $plabcount)) {
TBDebugTimeStamp("assign_loop started"); TBDebugTimeStamp("assign_loop started");
while (1) { while (1) {
# Check cancel flag before continuing. # Check cancel flag before continuing.
TBGetBatchCancelFlag($pid, $eid, \$canceled); TBGetCancelFlag($pid, $eid, \$canceled);
fatal($WRAPPER_FAILED|$WRAPPER_FAILED_CANRECOVER, fatal($WRAPPER_FAILED|$WRAPPER_FAILED_CANRECOVER,
"Cancel flag set; aborting assign run!") "Cancel flag set; aborting assign run!")
if ($canceled); if ($canceled);
...@@ -545,7 +545,7 @@ sub RunAssign ($) ...@@ -545,7 +545,7 @@ sub RunAssign ($)
} }
# Check cancel flag. # Check cancel flag.
TBGetBatchCancelFlag($pid, $eid, \$canceled); TBGetCancelFlag($pid, $eid, \$canceled);
if ($canceled) { if ($canceled) {
if ((my $pgrp = getpgrp($childpid)) > 0) { if ((my $pgrp = getpgrp($childpid)) > 0) {
kill('TERM', -$pgrp); kill('TERM', -$pgrp);
...@@ -570,7 +570,7 @@ sub RunAssign ($) ...@@ -570,7 +570,7 @@ sub RunAssign ($)
} }
# Check cancel flag before continuing. # Check cancel flag before continuing.
TBGetBatchCancelFlag($pid, $eid, \$canceled); TBGetCancelFlag($pid, $eid, \$canceled);
fatal($WRAPPER_FAILED|$WRAPPER_FAILED_CANRECOVER, fatal($WRAPPER_FAILED|$WRAPPER_FAILED_CANRECOVER,
"Cancel flag set; aborting assign run!") "Cancel flag set; aborting assign run!")
if ($canceled); if ($canceled);
...@@ -1028,7 +1028,7 @@ foreach my $pnode (keys(%virtnodes)) { ...@@ -1028,7 +1028,7 @@ foreach my $pnode (keys(%virtnodes)) {
my @ovlist = (); my @ovlist = ();
# Check cancel flag before continuing. # Check cancel flag before continuing.
TBGetBatchCancelFlag($pid, $eid, \$canceled); TBGetCancelFlag($pid, $eid, \$canceled);
fatal($WRAPPER_FAILED, fatal($WRAPPER_FAILED,
"Cancel flag set; aborting assign run!") "Cancel flag set; aborting assign run!")
if ($canceled); if ($canceled);
...@@ -1158,7 +1158,7 @@ foreach my $pnode (keys(%virtnodes)) { ...@@ -1158,7 +1158,7 @@ foreach my $pnode (keys(%virtnodes)) {
} }
# Check cancel flag before continuing. # Check cancel flag before continuing.
TBGetBatchCancelFlag($pid, $eid, \$canceled); TBGetCancelFlag($pid, $eid, \$canceled);
fatal($WRAPPER_FAILED, fatal($WRAPPER_FAILED,
"Cancel flag set; aborting assign run!") "Cancel flag set; aborting assign run!")
if ($canceled); if ($canceled);
......
...@@ -61,11 +61,13 @@ my $batchlog = "$TB/log/batchlog"; ...@@ -61,11 +61,13 @@ my $batchlog = "$TB/log/batchlog";
my $projroot = "/proj"; my $projroot = "/proj";
my $debug = 0; my $debug = 0;
my $BSTATE_POSTED = BATCHSTATE_POSTED; my $BSTATE_POSTED = EXPTSTATE_QUEUED;
my $BSTATE_ACTIVATING = BATCHSTATE_ACTIVATING; my $BSTATE_ACTIVATING = EXPTSTATE_ACTIVATING;
my $BSTATE_RUNNING = BATCHSTATE_RUNNING; my $BSTATE_RUNNING = EXPTSTATE_ACTIVE;
my $BSTATE_TERMINATING = BATCHSTATE_TERMINATING; my $BSTATE_TERMINATING = EXPTSTATE_TERMINATING;
my $BSTATE_PAUSED = BATCHSTATE_PAUSED; my $BSTATE_PAUSED = EXPTSTATE_SWAPPED;
my $BSTATE_LOCKED = BATCHSTATE_LOCKED;
my $BSTATE_UNLOCKED = BATCHSTATE_UNLOCKED;
# #
# These are valid in the children, not the parent. I suppose I could use # These are valid in the children, not the parent. I suppose I could use
...@@ -156,12 +158,12 @@ while (1) { ...@@ -156,12 +158,12 @@ while (1) {
DBQueryWarn("SELECT e1.* FROM experiments as e1 ". DBQueryWarn("SELECT e1.* FROM experiments as e1 ".
"left join experiments as e2 on ". "left join experiments as e2 on ".
" e2.expt_head_uid=e1.expt_head_uid and ". " e2.expt_head_uid=e1.expt_head_uid and ".
" e2.batchmode=1 and e2.batchstate='$BSTATE_RUNNING' and ". " e2.batchmode=1 and e2.state='$BSTATE_RUNNING' and ".
" e1.pid=e2.pid and e1.eid!=e2.eid ". " e1.pid=e2.pid and e1.eid!=e2.eid ".
"WHERE e2.eid is null and ". "WHERE e2.eid is null and ".
" e1.batchmode=1 and e1.canceled=0 and ". " e1.batchmode=1 and e1.canceled=0 and ".
" e1.expt_locked is null and ". " e1.expt_locked is null and ".
" e1.batchstate='$BSTATE_POSTED' and ". " e1.state='$BSTATE_POSTED' and ".
" (e1.attempts=0 or ". " (e1.attempts=0 or ".
" ((UNIX_TIMESTAMP() - ". " ((UNIX_TIMESTAMP() - ".
" UNIX_TIMESTAMP(e1.expt_start) > ($retry_wait)))) ". " UNIX_TIMESTAMP(e1.expt_start) > ($retry_wait)))) ".
...@@ -169,7 +171,7 @@ while (1) { ...@@ -169,7 +171,7 @@ while (1) {
$running_result = $running_result =
DBQuery("select * from experiments ". DBQuery("select * from experiments ".
"where batchmode=1 and batchstate='$BSTATE_RUNNING' ". "where batchmode=1 and state='$BSTATE_RUNNING' ".
"ORDER BY expt_start LIMIT 1"); "ORDER BY expt_start LIMIT 1");
if (!$pending_result || !$running_result) { if (!$pending_result || !$running_result) {
...@@ -184,9 +186,8 @@ while (1) { ...@@ -184,9 +186,8 @@ while (1) {
} }
# #
# If we have a pending experiment to run, set its state to configuring # If we have a pending experiment to run, the lock it right away,
# right away, while we have the tables locked. This prevents endexp # while we have the tables locked.
# from seeing it as something it can cancel.
# #
if ($pending_result->numrows) { if ($pending_result->numrows) {
%pending_row = $pending_result->fetchhash(); %pending_row = $pending_result->fetchhash();
...@@ -196,8 +197,9 @@ while (1) { ...@@ -196,8 +197,9 @@ while (1) {
my $pid = $pending_row{'pid'}; my $pid = $pending_row{'pid'};
$query_result = $query_result =
DBQuery("update experiments set expt_start=now(), ". DBQuery("update experiments set ".
"batchstate='$BSTATE_ACTIVATING' ". " expt_locked=now(), ".
" batchstate='$BSTATE_LOCKED' ".
"where eid='$eid' and pid='$pid'"); "where eid='$eid' and pid='$pid'");
if (! $query_result) { if (! $query_result) {
...@@ -218,66 +220,55 @@ while (1) { ...@@ -218,66 +220,55 @@ while (1) {
# loop instead of in the child that started the experiment, its so that # loop instead of in the child that started the experiment, its so that
# we fire up again and look for them in the event that paper goes down. # we fire up again and look for them in the event that paper goes down.
# #
if ($running_result->numrows) { if (!$running_result->numrows) {
DBQueryWarn("unlock tables");
}
else {
my %running_row = $running_result->fetchhash(); my %running_row = $running_result->fetchhash();
my $canceled = $running_row{'canceled'}; my $canceled = $running_row{'canceled'};
# Local vars!
if ($canceled) { my $eid = $running_row{'eid'};
# Local vars! my $pid = $running_row{'pid'};
my $eid = $running_row{'eid'};
my $pid = $running_row{'pid'};
# #
# Have to set the state to busy so that no one will be able # Lock so user cannot mess with it.
# to mess with the experiment while we deal with termination. #
# $query_result =
TBSetBatchState($pid, $eid, BATCHSTATE_RUNNING_LOCKED()); DBQuery("update experiments set ".
DBQueryWarn("unlock tables"); " expt_locked=now(), ".
" batchstate='$BSTATE_LOCKED' ".
# Look at the cancel flag. "where eid='$eid' and pid='$pid'");
if ($canceled == BATCHMODE_CANCELTERM) { DBQueryWarn("unlock tables");
dosomething("cancel", %running_row);
} if ($query_result) {
elsif ($canceled == BATCHMODE_CANCELSWAP) { if ($canceled) {
dosomething("swap", %running_row); # Look at the cancel flag.
} if ($canceled == EXPTCANCEL_TERM) {
else { dosomething("cancel", %running_row);
print "Improper cancel flag: $canceled\n"; }
} elsif ($canceled == EXPTCANCEL_SWAP) {
} dosomething("swap", %running_row);
else { }
# else {
# Have to set the state to busy so that no one will be able print "Improper cancel flag: $canceled\n";
# to mess with the experiment while trying to determine if }
# the batch is done.
#
TBSetBatchState($pid, $eid, BATCHSTATE_RUNNING_LOCKED());
DBQueryWarn("unlock tables");
if (isexpdone(%running_row)) {
#
# Terminate the experiment. Set the state appropriately
# so that swapexp will accept it. It is okay to do this
# with the table unlocked since no one is allowed to mess
# with a batch experiment in the RUNNING_LOCKED state.
#
dosomething("swap", %running_row);
} }
else { else {
# if (isexpdone(%running_row)) {
# Reset the state to RUNNING. It is okay to do this with #
# the table unlocked since no one is allowed to mess # Terminate the experiment.
# with a batch experiment in the RUNNING_LOCKED state. #
# dosomething("swap", %running_row);
TBSetBatchState($pid, $eid, $BSTATE_RUNNING); }
else {
#
# Unlock.
#
TBBatchUnLockExp($pid, $eid);
}
} }
} }
} }
else {
# no one above unlocked the tables ...
DBQueryWarn("unlock tables");
}
# #
# Finally start an actual experiment! # Finally start an actual experiment!
# #
...@@ -444,15 +435,15 @@ sub startexp($) ...@@ -444,15 +435,15 @@ sub startexp($)
$exphash{'canceled'} = $canceled; $exphash{'canceled'} = $canceled;
# Yuck: This is strictly for the benefit of swapexp() below. # Yuck: This is strictly for the benefit of swapexp() below.
$exphash{'batchstate'} = BATCHSTATE_RUNNING $exphash{'state'} = EXPTSTATE_ACTIVE
if ($running); if ($running);
if ($canceled) { if ($canceled) {
# Look at the cancel flag. # Look at the cancel flag.
if ($canceled == BATCHMODE_CANCELTERM) { if ($canceled == EXPTCANCEL_TERM) {
cancelexp(%exphash); cancelexp(%exphash);
} }
elsif ($canceled == BATCHMODE_CANCELSWAP) { elsif ($canceled == EXPTCANCEL_SWAP) {
swapexp(%exphash); swapexp(%exphash);
} }
else { else {
...@@ -488,8 +479,7 @@ sub startexp($) ...@@ -488,8 +479,7 @@ sub startexp($)
# never going to work. # never going to work.
# #
if ($exit_status == 1 || $exit_status == -1) { if ($exit_status == 1 || $exit_status == -1) {
TBSetBatchState($pid, $eid, $BSTATE_PAUSED); TBBatchUnLockExp($pid, $eid, EXPTSTATE_SWAPPED());
TBUnLockExp($pid, $eid);
email_status("Experiment startup has failed with a fatal error!\n". email_status("Experiment startup has failed with a fatal error!\n".
"Batch has been dequeued so that you may check it."); "Batch has been dequeued so that you may check it.");
...@@ -514,22 +504,14 @@ sub startexp($) ...@@ -514,22 +504,14 @@ sub startexp($)
email_status($msg); email_status($msg);
} }
TBBatchUnLockExp($pid, $eid, EXPTSTATE_QUEUED());
#
# There is some state that needs to be reset so that another
# attempt can be made.
#
TBSetBatchState($pid, $eid, $BSTATE_POSTED);
TBUnLockExp($pid, $eid);
exit($exit_status); exit($exit_status);
} }
# #
# Well, it configured! Lets set it state to running. # Well, it configured! We can now unlock it.
# #
TBSetBatchState($pid, $eid, $BSTATE_RUNNING); TBBatchUnLockExp($pid, $eid);
TBUnLockExp($pid, $eid);
email_status("Batch Mode experiment $pid/$eid is now running!\n". email_status("Batch Mode experiment $pid/$eid is now running!\n".
"Please consult the Web interface to see how it is doing."); "Please consult the Web interface to see how it is doing.");
...@@ -547,15 +529,9 @@ sub swapexp($;$) ...@@ -547,15 +529,9 @@ sub swapexp($;$)
{ {
my(%exphash) = @_; my(%exphash) = @_;
my $canceled = $exphash{'canceled'}; my $canceled = $exphash{'canceled'};
my $running = ($exphash{'batchstate'} eq BATCHSTATE_RUNNING); my $running = ($exphash{'state'} eq EXPTSTATE_ACTIVE);
if ($running) { if ($running) {
#
# Have to set the state to terminating so that swap/end exp
# will accept it.
#
TBSetBatchState($pid, $eid, $BSTATE_TERMINATING);
system("$swapexp -b -s out $pid $eid"); system("$swapexp -b -s out $pid $eid");