Commit 947a74cc authored by Leigh B. Stoller

Minor changes for batch mode. Propagate error status back out so that

the batch daemon knows when configuration failed because there were not
enough nodes. The batch daemon uses this in an informational manner
right now, but later we should kick out experiments that fail for any
reason other than lack of resources.
parent 7dec1b27
...@@ -437,11 +437,11 @@ $num_sshelves = $#t+1; ...@@ -437,11 +437,11 @@ $num_sshelves = $#t+1;
$num_pcs = $#t+1 - int($delayindex/2); $num_pcs = $#t+1 - int($delayindex/2);
if ($num_pcs > $avail_pcs) { if ($num_pcs > $avail_pcs) {
print STDERR "Insufficient PCs to do mapping. Have $avail_pcs; need $num_pcs.\n"; print STDERR "Insufficient PCs to do mapping. Have $avail_pcs; need $num_pcs.\n";
exit(777); exit(77);
} }
if ($num_sshelves > $avail_sshelves) { if ($num_sshelves > $avail_sshelves) {
print STDERR "Insufficient Shark Shelves to do mapping. Have $avail_sshelves; need $num_sshelves.\n"; print STDERR "Insufficient Shark Shelves to do mapping. Have $avail_sshelves; need $num_sshelves.\n";
exit(777); exit(77);
} }
......
<
...@@ -33,6 +33,7 @@ my $projroot = "/proj"; ...@@ -33,6 +33,7 @@ my $projroot = "/proj";
my $tbdata = "tbdata"; my $tbdata = "tbdata";
my $cleanme = 0; my $cleanme = 0;
my $batch = 0; my $batch = 0;
my $errorstat= -1;
# #
# For debugging all this goo. Leaves the experiment directory intact, # For debugging all this goo. Leaves the experiment directory intact,
...@@ -210,15 +211,18 @@ if (system("/bin/cp", "$tempns", "$nsfile") != 0) { ...@@ -210,15 +211,18 @@ if (system("/bin/cp", "$tempns", "$nsfile") != 0) {
} }
# #
# Run the various scripts. # Run the various scripts. We want to propagate the error from tbprerun
# and tbrun back out, hence the bogus looking errorstat variable.
# #
if (system("$tbdir/tbprerun -nologfile $pid $eid $nsfile") != 0) { if (system("$tbdir/tbprerun -nologfile $pid $eid $nsfile") != 0) {
$errorstat = $? >> 8;
fatal("tbprerun failed!\n"); fatal("tbprerun failed!\n");
} }
# So fatal errors run tbend. # So fatal errors run tbend.
$cleanme = 1; $cleanme = 1;
if (system("$tbdir/tbrun -nologfile $pid $eid $irfile") != 0) { if (system("$tbdir/tbrun -nologfile $pid $eid $irfile") != 0) {
$errorstat = $? >> 8;
fatal("tbrun failed!\n"); fatal("tbrun failed!\n");
} }
...@@ -343,10 +347,10 @@ exit 0; ...@@ -343,10 +347,10 @@ exit 0;
sub fatal() sub fatal()
{ {
my($mesg) = $_[0]; my($mesg) = $_[0];
print STDOUT "$mesg\n"; print STDOUT "$mesg\n";
print STDOUT "Cleaning up ...\n"; print STDOUT "Cleaning up and exiting with status $errorstat ...\n";
# #
# If we got far enough to allocate nodes, must run tbend. # If we got far enough to allocate nodes, must run tbend.
...@@ -370,11 +374,21 @@ sub fatal() ...@@ -370,11 +374,21 @@ sub fatal()
# In batch mode, exit. Make sure to delete tempns file. # In batch mode, exit. Make sure to delete tempns file.
# #
if ($batch) { if ($batch) {
if (open(IN, "$eiddir/assign.log")) {
print STDOUT "\n\n--------- assign.log --------\n\n";
while (<IN>) {
print STDOUT "$_";
}
close(IN);
}
if (chdir($expdir)) { if (chdir($expdir)) {
system("/bin/mv", "-f", "$eid", "$eid-$PID"); system("/bin/rm", "-rf", "${eid}-TBfailed");
system("/bin/mv", "-f", "$eid", "${eid}-TBfailed");
} }
unlink("$tempns"); unlink("$tempns");
exit(-1); exit($errorstat);
} }
# #
...@@ -389,7 +403,7 @@ sub fatal() ...@@ -389,7 +403,7 @@ sub fatal()
print MAIL $mesg; print MAIL $mesg;
if (open(IN, "$tempns")) { if (open(IN, "$tempns")) {
print MAIL "\n\n---------\n\n"; print MAIL "\n\n--------- $tempns ---------\n\n";
while (<IN>) { while (<IN>) {
print MAIL "$_"; print MAIL "$_";
...@@ -399,7 +413,7 @@ sub fatal() ...@@ -399,7 +413,7 @@ sub fatal()
} }
if (open(IN, "$logname")) {