Commit 947a74cc authored by Leigh B. Stoller

Minor changes for batch mode. Propagate error status back out so that
the batch daemon knows when configuration failed because there were not
enough nodes. The batch daemon uses this in an informational manner
right now, but later we should kick out experiments that fail for any
reason other than lack of resources.
parent 7dec1b27
......@@ -437,11 +437,11 @@ $num_sshelves = $#t+1;
$num_pcs = $#t+1 - int($delayindex/2);
if ($num_pcs > $avail_pcs) {
print STDERR "Insufficient PCs to do mapping. Have $avail_pcs; need $num_pcs.\n";
exit(777);
exit(77);
}
if ($num_sshelves > $avail_sshelves) {
print STDERR "Insufficient Shark Shelves to do mapping. Have $avail_sshelves; need $num_sshelves.\n";
exit(777);
exit(77);
}
......
......@@ -33,6 +33,7 @@ my $projroot = "/proj";
my $tbdata = "tbdata";
my $cleanme = 0;
my $batch = 0;
my $errorstat= -1;
#
# For debugging all this goo. Leaves the experiment directory intact,
......@@ -210,15 +211,18 @@ if (system("/bin/cp", "$tempns", "$nsfile") != 0) {
}
#
# Run the various scripts.
# Run the various scripts. We want to propagate the error from tbprerun
# and tbrun back out, hence the bogus-looking errorstat variable.
#
if (system("$tbdir/tbprerun -nologfile $pid $eid $nsfile") != 0) {
$errorstat = $? >> 8;
fatal("tbprerun failed!\n");
}
# So fatal errors run tbend.
$cleanme = 1;
if (system("$tbdir/tbrun -nologfile $pid $eid $irfile") != 0) {
$errorstat = $? >> 8;
fatal("tbrun failed!\n");
}
......@@ -346,7 +350,7 @@ sub fatal()
my($mesg) = $_[0];
print STDOUT "$mesg\n";
print STDOUT "Cleaning up ...\n";
print STDOUT "Cleaning up and exiting with status $errorstat ...\n";
#
# If we got far enough to allocate nodes, must run tbend.
......@@ -370,11 +374,21 @@ sub fatal()
# In batch mode, exit. Make sure to delete tempns file.
#
if ($batch) {
if (open(IN, "$eiddir/assign.log")) {
print STDOUT "\n\n--------- assign.log --------\n\n";
while (<IN>) {
print STDOUT "$_";
}
close(IN);
}
if (chdir($expdir)) {
system("/bin/mv", "-f", "$eid", "$eid-$PID");
system("/bin/rm", "-rf", "${eid}-TBfailed");
system("/bin/mv", "-f", "$eid", "${eid}-TBfailed");
}
unlink("$tempns");
exit(-1);
exit($errorstat);
}
#
......@@ -389,7 +403,7 @@ sub fatal()
print MAIL $mesg;
if (open(IN, "$tempns")) {
print MAIL "\n\n---------\n\n";
print MAIL "\n\n--------- $tempns ---------\n\n";
while (<IN>) {
print MAIL "$_";
......@@ -399,7 +413,7 @@ sub fatal()
}
if (open(IN, "$logname")) {
print MAIL "\n\n---------\n\n";
print MAIL "\n\n--------- $logname ---------\n\n";
while (<IN>) {
print MAIL "$_";
......@@ -408,7 +422,7 @@ sub fatal()
}
if (open(IN, "$eiddir/assign.log")) {
print MAIL "\n\n---------\n\n";
print MAIL "\n\n--------- assign.log --------\n\n";
while (<IN>) {
print MAIL "$_";
......@@ -420,9 +434,10 @@ sub fatal()
unlink("$tempns");
unlink("$logname");
if (chdir($expdir)) {
system("/bin/mv", "-f", "$eid", "$eid-$PID");
system("/bin/rm", "-rf", "${eid}-TBfailed");
system("/bin/mv", "-f", "$eid", "${eid}-TBfailed");
}
exit(-1);
exit($errorstat);
}
#
......
......@@ -104,6 +104,7 @@ if (&tbs_exec("postparse $tbcmdfile $irfile")) {
}
# Now we do a loop with assign/nalloc until we work or give up
$assign_exitval = 0;
$run = 0;
$done = 0;
&tbs_out("Starting assignment.\n");
......@@ -112,6 +113,9 @@ while ($done == 0) {
$run++;
if ($run > $maxruns) {
&tbs_out("Too many runs - Giving up.\n");
if ($assign_exitval) {
exit($assign_exitval);
}
exit(1);
}
&tbs_out("Run: $run\n");
......@@ -123,7 +127,9 @@ while ($done == 0) {
}
&tbs_out("Allocating resources - This may take a while.\n");
if (&tbs_exec("assign_wrapper $irfile $ptopfile")) {
$assign_exitval = &tbs_exec("assign_wrapper $irfile $ptopfile");
$assign_exitval = $assign_exitval >> 8;
if ($assign_exitval) {
&tbs_out("Failed to allocate resources.\n");
} else {
# now we need to read what resources are needed from the IR file
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment