Commit 11074445 authored by Leigh Stoller

Changes to how we handle/report mapping failures that also fail the
empty testbed test.

Prior to this commit, we were not invoking the empty testbed case
consistently. Now we do, but that exposed another problem: reporting that
error to the Portal in a meaningful way. Basically, we can report
a different error code for an impossible to map error, but then we lose
the info we store now about what the actual failure was (which we show
to the user with additional helpful info). Since we cannot (easily)
change the Geni API for CreateSliver(), I have elected to continue the
practice of returning the specific error codes (which also go into the
database for long term historical info), and add more helpful text
for the Portal user that explains clearly that the mapping is impossible
on the target cluster. This extra text also goes into the database in the
attached message field, so we can come back later and post-process if
we decide to do something different.
parent 69230970
...@@ -3521,6 +3521,7 @@ sub GetTicketAuxAux($) ...@@ -3521,6 +3521,7 @@ sub GetTicketAuxAux($)
my $inprecheck = 0; my $inprecheck = 0;
my $insummary = 0; my $insummary = 0;
my $isassignerror= 0; my $isassignerror= 0;
my $impossible = 0;
my $errorcode = GENIRESPONSE_NO_MAPPING; my $errorcode = GENIRESPONSE_NO_MAPPING;
unlink($tmpfile); unlink($tmpfile);
...@@ -3553,6 +3554,15 @@ sub GetTicketAuxAux($) ...@@ -3553,6 +3554,15 @@ sub GetTicketAuxAux($)
$insummary = 0; $insummary = 0;
$errorcode = GENIRESPONSE_NO_MAPPING; $errorcode = GENIRESPONSE_NO_MAPPING;
} }
#
# This is the very last thing the mapper will print, so
# need to be looking for it after ASSIGN FAILED
#
if ($line =~ /cannot be instantiated on this testbed/) {
$impossible = 1;
last;
}
# Skipping ahead looking for another assign run # Skipping ahead looking for another assign run
next next
if ($done); if ($done);
...@@ -3570,18 +3580,6 @@ sub GetTicketAuxAux($) ...@@ -3570,18 +3580,6 @@ sub GetTicketAuxAux($)
$errorcode = GENIRESPONSE_INSUFFICIENT_NODES(); $errorcode = GENIRESPONSE_INSUFFICIENT_NODES();
last; last;
} }
if ($line =~ /cannot be instantiated on this testbed/) {
$logstuff = $line;
# Snarf up next couple of lines.
while ($output =~ /^(.*)$/gm) {
my $l = $1 . "\n";
last
if ($l =~ /^\s*\*\*\*/);
$logstuff .= $l;
}
$errorcode = GENIRESPONSE_MAPPING_IMPOSSIBLE();
last;
}
# #
# The assign_wrapper2 script wraps assign, and basically # The assign_wrapper2 script wraps assign, and basically
...@@ -3739,6 +3737,32 @@ sub GetTicketAuxAux($) ...@@ -3739,6 +3737,32 @@ sub GetTicketAuxAux($)
print STDERR "---------------- Mapper Log ------------------\n"; print STDERR "---------------- Mapper Log ------------------\n";
print STDERR $output; print STDERR $output;
if ($impossible) {
# WARNING: This error string is important, do not change it!!!
my $message =
"Your topology cannot be instantiated on this cluster. ".
"You have most likely asked for hardware that does not ".
"exist, such as nodes of a type that do not exist, or more ".
"network interfaces that exist on any of the nodes at this ".
"cluster. You will need to modify your experiment or try ".
"a different cluster - re-submitting as-is will always ".
"result in failure!";
if ($errorstr ne "") {
$errorstr .= "\n\n";
$errorstr .= "Note that " . lcfirst($message);
}
else {
$errorstr = $message;
}
if ($logstuff ne "") {
$logstuff .= "\n\n";
$logstuff .= "Note that " . lcfirst($message);
}
else {
$logstuff = $message;
}
}
$response = $response =
GeniResponse->Create($errorcode, GeniResponse->Create($errorcode,
"Could not map to resources" . "Could not map to resources" .
......
...@@ -5482,7 +5482,7 @@ sub AllocNodes($) ...@@ -5482,7 +5482,7 @@ sub AllocNodes($)
# if the return value of nalloc (number of nodes not allocated) does # if the return value of nalloc (number of nodes not allocated) does
# not equal the number of nodes we tried to allocate. # not equal the number of nodes we tried to allocate.
# #
return (($tcount == $exitval) ? 2 : 3); return (($tcount == $exitval) ? 10 : 11);
} }
# #
......
...@@ -5556,7 +5556,7 @@ sub AllocNodes($) ...@@ -5556,7 +5556,7 @@ sub AllocNodes($)
# if the return value of nalloc (number of nodes not allocated) does # if the return value of nalloc (number of nodes not allocated) does
# not equal the number of nodes we tried to allocate. # not equal the number of nodes we tried to allocate.
# #
return (($tcount == $exitval) ? 2 : 3); return (($tcount == $exitval) ? 10 : 11);
} }
# #
......
...@@ -576,10 +576,13 @@ sub AssignLoop() ...@@ -576,10 +576,13 @@ sub AssignLoop()
# #
# RunAssign returns 0 if successful. # RunAssign returns 0 if successful.
# returns -1 if failure, but assign says to stop trying. # returns -1 if failure, this is a total failure, stop.
# returns 1 if failure, but assign says to try again. # returns 1 if failure, but assign says to try again.
# returns 2 if assign succeeds, but no nodes allocated. # returns 2 if failure, assign is saying this vtop cannot
# returns 3 if assign succeeds, but some nodes allocated. # ever map with the current ptop, but we try
# with the empty testbed to see if it is mappable.
# returns 10 if assign succeeds, but no nodes allocated.
# returns 11 if assign succeeds, but some nodes allocated.
# #
my $retval = RunAssign($precheck, $prefix); my $retval = RunAssign($precheck, $prefix);
...@@ -616,7 +619,7 @@ sub AssignLoop() ...@@ -616,7 +619,7 @@ sub AssignLoop()
# then we found a solution, and so trying on an empty testbed is # then we found a solution, and so trying on an empty testbed is
# pointless; it will obviously find a solution again. # pointless; it will obviously find a solution again.
# #
if (!$precheck && !$tried_precheck && ($retval == 2 || $retval == 3)) { if (!$precheck && !$tried_precheck && ($retval == 10 || $retval == 11)){
$tried_precheck = 1; $tried_precheck = 1;
} }
if (!$precheck && !$tried_precheck) { if (!$precheck && !$tried_precheck) {
...@@ -639,6 +642,16 @@ sub AssignLoop() ...@@ -639,6 +642,16 @@ sub AssignLoop()
$impotent = $save_impotent; $impotent = $save_impotent;
$tried_precheck = 1; $tried_precheck = 1;
} }
#
# If assign failed precheck then no point in continuing, it cannot
# map given the currently free resources.
#
if ($retval == 2) {
return [{type => 'primary', severity => SEV_ERROR,
error => ['assign_precheck_failure']},
"No progress, giving up."];
}
# We try a minimum number of times, cause the node pool is # We try a minimum number of times, cause the node pool is
# always changing. But once we hit the maxrun, we continue # always changing. But once we hit the maxrun, we continue
# only if progress on the last loop. # only if progress on the last loop.
...@@ -649,7 +662,7 @@ sub AssignLoop() ...@@ -649,7 +662,7 @@ sub AssignLoop()
} }
# See if we made progress or not. # See if we made progress or not.
# Keep going if we allocated some nodes. # Keep going if we allocated some nodes.
$progress = ($retval == 3); $progress = ($retval == 11);
# A little bit of backoff after failure. # A little bit of backoff after failure.
my $sval = int(rand($currentrun * 3)) + 3; my $sval = int(rand($currentrun * 3)) + 3;
...@@ -902,6 +915,7 @@ sub RunAssign($$) ...@@ -902,6 +915,7 @@ sub RunAssign($$)
if (waitpid($childpid, &WNOHANG) == $childpid) { if (waitpid($childpid, &WNOHANG) == $childpid) {
$assignexitcode = $? >> 8; $assignexitcode = $? >> 8;
$assignexitcode = -1 if ($assignexitcode == 255);
last; last;
} }
...@@ -963,12 +977,13 @@ sub RunAssign($$) ...@@ -963,12 +977,13 @@ sub RunAssign($$)
# #
# assign returns two positive error codes (that we care about). # assign returns two positive error codes (that we care about).
# The distinction between them is somewhat murky. An exitval of # The distinction between them is somewhat murky. An exitval of
# 1 means "retryable" while 2 means "unretryable". The former # 1 means "retryable", which means the annealing phase failed, and
# means we can try again, while the later says there is no possible # running assign again might produce a valid mapping. 2 means
# way to map it. We pass this back to the caller so that we know # the precheck failed and running again will not produce a valid
# to exit the loop or try again. # result, so no point in trying. Either way, we still want to
# try on the empty testbed once. Any other exit code is bad.
# #
return (($assignexitcode == 1) ? 1 : -1); return $assignexitcode;
} }
# #
# If we were doing the precheck, go ahead and exit now - there is no # If we were doing the precheck, go ahead and exit now - there is no
......
...@@ -160,15 +160,17 @@ function Do_GetInstanceStatus() ...@@ -160,15 +160,17 @@ function Do_GetInstanceStatus()
"Please try again later.\n\n"; "Please try again later.\n\n";
} }
elseif ($webtask->exitcode() == GENIRESPONSE_MAPPING_IMPOSSIBLE) { elseif ($webtask->exitcode() == GENIRESPONSE_MAPPING_IMPOSSIBLE) {
$blob["reason"] = "Your topology cannot instantiated. ". $blob["reason"] = "Your topology cannot be instantiated. ".
"You have most likely asked for hardware that does not exist, ". "You have most likely asked for hardware that does not exist, ".
"such as nodes of a type that are not at the target cluster, ". "such as nodes of a type that are not at the target cluster, ".
"or nodes with too many network interfaces.\n\n"; "or more network interfaces that exist on any of the nodes ".
"at the target cluster. You will need to modify your profile ".
"or try a different cluster.\n\n";
} }
elseif ($webtask->exitcode() == GENIRESPONSE_NO_MAPPING) { elseif ($webtask->exitcode() == GENIRESPONSE_NO_MAPPING) {
$blob["reason"] = "Your topology could not be mapped to physical ". $blob["reason"] = "Your topology could not be mapped to physical ".
"resources. In addition to any information below, you can ". "resources. In addition to any information below, you can ".
"click on the 'Sliver' button above, which will provide ". "click on the 'Logs' button above, which will provide ".
"lots of info, some of which might be useful in figuring out ". "lots of info, some of which might be useful in figuring out ".
"why it failed.\n\n"; "why it failed.\n\n";
} }
...@@ -191,8 +193,8 @@ function Do_GetInstanceStatus() ...@@ -191,8 +193,8 @@ function Do_GetInstanceStatus()
# #
$blob["reason"] = "You are over your disk quota at the target ". $blob["reason"] = "You are over your disk quota at the target ".
"cluster, probably because of too many saved disk images. ". "cluster, probably because of too many saved disk images. ".
"Please click on the 'Storage->My Disk Images' above and delete ". "Please click on the 'Storage->My Disk Images' above and ".
"images you no longer need."; "delete images you no longer need.";
} }
elseif ($webtask->exitcode() == GENIRESPONSE_STITCHER_ERROR) { elseif ($webtask->exitcode() == GENIRESPONSE_STITCHER_ERROR) {
# #
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment