Commit e9dedfc7 authored by Leigh B Stoller

Another round of fixes and improvements to previous revision.

parent 0a06db53
......@@ -2660,19 +2660,19 @@ sub SliceResolve($)
sub Provision($$$$)
{
my ($self, $perrmsg, $users, $cert, $key) = @_;
my ($self, $users, $cert, $key) = @_;
my $authority = $self->GetGeniAuthority();
my $urn = $self->aggregate_urn();
my $geniuser = $self->instance()->GetGeniUser();
my $slice = $self->instance()->GetGeniSlice();
my $context = APT_Geni::GeniContext();
return -1
return ContextError()
if (! (defined($geniuser) && defined($authority) &&
defined($slice) && defined($context)));
my ($slice_credential, $speaksfor_credential) =
APT_Geni::GenCredentials($slice, $geniuser, undef, 0);
return -1
return CredentialError()
if (! (defined($speaksfor_credential) &&
defined($slice_credential)));
......@@ -2701,31 +2701,7 @@ sub Provision($$$$)
$cmurl = devurl($cmurl) if ($usemydevtree);
$cmurl .= "/3.0";
my $tries = 10;
while ($tries) {
my $response =
Genixmlrpc::CallMethod($cmurl, $context, "Provision", @params);
if (defined($response) && defined($response->logurl())) {
$self->SetPublicURL($response->logurl());
}
if (!defined($response) || $response->code() != GENIRESPONSE_SUCCESS) {
if (defined($response) &&
($response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
$response->code() == GENIRESPONSE_BUSY) &&
$tries >= 0) {
print STDERR "Server for $urn reports too busy or slice busy, ".
"waiting a while ...\n";
sleep(int(rand(20)) + 10);
$tries--;
next;
}
$$perrmsg = $response->error();
return -1;
}
last;
}
return 0;
return Genixmlrpc::CallMethod($cmurl, $context, "Provision", @params);
}
#
......@@ -3141,8 +3117,8 @@ sub WaitForSliver($)
my $ready = 0;
my $failed = 0;
my $rpcfail = 0;
my $async_code;
my $async_output;
my $failure_code;
my $failure_output;
my $repblob;
my $laststatus;
......@@ -3158,7 +3134,8 @@ sub WaitForSliver($)
$response->code() != GENIRESPONSE_SERVER_UNAVAILABLE &&
$response->code() != GENIRESPONSE_BUSY &&
$response->code() != GENIRESPONSE_NETWORK_ERROR)) {
print STDERR "SliverStatus failed: " . $response->error() . "\n";
$failure_code = $response->code();
$failure_output = "SliverStatus failed: " . $response->error();
$failed = 1;
last;
}
......@@ -3251,8 +3228,8 @@ sub WaitForSliver($)
#
if (exists($repblob->{'async_code'})) {
# We are getting back async error status.
$async_code = $repblob->{'async_code'};
$async_output = $repblob->{'async_output'};
$failure_code = $repblob->{'async_code'};
$failure_output = $repblob->{'async_output'};
$failed = 1;
last;
}
......@@ -3261,13 +3238,20 @@ sub WaitForSliver($)
last;
}
elsif ($repblob->{'status'} eq "failed") {
print STDERR Dumper($repblob);
$failed = 1;
my $msg = $repblob->{'error'}
if (defined($repblob->{'error'}) && $repblob->{'error'} ne "");
print STDERR "*** $urn failed\n";
print STDERR " " . $repblob->{'error'} . "\n" if ($msg);
$webtask->output("Experiment setup on the $aggname cluster failed" .
($msg ? ": $msg" : "."));
$failure_code = GENIRESPONSE_SETUPFAILURE_BOOTFAILED();
# Backwards compat; boot_failure is new, and used to be only 1.
# If it is not set, that usually means node failure.
if (exists($repblob->{"boot_failure"})) {
if ($repblob->{"boot_failure"} > 1) {
$failure_code = $repblob->{"boot_failure"};
}
}
$failure_output = "Experiment setup on the $aggname cluster failed";
if (exists($repblob->{'error'}) && $repblob->{'error'} ne "") {
$failure_output .= ": " . $repblob->{'error'};
}
last;
}
elsif ($aggobj->instance()->IsCanceled()) {
......@@ -3281,23 +3265,22 @@ sub WaitForSliver($)
if ($failed || !$ready) {
$aggobj->SetStatus("failed");
if ($failed) {
if (defined($async_code)) {
$webtask->output($async_output);
$webtask->Exited($async_code);
}
else {
# Output set above.
$webtask->Exited(GENIRESPONSE_ERROR);
}
print STDERR "*** $urn failure\n";
print STDERR " $failure_output\n";
$webtask->output($failure_output);
$webtask->Exited($failure_code);
}
elsif (!$ready) {
# XXX Need better handling for timeout.
print STDERR "*** $urn timed out.\n";
$webtask->output("Experiment setup on $urn timed out");
$webtask->output("Experiment setup on the $aggobj cluster".
"timed out");
$webtask->Exited(GENIRESPONSE_TIMEDOUT);
}
else {
$webtask->Exited(1);
print STDERR "*** $urn unknown error\n";
$webtask->output("Unknown setup failure on the $aggobj cluster");
$webtask->Exited(GENIRESPONSE_ERROR);
}
return $webtask->exitcode();
}
......
......@@ -71,6 +71,7 @@ sub SnapShot($$$);
sub CreateDatasetCreds($$$$$);
sub CreateSlivers();
sub RunStitcher();
sub CallMethodOnAggregate($$$);
#
# Configure variables
......@@ -1024,9 +1025,10 @@ Genixmlrpc->SetTimeout(60);
# Okay, fire off the waits for each aggregate
#
my @return_codes = ();
if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@aggregate_list)},
\@return_codes,
\&APT_Instance::Aggregate::WaitForSliver, @aggregate_list)) {
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@aggregate_list)},
\@return_codes,
\&APT_Instance::Aggregate::WaitForSliver, @aggregate_list)) {
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
......@@ -1061,26 +1063,32 @@ my $startuprunning = 0;
#
# Check the exit codes; any failure is a total failure (for now).
#
my $hosed = 0;
my $failed = 0;
foreach my $aggobj (@aggregate_list) {
my $code = shift(@return_codes);
my $cluster = $aggobj->GetAptAggregate()->name();
# Updated in a forked child, must refresh.
$aggobj->Refresh();
print $aggobj->aggregate_urn() . "\n";
if ($code) {
$failed++;
print "WaitforSliver Failure!\n";
if (defined($aggobj->webtask()->output())) {
$webtask->output($aggobj->webtask()->output());
$webtask->Exited($aggobj->webtask()->exitcode());
print $aggobj->webtask()->output() . "\n";
}
else {
$webtask->output("WaitforSliver Failure at " .
$aggobj->aggregate_urn());
$webtask->Exited(1);
my $exitcode = $aggobj->webtask()->exitcode();
my $output = $aggobj->webtask()->output();
#
# Some of the errors should go to the user, others to tbops.
#
if (! (($exitcode >= GENIRESPONSE_SETUPFAILURE() &&
$exitcode <= GENIRESPONSE_SETUPFAILURE_MAXERROR()) ||
$exitcode == GENIRESPONSE_TIMEDOUT())) {
$hosed++;
$output = "Internal error creating experiment on the ".
"$cluster cluster";
}
$webtask->output($output);
$webtask->Exited($exitcode);
# Promote the log up to the instance so that it is easy to find.
$instance->SetPublicURL($aggobj->public_url())
if (defined($aggobj->public_url()));
......@@ -1133,7 +1141,7 @@ else {
$webtask->Exited(0);
}
}
exit(0);
exit($hosed);
#
# Create credentials to access datasets.
......@@ -1148,8 +1156,9 @@ sub CreateDatasetCreds($$$$$)
print STDERR "CreateDatasetCreds: Could not parse rspec\n";
return -1;
}
foreach my $ref (GeniXML::FindNodes("n:node", $rspec)->get_nodelist()) {
my $manager_urn = GetManagerId($ref);
my $manager_urn = GetManagerId($ref);
foreach my $blockref (GeniXML::FindNodesNS("n:blockstore", $ref,
$GeniXML::EMULAB_NS)->get_nodelist()) {
......@@ -1346,19 +1355,12 @@ sub CreateSliver($)
}
elsif ($response->code() == GENIRESPONSE_INPROGRESS()) {
$async = 1;
print STDERR Dumper($response);
#print STDERR Dumper($response);
last;
}
#
# We do not want to show some errors to users, we are going to
# end up sending email to tbops.
#
if ($response->code() == GENIRESPONSE_RPCERROR()) {
$errmsg = "Internal RPC error creating sliver";
}
else {
$errmsg = $response->error();
}
print STDERR "CreateSliver failed on $urn: " .
$response->error() . "\n";
$webtask->output($errmsg);
$webtask->Exited($response->code());
$aggobj->SetStatus("failed");
......@@ -1366,8 +1368,6 @@ sub CreateSliver($)
if (defined($response->logurl())) {
$aggobj->SetPublicURL($response->logurl());
}
print STDERR "CreateSliver failed on $urn: ".
(defined($response) ? $response->output() : "") . "\n";
return -1;
}
last;
......@@ -1406,7 +1406,7 @@ sub CreateSliver($)
$code = $tmp->code();
print STDERR "Resolve returned $code\n";
print STDERR Dumper($tmp);
#print STDERR Dumper($tmp);
# Just keep going, we will get there eventually.
# Lets say RPC errors will clear up at some point, even though
......@@ -1515,6 +1515,8 @@ sub CreateSlivers()
# Check the exit codes; any failure is a total failure (for now).
#
foreach my $aggobj (@aggregate_list) {
my $cluster = $aggobj->GetAptAggregate()->name();
#
# Have to refresh the sliver objects since they were updated in a fork.
# Need the manifests for the call to ComputeNodeCounts below.
......@@ -1526,12 +1528,24 @@ sub CreateSlivers()
# Promote the log up to the instance so that it is easy to find.
$instance->SetPublicURL($aggobj->public_url())
if (defined($aggobj->public_url()));
# Ditto the error output.
$webtask->output($aggobj->webtask()->output())
if (defined($aggobj->webtask()->output()));
my $output = $aggobj->webtask()->output();
my $exitcode = $aggobj->webtask()->exitcode();
#
# We do not want to show some errors to users, we are going to
# end up sending email to tbops.
#
if ($exitcode == GENIRESPONSE_RPCERROR() ||
$exitcode == GENIRESPONSE_SERVERERROR()) {
$output = "Internal error creating experiment on the ".
"$cluster cluster";
}
$webtask->output($output);
# This will be the createsliver exit code if we got one, or -1.
$code = $aggobj->webtask()->exitcode()
if (defined($aggobj->webtask()->exitcode()));
$code = $exitcode;
# Do this last so that the web interface does not see failed
# before the reason is in the webtask.
$instance->SetStatus("failed");
......@@ -1766,7 +1780,7 @@ sub RunStitcher()
print STDERR "No manifest for AL2S\n";
$aggobj->SetStatus("failed");
$webtask->output("No manifest for AL2S");
$webtask->Exited(-1);
$webtask->Exited(GENIRESPONSE_ERROR);
return -1;
}
my $manifest_string = "";
......@@ -1774,7 +1788,7 @@ sub RunStitcher()
print STDERR "Could not open $al2smanifest\n";
$aggobj->SetStatus("failed");
$webtask->output("Could not open manifest file");
$webtask->Exited(-1);
$webtask->Exited(GENIRESPONSE_ERROR);
return -1;
}
while (<MAN>) {
......@@ -1787,49 +1801,63 @@ sub RunStitcher()
return 0;
}
print "Provisioning at $urn\n";
if ($aggobj->Provision(\$errmsg, $users,
$alt_certificate->cert(),
$alt_certificate->PrivKeyDelimited())) {
my $response =
CallMethodOnAggregate($aggobj,
sub {
return $aggobj->Provision($users,
$alt_certificate->cert(),
$alt_certificate->PrivKeyDelimited());
}, 10);
if ($response->code() != GENIRESPONSE_SUCCESS) {
$aggobj->SetStatus("failed");
$webtask->output($errmsg);
$webtask->Exited(-1);
print STDERR "Provision failed on $urn: $errmsg\n";
$webtask->output("Provision failed on $urn: ".$response->error());
$webtask->Exited($response->code());
print STDERR "Provision failed on $urn: ".$response->error()."\n";
return -1;
}
$aggobj->SetStatus("provisioned");
print "Requesting manifest from $urn\n";
my $response = $aggobj->GetManifest();
if (!defined($response)) {
$response = CallMethodOnAggregate($aggobj, "SliceResolve", 10);
if ($response->code() != GENIRESPONSE_SUCCESS) {
$aggobj->SetStatus("failed");
$webtask->output("Could not get manifest from $urn");
$webtask->Exited(-1);
$webtask->output("Could not Resolve at $urn: ".$response->error());
$webtask->Exited($response->code());
print STDERR "Could not Resolve at $urn: ".$response->error()."\n";
return -1;
}
# Web interface wants this as soon as possible.
$aggobj->SetManifest($response->value()->{'manifest'});
# This will get overwritten later during the wait.
$aggobj->SetPublicURL($response->value()->{'public_url'})
if (exists($response->value()->{'public_url'}));
print "Forcing correct slice expiration\n";
$response = $aggobj->Extend($slice->ExpirationGMT(), $this_user);
if (!defined($response) ||
$response->code() != GENIRESPONSE_SUCCESS) {
$response =
CallMethodOnAggregate($aggobj,
sub {
return $aggobj->Extend($slice->ExpirationGMT(),
$this_user);
}, 10);
if ($response->code() != GENIRESPONSE_SUCCESS) {
$aggobj->SetStatus("failed");
$webtask->output("Renew failed on $urn");
$webtask->Exited(-1);
print STDERR "Renew failed on $urn\n";
$webtask->output("Renew failed on $urn: ". $response->error());
$webtask->Exited($response->code());
print STDERR "Renew failed on $urn: ". $response->error() . "\n";
return -1;
}
print "Calling SliverStart at $urn\n";
$response = $aggobj->SliverAction(\$errmsg, "start");
if (! defined($response)) {
$response =
CallMethodOnAggregate($aggobj,
sub {
return $aggobj->SliverAction("start");
}, 10);
if ($response->code() != GENIRESPONSE_SUCCESS) {
$aggobj->SetStatus("failed");
$webtask->output($errmsg);
$webtask->Exited(-1);
print STDERR "SliverStart failed on $urn: $errmsg\n";
$webtask->output("Start failed on $urn: ". $response->error());
$webtask->Exited($response->code());
print STDERR "Start failed on $urn: ". $response->error() . "\n";
return -1;
}
# This will get overwritten later during the wait.
$aggobj->SetPublicURL($response->logurl())
if (defined($response->logurl()));
return 0;
};
......@@ -1907,4 +1935,49 @@ sub UserError($) {
exit(1);
}
#
# Call an aggregate method with retry.
#
# Arguments:
#   $aggregate - object the method is invoked on; it is also the single
#                argument handed to a coderef $method.
#   $method    - either a method name, invoked as $aggregate->$method(),
#                or a CODE reference for more complicated invocations.
#   $retries   - maximum number of attempts. Anything less than one
#                (including undef) is treated as a single attempt.
#
# Returns the response from the last attempt. If the call itself returned
# undef, a GENIRESPONSE_ERROR response is manufactured in its place, so
# the result is never undef and callers can ask it for its code().
#
# Only BUSY, SERVER_UNAVAILABLE and NETWORK_ERROR/NOCONNECT responses are
# retried, with a ten second pause between attempts; success and every
# other error are returned immediately.
#
sub CallMethodOnAggregate($$$)
{
    my ($aggregate, $method, $retries) = @_;
    my $response;

    #
    # Always make at least one attempt. With a zero or undefined count
    # the loop below would never run and we would return undef, which
    # the callers immediately dereference. A negative or fractional
    # count would never reach exactly zero and so would loop forever on
    # a persistently busy aggregate.
    #
    $retries = (defined($retries) && $retries >= 1) ? int($retries) : 1;

    while ($retries) {
        #
        # This can be a coderef for more complicated invocations.
        #
        if (ref($method) eq "CODE") {
            $response = $method->($aggregate);
        }
        else {
            $response = $aggregate->$method();
        }
        if (!defined($response)) {
            # We want to know about this, something is very wrong.
            $response =
                GeniResponse->new(GENIRESPONSE_ERROR, -1,
                                  "Internal error calling method on $aggregate");
            last;
        }
        # Dump every response we get back to stdout.
        print Dumper($response);

        last
            if ($response->code() == GENIRESPONSE_SUCCESS);

        # We can keep trying for these, but not an RPC error.
        last
            if (! ($response->code() == GENIRESPONSE_BUSY ||
                   $response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
                   ($response->code() == GENIRESPONSE_NETWORK_ERROR &&
                    $response->value() ==
                    GENIRESPONSE_NETWORK_ERROR_NOCONNECT)));

        #
        # Wait for a while and try again. No point in sleeping after
        # the final attempt.
        #
        $retries--;
        if ($retries) {
            sleep(10);
        }
    }
    return $response;
}
......@@ -2722,9 +2722,8 @@ sub StartMonitorInternal(;$)
# going to ready, if we failed initially, the user can get
# themselves out of failure mode for the instance, say by
# reboot or reload.
$sliver->SetStatus("ready")
if ($sliver->status() eq "failed");
$sliver->SetStatus("ready");
return $response
if (!$executing || !$waitforstartup);
}
......@@ -2772,6 +2771,8 @@ sub StartMonitorInternal(;$)
foreach my $aggregate (@aggregates) {
my $response = shift(@{$response});
my $code = $response->code();
# Updated in the forked child, need to refresh.
$aggregate->Refresh();
next
if ($code == GENIRESPONSE_SUCCESS);
......@@ -4260,9 +4261,10 @@ sub CallMethodOnAggregates($$$@)
my $response;
if (0) {
# Need unblessed ref to store into webtask.
$response =
GeniResponse->new(GENIRESPONSE_SERVER_UNAVAILABLE,
undef, "Testing mode");
GeniResponse->Create(GENIRESPONSE_SERVER_UNAVAILABLE,
undef, "Testing mode");
$webtask->response($response);
return -1;
}
......@@ -4273,9 +4275,10 @@ sub CallMethodOnAggregates($$$@)
# the clusters that are up (say, terminate).
#
if ($aggregate->CheckStatus(\$errmsg)) {
# Need unblessed ref to store into webtask.
$response =
GeniResponse->new(GENIRESPONSE_SERVER_UNAVAILABLE,
undef, $errmsg);
GeniResponse->Create(GENIRESPONSE_SERVER_UNAVAILABLE,
undef, $errmsg);
$webtask->response($response);
return -1;
}
......@@ -4297,6 +4300,8 @@ sub CallMethodOnAggregates($$$@)
"$sliver");
last;
}
print Dumper($response);
# We can keep trying for these, but not an RPC error.
last
if (! ($response->code() == GENIRESPONSE_BUSY ||
......@@ -4312,7 +4317,8 @@ sub CallMethodOnAggregates($$$@)
sleep(15);
}
}
$webtask->response($response);
# Need unblessed ref to store into webtask.
$webtask->response($response->Unbless());
return ($response->code() == GENIRESPONSE_SUCCESS ? 0 : -1);
};
my @return_codes = ();
......@@ -4329,8 +4335,10 @@ sub CallMethodOnAggregates($$$@)
@return_codes = (&$coderef([$aggregate, $method, $webtask]));
}
else {
my @tmp = map { [$_, $method, $webtask] } @aggregates;
my @tmp = ();
for (my $i = 0; $i < scalar(@aggregates); $i++) {
push(@tmp, [$aggregates[$i], $method, $webtasks[$i]]);
}
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@aggregates)},
\@return_codes, $coderef, @tmp)) {
......@@ -4345,10 +4353,12 @@ sub CallMethodOnAggregates($$$@)
#
foreach my $agg (@aggregates) {
my $webtask = shift(@webtasks);
print "$agg\n";
# No need to refresh if we did not use ParRun above.
$webtask->Refresh() if (@aggregates > 1);
push(@return_values, $webtask->response());
push(@return_values, GeniResponse->Bless($webtask->response()));
}
$$prval = \@return_values;
map { $_->Delete(); } @webtasks;
......
......@@ -277,6 +277,10 @@ sub DoReserve()
$rpcargs{"reason"}= $reason if (defined($reason));
$rpcargs{"update"}= $update if (defined($update));
# We can check for disabled aggregate now, nothing below matters.
if ($aggregate->CheckStatus(\$errmsg, 1)) {
UserError($errmsg);
}
if ($this_user->IsAdmin()) {
#
# We do not have a very good notion of cross site admin.
......@@ -305,10 +309,6 @@ sub DoReserve()
$rpcargs{"credentials"} = $credentials;
$context = APT_Geni::GeniContext();
}
# We can check for disabled aggregate now, nothing below matters.
if ($aggregate->CheckStatus(\$errmsg, 1)) {
UserError($errmsg);
}
my $response =
APT_Geni::PortalRPC($authority, $context, "Reserve", \%rpcargs);
if (GeniResponse::IsError($response)) {
......@@ -805,7 +805,8 @@ sub DoPrediction()
my $optlist = "p:";
my $portal;
my $errmsg;
my @aggregates = ($authority);
my @aggregates = ($aggregate);
my @authorities = ($authority);
my @webtasks = ();
my @projlist = ();
my $blob = {};
......@@ -846,7 +847,8 @@ sub DoPrediction()
# all aggregates listed for the portal.
#
if (defined($portal)) {
@aggregates = ();
@authorities = ();
@aggregates = ();
my @list = APT_Aggregate->LookupForPortal($portal);
foreach my $agg (@list) {
......@@ -860,24 +862,30 @@ sub DoPrediction()
$errmsg = "Cannot lookup authority for $agg";
goto bad;
}
push(@aggregates, $authority);
push(@aggregates, $agg);
push(@authorities, $authority);
}
if (!@aggregates) {
UserError("No clusters are online, please try again later.");
}
}
else {
if ($aggregate->disabled()) {
UserError("The " . $aggregate->name() . " cluster ".
"is currently offline, please try again later.");
}
}
my $coderef = sub {
my ($blob) = @_;
my $authority = $blob->{"authority"};
my $aggregate = $blob->{"aggregate"};
my $webtask = $blob->{"webtask"};
my $errmsg;
if ($aggregate->CheckStatus(\$errmsg, 1)) {
# Need unblessed ref to store into webtask.
my $response =
GeniResponse->Create(GENIRESPONSE_SERVER_UNAVAILABLE,
undef, $errmsg);
$webtask->response($response);
print "$aggregate offline\n";
return 1;
}
# PortalRPC will use the root context in this case, which is
# essentially saying the caller is an admin. But that's okay
# for this call, it is just informational.
......@@ -885,6 +893,10 @@ sub DoPrediction()
APT_Geni::PortalRPC($authority, undef,
"ReservationPrediction",
{"projlist" => \@projlist});
# Need unblessed ref to store into webtask.
$response = GeniResponse->Create($response->code(),
$response->value(),
$response->output);
$webtask->response($response);
if (GeniResponse::IsError($response)) {
......@@ -898,11 +910,16 @@ sub DoPrediction()
# Do not bother with ParRun if only one aggregate.
#