Commit 4861de85 authored by Leigh B Stoller

A set of changes to deal with really big topologies. The initial

CreateSliver() RPC is going to take a really long time and is going to
time out. This would cause the experiment to fail at the Portal. Now we
continue on to WaitForSlivers() and wait for the manifest to appear,
which signals that CreateSliver() has finally finished, and then we go
into normal wait mode.
parent 20b486dd
#!/usr/bin/perl -wT
#
# Copyright (c) 2007-2017 University of Utah and the Flux Group.
# Copyright (c) 2007-2018 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -1341,6 +1341,43 @@ sub HoursToEnglish($)
return $string;
}
#
# Check whether every aggregate of this instance has a manifest and, if
# so, set the instance status to "provisioned" and recompute node counts.
#
# The check runs while holding a lock on apt_instances,
# apt_instance_aggregates and web_tasks. Nothing is updated unless the
# instance status, after a Refresh(), is still "provisioning".
#
# Returns -1 if the lock query fails, 0 in every other case.
#
sub CheckProvisioned($)
{
    my ($self) = @_;

    if (!DBQueryWarn("lock tables apt_instances write, ".
                     " apt_instance_aggregates write, ".
                     " web_tasks read")) {
        return -1;
    }

    # Refresh ourselves under the lock; we only act while provisioning.
    $self->Refresh();
    if ($self->status() eq "provisioning") {
        # Every aggregate is refreshed (no short circuit); count the ones
        # that still have no manifest.
        my $missing = 0;
        foreach my $aggregate ($self->AggregateList()) {
            $aggregate->Refresh();
            $missing++
                unless (defined($aggregate->manifest()));
        }
        if ($missing == 0) {
            $self->SetStatus("provisioned");
            $self->ComputeNodeCounts();
        }
    }

    # Always release the lock; the result of the unlock is not checked.
    DBQueryWarn("unlock tables");
    return 0;
}
###################################################################
package APT_Instance::ExtensionInfo;
use emdb;
......@@ -2473,7 +2510,7 @@ sub SliceStatus($)
}
#
# Ask for the manifest
# Ask for the manifest. Returns a response object.
#
sub GetManifest($)
{
......@@ -2548,11 +2585,68 @@ sub GetManifest($)
}
last;
}
return $response->value()
return GeniResponse->new($response->code(),
{"manifest" => $response->value()})
if ($self->isAL2S());
return undef
if (! exists($response->value()->{'manifest'}));
return $response->value()->{'manifest'};
return $response;
}
#
# General resolve: ask the aggregate about this instance's slice.
#
# For an AL2S aggregate this issues "ListResources", passing a slice
# credential from APT_Geni::GenAuthCredential(). For any other aggregate
# it issues "Resolve" on the slice URN, passing the slice credential from
# APT_Geni::GenCredentials() plus the speaks-for credential when one is
# returned.
#
# Returns whatever Genixmlrpc::CallMethod() returns, or undef when the
# authority, geni user, slice, context or slice credential is not
# available.
#
sub SliceResolve($)
{
    my ($self) = @_;
    my $method;
    my @params;
    my $authority = $self->GetGeniAuthority();
    my $geniuser  = $self->instance()->GetGeniUser();
    my $slice     = $self->instance()->GetGeniSlice();
    my $context   = APT_Geni::GeniContext();

    # All four are required, whichever call we end up making.
    return undef
        if (! (defined($geniuser) && defined($authority) &&
               defined($slice) && defined($context)));

    if ($self->isAL2S()) {
        my $slice_credential = APT_Geni::GenAuthCredential($slice);
        if (!defined($slice_credential)) {
            print STDERR "Could not generate slice credential\n";
            return undef;
        }
        $method = "ListResources";
        # This breaks, cause perl encodes '3' as an integer, but the
        # python on the other side demands a string.
        @params = ([$slice_credential->asString()],
                   {"geni_slice_urn" => $slice->urn(),
                    "geni_rspec_version" => {'version' => '3',
                                             'type' => 'GENI'},
                   });
    }
    else {
        my ($slice_credential, $speaksfor_credential) =
            APT_Geni::GenCredentials($slice, $geniuser, undef, 1);
        return undef
            if (!defined($slice_credential));

        # Slice credential first, then the optional speaks-for credential.
        my $credentials = [$slice_credential->asString()];
        if (defined($speaksfor_credential)) {
            push(@$credentials, $speaksfor_credential->asString());
        }
        $method = "Resolve";
        @params = ({"urn" => $slice->urn(),
                    "credentials" => $credentials});
    }

    # Redirect to the development tree URL when $usemydevtree is set.
    my $cmurl = $authority->url();
    $cmurl = devurl($cmurl) if ($usemydevtree);

    return Genixmlrpc::CallMethod($cmurl, $context, $method, @params);
}
sub Provision($$$$)
......@@ -3115,14 +3209,42 @@ sub WaitForSliver($)
my $ready = 0;
my $failed = 0;
my $rpcfail = 0;
my $public_url;
my $repblob;
my $laststatus;
while ($seconds > 0) {
sleep($interval);
$seconds -= $interval;
print STDERR "WaitForSliver: " . TBTimeStamp() . "\n";
#
# If we do not have a manifest, then we want to ask for the manifest
# since SliverStatus() might fail if the CreateSliver() failed cause
# of a read timeout before the slice aggregate has been created.
#
if (!defined($aggobj->manifest())) {
print STDERR "No manifest. asking\n";
my $response = $aggobj->GetManifest();
if (defined($response)) {
print STDERR "Got manifest\n";
$aggobj->SetStatus("provisioned");
$aggobj->SetManifest($response->value()->{'manifest'});
$aggobj->instance()->CheckProvisioned();
# We will not have this since CreateSliver() timed out, so
# make sure we get it now.
if (defined($response->logurl())) {
$aggobj->SetPublicURL($response->logurl());
}
}
else {
print STDERR "Still no manifest, looping\n";
next;
}
}
my $response = $aggobj->SliceStatus();
if (!defined($response) || !defined($response->value()) ||
($response->code() != GENIRESPONSE_SUCCESS &&
......@@ -3237,7 +3359,7 @@ sub WaitForSliver($)
$laststatus = $statusblob;
if (exists($repblob->{'public_url'})) {
$public_url = $repblob->{'public_url'};
my $public_url = $repblob->{'public_url'};
$aggobj->SetPublicURL($public_url);
}
if ($repblob->{'status'} eq "ready") {
......
......@@ -996,8 +996,12 @@ else {
exit($rval);
}
}
$instance->SetStatus("provisioned");
$instance->ComputeNodeCounts();
#
# Because of the early return, we have to check whether all aggregates have
# provisioned. This will update the status and node counts.
#
$instance->CheckProvisioned();
#
# Now wait for the sliver to be ready, which means polling.
......@@ -1254,6 +1258,7 @@ sub CreateSliver($)
my $authority = $aggobj->_authority();
my $cmurl = $authority->url();
my $urn = $authority->urn();
my $manifest;
$webtask->Refresh();
# Debugging
......@@ -1267,6 +1272,7 @@ sub CreateSliver($)
#
my $tries = 15;
my $response;
my $earlyreturn = 0;
while (1) {
$response =
......@@ -1286,9 +1292,8 @@ sub CreateSliver($)
"usetracker" => $usetracker,
});
if (!defined($response) || $response->code() != GENIRESPONSE_SUCCESS) {
if (defined($response) &&
$response->code() == GENIRESPONSE_SERVER_UNAVAILABLE &&
if ($response->code() != GENIRESPONSE_SUCCESS) {
if ($response->code() == GENIRESPONSE_SERVER_UNAVAILABLE &&
$tries >= 0) {
print STDERR "Server for $urn reports too busy, ".
"waiting a while ...\n";
......@@ -1296,16 +1301,39 @@ sub CreateSliver($)
$tries--;
next;
}
if (defined($response)) {
$webtask->output($response->output());
$webtask->Exited($response->code());
}
else {
$webtask->Exited(-1);
elsif ($response->code() == GENIRESPONSE_RPCERROR) {
#
# See if a read timeout for a setup that is taking a long
# time. If so, lets not fail yet, but instead see if we can
# sync back up during WaitForSlivers().
#
if ($response->output() =~ /read timeout/) {
Genixmlrpc->SetTimeout(15);
my $tmp = $aggobj->SliceResolve();
print STDERR "SliceStatus: " . $tmp->code() . "\n";
if ($tmp->code() == GENIRESPONSE_RPCERROR ||
$tmp->code() == GENIRESPONSE_SEARCHFAILED) {
#
# Okay, we can bail.
#
print STDERR "Read timeout, bailing\n";
}
else {
#
# Accept that we do not have a manifest, but the CM
# is reachable and the slice exists.
#
$earlyreturn = 1;
last;
}
}
}
$webtask->output($response->output());
$webtask->Exited($response->code());
$aggobj->SetStatus("failed");
if (defined($response) && defined($response->logurl())) {
if (defined($response->logurl())) {
$aggobj->SetPublicURL($response->logurl());
}
print STDERR "CreateSliver failed on $urn: ".
......@@ -1315,34 +1343,38 @@ sub CreateSliver($)
last;
}
# This will get overwritten later.
if (defined($response) && defined($response->logurl())) {
if (defined($response->logurl())) {
$aggobj->SetPublicURL($response->logurl());
}
my $manifest = $response->value()->[1];
if (!defined($manifest)) {
$webtask->Exited(-1);
$aggobj->SetStatus("failed");
print STDERR "CreateSliver $urn: No manifest returned\n";
return -1;
}
#
# This needs to be done differently; passing an extra credential
# and setting the valid_until in the rspec.
# If earlyreturn is set, we got a read timeout and so we will not
# have a manifest. We will hopefully catch up with it later during
# WaitForSlivers().
#
if ($duration > $maxduration) {
print "Forcing correct slice expiration\n";
$response = $aggobj->Extend($slice->ExpirationGMT(), $this_user);
if (!defined($response) ||
$response->code() != GENIRESPONSE_SUCCESS) {
$aggobj->SetStatus("failed");
$webtask->output("Renew failed on $urn");
if ($earlyreturn) {
print STDERR "Read timeout forcing early return mode\n";
}
elsif (ref($response->value()) eq "ARRAY") {
$manifest = $response->value()->[1];
if (!defined($manifest)) {
$webtask->Exited(-1);
print STDERR "Renew failed on $urn\n";
$aggobj->SetStatus("failed");
print STDERR "CreateSliver $urn: No manifest returned\n";
return -1;
}
}
$aggobj->SetStatus("provisioned");
$aggobj->SetManifest($manifest);
else {
$webtask->Exited(-1);
$aggobj->SetStatus("failed");
print STDERR "CreateSliver $urn: Unexpected return\n";
return -1;
}
if (defined($manifest)) {
$aggobj->SetStatus("provisioned");
$aggobj->SetManifest($manifest);
}
return 0;
}
......@@ -1651,18 +1683,18 @@ sub RunStitcher()
}
$aggobj->SetStatus("provisioned");
print "Requesting manifest from $urn\n";
my $manifest = $aggobj->GetManifest();
if (!defined($manifest)) {
my $response = $aggobj->GetManifest();
if (!defined($response)) {
$aggobj->SetStatus("failed");
$webtask->output("Could not get manifest from $urn");
$webtask->Exited(-1);
return -1;
}
# Web interface wants this as soon as possible.
$aggobj->SetManifest($manifest);
$aggobj->SetManifest($response->value()->{'manifest'});
print "Forcing correct slice expiration\n";
my $response = $aggobj->Extend($slice->ExpirationGMT(), $this_user);
$response = $aggobj->Extend($slice->ExpirationGMT(), $this_user);
if (!defined($response) ||
$response->code() != GENIRESPONSE_SUCCESS) {
$aggobj->SetStatus("failed");
......
......@@ -2573,12 +2573,17 @@ sub DoManifests()
my $webtask = $sliver->webtask();
my $errmsg;
my $manifest = $sliver->GetManifest();
if (!defined($manifest)) {
my $response = $sliver->GetManifest();
if (!defined($response)) {
$errmsg = "RPC Error calling GetManifest";
goto bad;
}
my $response = $sliver->SliceStatus();
my $manifest = $response->value()->{'manifest'};
if ($debug) {
print STDERR $manifest . "\n";
}
$response = $sliver->SliceStatus();
if (!defined($response)) {
$errmsg = "RPC Error calling SliceStatus";
goto bad;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment