Commit 8ad3ba0b authored by Leigh B Stoller
Browse files

Changes to allow retry when protogeni nodes fail to allocate. There
are two ways this can happen.

1. When allocating the proxy nodes before getting the tickets. This is
   just like the local node case; need to go around the loop again and
   retry with a new node set. 

2. When getting the tickets, someone else got the nodes and the
   getticket fails. This is a lot like case 1, but now we have to
   nfree the proxy node, and tell the mapper to try again.
parent 42040c90
......@@ -76,22 +76,49 @@ sub MapResources($$$$)
or return -1;
foreach my $ref (@{ $rspec->{'node'} }) {
my $resource;
my %copy = %{ $ref };
my $copy = \%copy;
my $node_urn = $copy->{'component_urn'};
my $node_urn = $copy->{'request_urn'};
my ($auth,$type,$node_id) = GeniHRN::Parse($node_urn);
my $cm = GeniHRN::Generate($auth, "authority", "cm");
$copy->{'component_manager_uuid'} = $cm;
if ($node_id eq "*") {
# assign will use this, but its format is bogus. Kill it.
delete($copy->{'component_urn'});
#
# Get the resource object.
#
if (!exists($cm_urns{$cm})) {
$resource = GeniResource->Lookup($experiment->idx(), $cm);
if (!defined($resource)) {
$resource = GeniResource->Create($experiment, $cm);
if (!defined($resource)) {
print STDERR "Could not create GeniResource for $cm\n";
return -1;
}
}
$cm_urns{$cm} = $resource;
}
else {
$copy->{'component_uuid'} = $node_urn;
$resource = $cm_urns{$cm};
#
# request_urn means nothing to assign; kill that from the copy.
#
delete($copy->{'request_urn'});
# Ditto
delete($copy->{'tarfiles'});
#
# If already have the ticket, then leave the urn alone.
# We do not run assign again, but we need the rspec to be
# complete for loops below.
#
if (!$resource->HaveTicket()) {
if ($node_id ne "*") {
$copy->{'component_uuid'} = $node_urn;
$copy->{'component_urn'} = $node_urn;
}
}
$cm_urns{$cm} = $cm;
#
# The point of this is to split the rspec apart, since at present
......@@ -125,6 +152,9 @@ sub MapResources($$$$)
foreach my $ref (@{ $rspec->{'link'} }) {
my $linkname = $ref->{'virtual_id'};
# means nothing to assign; added again below.
delete($ref->{'component_manager'});
# Skip tunnels until rspec stitching in place.
next
if (exists($ref->{'link_type'}) &&
......@@ -156,21 +186,6 @@ sub MapResources($$$$)
}
}
#
# Get the resource objects.
#
foreach my $cm (keys(%cm_urns)) {
my $resource = GeniResource->Lookup($experiment->idx(), $cm);
if (!defined($resource)) {
$resource = GeniResource->Create($experiment, $cm);
if (!defined($resource)) {
print STDERR "Could not create GeniResource for $cm\n";
return -1;
}
}
$cm_urns{$cm} = $resource;
}
#
# Discover resources at the component and run assign.
#
......@@ -179,6 +194,14 @@ sub MapResources($$$$)
my $fragment = $fragments{$cm};
my $advertisement;
#
# We got the ticket on a previous loop.
#
if ($resource->HaveTicket()) {
print STDERR "Already have a ticket for $resource; skipping ...\n";
next;
}
print STDERR "Asking for resource list from $resource\n";
if ($resource->Discover($user, \$advertisement)) {
print STDERR "Could not get resource list for $resource\n";
......@@ -316,6 +339,7 @@ sub GetTickets($$$$)
{
my ($experiment, $verbose, $user, $rspec) = @_;
my %cm_urns = ();
my %nodemap = ();
Register($experiment, $user) == 0
or return -1;
......@@ -331,33 +355,39 @@ sub GetTickets($$$$)
$ref->{'node_type'}->{'type_name'} eq "lan") {
next;
}
my $virtual_id = $ref->{'virtual_id'};
my $node_urn = $ref->{'component_urn'};
my ($auth,$type,$node_id) = GeniHRN::Parse($node_urn);
my $cm = GeniHRN::Generate($auth, "authority", "cm");
$cm_urns{$cm} = $cm;
#
# This is how we get the client side to do cooked mode properly.
# Get the resource objects for below.
#
$ref->{'tarfiles'} = "/usr/local/etc/emulab ".
"$TBDOCBASE/downloads/geniclient.tar";
}
#
# Get the resource objects.
#
foreach my $cm (keys(%cm_urns)) {
my $resource = GeniResource->Lookup($experiment->idx(), $cm);
if (!defined($resource)) {
$resource = GeniResource->Create($experiment, $cm);
if (!exists($cm_urns{$cm})) {
my $resource = GeniResource->Lookup($experiment->idx(), $cm);
if (!defined($resource)) {
print STDERR "Could not create GeniResource for $cm\n";
print STDERR "Could not get GeniResource for $cm\n";
return -1;
}
$nodemap{$virtual_id} = $resource;
#
# We got the ticket on a previous loop.
#
next
if ($resource->HaveTicket());
$cm_urns{$cm} = $resource;
}
$cm_urns{$cm} = $resource;
#
# This is how we get the client side to do cooked mode properly.
#
$ref->{'tarfiles'} = "/usr/local/etc/emulab ".
"$TBDOCBASE/downloads/geniclient.tar";
}
# No tickets needed, return now.
return 0
if (! scalar(keys(%cm_urns)));
#
# XXX Convert to a proper XML looking thing. This is just a temporary
......@@ -390,6 +420,9 @@ sub GetTickets($$$$)
$resource->last_rpc_value()) {
print STDERR $resource->last_rpc_value() . "\n";
}
# Return indicator of possible forward progress.
return 1
if ($resource->last_rpc_output() =~ /Could not map to/i);
return -1;
}
return 0;
......@@ -409,8 +442,9 @@ sub GetTickets($$$$)
#
# Check the exit codes. Eventually return specific error info.
#
my $errors = 0;
my $count = 0;
my $errors = 0;
my $count = 0;
my $progress = 0;
foreach my $result (@results) {
my $resource = $resources[$count];
......@@ -425,11 +459,41 @@ sub GetTickets($$$$)
elsif ($result != 0) {
print STDERR "*** Error getting ticket for $resource\n";
$errors++;
# Watch for forward progress. Not being able to map actually
# means forward progress since we want to try again with
# different resources. The mapper will try a few times before
# giving up.
$progress++
if ($result > 1);
}
else {
$progress++;
#
# Got a ticket; mark the proxy nodes so that libvtop knows.
# Failure to get a ticket means we need to release the node
# up in libvtop. Probably need a state variable instead.
#
foreach my $virtual_id (keys(%nodemap)) {
next
if (!$resource->SameResource($nodemap{$virtual_id}));
my $node = $experiment->VnameToNode($virtual_id);
if (defined($node)) {
$node->ModifyReservation({"external_resource_index" =>
$resource->idx()})
== 0 or return -1;
}
}
}
$count++;
}
return 0
if (!$errors);
print STDERR Dumper($rspec) if ($errors);
return $errors;
# Return indication of forward progress so caller knows whether to stop.
return ($progress ? 1 : -1);
}
#
......
......@@ -1234,7 +1234,7 @@ sub GenVirtNodes($)
return -1;
}
my $ref = { 'virtual_id' => $vname,
'component_urn' => $vnode->fixed(),
'request_urn' => $vnode->fixed()
};
if ($vnode->_isvirtnode()) {
$ref->{'virtualization_type'} = 'emulab-vnode';
......@@ -1410,7 +1410,7 @@ sub GenFixNodes($)
tbwarn("GenFixNodes: No fixed node for $vname\n");
}
# Normal nodes have a vnodem but delay nodes do not.
# Normal nodes have a vnode but delay nodes do not.
if (!defined($vnode) && !$self->isadelaynode($vname)) {
tbwarn("GenFixNodes: No vnode for $vname\n");
}
......@@ -2340,7 +2340,7 @@ sub GenVirtLans($)
if (defined($fixed) && $fixed ne "") {
my ($authority,$type,$nodeid) = GeniHRN::Parse($fixed);
$noderef->{'component_urn'} =
$noderef->{'request_urn'} =
GeniHRN::Generate($authority, "node", "*");
}
my $ref = {
......@@ -3678,14 +3678,15 @@ sub AllocNodes($)
$self->fixednodes()->{$vname} = $nodeid;
# And add to the results for the next vtop print.
$self->addfixed("$vname $nodeid");
$self->addfixed("$vname $nodeid")
if (!$pnode->isfednode());
}
}
}
if ($exitval > 0) {
#
# We got some but no all the nodes.
# We got some but not all the nodes.
#
my $rcount = scalar(@reserved);
my $tcount = scalar(@nodeids);
......@@ -3726,7 +3727,8 @@ sub AllocNodes($)
$self->fixednodes()->{$vname} = $nodeid;
# And add to the results for the next vtop print.
$self->addfixed("$vname $nodeid");
$self->addfixed("$vname $nodeid")
if (!$pnode->isfednode());
}
}
}
......@@ -3799,11 +3801,45 @@ sub AllocNodes($)
if (defined($self->rspec()) &&
!($self->impotent() || $self->alloconly())) {
$self->printdb("Requesting geni tickets ...\n");
if (libGeni::GetTickets($self->experiment(), $self->verbose(),
$self->user(), $self->rspec())) {
tberror("Could not allocate Geni Tickets\n");
return -1;
my $progress = libGeni::GetTickets($self->experiment(),
$self->verbose(),
$self->user(), $self->rspec());
if ($progress) {
tberror("Error allocating (some) Geni Tickets\n");
#
# Need to find out what geni nodes we could not get tickets for.
# The local proxy nodes for them need to be released, and if we
# made forward progress, we try again in another loop through.
# The rest of the solution state will get cleared before
# the next loop, but calling nfree is easiest here.
#
my @tofree = ();
foreach my $pnode (@reserved) {
my $nodeid = $pnode->node_id();
#
# XXX See libGeni; it sets external_resource_index if
# we get a ticket. Need a state variable instead.
#
if (exists($self->solution()->{'TORESERVE'}->{$nodeid}) &&
!defined($pnode->external_resource_index())) {
push(@tofree, $nodeid);
}
}
if (@tofree) {
tbinfo("Releasing unticketed nodes: @tofree\n");
system("$NFREE $pid $eid @tofree");
if ($?) {
tberror("Could not free unticketed nodes\n");
return -1;
}
foreach my $nodeid (@tofree) {
delete($self->newreserved()->{$nodeid});
}
}
return $progress;
}
tbinfo("Successfully got all geni tickets we needed.\n");
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment