Commit ae4af900 authored by Leigh Stoller's avatar Leigh Stoller

Backend part of "async" mode.

parent 56f6d601
......@@ -745,6 +745,8 @@ sub SetBootFailure($)
if (!DBQueryWarn("update geni_aggregates set ".
" boot_failure='1' ".
"where idx='$idx'"));
$self->{'AGGREGATE'}->{'boot_failure'} = 1;
return 0;
}
......@@ -761,6 +763,8 @@ sub ClearBootFailure($)
if (!DBQueryWarn("update geni_aggregates set ".
" boot_failure='0' ".
"where idx='$idx'"));
$self->{'AGGREGATE'}->{'boot_failure'} = 0;
return 0;
}
......@@ -1072,6 +1076,8 @@ sub ActionStart($$;$)
# Clear last error.
$self->SetErrorLog("");
# Set new status so ComputeState() knows what is going on.
$self->SetStatus("working");
my $experiment = Experiment->Lookup($self->slice_uuid());
if (!defined($experiment)) {
......@@ -1114,7 +1120,7 @@ sub ActionStart($$;$)
if ($sliver->state() eq "started");
$sliver->Start($version) == 0
or return -1;
or goto bad;
next;
}
#
......@@ -1379,11 +1385,13 @@ sub ActionStart($$;$)
}
}
DebugTimeStamp("snmpit finished");
exit(0);
# Avoid END block processing
POSIX::_exit(0);
badsnmpit:
print STDERR "Failed to setup vlans: $msg\n";
exit(-1);
# Avoid END block processing
POSIX::_exit(-1);
}
my @nodes = keys(%nodes);
......@@ -1393,7 +1401,9 @@ sub ActionStart($$;$)
$experiment->SetState($expstate);
#
# See what nodes succeeded or failed.
# See what nodes succeeded or failed. We want to hold off setting
# the new state on the slivers until the end so that we do not flip
# aggregate status to ready before we actually finish up in here.
#
foreach my $node_id (keys(%nodes)) {
my $node = $nodes{$node_id};
......@@ -1462,7 +1472,7 @@ sub ActionStart($$;$)
my $kid = waitpid($snmpit_child, 0);
if ($kid == $snmpit_child) {
if ($?) {
$msg = "Wait for snmpit returned error $?\n";
$msg = "Failed to set up networks\n";
goto bad;
}
else {
......@@ -1475,7 +1485,7 @@ sub ActionStart($$;$)
}
}
if ($rval) {
$msg .= "$OSSETUP failed\n";
$msg .= "Unable to OS setup nodes\n";
goto bad;
}
......@@ -1501,38 +1511,52 @@ sub ActionStart($$;$)
# @failed list returned from WaitForNodes().
#
if (@failed) {
print STDERR
"Some nodes did not boot, not doing elabinelab setup\n";
return -1;
$msg .= "Some nodes did not boot, not doing elabinelab setup";
goto bad;
}
print STDERR "Setting up elabinelab. This could take a while!\n";
if (system("$ELAB_SETUP $pid $eid")) {
print STDERR "Failed to setup elabinelab!\n";
return -1;
$msg .= "Failed to setup elabinelab!";
goto bad;
}
}
elsif (($flags & $ACTION_FLAGS_NOEVENTSTART) == 0) {
$self->ComputeState();
if ($self->status() eq "ready") {
#
# Start the event scheduler. Note that the experiment is already
# in the ACTIVE state, so the scheduler is going to fire off the
# timeline automatically.
#
system("$EVENTSYS start $pid,$eid");
if ($?) {
$msg .= "Failed to (re)start the event system";
if ($TB ne "/usr/testbed") {
# Not sure why this is failing.
print STDERR "$msg\n";
}
else {
goto bad;
}
elsif (($flags & $ACTION_FLAGS_NOEVENTSTART) == 0 && !@failed) {
#
# Start the event scheduler. Note that the experiment is already
# in the ACTIVE state, so the scheduler is going to fire off the
# timeline automatically.
#
system("$EVENTSYS start $pid,$eid");
if ($?) {
$msg .= "Failed to (re)start the event system";
if ($TB ne "/usr/testbed") {
# Not sure why this is failing.
print STDERR "$msg\n";
}
else {
goto bad;
}
}
}
if (0) {
# Testing.
@failed = values(%nodes);
}
if (@failed) {
my @node_ids = map { $_->node_id() } @failed;
$self->SetBootFailure();
$self->SetErrorLog("The following nodes failed to setup: " .
join(" ", @node_ids));
}
elsif (0) {
# Testing.
$self->SetBootFailure();
$self->SetErrorLog("Testing");
}
# This has to be last so that SliverStatus() does not hand back a
# ready or failed state until we are fully done here.
$self->SetStatus("mixed");
$self->ComputeState();
return 0;
bad:
......@@ -1547,6 +1571,10 @@ sub ActionStart($$;$)
$sliver->SetErrorLog($msg)
if (defined($msg));
}
if ($self->status() eq "working") {
$self->SetStatus("mixed");
$self->ComputeState();
}
return -1;
}
......@@ -1633,6 +1661,8 @@ sub Action($$$;$)
# Clear last error.
$self->SetErrorLog("");
# Set new status so ComputeState() knows what is going on.
$self->SetStatus("working");
my $experiment = Experiment->Lookup($self->slice_uuid());
if (!defined($experiment)) {
......@@ -1744,7 +1774,7 @@ sub Action($$$;$)
if ($sliver->state() eq "started" && !$restart);
$sliver->Start($version) == 0
or return -1;
or goto bad;
next;
}
my $node = Node->Lookup($sliver->resource_id());
......@@ -2433,11 +2463,12 @@ sub Action($$$;$)
}
}
}
my @failed = ();
return -1
if ($self->WaitForNodes(\@failed, $osload_object, \@reload_children,
@waitpnodes, @waitvnodes));
my @failed = ();
if ($self->WaitForNodes(\@failed, $osload_object, \@reload_children,
@waitpnodes, @waitvnodes)) {
$msg .= "Not all nodes booted successfully";
goto bad;
}
#
# Before we fire off any async activity, push out any experiment
......@@ -2463,40 +2494,37 @@ sub Action($$$;$)
# @failed list returned from WaitForNodes().
#
if (@failed) {
print STDERR
"Some nodes did not boot, not doing elabinelab setup\n";
return -1;
$msg .= "Some nodes did not boot, not doing elabinelab setup";
goto bad;
}
print STDERR "Setting up elabinelab. This could take a while!\n";
if (system("$ELAB_SETUP $pid $eid")) {
print STDERR "Failed to setup elabinelab!\n";
return -1;
$msg .= "Failed to setup elabinelab!";
goto bad;
}
}
elsif (($action eq "start" || $action eq "restart") &&
elsif (($action eq "start" || $action eq "restart") && !@failed &&
($flags & $ACTION_FLAGS_NOEVENTSTART) == 0) {
$self->ComputeState();
if ($self->status() eq "ready") {
#
# Start the event scheduler. Note that the experiment is already
# in the ACTIVE state, so the scheduler is going to fire off the
# timeline automatically.
#
my $action = ($restart ? "replay" : "start");
system("$EVENTSYS $action $pid,$eid");
if ($?) {
$msg .= "Failed to (re)start the event system";
if ($TB ne "/usr/testbed") {
# Not sure why this is failing.
print STDERR "$msg\n";
}
else {
goto bad;
}
#
# Start the event scheduler. Note that the experiment is already
# in the ACTIVE state, so the scheduler is going to fire off the
# timeline automatically.
#
my $action = ($restart ? "replay" : "start");
system("$EVENTSYS $action $pid,$eid");
if ($?) {
$msg .= "Failed to (re)start the event system";
if ($TB ne "/usr/testbed") {
# Not sure why this is failing.
print STDERR "$msg\n";
}
else {
goto bad;
}
}
}
$self->SetStatus("mixed");
$self->ComputeState();
return 0;
bad:
......@@ -2519,6 +2547,10 @@ sub Action($$$;$)
$sliver->SetErrorLog($msg)
if (defined($msg));
}
if ($self->status() eq "working") {
$self->SetStatus("mixed");
$self->ComputeState();
}
return -1;
}
......@@ -3133,6 +3165,9 @@ sub BatchAction($$@)
# Clear last error.
$self->SetErrorLog("");
$self->ClearBootFailure();
# Set new status so ComputeState() knows what is going on.
$self->SetStatus("working");
my $experiment = Experiment->Lookup($self->slice_uuid());
if (!defined($experiment)) {
......@@ -3237,8 +3272,10 @@ sub BatchAction($$@)
map { $_->_reloaded(1) } values(%reloads);
system("$OSLOAD -c -s @node_ids");
return -1
if ($?);
if ($?) {
$msg .= "OSload failed on @node_ids";
goto bad;
}
}
#
# Then power on any physical nodes that had been stopped.
......@@ -3283,6 +3320,9 @@ sub BatchAction($$@)
foreach my $sliver (@slivers) {
$sliver->SetState("started");
}
# Flip back to mixed to allow ComputeState to do its thing.
$self->SetStatus("mixed");
$self->ComputeState();
return 0;
bad:
......@@ -3295,6 +3335,10 @@ sub BatchAction($$@)
$self->SetErrorLog($msg);
print STDERR "$msg\n";
}
if ($self->status() eq "working") {
$self->SetStatus("mixed");
$self->ComputeState();
}
return -1;
}
......@@ -3557,7 +3601,11 @@ sub ComputeState($)
else {
$newstate = "mixed";
}
if ($ready == $count) {
# Internal status, we do not want to force a change while in this status.
if ($self->status() eq "working") {
$newstatus = $self->status();
}
elsif ($ready == $count) {
$newstatus = "ready";
}
elsif ($notready == $count) {
......
......@@ -492,19 +492,39 @@ sub GetTicket($;$)
if (GeniResponse::IsResponse($credential));
if ($isupdate) {
$ticket = CheckTicket($tickstr, $credential->target_urn());
$ticket = CheckTicket($tickstr,
$credential->target_urn());
return $ticket
if (GeniResponse::IsResponse($ticket));
}
return GetTicketAux($credential,
$rspecstr, $isupdate, $impotent, 0, 1, 0, $ticket);
# GetTicketAux() takes a single hash reference of named arguments.
# Every key/value pair must be separated by a comma: a "." after a
# value concatenates it with the following key string, which leaves
# the hash with an odd number of elements and shifts all later pairs.
return GetTicketAux({"credential" => $credential,
		     "rspecstr"   => $rspecstr,
		     "isupdate"   => $isupdate,
		     "impotent"   => $impotent,
		     "v2"         => 0,
		     "level"      => 1,
		     "usetracker" => 0,
		     "ticket"     => $ticket,
		     "speaksfor"  => undef,
		     "morecreds"  => [],
		     "async"      => 0});
}
sub GetTicketAux($$$$$$$$$@)
sub GetTicketAux($)
{
my ($credential, $rspecstr, $isupdate, $impotent, $v2, $level, $usetracker,
$ticket, $speaksfor, @morecreds) = @_;
my ($argref) = @_;
my $credential = $argref->{"credential"};
my $rspecstr = $argref->{"rspecstr"};
my $isupdate = $argref->{"isupdate"};
my $impotent = $argref->{"impotent"};
my $v2 = $argref->{"v2"};
my $level = $argref->{"level"};
my $usetracker = $argref->{"usetracker"};
my $ticket = $argref->{"ticket"};
my $speaksfor = $argref->{"speaksfor"};
my $morecreds = $argref->{"morecreds"};
my $async = $argref->{"async"};
defined($credential) &&
($credential->HasPrivilege( "pi" ) or
$credential->HasPrivilege( "instantiate" ) or
......@@ -544,15 +564,35 @@ sub GetTicketAux($$$$$$$$$@)
main::AddLogfileMetaDataFromSlice($slice);
return GetTicketAuxAux($slice, $user, $rspecstr,
$isupdate, $impotent, $v2, $level, $usetracker,
$ticket, [$credential, @morecreds], $speaksfor);
return GetTicketAuxAux({"slice" => $slice,
"user" => $user,
"rspecstr" => $rspecstr,
"isupdate" => $isupdate,
"impotent" => $impotent,
"v2" => $v2,
"level" => $level,
"usetracker" => $usetracker,
"ticket" => $ticket,
"credentials"=> [$credential, @$morecreds],
"speaksfor" => $speaksfor,
"async" => $async});
}
sub GetTicketAuxAux($$$$$$$$$$$)
sub GetTicketAuxAux($)
{
my ($slice, $user, $rspecstr, $isupdate,
$impotent, $v2, $level, $usetracker,
$ticket, $credentials, $speaksfor) = @_;
my ($argref) = @_;
my $slice = $argref->{"slice"};
my $user = $argref->{"user"};
my $rspecstr = $argref->{"rspecstr"};
my $isupdate = $argref->{"isupdate"};
my $impotent = $argref->{"impotent"};
my $v2 = $argref->{"v2"};
my $level = $argref->{"level"};
my $usetracker = $argref->{"usetracker"};
my $ticket = $argref->{"ticket"};
my $credentials = $argref->{"credentials"};
my $speaksfor = $argref->{"speaksfor"};
my $async = $argref->{"async"};
my $response = undef;
my $restorevirt = 0; # Flag to restore virtual state
my $restorephys = 0; # Flag to restore physical state
......@@ -598,6 +638,15 @@ sub GetTicketAuxAux($$$$$$$$$$$)
"Unknown RSpec Version");
}
# Quick check for no nodes or links. It actually happens and the mapper
# is not happy with a virtual topo that has no nodes or links.
my @nodelist = GeniXML::FindNodes("n:node", $rspec)->get_nodelist();
my @linklist = GeniXML::FindNodes("n:link", $rspec)->get_nodelist();
if (! (@nodelist || @linklist)) {
return GeniResponse->Create(GENIRESPONSE_BADARGS, undef,
"No nodes or links in your topology");
}
#
# A sitevar controls whether external users can get any nodes.
#
......@@ -2908,6 +2957,18 @@ sub GetTicketAuxAux($$$$$$$$$$$)
}
#print STDERR $output;
#
# Async operation.
#
if ($async) {
my $mypid = main::WrapperFork();
if ($mypid) {
return GeniResponse->Create(GENIRESPONSE_INPROGRESS);
}
# Flag the slice as operating in async mode.
$slice->SetAsyncMode(1);
}
# Do a render because it is nice to have on the show experiment page.
# But skip for really big topos, too much overhead.
if (scalar(keys(%nodemap)) <= 500) {
......@@ -3748,6 +3809,7 @@ sub GetTicketAuxAux($$$$$$$$$$$)
# so need to call this as a function instead of a method.
VLan::ClearReservedVlanTag($lanid, $vlan_tag);
}
# We keep the slice for a while in async mode.
if ($v2 && $level == 0) {
if (defined($slice_experiment) && -e $slice_experiment->WorkDir()) {
my $dir = $slice_experiment->WorkDir();
......@@ -3755,9 +3817,16 @@ sub GetTicketAuxAux($$$$$$$$$$$)
system("/bin/rm -rf ${dir}.failed");
system("/bin/cp -rp ${dir} ${dir}.failed");
}
CleanupDeadSlice($slice, 1)
if (defined($slice));
return $response;
if (!$async) {
CleanupDeadSlice($slice, 1)
if (defined($slice));
return $response;
}
}
# In async mode we leave the slice intact but store the error info
# with it so the caller can get at it (for a while).
if ($async && defined($slice)) {
$slice->SetAsyncError($response);
}
$slice->UnLock()
if (defined($slice));
......@@ -3835,6 +3904,11 @@ sub SliverWorkAux($$$$$$$$)
require Interface;
require User;
if (0) {
sleep(30);
return GeniResponse->Create(GENIRESPONSE_ERROR, undef, "Testing mode");
}
# V2 API support.
if (($v2 && $level == 0) || ref($object) ne "GeniTicket") {
$rspec = $object;
......@@ -3972,6 +4046,7 @@ sub SliverWorkAux($$$$$$$$)
# We need this for accounting.
$experiment->SetSwapInfo($realuser->emulab_user());
DebugTimeStamp("Redeem starting");
#
# Figure out what nodes to allocate or free.
#
......@@ -4014,7 +4089,7 @@ sub SliverWorkAux($$$$$$$$)
$schemaLocation->setValue($value);
}
}
DebugTimeStamp("Manifest cloned");
#
# Find current slivers and save.
......@@ -4155,6 +4230,7 @@ sub SliverWorkAux($$$$$$$$)
next;
}
}
DebugTimeStamp("Node alloc check");
#
# What *slivers* need to be released? This may result in physical
......@@ -4196,6 +4272,7 @@ sub SliverWorkAux($$$$$$$$)
push(@freenodes, $sliver);
}
}
DebugTimeStamp("Release check done");
#
# What vhost slivers need to be deleted.
......@@ -4216,6 +4293,8 @@ sub SliverWorkAux($$$$$$$$)
push(@freevhosts, $sliver);
}
}
DebugTimeStamp("sliver delete check done");
#
# We are actually an Aggregate, so return an aggregate of slivers,
......@@ -4258,6 +4337,7 @@ sub SliverWorkAux($$$$$$$$)
delete($linkmap{$linkname});
}
}
DebugTimeStamp("link teardown done");
if ($isupdate) {
#
......@@ -4324,6 +4404,7 @@ sub SliverWorkAux($$$$$$$$)
$solfile = "-L solution.${PID}";
print STDERR "Using assign solution file solution.${PID}\n";
}
DebugTimeStamp("Starting mapper");
# Add -u for update mode, but not -f (fixnode).
my $output = GeniUtil::ExecQuiet("$MAPPER -d -v $solfile -z -u $pid $eid");
......@@ -4375,6 +4456,9 @@ sub SliverWorkAux($$$$$$$$)
unlink("solution.${PID}") if ($solfile ne "");
goto bad;
}
DebugTimeStamp("Mapper run done");
# Do not leave this around.
unlink("solution.${PID}") if ($solfile ne "");
$shouldrollback = 1;
......@@ -4395,6 +4479,7 @@ sub SliverWorkAux($$$$$$$$)
goto bad;
}
DebugTimeStamp("Creating node slivers");
#
# Now for each resource (okay, node) in the ticket create a sliver and
# add it to the aggregate.
......@@ -4456,7 +4541,7 @@ sub SliverWorkAux($$$$$$$$)
}
}
if (grep {$_ eq $virtual_id} keys(%nodemap)) {
if (exists($nodemap{$virtual_id})) {
#
# Already in the aggregate, so reuse sliver.
#
......@@ -4561,14 +4646,16 @@ sub SliverWorkAux($$$$$$$$)
goto skiplinks
if (!defined(GeniXML::FindFirst("n:link", $rspec)));
DebugTimeStamp("Creating link slivers");
foreach my $linkref (GeniXML::FindNodes("n:link",
$rspec)->get_nodelist()) {
my @linkslivers = ();
my $inaggregate = 0;
my %managers = ();
my $linkname = GeniXML::GetVirtualId($linkref);
my @interfaces = GeniXML::FindNodes("n:linkendpoints | ".
"n:interface_ref",
my @interfaces = GeniXML::FindNodes("n:interface_ref | ".
"n:linkendpoints",
$linkref)->get_nodelist();
if (! ($linkname =~ /^[-\w]*$/)) {
......@@ -4920,6 +5007,7 @@ sub SliverWorkAux($$$$$$$$)
# since we might have changed the interfaces down inside the node in
# the loop above.
#
DebugTimeStamp("Updating node manifests");
foreach my $ref (GeniXML::FindNodes("n:node",
$manifest)->get_nodelist()) {
my $node_id = GeniXML::GetVirtualId($ref);
......@@ -4932,6 +5020,8 @@ sub SliverWorkAux($$$$$$$$)
}
}
}
DebugTimeStamp("Done updating node manifests");
skiplinks:
#
# Create a planetlab slice before provisioning (which creates nodes).
......@@ -5195,6 +5285,8 @@ sub SliverWorkAux($$$$$$$$)
$experiment->SaveExperimentState();
$experiment->SaveLogFiles();
$experiment->InitKeyDist();
DebugTimeStamp("Redeem done!");
if ($v2) {
return GeniResponse->Create(GENIRESPONSE_SUCCESS,
......
......@@ -142,6 +142,8 @@ sub GetVersion()
"output_rspec" => "2",
"ad_rspec" => \@ad_rspec_versions
};
#POSIX::_exit(1);
#sleep(30);
return GeniResponse->Create(GENIRESPONSE_SUCCESS, $blob);
}
......@@ -267,7 +269,16 @@ sub Resolve($)
"No permission to resolve $slice\n");
}
# Return a blob.
my $blob = { "urn" => $urn };
my $blob = { "urn" => $urn };
#
# We stored an error after a wrapperfork; return that error now.
#
if ($slice->async_code()) {
$blob->{"async_code"} = $slice->async_code();
$blob->{"async_output"} = $slice->async_output();
return GeniResponse->Create(GENIRESPONSE_SUCCESS, $blob);
}
my $aggregate = GeniAggregate->SliceAggregate($slice);
if (defined($aggregate)) {
......@@ -412,6 +423,9 @@ sub CreateSliver($)
my $keys = $argref->{'keys'};
my $impotent = $argref->{'impotent'} || 0;
my $usetracker = $argref->{'usetracker'} || 0;
my $async = $argref->{'asyncmode'} || 0;
my $mypid = $PID;
my $cachedebug = 0;
require Node;
require Experiment;
require libtestbed;
......@@ -484,12 +498,34 @@ sub CreateSliver($)
"Must delete existing slice first");
}
}
my $rspec = GeniCM::GetTicketAux($credential, $rspecstr,
0, $impotent, 1, 0, $usetracker,
undef, $speaksfor, @morecreds);
return $rspec
if (GeniResponse::IsResponse($rspec));
my $rspec = GeniCM::GetTicketAux({"credential" => $credential,
"rspecstr" => $rspecstr,
"isupdate" => 0,
"impotent" => $impotent,
"v2" => 1,
"level" => 0,
"usetracker" => $usetracker,
"ticket" => undef,
"speaksfor" => $speaksfor,
"morecreds" => \@morecreds,
"async" => $async});
if ($async) {
if ($PID == $mypid) {
# Either we did not fork or we are the parent, so just return the response.
return $rspec
if (GeniResponse::IsResponse($rspec));
}
else {
# Forked, return status code only for failure, otherwise
# we keep on going.
return $rspec->{'code'}
if (GeniResponse::IsError($rspec));
}
}
else {
return $rspec
if (GeniResponse::IsResponse($rspec));
}
$slice = GeniSlice->Lookup($credential->target_urn());
if (!defined($slice)) {
print STDERR "CreateSliver: Could not find slice for $credential\n";
......@@ -512,6 +548,10 @@ sub CreateSliver($)
# Make sure that the next phase sees all changes.
Experiment->FlushAll();
Node->FlushAll();
if ($cachedebug) {
GeniUtil::DumpCaches();
emutil::DumpCaches();
}
my $response = GeniCM::SliverWorkAux($credential, $rspec,
$keys, 0, $impotent, 1, 0, $speaksfor);
......@@ -525,12 +565,22 @@ sub CreateSliver($)
#
$slice = GeniSlice->Lookup($credential->target_urn());
if (defined($slice)) {
if ($slice->Lock() != 0) {
if ($slice->WaitForLock(30) != 0) {
print STDERR
"CreateSliver: Could not lock $slice before delete\n";
return $response;
}
GeniCM::CleanupDeadSlice($slice, 1);
# In async mode store off error info.
if ($async) {
$slice->SetAsyncError($response);
}
# In async mode we are going to leave the slice record around
# so the client can pickup the error info.
GeniCM::CleanupDeadSlice($slice, !$async);
# And unlock since it still exists.
if ($async) {
$slice->UnLock();
}
}
return $response;
}
......@@ -563,15 +613,24 @@ sub CreateSliver($)
# At this point we want to return and let the startsliver proceed
# in the background. Parent never returns, just the child.
#
my $mypid = main::WrapperFork();
if ($mypid) {
return GeniResponse->Create(GENIRESPONSE_SUCCESS,
# But in async mode, GetTicketAux has already forked, so do not do
# it again here.
#
if (!$async) {
$mypid = main::WrapperFork();
if ($mypid) {
return GeniResponse->Create(GENIRESPONSE_SUCCESS,
[$sliver_credential, $sliver_manifest]);
}
}
# Make sure that the next phase sees all changes.
Experiment->FlushAll();
Node->FlushAll();
if ($cachedebug) {
GeniUtil::DumpCaches();
emutil::DumpCaches();
}
#
# The callee might also do a wrapper fork, so remember our PID
......@@ -588,11 +647,19 @@ sub CreateSliver($)
else {
print STDERR "Error waiting for nodes.\n";
}
if ($cachedebug) {
GeniUtil::DumpCaches();
emutil::DumpCaches();