Commit 18cdfa8b authored by Leigh Stoller

Some work on restarting (rebooting) nodes. Presently, there is a bit of
an inconsistency in SliverAction(): when operating on the entire slice
we do the whole thing in the background, returning (almost) immediately.
That makes sense, since we expect the caller to poll for status afterward.

But when operating on a subset of slivers (nodes), we do it
synchronously, which means the caller is left waiting until we get
through rebooting all the nodes. As David pointed out, when rebooting
nodes in the openstack profile, this can take a long time as the VMs are
torn down. This leaves the user looking at a spinner modal for a long
time, which is not a nice UI feature.

So I added a local option to do slivers in the background, and return
immediately. I am doing this for restart and reload at the moment, since
that is primarily what we use from the Portal.

Note that this has to push out to all clusters.
parent 015bd060
......@@ -2112,6 +2112,10 @@ sub SliverAction($$$;@)
else {
$args->{"slice_urn"} = $slice->urn();
}
# Experimental.
if ($which eq "reboot" || $which eq "reload") {
$args->{"asyncmode"} = 1;
}
my $cmurl = $authority->url();
$cmurl = devurl($cmurl) if ($usemydevtree);
......
......@@ -2225,6 +2225,187 @@ sub Stop($$;$)
return -1;
}
#
# Restart or reload a list of node slivers as one batch, instead of
# individually calling GeniSliver->Action() for each one, since each one
# can take 15-60 seconds depending on what the node is doing or if it
# needs to be power cycled, etc. This does a very small subset of what
# Action() does, on a list of slivers. Might change my mind and roll this
# back into Action().
#
# Arguments:
#   $self    - object providing slice_uuid() and SetErrorLog(); the
#              slivers must be reserved to its experiment.
#   $action  - "restart" or "reload".
#   @slivers - slivers to operate on. Only GeniSliver::Node slivers are
#              examined and acted upon, but every sliver in the list has
#              its state set to "restarting" and then "started".
#
# Returns 0 on success. On failure returns -1 after restoring the saved
# state of every examined node sliver, storing the message with
# SetErrorLog() and printing it to STDERR.
#
sub BatchAction($$@)
{
    my ($self, $action, @slivers) = @_;
    my $msg     = "Internal Error: ";
    my $restart = ($action eq "restart" ? 1 : 0);
    my $reload  = ($action eq "reload" ? 1 : 0);

    # Declared before the first "goto bad" so the error path always sees
    # initialized (possibly empty) tables.
    my %vnodes  = ();    # node_id => virtual node
    my %reloads = ();    # node_id => node whose current image is reloaded
    my %reboots = ();    # node_id => node to reboot
    my %starts  = ();    # node_id => stopped virtual node to set up again
    my %poweron = ();    # node_id => stopped physical node to power on
    my %states  = ();    # sliver idx => state on entry, for error recovery

    # Clear last error.
    $self->SetErrorLog("");

    my $experiment = Experiment->Lookup($self->slice_uuid());
    if (!defined($experiment)) {
        $msg .= "Could not map $self to its experiment";
        goto bad;
    }
    my $pid = $experiment->pid();
    my $eid = $experiment->eid();

    foreach my $sliver (@slivers) {
        next
            if (ref($sliver) ne "GeniSliver::Node");

        # Key by the real idx. A method call does not interpolate inside
        # a double-quoted string, so quoting it here would produce a key
        # that the restore loop at "bad" could never find.
        $states{$sliver->idx()} = $sliver->state();

        my $node = Node->Lookup($sliver->resource_id());
        if (!defined($node)) {
            $msg .= "Could not map $sliver to a node";
            goto bad;
        }
        my $node_id     = $node->node_id();
        my $reservation = $node->Reservation();
        if (!defined($reservation)) {
            $msg .= "$node no longer belongs to $self";
            goto bad;
        }
        if (!$reservation->SameExperiment($experiment)) {
            $msg .= "$node is reserved to another, not $self";
            goto bad;
        }

        # These are used in WaitForNodes();
        $node->_reloaded(0);
        $node->_rebooted(0);
        $node->_sliver($sliver);
        $node->_image(undef);

        #
        # We reload current image only. Store the node (not the sliver),
        # since the _reloaded flag set below lives on the node object.
        #
        if ($reload) {
            $reloads{$node_id} = $node;
        }

        my $stopped = ($sliver->state() eq "stopped");
        if ($node->isvirtnode()) {
            $vnodes{$node_id} = $node;
            if ($stopped) {
                $starts{$node_id} = $node;
            }
            elsif ($restart) {
                $reboots{$node_id} = $node;
            }
        }
        else {
            #
            # Look to see if local physical node was stopped (powered off).
            #
            if ($stopped) {
                $poweron{$node_id} = $node;
            }
            elsif ($restart) {
                $reboots{$node_id} = $node;
            }
        }
    }

    #
    # Cull out vnodes that are going to get rebooted/restarted cause the
    # physnode is getting rebooted.
    #
    foreach my $vnode (values(%vnodes)) {
        my $node_id = $vnode->node_id();
        my $physid  = $vnode->phys_nodeid();

        if (exists($reboots{$physid}) || exists($poweron{$physid})) {
            delete($reboots{$node_id});
            delete($starts{$node_id});
        }
    }

    # We do this so the slivers no longer say "ready" in the status.
    # NOTE(review): this also touches slivers that are not
    # GeniSliver::Node; their previous state is not saved, so the error
    # path below does not restore them - confirm that is intended.
    foreach my $sliver (@slivers) {
        $sliver->SetState("restarting");
    }

    if ($reload) {
        my @node_ids = keys(%reloads);

        # Mark them as being reloaded.
        foreach my $node (values(%reloads)) {
            $node->_reloaded(1);
        }
        system("$OSLOAD -c -s @node_ids");
        if ($?) {
            # Report and restore states like every other failure path.
            $msg .= "Failed to reload @node_ids";
            goto bad;
        }
    }

    #
    # Then power on any physical nodes that had been stopped.
    # Then reboot the physical nodes, then any leftover virtual nodes.
    #
    if (keys(%poweron)) {
        my @node_ids = keys(%poweron);

        system("$POWER on @node_ids");
        if ($?) {
            $msg .= "Failed to power on @node_ids";
            goto bad;
        }
    }
    if (keys(%reboots)) {
        my @node_ids = keys(%reboots);

        # Mark them as being rebooted.
        foreach my $node (values(%reboots)) {
            $node->_rebooted(1);
        }
        system("$NODEREBOOT @node_ids");
        if ($?) {
            $msg .= "Failed to reboot @node_ids";
            goto bad;
        }
    }
    if (keys(%starts)) {
        my @node_ids = keys(%starts);

        #
        # There are so many ways this can throw an error, lets
        # not give up here, but go ahead and use the monitor
        # to wait for nodes since some might actually boot. Unless
        # the exit code indicates abject failure (-1).
        #
        # An exit(-1) in the child shows up as status 255 in the high
        # byte of $?; $? itself is -1 when the command could not be run
        # at all.
        #
        system("$VNODESETUP -j -m $pid $eid @node_ids");
        if ($? == -1 || ($? >> 8) == 255) {
            $msg .= "Failed to set up vnodes @node_ids";
            goto bad;
        }
    }

    foreach my $sliver (@slivers) {
        $sliver->SetState("started");
    }
    return 0;

  bad:
    # Put back the state each examined sliver had when we were called.
    foreach my $sliver (@slivers) {
        my $idx = $sliver->idx();

        $sliver->SetState($states{$idx})
            if (exists($states{$idx}));
    }
    $self->SetErrorLog($msg);
    print STDERR "$msg\n";
    return -1;
}
#
# Provision all the slivers in the aggregate.
#
......@@ -2444,6 +2625,9 @@ sub ComputeState($)
elsif ($state eq "updating_users") {
$updating++;
}
elsif ($state eq "restarting") {
$restarting++;
}
else {
$unknown++;
}
......
......@@ -896,7 +896,7 @@ sub StartSliver($)
my $manifest = $argref->{'manifest'};
return SliverAction("start",
$slice_urn, $sliver_urns, $credentials, $manifest);
$slice_urn, $sliver_urns, $credentials, $manifest, 0);
}
sub StopSliver($)
......@@ -907,7 +907,7 @@ sub StopSliver($)
my $credentials = $argref->{'credentials'};
return SliverAction("stop",
$slice_urn, $sliver_urns, $credentials, undef);
$slice_urn, $sliver_urns, $credentials, undef, 0);
}
sub RestartSliver($)
......@@ -917,9 +917,12 @@ sub RestartSliver($)
my $sliver_urns = $argref->{'sliver_urns'} || $argref->{'component_urns'};
my $credentials = $argref->{'credentials'};
my $manifest = $argref->{'manifest'};
my $asyncmode = (exists($argref->{'asyncmode'}) ?
$argref->{'asyncmode'} : 0);
return SliverAction("restart",
$slice_urn, $sliver_urns, $credentials, $manifest);
$slice_urn, $sliver_urns, $credentials, $manifest,
$asyncmode);
}
sub ReloadSliver($)
......@@ -930,12 +933,13 @@ sub ReloadSliver($)
my $credentials = $argref->{'credentials'};
return SliverAction("reload",
$slice_urn, $sliver_urns, $credentials, undef);
$slice_urn, $sliver_urns, $credentials, undef, 0);
}
sub SliverAction($$$$$)
sub SliverAction($$$$$$)
{
my ($action, $slice_urn, $sliver_urns, $credentials, $manifest) = @_;
my ($action, $slice_urn, $sliver_urns, $credentials,
$manifest, $asyncmode) = @_;
my $response;
my $isasync = 0;
......@@ -1061,7 +1065,7 @@ sub SliverAction($$$$$)
return 0;
};
my $PerformAction = sub {
my ($object, $action) = @_;
my ($object, $action, @slivers) = @_;
my $exitval = 0;
......@@ -1072,10 +1076,20 @@ sub SliverAction($$$$$)
$exitval = $object->Stop($API_VERSION);
}
elsif ($action eq "restart") {
$exitval = $object->Restart($API_VERSION);
if (@slivers) {
$exitval = $object->BatchAction("restart", @slivers);
}
else {
$exitval = $object->Restart($API_VERSION);
}
}
elsif ($action eq "reload") {
$exitval = $object->Reload($API_VERSION);
if (@slivers) {
$exitval = $object->BatchAction("reload", @slivers);
}
else {
$exitval = $object->Reload($API_VERSION);
}
}
return GeniResponse->Create(GENIRESPONSE_ERROR,
"Could not $action sliver")
......@@ -1159,12 +1173,38 @@ sub SliverAction($$$$$)
goto bad;
}
}
$response = &$PerformAction($sliver, $action);
}
if ($asyncmode && $action =~ /^(restart|reload)$/) {
#
# At this point we want to return and let the startsliver proceed
# in the background
#
my $mypid = main::WrapperFork();
if ($mypid) {
return GeniResponse->Create(GENIRESPONSE_SUCCESS);
}
# Remember our pid in case callee forks again.
$isasync = $PID;
$response = &$PerformAction($aggregate, $action, @slivers);
goto bad
if (GeniResponse::IsResponse($response));
# Callee did not fork again, we can unlock.
if ($isasync == $PID) {
$slice->UnLock();
}
}
else {
foreach my $sliver (@slivers) {
$response = &$PerformAction($sliver, $action);
goto bad
if (GeniResponse::IsResponse($response));
}
}
$slice->UnLock();
return GeniResponse->Create(GENIRESPONSE_SUCCESS);
return ($isasync ? GENIRESPONSE_SUCCESS :
GeniResponse->Create(GENIRESPONSE_SUCCESS));
}
bad:
$slice->UnLock();
......
......@@ -1434,7 +1434,7 @@ sub ComputeStatus($;$)
$self->SetState("started");
}
}
if ($self->state() eq "imaging") {
if ($self->state() eq "imaging" || $self->state() eq "restarting") {
$status = "changing";
goto done;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment