Commit bd9613cc authored by Leigh Stoller's avatar Leigh Stoller

Checkpoint two changes:

1. Using frisbee events in libosload_new as a replacement for
   static (hand-waved) maxwait times for image loading. When frisbee
   is generating events, we use those to determine if progress is being
   made.

2. Convert the CM to using the libosload_new library directly (like
   os_setup does). This is conditional on the NewOsload feature being
   attached to the geniuser. Otherwise, we go through the old path.
parent 05aae5c8
#!/usr/bin/perl -wT
#
# Copyright (c) 2008-2016 University of Utah and the Flux Group.
# Copyright (c) 2008-2017 University of Utah and the Flux Group.
#
# {{{GENIPUBLIC-LICENSE
#
......@@ -55,6 +55,7 @@ use GeniEvent;
use GeniXML;
use emutil;
use EmulabConstants;
use EmulabFeatures;
use Node;
use Logfile;
use libtestbed;
......@@ -824,7 +825,7 @@ sub GetCreator($)
}
#
# Create a signed credential for this aggregate, issued to the provided user.
# Create a signed credential for this aggregate, issued to the provided user
# The credential will grant all permissions for now.
#
# Should we store these credentials in the DB, recording what we hand out?
......@@ -1044,6 +1045,9 @@ sub Action($$$;$)
my @waitpnodes = ();
# See "bad" label below; want to know what sliver failed (if any).
my $sliver;
# For reload.
my $osload_object;
my @reload_children = ();
#
# Download the images. If this fails, we have wasted our time,
......@@ -1386,19 +1390,15 @@ sub Action($$$;$)
%vnodes = %tmp;
#
# Setup the reloads. We do not reboot the nodes until below.
# Figure out which osload library to use.
#
if (keys(%reloads)) {
foreach my $imageid (keys(%reloads)) {
my @nodes = @{ $reloads{$imageid} };
my @node_ids = map { $_->node_id() } @nodes;
# No wait, no reboot. reload runs completely in the background.
system("$OSLOAD -s -r -m $imageid @node_ids");
if ($?) {
$msg .= "Failed to setup reload: $imageid on @node_ids";
goto bad;
}
if (1 || EmulabFeatures->FeatureEnabled("NewOsload",
GeniUtil::GeniUser(),
undef, undef)) {
require libosload_new;
$osload_object = libosload_new->New();
$osload_object->debug(1);
}
}
......@@ -1568,6 +1568,68 @@ sub Action($$$;$)
goto bad;
}
}
#
# Setup the reloads. We do not reboot the nodes until below.
# Here we do it the old way, new way is below.
#
if (keys(%reloads) && !defined($osload_object)) {
foreach my $imageid (keys(%reloads)) {
my @nodes = @{ $reloads{$imageid} };
my @node_ids = map { $_->node_id() } @nodes;
# No wait, no reboot. reload runs completely in the background.
system("$OSLOAD -s -r -m $imageid @node_ids");
if ($?) {
$msg .= "Failed to setup reload: $imageid on @node_ids";
goto bad;
}
}
}
#
# Ug, if we have reloads then the library is going to fork off
# some children that we have to wait for. But since we cannot wait
# for grandchildren processes, we cannot do the wrapperfork later in
# WaitForNodes() cause then the parent of the osload children exits,
# and the WaitForNodes is now running once-removed from the osload
# children.
#
# So wrapperfork() here and return. This does not substantially change
# things, since most callers have already wrapperforked and returned to
# the client, who are polling for status.
#
my $childpid = main::WrapperFork();
if ($childpid) {
return 0;
}
#
# Setup the reloads. We do not reboot the nodes until below.
# This is the new way.
#
if (keys(%reloads) && defined($osload_object)) {
foreach my $imageid (keys(%reloads)) {
my @nodes = @{ $reloads{$imageid} };
my ($pid, $waiter, $failhash) =
StartOsLoad($osload_object, $imageid, @nodes);
#
# If we have fired off some of the reloads, we cannot just
# return since we have children running.
#
if ($pid > 0) {
push(@reload_children, [$pid, $waiter, $failhash]);
next;
}
#
# If we have failures, call it quits now.
# The children are killed below.
#
$msg .= "Failed to setup reload: $imageid on @nodes";
goto bad;
}
}
#
# Then power on any physical nodes that had been stopped.
......@@ -1633,13 +1695,10 @@ sub Action($$$;$)
if (ref($sliver) eq "GeniSliver::Node");
}
my @failed = ();
my $childpid = $self->WaitForNodes(\@failed, @waitpnodes, @waitvnodes);
# Parent returns
return 0
if ($childpid > 0);
return -1
if ($childpid < 0);
if ($self->WaitForNodes(\@failed, $osload_object, \@reload_children,
@waitpnodes, @waitvnodes));
# Waiting is done.
if ($experiment->elab_in_elab()) {
#
......@@ -1685,6 +1744,14 @@ sub Action($$$;$)
return 0;
bad:
# Only for the new way of loading. Kill (and reap) every osload child
# that was already started, so none of them outlives this failed attempt.
while (@reload_children) {
    # Entries are [pid, waiter, failhash]; only the pid is needed here.
    my ($pid) = @{ pop(@reload_children) };

    if (defined($osload_object)) {
	# This must be the osload child's pid. $childpid is the
	# WrapperFork() result, which is false (0) in the process that
	# gets here with children queued; signalling pid 0 would hit our
	# own process group instead of the reload child.
	$osload_object->osload_kill($pid);
    }
}
$self->SetBootFailure();
if (defined($msg)) {
$self->SetErrorLog($msg);
......@@ -1697,12 +1764,45 @@ sub Action($$$;$)
return -1;
}
#
# Kick off an asynchronous image reload for a set of nodes, the same way
# os_setup does. Used with libosload_new only.
#
# Arguments: the osload object, the image id to load, and the Node
# objects to load it on.
#
# Returns a three element list:
#   - whatever osload() returned (the caller treats a value > 0 as the
#     pid of the child doing the reload),
#   - a code reference that takes that pid and calls osload_wait() on it,
#   - the hash reference that was handed to osload() to record failures.
#
sub StartOsLoad($$@)
{
    my ($loadobj, $imageid, @nodes) = @_;

    # The osload library gets ids, not Node objects.
    my @node_ids = map { $_->node_id() } @nodes;

    # Passed to osload() as its second argument; the caller reads the keys
    # of this hash as the ids of the nodes whose reload failed.
    my $failures = {};

    # No reboot here and run in the background; the caller reboots the
    # nodes and waits for the reload children later on.
    my %args = (
	'debug'     => 1,
	'noreboot'  => 1,
	'asyncmode' => 1,
	'imageid'   => $imageid,
	'nodelist'  => \@node_ids,
    );

    #
    # New osload library.
    #
    my $pid = $loadobj->osload(\%args, $failures);

    # Closure over the load object so the caller only needs the pid.
    my $waiter = sub {
	my ($kidpid) = @_;

	return $loadobj->osload_wait($kidpid);
    };

    return ($pid, $waiter, $failures);
}
#
# Wait for nodes
#
sub WaitForNodes($$@)
sub WaitForNodes($$$$@)
{
my ($self, $pfailed, @nodes) = @_;
my ($self, $pfailed, $osload_object, $reload_children, @nodes) = @_;
my %nodes = ();
my @waitstates = (TBDB_NODESTATE_TBFAILED, TBDB_NODESTATE_RELOADFAILED,
TBDB_NODESTATE_ISUP);
......@@ -1734,14 +1834,7 @@ sub WaitForNodes($$@)
return -1;
}
#
# At this point we want to return and let the startsliver proceed
# in the background.
#
my $mypid = main::WrapperFork();
if ($mypid) {
return $mypid;
}
# We are now a monitor.
$slice->SetMonitorPid($PID);
#
......@@ -1773,8 +1866,15 @@ sub WaitForNodes($$@)
}
# Set the waitmax time for each node.
foreach my $node (@nodes) {
$node->_maxwait(1000 + ($node->_reloaded() ? 600 : 0));
$node->_maxwait(1000);
#
# If using the old osload method, we have to tack on time to wait
# for the osload to finish. The old method is the one in use when no
# osload object was created; with the new library the reload children
# are waited for separately, before the boot wait starts.
#
if (!defined($osload_object) && $node->_reloaded()) {
    $node->_maxwait($node->_maxwait() + 600);
}
if ($node->isvirtnode()) {
#
# Bump waitime according to number of virtnodes on each physnode.
......@@ -1813,6 +1913,34 @@ sub WaitForNodes($$@)
}
}
#
# First thing we do is wait for the reloads to finish. This tells us
# which nodes not to wait for below cause the reloads failed.
#
if (@{ $reload_children }) {
    # Work on a copy so the caller's list is left untouched.
    my @children = @{ $reload_children };

    while (@children) {
	# Entries are [pid, waiter coderef, failure hash].
	my ($pid, $waitfunc, $failhash) = @{ pop(@children) };

	# A false (zero) return from the waiter is treated as success.
	next
	    if (! $waitfunc->($pid));

	#
	# Failure. Remove the failed nodes from the wait list.
	#
	foreach my $node_id (keys(%{ $failhash })) {
	    my $node = $nodes{$node_id};

	    # A failed node that is not in our wait list has nothing to
	    # mark or remove; do not die on it in the monitor.
	    if (!defined($node)) {
		print STDERR "*** WaitForNodes: reload failed on $node_id, ".
		    "which is not in the wait list\n";
		next;
	    }
	    $node->_bootstatus("reloadfail");
	    $node->_sliver()->SetStatus("failed")
		if (defined($node->_sliver()));
	    delete($nodes{$node_id});
	}
    }
}
#
# Start a counter going, relative to the time we rebooted the first
# node.
......
#!/usr/bin/perl -wT
#
# Copyright (c) 2000-2016 University of Utah and the Flux Group.
# Copyright (c) 2000-2017 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -1440,6 +1440,16 @@ sub osload_wait($)
return $? >> 8;
}
#
# Terminate an osload child process and reap it.
#
# Sends SIGTERM to the given pid and then blocks in waitpid() until that
# child has been collected.
#
# Returns 0 once the child has been signalled and waited for, or -1 if
# the pid is missing or not a positive integer, in which case nothing is
# signalled.
#
sub osload_kill($)
{
    my ($childpid) = @_;

    #
    # Never hand 0 or a negative number to kill(); those address whole
    # process groups (pid 0 is our own group) rather than one child.
    #
    if (!defined($childpid) || $childpid !~ /^\d+$/ || $childpid <= 0) {
	print STDERR "*** osload_kill: invalid child pid; not killing\n";
	return -1;
    }

    print STDERR "osload_kill($childpid): starting\n";
    kill('TERM', $childpid);
    waitpid($childpid, 0);
    return 0;
}
#
# Save signature files and boot partition info for all nodes in an experiment
# (or just the listed nodes). We call this when swapping in an experiment or
......
#!/usr/bin/perl -wT
#
# Copyright (c) 2000-2016 University of Utah and the Flux Group.
# Copyright (c) 2000-2017 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -40,8 +40,10 @@ use libreboot;
use libtblog;
use Node;
use NodeType;
use Interface;
use OSImage;
use User;
use Blockstore; # For ConvertToMebi();
use EmulabConstants;
use English;
use event;
......@@ -116,7 +118,7 @@ sub New($$$;@)
bless($self, $class);
# We need this guy right away!
$self->debug($DEFAULT_DEBUG_LEVEL);
# $self->debug($DEFAULT_DEBUG_LEVEL);
return $self;
}
......@@ -420,6 +422,8 @@ sub osload($$$) {
my @imageids;
my @nodes = ();
my %nodeflags = ();
# For frisbee events, which come with IP.
my %iptonode = ();
# Locals
my $mereuser = 0;
......@@ -592,6 +596,21 @@ sub osload($$$) {
goto failednode;
}
# For frisbee events.
my $interface = Interface->LookupControl($nodeobject);
if (!defined($interface)) {
print STDERR "*** No control interface for $node\n";
}
else {
my $ctrlip = $interface->IP();
if (!defined($ctrlip) || $ctrlip eq "") {
print STDERR "*** No control IP for $node\n";
}
else {
$iptonode{$ctrlip} = $node;
}
}
#
# Look for type specific module first.
#
......@@ -811,17 +830,37 @@ sub osload($$$) {
my $node_id = event_notification_get_objname($handle,$notification);
my $event = event_notification_get_eventtype($handle,$notification);
my $objtype = event_notification_get_objtype($handle,$notification);
return
if (!defined($eventnodes));
$eventnodes->{'GOTONE'} = 1;
# These events have IP instead of node_id.
if ($objtype eq "FRISBEESTATUS") {
return
if (!exists($iptonode{$node_id}));
$node_id = $iptonode{$node_id};
}
if (exists($eventnodes->{$node_id})) {
my $et = time();
if ($self->debug()) {
print "$self: eventhandler: $node_id => $event @ $et\n"
print STDERR "$self: eventhandler: $node_id => $event @ $et\n"
}
$eventnodes->{$node_id} = $et;
if ($objtype eq "FRISBEESTATUS") {
my $nodeobject = $self->node($node_id);
my $wbytes = event_notification_get_string($handle,
$notification,
"BYTES_WRITTEN");
$self->nodeinfo($nodeobject, 'frisbeestatus',
{"when" => $et,
"wbytes" => $wbytes,
"image" => $event});
}
}
};
my $evhandle = $self->SetupEventHandler($handler);
......@@ -834,6 +873,13 @@ sub osload($$$) {
goto done;
}
# Prime the event handler above.
$eventnodes = {} if (!defined($eventnodes));
foreach my $node (@nodes) {
$eventnodes->{$node} = 0;
}
$self->dprintts("event handler enabled");
while (@nodes) {
my ($reboot_nodes, $noreboot_nodes)
= $self->GetNodesRequiringReboot(@nodes);
......@@ -842,13 +888,6 @@ sub osload($$$) {
print "$self: ".
"Issuing reboot for @$reboot_nodes and then waiting ...\n";
# Prime the event handler above.
$eventnodes = {} if (!defined($eventnodes));
foreach my $node (@$reboot_nodes) {
$eventnodes->{$node} = 0;
}
$self->dprintts("event handler enabled");
my %reboot_args = ();
my %reboot_failures = ();
......@@ -870,6 +909,8 @@ sub osload($$$) {
}
else {
push(@temp, $node);
# Mark node as being rebooted for waitloop
$self->nodeinfo($self->node($node), 'rebooted',1);
}
}
@nodes = (@temp,@$noreboot_nodes);
......@@ -1153,47 +1194,99 @@ sub WaitTillReloadDone($$$$$@)
$waittime = time - $startwait;
# If the node doesn't make a transition within $REBOOTWAIT
# minutes of booting, we declare it stuck.
# minutes of booting, we declare it stuck.
my $isstuck = ($minutes > $REBOOTWAIT &&
exists($eventnodes->{$node}) &&
$self->nodeinfo($nodeobject, 'rebooted') &&
$eventnodes->{$node} == 0);
#
# But, if we are in reloading, then we obviously missed
# a state transition in our handler. Probably just
# need to increase $MAXEVENTS above.
#
if ($isstuck &&
$eventstate eq TBDB_NODESTATE_RELOADING()) {
tbnotice("$self: missed state transition to RELOADING".
" for $node; faking it.");
$eventnodes->{$node} = time();
goto okay;
}
#
# Another form of being stuck is no frisbee events for
# too long. But only if we are getting frisbee events
# and only if we are still in the RELOADING state.
#
if (! $isstuck &&
$eventstate eq TBDB_NODESTATE_RELOADING()) {
my $frisbeestatus =
$self->nodeinfo($nodeobject, "frisbeestatus");
if (defined($frisbeestatus)) {
my $lastevent = $frisbeestatus->{'when'};
my $diff = time() - $lastevent;
if ($diff > 180) {
$isstuck = 1;
print STDERR "$self: $node: ".
"Frisbee events have stopped coming in.\n"
if ($self->debug());
}
}
}
if ($waittime > $maxwait ||
$eventstate eq TBDB_NODESTATE_TBFAILED() ||
$eventstate eq TBDB_NODESTATE_PXEFAILED() ||
$eventstate eq TBDB_NODESTATE_RELOADFAILED() ||
$isstuck) {
#
# If we are in reloading, then we obviously missed
# a state transition in our handler. Probably just
# need to increase $MAXEVENTS above.
#
if ($isstuck &&
$eventstate eq TBDB_NODESTATE_RELOADING()) {
tbnotice("$self: missed state transition to RELOADING".
" for $node; faking it.");
$eventnodes->{$node} = time();
goto okay;
}
my $t = (int ($waittime / 60));
tbnotice "$self: $node appears wedged; ".
"it has been $t minutes since it was rebooted.";
my $msg = "$self: $node appears wedged; ".
"it has been $t minutes since it was rebooted.\n";
if ($eventstate eq TBDB_NODESTATE_TBFAILED() ||
$eventstate eq TBDB_NODESTATE_RELOADFAILED() ||
$eventstate eq TBDB_NODESTATE_PXEFAILED()) {
tbnotice("$self: $node is stuck in $eventstate.");
}
elsif ($eventstate eq TBDB_NODESTATE_RELOADING()) {
tbnotice("$self: $node did not finish reloading.");
$msg .= "$self: $node is stuck in $eventstate.";
}
elsif ($isstuck) {
tbnotice("$self: $node failed to make a state ".
"transition after $REBOOTWAIT minutes; ".
"stuck in $eventstate.");
if ($eventstate eq TBDB_NODESTATE_RELOADING()) {
$msg .= "$self: $node has not made any ".
"frisbee progress in a long time.";
}
else {
$msg .= "$self: $node failed to make a state ".
"transition after $REBOOTWAIT minutes; ".
"stuck in $eventstate.";
}
}
else {
#
# We have waited longer than maxwait. If we are
# getting frisbee events then we ignore that limit
# since that is soooo old school. Now frisbee tells
# us it is making progress, so we wait until there
# have been no frisbee events (no progress) for too
# long, and then call it stuck.
#
my $frisbeestatus =
$self->nodeinfo($nodeobject, "frisbeestatus");
if (defined($frisbeestatus)) {
my $lastevent = $frisbeestatus->{'when'};
my $diff = time() - $lastevent;
if ($diff < 120) {
print STDERR "$self: $node: ".
"waiting for another frisbee event.\n"
if ($self->debug());
goto okay;
}
print STDERR "$self: $node: ".
"Frisbee events have stopped coming in.\n"
if ($self->debug());
}
$msg .= "$self: $node did not finish reloading.";
}
tbnotice($msg);
TBNodeConsoleTail($node, *STDERR);
$count--;
......@@ -1205,7 +1298,16 @@ sub WaitTillReloadDone($$$$$@)
if (int($waittime / 60) > $minutes) {
$minutes = int($waittime / 60);
print STDERR "$self ($node): still waiting; ".
"it has been $minutes minute(s)\n";
"it has been $minutes minute(s). ";
my $frisbeestatus =
$self->nodeinfo($nodeobject, "frisbeestatus");
if (defined($frisbeestatus)) {
my $wbytes = $frisbeestatus->{"wbytes"};
my $mebi = Blockstore::ConvertToMebi("${wbytes}B");
print STDERR "$mebi MiB written.";
}
print STDERR "\n";
}
}
}
......@@ -1349,6 +1451,17 @@ sub osload_wait($$)
return $retval;
}
#
# Terminate an asynchronous osload child process and reap it.
#
# Sends SIGTERM to the given pid and then blocks in waitpid() until that
# child has been collected, logging start and finish via dprintts().
#
# Returns 0 once the child has been signalled and waited for, or -1 if
# the pid is missing or not a positive integer, in which case nothing is
# signalled.
#
sub osload_kill($$)
{
    my ($self,$childpid) = @_;

    #
    # Never hand 0 or a negative number to kill(); those address whole
    # process groups (pid 0 is our own group) rather than one child.
    #
    if (!defined($childpid) || $childpid !~ /^\d+$/ || $childpid <= 0) {
	print STDERR "*** osload_kill: invalid child pid; not killing\n";
	return -1;
    }

    $self->dprintts("osload_kill($childpid): starting");
    kill('TERM', $childpid);
    waitpid($childpid, 0);
    $self->dprintts("osload_kill($childpid) finished");
    return 0;
}
#
# Save signature files and boot partition info for all nodes in an experiment
# (or just the listed nodes). We call this when swapping in an experiment or
......@@ -1630,6 +1743,17 @@ sub SetupEventHandler($$)
print STDERR "*** event: Could not subscribe to events\n";
return undef;
}
$tuple = address_tuple_alloc();
if (!$tuple) {
print STDERR "*** event: Could not allocate an address tuple\n";
return undef;
}
%$tuple = (objtype => "FRISBEESTATUS");
if (!event_subscribe($EVhandle, $handler, $tuple)) {
print STDERR "*** event: Could not subscribe to events\n";
return undef;
}
return $EVhandle;
}
......@@ -2578,7 +2702,7 @@ sub ComputeMaxLoadWaitTime($$)
# given how synchronous our scripts are, so give it 8 mins
# for now.
#
if (!$nodeobject->_onsharednode()) {
if (!$nodeobject->OnSharedNode()) {
$maxwait += 8 * 60;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment