Commit 17febf4d authored by Leigh Stoller

Small changes to gracefully catch the experiment cancellation flag and stop the current osload/ossetup.
parent 68103b3b
#!/usr/bin/perl -w
#
# Copyright (c) 2000-2018 University of Utah and the Flux Group.
# Copyright (c) 2000-2019 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -107,6 +107,8 @@ sub New($$$;@)
$self->{'FAILCOUNT'} = 0;
$self->{'TYPECACHE'} = {};
$self->{'TYPEOBJECTS'} = {};
$self->{'USER'} = undef;
$self->{'EXPT'} = undef;
foreach my $node (@nodelist) {
$self->{'NODES'}->{$node->node_id()} = $node;
......@@ -498,6 +500,12 @@ sub osload($$$) {
if (defined($args->{'debug'})) {
$self->debug($args->{'debug'});
}
if (defined($args->{'user'})) {
$self->{"USER"} = $args->{'user'};
}
if (defined($args->{'experiment'})) {
$self->{"EXPT"} = $args->{'experiment'};
}
$self->{FLAGS} = \%flags;
......@@ -859,7 +867,7 @@ sub osload($$$) {
}
if (exists($eventnodes->{$node_id})) {
my $et = time();
if ($self->debug()) {
if ($self->debug() > 1) {
print STDERR "$self: eventhandler: $node_id => $event @ $et\n"
}
$eventnodes->{$node_id} = $et;
......@@ -944,9 +952,17 @@ sub osload($$$) {
my $node = shift(@failednodes);
my $nodeobject = $self->node($node);
my $typeobject = $self->typeobject($nodeobject);
my $retries = $self->nodeinfo($nodeobject,'retries');
my $retries = $self->nodeinfo($nodeobject,'retries');
if ($retries) {
#
# If we have been canceled, do not worry about retry.
#
if (defined($self->experiment()) &&
$self->experiment()->canceled()) {
$result->{$node} = -1;
$typeobject->ReloadDone($nodeobject);
}
elsif ($retries) {
tbnotice "$self ($node): Trying again ...";
# Possible race with reboot?
......@@ -1226,6 +1242,13 @@ sub WaitTillReloadDone($$$$$@)
goto okay;
}
#
# Watch for experiment cancelation in direct mode.
#
my $canceled = ($typewaitstatus &&
defined($self->experiment()) &&
$self->experiment()->canceled());
#
# Another form of being stuck is no frisbee events for
# too long. But only if we are getting frisbee events
......@@ -1251,10 +1274,28 @@ sub WaitTillReloadDone($$$$$@)
$eventstate eq TBDB_NODESTATE_TBFAILED() ||
$eventstate eq TBDB_NODESTATE_PXEFAILED() ||
$eventstate eq TBDB_NODESTATE_RELOADFAILED() ||
$isstuck) {
$isstuck || $canceled) {
my $msg;
if ($canceled) {
#
# In direct mode, watch for experiment cancellation
# and terminate early.
#
# We do not clear the reloads or partitions, in
# case this is a panic situation. We hope that when
# (if) the node comes out of panic, we might pick
# up where things left off. If not a panic situation,
# the reload daemon will clear the partitions and
# reload.
#
$msg = "$self: $node has been stopped because of ".
"experiment cancellation.";
goto nodefailed;
}
my $t = (int ($waittime / 60));
my $msg = "$self: $node appears wedged; ".
$msg = "$self: $node appears wedged; ".
"it has been $t minutes since it was rebooted.\n";
if ($eventstate eq TBDB_NODESTATE_TBFAILED() ||
......@@ -1299,8 +1340,9 @@ sub WaitTillReloadDone($$$$$@)
}
$msg .= "$self: $node did not finish reloading.";
}
nodefailed:
tbnotice($msg);
TBNodeConsoleTail($node, *STDERR);
TBNodeConsoleTail($node, *STDERR) if (!$canceled);
$count--;
$done{$node} = $waitmode;
......
#!/usr/bin/perl -w
#
# Copyright (c) 2000-2018 University of Utah and the Flux Group.
# Copyright (c) 2000-2019 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -28,13 +28,13 @@ use Exporter;
use vars qw(@EXPORT $AUTOLOAD
$NOSTATE $RELOAD $RECONFIG $REBOOT
$RELOAD_FAILED $RECONFIG_FAILED $REBOOT_FAILED $SETUP_FAILED
$SETUP_OKAY);
$SETUP_OKAY $SETUP_CANCELED);
use base qw( Exporter );
@EXPORT = qw(&die_noretry
$NOSTATE $RELOAD $RECONFIG $REBOOT
$RELOAD_FAILED $RECONFIG_FAILED $REBOOT_FAILED $SETUP_FAILED
$SETUP_OKAY);
$SETUP_OKAY $SETUP_CANCELED);
use libdb;
use libtestbed;
......@@ -69,6 +69,7 @@ $RELOAD_FAILED = 0x01;
$RECONFIG_FAILED = 0x02;
$REBOOT_FAILED = 0x04;
$SETUP_FAILED = 0x08;
$SETUP_CANCELED = 0x10;
#
# Used to die with a -1 return code, to indicate to caller (tbswap)
......@@ -683,9 +684,9 @@ sub LightUpNodes($@)
($self->loadobj())->debug($self->debug());
# add a few more things for feature checks down the line:
$reload_args{user} = $self->user();
$reload_args{group} = $self->group();
$reload_args{experiment} = $self->experiment();
$reload_args{'user'} = $self->user();
$reload_args{'group'} = $self->group();
$reload_args{'experiment'} = $self->experiment();
$pid = ($self->loadobj())->osload(\%reload_args, $reload_failures);
$coderef = sub {
......@@ -882,6 +883,17 @@ sub WaitForNodes($@)
my $state;
my $state_timestamp;
#
# Terminate the remaining nodes early ...
#
if ($canceled) {
print STDERR "$node_id: not waiting cause of cancellation.\n";
$node->_setupstatus($SETUP_CANCELED);
delete($nodes{$node_id});
$typehandler->WaitDone($node);
next;
}
#
# Call typehandler specific wait function;
#
......@@ -1476,8 +1488,7 @@ sub WaitDone($@)
tbnotice("could not force into PXEWAIT; failing\n");
goto bad;
}
# clear the reload info
$node->ClearCurrentReload();
# and partitions
$node->ClearPartitions();
......@@ -1503,6 +1514,10 @@ sub WaitDone($@)
next;
}
}
elsif ($setupstatus == $libossetup::SETUP_CANCELED) {
# Skip the stuff below, not needed.
next;
}
bad:
#
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment