Commit 6231f84d authored by Leigh B Stoller

Checkpoint new approach.

parent 5c581a10
This diff is collapsed.
......@@ -258,7 +258,8 @@ foreach my $node (@nodelist) {
#
# Look for type specific module first. Eventually this should be more
# dynamic in how the modules are loaded/defined.
# dynamic in how the modules are loaded/defined, perhaps specified on
# a per-type basis in the DB.
#
my $object = $MyStruct->TypeCache($node);
if (!defined($object)) {
......@@ -278,34 +279,97 @@ foreach my $node (@nodelist) {
$object->AddNode($node);
}
#
# If the above worked, go through and set the OS that should boot,
# as well as any reboots and reconfigs that are needed.
#
foreach my $node (@nodelist) {
my $node_id = $node->node_id();
$MyStruct->SetOS($node);
}
while (1) {
my $objects = $MyStruct->OperationList();
my @volunteers = ();
my @nodes = ();
#
# Now get the operation lists.
#
foreach my $list ($MyStruct->NextOperationList()) {
#
# Do not bother if we got canceled.
#
if (! $MyStruct->canceled()) {
my $canceled = $experiment->canceled();
if ($canceled) {
$MyStruct->canceled($canceled);
tbnotice({cause => 'canceled', severity => SEV_IMMEDIATE,
error => ['cancel_flag']},
"Swap canceled; will terminate os_setup early!");
last;
}
}
#
# Clear the inform lists, since we want to send email in batches
# as things fail.
#
$MyStruct->ClearInformLists();
#
# Go through and ask each one for volunteers.
#
foreach my $object (@{ $objects }) {
my @list = $object->Volunteers();
last
if (! @list);
@nodes = (@nodes, @list);
push(@volunteers, [$object, \@list]);
}
last
if (!@nodes);
#
# Light up the nodes in parallel.
#
my @results = ();
my $coderef = sub {
my ($ref) = @_;
my ($object, $noderef) = @{ $ref };
my @nodelist = @{ $noderef };
print STDERR "Lighting up nodes: @nodelist\n"
if ($debug);
if ($object->LightUpNodes(@nodelist)) {
return -1;
}
return 0;
};
print STDERR "Lighting up nodes in parallel ...\n";
if (ParRun({"maxwaittime" => 99999},
\@results, $coderef, @volunteers)) {
$MyStruct->die_noretry("*** LightUpNodes: Internal error\n");
}
#
# The list is the set that can be done in parallel.
# Check the exit codes. An error at this phase is unusual, and
# we want to turn off retry.
#
foreach my $object (@{ $list }) {
$object->LightUpNodes();
$object->WaitForNodes();
my $errors = 0;
my $count = 0;
foreach my $result (@results) {
my ($object, $noderef) = @{ $volunteers[$count] };
my @nodelist = @{ $noderef };
if ($result != 0) {
print STDERR "*** Error lighting up nodes: @nodelist\n"
if ($debug);
$MyStruct->noretry(1);
#
# Make sure all the nodes are marked as down so that
# we do not wait for them.
#
foreach my $node (@nodelist) {
$node->SetAllocState(TBDB_ALLOCSTATE_DOWN());
}
}
$count++;
}
# And wait.
$MyStruct->WaitForNodes(@nodes);
#
# Fire off email for this batch.
#
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment