Commit 355abb8f authored by Leigh Stoller

Per Kevin's suggestion, add a check for nodes in an unknown state and move them into hwcheckup.

Make sure nodes are imageable, otherwise straight to hwdown.

Minor fix to Node::MoveReservation().
parent 5cf6aad2
......@@ -798,9 +798,6 @@ sub MoveReservation($$)
return -1
if (! (ref($self) && ref($newexperiment)));
return -1
if (! $self->IsReserved());
my $node_id = $self->node_id();
my $newpid = $newexperiment->pid();
my $neweid = $newexperiment->eid();
......@@ -810,7 +807,7 @@ sub MoveReservation($$)
my $oldidx = 0;
# Must remember old reservation when moving to new oldreserved.
if ($newpid eq OLDRESERVED_PID() && $neweid eq OLDRESERVED_EID()) {
if ($newpid eq OLDRESERVED_PID() && $neweid eq OLDRESERVED_EID()) {
#
# Cannot do an experiment Lookup cause reserved table may be locked.
# IsReserved() will load the reserved table entry only.
......
......@@ -41,6 +41,8 @@ delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
# Protos
sub fatal($);
sub NodeIsDead($);
sub NodeIsOkay($);
#
# Turn off line buffering on output
......@@ -59,7 +61,7 @@ if (! getopts($optlist, \%options)) {
usage();
}
if (defined($options{"d"})) {
$debug = 1;
$debug++;
}
# Load the Testbed support stuff.
......@@ -129,9 +131,43 @@ $SIG{HUP} = \&handler
if (!$debug);
while (1) {
my @informtbopsfatal = ();
my @informtbopswarn = ();
my @tmp = ();
print "Running at ".
POSIX::strftime("20%y-%m-%d %H:%M:%S", localtime()) . "\n";
#
# Look for free nodes stuck in a weird state. Let's test them.
#
# The query selects nodes with role 'testnode' that have no row in the
# reserved table (r.pid is null after the left join), whose eventstate is
# none of ISUP, PXEWAIT, ALWAYSUP or POWEROFF, and whose state_timestamp
# is more than 600 seconds in the past.
#
my $query_result =
    DBQueryWarn("select n.node_id,n.eventstate, ".
		" FROM_UNIXTIME(n.state_timestamp) from nodes as n ".
		"left join reserved as r on r.node_id=n.node_id ".
		"where (n.eventstate!='". TBDB_NODESTATE_ISUP ."' and ".
		" n.eventstate!='". TBDB_NODESTATE_PXEWAIT ."' and ".
		" n.eventstate!='". TBDB_NODESTATE_ALWAYSUP ."' and ".
		" n.eventstate!='". TBDB_NODESTATE_POWEROFF ."') and ".
		" r.pid is null and n.role='testnode' and ".
		" (UNIX_TIMESTAMP(now()) - n.state_timestamp) > 600");
# Query failed; skip the rest of this pass.
# NOTE(review): the "loop" label is outside this view -- presumably it
# sits at the bottom of the enclosing while loop; confirm.
goto loop
    if (!$query_result);

while (my ($nodeid,$eventstate,$stamp) = $query_result->fetchrow_array()) {
    print "Node in unknown state: $nodeid,$eventstate,$stamp\n";

    my $node = Node->Lookup($nodeid);
    if (!defined($node)) {
	print STDERR "Cannot find object for $nodeid\n";
	next;
    }
    #
    # MoveReservation() returns -1 on failure. Do not write a log entry
    # claiming the node was moved when the move did not happen.
    # NOTE(review): $experiment is presumably the hwcheckup experiment
    # (per the log message below); it is set outside this view.
    #
    if ($node->MoveReservation($experiment) < 0) {
	print STDERR "Could not move $nodeid to hwcheckup\n";
	next;
    }
    # NOTE(review): this message carries literal single quotes while the
    # other InsertNodeLogEntry() callers in this file pass none; verify
    # which form InsertNodeLogEntry() expects.
    $node->InsertNodeLogEntry($elabman, TB_DEFAULT_NODELOGTYPE(),
			      "'Moved to hwcheckup by checknodes daemon; ".
			      "stuck in $eventstate since $stamp'");
}
$experiment->Flush();
Node->FlushAll();
......@@ -142,55 +178,58 @@ while (1) {
#
# Weed out nodes we cannot work on. A node whose boot attributes cannot
# be cleared is dropped from this pass; a node that is not imageable is
# handed to NodeIsDead() and reported as fatal. Everything else is kept
# in @nodelist.
#
foreach my $candidate (@nodelist) {
    if ($candidate->ClearBootAttributes()) {
	print STDERR "$candidate: Could not clear boot attributes.\n";
    }
    elsif (! $candidate->imageable()) {
	print STDERR "$candidate is not imageable.\n";
	NodeIsDead($candidate);
	push(@informtbopsfatal, $candidate->node_id());
    }
    else {
	push(@tmp, $candidate);
    }
}
@nodelist = @tmp;
print "Running $GENTOPOFILE ...\n";
if (system("$GENTOPOFILE $pid $eid")) {
print STDERR "$GENTOPOFILE failed\n";
next;
}
print "Running $EXPORTS_SETUP ...\n";
if (system("$EXPORTS_SETUP")) {
print STDERR "$EXPORTS_SETUP failed\n";
next;
}
# The nodes will not boot locally unless there is a DNS record.
print "Running $NAMED_SETUP ...\n";
if (system("$NAMED_SETUP")) {
print STDERR "$NAMED_SETUP failed\n";
next;
}
my @nodenames = map { $_->node_id() } @nodelist;
my %reload_args = ();
my %reload_results = ();
$reload_args{'debug'} = $debug;
$reload_args{'waitmode'} = 2; # XXX Wait till reboot after reload.
$reload_args{'nodelist'} = [ @nodenames ];
print "Running osload on @nodenames\n";
my $failures = osload(\%reload_args, \%reload_results);
if ($failures) {
print STDERR "osload returned $failures failures\n";
}
my @informtbopsfatal = ();
my @informtbopswarn = ();
foreach my $node (@nodelist) {
if ($reload_results{$node->node_id()}) {
push(@informtbopsfatal, $node->node_id());
if (@nodelist) {
print "Running $GENTOPOFILE ...\n";
if (system("$GENTOPOFILE $pid $eid")) {
print STDERR "$GENTOPOFILE failed\n";
next;
}
print "Running $EXPORTS_SETUP ...\n";
if (system("$EXPORTS_SETUP")) {
print STDERR "$EXPORTS_SETUP failed\n";
next;
}
# The nodes will not boot locally unless there is a DNS
# record.
print "Running $NAMED_SETUP ...\n";
if (system("$NAMED_SETUP")) {
print STDERR "$NAMED_SETUP failed\n";
next;
}
my @nodenames = map { $_->node_id() } @nodelist;
my %reload_args = ();
my %reload_results = ();
print STDERR "$node is fatally ill; moving to hwdown.\n";
$node->MarkAsDown();
$node->InsertNodeLogEntry($elabman, TB_DEFAULT_NODELOGTYPE(),
"Moved to hwdown by checknodes daemon");
$reload_args{'debug'} = $debug;
$reload_args{'waitmode'} = 2; # XXX Wait till reboot after reload.
$reload_args{'nodelist'} = [ @nodenames ];
print "Running osload on @nodenames\n";
my $failures = osload(\%reload_args, \%reload_results);
if ($failures) {
print STDERR "osload returned $failures failures\n";
}
else {
push(@informtbopswarn, $node->node_id());
print STDERR "$node appears to be okay; releasing.\n";
$node->InsertNodeLogEntry($elabman, TB_DEFAULT_NODELOGTYPE(),
"Released by checknodes daemon");
foreach my $node (@nodelist) {
if ($reload_results{$node->node_id()}) {
push(@informtbopsfatal, $node->node_id());
NodeIsDead($node);
}
else {
push(@informtbopswarn, $node->node_id());
NodeIsOkay($node);
}
}
}
if (@informtbopsfatal) {
......@@ -238,6 +277,25 @@ while (1) {
}
exit(0);
#
# Report a node as dead: print a notice on STDERR, call MarkAsDown() on
# it, and record a node log entry saying the daemon moved it to hwdown.
#
sub NodeIsDead($)
{
    my $deadnode = shift;

    print STDERR $deadnode . " is fatally ill; moving to hwdown.\n";
    $deadnode->MarkAsDown();

    my $logtype = TB_DEFAULT_NODELOGTYPE();
    $deadnode->InsertNodeLogEntry($elabman, $logtype,
				  "Moved to hwdown by checknodes daemon");
}
#
# Report a node as healthy: print a notice on STDERR and record a node
# log entry saying the daemon released it.
#
sub NodeIsOkay($)
{
    my $goodnode = shift;

    print STDERR $goodnode . " appears to be okay; releasing.\n";

    my $logtype = TB_DEFAULT_NODELOGTYPE();
    $goodnode->InsertNodeLogEntry($elabman, $logtype,
				  "Released by checknodes daemon");
}
sub fatal($)
{
my ($msg) = @_;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment