Commit fbc26201 authored by Leigh B Stoller

Bug Fix: my previous attempt to catch failed XEN guests via a state machine change had some problems.
parent 088ae82c
......@@ -2298,16 +2298,6 @@ sub vnodeBoot($$$$)
# stated to do its thing, this state name is treated specially.
libutil::setState("BOOTING");
#
# But, we find ourselves stuck in BOOTING quite often if the VM
# fails to boot far enough to send in a state transition. We want
# to catch this specific hangup, so we will send an intermediate
# state that the server side can notice, and watch for how long it
# stays in the state.
#
sleep(1);
libutil::setState("VNODEBOOTSTART");
#
# We are going to watch for a busted control network interface, which
# happens a lot. There is a problem with the control vif not working,
......@@ -2350,6 +2340,15 @@ sub vnodeBoot($$$$)
# Ping returns zero if any packets received.
if (! $?) {
TBDebugTimeStamp("Created virtual machine $vnode_id");
#
# But, we still find ourselves stuck in BOOTING quite
# often if the VM fails to boot far enough to send
# in a state transition. We want to catch this
# specific hangup, so we will send an intermediate
# state that the server side can notice, and watch for
# how long it stays in the state.
#
libutil::setState("VNODEBOOTSTART");
return 0;
}
$countdown--;
......
......@@ -141,7 +141,7 @@ use vars qw(@ISA @EXPORT);
TBDB_NODEOPMODE_DELAY
TBDB_NODEOPMODE_BOOTWHAT
TBDB_NODEOPMODE_ANY
TBDB_NODEOPMODE_UNKNOWN
TBDB_NODEOPMODE_UNKNOWN TBDB_NODEOPMODE_NORMALv2
TBDB_COMMAND_REBOOT
TBDB_COMMAND_POWEROFF TBDB_COMMAND_POWERON TBDB_COMMAND_POWERCYCLE
......@@ -511,6 +511,7 @@ sub TBDB_NODEOPMODE_DELAYING { "DELAYING"; }
sub TBDB_NODEOPMODE_UNKNOWNOS { "UNKNOWNOS"; }
sub TBDB_NODEOPMODE_RELOADING { "RELOADING"; }
sub TBDB_NODEOPMODE_NORMALv1 { "NORMALv1"; }
sub TBDB_NODEOPMODE_NORMALv2 { "NORMALv2"; }
sub TBDB_NODEOPMODE_MINIMAL { "MINIMAL"; }
sub TBDB_NODEOPMODE_PCVM { "PCVM"; }
sub TBDB_NODEOPMODE_RELOAD { "RELOAD"; }
......
......@@ -928,23 +928,28 @@ sub GetAllocState($$)
#
# We do this because we always want to go to the DB.
#
sub GetEventState($$)
sub GetEventState($$;$)
{
my ($self, $pstate) = @_;
my ($self, $pstate, $popmode) = @_;
my $node_id = $self->node_id();
my $query_result =
DBQueryWarn("select eventstate from nodes where node_id='$node_id'");
DBQueryWarn("select eventstate,op_mode from nodes ".
"where node_id='$node_id'");
return -1
if (!$query_result || !$query_result->numrows);
my ($state) = $query_result->fetchrow_array();
my ($state,$op_mode) = $query_result->fetchrow_array();
$state = TBDB_NODESTATE_UNKNOWN
if (!defined($state));
$op_mode = TBDB_NODEOPMODE_UNKNOWN
if (!defined($op_mode));
$self->{'DBROW'}->{'eventstate'} = $state
if (defined($self->{'DBROW'}));
$$pstate = $state;
$$popmode= $op_mode
if (defined($popmode));
return 0;
}
......
......@@ -1771,9 +1771,9 @@ sub WaitForNodes($$@)
}
$node->_waitstart($parent->_waitend());
}
my $state;
my ($state,$op_mode);
if ($node->GetEventState(\$state)) {
if ($node->GetEventState(\$state, \$op_mode)) {
print STDERR "*** Error getting event state for $node_id.\n";
$node->_sliver()->SetStatus("failed")
if (defined($node->_sliver()));
......@@ -1805,6 +1805,7 @@ sub WaitForNodes($$@)
# XEN VMs, a reboot typically solves it.
#
if ($node->isvirtnode() &&
$op_mode eq TBDB_NODEOPMODE_NORMALv2 &&
$state eq TBDB_NODESTATE_VNODEBOOTSTART &&
$state eq $node->_laststate() &&
time() - $node->_laststatestamp() > 180) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment