From fbc26201ed7254b9de7c75ca43267c8f098e894d Mon Sep 17 00:00:00 2001 From: Leigh B Stoller Date: Mon, 4 May 2015 18:20:06 -0600 Subject: [PATCH] Bug Fix: my previous attempt to catch failed XEN guests via a state machine change had some problems. --- clientside/tmcc/linux/xen/libvnode_xen.pm | 19 +++++++++---------- db/EmulabConstants.pm.in | 3 ++- db/Node.pm.in | 13 +++++++++---- protogeni/lib/GeniAggregate.pm.in | 5 +++-- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/clientside/tmcc/linux/xen/libvnode_xen.pm b/clientside/tmcc/linux/xen/libvnode_xen.pm index 778e86709..9a2b8dc55 100644 --- a/clientside/tmcc/linux/xen/libvnode_xen.pm +++ b/clientside/tmcc/linux/xen/libvnode_xen.pm @@ -2298,16 +2298,6 @@ sub vnodeBoot($$$$) # stated to do its thing, this state name is treated specially. libutil::setState("BOOTING"); - # - # But, we find ourselves stuck in BOOTING quite often if the VM - # fails to boot far enough to to send in a state transition. We want - # to catch this specific hangup, so we will send an intermediate - # state that the server side can notice, and watch for how long it - # stays in the state. - # - sleep(1); - libutil::setState("VNODEBOOTSTART"); - # # We are going to watch for a busted control network interface, which # happens a lot. There is a problem with the control vif not working, @@ -2350,6 +2340,15 @@ sub vnodeBoot($$$$) # Ping returns zero if any packets received. if (! $?) { TBDebugTimeStamp("Created virtual machine $vnode_id"); + # + # But, we still find ourselves stuck in BOOTING quite + # often if the VM fails to boot far enough to send + # in a state transition. We want to catch this + # specific hangup, so we will send an intermediate + # state that the server side can notice, and watch for + # how long it stays in the state. 
+ # + libutil::setState("VNODEBOOTSTART"); return 0; } $countdown--; diff --git a/db/EmulabConstants.pm.in b/db/EmulabConstants.pm.in index e958625ba..7d33d33c7 100644 --- a/db/EmulabConstants.pm.in +++ b/db/EmulabConstants.pm.in @@ -141,7 +141,7 @@ use vars qw(@ISA @EXPORT); TBDB_NODEOPMODE_DELAY TBDB_NODEOPMODE_BOOTWHAT TBDB_NODEOPMODE_ANY - TBDB_NODEOPMODE_UNKNOWN + TBDB_NODEOPMODE_UNKNOWN TBDB_NODEOPMODE_NORMALv2 TBDB_COMMAND_REBOOT TBDB_COMMAND_POWEROFF TBDB_COMMAND_POWERON TBDB_COMMAND_POWERCYCLE @@ -511,6 +511,7 @@ sub TBDB_NODEOPMODE_DELAYING { "DELAYING"; } sub TBDB_NODEOPMODE_UNKNOWNOS { "UNKNOWNOS"; } sub TBDB_NODEOPMODE_RELOADING { "RELOADING"; } sub TBDB_NODEOPMODE_NORMALv1 { "NORMALv1"; } +sub TBDB_NODEOPMODE_NORMALv2 { "NORMALv2"; } sub TBDB_NODEOPMODE_MINIMAL { "MINIMAL"; } sub TBDB_NODEOPMODE_PCVM { "PCVM"; } sub TBDB_NODEOPMODE_RELOAD { "RELOAD"; } diff --git a/db/Node.pm.in b/db/Node.pm.in index d26ffde1b..0f53beb9d 100755 --- a/db/Node.pm.in +++ b/db/Node.pm.in @@ -928,23 +928,28 @@ sub GetAllocState($$) # # We do this cause we always want to go to the DB. # -sub GetEventState($$) +sub GetEventState($$;$) { - my ($self, $pstate) = @_; + my ($self, $pstate, $popmode) = @_; my $node_id = $self->node_id(); my $query_result = - DBQueryWarn("select eventstate from nodes where node_id='$node_id'"); + DBQueryWarn("select eventstate,op_mode from nodes ". 
+ "where node_id='$node_id'"); return -1 if (!$query_result || !$query_result->numrows); - my ($state) = $query_result->fetchrow_array(); + my ($state,$op_mode) = $query_result->fetchrow_array(); $state = TBDB_NODESTATE_UNKNOWN if (!defined($state)); + $op_mode = TBDB_NODEOPMODE_UNKNOWN + if (!defined($op_mode)); $self->{'DBROW'}->{'eventstate'} = $state if (defined($self->{'DBROW'})); $$pstate = $state; + $$popmode= $op_mode + if (defined($popmode)); return 0; } diff --git a/protogeni/lib/GeniAggregate.pm.in b/protogeni/lib/GeniAggregate.pm.in index 9df46ca8f..cb27330e0 100755 --- a/protogeni/lib/GeniAggregate.pm.in +++ b/protogeni/lib/GeniAggregate.pm.in @@ -1771,9 +1771,9 @@ sub WaitForNodes($$@) } $node->_waitstart($parent->_waitend()); } - my $state; + my ($state,$op_mode); - if ($node->GetEventState(\$state)) { + if ($node->GetEventState(\$state, \$op_mode)) { print STDERR "*** Error getting event state for $node_id.\n"; $node->_sliver()->SetStatus("failed") if (defined($node->_sliver())); @@ -1805,6 +1805,7 @@ sub WaitForNodes($$@) # XEN VMs, a reboot typically solves it. # if ($node->isvirtnode() && + $op_mode eq TBDB_NODEOPMODE_NORMALv2 && $state eq TBDB_NODESTATE_VNODEBOOTSTART && $state eq $node->_laststate() && time() - $node->_laststatestamp() > 180) { -- GitLab