Commit 7f90bc5c authored by Leigh Stoller

Stated headaches.

parent 8c991ba5
...@@ -74,6 +74,10 @@ waitmode() ...@@ -74,6 +74,10 @@ waitmode()
{ {
echo "Waiting for something to do" echo "Waiting for something to do"
# This tells stated we are in the PXEKERNEL state machine.
$TMCC state PXEBOOTING
sleep 1
# #
# We use bootinfoclient. If we get a reboot command do that, # We use bootinfoclient. If we get a reboot command do that,
# otherwise we just return to query bootwhat again. # otherwise we just return to query bootwhat again.
...@@ -104,18 +108,16 @@ reload_nos() ...@@ -104,18 +108,16 @@ reload_nos()
{ {
echo "Setting up to reload the NOS" echo "Setting up to reload the NOS"
# Tell boss we are booting. # Tell boss we are booting into reload MFS.
$TMCC state BOOTING $TMCC state BOOTING
$BINDIR/rc.reload $BINDIR/rc.reload
rc=$? rc=$?
if [ $rc -ne 0 ]; then if [ $rc -ne 0 ]; then
echo "Dropping into admin shell" echo "Dropping into the shell after failed reload"
return 1 exit 1
fi fi
boot_nos return 0
# Should not return
return 1;
} }
# #
...@@ -124,7 +126,7 @@ reload_nos() ...@@ -124,7 +126,7 @@ reload_nos()
boot_nos() boot_nos()
{ {
echo "Setting up to boot the NOS" echo "Setting up to boot the NOS"
if [ ! -s $EMULABENV ]; then if [ ! -s $EMULABENV ]; then
grub-editenv $EMULABENV create grub-editenv $EMULABENV create
rc=$? rc=$?
...@@ -139,6 +141,10 @@ boot_nos() ...@@ -139,6 +141,10 @@ boot_nos()
echo "Failed to update grub env with bootnos=yes" echo "Failed to update grub env with bootnos=yes"
return 1 return 1
fi fi
# Tell boss we are booting into the NOS.
$TMCC state BOOTING
sleep 5
echo "Rebooting into the NOS" echo "Rebooting into the NOS"
/sbin/reboot /sbin/reboot
exit 0; exit 0;
...@@ -176,8 +182,8 @@ while : ; do ...@@ -176,8 +182,8 @@ while : ; do
;; ;;
*onie_reload) *onie_reload)
reload_nos reload_nos
# Does not return # We loop again to see what we do next. If we got here by the
exit 1 # reload daemon, we are probably going into waitmode.
;; ;;
*onie_admin) *onie_admin)
echo "Dropping into admin mode" echo "Dropping into admin mode"
......
...@@ -394,6 +394,7 @@ sub Reboot($$) ...@@ -394,6 +394,7 @@ sub Reboot($$)
$self->dprint(0, "$self RebootNOS($node_id): error: '$error'\n"); $self->dprint(0, "$self RebootNOS($node_id): error: '$error'\n");
goto reboot; goto reboot;
} }
TBSetNodeEventState($node_id, TBDB_NODESTATE_SHUTDOWN);
return 0; return 0;
reboot: reboot:
......
...@@ -602,6 +602,13 @@ sub Reconfigure($$;$) ...@@ -602,6 +602,13 @@ sub Reconfigure($$;$)
return -1; return -1;
} }
$self->dprint(2,"$self: Reconfigure($node_id): ".
"setting state to SHUTDOWN");
# Need to do this before we return to ossetup, since it is looking
# for a transition to ISUP to know when the Reconfigure is done.
TBSetNodeEventState($node_id, TBDB_NODESTATE_SHUTDOWN);
# #
# Reload can't block, so fork and make a note of ourself! # Reload can't block, so fork and make a note of ourself!
# #
......
...@@ -2013,19 +2013,11 @@ sub SetBootOS($$) ...@@ -2013,19 +2013,11 @@ sub SetBootOS($$)
# last image # last image
# #
my $image = $images[-1]; my $image = $images[-1];
my $imageid = $image->imageid();
my $defosid = $image->default_osid(); print "$self SetBootOS($node_id): changing default OS to $image\n";
my $osimage = OSImage->Lookup($defosid);
if (!defined($osimage)) {
tberror("$self SetBootOS($node_id): could not map OSID $defosid to its object!");
return -1;
}
print "$self SetBootOS($node_id): changing default OS to $osimage\n";
if (!$TESTMODE) { if (!$TESTMODE) {
if ($nodeobject->OSSelect($image,"def_boot_osid",$self->debug())) { if ($nodeobject->OSSelect($image,"def_boot_osid",$self->debug())) {
tberror "$self SetBootOS($node_id): os_select $defosid failed!"; tberror "$self SetBootOS($node_id): os_select $image failed!";
return -1; return -1;
} }
} }
...@@ -2039,7 +2031,7 @@ sub SetBootOS($$) ...@@ -2039,7 +2031,7 @@ sub SetBootOS($$)
$ocmdline = $nodeobject->def_boot_cmd_line(); $ocmdline = $nodeobject->def_boot_cmd_line();
$ocmdline = "" $ocmdline = ""
if (!defined($ocmdline)); if (!defined($ocmdline));
$osimage->OSBootCmd("delay", \$ncmdline); $image->OSBootCmd("delay", \$ncmdline);
$ncmdline = "" $ncmdline = ""
if (!defined($ncmdline)); if (!defined($ncmdline));
if ($ocmdline ne $ncmdline) { if ($ocmdline ne $ncmdline) {
......
...@@ -36,6 +36,7 @@ my $TBOPS = "@TBOPSEMAIL@"; ...@@ -36,6 +36,7 @@ my $TBOPS = "@TBOPSEMAIL@";
my $OURDOMAIN = "@OURDOMAIN@"; my $OURDOMAIN = "@OURDOMAIN@";
my $PING = "/sbin/ping"; my $PING = "/sbin/ping";
my $NETCAT = "/usr/local/bin/netcat"; my $NETCAT = "/usr/local/bin/netcat";
my $BISEND = "$TB/sbin/bootinfosend";
my $EXPECT_CONN_TIMEOUT = 10; my $EXPECT_CONN_TIMEOUT = 10;
...@@ -375,10 +376,25 @@ sub Reconfigure($$$) ...@@ -375,10 +376,25 @@ sub Reconfigure($$$)
{ {
my ($self, $nodeobject, $dowait) = @_; my ($self, $nodeobject, $dowait) = @_;
my $node_id = $nodeobject->node_id(); my $node_id = $nodeobject->node_id();
my $reconfig_only = 0;
my $running_onie = 0;
my $retval; my $retval;
$self->dprint(0, "$self: Reconfigure($node_id): starting, dowait:$dowait"); $self->dprint(0, "$self: Reconfigure($node_id): starting, dowait:$dowait");
# os_setup is doing only a reconfig.
if (!$dowait && !$self->nodeflag($nodeobject, 'reconfig_will_follow')) {
$reconfig_only = 1;
#
# See if the switch is in PXEWAIT, which says we can reboot it
# with node_reboot directly, which will tell ONIE to query.
#
$nodeobject->Refresh();
if ($nodeobject->eventstate() eq TBDB_NODESTATE_PXEWAIT()) {
$running_onie = 1;
}
}
# #
# For now, we allow Reconfigure to block -- it will be called from Reload # For now, we allow Reconfigure to block -- it will be called from Reload
# or from ossetup, ossetup says do not wait. # or from ossetup, ossetup says do not wait.
...@@ -406,6 +422,73 @@ sub Reconfigure($$$) ...@@ -406,6 +422,73 @@ sub Reconfigure($$$)
return 0; return 0;
} }
} }
#
# Again, $dowait is our indicator we are called from os_setup.
# There is a bit of a mismatch between what libossetup_switch
# does and what we expect; the switch should be in the ONIE
# MFS at this point, so we need to reboot it and wait for it
# to come back online before we can actually reconfig.
#
# If this is a reconfig after reload from os_setup, then the
# switch is already in the NOS and ready for us.
#
# Need to clean this up, but going to wait until MLNX support.
#
if ($reconfig_only) {
if ($running_onie) {
system("$BISEND -q $node_id");
if ($?) {
goto failed;
}
}
elsif ($self->Reboot($nodeobject)) {
goto failed;
}
#
# Now we are waiting for reboot to complete and the switch to come back
# online. Should be very quick, although there will be some delay before
# DHCP finishes and we can ssh over.
#
my $seconds = 180;
$self->dprint(0,"Reconfigure($node_id): ".
"waiting $seconds seconds for ping");
# Need time for node to actually reboot;
sleep(30);
$seconds -= 30;
while ($seconds >= 0) {
sleep(15);
$seconds -= 15;
last
if ($self->Pingable($nodeobject));
}
if ($seconds < 0) {
tbwarn "$self Reconfigure($node_id): timed out waiting for ping\n";
goto failed;
}
#
# Now we wait for sshd to come online.
#
$seconds = 180;
$self->dprint(0,"Reconfigure($node_id): ".
"waiting $seconds seconds for sshd");
while ($seconds >= 0) {
sleep(15);
$seconds -= 15;
system("$NETCAT -z -w 3 $node_id 22 > /dev/null ");
if ($? == 0) {
last
}
}
if ($seconds < 0) {
tbwarn "$self Reconfigure($node_id): timed out waiting for sshd\n";
goto failed;
}
}
# #
# Hand this off to the device dependent library. Which is actually # Hand this off to the device dependent library. Which is actually
...@@ -415,7 +498,7 @@ sub Reconfigure($$$) ...@@ -415,7 +498,7 @@ sub Reconfigure($$$)
goto failed; goto failed;
} }
# Signal ossetup that we are done with reconfig. # Signal ossetup that we are done with reconfig.
if ($nodeobject->eventstate() eq TBDB_NODESTATE_SHUTDOWN()) { if (!$dowait) {
TBSetNodeEventState($node_id, TBDB_NODESTATE_ISUP()); TBSetNodeEventState($node_id, TBDB_NODESTATE_ISUP());
} }
return 0; return 0;
......
...@@ -104,13 +104,6 @@ sub AddNode($$) ...@@ -104,13 +104,6 @@ sub AddNode($$)
|| $node->allocstate() eq TBDB_ALLOCSTATE_RES_INIT_DIRTY()) { || $node->allocstate() eq TBDB_ALLOCSTATE_RES_INIT_DIRTY()) {
$self->{OPLIST}->{$node_id} = [ $libossetup::RECONFIG ]; $self->{OPLIST}->{$node_id} = [ $libossetup::RECONFIG ];
$node->_setupoperation($libossetup::RECONFIG); $node->_setupoperation($libossetup::RECONFIG);
#
# XXX hack -- our node is already ISUP, and we need to force it
# out of that state so that WaitForNodes doesn't beat our type
# handler object to forcing it to SHUTDOWN
#
print STDERR "$self AddNode($node_id): forcing to SHUTDOWN before RECONFIG\n";
TBSetNodeEventState($node_id,TBDB_NODESTATE_SHUTDOWN);
} }
elsif ($node->allocstate() ne TBDB_ALLOCSTATE_RES_READY()) { elsif ($node->allocstate() ne TBDB_ALLOCSTATE_RES_READY()) {
# only reboot node if assign_wrapper just pulled it into expt. # only reboot node if assign_wrapper just pulled it into expt.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment