Commit cb7801fb authored by Timothy Stack

Fix the race between loading a mote and rebooting its host stargate.

	* db/libdb.pm.in: Add TBNodeSubNodes function which returns the
	list of subnodes for a given node.

	* mote/tbuisp.in: Don't reboot the stargate anymore after loading
	the attached mote.  The problem with the radio not working after
	the upload should be fixed now.

	* tbsetup/libreboot.pm.in: Check if a node's subnodes are being
	reloaded.  If so, try to wait until they reach ISUP before
	actually doing the reboot.

	* tbsetup/os_setup.in: Do not skip the ISUP wait for subnodes that
	are imageable (like motes), otherwise their allocstates are not
	updated correctly.  Remove the robot-specific hack that assumed
	tbuisp would do the reboot if the attached mote was being reloaded.
parent b9dc258e
......@@ -173,6 +173,7 @@ use vars qw(@ISA @EXPORT);
TBExptFirewall TBNodeFirewall TBSetExptFirewallVlan
TBClearExptFirewallVlan
TBNodeSubNodes
TBNodeType TBNodeTypeProcInfo TBNodeTypeBiosWaittime
TBExptRemoveVirtualState TBExptBackupVirtualState
......@@ -3254,6 +3255,42 @@ sub TBExptClearBackupState($$)
if (-e $vstateDir);
}
#
# Return the list of subnodes for the given node.
#
# A subnode here is any row of the nodes table whose phys_nodeid is the
# given node and whose node type has the issubnode flag set.
#
# usage: TBNodeSubNodes(char *nodeid)
#        Returns a (possibly empty) list of node_id strings. An empty list
#        is also returned, after printing a warning, when the given node
#        name does not look like a node name.
#
sub TBNodeSubNodes($)
{
    my ($node) = @_;
    my @nodes  = ();

    #
    # The node name is interpolated into the query below, so insist that
    # it has the same shape we require of the node names we hand back.
    # Taking the value from the capture also untaints it.
    #
    my $safe_node;
    if (defined($node) && $node =~ /\A([-\w]+)\z/) {
        $safe_node = $1;
    }
    else {
        my $shown = (defined($node) ? $node : "(undef)");

        print "*** $0: WARNING: TBNodeSubNodes: Bad node name: $shown.\n";
        return ();
    }

    my $result = DBQueryFatal("SELECT n.node_id FROM nodes AS n " .
                              "LEFT JOIN node_types " .
                              " AS nt ON n.type = nt.type " .
                              "WHERE n.phys_nodeid='$safe_node' and nt.issubnode");

    if (! $result or $result->numrows == 0) {
        return ();
    }

    while (my @row = $result->fetchrow_array()) {
        my $subnode = $row[0];

        #
        # Taint check. Re-deriving the name from a regex capture untaints
        # it, which avoids taint warnings in the callers that go on to use
        # these node names.
        #
        if ($subnode =~ /^([-\w]+)$/) {
            push(@nodes, $1);
        }
        else {
            print "*** $0: WARNING: Bad node name: $subnode.\n";
        }
    }
    return @nodes;
}
#
# Return a node's type and class, in a two-element array
# If the caller asked for a scalar, give them only the type
......
......@@ -500,15 +500,6 @@ MOTE: foreach my $mote (@motes) {
if ($tmpfile) {
system "rm -f $tmpfile";
}
# XXX - We have to reboot stargates after loading the mote. Disgusting,
# there should be some better way
if ($upload_method eq "ssh") {
if (system("$TB/bin/node_reboot $host")) {
$errors++;
warn "Failed to upload code to $mote";
}
}
}
if ($errors) {
......
#!/usr/bin/perl -wT
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2004 University of Utah and the Flux Group.
# Copyright (c) 2000-2005 University of Utah and the Flux Group.
# All rights reserved.
#
# node reboot library. Basically the backend to the node_reboot script, but
......@@ -609,6 +609,37 @@ sub RebootNode {
}
}
#
# If any of the node's subnodes are being reloaded, wait for the operation
# to finish before doing the reboot.
#
my @subnodes = TBNodeSubNodes($pc);
foreach my $subnode (@subnodes) {
my $opmode;
if (TBGetNodeOpMode($subnode, \$opmode) &&
defined($opmode) &&
(($opmode eq TBDB_NODEOPMODE_RELOADING) ||
($opmode eq TBDB_NODEOPMODE_RELOAD) ||
($opmode eq TBDB_NODEOPMODE_RELOADMOTE))) {
my $startwait = time;
my $actual_state;
print "reboot ($pc): waiting for subnode '$subnode' to finish ".
"reloading...\n";
sleep(5);
if (TBNodeStateWait($subnode,
$startwait,
(60*10),
\$actual_state,
(TBDB_NODESTATE_TBFAILED,
TBDB_NODESTATE_ISUP))) {
print "reboot ($pc): subnode has not finished reloading, ".
"rebooting anyways...\n";
}
}
}
#
# Run an ssh command in a child process, protected by an alarm to
# ensure that the ssh is not hung up forever if the machine is in
......
......@@ -224,7 +224,7 @@ while (my %row = $db_result->fetchhash()) {
next;
}
}
elsif ($subnode) {
elsif ($subnode && !$imageable) {
print "Will skip subnode $node ISUP wait.\n";
}
else {
......@@ -517,18 +517,6 @@ if (!$TESTMODE) {
delete $reboots{$node};
delete $reconfigs{$node};
$rebooted{$node} = 1;
# XXX BEGIN Robot hack
my $host;
TBPhysNodeID($node, \$host);
if ($host ne $node) {
delete $reboots{$host};
delete $reconfigs{$host};
$rebooted{$host} = 1;
}
# XXX END Robot hack
}
my %reload_args = ();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment