Commit 6d4187a5 authored by Leigh B. Stoller

Minor allocstate changes to try to cope with plab nodes that either
fail in "plabnode alloc" or in the remote vnodesetup call. In the
former case, we do not want to "plabnode free" it later. In the latter,
we want to plabnode free it right away, and make sure we do not try to
remote vnode teardown or plabfree it later. In either case, os_setup
needs to check so that it does not bother waiting for the node since
it is wasted time. I use an alternate dead state for this, but the
real solution is to move much of the vnode specific code from os_setup
to vnode_setup.

Note that this stuff is mostly untested since I need nodes to fail!
The normal path works fine though.
parent 46e96b8a
......@@ -699,21 +699,30 @@ elsif (@vnodelist) {
my $pnode = $vnode2pnode{$node};
my $wstart = $waitstart{$node};
my $maxwait = 90 + (100 * $pnodevcount{$pnode});
my $curallocstate;
if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $wstart, $maxwait)) {
print "$node is alive and well\n";
# Might have already been set above.
TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_READY);
$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY;
SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
next;
}
TBGetNodeAllocState($node, \$curallocstate);
SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
TBSetNodeAllocState($node, TBDB_ALLOCSTATE_DOWN());
$nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN;
#
# See if vnode_setup already determined the node was dead.
#
if ($curallocstate ne TBDB_ALLOCSTATE_DOWN() &&
$curallocstate ne TBDB_ALLOCSTATE_DEAD()) {
if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP,
$wstart, $maxwait)) {
print "$node is alive and well\n";
# Might have already been set above.
TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_READY);
SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
next;
}
SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
TBSetNodeAllocState($node, TBDB_ALLOCSTATE_DOWN());
}
print "*** WARNING: $node may be down.\n";
print "*** WARNING: $node did not boot!\n";
if ($canfail{$node}) {
# Send mail to testbed-ops and to the user about it.
......
......@@ -181,8 +181,13 @@ foreach my $node (@nodes) {
next;
}
elsif ($allocstate eq TBDB_ALLOCSTATE_DOWN) {
print "$node never booted; skipping.\n";
next;
if (!$plab) {
print "$node failed to boot; skipping $mode.\n";
next;
}
# Plab nodes need to be cleaned up.
print "$node failed to boot; changing to cleanup.\n";
$mode = "cleanup";
}
}
elsif ($exptstate eq EXPTSTATE_ACTIVATING) {
......@@ -221,10 +226,20 @@ foreach my $node (@nodes) {
}
else {
if ($killmode) {
if ($allocstate eq TBDB_ALLOCSTATE_DOWN) {
print "$node failed to boot; skipping $mode.\n";
if ($allocstate eq TBDB_ALLOCSTATE_DEAD) {
# plab only. See below.
print "$node failed to initialize; skipping $mode.\n";
next;
}
elsif ($allocstate eq TBDB_ALLOCSTATE_DOWN) {
if (!$plab) {
print "$node failed to boot; skipping $mode.\n";
next;
}
# Plab nodes need to be cleaned up.
print "$node failed to boot; changing to cleanup.\n";
$mode = "cleanup";
}
elsif ($allocstate eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
print "$node never booted; skipping $mode.\n";
next;
......@@ -235,7 +250,7 @@ foreach my $node (@nodes) {
next;
}
elsif ($allocstate eq TBDB_ALLOCSTATE_RES_INIT_DIRTY()) {
print "$node only needs a reboot on remote node $pnode\n";
print "$node needs a reboot on remote node $pnode\n";
$mode = "reboot";
}
}
......@@ -297,25 +312,31 @@ while (1) {
[$vnode, $pnode, $mode, $jailed, $plab, time()];
$children++;
} else {
my $args = (($mode eq "teardown") ? "-k " :
($mode eq "reboot" ? "-r " : " "));
$args .= ($jailed ? "-j " : " ");
$args .= ($plab ? "-p " : " ");
$args .= "$vnode ";
# Must change our real UID to root so that ssh will work.
$UID = 0;
if ($plab && $mode eq "setup") {
if (system("$TB/sbin/plabnode alloc $pid $eid $vnode")) {
die("*** $0:\n".
" Plab node allocation failed");
print STDERR "*** $0:\n" .
" Plab node allocation failed";
# Should check DB state instead.
exit(99);
}
}
exec("$ssh -host $vnode $CLIENT_BIN/vnodesetup $args");
die("*** $0:\n".
" exec failed!\n");
# Cleanup is used only on plab nodes.
if ($mode ne "cleanup") {
my $args = (($mode eq "teardown") ? "-k " :
($mode eq "reboot" ? "-r " : " "));
$args .= ($jailed ? "-j " : " ");
$args .= ($plab ? "-p " : " ");
$args .= "$vnode ";
exec("$ssh -host $vnode $CLIENT_BIN/vnodesetup $args");
die("*** $0:\n".
" exec failed!\n");
}
exit(0);
}
} else {
#
......@@ -352,6 +373,7 @@ while (1) {
alarm(($oldest + 120) - time());
my $childpid = wait();
my $exitstatus = $?;
alarm 0;
#
......@@ -375,9 +397,7 @@ while (1) {
#
# Look for setup failure, reported back through ssh.
#
if ($?) {
my $exitstatus = $?;
if ($exitstatus) {
print STDERR "vnode $vnode $mode on $pnode returned $?.\n"
if $debug;
......@@ -387,23 +407,42 @@ while (1) {
elsif ($exitstatus == 15) {
print STDERR "$vnode is wedged.\n" if $debug;
}
elsif ($exitstatus >> 8 == 99) {
print STDERR "$vnode did not allocate properly.\n" if $debug;
}
push @failed_nodes, [$vnode, $pnode, $mode, $exitstatus];
if (!$killmode) {
warn("*** $0:\n".
" Virtual node $vnode setup failure!\n");
}
else {
warn("*** $0:\n".
" Virtual node $vnode teardown failure!\n");
}
warn("*** $0:\n".
" Virtual node $vnode $mode failure!\n");
}
if ($plab && $killmode) {
if (system("$TB/sbin/plabnode free $pid $eid $vnode")) {
warn("*** $0:\n".
" Plab node free of $vnode failed");
if ($plab) {
#
# I am totally unhappy with this mess.
#
if ((($mode eq "teardown") || ($mode eq "cleanup")) ||
($exitstatus && (($exitstatus >> 8) != 99))) {
#
# Besides teardown/cleanup, we want to do this if the
# vnode setup on the node failed, but only if the
# plabnode alloc worked (it can fail too, exits with 99).
#
if (system("$TB/sbin/plabnode free $pid $eid $vnode")) {
warn("*** $0:\n".
" Plab node free of $vnode failed");
}
}
#
# If the node was in the setup process, then mark its allocstate
# as down so os_setup knows not to bother waiting for it. DEAD
# is a temp state, different from DOWN. It indicates the node
# was never instantiated (important distinction for plab).
#
if ($exitstatus &&
(($mode eq "setup") || ($mode eq "reboot"))) {
TBSetNodeAllocState($vnode, TBDB_ALLOCSTATE_DEAD());
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment