Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
emulab
emulab-devel
Commits
3b210b7b
Commit
3b210b7b
authored
Oct 13, 2003
by
Mac Newbold
Browse files
Rollback to prestatewait for now.
parent
b71f5f90
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
db/libdb.pm.in
View file @
3b210b7b
This diff is collapsed.
Click to expand it.
event/stated/stated.in
View file @
3b210b7b
...
...
@@ -116,7 +116,6 @@ my $TBNODESTATE = TBDB_TBEVENT_NODESTATE;
my
$TBNODEOPMODE
=
TBDB_TBEVENT_NODEOPMODE
;
my
$TBCONTROL
=
TBDB_TBEVENT_CONTROL
;
my
$TBCOMMAND
=
TBDB_TBEVENT_COMMAND
;
my
$TBFAILED
=
TBDB_TBEVENT_TBFAILED
;
my
$TBREBOOT
=
TBDB_COMMAND_REBOOT
;
my
$TBPOWEROFF
=
TBDB_COMMAND_POWEROFF
;
my
$TBPOWERON
=
TBDB_COMMAND_POWERON
;
...
...
@@ -296,10 +295,10 @@ while (1) {
# Check for nodes that have passed their timeout
if
(
!
qhead
(
$deadline
,
$node
))
{
debug
("
HEAD:
$node
in
"
.
(
$deadline
-
$now
)
.
"
, queue=
"
.
qsize
()
.
"
\n
");
info
("
HEAD:
$node
in
"
.
(
$deadline
-
$now
)
.
"
, queue=
"
.
qsize
()
.
"
\n
");
while
(
$now
>=
$deadline
&&
$node
ne
"")
{
qpop
(
$deadline
,
$node
);
debug
("
POP:
$node
in
"
.
(
$deadline
-
$now
)
.
"
, queue=
"
.
qsize
()
.
"
\n
");
info
("
POP:
$node
in
"
.
(
$deadline
-
$now
)
.
"
, queue=
"
.
qsize
()
.
"
\n
");
handleCtrlEvent
(
$node
,
$TBTIMEOUT
);
if
(
0
)
{
qshow
();
}
if
(
qhead
(
$deadline
,
$node
))
{
...
...
@@ -369,7 +368,6 @@ sub readStates(;@) {
$nodes
{
$node_id
}{
notified
}
=
0
;
$nodes
{
$node_id
}{
timedout
}
=
0
;
$nodes
{
$node_id
}{
noretry
}
=
0
;
$nodes
{
$node_id
}{
rebooting
}
=
0
;
# Is there a timeout? If so, set it up!
setTimeout
(
$mode
,
$state
,
$node_id
,
$timestamp
);
}
...
...
@@ -624,7 +622,6 @@ sub stateTransition($$) {
# We successfully booted, so clear some flags
$nodes
{
$node
}{
noretry
}
=
0
;
$nodes
{
$node
}{
timedout
}
=
0
;
$nodes
{
$node
}{
rebooting
}
=
0
;
# Check if we really need to do a reset
my
$r
=
DBQueryWarn
("
select osid,def_boot_osid from nodes
"
.
"
where node_id='
$node
'
");
...
...
@@ -829,33 +826,18 @@ sub handleCtrlEvent($$) {
foreach
(
$action
)
{
/^$TBTIMEOUTREBOOT/
&&
do
{
# If the node is in our control (ie node_reboot),
# we want to do something. If it is in the user's
# control (went to shutdown without a reboot event),
# then we don't want to touch it.
if
(
$nodes
{
$node
}{
rebooting
})
{
if
(
$timedout
>
1
)
{
# We've tried too many times...
# The node has now officially failed to boot
notify
("
Node
$node
timed out
$timedout
times!
\n
"
.
"
Giving up until it boots sucessfully.
\n
");
$nodes
{
$node
}{
noretry
}
=
1
;
info
("
$node
: Sending
$TBFAILED
$TBREBOOT
$node
\n
");
EventSendWarn
(
host
=>
$BOSSNODE
,
objtype
=>
$TBFAILED
,
eventtype
=>
$TBREBOOT
,
objname
=>
$node
);
}
else
{
# XXX Temporary! For now notify instead of
# really rebooting, until the timeout/retry
# stuff is gone from os_setup and os_load
#notify("Node $node has timed out in state ".
# "$mode/$state - REBOOT requested\n");
handleCommand
(
$node
,
$TBREBOOT
,
$timedout
,
1
);
}
if
(
$timedout
>
3
)
{
# We've tried too many times...
notify
("
Node
$node
has timed out too many times!
\n
"
.
"
Giving up until it boots sucessfully.
\n
");
$nodes
{
$node
}{
noretry
}
=
1
;
}
else
{
info
("
Node
$node
timed out in state
$mode
/
$state
"
.
"
under user's control - not rebooting
\n
");
# XXX Temporary! For now notify instead of
# really rebooting, until the timeout/retry
# stuff is gone from os_setup and os_load
notify
("
Node
$node
has timed out in state
"
.
"
$mode
/
$state
- REBOOT requested
\n
");
#handleCommand($node,$TBREBOOT,$timedout,1);
}
last
;
};
/^$TBTIMEOUTNOTIFY/
&&
do
{
...
...
@@ -881,7 +863,7 @@ sub handleCommand($$;$$) {
# We may need to do it here (while iterating over the list), or
# make some other fix up in handleEvent.
if
(
$command
eq
$TBREBOOT
&&
$retry
>
1
)
{
if
(
$command
eq
$TBREBOOT
&&
$retry
>
=
4
)
{
announce
("
Node
$params
has tried rebooting
$retry
times and has
\n
"
.
"
still not been successful. Please look into it soon.
\n
"
.
""
);
# "In the meantime, $params will be powered off.\n");
...
...
@@ -903,14 +885,11 @@ sub handleCommand($$;$$) {
$node
=
$nodes
[
$n
];
debug
("
Checking rebooting:
$node
,
$nodes
{
$node
},
"
.
"
$nodes
{
$node
}{state},
$nodes
{
$node
}{noretry}
\n
");
if
((
$nodes
{
$node
}{
rebooting
}
)
&&
(
!
$nodes
{
$node
}{
noretry
})
)
{
if
((
$nodes
{
$node
}{
state
}
ne
TBDB_NODESTATE_ISUP
)
&&
(
!
$nodes
{
$node
}{
noretry
})
)
{
# This node shouldn't be rebooted now...
info
("
$node
: Sending
$TBFAILED
$TBREBOOT
$node
\n
");
EventSendWarn
(
host
=>
$BOSSNODE
,
objtype
=>
$TBFAILED
,
eventtype
=>
$TBREBOOT
,
objname
=>
$node
);
# XXX Send feedback here somehow!
info
("
$node
: Trying to reboot too soon! Skipping.
\n
");
# Cut it out of the list
debug
("
Nodelist before ==>
"
.
join
("
",
@nodes
)
.
"
\n
");
...
...
@@ -926,8 +905,6 @@ sub handleCommand($$;$$) {
# Permissions were checked in order to send the message,
# so we don't need to do any fancy stuff here.
$nodes
{
$node
}{
rebooting
}
=
1
;
my
$cmd
=
"
$nodereboot
-r
$nodelist
";
my
$redir
=
"
2>&1 >> /usr/testbed/log/nodereboot.log &
";
debug
("
$cmd
$redir
\n
");
...
...
tbsetup/node_reboot.in
View file @
3b210b7b
...
...
@@ -45,7 +45,6 @@ use lib "@prefix@/lib";
use
libdb
;
use
libtestbed
;
use
event
;
use
StateWait
;
use
POSIX
qw(strftime)
;
my
$ssh
=
"
$TB
/bin/sshtb -n
";
...
...
@@ -121,10 +120,10 @@ if (defined($options{"e"})) {
}
# XXX Temporary, until we make event sending the default
#
$realmode=1;
if
(
$realmode
&&
$UID
&&
!
TBAdmin
(
$UID
))
{
die
("
*** You cannot use real mode!
\n
");
}
$realmode
=
1
;
#
if ($realmode && $UID && !TBAdmin($UID)) {
#
die("*** You cannot use real mode!\n");
#
}
#
# If eidmode, then get the node list out of the DB instead of the command
...
...
@@ -233,51 +232,21 @@ if (! keys(%realnodes) && ! keys(%virtnodes)) {
my
@sortednodes
=
sort
(
keys
(
%realnodes
));
if
(
!
$realmode
)
{
$
StateWait::
debug
=
$debug
;
if
(
!
$nowait
)
{
my
@states
=
();
if
(
$waitmode
)
{
print
"
Waiting for nodes to shut down and come up...
\n
";
@states
=
(
TBDB_NODESTATE_SHUTDOWN
,
TBDB_NODESTATE_ISUP
);
}
else
{
print
"
Waiting for nodes to shut down...
\n
";
@states
=
(
TBDB_NODESTATE_SHUTDOWN
);
}
initStateWait
(
\
@states
,
@sortednodes
)
}
EventSendFatal
(
host
=>
$BOSSNODE
,
objtype
=>
TBDB_TBEVENT_COMMAND
,
eventtype
=>
TBDB_COMMAND_REBOOT
,
objname
=>
join
("
,
",
@sortednodes
)
);
my
$rv
=
0
;
if
(
!
$nowait
)
{
# In here we can do some output to tell the user what's going on.
my
$start
=
time
();
my
$now
=
$start
;
my
$done
=
0
;
my
$total
=
scalar
(
@sortednodes
);
my
@finished
=
();
my
@failed
=
();
while
(
$done
<
$total
)
{
print
"
Waiting for
"
.
(
$total
-
$done
)
.
"
nodes...
\n
";
waitForState
(
\
@finished
,
\
@failed
,
60
);
$now
=
time
();
$done
=
scalar
(
@finished
)
+
scalar
(
@failed
);
my
$min
=
int
((
$now
-
$start
+
30
)
/
60
);
# round to nearest min.
print
"
After
$min
min.,
$done
nodes done...
\n
";
#print "fin = ".join(",",@finished)." fail = ".join(",",@failed).
# " Time=$now (".($now-$start)." elapsed), done=$done\n";
sleep
(
1
);
}
print
"
All
$total
nodes finished.
\n
";
$bad
=
scalar
(
@failed
);
if
(
$bad
)
{
print
"
There were
$bad
failures:
"
.
join
("
",
@failed
)
.
"
\n
";
$rv
=
1
;
if
(
$waitmode
)
{
# Wait for [SHUTDOWN,ISUP]
}
else
{
# Wait for [SHUTDOWN]
}
endStateWait
();
}
exit
(
$rv
);
exit
(
0
);
}
#
...
...
tbsetup/os_load.in
View file @
3b210b7b
...
...
@@ -51,7 +51,6 @@ my $MAXRETRIES = 1;
use
lib
"
@prefix
@/lib
";
use
libdb
;
use
libtestbed
;
use
StateWait
;
# Be careful not to exit on transient error
$
libdb::
DBQUERY_MAXTRIES
=
30
;
...
...
@@ -69,6 +68,7 @@ my $imageid;
my
$imagepid
=
TB_OPSPID
;
my
%imageid_row
;
my
@nodes
=
();
my
%retries
=
();
my
$mereuser
=
0
;
my
$waitmode
=
1
;
my
$failures
=
0
;
...
...
@@ -308,36 +308,111 @@ if (! $waitmode) {
exit
$failures
;
}
print
"
Issuing reboot for
@nodes
and then waiting ...
\n
";
initStateWait
([
TBDB_NODESTATE_RELOADDONE
]
,
@nodes
);
system
("
$nodereboot
@nodes
");
if
(
$?
)
{
print
"
Reboot failed for (some of)
@nodes
. Quitting!
\n
";
exit
(
$?
>>
8
);
# The retry vector is initialized to the number of retries we allow per
# node, after which it's a fatal error.
foreach
my
$node
(
@nodes
)
{
$retries
{
$node
}
=
$MAXRETRIES
;
}
my
$total
=
scalar
(
@nodes
);
my
@finished
=
();
my
@failed
=
();
waitForState
(
\
@finished
,
\
@failed
,
60
*
15
);
endStateWait
();
my
$worked
=
scalar
(
@finished
);
my
$failed
=
scalar
(
@failed
);
my
$remain
=
$total
-
$worked
-
$failed
;
if
(
$worked
!=
$total
)
{
print
"
*** os_load: Only
$worked
nodes of
$total
succeeded!
\n
";
if
(
$failed
)
{
print
"
\t
There were
$failed
failures.
\n
";
}
if
(
$remain
)
{
print
"
\t
There were
$remain
nodes that timed out.
\n
";
}
my
@failed
=
();
while
(
@nodes
)
{
# Reboot them all.
print
"
Issuing reboot for
@nodes
and then waiting ...
\n
";
if
(
$reboot
)
{
system
("
$nodereboot
@nodes
");
if
(
$?
)
{
print
"
Reboot failed for (some of)
@nodes
. Quitting!
\n
";
exit
(
$?
>>
8
);
}
}
# Now wait for them.
$startwait
=
time
;
@failed
=
WaitTillReloadDone
(
@nodes
);
@nodes
=
();
while
(
@failed
)
{
my
$node
=
shift
(
@failed
);
if
(
$retries
{
$node
})
{
print
"
*** Trying
$node
again (resetting/rebooting) ...
\n
";
push
(
@nodes
,
$node
);
# Possible race with reboot?
SetupReload
(
$node
);
# Retry until count hits zero.
$retries
{
$node
}
-=
1
;
}
else
{
print
"
***
$node
failed too many times. Skipping!
\n
";
$failures
++
;
}
}
}
$failures
+=
$failed
;
print
"
OS Reload Done! There were
$failures
failures!
\n
";
exit
(
$failures
);
#
# Poll the nodes table until every node passed in has either left the
# reload opmode or has been waiting longer than $maxwait seconds, as
# measured from $startwait (both are set outside this routine).
# Returns the list of nodes that were given up on.
#
sub WaitTillReloadDone
{
    my (@nodes)  = @_;
    my %finished = map { $_ => 0 } @nodes;
    my $pending  = scalar(@nodes);
    my @wedged   = ();

    if ($dbg) {
        print STDERR "Waiting for @nodes to finish reloading\n" . `date`;
    }

    # Highest whole-minute mark for which a progress line has been printed.
    my $minutes = 0;

    while ($pending) {
        # Sleep before polling so the reboot has a chance to take effect,
        # and so we do not sleep once more after the last node is done.
        sleep(5);

        foreach my $node (@nodes) {
            # Nodes already accounted for are not queried again.
            next
                if ($finished{$node});

            my $query_result =
                DBQueryFatal("SELECT op_mode FROM nodes ".
                             "where node_id='$node'");
            my ($opmode) = $query_result->fetchrow_array();

            # Leaving the reloading opmode is the only completion signal.
            if ($opmode ne TBDB_NODEOPMODE_RELOAD) {
                if ($dbg) {
                    print STDERR "$node has left reloading mode\n" . `date`;
                }
                $finished{$node} = 1;
                $pending--;
                next;
            }

            # Still reloading; see how long it has been since $startwait.
            my $elapsed     = time - $startwait;
            my $elapsed_min = int($elapsed / 60);

            if ($elapsed > $maxwait) {
                # Out of patience; record the node as failed and move on.
                print "*** $node appears wedged; ".
                      "its been $elapsed_min minutes since it was rebooted.\n";
                $finished{$node} = 1;
                $pending--;
                push(@wedged, $node);
                next;
            }

            # Print a progress line only when a new minute mark is crossed.
            if ($elapsed_min > $minutes) {
                $minutes = $elapsed_min;
                print "Still waiting for $node to reload - ".
                      "its been $minutes minute(s)\n";
            }
        }
    }

    return @wedged;
}
# Setup a reload. Note that imageid is global.
sub
SetupReload
($)
{
...
...
tbsetup/os_setup.in
View file @
3b210b7b
...
...
@@ -17,7 +17,7 @@ require 'ctime.pl';
# experiment creation to continue.
#
# TODO: Reload disk images.
#
#
# usage: os_setup <pid> <eid>
#
# errorcode: 0 - all reboots succeeded.
...
...
@@ -58,7 +58,6 @@ my $TFTP = "/tftpboot";
use
lib
"
@prefix
@/lib
";
use
libdb
;
use
libtestbed
;
use
StateWait
;
my
$nodereboot
=
"
$TB
/bin/node_reboot
";
my
$os_load
=
"
$TB
/bin/os_load
";
...
...
@@ -82,14 +81,14 @@ my @row;
#
# Ah, Frisbee works so lets do auto reloading for nodes that do not have
# the proper OS loaded on it. This will be a hash of lists; for each
# imageid, a list of the nodes to pass to os_load for that imageid.
# imageid, a list of the nodes to pass to os_load for that imageid.
#
my
%reloads
=
();
my
%reboots
=
();
my
%rebooted
=
();
my
$doautoload
=
1
;
my
$dolastload
=
1
;
# un-taint path
$ENV
{'
PATH
'}
=
'
/bin:/usr/bin:/usr/local/bin
';
delete
@ENV
{'
IFS
',
'
CDPATH
',
'
ENV
',
'
BASH_ENV
'};
...
...
@@ -172,7 +171,7 @@ $db_result =
"
left join node_types as nt on nt.type=n.type
"
.
"
where r.pid='
$pid
' and r.eid='
$eid
'
");
if
(
$db_result
->
numrows
<
1
)
{
if
(
$db_result
->
numrows
<
1
)
{
print
"
There are no nodes in experiment '
$eid
' in project '
$pid
'.
\n
";
exit
0
;
}
...
...
@@ -218,7 +217,7 @@ while (my %row = $db_result->fetchhash()) {
#
# Make sure the files specified in the paths exist. We mount the
# user tftp directory on boss node, so we can ignore the IP address,
# and just check the path directly.
# and just check the path directly.
#
if
(
defined
(
$row
{'
def_boot_path
'}))
{
my
$path
=
$row
{'
def_boot_path
'};
...
...
@@ -287,13 +286,13 @@ while (my %row = $db_result->fetchhash()) {
die_noretry
("
*** RPM
$rpm
for node
$node
does not exist!
");
}
}
#
# XXX - Ditto for tarfiles.
#
foreach
my
$tarspec
(
split
("
:
",
$row
{'
tarballs
'}))
{
my
(
$dir
,
$tar
)
=
split
("
",
$tarspec
);
if
(
!
-
f
$tar
)
{
die_noretry
("
*** Tarfile
$tar
for node
$node
does not exist!
");
}
...
...
@@ -303,13 +302,13 @@ while (my %row = $db_result->fetchhash()) {
# If there is a path specified, then we don't worry anymore about it.
# The user must know what is going on. The OSID might have a path
# associated with it, which means the same thing; we don't worry about
# it.
# it.
#
if
(
!
$bootpath
&&
!
$jailnode
&&
!
$plabnode
&&
!
$subnode
)
{
#
# These checks are not necessary if the front end and web page
# are doing the right thing, but lets be careful anyway.
#
#
if
(
!
$osid
)
{
die_noretry
(
"
***
$node
has no bootpath and no def_boot_osid set!
");
...
...
@@ -320,20 +319,20 @@ while (my %row = $db_result->fetchhash()) {
#
my
$osid_result
=
DBQueryFatal
("
select * from os_info where osid='
$osid
'
");
if
(
$osid_result
->
numrows
==
0
)
{
die_noretry
("
*** No such OSID
$osid
is defined!
");
}
my
%osid_row
=
$osid_result
->
fetchhash
();
#
# If there is an actual path, it's an OSKit kernel, not an image.
#
#
if
(
!
defined
(
$osid_row
{'
path
'})
||
$osid_row
{'
path
'}
eq
"")
{
#
# Not an OSKit kernel.
# Make sure this OSID is actually loaded on the machine.
# Make sure this OSID is actually loaded on the machine.
#
my
$p_result
=
DBQueryFatal
("
select * from partitions
"
.
...
...
@@ -341,8 +340,8 @@ while (my %row = $db_result->fetchhash()) {
#
# If not loaded, then see if the user was looking for the generic
# name of the OS that is loaded.
#
# name of the OS that is loaded.
#
if
(
$p_result
->
numrows
==
0
)
{
#
# Check to see if a non specific version specified.
...
...
@@ -352,7 +351,7 @@ while (my %row = $db_result->fetchhash()) {
#
# A non-specific version. There needs to be a way to
# map it to another osid.
# map it to another osid.
#
if
(
!
defined
(
$osid_row
{'
nextosid
'}))
{
die_noretry
(
...
...
@@ -360,11 +359,11 @@ while (my %row = $db_result->fetchhash()) {
"
No mapping can be made for
$osid
(
$node
)!
");
}
my
$nextosid
=
$osid_row
{'
nextosid
'};
#
# See if the nextosid is already on the disk. If not,
# it needs to be loaded.
#
#
my
$o_result
=
DBQueryFatal
("
select osid from partitions as p
"
.
"
where p.node_id='
$node
' and
"
.
...
...
@@ -373,18 +372,18 @@ while (my %row = $db_result->fetchhash()) {
if
(
!
$o_result
->
numrows
)
{
#
# User wants a specific version of an OS, but it's not
# loaded on the machine.
# loaded on the machine.
#
print
"
Mapping
$osid
on
$node
to
$nextosid
"
.
"
and setting up a reload.
\n
";
SetupReload
(
$node
,
$nextosid
,
$type
);
$osids
{
$node
}
=
$nextosid
;
}
else
{
#
# Already loaded.
#
# Already loaded.
#
print
"
Mapping
$osid
on
$node
to
$nextosid
.
\n
";
if
(
$dolastload
&&
...
...
@@ -402,14 +401,14 @@ while (my %row = $db_result->fetchhash()) {
else
{
#
# User wants a specific version of an OS, but it's not
# loaded on the machine.
# loaded on the machine.
#
SetupReload
(
$node
,
$osid
,
$type
);
}
}
else
{
#
# OSID is loaded, but might need to be cleaned.
# OSID is loaded, but might need to be cleaned.
#
if
(
$dolastload
&&
defined
(
$row
{'
pid
'})
&&
$row
{'
pid
'}
ne
$pid
)
{
...
...
@@ -418,18 +417,18 @@ while (my %row = $db_result->fetchhash()) {
}
}
}
#
# Set the canfail bit.
#
$canfail
{
$node
}
=
((
$failmode
eq
NODEFAILMODE_FATAL
())
?
0
:
1
);
print
STDERR
"
$node
-
$osids
{
$node
} -
$canfail
{
$node
}
\n
"
if
$dbg
;
}
#
# Collect some info about vnodes.
# Collect some info about vnodes.
#
foreach
my
$vnode
(
keys
(
%vnodes
))
{
my
$jailed
=
$vnodes
{
$vnode
};
...
...
@@ -461,7 +460,7 @@ foreach my $vnode (keys(%vnodes)) {
#
next
;
}
# Nothing else to do for local jail nodes at this time ...
}
...
...
@@ -474,10 +473,6 @@ if (!$TESTMODE) {
my
$count
=
0
;
my
$cmd
;
#$StateWait::debug=1;
initStateWait
(
[
TBDB_NODESTATE_ISUP
],
keys
%nodes
);
#$StateWait::debug=0;
foreach
my
$imageid
(
keys
(
%reloads
)
)
{
my
@list
=
@
{
$reloads
{
$imageid
}
};
...
...
@@ -540,18 +535,16 @@ foreach my $imageid ( keys(%reloads) ) {
foreach
my
$node
(
@list
)
{
my
$mode
;
if
(
!
TBGetNodeOpMode
(
$node
,
\
$mode
))
{
print
"
*** Error getting operational mode for
$node
!
\n
";
$failed
++
;
delete
(
$nodes
{
$node
});
cancelWait
(
$node
);
}
if
(
$mode
eq
TBDB_NODEOPMODE_RELOAD
)
{
print
"
*** Not waiting for
$node
since its reload failed!
\n
";
$failed
++
;
delete
(
$nodes
{
$node
});
cancelWait
(
$node
);
}
}
}
...
...
@@ -565,97 +558,81 @@ my @nodelist = keys(%nodes);
#
if
(
@nodelist
)
{
print
"
Waiting for local testbed nodes to finish rebooting ...
\n
";
}
TBDebugTimeStamp
("
Local node waiting started
");
my
%notified
=
();
foreach
$n
(
keys
%nodes
)
{
$notified
{
$n
}
=
0
;
}
my
$maxwait
=
60
*
15
;
# Don't wait more than 15 min.
my
$u
=
60
;
# Update the user every 60 seconds.
my
$total
=
scalar
keys
%nodes
;
my
$done
=
0
;
my
$start
=
time
();
my
$now
=
$start
;
my
@finished
=
();
my
@failed
=
();
while
(
(
$now
-
$start
)
<
$maxwait
&&
$done
<
$total
)
{
my
$wait
=
min
(
$u
,
$start
+
$maxwait
-
$now
);
if
(
$wait
==
0
)
{
die
("
Argh! Wait was 0 again
");
}
waitForState
(
\
@finished
,
\
@failed
,
$wait
);
$now
=
time
();
$done
=
scalar
(
@finished
)
+
scalar
(
@failed
);
foreach
$node
(
@finished
)
{
if
(
!
$notified
{
$node
})
{
print
"
$node
is alive and well
\n
";
SetNodeBootStatus
(
$node
,
NODEBOOTSTATUS_OKAY
);
TBSetNodeAllocState
(
$node
,
TBDB_ALLOCSTATE_RES_READY
()
);
$nodeAllocStates
{
$node
}
=
TBDB_ALLOCSTATE_RES_READY
();
$notified
{
$node
}
=
1
;
}
}
foreach
$node
(
@failed
)
{
if
(
!
$notified
{
$node
})
{
SetNodeBootStatus
(
$node
,
NODEBOOTSTATUS_FAILED
);
print
"
*** WARNING:
$node
may be down.
\n
"
.
"
This has been reported to testbed-ops.
\n
";
if
(
$canfail
{
$node
})
{
# Send mail to testbed-ops and to the user about it.
my
(
$user
)
=
getpwuid
(
$UID
);
SENDMAIL
(
$user_email_to
,
"
Node
$node
is down
",
"
Node
$node
in pid/eid
$pid
/
$eid
"
.
"
appears to be dead.
\n\n
"
.