stated.in 55.5 KB
Newer Older
Robert Ricci's avatar
Robert Ricci committed
1
#!/usr/bin/perl -w
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2
3
#
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2011 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
5
6
7
# All rights reserved.
#

Robert Ricci's avatar
Robert Ricci committed
8
9
10
#
# stated - A daemon to monitor the states of nodes in the testbed. Receives
# state change notification through the event system, and writes the new
Mac Newbold's avatar
Mac Newbold committed
11
# state into the database. Also watches for invalid transitions, timeouts,
12
# and performs other state-related control functions.
Robert Ricci's avatar
Robert Ricci committed
13
14
15
16
#
# Send it a HUP signal to get it to reload the timeout and transition
# information. Periodically reloads this information regardless, though.
#
17
# Will restart when sent SIGUSR1, by exec'ing its executable again.
Robert Ricci's avatar
Robert Ricci committed
18
#
19

Robert Ricci's avatar
Robert Ricci committed
20
21
# Configure variables
use lib '@prefix@/lib';
22
my $TB = "@prefix@";
23
my $REALTB = "/usr/testbed"; # So we know if we're the "real" stated or not
Robert Ricci's avatar
Robert Ricci committed
24
my $BOSSNODE = "@BOSSNODE@";
25
my $TBOPS = "@TBSTATEDEMAIL@";
26
my $REALTBOPS = "@TBOPSEMAIL@";
27
my $TBDBNAME = "@TBDBNAME@";
28
my $REALTBDBNAME = "tbdb"; # So we know if we're using the "real" db
29
my $osselect = "$TB/bin/os_select";
30
my $nodereboot = "$TB/bin/node_reboot";
31
my $rebootlog  = "$TB/log/nodereboot.log";
32
my $power = "$TB/bin/power";
33
my $apod = "$TB/sbin/apod";
34
my $TBLOG = "@TBLOGFACIL@";
35
my $LOGFILE = "$TB/log/stated.log";
Robert Ricci's avatar
Robert Ricci committed
36
37
38
39
40
41

$| = 1;

use event;
use libdb;
use libtestbed;
42
use TimeoutQueue;
Robert Ricci's avatar
Robert Ricci committed
43
use Getopt::Std;
44
#use strict;
Robert Ricci's avatar
Robert Ricci committed
45
use English;
Mac Newbold's avatar
Mac Newbold committed
46
47
use POSIX;			# for strftime, and sigprocmask and friends
use Fcntl;			# file constants for pidfile
48
49
use POSIX ":sys_wait_h";
use IO::Poll qw(POLLIN);
50
use POSIX qw(:errno_h);
Mac Newbold's avatar
Mac Newbold committed
51

52
53
54
55
56
# Set up some notification throttling
my $mailgap = 15;		# in seconds
my $lastmail = time() - $mailgap + 2; # Send a digest of startup msgs after 2s.
my %msgs = ();

Mac Newbold's avatar
Mac Newbold committed
57
# Number of iterations (roughly, seconds) after which we'll reload
Robert Ricci's avatar
Robert Ricci committed
58
59
# information from the database. This is so we don't end up with information
# that's _too_ out of sync.
60
my $reload_time = 600;
61
my $last_reload = time;
Robert Ricci's avatar
Robert Ricci committed
62

Mike Hibler's avatar
Mike Hibler committed
63
64
65
66
67
# For startup and reload: maximum time in the past for which we will schedule
# a timeout. Anything older is assumed to be ancient history and is ignored.
# Note it is a negative value.
my $maxpasttimeout = -(2 * 24 * 60 * 60);	# 2 days

68
69
70
# Handling of SECVIOLATIONS, eventually controlled by a sitevar
my $soft_secviolation = 1;

71
72
73
# Command line opts.
my $dbtag = "";
my $debug = 0;
74
75
my $server = "localhost";
my $port   = @BOSSEVENTPORT@;
76
77
my $lockfile;
my $pidfile;
78
my $eventchild;
79

Robert Ricci's avatar
Robert Ricci committed
80
81
82
# Process command-line arguments

#
# Print the command-line synopsis on the currently selected output handle
# and terminate the program with exit status 1. Does not return.
#
sub usage {
    my $synopsis = <<"END";
Usage: $0 [-h] [-d] [-s server] [-p port] [-t dbtag]
-h              This message
-d              Turn on debugging output, and do not go into the background
-t tag          Use only those nodes with matching tag in nodes table
-s server       Use specified server, instead of this site's bossnode
-p port	        Use specified port
Send SIGHUP to reload database state, or SIGUSR1 to restart completely.
END
    print $synopsis;
    exit(1);
}

Mac Newbold's avatar
Mac Newbold committed
95
# Only root should run this - it won't work when run as a user...
96
# (Or, let an admin run it if it isn't the real one in /usr/testbed/ )
97
if ($UID && ( $TB eq $REALTB || ! TBAdmin($UID) ) ) {
Mac Newbold's avatar
Mac Newbold committed
98
99
100
    die("Only root can run this script!\n");
}

101
# Keep an untouched copy of the command line for a later restart;
# getopts() below consumes the switches out of @ARGV.
my @args = @ARGV;
my %opt  = ();

# Unknown switches, an explicit -h, or leftover non-switch arguments all
# end in usage(), which prints the synopsis and exits with status 1.
if (!getopts("ds:p:ht:l", \%opt)) {
    usage();
}
if ($opt{h} || @ARGV) {
    usage();
}

# Values given on the command line override the built-in defaults. As
# before, an empty or "0" value is treated as "not given".
$server = $opt{s} if ($opt{s});
$port   = $opt{p} if ($opt{p});
$dbtag  = $opt{t} if ($opt{t});
$debug  = 1       if ($opt{d});
Robert Ricci's avatar
Robert Ricci committed
124

125
# Grab some constants into variables
126
my $TBANYMODE    = TBDB_NODEOPMODE_ANY;
127
128
my $TBRESET      = TBDB_TBCONTROL_RESET;
my $TBTIMEOUT    = TBDB_TBCONTROL_TIMEOUT;
Mac Newbold's avatar
Mac Newbold committed
129
130
131
my $PXEBOOT      = TBDB_TBCONTROL_PXEBOOT;
my $BOOTING      = TBDB_TBCONTROL_BOOTING;
my $CHECKGENISUP = TBDB_TBCONTROL_CHECKGENISUP;
132
133
134
my $TBNOTIMEOUT  = TBDB_NO_STATE_TIMEOUT;
my $TBNODESTATE  = TBDB_TBEVENT_NODESTATE;
my $TBNODEOPMODE = TBDB_TBEVENT_NODEOPMODE;
135
136
137
138
139
140
my $TBCONTROL    = TBDB_TBEVENT_CONTROL;
my $TBCOMMAND    = TBDB_TBEVENT_COMMAND;
my $TBREBOOT     = TBDB_COMMAND_REBOOT;
my $TBPOWEROFF   = TBDB_COMMAND_POWEROFF;
my $TBPOWERON    = TBDB_COMMAND_POWERON;
my $TBPOWERCYCLE = TBDB_COMMAND_POWERCYCLE;
141
my $TBISUP       = TBDB_NODESTATE_ISUP;
142
143
144
my $PXEWAIT      = TBDB_NODESTATE_PXEWAIT;
my $PXEWAKEUP    = TBDB_NODESTATE_PXEWAKEUP;
my $PXEBOOTING   = TBDB_NODESTATE_PXEBOOTING;
145
146
my $TBRELOADDONEV1    = TBDB_TBCONTROL_RELOADDONE;
my $TBRELOADDONEV2    = TBDB_TBCONTROL_RELOADDONE_V2;
147
148
149
150
my $TBTIMEOUTREBOOT   = TBDB_STATED_TIMEOUT_REBOOT;
my $TBTIMEOUTNOTIFY   = TBDB_STATED_TIMEOUT_NOTIFY;
my $TBTIMEOUTCMDRETRY = TBDB_STATED_TIMEOUT_CMDRETRY;
my $TB_OSID_MBKERNEL  = TB_OSID_MBKERNEL;
151

152
153
# Special PXEBOOT state machine that all local nodes use.
my $PXEKERNEL	 = "PXEKERNEL";
154

155
156
# Even special-er SECUREBOOT state machine that local nodes may use.
my $SECUREBOOT	 = "SECUREBOOT";
Cody Cutler's avatar
Cody Cutler committed
157

158
159
160
161
# Protos.
sub debug(@);
sub fatal($);
sub notify($;$);
162
sub info($);
163
164
165
166
167
168
169
sub getTimeouts();
sub getValid();
sub getModeTrans();
sub getTriggers();
sub readStates(;@);
sub handleCtrlEvent($$);
sub reload();
170
171
sub StartEvents();
sub PollEvents($$);
172
    
173
174
175
#
# Daemon mode only (no -d): pick the pidfile, refuse to start if another
# instance is alive, clear a stale pidfile, and go into the background.
#
if (!$debug) {
    if ( $TB eq $REALTB ) {
	$pidfile = "/var/run/stated.pid";
    } else {
	$pidfile = "$TB/locks/stated.pid";
    }
    debug("Using pidfile $pidfile\n");

    if (-e $pidfile) {
	# Read the recorded pid ourselves and accept it only if it is a
	# plain number, so nothing unvalidated reaches a shell.
	my $otherpid = "";
	if (open(my $oldpidfh, "<", $pidfile)) {
	    my $line = <$oldpidfh>;
	    close($oldpidfh);
	    if (defined($line) && $line =~ /^\s*(\d+)/) {
		$otherpid = $1;
	    }
	}

	# Signal 0 delivers nothing; it only reports whether the process
	# exists. EPERM means it exists but belongs to someone else, which
	# still counts as running.
	my $running = 0;
	if ($otherpid ne "" && $otherpid > 0) {
	    $running = 1
		if (kill(0, $otherpid) || $! == EPERM);
	}

	if ($running) {
	    fatal("Lockfile $pidfile exists, and process $otherpid appears ".
		  "to be running.\n");
	} else {
	    notify("Lockfile exists, but process $otherpid appears to be dead".
		   "\n".
		   "Removing lock file...\n");
	}
	unlink($pidfile) ||
	    fatal("Couldn't remove $pidfile: $!\n");
    }

    # Background. A true return from TBBackGround() is the side that
    # exits here; the other side carries on as the daemon.
    if (TBBackGround($LOGFILE)) {
	exit(0);
    }
    TBdbfork();
}

#
# Record our pid. O_EXCL makes the create fail if some other instance got
# there first, so the pidfile doubles as the lock.
#
if (defined($pidfile)) {
    sysopen(my $pidfh, $pidfile, O_WRONLY | O_EXCL | O_CREAT) ||
	fatal("Couldn't create '$pidfile': $? $!\n");
    print $pidfh "$$";
    close($pidfh);
    # If I make it to here, I'll need to clean up the lock file
    $lockfile = $pidfile;
}
Robert Ricci's avatar
Robert Ricci committed
210

211
212
213
# Change my $0 so that it is easier to see in a ps/top
$0 = "$0";

214
215
216
217
218
#
# Start up the event system interface.
#
if (StartEvents() != 0) {
    fatal("Error starting events");
Robert Ricci's avatar
Robert Ricci committed
219
}
220
221
# We want to exit on any warning. 
$SIG{__WARN__} = sub { print STDERR $_[0]; exit(-1); };
Robert Ricci's avatar
Robert Ricci committed
222
223
224

# Read in the pre-existing node states, and timeout and valid transition
# information from the database
225
226
227
228
my %timeouts  = getTimeouts();
my %valid     = getValid();
my %modeTrans = getModeTrans();
my %triggers  = getTriggers();
229
my %nodes     = readStates();
230
my %timeouttag= ();
231
if ($debug) { qshow(); }
Robert Ricci's avatar
Robert Ricci committed
232
233
234

# Gets set if a reload of state from the database should happen.
my $do_reload = 0;
235
my $do_reopen = 0;
236
237
my $sigrestart= 0;
my $sigcleanup= 0;
238
my $exiting   = 0;
Robert Ricci's avatar
Robert Ricci committed
239
240
241
242

# Make the daemon reload database state on a sighup - but I'm worried
# about what would happen if we tried to do this mid-loop. So, we'll
# just set a flag and do it when we're done with our current pass.
243
$SIG{HUP}  = sub { info("SIGHUP - Reloading DB state\n"); $do_reload = 1; };
244
$SIG{USR2} = sub { info("SIGUSR2 - Reopening logfile\n"); $do_reopen = 1; };
245

Mac Newbold's avatar
Mac Newbold committed
246
# Set up other signals.
247
248
249
250
251
$SIG{USR1} = \&restart_wrap;
$SIG{INT}  = \&cleanup_wrap;
$SIG{QUIT} = \&cleanup_wrap;
$SIG{ABRT} = \&cleanup_wrap;
$SIG{TERM} = \&cleanup_wrap;
Robert Ricci's avatar
Robert Ricci committed
252

253
254
255
# Track if I handled an event or not
my $event_count = 0;

256
257
# Control how long I block while waiting for events
my $blockwait=0;
258
my $nextdeadline=0;
259
260
my $mailqueue=0;

261
262
263
264
265
notify("Stated starting up\n");

#
# Poll the event system until no more events arrive, blocking between
# polls for as long as nothing else needs attention.
#
# The maximum time to block is chosen from file-scoped state:
#   - mail is queued ($mailqueue): only until the next digest is due,
#     i.e. $lastmail + $mailgap;
#   - otherwise, blocking is allowed ($blockwait): up to 600 seconds;
#   - otherwise: until $nextdeadline, or not at all if there is none.
#
# $event_count is reset here and is bumped by handleEvent() for every
# event delivered during a poll. The loop ends early as soon as any of
# the signal flags ($sigrestart, $sigcleanup, $do_reload, $do_reopen) is
# set, so the main loop can service the request.
#
sub process_event_queue() {
    $event_count = 0;
    my $lastcount = -1;
    my $wait;
    my $now = time();

    # One place that knows which flags the signal handlers can raise.
    # (The pre-blocking check used to test $do_reload twice and never
    # looked at $do_reopen.)
    my $signalled = sub {
	return ($sigrestart || $sigcleanup || $do_reload || $do_reopen);
    };

    debug("Polling at $now - mq=$mailqueue bw=$blockwait ndl=$nextdeadline\n");

    if ($mailqueue != 0) {
	# mail is waiting. Only block until it is time to send it.
	$wait = $lastmail + $mailgap - $now;
	debug("Now $now, mailgap $mailgap, last $lastmail ==> wait $wait\n");
    } elsif ($blockwait) {
	# Nothing else will happen until we get an event, or get woken
	# up by a signal, so we can wait a long time.
	$wait = 600;
    } elsif ($nextdeadline > 0) {
	# only wait until the next deadline...
	$wait = $nextdeadline - $now;
    } else {
	$wait = 0;
    }
    if ($wait < 0) { debug("Wait was $wait!\n"); $wait=0; }
    my $finish = $now + $wait;

    while (($event_count != $lastcount || $wait > 0) && !$signalled->()) {
	$lastcount = $event_count;

	# Don't block if we got a signal since the loop test!
	if ($wait <= 0 || $signalled->()) {
	    PollEvents(0, 0);
	} else {
	    # timeout param is in milliseconds, so multiply
	    PollEvents(1, $wait*1000);

	    # Work out how much of the original wait is left.
	    $now  = time();
	    $wait = $finish - $now;

	    # Something arrived and there is now other work pending
	    # (timeouts, mail, or a signal request): stop waiting.
	    if ($event_count > 0 &&
		(qsize() > 0 || $mailqueue || $signalled->())) {
		$blockwait = 0;
		$wait      = 0;
	    }
	}
    }
    if ($event_count > 0) {
	debug("Handled $event_count event(s).\n");
    }
}
Robert Ricci's avatar
Robert Ricci committed
319

320
#
# Main service loop. Each pass fires any expired timeouts, reloads the
# database tables when asked to (or when they are older than
# $reload_time), flushes queued mail, honours requests posted by the
# signal handlers, and finally goes back to polling for events.
#
while (1) {
    my $now = time();
    my ($deadline,$node);

    # qhead() fills in the earliest queued (deadline, node) pair; a true
    # return is treated here as "nothing queued".
    if (qhead($deadline,$node)) {
	$deadline = 0;
    } else {
	info("HEAD: $node in ".($deadline-$now).", queue=".qsize()."\n");

	# Pop and handle every entry whose deadline has passed.
	while ($now >= $deadline && $node ne "") {
	    qpop($deadline,$node);
	    info("POP: $node in ".($deadline-$now).", queue=".qsize()."\n");

	    if (!exists($nodes{$node})) {
		#
		# If the node is no longer in the nodes array, it was most
		# likely a dynamic virtual node which is now gone. Ignore.
		# Need to look at reload() to see if we can catch this
		# earlier.
		#
		info("POP: $node is no longer in the nodes array. Skipping\n");
		delete($timeout_tag{$node});
	    } else {
		handleCtrlEvent($node,$TBTIMEOUT);
	    }

	    if (0) { qshow(); }

	    # Look at the next entry; stop once the queue is empty.
	    if (qhead($deadline,$node)) {
		$deadline = 0;
		$node     = "";
	    }
	}
    }
    $nextdeadline = $deadline;

    # Explicit request (SIGHUP) or the periodic refresh.
    if ($do_reload || ($now - $last_reload > $reload_time)) {
	reload();
	$do_reload = 0;
    }

    # With no timeouts pending, the event poll may block for a long time.
    if (qsize() == 0) {
	$blockwait = 1;
	debug("---Blocking wait okay---\n");
    }

    # Send any messages in the queue if it is time
    notify("",1);

    restart() if ($sigrestart);
    cleanup() if ($sigcleanup);

    if ($do_reopen) {
	# Reopen our own log, then tell the event reader child to do the
	# same.
	ReOpenLog($LOGFILE);
	kill('USR2', $eventchild) ||
	    fatal("Could not signal(USR2) event reader child\n");
	$do_reopen = 0;
    }

    process_event_queue();
}

Mac Newbold's avatar
Mac Newbold committed
379
380
exit(0);

Robert Ricci's avatar
Robert Ricci committed
381
#
# Read the current states of nodes from the database.
#
# Arguments: optionally, the previous node table flattened to a list
#            (node_id => entry, ...).
# Returns:   a fresh node table as a flat hash list. Each entry carries
#            the keys tag, state, timestamp, mode, mode_timestamp,
#            notified, timedout and noretry.
#
# A node whose state, mode, timestamp and tag all match its previous
# entry keeps that entry, so per-node bookkeeping (e.g. whether the ops
# have already been notified) survives a reload. Anything else gets a
# fresh entry and, when the node belongs to this instance, a timeout.
#
sub readStates(;@) {
    my %oldnodes = @_;
    my $now = time();

    #debug("readStates called\n");
    my $result = DBQueryFatal("SELECT node_id, eventstate, " .
			      "state_timestamp, op_mode, " .
			      "op_mode_timestamp, stated_tag FROM nodes ".
			      "where node_id not like 'sh%'");

    my %nodes;
    while (my ($node_id, $state, $timestamp, $mode, $mode_timestamp, $tag)
	   = $result->fetchrow()) {
	# A NULL stated_tag means the node belongs to the main stated.
	$tag = "" if (!defined($tag));
	$nodes{$node_id}{"tag"} = $tag;

	my $mine = ($dbtag eq $tag);
	if ($dbtag ne "" && $mine) {
	    info("This stated will work on $node_id\n");
	}
	if ($dbtag eq "" && !$mine) {
	    info("This stated will *NOT* work on $node_id\n");
	}
	if (!$mine) {
	    # Not our node: make sure we hold no timeout for it.
	    remTimeout($node_id);
	}

	#
	# If there's an entry in oldnodes for this node, and it
	# hasn't changed state or time, use the old entry (so that
	# we don't lose information about which nodes we've already
	# notified the ops about, etc.)
	#
	# The tag has to match as well: reusing an entry across a tag
	# change would silently restore the stale tag and hide the
	# change of ownership from this instance.
	#
	my $old = $oldnodes{$node_id};
	my $oldtag = ($old && defined($old->{"tag"})) ? $old->{"tag"} : "";

	if ($old && $state && $timestamp &&
	    ($oldtag eq $tag) &&
	    ($old->{state} eq $state) &&
	    ($old->{mode} eq $mode) &&
	    ($old->{timestamp} == $timestamp)) {
	    $nodes{$node_id} = $old;
	} else {
	    $nodes{$node_id}{state}          = $state;
	    $nodes{$node_id}{timestamp}      = $timestamp;
	    $nodes{$node_id}{mode}           = $mode;
	    $nodes{$node_id}{mode_timestamp} = $mode_timestamp;
	    $nodes{$node_id}{notified}       = 0;
	    $nodes{$node_id}{timedout}       = 0;
	    $nodes{$node_id}{noretry}        = 0;

	    # Is there a timeout? If so, set it up! Entries older than
	    # $maxpasttimeout (a negative offset) are ancient history.
	    if (defined($timestamp) && $mine) {
		my $TO = $timestamp - $now;
		if ($TO > $maxpasttimeout) {
		    setTimeout($mode,$state,$node_id,$timestamp);
		} else {
		    debug("Ignoring ancient timeout $TO for ",
			  "($node_id,$mode,$state)\n");
		}
	    }
	}
    }
    return %nodes;
}

#
# Read timeouts for various states from the database.
#
# Returns a flat hash list mapping op_mode => state => [ timeout, action ],
# one entry per row of the state_timeouts table.
#
sub getTimeouts() {
    #debug("getTimeouts called\n");
    my %table;
    my $rows = DBQueryFatal("SELECT op_mode, state, timeout, action " .
			    "FROM state_timeouts");

    while (my @row = $rows->fetchrow()) {
	my ($op_mode, $state, $secs, $action) = @row;
	$table{$op_mode}{$state} = [ $secs, $action ];
    }
    return %table;
}

#
# Read the list of valid state transitions from the database.
#
# Returns a flat hash list mapping op_mode => from-state => to-state => 1,
# one entry per row of the state_transitions table.
#
sub getValid() {
    #debug("getValid called\n");
    my %allowed;
    my $rows = DBQueryFatal("SELECT op_mode, state1, state2 " .
			    "FROM state_transitions");

    while (my @row = $rows->fetchrow()) {
	my ($op_mode, $from, $to) = @row;
	$allowed{$op_mode}{$from}{$to} = 1;
    }
    return %allowed;
}

476
477
478
479
#
# Read the list of valid mode transitions from the database.
#
# Returns a flat hash list mapping "op_mode1:state1" to a reference to the
# list of "op_mode2:state2" targets reachable from it, in the order the
# query returns them.
#
sub getModeTrans() {
    #debug("getModeTrans called\n");
    my $result =
      DBQueryFatal("SELECT op_mode1, state1, op_mode2, state2 " .
		   "FROM mode_transitions order by op_mode1,state1");

    my %modeTrans;
    while (my ($mode1,$state1, $mode2, $state2) = $result->fetchrow()) {
	# push() autovivifies the list on first use, so there is no need
	# to copy the existing list and store a new reference per row.
	push(@{ $modeTrans{"$mode1:$state1"} }, "$mode2:$state2");
    }
    return %modeTrans;
}

#
# Read the list of states which trigger an action.
#
# Returns a flat hash list whose values are references to lists of trigger
# names (the comma-separated `trigger` column, split). Keys are
#   "mode:state"       for rows whose node_id is $TBANYMODE ("global"), and
#   "node:mode:state"  for rows naming a specific node ("per-node").
#
# A note about triggers (how these tables are meant to be consulted; the
# lookup itself is not done in this function):
#
#  - "per-node" triggers only affect their specific node in a particular
#    mode/state, and are run first of all.
#  - "global" triggers apply to a given mode/state on all nodes, and are
#    run after any per-node triggers.
#  - "any-mode" triggers are tied to a state and occur in that state in
#    any mode; they can be global or per-node. A global trigger for a
#    specific mode over-rides the any-mode trigger for the same state:
#    arriving in state XYZ runs any per-node triggers, then either the
#    global trigger for mode FOOBAR/XYZ (when in FOOBAR) or, in any other
#    mode, the any-mode trigger.
#
# (our "*" is stored as $TBANYMODE)
#
sub getTriggers() {
    debug("getTriggers called\n");

    debug("anymode ==> '$TBANYMODE'\n");

    my %trigmap;

    # Grab global triggers (including any-mode)
    my $rows =
      DBQueryFatal("SELECT op_mode, state, `trigger` " .
		   "FROM state_triggers where node_id='$TBANYMODE' ".
		   "order by op_mode,state");
    while (my ($mode, $state, $spec) = $rows->fetchrow()) {
	my $key  = "$mode:$state";
	my $list = [ split(/\s*,\s*/, $spec) ];

	$trigmap{$key} = $list;
	debug("trig($key)\t => ".join(',', @{$list})."\n");
    }

    # Grab per-node triggers (including any-mode)
    $rows =
      DBQueryFatal("SELECT node_id, op_mode, state, `trigger` " .
		   "FROM state_triggers where node_id!='$TBANYMODE' ".
		   "order by op_mode,state");
    while (my ($node_id, $mode, $state, $spec) = $rows->fetchrow()) {
	my $key  = "$node_id:$mode:$state";
	my $list = [ split(/\s*,\s*/, $spec) ];

	$trigmap{$key} = $list;
	debug("trig($key)\t => ".join(',', @{$list})."\n");
    }

    debug(hash_recurse(%trigmap));

    return %trigmap;
}

Robert Ricci's avatar
Robert Ricci committed
553
554
555
556
#
# Gets called for every event that we receive.
#
# Arguments: the event's object type, object name and event type. For
#            everything except $TBCOMMAND events the object name is
#            used as the node id.
#
# Counts the event in $event_count, drops events for nodes that belong
# to another stated instance, and hands the rest to the handler for the
# object type.
#
sub handleEvent($$$) {
    my ($objtype,$objname,$eventtype) = @_;
    my $is_main = ($dbtag eq "");

    $event_count++;

    #
    # The main stated logs every event up front. Tagged (debugging)
    # instances would drown in that, so they log further down, once the
    # event is known to be theirs.
    #
    if ($is_main) {
	debug("Got an event: ($objtype,$objname,$eventtype)\n");
    }

    #
    # Check to see if another instance is supposed to be handling this node
    #
    if ($objtype ne $TBCOMMAND) {
	my $node = $objname;

	#
	# If we have never seen this node, reload.
	#
	if (! defined($nodes{$node})) {
	    reload();

	    # Still not defined, someone screwed up! This could end up
	    # churning via reload(). Bad.
	    if (! defined($nodes{$node})) {
		notify("Got $objtype/$eventtype for nonexistent $node!\n");
		return;
	    }
	}

	#
	# If a stated_tag was specified on the command line, ignore those
	# nodes that do not match.
	#
	#debug("dbtag='$dbtag', node $node='".$nodes{$node}{"tag"}."'\n");
	if ($dbtag ne $nodes{$node}{"tag"}) {
	    # Record when main stated ignores a node.
	    if ($is_main) {
		info("Got $objtype/$eventtype for $node, which is not mine\n");
	    }
	    return;
	}
	if (!checkDBRedirect($node)) {
	    info("Got $objtype/$eventtype for $node, which is not mine\n");
	    return;
	}
    }

    #
    # Tagged instance: the event is for one of my nodes, so print it now.
    #
    if (!$is_main) {
	debug("Got an event: ($objtype,$objname,$eventtype)\n");
    }

    #
    # Dispatch on the object type. Unknown types are silently ignored.
    # NOTE(review): these are unanchored pattern matches against the
    # type names, not string equality - confirm that is intended.
    #
 SWITCH: for ($objtype) {

	if (/$TBNODESTATE/) {
	    stateTransition($objname,$eventtype);
	    last SWITCH;
	}
	if (/$TBNODEOPMODE/) {
	    opModeTransition($objname,$eventtype);
	    notify("Use of deprecated event TBNODEOPMODE:\n".
		   "$objname->$eventtype\n");
	    last SWITCH;
	}
	if (/$TBCONTROL/) {
	    handleCtrlEvent($objname,$eventtype);
	    last SWITCH;
	}
	if (/$TBCOMMAND/) {
	    handleCommand($objname,$eventtype);
	    last SWITCH;
	}

    }

}

sub stateTransition($$) {

641
    my ($node,$newstate) = @_;
Robert Ricci's avatar
Robert Ricci committed
642

643
644
    # Check for invalid transitions
    my ($oldstate, $mode);
645
646
647
    $oldstate = $nodes{$node}{state};
    $mode     = $nodes{$node}{mode};

648
649
650
    if ($oldstate && $mode && $valid{$mode} && $valid{$mode}{$oldstate} &&
	!$valid{$mode}{$oldstate}{$newstate}) {
	notify("Invalid transition for node $node from $mode/$oldstate " .
651
	       "to $newstate\n");
652
653
654
655
656
657
	#
	# Machines in the secure boot path are not allowed to jump
	# willy-nilly into unknown states.
	#
        if ($mode eq TBDB_NODEOPMODE_SECUREBOOT ||
	    $mode eq TBDB_NODEOPMODE_SECURELOAD) {
658
659
660
            $newstate = TBDB_NODESTATE_SECVIOLATION;
            notify("Moving $node to $newstate because it's in $mode\n");
        }
661
662
663
664
665
666
    }
    #
    # Nodes that are in the SECVIOLATION state are not allowed to leave!
    #
    if ($oldstate eq TBDB_NODESTATE_SECVIOLATION &&
	$newstate ne TBDB_NODESTATE_SECVIOLATION) {
667
668
669
670
671
672
673
674
675
676
677
678
679
	#
	# Allow transitions to SHUTDOWN.
	# This allows someone to reboot a node in the SECVIOLATION state
	# getting it back to MINIMAL/SHUTDOWN.
	#
	# XXX DEBUG ONLY!
	# 
	if ($soft_secviolation && $newstate eq TBDB_NODESTATE_SHUTDOWN) {
	    notify("$node allowed to transition: SECVIOLATION => SHUTDOWN\n");
	} else {
	    notify("$node tried to leave SECVIOLATION (to $newstate)\n");
	    $newstate = TBDB_NODESTATE_SECVIOLATION;
	}
680
    }
Robert Ricci's avatar
Robert Ricci committed
681

682
683
684
685
    my $now = time();
    $nodes{$node}{state}     = $newstate;
    $nodes{$node}{timestamp} = $now;
    $nodes{$node}{notified}  = 0;
686

687
688
689
    info("$node: $mode/$oldstate => $mode/$newstate\n");
    DBQueryFatal("UPDATE nodes SET eventstate='$newstate', " .
		 "state_timestamp='$now' WHERE node_id='$node'");
690

691
692
693
694
    # Before we set the timeout (overwriting any current ones), we need
    # to check if we had a pending command
    if (qfind($node) &&
	$timeout_tag{$node} =~ /^$TBCOMMAND:/) {
695
        debug("TimeoutTag = '$timeout_tag{$node}'\n");
696
	my ($str,$cmd) = split(":",$timeout_tag{$node});
697
	debug("str=$str\tcmd=$cmd\tTBREBOOT=$TBREBOOT\tstate=$newstate\n");
698
	if ($cmd eq $TBREBOOT) {
699
	    if ($newstate eq TBDB_NODESTATE_SHUTDOWN ) {
700
701
702
		info("$node: $TBREBOOT success\n");
		# Timeout will get cleared below by setTimeout call
	    } else {
703
704
		notify("$node: $TBREBOOT in progress, but got state ".
		       "$newstate instead of ".TBDB_NODESTATE_SHUTDOWN."!\n");
705
706
707
708
709
	    }
	#} elsif ($cmd eq $FOO ) {
	    # Add more here...
	} else {
	    notify("$node: Unknown command timeout '$timeout_tag{$node}' ".
710
		   "found at $mode/$newstate\n");
711
712
713
	}
    }

714
715
716
717
718
    #
    # Check if this state has a timeout, and if so, put it in the queue.
    # Note that any opmode transition below will replace (or remove) this
    # timeout if appropriate.
    #
719
720
    setTimeout($mode,$newstate,$node,$now);

Mac Newbold's avatar
Mac Newbold committed
721
722
723
724
725
726
727
    # Check if this state has any triggers
    my @nodetrigs = GetNodeTriggerList($node,$mode,$newstate,1);
    my @trigs = GetNodeTriggerList($node,$mode,$newstate);
    if (@trigs > 0) {
	debug("Running triggers: ".join("/",@trigs)."\n");
	foreach ( @trigs) {
	    my $trig = $_;
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
	    /^SCRIPT:([-\w\/]+)$/ && do {
		#
		# Run a script. No arguments at the moment.
		#
		my $script = $1;

		#
		# If the path is absolute, run it. Otherwise it has to
		# come from the sbin directory.
		#
		if (! ($script =~ /^\//)) {
		    $script = "$TB/sbin/$script";
		}
		info("$node: Running $script.\n");
		system("$script &");
		next;
	    };
Mac Newbold's avatar
Mac Newbold committed
745
	    /^$PXEBOOT$/ && do {
746
		#
Mac Newbold's avatar
Mac Newbold committed
747
748
749
750
751
752
753
754
755
756
757
758
		# See if we jumped into the PXEBOOT kernel. Bootinfo
		# will send PXEBOOTING every time a node contacts it,
		# which is our indicator that the node is in the first
		# phase of booting. At this point we want to switch
		# state machines since the entire boot process is
		# governed by a single state machine that is
		# independent of the OS that the node will eventually
		# boot.  Rather then encode that in each state
		# machine, we use a special machine with a defined
		# entrypoint (PXEBOOTING) and a defined exitpoint
		# (BOOTING). See below for where we jump back out of
		# this state machine.
759
		#
Mac Newbold's avatar
Mac Newbold committed
760
761
762
763
		# Jumped in. We need to change the opmode so that
		# the state transitions are legal. We do not
		# bother to save the old opmode since we can
		# figure it out later when we leave.
764
		#
Mac Newbold's avatar
Mac Newbold committed
765
766
767
768
769
		debug("Running $PXEBOOT trigger\n");
		if ($mode ne $PXEKERNEL) {
		    info("$node: Forcing mode transition into $PXEKERNEL!\n");
		    opModeTransition($node, $PXEKERNEL, 1);
		    $mode=$PXEKERNEL;
770
		}
Mac Newbold's avatar
Mac Newbold committed
771
772
		next;
	    };
773
774
775
776
777
778
	    /^$SECUREBOOT$/ && do {
		#
		# Force machine into the SECUREBOOT/LOAD op_mode.
		# Currently triggered by receipt of GPXEBOOTING state.
		# This could come from any state as it just indicates that
		# a machine with a gPXE dongle has rebooted.
Cody Cutler's avatar
Cody Cutler committed
779
		#
780
781
782
		# To differentiate BOOT from LOAD we check next_op_mode.
		# It will be set to SECURELOAD when we need to go there,
		# otherwise we force it to SECUREBOOT.
Cody Cutler's avatar
Cody Cutler committed
783
		#
784
785
786
787
788
789
790
791
792
793
794
795
		debug("Running $SECUREBOOT trigger\n");
		my $query_result =
		    DBQueryWarn("select next_op_mode from nodes ".
				"where node_id='$node'");
		my ($nextmode) = $query_result->fetchrow();
		if (!$nextmode || $nextmode ne TBDB_NODEOPMODE_SECURELOAD) {
		    $nextmode = TBDB_NODEOPMODE_SECUREBOOT;
		}
		if ($mode ne $nextmode) {
		    info("$node: Forcing mode transition to $nextmode!\n");
		    opModeTransition($node, $nextmode, 1);
		    $mode=$nextmode;
Cody Cutler's avatar
Cody Cutler committed
796
797
798
		}
		next;
	    };
Mac Newbold's avatar
Mac Newbold committed
799
800
801
802
803
	    /^$BOOTING$/ && do {
		#
		# See if we are in the right mode/osid.
		#
		my ($bootosid,$bootopmode) = TBBootWhat($node, $debug);
804

Mac Newbold's avatar
Mac Newbold committed
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
		info("$node: BootWhat says $bootosid (mode $bootopmode).\n");
		DBQueryFatal("update nodes set osid='$bootosid' ".
			     "where node_id='$node'");

		if ($bootopmode ne $mode) {
		    if ($mode eq $PXEKERNEL) {
			#
			# If we came from PXE boot, then we have to
			# jump out of the PXEKERNEL state machine into
			# whatever state machine is current for the
			# node. Since we came through bootinfo, we
			# know that the node is doing what it is
			# supposed to, and that this change matches
			# what the node is booting.
			#
			info("$node: Forcing mode transition out of $PXEKERNEL!\n");
			opModeTransition($node, $bootopmode, 1);
			$mode=$bootopmode;
		    }
		    elsif ($oldstate eq TBDB_NODESTATE_ISUP) {
			#
			# Skipped SHUTDOWN, which could result in a
			# missed opmode transition. Can this really
			# happen anymore?
			#
			info("$node: Came from ISUP! ".
			     "Checking for mode transition\n");
			my $query_result =
			  DBQueryWarn("select next_op_mode from nodes ".
				      "where node_id='$node'");
			my ($nextmode) = $query_result->fetchrow();
			if ($nextmode) {
			    info("$node: Forcing mode transition!\n");
			    opModeTransition($node, $nextmode, 1);
			    $mode=$nextmode;
			}
		    }
		    else {
			my $str = "$node is running $bootosid, but in ".
			  "mode $mode\ninstead of mode $bootopmode!\n";
			
			if ($bootopmode eq "RELOAD") {
			    #
			    # For now, only force if we're going into
			    # reload mode, so we don't get stuck
			    # looping in reloading.  Can this happen
			    # anymore?
			    #
			    DBQueryFatal("UPDATE nodes SET ".
					 "op_mode='$bootopmode', ".
					 "op_mode_timestamp=".
					 "unix_timestamp(now()) ".
					 "WHERE node_id='$node'");
			    $nodes{$node}{mode} = $bootopmode;
			    $nodes{$node}{mode_timestamp} = $now;
			    $str .= "Forced op_mode to $bootopmode.\n";
			}
			notify($str);
		    }
		}
		next;
	    };
	    /^$CHECKGENISUP$/ && do {
		checkGenISUP($node);
		next;
	    };
871
872
873
874
	    /^CHECKPORTREG$/ && do {
		CheckPortRegistration($node);
		next;
	    };
875
	    /^$TBRESET$/ && do {
876
877
878
		# We successfully booted, so clear some flags
		$nodes{$node}{noretry}   = 0;
		$nodes{$node}{timedout}  = 0;
879
880
881
882
		# Check if we really need to do a reset
		my $r = DBQueryWarn("select osid,def_boot_osid from nodes ".
				    "where node_id='$node'");
		my ($osid,$defosid) = $r->fetchrow();
883
884
885
886
887
888
889
		if (! (defined($osid) && defined($defosid))) {
		    info("$node: osid not defined\n")
			if (!defined($osid));
		    info("$node: def_boot_osid not defined\n")
			if (!defined($defosid));
		}
		elsif ($osid ne $defosid) {
890
891
892
893
		    handleCtrlEvent($node,$trig);
		}
		next;
	    };
894
	    (/^$TBRELOADDONEV1$/ || /^$TBRELOADDONEV2$/) && do {
895
896
897
		handleCtrlEvent($node,$trig);
		next;
	    };
898
899
900
901
902
903
904
905
	    /^$TBISUP$/ && do {
		info("$node: Triggered $TBISUP\n");
		EventSendWarn(host      => $BOSSNODE ,
			      objtype   => TBDB_TBEVENT_NODESTATE ,
			      eventtype => TBDB_NODESTATE_ISUP ,
			      objname   => $node);
		next;
	    };
906
	    (/^$TBREBOOT$/ || /^$TBPOWERCYCLE$/ || /^$TBPOWEROFF$/) && do {
907
		handleCommand($node,$trig);
908
909
910
		next;
	    };
            (/^EMAILNOTIFY$/) && do {
911
912
		my $msg = "$node entered state $mode/$newstate from " .
		    "$mode/$oldstate";
913
		my $dest = $REALTBOPS;
914
		if ($newstate eq TBDB_NODESTATE_SECVIOLATION) {
915
916
917
918
919
920
921
922
		    if ($soft_secviolation) {
			$msg .= "\n\nNode $node was allowed to continue.\n";
			$dest = $TBOPS;
		    } else {
			$msg .= "\n\nNode $node has been powered off.\n" .
			        "You must address the cause of the violation ".
				"and reset the eventstate before powering on.";
		    }
923
		}
924
925
		SENDMAIL($dest, "STATED: $node entered state $newstate",
			 $msg, "Stated Daemon <".$TBOPS.">");
926
		next;
927
            };
928
929
930
931
932
933
934
935
936
937
938
939
	    /^RELOADOLDMFS$/ && do {
		my $frisbee_osid = TBNodeDiskloadOSID($node);
		my $frisbee_name = DBQuerySingleFatal("select osname from os_info where osid=$frisbee_osid");
		my $msg = 
		    ("Attempted to load multiple images on $node using an old Frisbee MFS.\n".
		     "To make this work please update the $frisbee_name MFS image.\n");
		SENDMAIL($REALTBOPS,
			 "$frisbee_name Needs Updating",
			 $msg,
			 "Stated Daemon <".$TBOPS.">");
		next;
	    };
940
	    notify("Unknown trigger '$trig' for $node in $mode/$newstate!\n");
941
	}
Mac Newbold's avatar
Mac Newbold committed
942
943
944
945
946
947
948
	# Clear any of the node triggers that we ran.
	# (Don't clear all of them, because some of the triggers we ran
	# may have caused others to be set, and we don't want to nuke them.)
	if (@nodetrigs > 0) {
	    debug("Clearing node triggers: ".join("/",@nodetrigs)."\n");
	    ClearNodeTrigger($node,$mode,$newstate,@nodetrigs);
	}
949
    }
950

951
952
953
954
955
956
957
958
    # Check if this state can trigger a mode transition
    if (defined($modeTrans{"$mode:$newstate"})) {
	info("$node: Checking for mode transition\n");
	my $r = DBQueryWarn("select next_op_mode from nodes ".
			    "where node_id='$node'");
	my ($nextmode) = $r->fetchrow();
	if ($nextmode) {
	    opModeTransition($node,$nextmode);
Mac Newbold's avatar
Mac Newbold committed
959
960
961
	} else {
	    debug("No next mode.\n");
	}
962
963
    }
}
964

965
#
# Switch a node into a new op_mode. Updates both our cached copy of the
# node and its row in the nodes table (which also clears next_op_mode),
# then hands the resulting mode/state to setTimeout().
#
#   $node    - node_id of the node changing modes.
#   $newmode - op_mode to put the node into.
#   $force   - optional (defaults to 0). When true, the %modeTrans
#              validity check is skipped and the node keeps its state.
#
# When not forced, the new state is looked up in the %modeTrans entry for
# the node's current "mode:state". A transition that fails the check is
# reported with notify() but is still applied; the node just keeps its
# current state.
#
sub opModeTransition($$;$) {
    my ($node, $newmode, $force) = @_;
    $force = 0
	if (!defined($force));

    info("$node: Mode change to $newmode requested ($force)\n");

    my $oldstate = $nodes{$node}{state};
    my $mode     = $nodes{$node}{mode};
    my $curkey   = "$mode:$oldstate";
    my $nextstate;

    # Forced transitions are never validated.
    if (!$force) {
	if (defined($modeTrans{$curkey})) {
	    debug("Mode Transition check:\n");

	    # Flatten the "newmode:newstate" entries into a lookup table
	    # keyed by the new mode.
	    my %allowed = split(/[:,]/, join(",", @{$modeTrans{$curkey}}));

	    debug("Valid transitions from $mode/$oldstate are:\n");
	    foreach my $target (sort(keys(%allowed))) {
		debug("$target => $allowed{$target}\n");
	    }

	    if (defined($allowed{$newmode})) {
		$nextstate = $allowed{$newmode};
	    }
	    else {
		notify("Invalid mode transition for $node from ".
		       "$mode/$oldstate to $newmode!\n");
	    }
	}
	else {
	    notify("Invalid mode transition for $node from $mode/$oldstate: ".
		   "Not a valid mode transition state!\n");
	}
    }

    # No (usable) new state was found, so stay in the current one.
    $nextstate ||= $oldstate;

    # Update the in-memory record for the node.
    my $now = time();
    @{$nodes{$node}}{qw(state timestamp mode mode_timestamp notified)} =
	($nextstate, $now, $newmode, $now, 0);

    info("$node: $mode/$oldstate => $newmode/$nextstate\n");

    # Make the database agree with the in-memory record.
    my $update = "UPDATE nodes SET " .
	join(", ",
	     "eventstate='$nextstate'",
	     "next_op_mode=''",
	     "op_mode='$newmode'",
	     "state_timestamp='$now'",
	     "op_mode_timestamp='$now'") .
	" WHERE node_id='$node'";
    DBQueryFatal($update);

    # Check if this state has a timeout, and if so, put it in the queue
    setTimeout($newmode, $nextstate, $node, $now);
}

sub handleCtrlEvent($$) {
    my ($node,$event) = @_;
1023

1024
    info("CtrlEvent: $node, $event\n");
1025

1026
1027
    foreach ($event) {
	/^$TBRESET$/ && do {
1028
1029
	    #
	    # Clear next_boot_path with os_select.
Mac Newbold's avatar
Mac Newbold committed
1030
	    #
1031
	    $cmd = "$osselect -d -c -1 $node";
1032
	    system($cmd) and
1033
		notify("$node/$event: Could not clear next_boot_path!\n");
Mac Newbold's avatar
Mac Newbold committed
1034
1035

	    info("Performed $TBRESET for $node\n");
1036
1037
	    next;
	};
1038
	(/^$TBRELOADDONEV1$/ || /^$TBRELOADDONEV2$/) && do {
1039
1040
1041
1042
1043
1044
1045
	    info("Clearing reload info for $node\n");
	    DBQueryFatal("delete from current_reloads where node_id='$node'");
	    my ($pid,$eid);
	    NodeidToExp($node,\$pid,\$eid);
	    if (($pid eq NODERELOADING_PID) && ($eid eq NODERELOADING_EID)) {
		DBQueryFatal("delete from scheduled_reloads ".
			     "where node_id='$node'");
1046
		DBQueryFatal("delete from reserved where node_id='$node'");
1047
1048
		TBSetNodeHistory($node, TB_NODEHISTORY_OP_FREE,
				 $UID, $pid, $eid);
1049
		info("Released $node from $pid/$eid\n");
1050
	    }
1051
1052
1053
1054
1055
	    if ($event eq $TBRELOADDONEV2) {
		info("Sending an apod to $node\n");
		system("$apod $node") == 0 or
		    notify("Could not apod $node after $TBRELOADDONEV2!\n");
	    }
1056
1057
1058
	    next;
	};
	/^$TBTIMEOUT$/ && do {
1059
1060
1061
1062
	    my ($mode,$state) = split(":",$timeout_tag{$node});
	    delete($timeout_tag{$node});
	    my $curstate = $nodes{$node}{state};
	    my $curmode = $nodes{$node}{mode};
1063
	    my ($timeout,$action);
1064
1065
1066
1067
1068
	    if (!defined($nodes{$node}{notified})) {
		$nodes{$node}{notified}=0;
	    }
	    $nodes{$node}{notified}++;
	    my $notified = $nodes{$node}{notified};
1069
1070
	    $nodes{$node}{timedout}++;
	    my $timedout = $nodes{$node}{timedout};
1071
1072
1073
1074
	    if ($mode && $state && $timeouts{$mode} &&
		$timeouts{$mode}{$state}) {
		($timeout, $action) = @{$timeouts{$mode}{$state}};
	    }
1075
1076
	    if ($mode eq $TBCOMMAND) {
		# It is a command, not a true state
1077
		if ($action eq $TBTIMEOUTCMDRETRY) {
1078
		    # Retry the command
1079
		    notify("$node: Command $state, retry #$timedout\n");
1080
		    # notify in case we get in a retry loop...
1081
		    handleCommand($node,$state,$timedout,1);
1082
1083
1084
1085
1086
1087
		} else {
		    notify("$node: Unknown timeout action for ".
			   "$mode/$state: '$action'\n");
		}
		next;
	    }
1088
1089
1090
1091
1092
1093
1094

	    #
	    # Trash. This stuff should not be encoded this way, but I have
	    # no idea how timeouts, TBCOMMAND, and actions interact.
	    #
	    if ($curstate eq $PXEWAKEUP) {
		my $optarg = ($debug ? "-d " : "");
Mac Newbold's avatar
Mac Newbold committed
1095

1096
1097
		if ($timedout < 3) {
		    #
Mac Newbold's avatar
Mac Newbold committed
1098
1099
		    # Try again.
		    #
1100
1101
1102
1103
1104
1105
		    info("Node $node has timed out $timedout times in ".
			 "$PXEWAKEUP!\n".
			 "Sending it a another wakeup command\n");
		}
		else {
		    #
Mac Newbold's avatar
Mac Newbold committed
1106
1107
		    # Failed too many times, power cycle instead.
		    #
1108
1109
1110
1111
1112
1113
1114
		    notify("Node $node has timed out $timedout times in ".
			   "$PXEWAKEUP!\n".
			   "Sending it a reboot command\n");
		    $optarg .= "-k";
		}
		my $cmd = "$nodereboot -r $optarg $node";
		debug("$cmd\n");
1115
		system("(date; $cmd) >>$rebootlog 2>&1 &") and
1116
		    notify("$PXEWAKEUP retry: ".
1117
1118
1119
1120
1121
			   "Command '$cmd' failed, error $?: $!\n");

		next;
	    }

1122
1123
1124
1125
1126
1127
	    info("Node $node has timed out in state $mode/$state".
		 ($action ne "" ? "\n\tRequested action $action." : "").
		 "\n");

	    foreach ($action) {
		/^$TBTIMEOUTREBOOT/ && do {