stated.in 61.3 KB
Newer Older
Robert Ricci's avatar
Robert Ricci committed
1
#!/usr/bin/perl -w
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2
#
3
# Copyright (c) 2000-2012 University of Utah and the Flux Group.
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
Leigh B. Stoller's avatar
Leigh B. Stoller committed
23 24
#

Robert Ricci's avatar
Robert Ricci committed
25 26 27
#
# stated - A daemon to monitor the states of nodes in the testbed. Receives
# state change notification through the event system, and writes the new
Mac Newbold's avatar
Mac Newbold committed
28
# state into the database. Also watches for invalid transitions, timeouts,
29
# and performs other state-related control functions.
Robert Ricci's avatar
Robert Ricci committed
30 31 32 33
#
# Send it a HUP signal to get it to reload the timeout and transition
# information. Periodically reloads this information regardless, though.
#
34
# Will restart when sent SIGUSR1, by exec'ing its executable again.
Robert Ricci's avatar
Robert Ricci committed
35
#
36

Robert Ricci's avatar
Robert Ricci committed
37 38
# Configure variables
use lib '@prefix@/lib';
39
# Installation prefix of this copy, plus the prefix of the production
# install so we know if we're the "real" stated or not.
my $TB           = "@prefix@";
my $REALTB       = "/usr/testbed";

# Site host name and mail addresses, filled in at configure time.
my $BOSSNODE     = "@BOSSNODE@";
my $TBOPS        = "@TBSTATEDEMAIL@";
my $REALTBOPS    = "@TBOPSEMAIL@";

# Database name, plus the production name so we know if we're using
# the "real" db.
my $TBDBNAME     = "@TBDBNAME@";
my $REALTBDBNAME = "tbdb";

# External helper programs and the reboot log, all under $TB.
my $pxeselect    = "$TB/bin/pxe_select";
my $osselect     = "$TB/bin/os_select";
my $nodereboot   = "$TB/bin/node_reboot";
my $rebootlog    = "$TB/log/nodereboot.log";
my $power        = "$TB/bin/power";
my $apod         = "$TB/sbin/apod";

# Configure-time log facility, and this daemon's own log file.
my $TBLOG        = "@TBLOGFACIL@";
my $LOGFILE      = "$TB/log/stated.log";
Robert Ricci's avatar
Robert Ricci committed
54 55 56 57 58

$| = 1;

use event;
use libdb;
59
use Node;
Robert Ricci's avatar
Robert Ricci committed
60
use libtestbed;
61
use TimeoutQueue;
Robert Ricci's avatar
Robert Ricci committed
62
use Getopt::Std;
63
#use strict;
Robert Ricci's avatar
Robert Ricci committed
64
use English;
Mac Newbold's avatar
Mac Newbold committed
65 66
use POSIX;			# for strftime, and sigprocmask and friends
use Fcntl;			# file constants for pidfile
67 68
use POSIX ":sys_wait_h";
use IO::Poll qw(POLLIN);
69
use POSIX qw(:errno_h);
Mac Newbold's avatar
Mac Newbold committed
70

71 72 73 74 75
# Notification throttling. $mailgap is in seconds; $lastmail is backdated
# so that a digest of startup messages goes out after 2s.
my $mailgap  = 15;
my $lastmail = time() - $mailgap + 2;
my %msgs;

# Number of iterations (roughly, seconds) after which the information
# cached from the database is reloaded, so that it never gets _too_ far
# out of sync.
my $reload_time = 600;
my $last_reload = time();

# For startup and reload: the furthest point in the past for which a
# timeout will still be scheduled. Anything older is treated as ancient
# history and ignored. Note that the value is negative (one week).
my $maxpasttimeout = -(7 * 24 * 60 * 60);

# Handling of SECVIOLATIONS, eventually controlled by a sitevar.
my $soft_secviolation = 1;

# Command line option defaults.
my $dbtag  = "";
my $debug  = 0;
my $server = "localhost";
my $port   = @BOSSEVENTPORT@;

# Set later, during startup.
my $lockfile;
my $pidfile;
my $eventchild;
98

Robert Ricci's avatar
Robert Ricci committed
99 100 101
# Process command-line arguments

#
# Print the command-line synopsis on the currently selected output handle
# and exit with status 1. This function does not return.
#
sub usage {
    my $synopsis = <<"END";
Usage: $0 [-h] [-d] [-s server] [-p port] [-t dbtag]
-h              This message
-d              Turn on debugging output, and do not go into the background
-t tag          Use only those nodes with matching tag in nodes table
-s server       Use specified server, instead of this site's bossnode
-p port	        Use specified port
Send SIGHUP to reload database state, or SIGUSR1 to restart completely.
END

    print $synopsis;
    exit(1);
}

Mac Newbold's avatar
Mac Newbold committed
114
# Only root should run this - it won't work when run as a user. The one
# exception: a testbed admin may run a copy that is not the real one
# installed in /usr/testbed.
if ($UID && ($TB eq $REALTB || !TBAdmin($UID))) {
    die("Only root can run this script!\n");
}

# Keep a pristine copy of the arguments for restart, before getopts
# consumes them.
my @args = @ARGV;
my %opt  = ();

# NOTE(review): "l" is accepted here but is neither documented in usage()
# nor read anywhere in the visible code - confirm whether it is still needed.
usage()
    if (!getopts("ds:p:ht:l", \%opt));

# -h, or any leftover argument, gets the usage message (usage() exits).
usage()
    if ($opt{h} || @ARGV);

# Override the defaults with whatever was given. These are truth tests,
# so an empty or "0" value leaves the default in place.
$server = $opt{s} if ($opt{s});
$port   = $opt{p} if ($opt{p});
$dbtag  = $opt{t} if ($opt{t});
$debug  = 1       if ($opt{d});
Robert Ricci's avatar
Robert Ricci committed
143

144
# Copy library constants into plain scalars so that they can be
# interpolated into strings and regular expressions.

# Wildcard op mode, and the "no timeout" marker.
my $TBANYMODE    = TBDB_NODEOPMODE_ANY;
my $TBNOTIMEOUT  = TBDB_NO_STATE_TIMEOUT;

# TBDB_TBCONTROL_* values.
my $TBPXERESET     = TBDB_TBCONTROL_PXERESET;
my $TBRESET        = TBDB_TBCONTROL_RESET;
my $TBTIMEOUT      = TBDB_TBCONTROL_TIMEOUT;
my $PXEBOOT        = TBDB_TBCONTROL_PXEBOOT;
my $BOOTING        = TBDB_TBCONTROL_BOOTING;
my $CHECKGENISUP   = TBDB_TBCONTROL_CHECKGENISUP;
my $TBRELOADDONEV1 = TBDB_TBCONTROL_RELOADDONE;
my $TBRELOADDONEV2 = TBDB_TBCONTROL_RELOADDONE_V2;

# TBDB_TBEVENT_* values.
my $TBNODESTATE  = TBDB_TBEVENT_NODESTATE;
my $TBNODEOPMODE = TBDB_TBEVENT_NODEOPMODE;
my $TBCONTROL    = TBDB_TBEVENT_CONTROL;
my $TBCOMMAND    = TBDB_TBEVENT_COMMAND;

# TBDB_COMMAND_* values.
my $TBREBOOT     = TBDB_COMMAND_REBOOT;
my $TBPOWEROFF   = TBDB_COMMAND_POWEROFF;
my $TBPOWERON    = TBDB_COMMAND_POWERON;
my $TBPOWERCYCLE = TBDB_COMMAND_POWERCYCLE;

# TBDB_NODESTATE_* values.
my $TBISUP       = TBDB_NODESTATE_ISUP;
my $PXEWAIT      = TBDB_NODESTATE_PXEWAIT;
my $PXEWAKEUP    = TBDB_NODESTATE_PXEWAKEUP;
my $PXEBOOTING   = TBDB_NODESTATE_PXEBOOTING;

# TBDB_STATED_TIMEOUT_* values.
my $TBTIMEOUTREBOOT   = TBDB_STATED_TIMEOUT_REBOOT;
my $TBTIMEOUTNOTIFY   = TBDB_STATED_TIMEOUT_NOTIFY;
my $TBTIMEOUTCMDRETRY = TBDB_STATED_TIMEOUT_CMDRETRY;

my $TB_OSID_MBKERNEL  = TB_OSID_MBKERNEL;

# Special PXEBOOT state machine that all local nodes use.
my $PXEKERNEL  = "PXEKERNEL";

# Even special-er SECUREBOOT state machine that local nodes may use.
my $SECUREBOOT = "SECUREBOOT";
177

178 179 180 181
# Forward declarations, so that the prototypes are known before the
# definitions further down are reached.

# Logging and notification.
sub debug(@);
sub info($);
sub notify($;$);
sub fatal($);

# Loading cached state from the database.
sub getTimeouts();
sub getValid();
sub getModeTrans();
sub getTriggers();
sub readStates(;@);
sub reload();

# Event handling.
sub handleCtrlEvent($$);
sub StartEvents();
sub PollEvents($$);
192
    
193 194 195
#
# Daemon startup. Unless debugging (-d) was requested: pick a pidfile,
# refuse to start if another instance still owns it, clear it if it is
# stale, and then go into the background.
#
if (!$debug) {
    # The real daemon keeps its pidfile in /var/run; any other copy keeps
    # it under its own tree.
    if ($TB eq $REALTB) {
        $pidfile = "/var/run/stated.pid";
    } else {
        $pidfile = "$TB/locks/stated.pid";
    }
    debug("Using pidfile $pidfile\n");

    if (-e $pidfile) {
        # Read the recorded pid ourselves rather than shelling out, and
        # strip whitespace so a trailing newline cannot confuse the check.
        my $otherpid = "";
        if (open(my $fh, "<", $pidfile)) {
            my $line = <$fh>;
            close($fh);
            $otherpid = $line
                if (defined($line));
        }
        $otherpid =~ s/^\s+//;
        $otherpid =~ s/\s+$//;

        # Probe exactly that pid with signal 0. The previous approach
        # grepped "ps" output for the pid as a substring, so pid 123 also
        # matched process 1234 or any command line containing "123".
        # EPERM means the process exists but belongs to someone else
        # (possible when an admin, not root, runs a test copy).
        # Anything that is not a positive number is treated as stale.
        my $running = 0;
        if ($otherpid =~ /^\d+$/ && $otherpid > 0) {
            $running = 1
                if (kill(0, $otherpid) || $! == EPERM);
        }

        if ($running) {
            fatal("Lockfile $pidfile exists, and process $otherpid appears ".
                  "to be running.\n");
        } else {
            notify("Lockfile exists, but process $otherpid appears to be dead".
                   "\n".
                   "Removing lock file...\n");
        }
        unlink($pidfile) ||
            fatal("Couldn't remove $pidfile: $!\n");
    }

    # Background. A true return from TBBackGround() makes this process
    # exit (presumably that is the parent side of the fork - confirm in
    # libtestbed).
    if (TBBackGround($LOGFILE)) {
        exit(0);
    }
    TBdbfork();
}

if (defined($pidfile)) {
    # O_EXCL makes creation fail if somebody else got there first.
    sysopen(PIDFILE, $pidfile, O_WRONLY | O_EXCL | O_CREAT) ||
        fatal("Couldn't create '$pidfile': $? $!\n");
    print PIDFILE "$$";
    close PIDFILE;
    # If I make it to here, I'll need to clean up the lock file
    $lockfile = $pidfile;
}
Robert Ricci's avatar
Robert Ricci committed
230

231 232 233
# Reassign $0 so that this process is easier to see in a ps/top.
$0 = "$0";

#
# Start up the event system interface.
#
fatal("Error starting events")
    if (StartEvents() != 0);

# We want to exit on any warning: print it, then bail out.
$SIG{__WARN__} = sub { print STDERR $_[0]; exit(-1); };

# Read in the timeout and valid transition information from the
# database, and then the pre-existing node states.
my %timeouts   = getTimeouts();
my %valid      = getValid();
my %modeTrans  = getModeTrans();
my %triggers   = getTriggers();
my %nodes      = readStates();
# NOTE(review): the code visible in this file indexes %timeout_tag (with
# an underscore), not this lexical - confirm whether this declaration is
# still needed or was meant to be %timeout_tag.
my %timeouttag = ();
qshow()
    if ($debug);

# Flags raised by signal handlers and acted upon by the main loop.
my $do_reload  = 0;	# reload state from the database
my $do_reopen  = 0;	# reopen the logfile
my $sigrestart = 0;
my $sigcleanup = 0;
my $exiting    = 0;

# Reloading or reopening in the middle of a pass through the main loop
# could be dangerous, so these handlers only raise a flag; the work is
# done once the current pass is finished.
$SIG{HUP} = sub {
    info("SIGHUP - Reloading DB state\n");
    $do_reload = 1;
};
$SIG{USR2} = sub {
    info("SIGUSR2 - Reopening logfile\n");
    $do_reopen = 1;
};

# Restart on USR1; clean up on any of the usual termination signals.
$SIG{USR1} = \&restart_wrap;
foreach my $signame (qw(INT QUIT ABRT TERM)) {
    $SIG{$signame} = \&cleanup_wrap;
}

# Track if I handled an event or not
my $event_count  = 0;

# Control how long I block while waiting for events
my $blockwait    = 0;
my $nextdeadline = 0;
my $mailqueue    = 0;

notify("Stated starting up\n");

#
# Poll the event system once on behalf of the main loop.
#
# Works out how long it is safe to block:
#   - mail queued:          until the next mail is due ($lastmail+$mailgap)
#   - nothing else pending: up to 600 seconds ($blockwait set)
#   - otherwise:            until $nextdeadline, or not at all if unset
# and then keeps polling while events keep arriving or wait time remains.
# Any pending signal flag ($sigrestart, $sigcleanup, $do_reload,
# $do_reopen) stops the polling so the main loop can act on it.
#
# Resets the file-level $event_count on entry; it is compared between
# iterations here to detect arriving events. May clear $blockwait.
#
sub process_event_queue() {
    $event_count = 0;
    my $lastcount = -1;
    my $wait;
    my $now = time();

    # One place that knows about all of the signal flags. The non-blocking
    # test below used to list $do_reload twice and never looked at
    # $do_reopen.
    my $signalled = sub {
        return ($sigrestart || $sigcleanup || $do_reload || $do_reopen);
    };

    debug("Polling at $now - mq=$mailqueue bw=$blockwait ndl=$nextdeadline\n");
    if ($mailqueue == 0) {
        # no messages waiting...
        if ($blockwait) {
            # we can wait a long time - nothing else will happen
            # until we get an event, or get woken up by a signal
            $wait = 600;
        } elsif ($nextdeadline > 0) {
            # only wait until the next deadline...
            $wait = $nextdeadline - $now;
        } else {
            $wait = 0;
        }
    } else {
        # mail is waiting. Only block until it is time to send it.
        $wait = $lastmail + $mailgap - $now;
        debug("Now $now, mailgap $mailgap, last $lastmail ==> wait $wait\n");
    }
    if ($wait < 0) { debug("Wait was $wait!\n"); $wait=0; }
    my $finish = $now + $wait;

    while (($event_count != $lastcount || $wait > 0) && !$signalled->()) {
        $lastcount = $event_count;

        # Don't block if we got a signal!
        if ($wait <= 0 || $signalled->()) {
            PollEvents(0, 0);
        } else {
            # NOTE(review): an older comment here said the timeout
            # parameter is in milliseconds and must be multiplied, yet
            # $wait (seconds) is passed as is - confirm the unit that
            # PollEvents() expects.
            PollEvents(1, $wait);
            $now = time();
            # subtract seconds elapsed from my wait time
            $wait = $finish - $now;

            # Events arrived while we were blocked; if there is now
            # something for the main loop to do (timeouts queued, mail
            # queued, or a signal), stop waiting.
            if ($event_count > 0 &&
                (qsize() > 0 || $mailqueue || $signalled->())) {
                $blockwait = 0;
                $wait = 0;
            }
        }
    }
    if ($event_count > 0) {
        debug("Handled $event_count event(s).\n");
    }
}
Robert Ricci's avatar
Robert Ricci committed
339

340
# Now, we just poll for events, and watch for timeouts
Robert Ricci's avatar
Robert Ricci committed
341
#
# Main loop: fire expired timeouts, reload from the database when asked
# to (or when the cached data is old), send queued mail, act on signal
# flags, and then poll for events.
#
while (1) {
    my $now = time();
    my ($deadline, $node);

    # Check for nodes that have passed their timeout. A nonzero return
    # from qhead() is handled as "no head entry" (presumably an empty
    # queue - confirm in TimeoutQueue).
    if (qhead($deadline, $node)) {
        $deadline = 0;
    } else {
        info("HEAD: $node in ".($deadline-$now).", queue=".qsize()."\n");

        while ($now >= $deadline && $node ne "") {
            qpop($deadline, $node);
            info("POP: $node in ".($deadline-$now).", queue=".qsize()."\n");

            if (exists($nodes{$node})) {
                handleCtrlEvent($node, $TBTIMEOUT);
            } else {
                #
                # If the node is no longer in the nodes array, it was most
                # likely a dynamic virtual node which is now gone. Ignore.
                # Need to look at reload() to see if we can catch this
                # earlier.
                #
                info("POP: $node is no longer in the nodes array. Skipping\n");
                delete($timeout_tag{$node});
            }

            # Flip to 1 to dump the queue after every pop.
            qshow()
                if (0);

            # Look at the new head; a blank node ends the loop.
            if (qhead($deadline, $node)) {
                $deadline = 0;
                $node     = "";
            }
        }
    }
    $nextdeadline = $deadline;

    # Reload when asked to by SIGHUP, or when our data is getting old.
    if ($do_reload || ($now - $last_reload > $reload_time)) {
        reload();
        $do_reload = 0;
    }

    # With no timeouts queued, a long blocking wait is okay.
    if (qsize() == 0) {
        $blockwait = 1;
        debug("---Blocking wait okay---\n");
    }

    # Send any messages in the queue if it is time
    notify("", 1);

    restart()
        if ($sigrestart);
    cleanup()
        if ($sigcleanup);

    if ($do_reopen) {
        ReOpenLog($LOGFILE);
        # Pass the signal on to the event reader child.
        fatal("Could not signal(USR2) event reader child\n")
            if (!kill('USR2', $eventchild));
        $do_reopen = 0;
    }

    process_event_queue();
}

exit(0);

Robert Ricci's avatar
Robert Ricci committed
401
#
# Read the current states of nodes from the database.
#
# Arguments: optionally, the previous node table as a flattened hash
#            (node_id => entry).
# Returns:   a hash of node_id => { tag, state, timestamp, mode,
#            mode_timestamp, notified, timedout, noretry }.
#
# An unchanged node keeps its old entry, so that bookkeeping such as
# "already notified ops" survives a reload. Nodes whose stated_tag does
# not match our -t tag get their timeout removed; new or changed nodes
# that do match get a timeout set from their state timestamp.
#
sub readStates(;@) {
    my %oldnodes = @_;

    my $result = DBQueryFatal("SELECT node_id, eventstate, " .
                              "state_timestamp, op_mode, " .
                              "op_mode_timestamp, stated_tag FROM nodes ".
                              "where node_id not like 'sh%'");

    my %nodes;
    while (my ($node_id, $state, $timestamp, $mode, $mode_timestamp, $tag)
           = $result->fetchrow()) {
        # A NULL stated_tag is the same as the empty tag.
        $tag = ""
            if (!defined($tag));
        $nodes{$node_id}{"tag"} = $tag;

        if ($dbtag ne "" && $dbtag eq $tag) {
            info("This stated will work on $node_id\n");
        }
        if ($dbtag eq "" && $dbtag ne $tag) {
            info("This stated will *NOT* work on $node_id\n");
        }
        if ($dbtag ne $tag) {
            remTimeout($node_id);
        }

        #
        # If there's an entry in oldnodes for this node, and it
        # hasn't changed state or time, use the old entry (so that
        # we don't lose information about which nodes we've already
        # notified the ops about, etc.)
        #
        # The tag has to be unchanged as well. Reusing the old entry
        # replaces the whole record, so the tag just read from the
        # database would otherwise be overwritten by the stale one and a
        # change of stated_tag would go unnoticed until the node's next
        # state change.
        #
        my $old = $oldnodes{$node_id};
        my $tag_unchanged =
            ($old && defined($old->{"tag"}) && $old->{"tag"} eq $tag);

        if ($old && $state && $timestamp && $tag_unchanged &&
            ($old->{state} eq $state) &&
            ($old->{mode} eq $mode) &&
            ($old->{timestamp} == $timestamp)) {
            $nodes{$node_id} = $old;
        } else {
            $nodes{$node_id}{state}          = $state;
            $nodes{$node_id}{timestamp}      = $timestamp;
            $nodes{$node_id}{mode}           = $mode;
            $nodes{$node_id}{mode_timestamp} = $mode_timestamp;
            $nodes{$node_id}{notified}       = 0;
            $nodes{$node_id}{timedout}       = 0;
            $nodes{$node_id}{noretry}        = 0;
            # Is there a timeout? If so, set it up!
            if (defined($timestamp) && $dbtag eq $tag) {
                setTimeout($mode, $state, $node_id, $timestamp);
            }
        }
    }
    return %nodes;
}

#
# Load the per-state timeouts from the state_timeouts table.
#
# Returns a hash indexed as $timeouts{op_mode}{state}; each value is a
# two element array reference, [ timeout, action ].
#
sub getTimeouts() {
    my %table;

    my $query_result =
        DBQueryFatal("SELECT op_mode, state, timeout, action " .
                     "FROM state_timeouts");

    while (my @row = $query_result->fetchrow()) {
        my ($op_mode, $state, $timeout, $action) = @row;

        $table{$op_mode}{$state} = [ $timeout, $action ];
    }
    return %table;
}

#
# Load the valid state transitions from the state_transitions table.
#
# Returns a hash in which $valid{op_mode}{state1}{state2} is 1 for every
# transition state1 -> state2 listed for that op_mode.
#
sub getValid() {
    my %table;

    my $query_result =
        DBQueryFatal("SELECT op_mode, state1, state2 " .
                     "FROM state_transitions");

    while (my @row = $query_result->fetchrow()) {
        my ($op_mode, $from, $to) = @row;

        $table{$op_mode}{$from}{$to} = 1;
    }
    return %table;
}

490 491 492 493 494 495 496 497
#
# Decide whether a node in op mode $mode may go from $oldstate to
# $newstate, according to the file-level %valid table.
#
# Returns 1 if the table lists either the exact transition or a wildcard
# ("*") transition into $newstate for that mode, and 0 otherwise. A mode
# with no transitions at all has no valid transitions.
#
sub IsValidTransition($$$)
{
    my ($mode, $oldstate, $newstate) = @_;

    return 0
        if (!exists($valid{$mode}));

    # Try the specific source state first, and then the wildcard.
    foreach my $from ($oldstate, "*") {
        next
            if (!exists($valid{$mode}{$from}));
        next
            if (!exists($valid{$mode}{$from}{$newstate}));

        return 1
            if ($valid{$mode}{$from}{$newstate} == 1);
    }

    # Nothing matched, so the transition is invalid.
    return 0;
}

521 522 523 524
#
# Load the valid op mode transitions from the mode_transitions table.
#
# Returns a hash keyed by "op_mode1:state1". Each value is a reference
# to the list of "op_mode2:state2" strings reachable from that key, in
# the order in which the query returned them.
#
sub getModeTrans() {
    my %table;

    my $query_result =
        DBQueryFatal("SELECT op_mode1, state1, op_mode2, state2 " .
                     "FROM mode_transitions order by op_mode1,state1");

    while (my @row = $query_result->fetchrow()) {
        my ($mode1, $state1, $mode2, $state2) = @row;

        # The list springs into existence on the first push.
        push(@{ $table{"$mode1:$state1"} }, "$mode2:$state2");
    }
    return %table;
}

#
# Load the states which trigger an action from the state_triggers table.
#
# Returns a hash whose values are references to lists of trigger names
# (the `trigger` column split on commas). Keys are
#   "op_mode:state"          for global triggers (node_id is the wildcard)
#   "node_id:op_mode:state"  for per-node triggers
#
# How the different kinds of trigger relate to each other:
#   - Per-node triggers apply to one node in a particular mode/state and
#     are run first of all.
#   - Global triggers apply to every node in a given mode/state and are
#     run after any per-node triggers.
#   - Any-mode triggers are tied to a state regardless of mode, and may
#     be either global or per-node. A global trigger overrides them: if
#     both an any-mode trigger for state XYZ and a global trigger for
#     mode FOOBAR state XYZ exist, then on arrival in XYZ the per-node
#     triggers run, followed by only the global trigger when in mode
#     FOOBAR, or only the any-mode trigger when in any other mode.
#
# The "*" wildcard is stored as $TBANYMODE; it serves both as the node_id
# of a global trigger and as the mode of an any-mode trigger.
#
sub getTriggers() {
    debug("getTriggers called\n");

    debug("anymode ==> '$TBANYMODE'\n");

    my %t;

    # Global triggers (including any-mode) come first.
    my $query_result =
        DBQueryFatal("SELECT op_mode, state, `trigger` " .
                     "FROM state_triggers where node_id='$TBANYMODE' ".
                     "order by op_mode,state");
    while (my @row = $query_result->fetchrow()) {
        my ($mode, $state, $spec) = @row;
        my @names = split(/\s*,\s*/, $spec);

        $t{"$mode:$state"} = \@names;
        debug("trig($mode:$state)\t => ".join(',',@names)."\n");
    }

    # Per-node triggers (including any-mode) follow.
    $query_result =
        DBQueryFatal("SELECT node_id, op_mode, state, `trigger` " .
                     "FROM state_triggers where node_id!='$TBANYMODE' ".
                     "order by op_mode,state");
    while (my @row = $query_result->fetchrow()) {
        my ($node_id, $mode, $state, $spec) = @row;
        my @names = split(/\s*,\s*/, $spec);

        $t{"$node_id:$mode:$state"} = \@names;
        debug("trig($node_id:$mode:$state)\t => ".join(',',@names)."\n");
    }

    debug(hash_recurse(%t));

    return %t;
}

Robert Ricci's avatar
Robert Ricci committed
598 599 600 601
#
# Gets called for every event that we receive.
#
# Arguments: the event's object type, object name and event type. For
# everything except a command event the object name is a node id.
#
# Counts the event, drops it if the node belongs to another stated
# instance, and otherwise hands it to the handler for its object type.
#
sub handleEvent($$$) {
    my ($objtype, $objname, $eventtype) = @_;

    $event_count++;

    #
    # The main stated (no tag) logs every event right away. A tagged
    # (debugging) instance would drown in those, so it logs further
    # down, once the event is known to be for one of its own nodes.
    #
    debug("Got an event: ($objtype,$objname,$eventtype)\n")
        if ($dbtag eq "");

    #
    # Check to see if another instance is supposed to be handling this node
    #
    if ($objtype ne $TBCOMMAND) {
        my $node = $objname;

        #
        # A node we have never seen triggers a reload. If it is still
        # unknown after that, someone screwed up; note that this path
        # could end up churning via reload().
        #
        if (!defined($nodes{$node})) {
            reload();

            if (!defined($nodes{$node})) {
                notify("Got $objtype/$eventtype for nonexistent $node!\n");
                return;
            }
        }

        #
        # Ignore nodes whose stated_tag does not match the tag given on
        # the command line. Only the main stated records that it did so.
        #
        if ($dbtag ne $nodes{$node}{"tag"}) {
            info("Got $objtype/$eventtype for $node, which is not mine\n")
                if ($dbtag eq "");
            return;
        }
        if (!checkDBRedirect($node)) {
            info("Got $objtype/$eventtype for $node, which is not mine\n");
            return;
        }
    }

    #
    # Tagged instances log here, now that the event is known to be ours.
    #
    debug("Got an event: ($objtype,$objname,$eventtype)\n")
        if ($dbtag ne "");

    # Dispatch on the object type. The single pass "for" makes $objtype
    # the topic, so the bare pattern matches below test it; the first
    # pattern that matches wins.
    for ($objtype) {
        if (/$TBNODESTATE/) {
            stateTransition($objname, $eventtype);
        }
        elsif (/$TBNODEOPMODE/) {
            opModeTransition($objname, $eventtype);
            notify("Use of deprecated event TBNODEOPMODE:\n".
                   "$objname->$eventtype\n");
        }
        elsif (/$TBCONTROL/) {
            handleCtrlEvent($objname, $eventtype);
        }
        elsif (/$TBCOMMAND/) {
            handleCommand($objname, $eventtype);
        }
    }

}

sub stateTransition($$) {

686
    my ($node,$newstate) = @_;
Robert Ricci's avatar
Robert Ricci committed
687

688 689
    # Check for invalid transitions
    my ($oldstate, $mode);
690 691 692
    $oldstate = $nodes{$node}{state};
    $mode     = $nodes{$node}{mode};

693
    if ($oldstate && $mode && !IsValidTransition($mode,$oldstate,$newstate)) {
694
	notify("Invalid transition for node $node from $mode/$oldstate " .
695
	       "to $newstate\n");
696
	
697 698 699 700 701 702
	#
	# Machines in the secure boot path are not allowed to jump
	# willy-nilly into unknown states.
	#
        if ($mode eq TBDB_NODEOPMODE_SECUREBOOT ||
	    $mode eq TBDB_NODEOPMODE_SECURELOAD) {
703 704 705
            $newstate = TBDB_NODESTATE_SECVIOLATION;
            notify("Moving $node to $newstate because it's in $mode\n");
        }
706
    }
707

708 709 710 711 712
    #
    # Nodes that are in the SECVIOLATION state are not allowed to leave!
    #
    if ($oldstate eq TBDB_NODESTATE_SECVIOLATION &&
	$newstate ne TBDB_NODESTATE_SECVIOLATION) {
713
	#
714 715 716
	# Allow transitions to SHUTDOWN/POWEROFF.
	# This allows someone to reboot (turn-off and back on) a node in
	# the SECVIOLATION state getting it back to MINIMAL/SHUTDOWN.
717 718 719
	#
	# XXX DEBUG ONLY!
	# 
720 721
	if ($soft_secviolation &&
	    ($newstate eq TBDB_NODESTATE_SHUTDOWN ||
722
	     $newstate eq TBDB_NODESTATE_POWEROFF ||
723 724
	     $newstate eq TBDB_NODESTATE_GPXEBOOTING)) {
	    notify("$node allowed to transition: SECVIOLATION => $newstate\n");
725 726 727 728
	} else {
	    notify("$node tried to leave SECVIOLATION (to $newstate)\n");
	    $newstate = TBDB_NODESTATE_SECVIOLATION;
	}
729
    }
Robert Ricci's avatar
Robert Ricci committed
730

731 732 733 734
    my $now = time();
    $nodes{$node}{state}     = $newstate;
    $nodes{$node}{timestamp} = $now;
    $nodes{$node}{notified}  = 0;
735

736 737 738
    info("$node: $mode/$oldstate => $mode/$newstate\n");
    DBQueryFatal("UPDATE nodes SET eventstate='$newstate', " .
		 "state_timestamp='$now' WHERE node_id='$node'");
739

740 741 742 743
    # Before we set the timeout (overwriting any current ones), we need
    # to check if we had a pending command
    if (qfind($node) &&
	$timeout_tag{$node} =~ /^$TBCOMMAND:/) {
744
        debug("TimeoutTag = '$timeout_tag{$node}'\n");
745
	my ($str,$cmd) = split(":",$timeout_tag{$node});
746
	debug("str=$str\tcmd=$cmd\tTBREBOOT=$TBREBOOT\tstate=$newstate\n");
747
	if ($cmd eq $TBREBOOT) {
748
	    if ($newstate eq TBDB_NODESTATE_SHUTDOWN ) {
749 750 751
		info("$node: $TBREBOOT success\n");
		# Timeout will get cleared below by setTimeout call
	    } else {
752 753
		notify("$node: $TBREBOOT in progress, but got state ".
		       "$newstate instead of ".TBDB_NODESTATE_SHUTDOWN."!\n");
754 755 756 757 758
	    }
	#} elsif ($cmd eq $FOO ) {
	    # Add more here...
	} else {
	    notify("$node: Unknown command timeout '$timeout_tag{$node}' ".
759
		   "found at $mode/$newstate\n");
760 761 762
	}
    }

763 764 765 766 767
    #
    # Check if this state has a timeout, and if so, put it in the queue.
    # Note that any opmode transition below will replace (or remove) this
    # timeout if appropriate.
    #
768 769
    setTimeout($mode,$newstate,$node,$now);

Mac Newbold's avatar
Mac Newbold committed
770 771 772 773 774 775 776
    # Check if this state has any triggers
    my @nodetrigs = GetNodeTriggerList($node,$mode,$newstate,1);
    my @trigs = GetNodeTriggerList($node,$mode,$newstate);
    if (@trigs > 0) {
	debug("Running triggers: ".join("/",@trigs)."\n");
	foreach ( @trigs) {
	    my $trig = $_;
777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793
	    /^SCRIPT:([-\w\/]+)$/ && do {
		#
		# Run a script. No arguments at the moment.
		#
		my $script = $1;

		#
		# If the path is absolute, run it. Otherwise it has to
		# come from the sbin directory.
		#
		if (! ($script =~ /^\//)) {
		    $script = "$TB/sbin/$script";
		}
		info("$node: Running $script.\n");
		system("$script &");
		next;
	    };
Mac Newbold's avatar
Mac Newbold committed
794
	    /^$PXEBOOT$/ && do {
795
		#
Mac Newbold's avatar
Mac Newbold committed
796 797 798 799 800 801 802 803 804 805 806 807
		# See if we jumped into the PXEBOOT kernel. Bootinfo
		# will send PXEBOOTING every time a node contacts it,
		# which is our indicator that the node is in the first
		# phase of booting. At this point we want to switch
		# state machines since the entire boot process is
		# governed by a single state machine that is
		# independent of the OS that the node will eventually
		# boot.  Rather than encode that in each state
		# machine, we use a special machine with a defined
		# entrypoint (PXEBOOTING) and a defined exitpoint
		# (BOOTING). See below for where we jump back out of
		# this state machine.
808
		#
Mac Newbold's avatar
Mac Newbold committed
809 810 811 812
		# Jumped in. We need to change the opmode so that
		# the state transitions are legal. We do not
		# bother to save the old opmode since we can
		# figure it out later when we leave.
813
		#
Mac Newbold's avatar
Mac Newbold committed
814 815 816 817 818
		debug("Running $PXEBOOT trigger\n");
		if ($mode ne $PXEKERNEL) {
		    info("$node: Forcing mode transition into $PXEKERNEL!\n");
		    opModeTransition($node, $PXEKERNEL, 1);
		    $mode=$PXEKERNEL;
819
		}
Mac Newbold's avatar
Mac Newbold committed
820 821
		next;
	    };
822 823 824 825 826 827
	    /^$SECUREBOOT$/ && do {
		#
		# Force machine into the SECUREBOOT/LOAD op_mode.
		# Currently triggered by receipt of GPXEBOOTING state.
		# This could come from any state as it just indicates that
		# a machine with a gPXE dongle has rebooted.
828
		#
829 830 831
		# To differentiate BOOT from LOAD:
		# if next_op_mode is SECURELOAD, goto SECURELOAD
		# else if mode/state is SECURELOAD/SHUTDOWN, goto SECURELOAD
832
		# else if mode/state is SECURELOAD/REBOOTING, goto SECURELOAD
833
		# else goto SECUREBOOT.
834
		#
835 836 837 838
		# The SHUTDOWN case handles os_load.
		# The REBOOT case handles the forced reboot following PXEWAIT
		# (inflicted by bootinfo--see bootinfo_mysql.c).
		#
839 840 841 842
		my $query_result =
		    DBQueryWarn("select next_op_mode from nodes ".
				"where node_id='$node'");
		my ($nextmode) = $query_result->fetchrow();
843 844 845
		if (!$nextmode) {
		    $nextmode = $mode;
		}
846
		info("Running $SECUREBOOT trigger with $nextmode/$oldstate\n");
847
		if ($nextmode ne TBDB_NODEOPMODE_SECURELOAD ||
848 849
		    ($oldstate ne TBDB_NODESTATE_SHUTDOWN &&
		     $oldstate ne TBDB_NODESTATE_REBOOTING)) {
850 851 852 853 854 855
		    $nextmode = TBDB_NODEOPMODE_SECUREBOOT;
		}
		if ($mode ne $nextmode) {
		    info("$node: Forcing mode transition to $nextmode!\n");
		    opModeTransition($node, $nextmode, 1);
		    $mode=$nextmode;
856 857 858
		}
		next;
	    };
Mac Newbold's avatar
Mac Newbold committed
859 860 861 862 863
	    /^$BOOTING$/ && do {
		#
		# See if we are in the right mode/osid.
		#
		my ($bootosid,$bootopmode) = TBBootWhat($node, $debug);
864 865 866 867 868 869 870
		if (!defined($bootosid)) {
		    info("$node: TBBootWhat say node doesn't exist!?");
		    next;
		} elsif ($bootosid == 0) {
		    info("$node: should be in PXEWAIT, why are we here?!");
		    next;
		}
871

872 873 874 875 876 877 878
		# XXX defensive programming, this happened once
		if (!defined($bootopmode)) {
		    info("$node: TBBootWhat did not return op_mode ".
			 "(osid=$bootosid)!? Remaining in $mode");
		    next;
		}

Mac Newbold's avatar
Mac Newbold committed
879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944
		info("$node: BootWhat says $bootosid (mode $bootopmode).\n");
		DBQueryFatal("update nodes set osid='$bootosid' ".
			     "where node_id='$node'");

		if ($bootopmode ne $mode) {
		    if ($mode eq $PXEKERNEL) {
			#
			# If we came from PXE boot, then we have to
			# jump out of the PXEKERNEL state machine into
			# whatever state machine is current for the
			# node. Since we came through bootinfo, we
			# know that the node is doing what it is
			# supposed to, and that this change matches
			# what the node is booting.
			#
			info("$node: Forcing mode transition out of $PXEKERNEL!\n");
			opModeTransition($node, $bootopmode, 1);
			$mode=$bootopmode;
		    }
		    elsif ($oldstate eq TBDB_NODESTATE_ISUP) {
			#
			# Skipped SHUTDOWN, which could result in a
			# missed opmode transition. Can this really
			# happen anymore?
			#
			info("$node: Came from ISUP! ".
			     "Checking for mode transition\n");
			my $query_result =
			  DBQueryWarn("select next_op_mode from nodes ".
				      "where node_id='$node'");
			my ($nextmode) = $query_result->fetchrow();
			if ($nextmode) {
			    info("$node: Forcing mode transition!\n");
			    opModeTransition($node, $nextmode, 1);
			    $mode=$nextmode;
			}
		    }
		    else {
			my $str = "$node is running $bootosid, but in ".
			  "mode $mode\ninstead of mode $bootopmode!\n";
			
			if ($bootopmode eq "RELOAD") {
			    #
			    # For now, only force if we're going into
			    # reload mode, so we don't get stuck
			    # looping in reloading.  Can this happen
			    # anymore?
			    #
			    DBQueryFatal("UPDATE nodes SET ".
					 "op_mode='$bootopmode', ".
					 "op_mode_timestamp=".
					 "unix_timestamp(now()) ".
					 "WHERE node_id='$node'");
			    $nodes{$node}{mode} = $bootopmode;
			    $nodes{$node}{mode_timestamp} = $now;
			    $str .= "Forced op_mode to $bootopmode.\n";
			}
			notify($str);
		    }
		}
		next;
	    };
	    /^$CHECKGENISUP$/ && do {
		checkGenISUP($node);
		next;
	    };
945 946 947 948
	    /^CHECKPORTREG$/ && do {
		CheckPortRegistration($node);
		next;
	    };
949 950 951 952 953 954
	    /^$TBPXERESET$/ && do {
		# We successfully booted, so reset one-shot PXEboot if any
		# Check if we really need to do a reset
		handleCtrlEvent($node,$trig);
		next;
	    };
955
	    /^$TBRESET$/ && do {
956 957 958
		# We successfully booted, so clear some flags
		$nodes{$node}{noretry}   = 0;
		$nodes{$node}{timedout}  = 0;
959 960 961 962
		# Check if we really need to do a reset
		my $r = DBQueryWarn("select osid,def_boot_osid from nodes ".
				    "where node_id='$node'");
		my ($osid,$defosid) = $r->fetchrow();
963 964 965 966 967 968 969
		if (! (defined($osid) && defined($defosid))) {
		    info("$node: osid not defined\n")
			if (!defined($osid));
		    info("$node: def_boot_osid not defined\n")
			if (!defined($defosid));
		}
		elsif ($osid ne $defosid) {
970 971
		    handleCtrlEvent($node,$trig);
		}
972 973 974 975 976
		# XXX tmp hack
		elsif ($mode eq "WIMRELOAD") {
		    info("$node: RESET in WIMRELOAD, but osid=defosid=$osid\n");
		    handleCtrlEvent($node,$trig);
		}
977 978
		next;
	    };
979
	    (/^$TBRELOADDONEV1$/ || /^$TBRELOADDONEV2$/) && do {
980 981 982
		handleCtrlEvent($node,$trig);
		next;
	    };
983 984 985 986 987 988 989 990
	    /^$TBISUP$/ && do {
		info("$node: Triggered $TBISUP\n");
		EventSendWarn(host      => $BOSSNODE ,
			      objtype   => TBDB_TBEVENT_NODESTATE ,
			      eventtype => TBDB_NODESTATE_ISUP ,
			      objname   => $node);
		next;
	    };
991
	    (/^$TBREBOOT$/ || /^$TBPOWERCYCLE$/ || /^$TBPOWEROFF$/) && do {
992
		handleCommand($node,$trig);
993 994 995
		next;
	    };
            (/^EMAILNOTIFY$/) && do {
996 997
		my $msg = "$node entered state $mode/$newstate from " .
		    "$mode/$oldstate";
998
		my $dest = $REALTBOPS;
999
		if ($newstate eq TBDB_NODESTATE_SECVIOLATION) {
1000 1001 1002 1003 1004 1005 1006 1007
		    if ($soft_secviolation) {
			$msg .= "\n\nNode $node was allowed to continue.\n";
			$dest = $TBOPS;
		    } else {
			$msg .= "\n\nNode $node has been powered off.\n" .
			        "You must address the cause of the violation ".
				"and reset the eventstate before powering on.";
		    }
1008
		}
1009 1010
		SENDMAIL($dest, "STATED: $node entered state $newstate",
			 $msg, "Stated Daemon <".$TBOPS.">");
1011
		next;
1012
            };
1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024
	    /^RELOADOLDMFS$/ && do {
		my $frisbee_osid = TBNodeDiskloadOSID($node);
		my $frisbee_name = DBQuerySingleFatal("select osname from os_info where osid=$frisbee_osid");
		my $msg = 
		    ("Attempted to load multiple images on $node using an old Frisbee MFS.\n".
		     "To make this work please update the $frisbee_name MFS image.\n");
		SENDMAIL($REALTBOPS,
			 "$frisbee_name Needs Updating",
			 $msg,
			 "Stated Daemon <".$TBOPS.">");
		next;
	    };
1025
	    notify("Unknown trigger '$trig' for $node in $mode/$newstate!\n");
1026
	}
Mac Newbold's avatar
Mac Newbold committed
1027 1028 1029 1030 1031 1032 1033
	# Clear any of the node triggers that we ran.
	# (Don't clear all of them, because some of the triggers we ran
	# may have caused others to be set, and we don't want to nuke them.)
	if (@nodetrigs > 0) {
	    debug("Clearing node triggers: ".join("/",@nodetrigs)."\n");
	    ClearNodeTrigger($node,$mode,$newstate,@nodetrigs);
	}
1034
    }
1035

1036 1037 1038 1039 1040 1041 1042 1043
    # Check if this state can trigger a mode transition
    if (defined($modeTrans{"$mode:$newstate"})) {
	info("$node: Checking for mode transition\n");
	my $r = DBQueryWarn("select next_op_mode from nodes ".
			    "where node_id='$node'");
	my ($nextmode) = $r->fetchrow();
	if ($nextmode) {
	    opModeTransition($node,$nextmode);
Mac Newbold's avatar
Mac Newbold committed
1044 1045 1046
	} else {
	    debug("No next mode.\n");
	}
1047 1048
    }
}
1049

1050
#
# opModeTransition(node, newmode [, force])
#
# Move a node into op_mode $newmode, updating both the in-memory %nodes
# record and the nodes table in the database.
#
#   node    - node_id of the node being moved.
#   newmode - op_mode the node should end up in.
#   force   - optional; when true, the change is made without complaint
#             even if the current mode/state is not a listed mode
#             transition point. Defaults to 0.
#
# If %modeTrans has an entry for the node's current "mode:state", its
# elements are joined and split on ':' and ',' into a map from target
# mode to the state the node should land in. When $newmode is in that
# map, the node's state is changed along with its mode; otherwise the
# node keeps its current state.
#
# Note that an invalid (unforced) transition is reported via notify(),
# but the mode change is still applied afterwards.
#
sub opModeTransition($$;$) {
    my ($node, $newmode, $force) = @_;
    $force = 0
	unless (defined($force));

    info("$node: Mode change to $newmode requested ($force)\n");

    # Where the node is right now, according to our in-memory table.
    my $curstate = $nodes{$node}{state};
    my $curmode  = $nodes{$node}{mode};
    my $transkey = "$curmode:$curstate";

    # State to land in; left unset if the transition table does not
    # name one for $newmode.
    my $landing;

    if (defined($modeTrans{$transkey})) {
	debug("Mode Transition check:\n");

	# Flatten the list of transitions into a target-mode => state map.
	my %statefor = split(/[:,]/, join(",", @{$modeTrans{$transkey}}));

	debug("Valid transitions from $curmode/$curstate are:\n");
	for my $tomode (sort keys %statefor) {
	    debug("$tomode => $statefor{$tomode}\n");
	}

	if (defined($statefor{$newmode})) {
	    $landing = $statefor{$newmode};
	    info("$node: opMode force changing state along with mode\n")
		if ($force);
	}
	elsif (!$force) {
	    notify("Invalid mode transition for $node from ".
		   "$curmode/$curstate to $newmode!\n");
	}
    }
    elsif (!$force) {
	# Not forced, and the current mode/state is not a transition point.
	notify("Invalid mode transition for $node from $curmode/$curstate: ".
	       "Not a valid mode transition state!\n");
    }

    # No state associated with the transition; remain in the old state.
    $landing = $curstate
	unless ($landing);

    # Record the new mode/state in memory, and clear the notified flag.
    my $now = time();
    @{$nodes{$node}}{qw(state timestamp mode mode_timestamp notified)} =
	($landing, $now, $newmode, $now, 0);

    info("$node: $curmode/$curstate => $newmode/$landing\n");

    # Mirror the change into the database; next_op_mode is cleared too.
    DBQueryFatal("UPDATE nodes SET eventstate='$landing', ".
		 "next_op_mode='', op_mode='$newmode', ".
		 "state_timestamp='$now', ".
		 "op_mode_timestamp='$now' WHERE node_id='$node'");

    # Check if this state has a timeout, and if so, put it in the queue
    setTimeout($newmode, $landing, $node, $now);
}

sub handleCtrlEvent($$) {
    my ($node,$event) = @_;
1115

1116 1117 1118 1119 1120 1121 1122
    # stated should use the node objects.
    my $nodeobj = Node->Lookup($node);
    if (!defined($nodeobj)) {
	notify("handleCtrlEvent: Could not lookup node object for $node!\n");
	return;
    }
    $nodeobj->FlushReserved();
1123
    info("CtrlEvent: $node, $event\n");
1124

1125
    foreach ($event) {
1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140
	/^$TBPXERESET$/ && do {
	    #
	    # Clear next_pxe_boot_path with pxe_select.
	    #
	    # Note that this will recreate the DHCPD config file and HUP it.
	    # It will also ssh over to any subbosses and do the same.
	    # Thus there is lots of potential to get hung or take a long time.
	    #
	    $cmd = "$pxeselect -d -c -1 $node";
	    system($cmd) and
		notify("$node/$event: Could not clear next_pxe_boot_path!\n");

	    info("Performed $TBPXERESET for $node\n");
	    next;
	};
1141
	/^$TBRESET$/ && do {
1142 1143
	    # Clear next_boot_path.
	    if ($nodeobj->OSSelect(undef, "next_boot_osid", 1) != 0) {
1144
		notify("$node/$event: Could not clear next_boot_path!\n");
1145
	    }
Mac Newbold's avatar
Mac Newbold committed
1146 1147

	    info("Performed $TBRESET for $node\n");
1148 1149
	    next;
	};
1150
	(/^$TBRELOADDONEV1$/ || /^$TBRELOADDONEV2$/) && do {
1151
	    info("Clearing reload info for $node\n");
1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
	    $nodeobj->ClearCurrentReload();
	    $nodeobj->FlushReserved();
	    my $experiment = $nodeobj->Reservation();
	    if (defined($experiment) &&
		$experiment->pid() eq NODERELOADING_PID &&
		$experiment->eid() eq NODERELOADING_EID) {
		$nodeobj->ClearSchedReload();
		my $reserved_pid = $nodeobj->CheckPreReserve(1);
		if (defined($reserved_pid)) {
		    info("Setting pre reserve for $node to $reserved_pid\n");
		}
		$nodeobj->ClearReservation();
		$nodeobj->SetNodeHistory(TB_NODEHISTORY_OP_FREE,
					 undef, $experiment);
		info("Released $node from $experiment\n");
1167
	    }
1168 1169 1170 1171 1172
	    if ($event eq $TBRELOADDONEV2) {
		info("Sending an apod to $node\n");
		system("$apod $node") == 0 or
		    notify("Could not apod $node after $TBRELOADDONEV2!\n");
	    }
1173