stated.in 37.5 KB
Newer Older
Robert Ricci's avatar
Robert Ricci committed
1
#!/usr/bin/perl -w
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2 3
#
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
5 6 7
# All rights reserved.
#

Robert Ricci's avatar
Robert Ricci committed
8 9 10
#
# stated - A daemon to monitor the states of nodes in the testbed. Receives
# state change notification through the event system, and writes the new
11 12
# state into the database. Also watches for invalid transitions, timeouts, 
# and performs other state-related control functions.
Robert Ricci's avatar
Robert Ricci committed
13 14 15 16
#
# Send it a HUP signal to get it to reload the timeout and transition
# information. Periodically reloads this information regardless, though.
#
17
# Will restart when sent SIGUSR1, by exec'ing its executable again.
Robert Ricci's avatar
Robert Ricci committed
18
#
19

Robert Ricci's avatar
Robert Ricci committed
20 21
# Configure variables
use lib '@prefix@/lib';
22
my $TB = "@prefix@";
23
my $REALTB = "/usr/testbed"; # So we know if we're the "real" stated or not
Robert Ricci's avatar
Robert Ricci committed
24
my $BOSSNODE = "@BOSSNODE@";
25
my $TBOPS = "@TBSTATEDEMAIL@";
26
my $REALTBOPS = "@TBOPSEMAIL@";
27
my $TBDBNAME = "@TBDBNAME@";
28
my $REALTBDBNAME = "tbdb"; # So we know if we're using the "real" db
29
my $osselect = "$TB/bin/os_select";
30 31
my $nodereboot = "$TB/bin/node_reboot";
my $power = "$TB/bin/power";
Robert Ricci's avatar
Robert Ricci committed
32 33 34 35 36 37

$| = 1;

use event;
use libdb;
use libtestbed;
38
use TimeoutQueue;
Robert Ricci's avatar
Robert Ricci committed
39
use Getopt::Std;
40
#use strict;
Robert Ricci's avatar
Robert Ricci committed
41
use English;
Mac Newbold's avatar
Mac Newbold committed
42 43
use POSIX;			# for strftime, and sigprocmask and friends
use Fcntl;			# file constants for pidfile
Mac Newbold's avatar
Mac Newbold committed
44 45 46 47 48
use Sys::Syslog;
# Important note about syslog: It defaults to using an inet socket,
# but 'syslogd -s' (the default) doesn't listen for one. So either
# run syslogd without -s, or use setlogsock('unix') before openlog.
# (To get setlogsock: 'use Sys::Syslog qw(:DEFAULT setlogsock);' )
Robert Ricci's avatar
Robert Ricci committed
49

Mac Newbold's avatar
Mac Newbold committed
50 51 52
# Do lots of db retries before we fail and die
$libdb::DBQUERY_MAXTRIES = 5;

Robert Ricci's avatar
Robert Ricci committed
53 54 55
# Number of iterations (roughly, seconds) after which we'll reload 
# information from the database. This is so we don't end up with information
# that's _too_ out of sync.
56
my $reload_time = 600;
57
my $last_reload = time;
Robert Ricci's avatar
Robert Ricci committed
58 59 60 61

# Process command-line arguments

sub usage {
    # Print the command-line help text on the currently selected output
    # handle. The value of print() is returned, and callers pass it
    # straight to exit().
    my $help = <<"END";
Usage: $0 [-h] [-d] [-s server] [-p port]
-h              This message
-d              Turn on debugging output, and don't go into the background
-s server       Use specified server, instead of this site's bossnode
-p port	        Use specified port
Send SIGHUP to reload database state, or SIGUSR1 to restart completely.
END
    return print $help;
}

Mac Newbold's avatar
Mac Newbold committed
72
# Only root should run this - it won't work when run as a user...
# (Or, let an admin run it if it isn't the real one in /usr/testbed/ )
if ($UID && ( $TB eq $REALTB || ! TBAdmin($UID) ) ) {
    die("Only root can run this script!\n");
}

# Save a copy of the raw arguments for restart before getopts consumes them.
my @args = @ARGV;
my %opt = ();

# getopts() returns false on an unknown switch or a missing option argument.
# Treat that the same as -h or stray positional arguments: print the usage
# text and exit with whatever usage() returns, instead of silently running
# with a partially parsed command line.
if (!getopts("ds:p:h",\%opt) || $opt{h} || @ARGV) {
    exit usage();
}

# -s overrides the configured boss node; -p is optional and stays undefined
# when it is not given; -d selects debugging mode.
my $server = $opt{s} ? $opt{s} : $BOSSNODE;
my $port   = $opt{p} ? $opt{p} : undef;
my $debug  = $opt{d} ? 1 : 0;
Robert Ricci's avatar
Robert Ricci committed
103

104
# Grab some constants into variables, so that they can be interpolated
# into strings and regular expressions below.

# Wildcard op mode
my $TBANYMODE    = TBDB_NODEOPMODE_ANY;

# Control events, and the "no timeout" marker
my $TBRESET      = TBDB_TBCONTROL_RESET;
my $TBRELOADDONE = TBDB_TBCONTROL_RELOADDONE;
my $TBTIMEOUT    = TBDB_TBCONTROL_TIMEOUT;
my $TBNOTIMEOUT  = TBDB_NO_STATE_TIMEOUT;

# Event object types that we subscribe to
my $TBNODESTATE  = TBDB_TBEVENT_NODESTATE;
my $TBNODEOPMODE = TBDB_TBEVENT_NODEOPMODE;
my $TBCONTROL    = TBDB_TBEVENT_CONTROL;
my $TBCOMMAND    = TBDB_TBEVENT_COMMAND;

# Commands carried by TBCOMMAND events
my $TBREBOOT     = TBDB_COMMAND_REBOOT;
my $TBPOWEROFF   = TBDB_COMMAND_POWEROFF;
my $TBPOWERON    = TBDB_COMMAND_POWERON;
my $TBPOWERCYCLE = TBDB_COMMAND_POWERCYCLE;

my $TB_OSID_MBKERNEL = TB_OSID_MBKERNEL;

# This only gets used here, so it isn't in a lib constant.
my $TBFREENODE = "FREENODE";

# Set up some notification throttling: at most one mail every $mailgap
# seconds. $lastmail is backdated so that a digest of the startup
# messages goes out after 2 seconds.
my $mailgap  = 15;		# in seconds
my $lastmail = time() - $mailgap + 2;
my %msgs     = ();

Mac Newbold's avatar
Mac Newbold committed
128
# Pick the pidfile location: the "real" stated uses /var/run, any other
# install tree keeps its lock under its own prefix.
my $pidfile = ($TB eq $REALTB) ? "/var/run/stated.pid"
                               : "$TB/locks/stated.pid";
debug("Using pidfile $pidfile\n");

# If a pidfile is already there, decide whether its owner is still alive.
if (-e $pidfile) {
    # Read the recorded pid directly instead of shelling out to cat, and
    # accept it only if it is a plain positive integer.
    my $otherpid = "";
    if (open(my $pidfh, "<", $pidfile)) {
        my $line = <$pidfh>;
        close($pidfh);
        if (defined($line) && $line =~ /^\s*(\d+)\s*$/ && $1 > 0) {
            $otherpid = $1;
        }
    }

    # Signal 0 delivers nothing; it only tests whether the pid can be
    # signalled. EPERM means the process exists but belongs to another
    # user, which still counts as running. This avoids the false matches
    # that grepping ps output for the pid as a substring can produce.
    my $running = 0;
    if ($otherpid ne "" && (kill(0, $otherpid) || $!{EPERM})) {
        $running = 1;
    }

    if ($running) {
        fatal("Lockfile $pidfile exists, and process $otherpid appears to be ".
              "running.\n");
    } else {
        notify("Lockfile exists, but process $otherpid appears to be dead.\n".
               "Removing lock file...\n");
    }
    unlink($pidfile) ||
      fatal("Couldn't remove $pidfile: $!\n");
}
Robert Ricci's avatar
Robert Ricci committed
148
# Background, unless we are debugging
unless ($debug) {
    # We use syslog, so redirect the output to nothing. A true return
    # from TBBackGround means this process is done (presumably it is the
    # parent side of the fork - confirm in libtestbed).
    exit(0)
        if (TBBackGround("/dev/null"));
}

# set up syslog
openlog("stated", "pid", "user");

# Record our pid. O_EXCL makes the create fail if the file has appeared
# since the stale-lock check.
sysopen(PIDFILE, $pidfile, O_WRONLY | O_EXCL | O_CREAT)
    or fatal("Couldn't create '$pidfile': $? $!\n");
print PIDFILE $$;
close PIDFILE;

# If I make it to here, I'll need to clean up the lock file
my $lockfile = $pidfile;

# Change my $0 so that it is easier to see in a ps/top
$0 = "$0";

Robert Ricci's avatar
Robert Ricci committed
167
# URL of the event server; the port part is only added when -p was given.
my $URL = "elvin://$server" . ($port ? ":$port" : "");

# Connect to the event system, and subscribe to the events we want
my $handle = event_register($URL,0);
fatal("Unable to register with event system\n")
    unless ($handle);

my $tuple = address_tuple_alloc();
fatal("Could not allocate an address tuple\n")
    unless ($tuple);

# We want node state, op mode, control and command events; the object
# types are passed as one comma-separated list.
%$tuple = ( objtype => join(",",
                            $TBNODESTATE, $TBNODEOPMODE,
                            $TBCONTROL, $TBCOMMAND) );

# handleEvent is called back for every matching notification.
event_subscribe($handle,\&handleEvent,$tuple)
    or fatal("Could not subscribe to events\n");

# Read in the pre-existing node states, and timeout and valid transition
# information from the database. readStates() goes last because it calls
# setTimeout() for each node it loads.
my %timeouts  = getTimeouts();
my %valid     = getValid();
my %modeTrans = getModeTrans();
my %triggers  = getTriggers();
my %nodes     = readStates();
# NOTE(review): the code visible in this file uses %timeout_tag (with an
# underscore), not this lexical - confirm which one is intended.
my %timeouttag= ();
qshow()
    if ($debug);

# Gets set if a reload of state from the database should happen.
my $do_reload  = 0;
# Set by the signal wrappers; acted on from the main loop.
my $sigrestart = 0;
my $sigcleanup = 0;

# Make the daemon reload database state on a sighup - but I'm worried
# about what would happen if we tried to do this mid-loop. So, we'll
# just set a flag and do it when we're done with our current pass.
$SIG{HUP} = sub {
    info("SIGHUP - Reloading DB state\n");
    $do_reload = 1;
};

# Set up other signals: USR1 asks for a restart, the rest ask for cleanup.
$SIG{USR1} = \&restart_wrap;
foreach my $signame (qw(USR2 INT QUIT ABRT TERM KILL)) {
    $SIG{$signame} = \&cleanup_wrap;
}

# Track if I handled an event or not
my $event_count  = 0;

# Control how long I block while waiting for events
my $blockwait    = 0;
my $nextdeadline = 0;
my $mailqueue    = 0;

notify("Stated starting up\n");

sub process_event_queue() {
    # Poll the event system, blocking for as long as it is safe to do so.
    #
    # The wait budget comes from, in order of priority:
    #  - queued mail: only wait until the next mail may be sent;
    #  - $blockwait: nothing is pending, so wait up to 600 seconds;
    #  - $nextdeadline: wait until the next timeout is due;
    #  - otherwise do not block at all.
    # Polling stops once the budget is used up and a poll handled no new
    # events, or as soon as a restart/cleanup/reload has been requested.
    $event_count = 0;
    my $prev_count = -1;
    my $now = time();
    my $wait;

    debug("Polling - mq=$mailqueue bw=$blockwait\n");
    if ($mailqueue != 0) {
        # mail is waiting. Only block until it is time to send it.
        $wait = $lastmail + $mailgap - $now;
        debug("Now $now, mailgap $mailgap, last $lastmail ==> wait $wait\n");
    } elsif ($blockwait) {
        # we can wait a long time - nothing else will happen
        # until we get an event, or get woken up by a signal
        $wait = 600;
    } elsif ($nextdeadline > 0) {
        # only wait until the next deadline...
        $wait = $nextdeadline - $now;
    } else {
        $wait = 0;
    }
    if ($wait < 0) {
        debug("Wait was $wait!\n");
        $wait = 0;
    }
    my $finish = $now + $wait;

    while (1) {
        last if ($sigrestart || $sigcleanup || $do_reload);
        last unless ($event_count != $prev_count || $wait > 0);

        $prev_count = $event_count;
        # Don't block if we got a signal!
        if ($wait <= 0 || $sigrestart || $sigcleanup || $do_reload) {
            event_poll($handle);
            next;
        }

        debug("Using blocking event poll - $wait seconds\n");
        # timeout param is in milliseconds, so multiply
        event_poll_blocking($handle, $wait*1000);
        # subtract seconds elapsed from my wait time
        $now  = time();
        $wait = $finish - $now;
        debug("Finished blocking event poll - $wait seconds remain\n");

        # If an event came in and there is now something to act on
        # (timeouts, mail or a signal flag), stop waiting.
        my $pending = (qsize() > 0 || $mailqueue ||
                       $sigrestart || $sigcleanup || $do_reload);
        if ($event_count > 0 && $pending) {
            $blockwait = 0;
            $wait = 0;
        }
    }

    debug("Handled $event_count event(s).\n")
        if ($event_count > 0);
}
Robert Ricci's avatar
Robert Ricci committed
286

287
# Now, we just poll for events, and watch for timeouts. Each pass:
#  1. fires every timeout whose deadline has passed,
#  2. records the next deadline and whether an open-ended wait is okay,
#  3. reloads DB state if asked to, or if the cache is too old,
#  4. flushes queued mail, honours restart/cleanup requests,
#  5. polls the event system.
while (1) {
    my $now = time();
    my ($deadline,$node);

    # Check for nodes that have passed their timeout. qhead/qpop are
    # treated as returning false on success, as the tests below imply.
    if (!qhead($deadline,$node)) {
        info("HEAD: $node in ".($deadline-$now).", queue=".qsize()."\n");
        while ($now >= $deadline && $node ne "") {
            qpop($deadline,$node);
            info("POP: $node in ".($deadline-$now).", queue=".qsize()."\n");
            handleCtrlEvent($node,$TBTIMEOUT);
            if (qhead($deadline,$node)) {
                # Queue is empty now; stop popping
                $deadline=0; $node="";
            }
        }
    } else {
        $deadline=0;
    }
    $nextdeadline = $deadline;

    # Only allow an open-ended blocking wait while the timeout queue is
    # empty. The flag is cleared again as soon as something is queued, so
    # that a stale value can never make us sleep past a pending deadline.
    if (qsize()==0) {
        $blockwait=1;
        debug("---Blocking wait okay---\n");
    } else {
        $blockwait=0;
    }

    if ($do_reload || ($now - $last_reload > $reload_time)) {
        reload();
        $do_reload = 0;
    }

    # Send any messages in the queue if it is time
    notify("",1);

    if ($sigrestart) { restart(); }
    if ($sigcleanup) { cleanup(); }

    process_event_queue();
}

exit(0);

Robert Ricci's avatar
Robert Ricci committed
330
# Read the current states of nodes from the database
331
sub readStates(;@) {
    # Load every node's event state and op mode from the nodes table
    # (skipping node ids that start with 'sh') and return them as a hash:
    #   node_id => { state, timestamp, mode, mode_timestamp }
    #
    # Optional arguments: the previous node hash, flattened. An entry whose
    # state, mode and state timestamp are unchanged in the database is
    # carried over as-is; any other node gets a fresh entry and has its
    # timeout set up again through setTimeout().
    #
    # An empty argument list simply yields an empty hash here, so no
    # defined() test is needed (defined(%hash) is a fatal error on newer
    # perls).
    my %oldnodes = @_;

    #debug("readStates called\n");
    my $result = DBQueryFatal("SELECT node_id, eventstate, " .
                              "state_timestamp, op_mode, " .
                              "op_mode_timestamp FROM nodes ".
                              "where node_id not like 'sh%'");

    my %nodes;
    while (my ($node_id, $state, $timestamp, $mode, $mode_timestamp)
           = $result->fetchrow()) {
        #
        # If there's an entry in oldnodes for this node, and it
        # hasn't changed state or time, use the old entry (so that
        # we don't lose information about which nodes we've already
        # notified the ops about, etc.)
        #
        if ($oldnodes{$node_id} && $state && $timestamp &&
            ($oldnodes{$node_id}{state} eq $state) &&
            ($oldnodes{$node_id}{mode} eq $mode) &&
            ($oldnodes{$node_id}{timestamp} == $timestamp)) {
            $nodes{$node_id} = $oldnodes{$node_id};
        } else {
            $nodes{$node_id}{state}          = $state;
            $nodes{$node_id}{timestamp}      = $timestamp;
            $nodes{$node_id}{mode}           = $mode;
            $nodes{$node_id}{mode_timestamp} = $mode_timestamp;
            # Is there a timeout? If so, set it up!
            setTimeout($mode,$state,$node_id,$timestamp);
        }
    }
    return %nodes;
}

#
# Read timeouts for various states from the database
#
sub getTimeouts() {
    # Returns a hash of the state_timeouts table, keyed by op mode and
    # then state; each value is a two-element array ref [timeout, action].
    #debug("getTimeouts called\n");
    my %timeouts;

    my $result = DBQueryFatal("SELECT op_mode, state, timeout, action " .
                              "FROM state_timeouts");
    while (my @row = $result->fetchrow()) {
        my ($op_mode, $state, $timeout, $action) = @row;
        $timeouts{$op_mode}{$state} = [ $timeout, $action ];
    }

    return %timeouts;
}

#
# Read the list of valid state transitions from the database
#
sub getValid() {
    # Returns a three-level hash of the state_transitions table:
    # $valid{mode}{from_state}{to_state} is 1 for every listed transition.
    #debug("getValid called\n");
    my %valid;

    my $result = DBQueryFatal("SELECT op_mode, state1, state2 " .
                              "FROM state_transitions");
    while (my @row = $result->fetchrow()) {
        my ($mode, $from, $to) = @row;
        $valid{$mode}{$from}{$to} = 1;
    }

    return %valid;
}

401 402 403 404
#
# Read the list of valid mode transitions from the database
#
sub getModeTrans() {
    # Returns a hash of the mode_transitions table. Keys are
    # "mode1:state1"; each value is an array ref of the "mode2:state2"
    # targets reachable from there, in the order the query returns them.
    #debug("getModeTrans called\n");
    my %modeTrans;

    my $result =
      DBQueryFatal("SELECT op_mode1, state1, op_mode2, state2 " .
                   "FROM mode_transitions order by op_mode1,state1");
    while (my @row = $result->fetchrow()) {
        my ($mode1, $state1, $mode2, $state2) = @row;
        # The list for a source is created by the first push onto it
        push(@{ $modeTrans{"$mode1:$state1"} }, "$mode2:$state2");
    }

    return %modeTrans;
}

#
# Read the list of states which trigger an action
#
sub getTriggers() {
    # Returns a hash of the state_triggers table, holding two kinds of
    # entries:
    #   "mode:state"      => the raw trigger string, for rows whose node_id
    #                        equals $TBANYMODE (the global triggers)
    #   "node:mode:state" => array ref of the comma-separated triggers,
    #                        for every other row (per-node triggers)
    debug("getTriggers called\n");

    debug("anymode ==> '$TBANYMODE'\n");

    my %t;

    # Grab global triggers
    my $global =
      DBQueryFatal("SELECT op_mode, state, trigger " .
                   "FROM state_triggers where node_id='$TBANYMODE' ".
                   "order by op_mode,state");
    while (my @row = $global->fetchrow()) {
        my ($mode, $state, $trig) = @row;
        $t{"$mode:$state"} = $trig;
        debug("trig($mode:$state)\t => $trig\n");
    }

    # Grab per-node triggers
    my $pernode =
      DBQueryFatal("SELECT node_id, op_mode, state, trigger " .
                   "FROM state_triggers where node_id!='$TBANYMODE' ".
                   "order by op_mode,state");
    while (my @row = $pernode->fetchrow()) {
        my ($n, $mode, $state, $trig) = @row;
        my @trigs = split(/\s*,\s*/,$trig);
        $t{"$n:$mode:$state"} = \@trigs;
        debug("trig($n:$mode:$state)\t => ".join(',',@trigs)."\n");
    }

    return %t;
}

Robert Ricci's avatar
Robert Ricci committed
456 457 458 459
#
# Gets called for every event that we receive
#
sub handleEvent($$$) {
    # Event system callback. Pulls the object type, object name and event
    # type out of the notification, counts the event, and hands it to the
    # handler for its object type. $data is accepted but not used.
    my ($handle,$notification,$data) = @_;
    my $objtype   = event_notification_get_objtype($handle,$notification);
    my $objname   = event_notification_get_objname($handle,$notification);
    my $eventtype = event_notification_get_eventtype($handle,$notification);

    $event_count++;
    debug("Got an event: ($objtype,$objname,$eventtype)\n");

    #
    # Check to see if another instance is supposed to be handling this node.
    # Command events are exempt from the check.
    #
    if ($objtype ne $TBCOMMAND && !checkDBRedirect($objname)) {
        info("Got an event for node $objname, which isn't mine\n");
        return;
    }

    # Dispatch on the object type; the first matching pattern wins.
    if ($objtype =~ /$TBNODESTATE/) {
        stateTransition($objname,$eventtype);
    }
    elsif ($objtype =~ /$TBNODEOPMODE/) {
        opModeTransition($objname,$eventtype);
        notify("Use of deprecated event TBNODEOPMODE:\n".
               "$objname->$eventtype\n");
    }
    elsif ($objtype =~ /$TBCONTROL/) {
        handleCtrlEvent($objname,$eventtype);
    }
    elsif ($objtype =~ /$TBCOMMAND/) {
        handleCommand($objname,$eventtype);
    }

    return;
}

sub stateTransition($$) {
    # Handle a node state event: record the new state for $node in the
    # cache and the database, then run everything that hangs off a state
    # change - pending command bookkeeping, the state timeout, the checks
    # done on entering BOOTING, state triggers, and op mode transitions.
    #
    # Arguments: node id, new state name.
    #
    # Invalid transitions are only reported through notify(); the new
    # state is still recorded.
    my ($node,$newstate) = @_;

    # Find the node's current state and mode. Try reloading the cache
    # once before we give up on this node.
    reload()
        if (!$nodes{$node});
    my ($oldstate, $mode);
    if ($nodes{$node}) {
        $oldstate = $nodes{$node}{state};
        $mode     = $nodes{$node}{mode};
    } else {
        # NOTE(review): processing continues with $mode and $oldstate
        # undefined for an unknown node - confirm this is intended.
        notify("Got an event for a node ($node) I don't know about\n");
    }

    # Check for invalid transitions
    if ($oldstate && $mode && $valid{$mode} && $valid{$mode}{$oldstate} &&
        !$valid{$mode}{$oldstate}{$newstate}) {
        notify("Invalid transition for node $node from $mode/$oldstate " .
               "to $newstate\n");
    }

    my $now = time();
    $nodes{$node}{state}     = $newstate;
    $nodes{$node}{timestamp} = $now;
    $nodes{$node}{notified}  = 0;

    info("$node: $mode/$oldstate => $mode/$newstate\n");
    DBQueryFatal("UPDATE nodes SET eventstate='$newstate', " .
                 "state_timestamp='$now' WHERE node_id='$node'");

    # Before we set the timeout (overwriting any current ones), we need
    # to check if we had a pending command. The tag has the form
    # "<TBCOMMAND>:<command>".
    # NOTE(review): %timeout_tag is not declared in the visible part of
    # this file (only %timeouttag is) - confirm where it is maintained.
    my $tag = $timeout_tag{$node};
    if (qfind($node) && defined($tag) && $tag =~ /^$TBCOMMAND:/) {
        my (undef,$cmd) = split(":",$tag);
        if ($cmd eq $TBREBOOT) {
            # Compare against the state that just arrived
            if ($newstate eq TBDB_NODESTATE_SHUTDOWN ) {
                info("$node: $TBREBOOT success\n");
                # Timeout will get cleared below by setTimeout call
            } else {
                notify("$node: $TBREBOOT in progress, but got state ".
                       "$newstate instead of ".
                       TBDB_NODESTATE_SHUTDOWN ."!\n");
            }
        #} elsif ($cmd eq $FOO ) {
            # Add more here...
        } else {
            notify("$node: Unknown command timeout '$tag' ".
                   "found at $mode/$newstate\n");
        }
    }

    # Check if this state has a timeout, and if so, put it in the queue
    setTimeout($mode,$newstate,$node,$now);

    # Check if this is TBDB_NODESTATE_BOOTING , which has actions
    if ($newstate eq TBDB_NODESTATE_BOOTING) {
        # If I skipped shutdown, and came to booting directly from isup,
        # check for a mode transition so I don't miss one...
        if ($oldstate eq TBDB_NODESTATE_ISUP) {
            info("$node: Came from ISUP! Checking for mode transition\n");
            my $r = DBQueryWarn("select next_op_mode from nodes ".
                                "where node_id='$node'");
            # Guard against a failed query (assumes DBQueryWarn returns
            # a false value on failure - TODO confirm in libdb).
            my ($nextmode) = $r ? $r->fetchrow() : ();
            if ($nextmode) {
                # Force the transition even though it is illegal
                info("$node: Forcing mode transition!\n");
                opModeTransition($node,$nextmode,1);
                $mode=$nextmode;
            } else {
                debug("No next mode.\n");
            }
        }

        # Check if I'm in the right mode
        my $osid = TBBootWhat($node,$debug);
        my $os_op_mode = os_opmode($osid);
        info("$node: Current OS is '$osid', OS mode is '$os_op_mode'\n");
        DBQueryFatal("UPDATE nodes SET osid='$osid' WHERE node_id='$node'");
        if ($os_op_mode ne $mode) {
            my $str = "Node $node is running OS '$osid' but in mode '$mode' ".
              "instead of mode '$os_op_mode'!\n";
            # For now, only force if we're going into reload mode, so we
            # don't get stuck looping in reloading.
            if ($os_op_mode eq "RELOAD") {
                DBQueryFatal("UPDATE nodes SET op_mode='$os_op_mode', ".
                             "op_mode_timestamp=unix_timestamp(now()) ".
                             "WHERE node_id='$node'");
                $nodes{$node}{mode} = $os_op_mode;
                $nodes{$node}{mode_timestamp} = $now;
                $str .= "Forced op_mode to '$os_op_mode'.\n";
            }
            notify($str);
        }
        checkGenISUP($node);
    }

    # Check if this state has any triggers
    my @nodetrigs   = GetNodeTriggerList($node,$mode,$newstate);
    my $globaltrigs = $triggers{"$mode:$newstate"};
    if (defined($globaltrigs) || (@nodetrigs > 0) ) {
        # check for global triggers; there may be none when only
        # per-node triggers brought us here
        my @trigs = defined($globaltrigs)
            ? split(/\s*,\s*/,$globaltrigs) : ();
        # Run all the triggers
        debug("Running triggers. Global=".join("/",@trigs).
              "   node=".join("/",@nodetrigs)."\n");
        foreach ( @trigs , @nodetrigs) {
            my $trig = $_;
            /^$TBRESET$/ && do {
                # Check if we really need to do a reset
                my $r = DBQueryWarn("select osid,def_boot_osid from nodes ".
                                    "where node_id='$node'");
                # Skip the reset if the query failed (same assumption
                # about DBQueryWarn as above)
                if ($r) {
                    my ($osid,$defosid) = $r->fetchrow();
                    if ($osid ne $defosid) {
                        handleCtrlEvent($node,$trig);
                    }
                }
                next;
            };
            /^$TBRELOADDONE$/ && do {
                handleCtrlEvent($node,$trig);
                next;
            };
            /^$TBFREENODE$/ && do {
                handleCtrlEvent($node,$trig);
                next;
            };
            # NOTE(review): $TBISUP is not assigned in the visible part of
            # this file - confirm that it is defined.
            /^$TBISUP$/ && do {
                info("$node: Triggered $TBISUP\n");
                EventSendWarn(host      => $BOSSNODE ,
                              objtype   => TBDB_TBEVENT_NODESTATE ,
                              eventtype => TBDB_NODESTATE_ISUP ,
                              objname   => $node);
                next;
            };
            notify("Unknown trigger '$trig' for $node in $mode/$newstate!\n");
        }
        # Clear any of the node triggers that we ran
        debug("Clearing node triggers: ".join("/",@nodetrigs)."\n");
        ClearNodeTrigger($node,$mode,$newstate,@nodetrigs);
    }

    # Check if this state can trigger a mode transition
    if (defined($modeTrans{"$mode:$newstate"})) {
        info("$node: Checking for mode transition\n");
        my $r = DBQueryWarn("select next_op_mode from nodes ".
                            "where node_id='$node'");
        my ($nextmode) = $r ? $r->fetchrow() : ();
        if ($nextmode) {
            opModeTransition($node,$nextmode);
        } else {
            debug("No next mode.\n");
        }
    }
}
658

659
sub opModeTransition($$;$) {
    # Move $node into op mode $newmode, updating the cache, the database
    # and the timeout queue.
    #
    # Arguments: node id, new op mode, and an optional force flag
    # (default 0). Unless forced, the node's current mode/state must be a
    # known mode transition point that lists $newmode; the listed state
    # becomes the node's state in the new mode. Violations are reported
    # through notify(), but the mode change is still carried out, with the
    # node keeping its old state.
    my ($node,$newmode,$force) = @_;
    $force = 0
        unless (defined($force));

    info("$node: Mode change to $newmode requested\n");

    # Find the node's current state and mode. Try reloading the cache
    # once before we give up on this node.
    reload()
        unless ($nodes{$node});
    my ($oldstate, $mode, $nextstate);
    if ($nodes{$node}) {
        $oldstate = $nodes{$node}{state};
        $mode     = $nodes{$node}{mode};
    } else {
        notify("Got an event for a node ($node) I don't know about\n");
    }

    # Check for invalid transitions
    my $from      = "$mode:$oldstate";
    my $has_trans = defined($modeTrans{$from});
    if ($force) {
        # Forced: no validation; the state carries over (see below)
    }
    elsif ($has_trans) {
        debug("Mode Transition check:\n");
        # Each entry is "mode:state"; flatten the list into mode => state
        my $translist = join(",",@{$modeTrans{$from}});
        #debug("translist=$translist\n");
        #debug("splitlist=".join(", ",split(/[:,]/,$translist))."\n");
        my %trans = split(/[:,]/,$translist);
        debug("Valid transitions from $mode/$oldstate are:\n");
        foreach my $k (sort keys %trans) {
            debug("$k => $trans{$k}\n");
        }
        if (defined($trans{$newmode})) {
            $nextstate = $trans{$newmode};
        } else {
            notify("Invalid mode transition for $node from ".
                   "$mode/$oldstate to $newmode!\n");
        }
    }
    else {
        notify("Invalid mode transition for $node from $mode/$oldstate: ".
               "Not a valid mode transition state!\n");
    }

    # Without a target state, stay in the current one
    $nextstate = $oldstate
        if (!$nextstate);

    my $now = time();
    my $entry = ($nodes{$node} ||= {});
    $entry->{state}          = $nextstate;
    $entry->{timestamp}      = $now;
    $entry->{mode}           = $newmode;
    $entry->{mode_timestamp} = $now;
    $entry->{notified}       = 0;

    info("$node: $mode/$oldstate => $newmode/$nextstate\n");
    DBQueryFatal("UPDATE nodes SET eventstate='$nextstate', ".
                 "next_op_mode='', op_mode='$newmode', ".
                 "state_timestamp='$now', ".
                 "op_mode_timestamp='$now' WHERE node_id='$node'");

    # Check if this state has a timeout, and if so, put it in the queue
    setTimeout($newmode,$nextstate,$node,$now);

}

sub handleCtrlEvent($$) {
    # Carry out a control event for a node.
    #
    # Arguments: node id, control event name. Events handled:
    #   $TBRESET      - select the node's default OS again through
    #                   os_select, then put the PXE boot path back
    #   $TBRELOADDONE - clear the node's reload bookkeeping and, if it is
    #                   held in the reloading experiment, add a trigger
    #                   that frees it at ISUP
    #   $TBFREENODE   - delete the node's row from the reserved table
    #   $TBTIMEOUT    - the node's state (or pending command) timed out
    # Anything else is reported through notify().
    my ($node,$event) = @_;

    info("CtrlEvent: $node, $event\n");

    foreach ($event) {
        /^$TBRESET$/ && do {
            my $result = DBQueryFatal("SELECT pxe_boot_path, def_boot_osid ".
                                      "FROM nodes where node_id='$node'");
            my ($pxepath,$osid) = $result->fetchrow();

            # Important note on ordering here:
            # Because setting a normal osid resets pxe path to PXEBOOT,
            # We need to read it out first, then set the osid, then set
            # the pxepath back to its original value at the end.

            my $oscmd = "$osselect $osid $node";
            system($oscmd) and
              notify("$node/$event: Couldn't clear next_boot_*\n".
                     "\tcmd=$oscmd\n\t*** $!\n");

            # With no pxe path recorded, select the default PXEBOOT
            # instead of passing an empty -p argument.
            $pxepath = ""
                unless (defined($pxepath));
            $pxepath = "-p ".$pxepath;
            if ($pxepath eq "-p ") {
                $pxepath="PXEBOOT";
            }
            my $pxecmd = "$osselect -m $pxepath $node";
            system($pxecmd) and
              notify("$node/$event: Couldn't clear next_pxe_boot_path\n".
                     "\tcmd=$pxecmd\n\t*** $!\n");

            info("Performed RESET for $node to $osid/$pxepath\n");
            next;
        };
        /^$TBRELOADDONE$/ && do {
            info("Clearing reload info for $node\n");
            DBQueryFatal("delete from current_reloads where node_id='$node'");
            my ($pid,$eid);
            NodeidToExp($node,\$pid,\$eid);
            if (($pid eq NODERELOADING_PID) && ($eid eq NODERELOADING_EID)) {
                DBQueryFatal("delete from scheduled_reloads ".
                             "where node_id='$node'");
                # A true return from AddNodeTrigger is treated as failure
                AddNodeTrigger($node, $TBANYMODE, TBDB_NODESTATE_ISUP,
                               $TBFREENODE)
                  && notify("$node: Couldn't add trigger $TBFREENODE!\n");
                info("Set up freeing of $node from $pid/$eid\n");
            }
            next;
        };
        /^$TBFREENODE$/ && do {
            # Don't need pid/eid, but we should put it in the log
            my ($pid,$eid);
            NodeidToExp($node,\$pid,\$eid);
            DBQueryFatal("delete from reserved where node_id='$node'");
            info("Released $node from $pid/$eid\n");
            next;
        };
        /^$TBTIMEOUT$/ && do {
            # The tag records what the timeout was for: "mode:state" for
            # a state timeout, or "<TBCOMMAND>:<command>" for a command.
            # NOTE(review): %timeout_tag is not declared in the visible
            # part of this file - confirm where it is maintained.
            my $tag = delete($timeout_tag{$node});
            my ($mode,$state) = defined($tag) ? split(":",$tag) : ();
            my $now = time();

            # Count how many times this node has timed out since its
            # last state change.
            if (!defined($nodes{$node}{notified})) {
                $nodes{$node}{notified}=0;
            }
            $nodes{$node}{notified}++;
            my $notified = $nodes{$node}{notified};

            my ($timeout,$action);
            if ($mode && $state && $timeouts{$mode} &&
                $timeouts{$mode}{$state}) {
                ($timeout, $action) = @{$timeouts{$mode}{$state}};
            }
            $action = ""
                unless (defined($action));

            if (defined($mode) && $mode eq $TBCOMMAND) {
                # It is a command, not a true state
                if ($action eq "CMDRETRY") {
                    # Retry the command
                    notify("$node: Command $state, retry #$notified\n");
                    # notify in case we get in a retry loop...
                    handleCommand($node,$state,$notified);
                } else {
                    notify("$node: Unknown timeout action for ".
                           "$mode/$state: '$action'\n");
                }
                next;
            } else {
                if ($notified>1) {
                    notify("$node: Timed out at $now, ".
                           "but notified already!\n");
                }
            }
            notify("Node $node has timed out in state $mode/$state".
                   ($action ne "" ? "\n\tRequested action $action." : "").
                   "\n");
            next;
        };
        notify("$node: Unknown CtrlEvent: $event\n");
    }
}
Robert Ricci's avatar
Robert Ricci committed
822

823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892
#
# Carry out a TBCOMMAND request (reboot or power on/off/cycle).
#
#   $params  - comma-separated list of node names the command applies to
#   $command - one of $TBREBOOT, $TBPOWEROFF, $TBPOWERON, $TBPOWERCYCLE
#   $retry   - optional attempt counter; defaults to 0
#
# A reboot that has already been attempted 3 or more times is announced
# and converted into a power off.
#
sub handleCommand($$;$) {
    my ($params,$command,$retry) = @_;
    if (!defined($retry)) { $retry=0; }

    info("Command: $params, $command (attempt $retry)\n");

    # XXX - Right now we skip the checkDBRedirect calls for our
    # TBCOMMAND events, since they may have a list of nodes in them.
    # We may need to do it here (while iterating over the list), or
    # make some other fix up in handleEvent.

    if ($command eq $TBREBOOT && $retry >=3) {
	announce("Node $params has tried rebooting $retry times and has \n".
		 "still not been successful. Please look into it soon.\n".
		 "In the meantime, $params will be powered off.\n");
	# change my command to poweroff.
	$command = $TBPOWEROFF;
    }

    # Single-pass foreach used as a switch: each case ends with "next",
    # so the "Unknown Command" notify at the bottom is only reached when
    # no pattern matched.
    foreach ($command) {
	/^$TBREBOOT$/ && do {
	    # For reboot, the params is a comma-separated list of nodes
	    my @nodes = split(",",$params);
	    my $nodelist=join(" ",@nodes);
	    info("Rebooting nodes: $nodelist\n");
	    # Permissions were checked in order to send the message,
	    # so we don't need to do any fancy stuff here.

	    # NOTE(review): the system() call below is commented out, so the
	    # command is only passed to debug() and never executed. The
	    # notify() then fires whenever debug() returns false, which
	    # appears to be the case whenever $debug is off - confirm
	    # whether system() was meant to be re-enabled.
	    my $cmd = "$nodereboot $nodelist &";
	    debug("$cmd\n") or
	    #system($cmd) and
	      notify("$params/$command: ".
		     "Command '$cmd' failed, error $?: $!\n");

	    # Set up a timeout, so we retry if we don't get SHUTDOWN in time
	    # ($node is not declared with "my" here, so the loop uses the
	    # package variable of that name.)
	    foreach $node (@nodes) {
		# Note: This will replace any state timeouts currently in
		# the queue. But here that's okay because we're expecting
		# to see another transition really soon anyway.
		setTimeout($TBCOMMAND,$command,$node,time());
	    }

	    info("Performed $command for $params\n");
	    next;
	};
	(/^$TBPOWEROFF$/ || /^$TBPOWERON$/ || /^$TBPOWERCYCLE$/) && do {
	    # For power, the params is a comma-separated list of nodes
	    my @nodes = split(",",$params);
	    my $nodelist=join(" ",@nodes);
	    # Translate the command into the verb passed to the power program
	    my %funcmap = ( $TBPOWERCYCLE => "cycle",
			    $TBPOWERON    => "on",
			    $TBPOWEROFF   => "off");
	    my $func = $funcmap{$command};
	    info("Sending power $func nodes: $nodelist\n");
	    # Permissions were checked in order to send the message,
	    # so we don't need to do any fancy stuff here.

	    # NOTE(review): as in the reboot case, system() is commented out
	    # here, so the power command is not actually run - confirm.
	    my $cmd = "$power $func $nodelist &";
	    debug("$cmd\n") or
	    #system($cmd) and
	      notify("$params/$command: ".
		     "Command '$cmd' failed, error $?: $!\n");

	    info("Performed $command for $params\n");
	    next;
	};
	notify("$params: Unknown Command: $command\n");
    }
}

893 894 895 896
#
# Check if we need to generate an ISUP
#
#
# Decide how the ISUP event for $node will be produced, based on the
# "osfeatures" of the OS the node is set to run:
#   - "isup" feature: the OS reports ISUP itself, nothing to do (returns 0)
#   - "ping" feature: eventping is started in the background (returns 0)
#   - neither:        stated sends ISUP right away and the result of
#                     EventSendWarn() is returned.
#
sub checkGenISUP($) {
    my ($node) = @_;
    debug("$node: Checking ISUP Generation\n");

    my $r = DBQueryWarn("select osfeatures from nodes as n ".
			"left join os_info as o on o.osid=n.osid ".
			"where node_id='$node' and osfeatures is not null");

    # If we don't get anything back, assume it has no features. A failed
    # query is treated the same way; DBQueryWarn presumably yields a false
    # value in that case, and calling a method on it would kill the daemon.
    my $osfeatures = "";
    if ($r && $r->num_rows() > 0) {
	($osfeatures) = $r->fetchrow();
    }

    # Make sure the features I care about are defined, then switch on the
    # ones the OS advertises (feature names are compared in lowercase).
    my %can = ("ping" => 0, "isup" => 0);
    foreach my $f (split(",", $osfeatures)) {
	$can{lc($f)} = 1;
    }

    # If os will send ISUP on its own, do nothing here.
    if ($can{"isup"}) {
	debug("$node: Will send own ISUP\n");
	return 0;
    }

    # If os doesn't support isup but can ping, fork and ping it every
    # few seconds and send isup when it pings, or timeout after too long.
    if ($can{"ping"}) {
	debug("$node: Needs to be pinged - calling eventping\n");
	system("$TB/sbin/eventping $node &");
	return 0;
    }

    # If os doesn't support ping or isup, stated sends ISUP just after
    # the node gets to BOOTING (a bit early, but the best we can do)
    debug("$node: OS doesn't ping - sending ISUP\n");
    return EventSendWarn(host      => $BOSSNODE ,
			 objtype   => TBDB_TBEVENT_NODESTATE ,
			 eventtype => TBDB_NODESTATE_ISUP ,
			 objname   => $node);
}

939 940 941 942
# Figure out if this node belongs to us (ie. if it's using our database.)
# Returns 1 if it does, 0 if not
sub checkDBRedirect($) {
    my $node = shift;

    # XXX: Doing this lookup on every call is not great for performance,
    # but it is what keeps us from getting into an inconsistent state.
    my $query =
	"SELECT testdb FROM nodes as n " .
	"LEFT JOIN reserved as r ON n.node_id=r.node_id ".
	"LEFT JOIN experiments as e ON r.pid = e.pid " .
	"AND r.eid = e.eid " .
	"WHERE n.node_id = '$node'";
    my $result = DBQueryFatal($query);

    unless ($result->num_rows()) {
	notify("Got an event for a node ($node) I don't know about\n");
	return 0;
    }

    my ($testdb) = $result->fetchrow();

    # XXX: It's hokey to hardcode tbdb here, but....
    #debug("checkDBRedirect: $node => $testdb (I'm $TBDBNAME)\n");

    # A node with a testdb set belongs to whoever runs on that database;
    # a node without one belongs to the stated using the real database.
    if ($testdb) {
	return ($testdb eq $TBDBNAME) ? 1 : 0;
    }
    return ($TBDBNAME eq $REALTBDBNAME) ? 1 : 0;
}

972 973 974 975
# Check if this state has a timeout, and if so, put it in the queue
sub setTimeout( $$$$ ) {
    my ($mode,$state,$node,$now) = @_;
    if (0) { print "Original: ($mode,$state,$node,$now)\n"; qshow(); }

    # Whatever was queued for this node before is superseded.
    if (defined(qfind($node))) {
	qdelete($node);
	delete($timeout_tag{$node});
    }
    if (0) { print "Deleted:\n"; qshow(); }

    # Find the timeout entry for this mode/state, if there is one. The
    # first element of the entry is the deadline, relative to $now.
    my $entry;
    if (defined($mode) && defined($state) && defined($timeouts{$mode})) {
	$entry = $timeouts{$mode}{$state};
    }
    my $deadline = defined($entry) ? $entry->[0] : undef;

    if (defined($deadline) && $deadline != $TBNOTIMEOUT) {
	my $expires = $deadline + $now;
	debug("Setting timeout for ($node,$mode,$state) at ".
	      "$deadline + $now ($expires)\n");
	qinsert($expires,$node);
	# Remember which mode/state this queue entry was set for
	$timeout_tag{$node} = "$mode:$state";
	if (0) { qshow(); }
    }
    if (0) { print "Done:\n"; qshow(); }
}

Robert Ricci's avatar
Robert Ricci committed
995 996
# Reload state from the database
sub reload() {
    debug("Reloading state from database\n");

    # Note when we did this, then refresh each of the cached tables.
    $last_reload = time;

    %timeouts    = getTimeouts();
    %valid       = getValid();
    %modeTrans   = getModeTrans();
    %triggers    = getTriggers();

    # The current node table is handed to readStates and replaced by
    # whatever it returns.
    %nodes       = readStates(%nodes);
}

1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085
#
# Some functions for node triggers
#

# $rv   = AddNodeTrigger($node, $mode, $state, @triglist);
sub AddNodeTrigger( $$$@ ) {
    my ($node, $mode, $state, @trigs) = @_;

    # Nothing to add; callers treat a non-zero return as failure.
    return 1
	if (!@trigs);

    my $key = "$node:$mode:$state";
    if (defined($triggers{$key})) {
	# Append only the triggers that are not already on the list
	my %have = map { ($_ => 1) } @{$triggers{$key}};
	push(@{$triggers{$key}}, grep { !defined($have{$_}) } @trigs);
    }
    else {
	$triggers{$key} = \@trigs;
    }

    # Write the resulting list back to the database as one row
    my $triglist = join(",", @{$triggers{$key}});
    DBQueryFatal("replace into state_triggers ".
		 "(node_id,op_mode,state,trigger) values ".
		 "('$node','$mode','$state','$triglist')");
    return 0;
}

# @list = GetNodeTriggerList($node, $mode, $state);
sub GetNodeTriggerList( $$$ ) {
    my ($node, $mode, $state) = @_;
    my @found = ();

    # Triggers for the specific mode come first, followed by the ones
    # registered for any mode.
    foreach my $m ($mode, $TBANYMODE) {
	my $list = $triggers{"$node:$m:$state"};
	if (defined($list)) {
	    push(@found, @{$list});
	}
    }
    return @found;
}

# $rv   = ClearNodeTrigger($node, $mode, $state, @triglist);
#
# Remove the given triggers for $node in $mode/$state. An empty @triglist
# means "clear everything". Whenever nothing is left afterwards, the
# entries for both $mode and $TBANYMODE are dropped from memory and from
# the state_triggers table. Otherwise the surviving triggers are written
# back under $mode, in their original order. Always returns 0.
sub ClearNodeTrigger( $$$ ; @ ) {
    my ($node, $mode, $state, @trigs) = @_;

    # We have to keep any triggers that aren't on the list, but the
    # most common case will be that the list they give us is the whole
    # list anyway.
    my @reallist = GetNodeTriggerList($node,$mode,$state);

    # Subtract @trigs from @reallist, dropping duplicates on the way.
    my %doomed = map { ($_ => 1) } @trigs;
    my %seen   = ();
    my @newtrigs = ();
    foreach my $t (@reallist) {
	next
	    if ($doomed{$t} || $seen{$t}++);
	push(@newtrigs, $t);
    }

    if ((@trigs == 0) || (@newtrigs == 0)) {
	# Nothing survives (or we were asked to clear all): nuke the
	# entries, including the database rows. This also covers the case
	# where @trigs is a superset of what is stored, which used to
	# leave a stale row behind in state_triggers.
	debug("Clearing all triggers for $node...\n");
	delete($triggers{"$node:$mode:$state"});
	delete($triggers{"$node:$TBANYMODE:$state"});
	DBQueryFatal("delete from state_triggers ".
		     "where node_id='$node' and state='$state' and ".
		     "(op_mode='$mode' or op_mode='$TBANYMODE')");
    } else {
	debug("Reallist = ".join("/",@reallist).", trigs=".
	      join("/",@trigs).".\n");
	my %current = map { ($_ => 1) } @reallist;
	foreach my $t (@trigs) {
	    if (defined($current{$t})) {
		debug("Clearing $t\n");
	    }
	}
	# Note: This doesn't quite do the right thing with triggers
	# for a fixed mode vs TBANYMODE: survivors that came from the
	# TBANYMODE entry are re-added under $mode, and the TBANYMODE
	# entry itself is left alone. So if you start using this
	# code, make sure and debug it first!
	debug("Newlist = ".join("/",@newtrigs).".\n");
	delete($triggers{"$node:$mode:$state"});
	AddNodeTrigger($node,$mode,$state,@newtrigs);
    }

    return 0;
}

1086 1087
#
# Return the op_mode recorded in os_info for the given osid, or "" when
# the osid is unknown or has no op_mode. The MB kernel osid is special
# cased and always maps to "MINIMAL".
#
# The prototype is (;$) rather than (): the function reads its osid from
# the argument list, and an empty prototype makes any call with an
# argument that is compiled after this definition a compile-time error.
#
sub os_opmode(;$) {
    my $osid = shift || "";

    if ($osid eq $TB_OSID_MBKERNEL) {
	return "MINIMAL";
    }

    my $q = DBQueryFatal("select op_mode from os_info where osid='$osid';");
    if ($q->numrows() < 1) {
	return "";
    }

    # A NULL op_mode comes back undefined; normalize it before it is
    # interpolated into the debug message.
    my ($opmode) = $q->fetchrow_array();
    if (!defined($opmode)) {
	$opmode = "";
    }
    debug("OpMode for '$osid' is '$opmode'\n");
    return $opmode;
}
Mac Newbold's avatar
Mac Newbold committed
1104

1105 1106 1107 1108
#
# Functions for controlling output/logging, and signal handling
#

Robert Ricci's avatar
Robert Ricci committed
1109
sub debug(@) {
    # Only chatter when debugging is turned on. Callers such as
    # "debug(...) or notify(...)" look at the result, which is whatever
    # print returns when $debug is set and the (false) value of $debug
    # otherwise.
    print @_
	if ($debug);
}

sub fatal($) {
    my ($msg) = @_;

    # Get the message into the log/mail queue before we go down with it.
    notify($msg);
    die($msg);
}

Mac Newbold's avatar
Mac Newbold committed
1121
sub showqueue() {
    # Dumping the mail queue is only done at debug level 2 and up.
    return
	if ($debug < 2);

    if (scalar(keys(%msgs)) > 0) {
	debug("\nMAILQUEUE:\n");
    }

    # One entry per queued message: the number of timestamps recorded
    # for it, followed by the timestamps themselves.
    foreach my $msg (sort(keys(%msgs))) {
	my @stamps = @{$msgs{$msg}};
	my $count  = scalar(@stamps);
	debug("MSGS:\n$msg==> ($count,'" . join("','", @stamps) . "')\n");
    }
}

#
# Log $message and queue it for mail. Mail is throttled: queued messages
# are sent as one digest once at least $mailgap seconds have passed since
# the last mail. With a true second argument nothing is logged or queued;
# the call only checks whether it is time to flush the queue.
#
sub notify($;$) {
    my ($message, $checkonly) = @_;
    $checkonly ||= 0;

    # Use a timestamp, now that we're throttling mail
    my $tstamp = strftime("%b %e %H:%M:%S", localtime);
    showqueue();

    unless ($checkonly) {
	info($message);
	$mailqueue++;
	# Queue up the message. The queue is a hash of lists of
	# timestamps, keyed by message text.
	push(@{$msgs{$message}}, $tstamp);
	showqueue();
    }

    my $now = time;
    if ($now - $lastmail >= $mailgap) {
	if (scalar(keys(%msgs)) > 0) {
	    debug("SENDING MAILQUEUE\n"."(now $now, lastmail $lastmail, ".
		  ($now-$lastmail).">=$mailgap)\n");

	    # We're okay to send. Make a digest of all the queued
	    # messages, noting how often and when each one was seen.
	    my $sep      = '-' x 5;
	    my $mailbody = "";
	    foreach my $msg (sort(keys(%msgs))) {
		my @stamps = @{$msgs{$msg}};
		my $count  = scalar(@stamps);
		$mailbody .= "\n$msg\n";
		$mailbody .= ($count > 1)
		    ? "($count copies from $stamps[0] to $stamps[-1])\n"
		    : "($count copy at $stamps[0])\n";
		$mailbody .= "$sep\n";
	    }

	    # Now reset the mail queue
	    %msgs      = ();
	    $mailqueue = 0;
	    showqueue();
	    $lastmail  = time;

	    if ($debug) {
		debug("notify: Not sending mail in debug mode\n");
		debug("MAIL CONTAINS:\n".$mailbody."\n");
	    } else {
		SENDMAIL("Stated List <".$TBOPS.">",
			 "Stated Messsage",$mailbody,
			 "Stated Daemon <".$TBOPS.">");
	    }
	}
    } # else do nothing, not time yet
}

1190
#
# Send $message straight to $REALTBOPS, bypassing the mail throttling in
# notify(). The message is also passed through notify() so that it shows
# up in the log and in the regular digest. In debug mode the mail is
# printed via debug() instead of being sent.
#
sub announce($) {
    my $message = shift;
    my $tstamp  = strftime("%b %e %H:%M:%S",localtime);

    notify("ANNOUCEMENT: ".$message."\n\n(Sent to $REALTBOPS)\n");

    # Lexical on purpose: this used to be assigned without "my", which
    # created/overwrote a package-level $mailbody as a side effect.
    my $mailbody = "\n$message\n\n$tstamp\n";
    if (!$debug) {
	SENDMAIL($REALTBOPS,
		 "Stated Messsage",$mailbody,
		 "Stated Daemon <".$TBOPS.">");
    } else {
	debug("announce: Not sending mail in debug mode\n");
	debug("MAIL CONTAINS:\n".$mailbody."\n");
    }
}

1205
#
# Write $message to syslog, at priority "notice" if the optional second
# argument is true and "info" otherwise. In debug mode the message is
# also printed to stdout in syslog-like form and tagged "DEBUG: ".
#
sub info($;$) {
    my $message = shift;
    my $notice = shift || 0;

    # Use syslog
    my $prio = ($notice ? "notice" : "info");

    if ($debug) {
	# Print out log entries like this:
	# Sep 20 09:36:00 stated[238]: Reloading state from database
	print strftime("%b %e %H:%M:%S",localtime)." stated[$$]: $message";
	$message = "DEBUG: ".$message;
    }

    # The second argument of syslog() is a printf-style format, so hand
    # the message over as data; otherwise a '%' in it would be taken as
    # a conversion.
    if (!syslog($prio, "%s", $message) && !$info_syslog_failing) {
	# notify() calls info() again, which would fail the same way and
	# recurse without end. The flag (restored automatically by local)
	# makes sure the failure is reported only once per call chain.
	local $info_syslog_failing = 1;
	notify("syslog failed: $? $!\n");
    }
}

1222 1223
# Only sets the $sigrestart flag and returns.
# NOTE(review): presumably installed as the SIGUSR1 handler, with the flag
# checked later to call restart() - confirm against the main loop.
sub restart_wrap { $sigrestart=1; }

1224 1225
# This gets called if we catch a signal USR1
#
# Re-exec stated with the arguments saved in @args. Before doing so the
# event queue is polled one last time, we unregister from the event
# system, remove the lock file, unblock SIGUSR1/SIGHUP, flush any queued
# mail and announce the restart. Does not return; dies if the exec fails.
#
sub restart {
    my $params = join(" ",@args);

    # If we're started from an absolute path, use that.
    my $prog = ($0 =~ /^\//) ? $0 : "$TB/sbin/stated";

    info("SIGUSER1 received: Performing final event poll before restarting\n");
    $blockwait=0;
    process_event_queue();

    info("Restarting from '$prog".($params ne "" ? " $params" : "")."'\n");
    if ($handle && event_unregister($handle) == 0) {
	warn "Unable to unregister with event system\n";
    }
    if (defined($lockfile) && $lockfile ne "") {
	unlink $lockfile;
    }
    if (!defined(sigprocmask(SIG_UNBLOCK,
			     POSIX::SigSet->new(SIGUSR1,SIGHUP)))) {
	notify("sigprocmask: sig unblock failed! $?, $!\n");
	die("\n");
    }

    # Zeroing $lastmail makes the check-only notify() send whatever is
    # still sitting in the mail queue.
    $lastmail=0;
    notify("",1);
    announce("Stated restarted\n");

    # Run the program directly with the argument list instead of a
    # space-joined string, so the arguments are not re-parsed by a shell.
    # NOTE(review): assumes each element of @args is one original
    # command-line argument - confirm where @args is filled in.
    exec { $prog } $prog, @args
      or do {
	  my $msg = "Couldn't restart stated! cmd='$prog $params'\n".
	    "Error: ($?) $!\n";
	  announce($msg);
	  die($msg);
      };
}

1260 1261
# Only sets the $sigcleanup flag and returns.
# NOTE(review): presumably installed as the handler for TERM-style signals,
# with the flag checked later to call cleanup() - confirm.
sub cleanup_wrap { $sigcleanup=1; }

1262 1263
# This gets called if we catch a signal (TERM, etc.)
sub cleanup {
    notify("Signal received, exiting\n");

    # The rest of the normal exit work happens in END {}.
    exit 0;
}

Robert Ricci's avatar
Robert Ricci committed
1269 1270
# This gets called if we die of 'natural causes' (exit, die, etc.)
END {
1271
    debug("Ending stated...\n");
Mac Newbold's avatar
Mac Newbold committed
1272
    my $stat = $?;
1273 1274
    $lastmail=0;
    notify("",1);
Mac Newbold's avatar
Mac Newbold committed
1275 1276 1277 1278 1279 1280 1281
    if (defined($lockfile) && $lockfile ne "") {
	unlink $lockfile;
	announce("Stated exiting, cleaning up\n");
    } else {
	# Must be a child
	info("Stated child exiting\n");
    }
1282
    debug("Annouced. Cleaning up...\n");
Mac Newbold's avatar
Mac Newbold committed
1283 1284 1285
    # clean up Syslog
    closelog();
    if ($handle) {
1286
	debug("Unregistering w/event system...\n");
Mac Newbold's avatar
Mac Newbold committed
1287 1288 1289
	if (event_unregister($handle) == 0) {
	    die "Unable to unregister with event system\n";
	}
1290
	debug("Unregistered.\n");
Mac Newbold's avatar
Mac Newbold committed
1291
    }
1292
    debug("Cleaned up. Bye!\n");
Mac Newbold's avatar
Mac Newbold committed
1293 1294
    # Restore $? in case one of the things I called changed it
    $? = $stat;