Commit cf61f6f3 authored by Leigh B. Stoller's avatar Leigh B. Stoller

A set of debugging changes to allow running multiple stateds. This is

probably imperfect, but better than nothing. New option, "-t tag"
allows you to specify an arbitrary tag to match against the stated_tag
of the nodes table. The stated invocation will only operate on nodes
that match the tag, ignoring all events for other nodes. If
unspecified, stated will operate on all nodes with a NULL tag. This is
set up at the beginning of time (or during a reload), saving the
per-node tag in the $nodes hash. Each time an event arrives, check the
tag in the table, ignoring the event if not a match.

On a signaled reload(), stated must also be careful to throw away timeouts from
the queue (and be careful not to set up new timeouts for ignored
nodes).  So, this allows you to set the tag for a node in the DB, and
then HUP stated so that it reloads its tables. That node will now be
ignored by that stated.

Also made some changes to debug mode. In debug mode, don't worry about
the pidfile or the lockfile or checking for other running stated
(which causes my debug version to exit right away!). Also, added a new
-l option to turn off syslog output and just send it all to stdout with
the debug output. -l can only be used with -d, of course.

So what can I do with all this:

	update nodes set stated_tag='lbs' where node_id='pc5';
	sudo kill -HUP `cat /var/run/stated.pid`
	sudo stated -d -l -t lbs

Which tells the main stated to ignore pc5. Then I run a debugging
stated that operates only on pc5. Later when done:

	update nodes set stated_tag=NULL where node_id='pc5';
	sudo kill -HUP `cat /var/run/stated.pid`

Which tells the main stated to operate on pc5 again.
parent ac01c40b
#!/usr/bin/perl -w #!/usr/bin/perl -w
# #
# EMULAB-COPYRIGHT # EMULAB-COPYRIGHT
# Copyright (c) 2000-2003 University of Utah and the Flux Group. # Copyright (c) 2000-2004 University of Utah and the Flux Group.
# All rights reserved. # All rights reserved.
# #
...@@ -62,17 +62,28 @@ my %msgs = (); ...@@ -62,17 +62,28 @@ my %msgs = ();
my $reload_time = 600; my $reload_time = 600;
my $last_reload = time; my $last_reload = time;
# Command line opts.
my $dbtag = "";
my $debug = 0;
my $nolog = 0;
my ($server,$port);
my $lockfile;
my $pidfile;
# Process command-line arguments # Process command-line arguments
sub usage { sub usage {
print << "END"; print << "END";
Usage: $0 [-h] [-d] [-s server] [-p port] Usage: $0 [-h] [-d] [-s server] [-p port] [-t dbtag]
-h This message -h This message
-d Turn on debugging output, and don't go into the background -d Turn on debugging output, and do not go into the background
-l Do not use syslog; send output to stderr. Use with -d only
-t tag Use only those nodes with matching tag in nodes table
-s server Use specified server, instead of this site's bossnode -s server Use specified server, instead of this site's bossnode
-p port Use specified port -p port Use specified port
Send SIGHUP to reload database state, or SIGUSR1 to restart completely. Send SIGHUP to reload database state, or SIGUSR1 to restart completely.
END END
exit(1);
} }
# Only root should run this - it won't work when run as a user... # Only root should run this - it won't work when run as a user...
...@@ -83,7 +94,7 @@ if ($UID && ( $TB eq $REALTB || ! TBAdmin($UID) ) ) { ...@@ -83,7 +94,7 @@ if ($UID && ( $TB eq $REALTB || ! TBAdmin($UID) ) ) {
my @args = @ARGV; # save a copy for restart before we mess with them. my @args = @ARGV; # save a copy for restart before we mess with them.
my %opt = (); my %opt = ();
getopts("ds:p:h",\%opt); if (!getopts("ds:p:ht:l",\%opt)) { usage(); }
if ($opt{h}) { if ($opt{h}) {
exit &usage; exit &usage;
...@@ -92,7 +103,6 @@ if (@ARGV) { ...@@ -92,7 +103,6 @@ if (@ARGV) {
exit &usage; exit &usage;
} }
my ($server,$port,$debug);
if ($opt{s}) { if ($opt{s}) {
$server = $opt{s}; $server = $opt{s};
} else { } else {
...@@ -101,10 +111,16 @@ if ($opt{s}) { ...@@ -101,10 +111,16 @@ if ($opt{s}) {
if ($opt{p}) { if ($opt{p}) {
$port = $opt{p}; $port = $opt{p};
} }
if ($opt{l}) {
usage()
if (! $opt{d});
$nolog = 1;
}
if ($opt{t}) {
$dbtag = $opt{t};
}
if ($opt{d}) { if ($opt{d}) {
$debug = 1; $debug = 1;
} else {
$debug = 0;
} }
# Grab some constants into variables # Grab some constants into variables
...@@ -130,41 +146,47 @@ my $TB_OSID_MBKERNEL = TB_OSID_MBKERNEL; ...@@ -130,41 +146,47 @@ my $TB_OSID_MBKERNEL = TB_OSID_MBKERNEL;
# This only gets used here, so it isn't in a lib constant. # This only gets used here, so it isn't in a lib constant.
my $TBFREENODE = "FREENODE"; my $TBFREENODE = "FREENODE";
my $pidfile; if (!$debug) {
if ( $TB eq $REALTB ) { if ( $TB eq $REALTB ) {
$pidfile = "/var/run/stated.pid"; $pidfile = "/var/run/stated.pid";
} else {
$pidfile = "$TB/locks/stated.pid";
}
debug("Using pidfile $pidfile\n");
if (-e $pidfile) {
my $otherpid = `cat $pidfile`;
my $running = `ps -auxww | grep $otherpid | grep -v grep`;
if ($running ne "") {
fatal("Lockfile $pidfile exists, and process $otherpid appears to be ".
"running.\n");
} else { } else {
notify("Lockfile exists, but process $otherpid appears to be dead.\n". $pidfile = "$TB/locks/stated.pid";
"Removing lock file...\n");
} }
system("rm $pidfile") && debug("Using pidfile $pidfile\n");
fatal("Couldn't remove $pidfile: $? $!\n");
} if (-e $pidfile) {
# Background my $otherpid = `cat $pidfile`;
if (!$debug) { my $running = `ps -auxww | grep $otherpid | grep -v grep`;
if ($running ne "") {
fatal("Lockfile $pidfile exists, and process $otherpid appears ".
"to be running.\n");
} else {
notify("Lockfile exists, but process $otherpid appears to be dead".
"\n".
"Removing lock file...\n");
}
system("rm $pidfile") &&
fatal("Couldn't remove $pidfile: $? $!\n");
}
# Background
# We use syslog, so redirect the output to nothing # We use syslog, so redirect the output to nothing
if (TBBackGround("/dev/null")) { if (TBBackGround("/dev/null")) {
exit(0); exit(0);
} }
} }
# set up syslog # set up syslog
openlog("stated","pid",$TBLOG); if (! $nolog) {
sysopen(PIDFILE, $pidfile, O_WRONLY | O_EXCL | O_CREAT) || openlog("stated","pid",$TBLOG);
fatal("Couldn't create '$pidfile': $? $!\n"); }
print PIDFILE "$$";
close PIDFILE; if (defined($pidfile)) {
# If I make it to here, I'll need to clean up the lock file sysopen(PIDFILE, $pidfile, O_WRONLY | O_EXCL | O_CREAT) ||
my $lockfile=$pidfile; fatal("Couldn't create '$pidfile': $? $!\n");
print PIDFILE "$$";
close PIDFILE;
# If I make it to here, I'll need to clean up the lock file
$lockfile = $pidfile;
}
# Change my $0 so that it is easier to see in a ps/top # Change my $0 so that it is easier to see in a ps/top
$0 = "$0"; $0 = "$0";
...@@ -344,12 +366,24 @@ sub readStates(;@) { ...@@ -344,12 +366,24 @@ sub readStates(;@) {
#debug("readStates called\n"); #debug("readStates called\n");
my $result = DBQueryFatal("SELECT node_id, eventstate, " . my $result = DBQueryFatal("SELECT node_id, eventstate, " .
"state_timestamp, op_mode, " . "state_timestamp, op_mode, " .
"op_mode_timestamp FROM nodes ". "op_mode_timestamp, stated_tag FROM nodes ".
"where node_id not like 'sh%'"); "where node_id not like 'sh%'");
my %nodes; my %nodes;
while (my ($node_id, $state, $timestamp, $mode, $mode_timestamp) while (my ($node_id, $state, $timestamp, $mode, $mode_timestamp, $tag)
= $result->fetchrow()) { = $result->fetchrow()) {
$nodes{$node_id}{"tag"} = (defined($tag) ? $tag : "");
if ($dbtag ne "" && $dbtag eq $nodes{$node_id}{"tag"}) {
info("This stated will work on $node_id\n");
}
if ($dbtag eq "" && $dbtag ne $nodes{$node_id}{"tag"}) {
info("This stated will *NOT* work on $node_id\n");
}
if ($dbtag ne $nodes{$node_id}{"tag"}) {
remTimeout($node_id);
}
# #
# If there's an entry in oldnodes for this node, and it # If there's an entry in oldnodes for this node, and it
# hasn't changed state or time, use the old entry (so that # hasn't changed state or time, use the old entry (so that
...@@ -370,7 +404,9 @@ sub readStates(;@) { ...@@ -370,7 +404,9 @@ sub readStates(;@) {
$nodes{$node_id}{timedout} = 0; $nodes{$node_id}{timedout} = 0;
$nodes{$node_id}{noretry} = 0; $nodes{$node_id}{noretry} = 0;
# Is there a timeout? If so, set it up! # Is there a timeout? If so, set it up!
setTimeout($mode,$state,$node_id,$timestamp); if ($dbtag eq $nodes{$node_id}{"tag"}) {
setTimeout($mode,$state,$node_id,$timestamp);
}
} }
} }
return %nodes; return %nodes;
...@@ -471,16 +507,49 @@ sub handleEvent($$$) { ...@@ -471,16 +507,49 @@ sub handleEvent($$$) {
my $eventtype = event_notification_get_eventtype($handle,$notification); my $eventtype = event_notification_get_eventtype($handle,$notification);
$event_count++; $event_count++;
debug("Got an event: ($objtype,$objname,$eventtype)\n");
#
# For readability, only do this on the main stated.
#
if ($dbtag ne "") {
debug("Got an event: ($objtype,$objname,$eventtype)\n");
}
# #
# Check to see if another instance is supposed to be handling this node # Check to see if another instance is supposed to be handling this node
# #
if ($objtype ne $TBCOMMAND && !checkDBRedirect($objname)) { if ($objtype ne $TBCOMMAND) {
info("Got an event for node $objname, which isn't mine\n"); my $node = $objname;
return;
#
# If we have never seen this node, reload.
#
if (! defined($nodes{$node})) {
reload();
# Still not defined, someone screwed up! This could end up
# churning via reload(). Bad.
if (defined($nodes{$node})) {
notify("Got $objtype/$eventtype for nonexistent $node!\n");
return;
}
}
#
# If a stated_tag was specified on the command line, ignore those
# nodes that do not match.
#
if ($dbtag ne $nodes{$node}{"tag"}) {
# Record when main stated ignores a node.
info("Got $objtype/$eventtype for $node, which is not mine\n")
if ($dbtag eq "");
return;
}
if (!checkDBRedirect($node)) {
info("Got $objtype/$eventtype for $node, which is not mine\n");
return;
}
} }
SWITCH: for ($objtype) { SWITCH: for ($objtype) {
(/$TBNODESTATE/) && do { (/$TBNODESTATE/) && do {
...@@ -1052,6 +1121,17 @@ sub setTimeout( $$$$ ) { ...@@ -1052,6 +1121,17 @@ sub setTimeout( $$$$ ) {
if (0) { print "Done:\n"; qshow(); } if (0) { print "Done:\n"; qshow(); }
} }
#
# remTimeout(node) - discard any timeout currently queued for a node.
#
# Asks qfind() whether the queue holds an entry for the node. If it returns
# a defined value, the entry is removed with qdelete() and the node's slot
# in %timeout_tag is deleted as well. When nothing is queued for the node,
# this is a no-op.
#
sub remTimeout ($)
{
    my $node_id = shift;
    my $queued  = qfind($node_id);

    # Only touch the queue and the tag table when an entry actually exists.
    if (defined($queued)) {
        qdelete($node_id);
        delete($timeout_tag{$node_id});
    }
}
# Reload state from the database # Reload state from the database
sub reload() { sub reload() {
debug("Reloading state from database\n"); debug("Reloading state from database\n");
...@@ -1276,7 +1356,12 @@ sub info($;$) { ...@@ -1276,7 +1356,12 @@ sub info($;$) {
print strftime("%b %e %H:%M:%S",localtime)." stated[$$]: $message"; print strftime("%b %e %H:%M:%S",localtime)." stated[$$]: $message";
$message = "DEBUG: ".$message; $message = "DEBUG: ".$message;
} }
syslog($prio,$message) || notify("syslog failed: $? $!\n"); if ($nolog) {
print $message;
}
else {
syslog($prio,$message) || notify("syslog failed: $? $!\n");
}
} }
sub restart_wrap { $sigrestart=1; } sub restart_wrap { $sigrestart=1; }
...@@ -1341,7 +1426,9 @@ END { ...@@ -1341,7 +1426,9 @@ END {
} }
debug("Annouced. Cleaning up...\n"); debug("Annouced. Cleaning up...\n");
# clean up Syslog # clean up Syslog
closelog(); if (! $nolog) {
closelog();
}
if ($handle) { if ($handle) {
debug("Unregistering w/event system...\n"); debug("Unregistering w/event system...\n");
if (event_unregister($handle) == 0) { if (event_unregister($handle) == 0) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment