#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
# Copyright (c) 2009-2011 University of Utah and the Flux Group.
# All rights reserved.
#
use strict;
use English;
use Getopt::Std;

#
# Attempt to determine if nodes are really messed up.
#
sub usage()
{
    # Print the command line summary and exit with a non-zero status.
    # Both switches accepted by getopts ("dn") are listed: -d keeps the
    # daemon in the foreground and shortens the polling interval, -n only
    # logs which nodes would be checked.
    print "Usage: checknodes_daemon [-d] [-n]\n" .
	  "  -d  Debug mode; stay in the foreground and poll more often\n" .
	  "  -n  Impotent mode; log what would be checked, but do nothing\n";
    exit(1);
}

# Command line switches (-d and -n) and the flags they control.
my $optlist  = "dn";
my $debug    = 0;
my $impotent = 0;

#
# Configure variables
#
# The @...@ tokens are presumably substituted when this script is
# generated from its .in template.
#
my $TB     = "@prefix@";
my $TBOPS  = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";

# Files this daemon writes.
my $LOGFILE = "$TB/log/checknodes.log";
my $PIDFILE = "/var/run/checknodes.pid";

# The user the daemon works as.
my $PROTOUSER = "elabman";

# External programs.
my $SUDO          = "/usr/local/bin/sudo";
my $WAP           = "$TB/sbin/withadminprivs";
my $BATCHEXP      = "$TB/bin/batchexp";
my $NFREE         = "$TB/bin/nfree";
my $GENTOPOFILE   = "$TB/libexec/gentopofile";
my $EXPORTS_SETUP = "$TB/sbin/exports_setup";
my $NAMED_SETUP   = "$TB/sbin/named_setup";

# un-taint path
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
$ENV{PATH} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';

# Forward declarations, so the prototypes are known before first use.
sub NodeIsDead($);
sub NodeIsOkay($);
sub fatal($);
sub logit($$);
	  
#
# Flush output after every write so log lines show up immediately.
#
$| = 1;

# Refuse to run unless started by root.
die("Must be root to run this script\n")
    if ($UID != 0);

#
# Parse the command line before doing anything else.
#
my %options = ();
usage()
    if (!getopts($optlist, \%options));
$debug++
    if (defined($options{"d"}));
$impotent = 1
    if (defined($options{"n"}));

# Set this to turn off tblog in libraries.
$ENV{'TBLOG_OFF'} = "yep";

# Load the Testbed support stuff.
use lib "@prefix@/lib";
use libdb;
use libosload;
use libtestbed;
use Experiment;
use Node;
use User;

# Project and experiment ids of the experiment that holds the nodes
# while they are being checked.
my $NODEILL_PID = NODEILL_PID();
my $NODEILL_EID = NODEILL_EID();

#
# Only one please.
#
# NOTE(review): fatal() also calls MarkDaemonStopped(); confirm that this
# does not disturb the state of the instance that is already running.
#
fatal("Not starting another checknodes daemon!")
    if (CheckDaemonRunning("checknodes_daemon"));

#
# Look up the user object for $PROTOUSER. The daemon flips to this user
# before entering its main loop, and node log entries are inserted under
# this user.
#
my $elabman = User->Lookup($PROTOUSER);
fatal("Could not lookup $PROTOUSER user. Exiting ...")
    if (!defined($elabman));

#
# Grab the experiment we use to hold nodes while they are checked,
# creating it (as $PROTOUSER, via batchexp) if it does not exist yet.
#
my $experiment = Experiment->Lookup($NODEILL_PID, $NODEILL_EID);
if (!defined($experiment)) {
    #
    # Create if it does not exist.
    #
    system("$SUDO -u $PROTOUSER $WAP $BATCHEXP ".
	   " -q -i -k -j -w -f -n -S 'System Experiment' ".
	   " -L 'System Experiment' ".
	   " -E 'Check failed nodes before moving to hwdown - DO NOT DELETE' ".
	   " -p $NODEILL_PID -e $NODEILL_EID");
    if ($?) {
	fatal("Could not create experiment for $NODEILL_PID/$NODEILL_EID\n");
    }
    $experiment = Experiment->Lookup($NODEILL_PID, $NODEILL_EID);

    #
    # batchexp exited cleanly, but do not assume the lookup now works;
    # calling a method on undef below would kill the daemon without
    # telling anyone why.
    #
    if (!defined($experiment)) {
	fatal("Could not lookup experiment $NODEILL_PID/$NODEILL_EID ".
	      "after creating it");
    }
}

# Nodes get moved into this experiment, so it cannot stay marked swapped.
if ($experiment->state() eq EXPTSTATE_SWAPPED()) {
    $experiment->SetState(EXPTSTATE_ACTIVE());
}
my $pid = $experiment->pid();
my $eid = $experiment->eid();

#
# Nodes that fail the check are moved into the hwdown experiment (see
# NodeIsDead), so refuse to start if it cannot be found.
#
my $hwdown_experiment = Experiment->Lookup(NODEDEAD_PID(), NODEDEAD_EID());
fatal("Cannot find the hwdown experiment.")
    if (!defined($hwdown_experiment));

#
# Go to ground, unless debugging. Whoever gets a true value back from
# TBBackGround() exits right away (presumably the parent side of the
# fork - confirm against libtestbed).
#
exit(0)
    if (!$debug && TBBackGround($LOGFILE));

# Record that the daemon is running; a true return means that failed.
fatal("Could not mark daemon as running!")
    if (MarkDaemonRunning("checknodes_daemon"));
logit("Check Nodes Daemon starting ... pid $$", 0);

# From here on operate as the elabman user in the experiment's unix group.
fatal("Could not flipto $elabman ($experiment)")
    if ($elabman->FlipTo($experiment->unix_gid()));

#
# SIGHUP handler, for newsyslog: reopen the log file. The effective uid
# is switched to root for the duration of the reopen and then put back
# (presumably because the daemon has flipped to an unprivileged user by
# the time the signal arrives).
#
sub handler()
{
    my $saved_euid = $EUID;

    $EUID = 0;
    ReOpenLog($LOGFILE);
    $EUID = $saved_euid;
}

# No handler is installed when running in debug mode.
if (!$debug) {
    $SIG{HUP} = \&handler;
}

#
# Run the three setup scripts that follow a change to the checkup
# experiment: gentopofile for the experiment, then exports_setup, then
# named_setup. Each step is logged before it runs; the first step that
# exits non-zero is logged as failed and the remaining steps are skipped.
#
# Returns 1 if every step succeeded, 0 otherwise.
#
sub RunSetupScripts()
{
    my @steps = (
	[ $GENTOPOFILE,   "$GENTOPOFILE $pid $eid" ],
	[ $EXPORTS_SETUP, "$EXPORTS_SETUP" ],
	# The nodes will not boot locally unless there is a DNS record.
	[ $NAMED_SETUP,   "$NAMED_SETUP" ],
    );

    foreach my $step (@steps) {
	my ($prog, $cmdline) = @{$step};

	logit("Running $prog ...", 0);
	if (system($cmdline)) {
	    logit("$prog failed", 1);
	    return 0;
	}
    }
    return 1;
}

#
# Main loop. Each pass:
#
#  1. Finds free, physical test nodes that have been sitting in an
#     unexpected event state for more than 600 seconds and moves them
#     into the checkup experiment.
#  2. Reloads every imageable node in that experiment with osload.
#  3. Moves nodes that failed the reload to hwdown, frees the ones that
#     came back, and mails $TBOPS about both groups.
#
# Every pass, including one that is cut short by an error, ends in the
# continue block below, so the daemon always sleeps before trying again
# instead of spinning when a setup script keeps failing.
#
CYCLE:
while (1) {
    my @informtbopsfatal = ();
    my @informtbopswarn  = ();
    my @tmp = ();
    my $disabled;

    # Do nothing while web logins are disabled (or the sitevar is unreadable).
    if (! TBGetSiteVar("web/nologins", \$disabled) || $disabled) {
	logit("Skipping this loop because of nologins", 0);
	next CYCLE;
    }
    logit("Running", 0);

    #
    # Look for nodes in a weird state. Lets test them.
    #
    my $query_result = 
	DBQueryWarn("select n.node_id,n.eventstate, ".
	    "   FROM_UNIXTIME(n.state_timestamp) from nodes as n ".
	    "left join reserved as r on r.node_id=n.node_id ".
	    "left join node_types as t on t.type=n.type ".
	    "where (n.eventstate!='". TBDB_NODESTATE_ISUP ."' and ".
	    "       n.eventstate!='". TBDB_NODESTATE_PXEWAIT ."' and ".
	    "       n.eventstate!='". TBDB_NODESTATE_PXELIMBO ."' and ".
	    "       n.eventstate!='". TBDB_NODESTATE_ALWAYSUP ."' and ".
	    "       n.eventstate!='". TBDB_NODESTATE_POWEROFF ."') and ".
            "       r.pid is null and n.role='testnode' and ".
	    "       t.isvirtnode=0 and ".
            "       (UNIX_TIMESTAMP(now()) - n.state_timestamp) > 600");
    next CYCLE
	if (!$query_result);

  CANDIDATE:
    while (my ($nodeid,$eventstate,$stamp) = $query_result->fetchrow_array()) {
	my $node = Node->Lookup($nodeid);
	if (!defined($node)) {
	    logit("Cannot find object for $nodeid", 1);
	    next CANDIDATE;
	}
	#
	# Skip nodes that are not imageable; we cannot load them
	# to test them.
	#
	next CANDIDATE
	    if (!$node->imageable() ||
		!defined($node->default_imageid()));

	logit("Node in unknown state: $nodeid,$eventstate,$stamp", 0);

	# In impotent mode we only report the node.
	next CANDIDATE
	    if ($impotent);
	
	#
	# The node is not in any experiment, so we have to explicitly
	# push it into the hwcheckup experiment.
	#
	if ($node->MoveReservation($experiment)) {
	    logit("Could not move $node to $experiment", 1);
	    next CANDIDATE;
	}
	$node->InsertNodeLogEntry($elabman, TB_DEFAULT_NODELOGTYPE(),
				  "'Moved to hwcheckup by checknodes daemon; ".
				  "stuck in $eventstate since $stamp'");
    }

    # Drop cached state so NodeList() reflects the moves made above.
    $experiment->Flush();
    Node->FlushAll();

    my @nodelist = $experiment->NodeList();
    next CYCLE
	if (!@nodelist);

    if ($impotent) {
	logit("Would check @nodelist", 0);
	next CYCLE;
    }

    #
    # Weed out nodes we cannot test: ones whose boot attributes cannot
    # be cleared are left alone for this pass, ones that are not
    # imageable go straight to hwdown.
    #
  MEMBER:
    foreach my $node (@nodelist) {
	if ($node->ClearBootAttributes()) {
	    logit("$node: Could not clear boot attributes.", 1);
	    next MEMBER;
	}
	if (! $node->imageable()) {
	    logit("$node is not imageable.", 1);
	    NodeIsDead($node);
	    push(@informtbopsfatal, $node->node_id());
	    next MEMBER;
	}
	push(@tmp, $node);
    }
    @nodelist = @tmp;

    logit("Checking nodes @nodelist", 0);

    if (@nodelist) {
	# Give up on this pass (after the usual sleep) if setup fails.
	next CYCLE
	    if (!RunSetupScripts());

	my @nodenames       = map { $_->node_id() } @nodelist;
	my %reload_args     = ();
	my %reload_results  = ();

	$reload_args{'debug'}     = $debug;
	$reload_args{'waitmode'}  = 2; # XXX Wait till reboot after reload.
	$reload_args{'nodelist'}  = [ @nodenames ];
	logit("Running osload on @nodenames", 0);
	my $failures = osload(\%reload_args, \%reload_results);
	if ($failures) {
	    logit("osload returned $failures failures", 1);
	}

	# A true per-node result from osload means the reload failed.
	foreach my $node (@nodelist) {
	    if ($reload_results{$node->node_id()}) {
		push(@informtbopsfatal, $node->node_id());
		NodeIsDead($node);
	    }
	    else {
		push(@informtbopswarn, $node->node_id());
		NodeIsOkay($node);
	    }
	}
    }
    if (@informtbopsfatal) {
	my $count = scalar(@informtbopsfatal);
	SENDMAIL($TBOPS, "$count nodes are down",
		 "Nodes:\n".
		 "  " . join(" ", @informtbopsfatal) . "\n".
		 "appear to be dead.\n\n".
		 "The nodes have been taken out of the pool.\n");
    }
    if (@informtbopswarn) {
	my $count = scalar(@informtbopswarn);

	system("$NFREE $pid $eid @informtbopswarn");
	if ($?) {
	    fatal("Could not free nodes: @informtbopswarn");
	}
	else {
	    SENDMAIL($TBOPS, "$count nodes appear to be okay",
		     "Nodes:\n".
		     "  " . join(" ", @informtbopswarn) . "\n".
		     "have reloaded and rebooted okay.\n\n".
		     "The nodes have been freed.\n");
	}
    }

    #
    # Nodes have left the experiment, so run the setup scripts once
    # more. A failure is logged by the helper; the pass is over either
    # way.
    #
    RunSetupScripts();
}
continue {
    # Pause between passes; shorter when debugging.
    sleep(($debug ? 10 : 60));
}
MarkDaemonStopped("checknodes_daemon");
exit(0);

#
# Give up on a node: move its reservation into the hwdown experiment and
# record that in the node log. If the move fails, that is logged and
# nothing else is done.
#
sub NodeIsDead($)
{
    my ($victim) = @_;

    # MoveReservation() returns true on failure.
    if (!$victim->MoveReservation($hwdown_experiment)) {
	logit("$victim is fatally ill; moving to hwdown.", 1);
	return $victim->InsertNodeLogEntry($elabman, TB_DEFAULT_NODELOGTYPE(),
					   "Moved to hwdown by checknodes daemon");
    }
    logit("Could not move $victim to $hwdown_experiment", 1);
    return;
}

#
# Note that a node passed the check: log it and add a node log entry.
# The node is not freed here; the caller does that.
#
sub NodeIsOkay($)
{
    my ($healthy) = @_;
    my $note = "Released by checknodes daemon";

    logit("$healthy appears to be okay; releasing.", 1);
    $healthy->InsertNodeLogEntry($elabman, TB_DEFAULT_NODELOGTYPE(), $note);
}

#
# Bail out: mail the message to $TBOPS, mark the daemon as stopped, and
# die with the message. Does not return.
#
sub fatal($)
{
    my ($reason) = @_;

    # Tell the testbed operators why the daemon is going away.
    SENDMAIL($TBOPS, "Check Nodes Daemon died", $reason, $TBOPS);

    MarkDaemonStopped("checknodes_daemon");

    die("*** $0:\n    $reason\n");
}

#
# Write one timestamped line ("YYYY-MM-DD HH:MM:SS: message") to the log.
#
#   $msg    - text to log; a newline is appended.
#   $stderr - when true the line goes to STDERR, otherwise it is printed
#             to the currently selected output handle.
#
sub logit($$)
{
    my ($msg,$stderr) = @_;

    # Load POSIX here rather than counting on another module having
    # pulled it in; require is a no-op after the first call.
    require POSIX;

    # %Y gives the full year; "20%y" hard-wired the century.
    my $stamp = POSIX::strftime("%Y-%m-%d %H:%M:%S", localtime());

    if ($stderr) {
	print STDERR "$stamp: $msg\n";
    } else {
	print "$stamp: $msg\n";
    }
}