All new accounts created on Gitlab now require administrator approval. If you invite any collaborators, please let Flux staff know so they can approve the accounts.

checknodes_daemon.in 9.18 KB
Newer Older
#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
# Copyright (c) 2009-2010 University of Utah and the Flux Group.
# All rights reserved.
#
use strict;
use English;
use Getopt::Std;

#
# Attempt to determine if nodes are really messed up.
#
#
# Print the command line synopsis and exit with status 1.
#   -d  debug: stay in the foreground and poll every 10 seconds.
#   -n  impotent: log what would be checked, but do not touch any node.
#
sub usage()
{
    print "Usage: checknodes_daemon [-d] [-n]\n";
    exit(1);
}
# Options accepted by getopts(): -d (debug) and -n (impotent).
my $optlist   = "dn";
my $debug     = 0;
my $impotent  = 0;

#
# Configure variables
#
my $TB            = "@prefix@";
my $TBOPS         = "@TBOPSEMAIL@";
my $TBLOGS        = "@TBLOGSEMAIL@";
my $LOGFILE       = "$TB/log/checknodes.log";
my $PIDFILE       = "/var/run/checknodes.pid";
my $SUDO          = "/usr/local/bin/sudo";
my $PROTOUSER     = "elabman";
my $WAP           = "$TB/sbin/withadminprivs";
my $BATCHEXP      = "$TB/bin/batchexp";
my $NAMED_SETUP   = "$TB/sbin/named_setup";
my $EXPORTS_SETUP = "$TB/sbin/exports_setup";
my $GENTOPOFILE   = "$TB/libexec/gentopofile";
my $NFREE         = "$TB/bin/nfree";

# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

# Protos
sub fatal($);
sub logit($$);
sub NodeIsDead($);
sub NodeIsOkay($);
	  
#
# Turn off line buffering on output
#
$| = 1;

# The daemon refuses to run as anything but root.
if ($UID != 0) {
    die("Must be root to run this script\n");
}

#
# Check args early so we get the right DB.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (defined($options{"d"})) {
    $debug++;
}
if (defined($options{"n"})) {
    $impotent = 1;
}

# Load the Testbed support stuff.
use lib "@prefix@/lib";
use libdb;
use libosload;
use libtestbed;
use Experiment;
use Node;
use User;

# Project and experiment ids of the experiment that nodes are moved into
# while they are being checked (it is looked up, or created, below).
my $NODEILL_PID = NODEILL_PID();
my $NODEILL_EID = NODEILL_EID();

#
# Only one please.
#
if (CheckDaemonRunning("checknodes_daemon")) {
    fatal("Not starting another checknodes daemon!");
}

#
# We need this user for running below.
#
my $elabman = User->Lookup($PROTOUSER);
if (!defined($elabman)) {
    fatal("Could not lookup $PROTOUSER user. Exiting ...");
}

#
# Grab the experiment we use,
#
my $experiment = Experiment->Lookup($NODEILL_PID, $NODEILL_EID);
if (!defined($experiment)) {
    #
    # Create if it does not exist.
    #
    system("$SUDO -u $PROTOUSER $WAP $BATCHEXP ".
           " -q -i -k -j -w -f -n -S 'System Experiment' ".
           " -L 'System Experiment' ".
           " -E 'Check failed nodes before moving to hwdown - DO NOT DELETE' ".
           " -p $NODEILL_PID -e $NODEILL_EID");
    if ($?) {
        fatal("Could not create experiment for $NODEILL_PID/$NODEILL_EID\n");
    }
    $experiment = Experiment->Lookup($NODEILL_PID, $NODEILL_EID);

    # batchexp exited zero, but make sure the object really exists
    # before methods are called on it below.
    if (!defined($experiment)) {
        fatal("Could not lookup experiment $NODEILL_PID/$NODEILL_EID ".
              "after creating it\n");
    }
}
# An experiment found in the swapped state is marked active.
if ($experiment->state() eq EXPTSTATE_SWAPPED()) {
    $experiment->SetState(EXPTSTATE_ACTIVE());
}
my $pid = $experiment->pid();
my $eid = $experiment->eid();

#
# We need the hwdown experiment below.
#
my $hwdown_experiment = Experiment->Lookup(NODEDEAD_PID(), NODEDEAD_EID());
if (!defined($hwdown_experiment)) {
    fatal("Cannot find the hwdown experiment.");
}

# Go to ground.
if (! $debug) {
    # NOTE(review): presumably TBBackGround() returns true in the parent,
    # which then exits and leaves the child running - confirm in libtestbed.
    if (TBBackGround($LOGFILE)) {
        exit(0);
    }
}
if (MarkDaemonRunning("checknodes_daemon")) {
    fatal("Could not mark daemon as running!");
}
logit("Check Nodes Daemon starting ... pid $$", 0);

# Run the rest of the script as the proto user, in the experiment's group.
if ($elabman->FlipTo($experiment->unix_gid())) {
    fatal("Could not flipto $elabman ($experiment)");
}

#
# Setup a signal handler for newsyslog.
#
sub handler()
{
    # Reopen the log with an effective uid of root, then restore
    # whatever effective uid the daemon had before the signal.
    my $SAVEEUID = $EUID;

    $EUID = 0;
    ReOpenLog($LOGFILE);
    $EUID = $SAVEEUID;
}
# In debug mode the daemon is not backgrounded, so no handler is installed.
$SIG{HUP} = \&handler
    if (!$debug);

#
# Run gentopofile for the experiment, then exports_setup and named_setup
# (the nodes will not boot locally unless there is a DNS record).
# Stops at the first script that exits non-zero, logs it, and returns 0.
# Returns 1 when all three succeed.
#
my $run_setup_scripts = sub {
    my @steps = ([$GENTOPOFILE,   "$GENTOPOFILE $pid $eid"],
                 [$EXPORTS_SETUP, "$EXPORTS_SETUP"],
                 [$NAMED_SETUP,   "$NAMED_SETUP"]);

    foreach my $step (@steps) {
        my ($prog, $cmdline) = @{$step};

        logit("Running $prog ...", 0);
        if (system($cmdline)) {
            logit("$prog failed", 1);
            return 0;
        }
    }
    return 1;
};

#
# Main loop. Every pass ends in the continue block, so the daemon always
# sleeps between passes, including when a pass is abandoned early with
# "next CYCLE" (for example because a setup script failed).
#
CYCLE: while (1) {
    my @informtbopsfatal = ();
    my @informtbopswarn  = ();
    my @tmp = ();
    my $disabled;

    if (! TBGetSiteVar("web/nologins", \$disabled) || $disabled) {
        logit("Skipping this loop because of nologins", 0);
        next CYCLE;
    }
    logit("Running", 0);

    #
    # Look for nodes in a weird state. Lets test them.
    # These are unreserved, non-virtual test nodes that have been in a
    # state other than the ones listed for more than 600 seconds.
    #
    my $query_result =
        DBQueryWarn("select n.node_id,n.eventstate, ".
            "   FROM_UNIXTIME(n.state_timestamp) from nodes as n ".
            "left join reserved as r on r.node_id=n.node_id ".
            "left join node_types as t on t.type=n.type ".
            "where (n.eventstate!='". TBDB_NODESTATE_ISUP ."' and ".
            "       n.eventstate!='". TBDB_NODESTATE_PXEWAIT ."' and ".
            "       n.eventstate!='". TBDB_NODESTATE_PXELIMBO ."' and ".
            "       n.eventstate!='". TBDB_NODESTATE_ALWAYSUP ."' and ".
            "       n.eventstate!='". TBDB_NODESTATE_POWEROFF ."') and ".
            "       r.pid is null and n.role='testnode' and ".
            "       t.isvirtnode=0 and ".
            "       (UNIX_TIMESTAMP(now()) - n.state_timestamp) > 600");
    next CYCLE
        if (!$query_result);

    while (my ($nodeid,$eventstate,$stamp) = $query_result->fetchrow_array()) {
        my $node = Node->Lookup($nodeid);
        if (!defined($node)) {
            logit("Cannot find object for $nodeid", 1);
            next;
        }
        #
        # Skip nodes that are not imageable; we cannot load them
        # to test them.
        #
        next
            if (!$node->imageable() ||
                !defined($node->default_imageid()));

        logit("Node in unknown state: $nodeid,$eventstate,$stamp", 0);

        next
            if ($impotent);

        #
        # The node is not in any experiment, so we have to explicitly
        # push it into the hwcheckup experiment.
        #
        if ($node->MoveReservation($experiment)) {
            logit("Could not move $node to $experiment", 1);
            next;
        }
        $node->InsertNodeLogEntry($elabman, TB_DEFAULT_NODELOGTYPE(),
                                  "'Moved to hwcheckup by checknodes daemon; ".
                                  "stuck in $eventstate since $stamp'");
    }

    # Drop cached objects so NodeList() reflects the moves made above.
    $experiment->Flush();
    Node->FlushAll();

    my @nodelist = $experiment->NodeList();
    next CYCLE
        if (!@nodelist);

    if ($impotent) {
        logit("Would check @nodelist", 0);
        next CYCLE;
    }

    # Keep only the nodes that can actually be reloaded.
    foreach my $node (@nodelist) {
        if ($node->ClearBootAttributes()) {
            logit("$node: Could not clear boot attributes.", 1);
            next;
        }
        if (! $node->imageable()) {
            logit("$node is not imageable.", 1);
            NodeIsDead($node);
            push(@informtbopsfatal, $node->node_id());
            next;
        }
        push(@tmp, $node);
    }
    @nodelist = @tmp;

    logit("Checking nodes @nodelist", 0);

    if (@nodelist) {
        # Without the setup scripts the reload cannot work; try again
        # on the next pass (after the usual sleep).
        next CYCLE
            if (! $run_setup_scripts->());

        my @nodenames       = map { $_->node_id() } @nodelist;
        my %reload_args     = ();
        my %reload_results  = ();

        $reload_args{'debug'}     = $debug;
        $reload_args{'waitmode'}  = 2; # XXX Wait till reboot after reload.
        $reload_args{'nodelist'}  = [ @nodenames ];
        logit("Running osload on @nodenames", 0);
        my $failures = osload(\%reload_args, \%reload_results);
        if ($failures) {
            logit("osload returned $failures failures", 1);
        }

        # A true per-node result from osload is treated as a failed reload.
        foreach my $node (@nodelist) {
            if ($reload_results{$node->node_id()}) {
                push(@informtbopsfatal, $node->node_id());
                NodeIsDead($node);
            }
            else {
                push(@informtbopswarn, $node->node_id());
                NodeIsOkay($node);
            }
        }
    }
    if (@informtbopsfatal) {
        my $count = scalar(@informtbopsfatal);
        SENDMAIL($TBOPS, "$count nodes are down",
                 "Nodes:\n".
                 "  " . join(" ", @informtbopsfatal) . "\n".
                 "appear to be dead.\n\n".
                 "The nodes have been taken out of the pool.\n");
    }
    if (@informtbopswarn) {
        my $count = scalar(@informtbopswarn);

        system("$NFREE $pid $eid @informtbopswarn");
        if ($?) {
            fatal("Could not free nodes: @informtbopswarn");
        }
        else {
            SENDMAIL($TBOPS, "$count nodes appear to be okay",
                     "Nodes:\n".
                     "  " . join(" ", @informtbopswarn) . "\n".
                     "have reloaded and rebooted okay.\n\n".
                     "The nodes have been freed.\n");
        }
    }

    # Run the setup scripts again now that nodes have left the experiment.
    # A failure is logged by the helper; the pass is over either way.
    $run_setup_scripts->();
}
continue {
    sleep(($debug ? 10 : 60));
}
# Not reached: the loop above never terminates on its own.
MarkDaemonStopped("checknodes_daemon");
exit(0);

#
# Move a node that failed its checkup into the hwdown experiment and note
# that in the node log. If the reservation cannot be moved, the failure is
# logged and no node log entry is written.
#
sub NodeIsDead($)
{
    my ($node) = @_;

    if ($node->MoveReservation($hwdown_experiment)) {
        logit("Could not move $node to $hwdown_experiment", 1);
        return;
    }
    logit("$node is fatally ill; moving to hwdown.", 1);
    $node->InsertNodeLogEntry($elabman, TB_DEFAULT_NODELOGTYPE(),
                              "Moved to hwdown by checknodes daemon");
}

#
# Log that a node passed its checkup and add a matching node log entry.
# The node itself is not freed here; the main loop does that with nfree.
#
sub NodeIsOkay($)
{
    my ($node) = @_;

    logit("$node appears to be okay; releasing.", 1);
    $node->InsertNodeLogEntry($elabman, TB_DEFAULT_NODELOGTYPE(),
                              "Released by checknodes daemon");
}

#
# Report an unrecoverable error: mail the message to $TBOPS, mark the
# daemon as stopped, and die with the message. Does not return.
#
sub fatal($)
{
    my ($msg) = @_;

    #
    # Send a message to the testbed list.
    #
    SENDMAIL($TBOPS,
             "Check Nodes Daemon died",
             $msg,
             $TBOPS);

    MarkDaemonStopped("checknodes_daemon");

    die("*** $0:\n".
        "    $msg\n");
}
#
# Write a timestamped message ("YYYY-MM-DD HH:MM:SS: msg") followed by a
# newline. The message goes to STDERR when $stderr is true, otherwise to
# STDOUT.
#
sub logit($$)
{
    my ($msg,$stderr) = @_;

    # strftime comes from POSIX; load it here instead of relying on some
    # other module having pulled it in. %Y gives the full four digit year.
    require POSIX;
    my $stamp = POSIX::strftime("%Y-%m-%d %H:%M:%S", localtime());

    if ($stderr) {
        print STDERR "$stamp: $msg\n";
    } else {
        print "$stamp: $msg\n";
    }
}