#!/usr/bin/perl -wT

#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2002 University of Utah and the Flux Group.
# All rights reserved.
#

use English;
use Getopt::Std;

#
# Reboot a node (or nodes). Will power cycle the node as a last resort.
# Use -e option to reboot all nodes in an experiment.
#
# usage: node_reboot [-d] [-f] node [node ...]
#        node_reboot [-d] [-f] -e pid,eid
#
#        Exit value is 0 if all nodes reboot okay, or the number of nodes
#        that could not be rebooted.
#
# usage() prints the command line synopsis on STDOUT and then terminates
# the script via exit(-1); it never returns to its caller.
#
sub usage()
{
    print STDOUT "Usage: node_reboot [-d] [-f] node [node ...]\n" .
                 "       node_reboot [-d] [-f] -e pid,eid\n".
                 "Use the -d option to turn on debugging\n" .
                 "Use the -e option to reboot all the nodes in an experiment\n" .
                 "Use the -f option to shoot the node in the head\n";
    exit(-1);
}

# Option string handed to getopts(): -d and -f are flags, -e takes a value.
my  $optlist = "dfe:";

#
# Configure variables
#
my $TB          = "@prefix@";

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use POSIX qw(strftime);

# External programs and files used below.
my $ssh         = "$TB/bin/sshtb -n";       # used to run /sbin/reboot on a node
my $power       = "$TB/bin/power";          # invoked as "power cycle <nodes>"
my $ipod        = "$TB/sbin/apod";          # "ping of death" sender
my $logfile     = "$TB/log/power.log";      # appended to by info()
my $ping        = "/sbin/ping";

# Script state.
my %pids        = ();   # node name => pid of the child rebooting it
my @row;
my @nodes       = ();   # nodes to reboot
my $debug       = 0;    # set by -d
my $force       = 0;    # set by -f
my $failed      = 0;    # number of nodes whose reboot failed
my $eidmode     = 0;    # holds the -e argument when given
my $pid;                # first half of the "pid,eid" argument to -e
my $eid;                # second half of the "pid,eid" argument to -e

# un-taint path
$ENV{'PATH'} = '/bin:/sbin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

# Turn off output buffering (autoflush the selected handle).
$| = 1;

#
# We don't want to run this script unless it's the real version.
#
if ($EUID != 0) {
    die("Must be root! Maybe its a development version?");
}

#
# Parse command arguments. Once we return from getopts, all that should
# be left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (defined($options{"d"})) {
    $debug = 1;
}
if (defined($options{"f"})) {
    $force = 1;
}
if (defined($options{"e"})) {
    # -e replaces the node list, so no other arguments are allowed.
    if (@ARGV) {
        usage();
    }

    $eidmode = $options{"e"};

    #
    # Anchor the pattern and require non-empty halves, so that only a
    # well formed "pid,eid" is accepted. The captures also untaint the
    # values.
    #
    if ($eidmode =~ /^([-\w]+),([-\w]+)$/) {
        $pid = $1;
        $eid = $2;
    }
    else {
        print STDOUT "Invalid argument to -e option: $eidmode\n";
        usage();
    }
}

#
# If eidmode, then get the node list out of the DB instead of the command
# line. A proper check is made later, so no need to be fancy about the query.
#
if ($eidmode) {
    my @row;

    #
    # Verify permission to muck with this experiment.
    #
    if ($UID && !TBAdmin($UID) &&
        ! TBExptAccessCheck($UID, $pid, $eid, TB_EXPT_MODIFY)) {
        die("*** You do not have permission to reboot nodes in $pid/$eid!\n");
    }

    # $pid and $eid were restricted to [-\w] characters when -e was parsed.
    my $query_result =
        DBQueryFatal("select node_id from reserved where ".
                     "pid='$pid' and eid='$eid'");

    if ($query_result->numrows == 0) {
        print STDOUT "There are no nodes reserved in pid/eid $pid/$eid\n";
        usage();
    }
    while (@row = $query_result->fetchrow_array()) {
        push(@nodes, $row[0]);
    }
}
else {
    if (@ARGV == 0) {
        usage();
    }

    # Untaint the nodes.
    foreach my $node ( @ARGV ) {
        if ($node =~ /^([-\@\w]+)$/) {
            $node = $1;
        }
        else {
            die("Bad node name: $node.");
        }

        push(@nodes, $node);
    }

    #
    # Verify permission to reboot these nodes.
    #
    if ($UID && !TBAdmin($UID) &&
        ! TBNodeAccessCheck($UID, TB_NODEACCESS_REBOOT, @nodes)) {
        die("You do not have permission to reboot one (or more) ".
            "of the nodes!\n");
    }
}

#
# VIRTNODE HACK: Virtual nodes are special. Do not reboot!
#
# Rebuild @nodes without any node that TBIsNodeVirtual() reports as
# virtual, and exit successfully if nothing is left to do.
#
my @temp = ();
foreach my $node ( @nodes ) {
    if (TBIsNodeVirtual($node)) {
        print "*** Skipping virtual node $node ...\n";
        next;
    }
    push(@temp, $node);
}
@nodes = @temp;
if (! @nodes) {
    print "No nodes to reboot. Exiting ...\n";
    exit(0);
}

#
# Another shark hack. Well, perhaps not. We really don't want 50 nodes
# all rebooting at the same time, PCs *or* sharks. Let's order them
# so that the shelves are grouped together at least, and issue the reboots
# in batches.
#
my @sortednodes = sort(@nodes);

while (@sortednodes) {
    my @batch     = ();
    my $lastshelf = 0;

    # A batch holds at most 8 nodes.
    while (@batch < 8 && @sortednodes > 0) {
        my $node = shift(@sortednodes);
        my $shelf;
        my $unit;

        #
        # The point of this silliness is to stop at each shelf transition.
        #
        if (IsShelved($node, \$shelf, \$unit)) {
            if ($lastshelf && $lastshelf ne $shelf) {
                # Node belongs to the next shelf; put it back for later.
                unshift(@sortednodes, $node);
                last;
            }
            $lastshelf = $shelf;
        }

        push(@batch, $node);
    }

    if ($force) {
        #
        # In force mode, call the power program for the whole batch, and
        # continue on. We don't wait for them to go down or reboot.
        #
        info("Force mode: power cycle ".join(" ",@batch));

        # List form: no shell is involved in running the power program.
        system($power, "cycle", @batch);
        if ($?) {
            #
            # $? >> 8 is zero when power was killed by a signal; make
            # sure a failure never turns into a zero exit status.
            #
            exit(($? >> 8) || 1);
        }
    }
    else {
        #
        # Fire off a reboot process so that we can overlap them all.
        # We need the pid so we can wait for them all before proceeding.
        #
        foreach my $node ( @batch ) {
            $pids{$node} = RebootNode($node);
        }
    }

    #
    # If there are more nodes to go, then let's pause a bit so that we
    # do not get a flood of machines coming up all at the same exact
    # moment.
    #
    if (@sortednodes) {
        print STDOUT "Pausing to give some nodes time to reboot ...\n";
        if ($lastshelf) {
            sleep(15);
        } else {
            sleep(10);
        }
    }
}

#
# In force mode, we are done.
#
if ($force) {
    exit 0;
}

#
# Wait for all the reboot children to exit before continuing.
# The exit status of the script is the number of failed nodes.
#
foreach my $node ( sort(@nodes) ) {
    my $mypid = $pids{$node};

    #
    # No child was recorded for this node. Do not call waitpid with an
    # undefined pid (that would reap some unrelated child); just count
    # the node as failed.
    #
    if (!defined($mypid)) {
        $failed++;
        print STDERR "Reboot of node $node failed!\n";
        next;
    }

    waitpid($mypid, 0);
    if ($?) {
        $failed++;
        print STDERR "Reboot of node $node failed!\n";
    }
    else {
        print STDOUT "$node rebooting ...\n";
    }
}

if ($debug && $failed) {
    print STDERR "$failed nodes could not be rebooted\n";
}
exit $failed;

#
# Reboot a node in a child process. Return the pid to the parent so
# that it can wait on all the children later.
#
# Takes the node name. In the parent, returns the pid of the forked child;
# dies if the child cannot be forked. The child never returns: it exits 0
# once the node stopped answering pings or was power cycled, and exits -1
# when the power cycle failed.
#
sub RebootNode {
    my ($pc) = @_;
    my ($syspid, $mypid);

    print STDOUT "Rebooting $pc ...\n";

    $mypid = fork();
    if (!defined($mypid)) {
        #
        # Without this check a failed fork would fall through and run
        # the child-only code below (which ends in exit) in the parent.
        #
        die("*** Could not fork a reboot process for $pc: $!\n");
    }
    if ($mypid) {
        return $mypid;
    }
    TBdbfork();

    #
    # See if the machine is pingable. If it's not pingable, then we just
    # power cycle the machine rather than wait for ssh to time out.
    #
    if (! DoesPing($pc)) {
        info("$pc appears dead: power cycle");
        print STDERR "$pc appears to be dead. Power cycling ...\n" if $debug;
        if (PowerCycle($pc)) {
            exit(-1);
        }
        exit(0);
    }

    #
    # Machine is pingable at least. Try to reboot it gracefully,
    # or power cycle anyway if that does not work.
    #
    print STDERR "Trying ssh reboot of $pc ...\n" if $debug;

    #
    # Must change our real UID to root so that ssh will work. We save the old
    # UID so that we can restore it after we finish the ssh
    #
    my $oldUID = $UID;
    print STDERR "Saved UID: $oldUID\n" if $debug;
    $UID = 0;

    #
    # Run an ssh command in a child process, protected by an alarm to
    # ensure that the ssh is not hung up forever if the machine is in
    # some funky state.
    #
    $syspid = fork();

    if (!defined($syspid)) {
        #
        # Could not start ssh at all. Fall through to the ping check
        # below, which ends in a power cycle if the node stays up.
        #
        info("$pc: could not fork for ssh reboot: $!");
        print STDERR "Could not fork ssh for $pc: $!\n" if $debug;
    }
    elsif ($syspid) {
        # Give ssh 20 seconds, then terminate it.
        local $SIG{ALRM} = sub { kill("TERM", $syspid); };
        alarm 20;
        waitpid($syspid, 0);
        # Grab the wait status before anything else can disturb $?.
        my $sshstat = $?;
        alarm 0;

        #
        # The ssh can return non-zero exit status, but still have worked.
        # FreeBSD for example.
        #
        print STDERR "reboot of $pc returned $sshstat.\n" if $debug;

        #
        # If either ssh is not running or it timed out,
        # send it a ping of death.
        #
        # 256 is exit code 1; 15 is death by signal 15 (the TERM sent
        # by the alarm handler above).
        #
        if ($sshstat == 256 || $sshstat == 15) {
            if ($sshstat == 256) {
                print STDERR "$pc is not running sshd.\n" if $debug;
            } else {
                print STDERR "$pc is wedged.\n" if $debug;
            }
            info("$pc: ssh reboot failed ... sending ipod");
            print STDERR "Trying Ping-of-Death on $pc ...\n" if $debug;

            system("$ipod $pc");
        } else {
            info("$pc: ssh reboot");
        }
    }
    else {
        exec("$ssh -host $pc /sbin/reboot");
        # Only reached if the exec itself failed.
        # NOTE(review): exiting 0 here looks like success to the parent
        # branch above; confirm that is intended.
        exit(0);
    }

    #
    # Restore the old UID so that scripts run from this point on get the
    # user's real UID
    #
    $UID = $oldUID;
    print STDERR "Restored UID: $UID\n" if $debug;

    #
    # Okay, before we power cycle let's really make sure. We wait a while
    # for it to stop responding to pings, and if it never goes silent,
    # punch the power button.
    #
    if (WaitTillDead($pc) == 0) {
        TBSetNodeEventState($pc,TBDB_NODESTATE_REBOOTING);
        exit(0);
    }

    info("$pc: ipod failed ... power cycle");
    print STDERR "$pc is still running. Power cycling ...\n" if $debug;
    if (PowerCycle($pc)) {
        info("$pc: power cycle failed!");
        exit(-1);
    }
    exit(0);
}

#
# Power cycle a PC using the testbed power program.
#
# Takes the node name. Returns 0 when the power program exited with
# status 0, and a non-zero value otherwise (its exit code when it has
# one, else 1). Callers only test the result for truth.
#
sub PowerCycle {
    my ($pc) = @_;

    # List form: the node name is never interpreted by a shell.
    system($power, "cycle", $pc);
    return 0
        if ($? == 0);

    #
    # $? >> 8 is zero when the program was killed by a signal, so do
    # not let that case be reported as success.
    #
    return (($? >> 8) || 1);
}

#
# Wait until a machine stops returning ping packets.
#
# Takes the node name. Returns 0 as soon as DoesPing() reports the node
# as not answering, or 1 if it still answers after 30 attempts.
#
sub WaitTillDead {
    my ($pc) = @_;

    print STDERR "Waiting for $pc to die off\n" if $debug;

    #
    # Sigh, a long ping results in the script waiting until all the
    # packets are sent from all the pings, before it will exit. So,
    # loop doing a bunch of shorter pings.
    #
    foreach my $attempt (1 .. 30) {
        if (! DoesPing($pc)) {
            print STDERR "$pc is rebooting.\n" if $debug;
            return 0;
        }
    }
    print STDERR "$pc is still alive.\n" if $debug;
    return 1;
}

#
# Returns 1 if host is responding to pings, 0 otherwise
#
# Takes the node name. The ping program is run with the real UID
# temporarily set to 0; the caller's UID is restored afterwards.
#
sub DoesPing {
    my ($pc) = @_;

    my $saveuid = $UID;
    $UID = 0;
    system("$ping -q -i 0.25 -c 8 -t 2 $pc >/dev/null 2>&1");
    # Take the wait status right away, before doing anything else.
    my $waitstat = $?;
    $UID = $saveuid;

    my $status = $waitstat >> 8;

    #
    # Returns 0 if any packets are returned. Returns 2 if pingable
    # but no packets are returned. Other non-zero error codes indicate
    # other problems.  Any non-zero return indicates "not pingable" to us.
    #
    print STDERR "$ping $pc returned $status\n" if $debug;

    #
    # Test the whole wait status, not just the exit code: a ping that
    # was killed by a signal has exit code 0 but proved nothing.
    #
    if ($waitstat != 0) {
        return 0;
    }
    return 1;
}

#
# Append a timestamped message to the log file ($logfile).
#
# Takes the message text. Nothing useful is returned. If the log file
# cannot be opened, a warning is printed and the message is dropped;
# logging problems never stop a reboot.
#
sub info($) {
    my $message = shift;

    # Print out log entries like this:
    # Sep 20 09:36:00 $message
    my $log;
    if (!open($log, ">>", $logfile)) {
        warn("*** Could not open $logfile for append: $!\n");
        return;
    }
    print $log strftime("%b %e %H:%M:%S",localtime)." $message\n";
    close($log);
}