#!/usr/bin/perl -w

#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2004, 2007 University of Utah and the Flux Group.
# All rights reserved.
#

# For restart: event system restart. clear ready bits and startup status,
#             boot status, port counters.

use English;
use Getopt::Std;

#
# This gets invoked from the Web interface. Restart an active experiment
# in place: bounce the event system, clear the port counters, and reboot
# the experiment's nodes.
#
sub usage()
{
    # Emit the one-line synopsis on STDOUT, then terminate the script
    # with exit status -1. This routine never returns to its caller.
    my $synopsis = "Usage: tbrestart <pid> <eid>\n";

    print STDOUT $synopsis;
    exit(-1);
}
# Option specification handed to getopts(); empty, so no switches are
# accepted.
my  $optlist = "";

#
# Configure variables. The @...@ placeholders are presumably substituted
# when this .in template is processed at build time -- TODO confirm.
# $DBNAME, $TBOPS and $TBLOGS are not referenced below in this script.
#
my $TB     = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $TBOPS  = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

# Locals
# Program run once per node to reboot it.
my $nodereboot = "$TB/bin/node_reboot";
# Time at script startup; passed to TBNodeStateWait() when waiting for nodes.
my $waitstart  = time;
# Count of nodes that were marked NODEBOOTSTATUS_FAILED.
my $failed     = 0;
# Experiment state as returned by ExpState().
my $state;

#
# Untaint the path: install a fixed PATH (system directories plus the
# testbed's libexec, sbin and bin) and remove the environment variables
# that can change how a shell starts up or splits words.
#
$ENV{'PATH'} = "/bin:/usr/bin:/sbin:/usr/sbin:$TB/libexec:$TB/sbin:$TB/bin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Turn on autoflush for the currently selected output handle, so output
# is written as soon as it is printed instead of being buffered.
#
$| = 1;

#
# Command line handling. There are no switches (the option string is
# empty), so once getopts has run exactly two arguments must remain:
# the project id and the experiment id.
#
%options = ();
usage()
    unless (getopts($optlist, \%options));
usage()
    unless (@ARGV == 2);

my ($pid, $eid) = @ARGV;
my @nodes       = ExpNodes($pid, $eid);

# Announce the start on stdout and record a debug timestamp.
my $banner = "Beginning restart in for $pid/$eid. " . TBTimeStamp() . "\n";
print $banner;
TBDebugTimeStamp("tbrestart started");

#
# Must be an active experiment to restart!
#
# ExpState() yielding a false value is reported as "no such experiment";
# any state other than EXPTSTATE_ACTIVE is refused.
#
if (! ($state = ExpState($pid, $eid))) {
    die("*** $0:\n".
        "    No such experiment $pid/$eid\n");
}
if ($state ne EXPTSTATE_ACTIVE) {
    die("*** $0:\n".
        "    Experiment must be active to be restart!\n");
}

#
# Stop the event system.
#
# NOTE(review): $DISABLE_EVENTS is never assigned anywhere in this script,
# so unless one of the imported libraries sets it, this branch always
# runs -- confirm that is what is intended.
#
if (!$DISABLE_EVENTS) {
    print "Stopping the event system.\n";
    TBDebugTimeStamp("eventsys_control started");
    # List form of system(): the program is run directly, without a
    # shell, so shell metacharacters in the command-line supplied
    # pid/eid are passed through as plain argument text.
    if (system("eventsys_control", "stop", "$pid,$eid")) {
        die("*** $0:\n".
            "    Failed to stop the event system.\n");
    }
    TBDebugTimeStamp("eventsys_control finished");
}

#
# Clearing the portstat counters seems like a good idea.
#
print "Clearing port counters.\n";
TBDebugTimeStamp("portstats started");
# List form of system(): no shell is involved, so the command-line
# supplied pid and eid reach portstats as single, uninterpreted arguments.
if (system("portstats", "-z", "-a", "-q", $pid, $eid)) {
    #
    # This is a non-fatal error; warn and carry on.
    #
    print STDERR "*** WARNING: Failed to clear port counters.\n";
}
TBDebugTimeStamp("portstats finished");

#
# Grab the node list. We are going to reboot each one in turn, instead of
# as a group. Why? Cause we need to know when the node is down so that we
# clear/reset state in the DB. We have no idea what the node is doing at
# this point. This is terribly imperfect of course, since there are no
# guarantees, especially since the events are async (a tbreset and isup
# could be in the event queue for a node). The ready bits present the worst
# problem.
#
print "Rebooting all nodes\n";
TBDebugTimeStamp("node reboot started");
foreach my $node ( @nodes ) {
    #
    # Must duplicate a check that would be done in node_reboot if we
    # gave it the entire list. No point in rebooting local jails.
    #
    my ($jailed, $plab);

    if (TBIsNodeVirtual($node, \$jailed, \$plab)) {
        # Virtual node that is neither jailed nor plab: skip it.
        if (! $jailed && ! $plab) {
            next;
        }
        # Jailed/plab virtual node that is not remote: skip it.
        if (! TBIsNodeRemote($node)) {
            next;
        }
    }

    # List form of system(): node_reboot is run directly with the node
    # name as its only argument, without going through a shell.
    if (system($nodereboot, $node)) {
        die("*** $0:\n".
            "    Failed to reboot node $node!\n");
    }
    # Clears various things including ready bits.
    TBNodeBootReset($node);
}

print STDOUT "Waiting for nodes to come up ...\n";

#
# Wait on every node in the experiment (sorted by name), including any
# that the reboot loop skipped, and record a boot status for each.
#
foreach my $node ( sort(@nodes) ) {
    #
    # A false return from TBNodeStateWait is treated as "node reached
    # TBDB_NODESTATE_ISUP"; presumably a true return means it timed out
    # -- verify against libdb.
    #
    # NOTE(review): $waitstart was sampled at script startup, so the
    # 6 minute (60*6 second) limit presumably also covers the time spent
    # rebooting the nodes one by one -- confirm.
    #
    if (! TBNodeStateWait($node, $waitstart, (60*6), undef,
                          (TBDB_NODESTATE_ISUP))) {
        print STDOUT "$node is alive and well\n";
        SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
        next;
    }
    SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
    $failed++;
}
TBDebugTimeStamp("node reboot finished");

# Any failure is fatal; the script exits here without reaching the code
# that starts the event system again.
if ($failed) {
    die("*** $0:\n".
        "    $failed nodes failed to reboot properly! \n");
}

#
# Start the event system.
#
# NOTE(review): $DISABLE_EVENTS is never assigned anywhere in this script,
# so unless one of the imported libraries sets it, this branch always
# runs -- confirm that is what is intended.
#
if (!$DISABLE_EVENTS) {
    print "Starting the event system.\n";
    TBDebugTimeStamp("eventsys_control started");
    # List form of system(): the program is run directly, without a
    # shell, so shell metacharacters in the command-line supplied
    # pid/eid are passed through as plain argument text.
    if (system("eventsys_control", "start", "$pid,$eid")) {
        die("*** $0:\n".
            "    Failed to start the event system.\n");
    }
    TBDebugTimeStamp("eventsys_control finished");
}

# Final banner and debug timestamp, then a successful exit.
print "Restart finished. " . TBTimeStamp() . "\n";
TBDebugTimeStamp("tbrestart finished");
exit(0);