Commit ab0a67d0 authored by Leigh B. Stoller

New node_reboot command. To be used instead of power, especially by

mere users. os_load and os_setup reworked to use node_reboot.
parent d08bd358
......@@ -11,9 +11,11 @@ include $(OBJDIR)/Makeconf
SUBDIRS = checkpass ir ns2ir
BIN_STUFF = power snmpit tbend tbrun tbprerun tbreport \
os_load savevlans startexp endexp batchexp killbatchexp
os_load savevlans startexp endexp batchexp killbatchexp \
node_reboot
CONTROL_BIN = power snmpit os_load
# Stuff that mere users get on plastic.
USERS_BIN = snmpit os_load node_reboot
SBIN_STUFF = resetvlans console_setup.proxy sched_reload named_setup \
batch_daemon exports_setup reload_daemon
......@@ -69,20 +71,16 @@ post-install:
chmod u+s $(INSTALL_LIBEXECDIR)/mkacct-ctrl
chown root $(INSTALL_LIBEXECDIR)/rmacct-ctrl
chmod u+s $(INSTALL_LIBEXECDIR)/rmacct-ctrl
chown root $(INSTALL_LIBEXECDIR)/os_setup
chmod u+s $(INSTALL_LIBEXECDIR)/os_setup
chown root $(INSTALL_SBINDIR)/named_setup
chmod u+s $(INSTALL_SBINDIR)/named_setup
chown root $(INSTALL_SBINDIR)/exports_setup
chmod u+s $(INSTALL_SBINDIR)/exports_setup
chown root $(INSTALL_BINDIR)/os_load
chmod u+s $(INSTALL_BINDIR)/os_load
chown root $(INSTALL_BINDIR)/savevlans
chmod u+s $(INSTALL_BINDIR)/savevlans
chown root $(INSTALL_LIBEXECDIR)/console_setup
chmod u+s $(INSTALL_LIBEXECDIR)/console_setup
chown root $(INSTALL_SBINDIR)/batch_daemon
chmod u+s $(INSTALL_SBINDIR)/batch_daemon
chown root $(INSTALL_BINDIR)/node_reboot
chmod u+s $(INSTALL_BINDIR)/node_reboot
#
# Control node installation (okay, plastic)
......
#!/usr/bin/perl -wT
use English;
use Getopt::Std;
#
# Reboot a node. Will power cycle the node as a last resort.
#
# usage: node_reboot [-d] node [node ...]
# Exit value is 0 if all nodes reboot okay, or the number of nodes
# that could not be rebooted.
#
#
# Print the command line synopsis on STDOUT and terminate the script
# via exit(-1). This routine never returns to its caller.
#
sub usage()
{
    my @synopsis = ("Usage: node_reboot [-d] node [node ...]\n",
                    "Use the -d option to turn on debugging\n");

    print STDOUT join("", @synopsis);
    exit(-1);
}
my $optlist = "d";

#
# Configure variables
#
my $TB     = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $ssh    = "ssh -n -q";
my $power  = "$TB/bin/power";
my $ipod   = "$TB/sbin/ipod";
my $ping   = "/sbin/ping";
my %pids   = ();
my @row;
my @nodes  = ();
my $debug  = 0;
my $failed = 0;

# un-taint path
$ENV{'PATH'} = '/bin:/sbin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

# Turn off line buffering on output
$| = 1;

#
# Set up for querying the database. The handle is only checked at the
# point where it is first needed (the non-root permission check below),
# so that root can still reboot nodes when the database is unreachable.
#
use Mysql;
my $DB = Mysql->connect("localhost", $DBNAME, "script", "none");

#
# We don't want to run this script unless its the real version.
#
if ($EUID != 0) {
    die("Must be root! Maybe its a development version?");
}

#
# Parse command arguments. Once we return from getopts, all that should
# be left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV == 0) {
    usage();
}
if (defined($options{"d"})) {
    $debug = $options{"d"};
}

#
# Untaint the nodes. A node that is named more than once is only
# rebooted once: the children are tracked by node name in %pids, so a
# duplicate would fork a second child, lose the pid of the first one,
# and then be miscounted as a failure when it is waited for twice.
#
my %seen = ();
foreach my $node ( @ARGV ) {
    if ($node =~ /^([-\@\w]+)$/) {
        $node = $1;
    }
    else {
        die("Bad node name: $node.");
    }
    if (! $seen{$node}++) {
        push(@nodes, $node);
    }
}

#
# Figure out who called us. Root and admin types can do whatever they
# want. Normal users can reboot nodes in their experiment.
#
if ($UID != 0) {
    my $query_result;
    my ($name) = getpwuid($UID) or
        die("$UID not in passwd file");

    # Fail with a readable message instead of a method call on undef.
    if (! $DB) {
        die("Could not connect to database $DBNAME");
    }

    $query_result = DBquery("select admin from users where uid='$name'");
    if (! $query_result) {
        die("DB Error getting user status");
    }
    @row = $query_result->fetchrow_array();

    # No row at all, or a NULL admin column, means "not an admin".
    if (! @row || ! defined($row[0]) || $row[0] != 1) {
        #
        # Check to make sure that mere user is allowed to muck with nodes
        #
        foreach my $node (@nodes) {
            $query_result = DBquery("select reserved.node_id from reserved ".
                                    "left join proj_memb on ".
                                    "reserved.pid=proj_memb.pid and ".
                                    "reserved.node_id='$node' ".
                                    "where proj_memb.uid='$name'");
            if (! $query_result) {
                die("DB Error checking permission for $node");
            }
            if ($query_result->numrows < 1) {
                die("You do not have permission to reboot $node\n");
            }
        }
    }
}

#
# Fire off a reboot process so that we can overlap them all.
# We need the pid so we can wait for them all before proceeding.
#
foreach my $node ( @nodes ) {
    $pids{$node} = RebootNode($node);
}

#
# Wait for all the reboot children to exit before continuing. A child
# that cannot be waited for counts as a failure, just like one that
# exits with a non-zero status.
#
foreach my $node ( @nodes ) {
    my $mypid = $pids{$node};

    if (! defined($mypid) || waitpid($mypid, 0) != $mypid || $?) {
        $failed++;
        print STDERR "Reboot of node $node failed!\n";
    }
    else {
        print STDOUT "$node rebooting ...\n";
    }
}

if ($debug && $failed) {
    print STDERR "$failed nodes could not be rebooted\n";
}

#
# The exit status is the number of failed nodes. Process exit codes are
# taken modulo 256, so cap the value; otherwise exactly 256 failures
# would be reported to the caller as success.
#
exit($failed > 255 ? 255 : $failed);
#
# Reboot a node in a child process. Return the pid to the parent so
# that it can wait on all the children later.
#
sub RebootNode {
    #
    # Argument: the name of the node to reboot. It is interpolated into
    # shell command lines, so it is expected to be untainted already.
    #
    # In the parent this returns the pid of the child that handles the
    # node. The child never returns: it exits 0 once the node has gone
    # down or has been power cycled, and exits -1 when the power cycle
    # failed. Dies in the parent if the child cannot be forked.
    #
    my ($pc) = @_;
    my ($status, $syspid, $mypid);

    print STDOUT "Rebooting $pc ...\n";

    #
    # fork() returns undef on failure. That must not be mistaken for the
    # 0 that the child sees, or the parent would run the reboot sequence
    # itself and then exit the whole script.
    #
    $mypid = fork();
    if (! defined($mypid)) {
        die("Could not fork a child to reboot $pc: $!\n");
    }
    if ($mypid) {
        return $mypid;
    }

    #
    # See if the machine is pingable. If its not pingable, then we just
    # power cycle the machine rather than wait for ssh to time out.
    #
    # ping returns 0 if any packets make it through.
    #
    system("$ping -q -c 4 -t 4 $pc >/dev/null 2>&1");
    $status = $? >> 8;

    print STDERR "Ping $pc returned $status.\n" if $debug;

    #
    # Power cycle if the machine is dead.
    #
    if ($status) {
        print STDERR "$pc appears to be dead. Power cycling ...\n" if $debug;
        if (PowerCycle($pc)) {
            exit(-1);
        }
        exit(0);
    }

    #
    # Machine is pingable at least. Try to reboot it gracefully,
    # or power cycle anyway if that does not work.
    #
    print STDERR "Rebooting $pc with ssh command ...\n" if $debug;

    #
    # Run an ssh command in a child process, protected by an alarm to
    # ensure that the ssh is not hung up forever if the machine is in
    # some funky state.
    #
    $syspid = fork();
    if (! defined($syspid)) {
        #
        # No ssh child could be created. Do not exec ssh in place of
        # this process; fall through to the wait/power cycle logic.
        #
        print STDERR "Could not fork ssh for $pc: $!\n";
    }
    elsif ($syspid) {
        local $SIG{ALRM} = sub { kill("TERM", $syspid); };
        alarm 30;
        waitpid($syspid, 0);
        # Grab the wait status before anything else can change $?.
        my $sshstatus = $?;
        alarm 0;

        #
        # The ssh can return non-zero exit status, but still have worked.
        # FreeBSD for example.
        #
        print STDERR "reboot returned $sshstatus.\n" if $debug;

        #
        # Did the ssh time out? A wait status of 15 means it was killed
        # by the TERM sent from the alarm handler. Send a ping of death.
        #
        if ($sshstatus == 15) {
            print STDERR "$pc is wedged. Sending a POD.\n" if $debug;
            system("$ipod $pc");
        }
    }
    else {
        # Must change our real UID to root so that ssh will work.
        $UID = 0;
        exec("$ssh $pc /sbin/reboot");
        exit(0);
    }

    #
    # Okay, before we power cycle lets really make sure. We wait a while
    # for it to stop responding to pings, and if it never goes silent,
    # punch the power button.
    #
    if (WaitTillDead($pc) == 0) {
        exit(0);
    }

    print STDERR "$pc is still running. Power cycling ...\n" if $debug;
    if (PowerCycle($pc)) {
        exit(-1);
    }
    exit(0);
}
#
# Power cycle a PC using the testbed power program.
#
sub PowerCycle {
    #
    # Argument: the name of the node to power cycle.
    #
    # Returns 0 when the power program ran and exited with status 0.
    # Returns non-zero on any failure: the exit code of the power
    # program when it exited with one, or 1 when it could not be
    # started or was killed by a signal.
    #
    my ($pc) = @_;

    system("$power cycle $pc");
    my $waitstatus = $?;

    if ($waitstatus == 0) {
        return 0;
    }

    #
    # The exit code lives in the high byte of the wait status. When the
    # program died from a signal (or never started, -1) that byte holds
    # nothing useful, and shifting alone would report success.
    #
    my $exitcode = ($waitstatus == -1) ? 0 : ($waitstatus >> 8);

    return ($exitcode ? $exitcode : 1);
}
#
# Wait until a machine stops returning ping packets.
#
sub WaitTillDead {
    #
    # Argument: the name of the node to watch.
    #
    # Returns 0 as soon as one ping run finishes with a non-zero exit
    # code (the node is taken to be down), or 1 when all 15 ping runs
    # got an answer. Only lexical variables are used, so no package
    # globals are modified.
    #
    my ($pc) = @_;

    print STDERR "Waiting for $pc to die off\n" if $debug;

    #
    # Sigh, a long ping results in the script waiting until all the
    # packets are sent from all the pings, before it will exit. So,
    # loop doing a bunch of shorter pings.
    #
    foreach my $attempt (1 .. 15) {
        system("$ping -q -c 4 -t 4 $pc >/dev/null 2>&1");
        my $status = $? >> 8;

        #
        # Returns 0 if any packets are returned. Returns 2 if pingable
        # but no packets are returned. Other non-zero error codes indicate
        # other problems. Assume that these other problems do not matter.
        #
        if ($status) {
            return 0;
        }
    }

    print STDERR "$pc is still alive.\n" if $debug;
    return 1;
}
#
# Run one SQL statement through the script's $DB handle.
#
# Returns whatever $DB->query() returned. When that value is false, a
# "DB Query failed" line naming the statement is printed first; the
# caller still has to check the result.
#
sub DBquery($)
{
    my ($sql)  = @_;
    my $answer = $DB->query($sql);

    print "DB Query failed: $sql\n"
        unless ($answer);

    return $answer;
}
......@@ -32,19 +32,18 @@ use English;
my $TB = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $rsh = "sshtb -n -q";
my $ssh = "sshtb -n -q";
my $NETDISK = "/tftpboot/netdisk";
my $PAPERADDR = "boss.emulab.net";
my $PLASTICADDR = "users.emulab.net";
my $power = "$TB/bin/power";
my $ping = "/sbin/ping";
my $nodereboot = "$TB/bin/node_reboot";
my $dbg = 1;
my %waitfor = ();
my $SAVEUID = $UID;
my @row;
my @nodes = ();
my $name = "";
my $mereuser = 0;
my $failures = 0;
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
......@@ -65,7 +64,18 @@ if ( $#ARGV < 3) {
my $imageid = shift;
my $imagepart = shift;
my $imagepath = shift;
my @nodes = @ARGV;
# Untaint the nodes.
foreach my $node ( @ARGV ) {
if ($node =~ /^([-\@\w]+)$/) {
$node = $1;
}
else {
die("Bad node name: $node.");
}
push(@nodes, $node);
}
#
# Figure out who called us. Root and admin types can do whatever they
......@@ -128,17 +138,17 @@ else {
}
#
# XXX This test is wrong of course. Need a regex, and the project name.
# Admin types do whatever they like
#
my $cmdline = "";
if (index($imagepath, "$TB/images/") >= 0) {
$cmdline = "${PAPERADDR}:$imagepath $diskpart";
}
elsif (index($imagepath, "/proj/") >= 0) {
$cmdline = "${PLASTICADDR}:$imagepath $diskpart";
if ($mereuser) {
if (! ($imagepath =~ /^\/proj\//)) {
die("Your image must reside in /proj\n");
}
$cmdline = "${PLASTICADDR}:$imagepath $diskpart";
}
else {
die("Your image must reside in $TB/images or /proj\n");
$cmdline = "${PAPERADDR}:$imagepath $diskpart";
}
#
......@@ -147,7 +157,7 @@ else {
foreach my $node (@nodes) {
my $pc = $node;
print STDERR "Changing default OS for $pc to $imageid\n";
print STDOUT "Changing default OS for $pc to $imageid\n";
$sth = $DB->query("update nodes set ".
"def_boot_image_id='$imageid',def_boot_path='' ".
"where node_id='$pc'");
......@@ -170,7 +180,7 @@ foreach my $node (@nodes) {
}
}
print STDERR "Setting up reload for $pc\n";
print STDOUT "Setting up reload for $pc\n";
$sth = $DB->query("update nodes set ".
"next_boot_path='$NETDISK',".
"next_boot_cmd_line='$cmdline' ".
......@@ -179,152 +189,15 @@ foreach my $node (@nodes) {
die("Database update failed (nodes next_boot). Aborted...\n");
}
# Untaint the argument. Sheer idiocy.
#
if ($pc =~ /^([-\@\w.]+)$/) {
$pc = $1;
}
#
# See if the machine is pingable. If its not pingable, then
# we just power cycle the machine rather than wait for a bunch
# of ssh/rsh commands to time out.
#
print STDERR "Pinging $pc ... \n" if $dbg;
if (-e $ping) {
open(PING, "$ping -c 4 $pc 2>&1 |");
}
else {
die("PING command $ping not found!\n");
}
do {
}
until ( <PING> =~ /transmitted, (\d*) packets received/ );
close(PING);
print STDERR "Got back $1 ping packets from $pc.\n" if $dbg;
#
# Power cycle if the machine is dead. It will come back up with the
# proper OS, cause we modified the database above.
#
if ( $1 == 0 ) {
print STDERR "$pc appears to be dead. Power cycling ...\n";
PowerCycle($pc);
next;
}
#
# Machine is pingable at least. Try to reboot it gracefully,
# or power cycle anyway if that does not work. To do this, we must
# change our real UID to root so that ssh will work.
#
print STDERR "Rebooting $pc ...\n";
$UID = 0;
if (system("$ssh -l root $pc /sbin/reboot") == 0) {
$UID = $SAVEUID;
print STDERR "$pc appears to be rebooting\n" if $dbg;
next;
}
$UID = $SAVEUID;
#
# Okay, before we power cycle lets really make sure. On FreeBSD, it might
# have rebooted, but since the connection is terminated, system returns
# an error status. So, lets ping it again and if its pingable, the
# reboot must have failed. If it is not pingable, I assume that the
# reboot really worked, and the exit value can be ignored.
# Fire off a reboot.
#
my $exit_value = $? >> 8;
print STDERR "reboot returned $exit_value. Lets make sure it dies\n"
if $dbg;
if (WaitTillDead($pc) == 0) {
next;
if (system("$nodereboot $pc")) {
print STDERR "Node $pc could not be rebooted!\n";
$failures++;
}
print STDERR "$pc appears to still be running Power cycling ...\n";
PowerCycle($pc);
}
print STDOUT "OS Reload Done!\n";
exit 0;
#
# Power cycle a PC using the testbed power program.
#
sub PowerCycle {
# Argument: the name of the node, interpolated into the command line.
# Runs "$power cycle <node>" and prints a warning on STDERR when the
# wait status of that command is not 0. No explicit value is returned.
local($pc) = @_;
if (system("$power cycle $pc") != 0) {
print STDERR "WARNING: Could not power cycle $pc. Skipping ...\n";
}
}
#
# Wait until a machine answers ping. Returns 0 as soon as a reply line
# ("bytes from") is seen, or 1 after 30 ping runs without any reply.
#
sub WaitTillAlive {
local($pc) = @_;
print STDERR "Waiting for $pc to come alive\n" if $dbg;
#
# Sigh, a long ping results in the script waiting until all the
# packets are sent from all the pings, before it will exit. So,
# loop doing a bunch of shorter pings.
#
for ($i = 0; $i < 30; $i++) {
# The result of open() is not checked. Re-opening PING on the next
# iteration implicitly closes the previous pipe.
open(PING, "$ping -c 5 $pc 2>&1 |");
# Read the ping output line by line until the summary line shows up.
# NOTE(review): at end of output <PING> yields undef, which matches
# neither pattern, so this inner loop does not terminate if the
# summary line never appears.
do {
$_ = <PING>;
if ( $_ =~ /bytes from/ ) {
print STDERR "Yep, $pc alive and well\n" if $dbg;
# The PING handle is still open on this return path.
return 0;
}
}
until ( $_ =~ /transmitted, (\d*) packets received/ );
}
close(PING);
print STDERR "$pc is not responding. Better check into it\n";
return 1;
}
exit $failures;
#
# Wait until a machine stops answering ping. Returns 0 as soon as one
# ping run reports 0 packets received, or 1 after 12 runs that all got
# at least one packet back.
#
sub WaitTillDead {
local($pc) = @_;
print STDERR "Waiting for $pc to die off\n" if $dbg;
#
# Sigh, a long ping results in the script waiting until all the
# packets are sent from all the pings, before it will exit. So,
# loop doing a bunch of shorter pings.
#
for ($i = 0; $i < 12; $i++) {
# The result of open() is not checked. Re-opening PING on the next
# iteration implicitly closes the previous pipe.
open(PING, "$ping -c 5 $pc 2>&1 |");
# Skip ping output until the summary line; $1 is then the number of
# packets received.
# NOTE(review): at end of output <PING> yields undef, so this loop
# does not terminate if the summary line never appears.
do {
}
until ( <PING> =~ /transmitted, (\d*) packets received/ );
if ( $1 == 0 ) {
print STDERR "Good, $pc must have rebooted.\n" if $dbg;
# The PING handle is still open on this return path.
return 0;
}
}
close(PING);
print STDERR "$pc is still alive.\n" if $dbg;
return 1;
}
#
# Check whether a disk image lists a given feature.
#
# Arguments: the image id and the feature name.
# Returns 1 when the feature name equals one of the comma separated
# entries in the image's osfeatures column, and 0 when the image has no
# row in disk_images or no entry matches.
#
sub OSFeatureSupported {
local($os) = $_[0];
local($feature) = $_[1];
# $os is interpolated into the SQL text as is.
# NOTE(review): the query result is used without checking whether the
# query itself failed.
$db_result = $DB->query("select osfeatures from disk_images ".
"where image_id='$os'");
if ($db_result->numrows < 1) {
return 0;
}
# Exact string comparison against each entry of the list.
foreach $osfeature (split(',', $db_result->fetchrow_array())) {
if ($feature eq $osfeature) {
return 1;
}
}
return 0;
}
......@@ -25,19 +25,18 @@ require 'ctime.pl';
my $TB = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $ssh = "ssh -n -q";
my $power = "$TB/bin/power";
my $nodereboot = "$TB/bin/node_reboot";
my $ping = "/sbin/ping";
my $mail = "/usr/bin/mail";
my $tbops = "testbed-ops\@flux.cs.utah.edu";
my $dbg = 0;
my @nodes = ();
my %imagepaths = ();
my %imageparts = ();
my %nodeos = ();
my %nodepath = ();
my %nodepart = ();
my %waitfor = ();
my %pids = ();
my $SAVEUID = $UID;
my @row;
......@@ -159,8 +158,17 @@ while (<IN>) {
last;
}
my ($node,$os) = split();
# untaint since $node is passed off to another program.
if ($node =~ /^([-\@\w]+)$/) {
$node = $1;
}
else {
die("Bad node name: $node.");
}
print STDERR "$node $os\n" if $dbg;
$nodeos{$node} = $os;
push(@nodes, $node);
$db_result = $DB->query("select pid,eid from reserved ".
"where node_id='$node'");
......@@ -245,7 +253,7 @@ while (<IN>) {
}
#
# Lifted right out of delay_setup.
# Make the DB changes.
#
foreach my $node ( keys %nodeos ) {
my $pc = $node;
......@@ -284,27 +292,15 @@ foreach my $node ( keys %nodeos ) {
"next_boot_cmd_line='$RELOADCMD' ".
"where node_id='$pc'");
}
#
# Fire off a reboot process so that we can overlap them all.
# We need the pid so we can wait for them all before preceeding.
#
$mypid = RebootNode($pc);
$pids{$pc} = $mypid;
}
#
# Wait for all the reboot children to exit before continuing.
# Fire off a mass reboot. The reboot script does this in parallel, so
# no need to create any new children here. We just wait until it exits,
# which means all the nodes are actually rebooting.
#
foreach my $node ( keys %nodeos ) {
my $pc = $node;
my $mypid = $pids{$pc};
waitpid($mypid, 0);
if ($?) {
die("Reboot of node $pc failed!");
}
print STDOUT "$pc rebooting ...\n"
if (system("$nodereboot @nodes")) {
die("Failed to reboot some nodes!");
}
print STDOUT "Waiting for testbed nodes to finish rebooting ...\n";
......@@ -363,17 +359,6 @@ foreach my $node ( keys %nodeos ) {
print STDOUT "OS Setup Done!\n";
exit 0;
#
# Power cycle a PC using the testbed power program.
#
sub PowerCycle {
# Argument: the name of the node, interpolated into the command line.
# Runs "$power cycle <node>" and prints a warning on STDERR when the
# wait status of that command is not 0. No explicit value is returned.
local($pc) = @_;
if (system("$power cycle $pc") != 0) {
print STDERR "WARNING: Could not power cycle $pc. Skipping ...\n";
}
}
sub WaitTillAlive {
my ($pc) = @_;
......@@ -413,129 +398,6 @@ sub WaitTillAlive {
return 1;
}
#
# Reboot a node in a child process. Return the pid to the parent so
# that it can wait on all the children later.
#
sub RebootNode {
local($pc) = @_;
print STDOUT "Rebooting $pc ...\n";
$mypid = fork();
if ($mypid) {
return $mypid;
}
#
# See if the machine is pingable. If its not pingable, then
# we just power cycle the machine rather than wait for a bunch
# of ssh/rsh commands to time out.
#
print STDERR "Pinging $pc ... \n" if $dbg;
if (-e $ping) {
open(PING, "$ping -c 4 -t 4 $pc 2>&1 |");
}
else {
die("PING command $ping not found!\n");
}
do {
}
until ( <PING> =~ /transmitted, (\d*) packets received/ );
close(PING);
print STDERR "Got back $1 ping packets from $pc.\n" if $dbg;
#
# Power cycle if the machine is dead. It will come back up with the
# proper OS, cause we modified the database above.
#
if ( $1 == 0 ) {
print STDERR "$pc appears to be dead. Power cycling ...\n" if $dbg;
PowerCycle($pc);
exit(0);
}
#
# Machine is pingable at least. Try to reboot it gracefully,
# or power cycle anyway if that does not work. To this, we must
# change our real UID to root so that ssh will work.
#
print STDERR "Rebooting $pc with ssh command ...\n" if $dbg;
#
# Run an ssh command in a child process, protected by an alarm to
# ensure that the ssh is not hung up forever if the machine is in
# some funky state.
#
$syspid = fork();
if ($syspid) {
local $SIG{ALRM} = sub { kill("TERM", $syspid); };
alarm 60;
waitpid($syspid, 0);
alarm 0;
#
# If ssh times out, just punch the button.
#
if ($? == 15) {
print STDERR "$pc appears to be wedged. Power cycling ...\n"
if $dbg;
PowerCycle($pc);
exit(0);
}
}
else {
$UID = 0;
exec("$ssh $pc /sbin/reboot");
exit(0);
}
#
# Okay, before we power cycle lets really make sure. On FreeBSD, it might
# have rebooted, but since the connection is terminated, system returns
# an error status. So, lets ping it again and if its pingable, the
# reboot must have failed. If it is not pingable, I assume that the
# reboot really worked, and the exit value can be ignored.
#