Commit 6feda7d3 authored by Mike Hibler

Changes for speeding up elabinelab server setup.

Boss/ops/fs: reboot them together after setup rather than serially.

Nodes: leave them in PXEWAIT throughout the setup, until after boss has
been rebooted.  At that point we send them the new bootinfo RESTART command
telling pxeboot to re-DHCP and use the new info obtained (next-server) to
contact a potentially new boss node.  This is a quick way to switch a node
in PXEWAIT from talking to the outer boss to talking to the inner one.

A significant number of rinky-dink changes were needed to do this, primarily
adding a new state, PXELIMBO, where nodes can be sent to sit until they are
restarted.  It turns out that just putting them in an existing state such as
PXEWAKEUP or SHUTDOWN wouldn't work, as nodes in those states tend to time
out or otherwise reboot.
parent 3e34e8fe
......@@ -247,6 +247,7 @@ if (1) {
# Now symlink all the alternate boots to pxeboot.emu
# XXX we assume everything is at the top level of /tftpboot right now.
#
$EUID = 0;
foreach my $boot (@bogoboots) {
if ($boot =~ /^\/tftpboot\/([^\/]+)$/) {
$boot = $1;
......@@ -259,6 +260,7 @@ if (1) {
}
}
}
$EUID = $UID;
#
# Remake the dhcpd.conf file to reflect any pxeboot change.
......
......@@ -102,7 +102,7 @@ use vars qw(@ISA @EXPORT);
TBDB_NODESTATE_RELOADDONE TBDB_NODESTATE_RELOADDONE_V2
TBDB_NODESTATE_UNKNOWN
TBDB_NODESTATE_PXEWAIT TBDB_NODESTATE_PXEWAKEUP
TBDB_NODESTATE_PXEFAILED
TBDB_NODESTATE_PXEFAILED TBDB_NODESTATE_PXELIMBO
TBDB_NODESTATE_PXEBOOTING TBDB_NODESTATE_ALWAYSUP
TBDB_NODESTATE_MFSSETUP TBDB_NODESTATE_TBFAILED
TBDB_NODESTATE_POWEROFF
......@@ -485,6 +485,7 @@ sub TBDB_NODESTATE_RELOADDONE() { "RELOADDONE"; }
sub TBDB_NODESTATE_RELOADDONE_V2(){ "RELOADDONEV2"; }
sub TBDB_NODESTATE_UNKNOWN() { "UNKNOWN"; };
sub TBDB_NODESTATE_PXEWAIT() { "PXEWAIT"; }
# PXELIMBO: holding state where nodes can be sent to sit until they are
# restarted (used when handing nodes over to an inner elab's boss).
sub TBDB_NODESTATE_PXELIMBO() { "PXELIMBO"; }
sub TBDB_NODESTATE_PXEWAKEUP() { "PXEWAKEUP"; }
sub TBDB_NODESTATE_PXEFAILED() { "PXEFAILED"; }
sub TBDB_NODESTATE_PXEBOOTING() { "PXEBOOTING"; }
......@@ -2748,13 +2749,15 @@ sub TBNodeStateWait ($$$$@) {
$waittime = time - $waitstart;
if ($waittime > $maxwait) {
$minutes = int($waittime / 60);
print "*** Giving up on $pc - it's been $minutes minute(s).\n";
print "*** Giving up on $pc ($state) - ",
"it's been $minutes minute(s).\n";
TBNodeConsoleTail($pc, *STDOUT);
return 1;
}
if (int($waittime / 60) > $minutes) {
$minutes = int($waittime / 60);
print "Still waiting for $pc - it's been $minutes minute(s).\n";
print "Still waiting for $pc ($state) - ",
"it's been $minutes minute(s).\n";
}
sleep(1);
}
......
/*
* EMULAB-COPYRIGHT
* Copyright (c) 2000-2004, 2006, 2007 University of Utah and the Flux Group.
* Copyright (c) 2000-2010 University of Utah and the Flux Group.
* All rights reserved.
*/
......@@ -32,7 +32,8 @@ usage()
"options:\n"
"-d - Turn on debugging\n"
"-q - Tell node to query bootinfo again\n"
"-r - Tell node to reboot\n",
"-r - Tell node to reboot\n"
"-R - Tell node to restart (re-DHCP to change server)\n",
progname);
exit(-1);
}
......@@ -40,7 +41,8 @@ usage()
int
main(int argc, char **argv)
{
int sock, err, c, reboot = 0, query = 0;
int sock, err, c;
int reboot = 0, restart = 0, query = 0;
struct sockaddr_in name, target;
boot_info_t boot_info;
boot_what_t *boot_whatp = (boot_what_t *) &boot_info.data;
......@@ -49,7 +51,7 @@ main(int argc, char **argv)
progname = argv[0];
while ((c = getopt(argc, argv, "dhvrq")) != -1) {
while ((c = getopt(argc, argv, "dhvrRq")) != -1) {
switch (c) {
case 'd':
debug++;
......@@ -57,6 +59,9 @@ main(int argc, char **argv)
case 'r':
reboot++;
break;
case 'R':
restart++;
break;
case 'q':
query++;
break;
......@@ -75,7 +80,7 @@ main(int argc, char **argv)
if (!argc)
usage();
if (query && reboot)
if (query && (reboot || restart))
usage();
if (debug)
......@@ -131,8 +136,9 @@ main(int argc, char **argv)
bzero(&boot_info, sizeof(boot_info));
boot_info.version = BIVERSION_CURRENT;
if (reboot) {
boot_whatp->type = BIBOOTWHAT_TYPE_REBOOT;
if (reboot || restart) {
boot_whatp->type = reboot ?
BIBOOTWHAT_TYPE_REBOOT : BIBOOTWHAT_TYPE_RESTART;
#ifdef EVENTSYS
bievent_send(target.sin_addr, (void *) NULL,
TBDB_NODESTATE_SHUTDOWN);
......
/*
* EMULAB-COPYRIGHT
* Copyright (c) 2000-2008 University of Utah and the Flux Group.
* Copyright (c) 2000-2010 University of Utah and the Flux Group.
* All rights reserved.
*
* Derived from boot/bootwhat.h in the OSKit.
......@@ -101,6 +101,7 @@ typedef struct boot_what {
#define BIBOOTWHAT_TYPE_REBOOT 5 /* Reboot */
#define BIBOOTWHAT_TYPE_AUTO 6 /* Do a bootinfo query */
#define BIBOOTWHAT_TYPE_MFS 7 /* Boot an MFS from server:/path */
#define BIBOOTWHAT_TYPE_RESTART 8 /* Restart ourselves without reset */
/* Flags */
#define BIBOOTWHAT_FLAGS_CMDLINE 0x01 /* Kernel to boot */
......
......@@ -425,6 +425,7 @@ REPLACE INTO state_transitions VALUES ('NORMALv1','ISUP','PXEBOOTING','KernelCha
REPLACE INTO state_transitions VALUES ('NODEALLOC','FREE_CLEAN','RES_INIT_CLEAN','Reserve');
REPLACE INTO state_transitions VALUES ('PXEKERNEL','PXEWAIT','PXEBOOTING','Retry');
REPLACE INTO state_transitions VALUES ('PXEKERNEL','PXEBOOTING','PXEWAIT','Free');
REPLACE INTO state_transitions VALUES ('PXEKERNEL','PXELIMBO','PXEBOOTING','Bootinfo-Restart');
REPLACE INTO state_transitions VALUES ('BATCHSTATE','ACTIVATING','SWAPPED','NonBatch');
REPLACE INTO state_transitions VALUES ('NORMAL','ISUP','SHUTDOWN','Reboot');
REPLACE INTO state_transitions VALUES ('NORMAL','REBOOTING','SHUTDOWN','Reboot');
......
#
# Add a PXEKERNEL state transition (PXELIMBO -> PXEBOOTING, labelled
# 'Bootinfo-Restart') to support fast transition of nodes from outer
# Emulab to inner Emulabs.
#
use strict;
use libdb;
#
# Database update entry point.
#
# Writes one row into the state_transitions table:
#   ('PXEKERNEL','PXELIMBO','PXEBOOTING','Bootinfo-Restart')
# NOTE(review): presumably (op_mode, from-state, to-state, label); the
# column names are not visible in this file -- confirm against the schema.
#
# Arguments: ($dbhandle, $dbname, $version) -- accepted but not used here.
# Returns:   0 always; a failing query is left to DBQueryFatal.
#
sub DoUpdate($$$)
{
    my ($dbhandle, $dbname, $version) = @_;

    # Quote and comma-join the row values to form the SQL tuple.
    my @row    = ('PXEKERNEL', 'PXELIMBO', 'PXEBOOTING', 'Bootinfo-Restart');
    my $values = join(",", map { "'$_'" } @row);

    # The double space after VALUES matches the original statement text.
    DBQueryFatal("REPLACE INTO state_transitions VALUES  ($values)");

    return 0;
}
1;
......@@ -183,6 +183,7 @@ while (1) {
"left join node_types as t on t.type=n.type ".
"where (n.eventstate!='". TBDB_NODESTATE_ISUP ."' and ".
" n.eventstate!='". TBDB_NODESTATE_PXEWAIT ."' and ".
" n.eventstate!='". TBDB_NODESTATE_PXELIMBO ."' and ".
" n.eventstate!='". TBDB_NODESTATE_ALWAYSUP ."' and ".
" n.eventstate!='". TBDB_NODESTATE_POWEROFF ."') and ".
" r.pid is null and n.role='testnode' and ".
......
......@@ -14,6 +14,7 @@ use Getopt::Std;
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use libtblog;
use Experiment;
use User;
use Lan;
......@@ -29,7 +30,7 @@ sub usage()
exit(-1);
}
my $optlist = "dgkfur";
my $optlist = "dgkfurP";
my $debug = 1;
my $verbose = 0;
my $killmode = 0;
......@@ -38,6 +39,10 @@ my $dbgooonly= 0;
my $update = 0;
my $remove = 0;
# XXX experimental speed hacks
my $inparallel = 1;
my $restartnodes = 1;
sub DumpDBGoo();
#
......@@ -51,6 +56,7 @@ my $TBOPSPID = TBOPSPID();
my $SSH = "$TB/bin/sshtb";
my $SCP = "/usr/bin/scp";
my $nodereboot = "$TB/bin/node_reboot";
my $noderestart = "$TB/sbin/bootinfosend -R";
my $makeconf = "$TB/sbin/dhcpd_makeconf";
my $nodewait = "$TB/sbin/node_statewait";
my $snmpit = "$TB/bin/snmpit";
......@@ -117,6 +123,9 @@ if (defined($options{"u"})) {
if (defined($options{"r"})) {
$remove = 1;
}
if (defined($options{"P"})) {
$inparallel = 1;
}
if (! @ARGV) {
usage();
}
......@@ -278,6 +287,7 @@ $UID = 0;
# version from the web server.
#
# XXX ugh, copy over a newer mkextrafs.pl as well (one that supports -2).
# XXX ughII, we only copy over a FreeBSD version, this will break a Linux boss.
#
my $mkelab = "$TB/etc/rc.mkelab";
if (-e "$expdir/rc.mkelab") {
......@@ -386,37 +396,94 @@ if ($?) {
" Failed to reconfig/restart DHCPD.\n");
}
if (defined($fsnode)) {
# Reboot fs and wait for it to come back.
print "Rebooting fsnode ($fsnode).\n";
TBDebugTimeStamp("Rebooting fsnode");
system("$nodereboot -w $fsnode");
if ($inparallel) {
my $nodes = "$bossnode $opsnode";
$nodes .= " $fsnode"
if (defined($fsnode));
print "Rebooting servers ($nodes).\n";
TBDebugTimeStamp("Rebooting servers");
system("$nodereboot -w $nodes");
if ($?) {
die("*** $0:\n".
" Error rebooting the fsnode ($fsnode)!\n");
" Error rebooting the servers ($nodes)!\n");
}
} else {
if (defined($fsnode)) {
# Reboot fs and wait for it to come back.
print "Rebooting fsnode ($fsnode).\n";
TBDebugTimeStamp("Rebooting fsnode");
system("$nodereboot -w $fsnode");
if ($?) {
die("*** $0:\n".
" Error rebooting the fsnode ($fsnode)!\n");
}
}
# Reboot ops and wait for it to come back.
print "Rebooting opsnode ($opsnode).\n";
TBDebugTimeStamp("Rebooting opsnode");
system("$nodereboot -w $opsnode");
if ($?) {
die("*** $0:\n".
" Error rebooting the opsnode ($opsnode)!\n");
}
# Reboot boss and wait for it to come back.
print "Rebooting bossnode ($bossnode).\n";
TBDebugTimeStamp("Rebooting bossnode");
system("$nodereboot -w $bossnode");
if ($?) {
die("*** $0:\n".
" Error rebooting the bossnode ($bossnode)!\n");
}
}
# Reboot ops and wait for it to come back.
print "Rebooting opsnode ($opsnode).\n";
TBDebugTimeStamp("Rebooting opsnode");
system("$nodereboot -w $opsnode");
if ($?) {
die("*** $0:\n".
" Error rebooting the opsnode ($opsnode)!\n");
}
# Reboot boss and wait for it to come back.
print "Rebooting bossnode ($bossnode).\n";
TBDebugTimeStamp("Rebooting bossnode");
system("$nodereboot -w $bossnode");
if ($?) {
die("*** $0:\n".
" Error rebooting the bossnode ($bossnode)!\n");
}
$EUID = 0;
# Reboot the experimental nodes. They will come up inside the inner elab.
# DO NOT WAIT! They are not going to report ISUP from this point on.
if (@expnodes) {
#
# First we try the magic pxeboot restart.
# The nodes should still be in PXEWAIT, so we send them a restart
# to make them re-DHCP. This should get them quickly reparented to
# the inner boss.
#
# If this doesn't work, we fall back on rebooting the nodes.
#
if ($restartnodes) {
TBDebugTimeStamp("Redirecting experimental nodes to inner boss");
my $stat = 0;
# Run as real user again.
$EUID = $UID;
foreach my $node (@expnodes) {
$stat = system("$noderestart $node");
last if ($stat);
}
$EUID = 0;
if ($stat) {
tbwarn("Node restart failed ($stat), falling back to reboot.");
goto rebootnodes;
}
#
# Ssh into inner boss and use a utility script to determine
# when the nodes have reported in and are in PXEWAIT (part of the
# inner elab). Note the short timeout, since this operation should
# be virtually instantaneous.
#
print "Waiting for nodes to restart and join the inner emulab.\n";
TBDebugTimeStamp("Waiting for inner nodes to restart");
$UID = 0;
$stat = system("$SSH -host $bossnode ".
"/usr/testbed/sbin/node_statewait -t 15 -a");
$UID = $SAVEUID;
if ($stat) {
tbwarn("Error ($stat) waiting for nodes to restart, falling back to reboot.");
goto rebootnodes;
}
goto restartworked;
}
rebootnodes:
print "Rebooting inner experimental nodes.\n";
TBDebugTimeStamp("Rebooting experimental nodes");
# Run as real user again.
......@@ -445,6 +512,7 @@ if (@expnodes) {
}
$UID = $SAVEUID;
restartworked:
#
# To avoid confusion later (with swapmod, which wants them to be ISUP),
# and so the web interface does not show the nodes as down, set the
......@@ -652,6 +720,16 @@ sub DumpDBGoo()
# Clear any node reservations on the inside
DBQueryFatal("update temp_${table} set ".
" reserved_pid=null where reserved_pid is not null");
# Put the inner nodes into "limbo" so they DTRT when restarted
if ($restartnodes) {
DBQueryFatal("update temp_${table} set".
" op_mode='PXEKERNEL',next_op_mode='',".
" eventstate='". TBDB_NODESTATE_PXELIMBO . "',".
" temp_boot_osid=NULL,next_boot_osid=NULL,".
" osid=NULL".
" where role='testnode'");
}
}
DBQueryWarn("select * from temp_$table ".
......
#!/usr/bin/perl -wT
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2009 University of Utah and the Flux Group.
# Copyright (c) 2000-2010 University of Utah and the Flux Group.
# All rights reserved.
#
# node reboot library. Basically the backend to the node_reboot script, but
......@@ -538,6 +538,7 @@ sub RebootNode {
return -2;
}
if ($nodestate eq TBDB_NODESTATE_PXEWAIT() ||
$nodestate eq TBDB_NODESTATE_PXELIMBO() ||
$nodestate eq TBDB_NODESTATE_PXEWAKEUP()) {
#
# In killmode, we do not want to bother with sending a wakeup event.
......
......@@ -82,6 +82,7 @@ my %vnode2pnode = ();
my %pnodevcount = ();
my %plabvnodes = ();
my %geninodes = ();
my %einenodes = ();
my %osids = ();
my %osmap = ();
my %canfail = ();
......@@ -225,7 +226,8 @@ if (TBExptPlabInElabPLC($pid, $eid, \$plcnode)) {
# Get the set of nodes, as well as the nodes table information for them.
#
my $db_result =
DBQueryFatal("select n.*,l.pid,r.vname,r.sharing_mode from reserved as r ".
DBQueryFatal("select n.*,l.pid,r.vname,r.sharing_mode,r.inner_elab_role ".
"from reserved as r ".
"left join nodes as n on n.node_id=r.node_id ".
"left join last_reservation as l on n.node_id=l.node_id ".
"where r.pid='$pid' and r.eid='$eid'");
......@@ -248,6 +250,8 @@ while (my %row = $db_result->fetchhash()) {
my $virtnode = $typeinfo->isvirtnode();
my $sharednode = defined($row{'sharing_mode'})
&& $row{'sharing_mode'} eq 'using_shared_local';
my $iseinenode= $elabinelab && defined($row{'inner_elab_role'})
&& $row{'inner_elab_role'} eq 'node';
my $isremote = $typeinfo->isremotenode();
my $isgeninode= $typeinfo->isfednode();
my $imageable = $typeinfo->imageable();
......@@ -282,6 +286,11 @@ while (my %row = $db_result->fetchhash()) {
next;
}
}
elsif ($iseinenode) {
print "Will skip reload/reboot of inner elab node $node.\n";
$einenodes{$node} = 1;
next;
}
elsif ($subnode && !$imageable) {
print "Will skip subnode $node ISUP wait.\n";
}
......@@ -530,6 +539,19 @@ while (my %row = $db_result->fetchhash()) {
if $dbg;
}
#
# XXX Inner elab nodes should never report in to us.
# If they do, make sure they wind up in PXEWAIT.
#
if (keys(%einenodes)) {
DBQueryFatal("update nodes set ".
" def_boot_osid=NULL,".
" next_boot_osid=NULL,".
" temp_boot_osid=NULL ".
"where node_id in (".
join(",", map("'$_'", keys %einenodes)). ")");
}
@all_nodes = (keys %nodes, keys %vnodes);
#
......
......@@ -2,7 +2,7 @@
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2009 University of Utah and the Flux Group.
# Copyright (c) 2000-2010 University of Utah and the Flux Group.
# All rights reserved.
#
......@@ -245,7 +245,7 @@ if ($showmap && $state eq EXPTSTATE_ACTIVE) {
foreach $v (sort keys(%v2pmap)) {
my $p = $v2pmap{$v}->{PNODE};
my $t = $v2pmap{$v}->{TYPE};
my $o = $v2pmap{$v}->{OSID};
my $o = $v2pmap{$v}->{OSID} ? $v2pmap{$v}->{OSID} : "<NONE>";
my $m = $v2pmap{$v}->{VVP};
printf "%-15s %-12s %-15s %s", $v, $t, $o, $p;
if ($m ne $p) {
......
......@@ -1422,12 +1422,17 @@ sub doSwapin($) {
#
# Do linktest if user requested it at swapin.
#
# We don't run linktest on elabinelabs since there may be no network
# (i.e., in the "single control network" case) and inner nodes are
# going to be sitting in PXEWAIT.
#
my $query_result =
DBQueryFatal("select linktest_level,linktest_pid from experiments ".
"where pid='$pid' and eid='$eid'");
my ($linktest_level,$linktest_pid) = $query_result->fetchrow_array();
if ($linktest_level && ($type == REAL || $type == MODIFY)) {
if (!$elabinelab && $linktest_level &&
($type == REAL || $type == MODIFY)) {
if ($linktest_pid) {
tbwarn "Linktest is already running! $linktest_pid";
}
......
......@@ -1735,8 +1735,25 @@ sub SetupBossNode()
#
# Hmm, need to run this at startup though.
#
mysystem("echo '/usr/local/etc/emulab/rc/rc.inelab' ".
" >> /etc/rc.local");
# XXX we used to append this to /etc/rc.local but that file is
# executed before many critical services (such as sshd!) have been
# started. So we need to push this as late as possible, which we do
# with a glorious file naming hack. This late start is important for
# the inner boss node because the outer boss elabinelab script SSHs into
# the inner boss to invoke node_statewait after it has been informed that
# the inner boss is up. We can't have the inner boss reporting in before
# it has even started sshd!
#
my $rcfile;
if (-d "/usr/local/etc/rc.d") {
$rcfile = "/usr/local/etc/rc.d/zzz-inelab.sh";
mysystem("echo '#!/bin/sh' > $rcfile");
mysystem("echo '# Auto generated by rc.mkelab' >> $rcfile");
chmod(0755, $rcfile);
} else {
$rcfile = "/etc/rc.local";
}
mysystem("echo '/usr/local/etc/emulab/rc/rc.inelab' >> $rcfile");
}
#
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment