#!/usr/bin/perl -wT
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2009 University of Utah and the Flux Group.
# All rights reserved.
#
use English;
use Getopt::Std;

#
# Set up the vnode state on a virtual (multiplexed) node.
#
# XXX - This script should only be run from os_setup!
#
# The output is all jumbled together since the updates are issued in parallel.
# Might be a pain when debugging.
#
# Options (see $optlist):
#   -f   force; ignore experiment/alloc state when choosing what to do
#   -d   debug output
#   -k   kill mode (teardown instead of setup)
#   -j   operate on jailed/remote (non-plab, non-geni) vnodes only
#   -p   operate on plab vnodes only
#   -m   "send email" flag; only affects the early exit taken in force mode
#   -q   quiet
#   -n   maximum number of child processes to run in parallel
#   -w   seconds to wait for a child before sending it a TERM
#
sub usage()
{
    print STDOUT "Usage: vnode_setup [-m] [-q] [-f] [-k] [-j] [-p] [-d] ".
	"[-n <numbatch>] [-w <wait_time>] <pid> <eid> [node ...]\n";
    exit(-1);
}
my $optlist = "fdkjpn:w:mq";

#
# We don't want to run this script unless its the real version.
# That is, it must be setuid root.
#
if ($EUID != 0) {
    die("*** $0:\n".
	"    Must be root! Maybe its a development version?\n");
}

#
# Configure variables
#
my $TB		= "@prefix@";
my $TESTMODE    = @TESTMODE@;
my $TBOPS       = "@TBOPSEMAIL@";
my $TBLOGS      = "@TBLOGSEMAIL@";
my $CLIENT_BIN  = "@CLIENT_BINDIR@";
my $PGENISUPPORT= @PROTOGENI_SUPPORT@;

my $SAVEUID     = $UID;
my $ssh		= "$TB/bin/sshtb -n";
my $debug       = 0;
my $force	= 0;
my $killmode    = 0;
my $jailonly    = 0;
my $sendemail   = 0;
my $quiet       = 0;
my $plabonly    = 0;
my $numbatch    = 10;
my $childwait   = 3000;

#
# Load the Testbed support stuff.
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use libtblog;
use Experiment;
use Node;
if ($PGENISUPPORT) {
    require libGeni;
}

# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

# Turn off line buffering on output
$| = 1;

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV < 2) {
    usage();
}
if (defined($options{"f"})) {
    $force = 1;
}
if (defined($options{"m"})) {
    $sendemail = 1;
}
if (defined($options{"q"})) {
    $quiet = 1;
}
if (defined($options{"d"})) {
    $debug = 1;
}
if (defined($options{"k"})) {
    $killmode = 1;
}
if (defined($options{"j"})) {
    $jailonly = 1;
}
if (defined($options{"p"})) {
    $plabonly = 1;
}
if (defined($options{"n"})) {
    # Must be a positive integer; a batch size of zero could never start
    # a child and would trip the "no children left" check in the main loop.
    if ($options{"n"} =~ /^(\d+)$/ && $1 > 0) {
	$numbatch = $1;
    }
    else {
	die ("*** Bad data in numbatch: $options{'n'}");
    }
}
if (defined($options{"w"})) {
    if ($options{"w"} =~ /^(\d+)$/) {
	$childwait = $1;
    }
    else {
	die ("*** Bad data in wait_time: $options{'w'}");
    }
}

my $pid = shift(@ARGV);
my $eid = shift(@ARGV);

#
# Untaint the arguments.
#
if ($pid =~ /^([-\@\w]+)$/) {
    $pid = $1;
}
else {
    die("*** Bad data in pid: $pid\n");
}
if ($eid =~ /^([-\@\w]+)$/) {
    $eid = $1;
}
else {
    die("*** Bad data in eid: $eid\n");
}

if ($plabonly && $jailonly) {
    tbdie("*** '-j' and '-p' are mutually exclusive.");
}

#
# Verify user and get his DB uid and other info for later.
#
my $this_user = User->ThisUser();
if (!defined($this_user) && $UID) {
    tbdie("You ($UID) do not exist!");
}

my $experiment = Experiment->Lookup($pid, $eid);
if (!defined($experiment)) {
    tbdie("Could not locate object for experiment $pid/$eid");
}

#
# Verify permission to muck with this experiment. Note that this script
# is run as root from the plab monitor daemon.
#
if (defined($this_user) &&
    !$this_user->IsAdmin() &&
    !$experiment->AccessCheck($this_user, TB_EXPT_DESTROY)) {
    tbdie("You do not have permission to mess with $pid/$eid!");
}

#
# Get the list of nodes in this experiment.
#
my @nodes = $experiment->NodeList(1, 1);
if (! @nodes) {
    tbwarn("No allocated nodes in experiment $pid/$eid.");
    exit(0);
}

# Nodes on the command line. Operate only on this set.
if (@ARGV) {
    my %fulllist = ();

    # Temporary hash list for searching.
    foreach my $node ( @nodes ) {
	$fulllist{$node} = 1;
    }
    @nodes = ();

    foreach my $node ( @ARGV ) {
	if ($node =~ /^([-\@\w]+)$/) {
	    $node = $1;

	    if (!defined($fulllist{$node})) {
		tbdie("Node $node is not allocated to $pid/$eid!");
	    }
	}
	else {
	    tbdie("Bad node name: $node.");
	}
	push(@nodes, $node);
    }
}

my $exptstate = $experiment->state();

# List of [$nodeobj, $mode] pairs to act on in the parallel loop below.
my @vnodes = ();

# Just the vnodes mam.
foreach my $node (@nodes) {
    my $mode = ($killmode ? "teardown" : "setup");

    my $nodeobj = Node->Lookup($node);
    if (!defined($nodeobj)) {
	tbdie("Could not map $node to its object");
    }
    my $jailed     = $nodeobj->jailflag();
    my $plab       = $nodeobj->isplabdslice();
    my $remote     = $nodeobj->isremotenode();
    my $pnode      = $nodeobj->phys_nodeid();
    my $allocstate = $nodeobj->allocstate();
    my $geninode   = $nodeobj->isfednode();
    my $shared     = defined($nodeobj->sharing_mode());

    next
	if (!$nodeobj->isvirtnode());

    # Special hack for SPP nodes. Need to generalize this.
    if ($shared && $nodeobj->type eq "sppvm") {
	if ($mode eq "teardown") {
	    $nodeobj->SetEventState(TBDB_NODESTATE_SHUTDOWN());
	}
	else {
	    $nodeobj->SetEventState(TBDB_NODESTATE_ISUP());
	}
	next;
    }

    # Honor the -p / -j filters.
    if (($plabonly || $jailonly) and
	!(($plabonly && $plab) ||
	  ($jailonly && (($jailed || $remote) && !($plab || $geninode))))) {
	next;
    }

    if (!defined($pnode)) {
	tbdie("No physical node for $node!");
    }

    #
    # On remote nodes, or when forcemode is on, always do the deed.
    # Otherwise, look at experiment state.
    #
    if (!$force) {
	if ($exptstate eq EXPTSTATE_SWAPPING) {
	    #
	    # When swapping, local vnodes go down with the physnode.
	    #
	    if (! ($remote || $shared)) {
		print "$node will $mode with local node $pnode.\n";
		next;
	    }
	    elsif ($allocstate eq TBDB_ALLOCSTATE_DOWN) {
		if (!$plab) {
		    print "$node failed to boot; skipping $mode.\n";
		    next;
		}
		# Plab nodes need to be cleaned up.
		print "$node failed to boot; changing to cleanup.\n";
		$mode = "cleanup";
	    }
	}
	elsif ($exptstate eq EXPTSTATE_ACTIVATING ||
	       $exptstate eq EXPTSTATE_MODIFY_RESWAP) {
	    #
	    # The allocstate determines if the vnode actually needs to
	    # be setup or torndown. Note that a failed experiment will
	    # cause a bunch of vnodes to be torndown, while in the
	    # ACTIVATING state. See os_setup and assign_wrapper; the
	    # idea is to avoid doing setup/teardown up vnodes on
	    # machines that are rebooting anyway, or that failed.
	    # Complicated by modify which could add/subtract a vnode on
	    # an existing machine, but not reboot the machine. Note that
	    # free (now unused) vnodes will land in RES_TEARDOWN. It is
	    # assumed that these booted okay, and need to be torndown,
	    # even though they are not RES_READY.
	    #
	    if (! ($remote || $shared)) {
		if ($killmode) {
		    if ($allocstate eq TBDB_ALLOCSTATE_DOWN) {
			print "$node failed to boot; skipping $mode.\n";
			next;
		    }
		    elsif ($allocstate eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
			print "$node never booted; skipping $mode.\n";
			next;
		    }
		    elsif ($allocstate eq TBDB_ALLOCSTATE_RES_READY()) {
			print "$node will $mode with local node $pnode.\n";
			next;
		    }
		}
		elsif ($allocstate eq TBDB_ALLOCSTATE_RES_READY()) {
		    print "$node is already setting up on local node $pnode\n";
		    next;
		}
	    }
	    else {
		if ($killmode) {
		    if ($allocstate eq TBDB_ALLOCSTATE_DEAD) {
			# plab only. See below.
			print "$node failed to initialize; skipping $mode.\n";
			next;
		    }
		    elsif ($allocstate eq TBDB_ALLOCSTATE_DOWN) {
			if (!$plab) {
			    print "$node failed to boot; skipping $mode.\n";
			    next;
			}
			# Plab nodes need to be cleaned up.
			print "$node failed to boot; changing to cleanup.\n";
			$mode = "cleanup";
		    }
		    elsif ($allocstate eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
			print "$node never booted; skipping $mode.\n";
			next;
		    }
		}
		elsif ($allocstate eq TBDB_ALLOCSTATE_RES_READY()) {
		    print "$node is already set up on $pnode\n";
		    next;
		}
		elsif ($allocstate eq TBDB_ALLOCSTATE_RES_INIT_DIRTY()) {
		    print "$node needs a reboot on $pnode\n";
		    $mode = "reboot";
		}
		elsif ($allocstate eq TBDB_ALLOCSTATE_RES_RECONFIG()) {
		    print "$node needs a reconfig on $pnode\n";
		    # We do not actually reconfig virtual nodes; just
		    # reboot them. Might reconfig someday, in which case
		    # this would move up into os_setup.
		    $mode = "reboot";
		}
		elsif ($plab &&
		       $allocstate eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
		    # This is a special case.  If we reuse one of the plab
		    # nodes, but use a different vname in the topo, there will
		    # be an entry in the database and a slice will be reserved
		    # because it's never torn down.  However, we need to
		    # skip plabnode alloc and go straight to vnodesetup.
		    #
		    # BUT, we only can do this if the sliver entry is already
		    # in the database!
		    # Also note that this could eventually cause problems
		    # if there is a mistaken sliver entry in the DB.
		    #
		    # $pid, $eid and $node were all untainted against
		    # [-\@\w]+ above, so interpolating them here is safe.
		    my $res = DBQueryFatal(
			"select ps.slicename,psn.node_id".
			"  from plab_slices as ps" .
			" left join plab_slice_nodes as psn" .
			"  on (ps.slicename=psn.slicename" .
			"      and ps.plc_idx=psn.plc_idx)" .
			" where ps.pid='$pid' and ps.eid='$eid'" .
			"  and psn.node_id='$node'");
		    if ($res->numrows == 1) {
			# node exists; change mode to resetup
			$mode = "resetup";
			print "Doing a resetup on '$node'\n";
		    }
		}
	    }
	}
    }

    #
    # For teardown and reboot, force the event state into SHUTDOWN now
    # since there is no telling what the current state is. (For setup of
    # plab, non-jailed and geni vnodes the child process below does it.)
    #
    # XXX: Don't we always want to set this?
    #
    if ($mode eq "teardown" || $mode eq "reboot") {
	$nodeobj->SetEventState(TBDB_NODESTATE_SHUTDOWN);
    }

    #
    # Put this into the list of calls we have to make in the next loop
    #
    push @vnodes, [$nodeobj, $mode];
}

my $children     = 0;
my %child_vnodes = ();	# child pid => [$nodeobj, $mode, start time]
my @failed_nodes = ();	# [$vnode, $pnode, $mode, $exitstatus] per failure

print STDOUT "vnode_setup running at parallelization: $numbatch ".
    "wait_time: $childwait\n"
    if (!$quiet);

while (1) {
    # Space out the invocation of child processes a little.
    sleep(1);

    #
    # We're done when we've hit the last vnode, and we've outlived all of our
    # children
    #
    if ((!@vnodes) && ($children == 0)) {
	last;
    }

    #
    # There are more free slots
    #
    if (($children < $numbatch) && @vnodes) {
	#
	# Look for a vnode that is not on a pnode we're already working on
	#
	# XXX - do this!
	my ($nodeobj, $mode) = @{pop @vnodes};
	my $vnode  = $nodeobj->node_id();
	my $jailed = $nodeobj->jailflag();
	my $plab   = $nodeobj->isplabdslice();
	my $remote = $nodeobj->isremotenode();
	my $pnode  = $nodeobj->phys_nodeid();
	my $geni   = $nodeobj->isfednode();

	print STDOUT "Doing $mode of vnode $vnode on $pnode ...\n"
	    if (!$quiet);

	#
	# Run an ssh command in a child process, protected by an alarm to
	# ensure that the ssh is not hung up forever if the machine is in some
	# funky state.
	#
	my $syspid = fork();
	if (!defined($syspid)) {
	    #
	    # fork() failed. We are still the parent, so we must NOT fall
	    # into the child code below (it ends in exit). Record the
	    # failure for this vnode and keep going with the rest.
	    #
	    warn("*** $0:\n".
		 "    Could not fork for $mode of vnode $vnode: $!\n");
	    push @failed_nodes, [$vnode, $pnode, $mode, -1];
	    next;
	}
	if ($syspid) {
	    #
	    # Just keep track of it, we'll wait for it finish down below
	    #
	    $child_vnodes{$syspid} = [$nodeobj, $mode, time()];
	    $children++;
	}
	else {
	    TBdbfork();	# So we get the event system fork too ...

	    my $exval = 0;

	    # Must change our real UID to root so that ssh will work.
	    $UID = 0;

	    if ($mode eq "setup" && ($plab || !$jailed || $geni)) {
		# Make sure vnode is in the proper state before trying to
		# bring it up.
		# XXX: do this for all vnodes (see above)?
		$nodeobj->SetEventState(TBDB_NODESTATE_SHUTDOWN);

		if ($geni) {
		    $UID  = $SAVEUID;
		    $EUID = $UID;
		    # $exval = GeniEmulab->StartSlivers($experiment, [$nodeobj]);
		}
		elsif ($plab) {
		    if (TBForkCmd("$TB/sbin/plabnode ".
				  ($force ? "-f" : "").
				  " alloc $pid $eid $vnode", 1)) {
			print STDERR "*** $0:\n" .
			    "    Plab node allocation failed\n";
			# Should check DB state instead.
			exit(99);
		    }
		}
		else {
		    if (TBForkCmd("$ssh -host $pnode $CLIENT_BIN/vnodesetup ".
				  " -i $vnode", 1)) {
			exit(99);
		    }
		}
		# Make sure the system knows we now have state on the node!
		$nodeobj->SetAllocState(TBDB_ALLOCSTATE_RES_INIT_DIRTY);
	    }

	    if ($geni && ($mode eq "teardown" || $mode eq "cleanup")) {
		$nodeobj->SetEventState(TBDB_NODESTATE_SHUTDOWN);
		$UID  = $SAVEUID;
		$EUID = $UID;
		# $exval = GeniEmulab->DestroySlivers($experiment, [ $nodeobj ]);
	    }
	    elsif (!($plab && ($mode eq "cleanup" || $mode eq "teardown"))
		   && !($mode eq "setup" && $remote && !$plab)) {
		# Cleanup is used only on plab nodes.
		# Don't try to teardown plab vnodes; it's just asking for
		# trouble as the shutdown may hang or take too long.  It's
		# best to simply try and free the vserver below.
		my $args = (($mode eq "teardown") ? "-k " :
			    ($mode eq "reboot" ? "-r " : " "));
		$args   .= ($jailed ? "-jVt " : ($plab ? "-p " : "-i "));
		$args   .= "$vnode ";

		# If it's a plab node, we must ssh to the vnode, not pnode.
		my $pnodeOrVnode = $pnode;
		if ($plab) {
		    $pnodeOrVnode = $vnode;
		}

		$exval = TBForkCmd("$ssh -host $pnodeOrVnode ".
				   "$CLIENT_BIN/vnodesetup $args",1);
	    }

	    # Free the plab node lease if necessary.
	    if ($plab && ($mode eq "teardown" || $mode eq "cleanup")) {
		$nodeobj->SetEventState(TBDB_NODESTATE_SHUTDOWN);
		# List form: no shell is involved in running the command.
		exec("$TB/sbin/plabnode", "free", $pid, $eid, $vnode);
		die("*** $0:\n".
		    "    exec failed!\n");
	    }
	    exit($exval);
	}
    }
    else {
	#
	# We have too many of the little rugrats, wait for one to die
	#

	#
	# Set up a timer - we want to kill processes after they hit
	# $childwait seconds old, so we find the oldest one; it is the
	# first one marked for death.
	#
	my $oldest;
	my $oldestpid   = 0;
	my $oldestvnode = "";

	while (my ($cpid, $cref) = each %child_vnodes) {
	    my ($cnodeobj, $cmode, $birthtime) = @$cref;
	    my $cvnode = $cnodeobj->node_id();
	    if ((!$oldestpid) || ($birthtime < $oldest)) {
		$oldest      = $birthtime;
		$oldestpid   = $cpid;
		$oldestvnode = $cvnode;
	    }
	}

	#
	# Sanity check
	#
	if (!$oldestpid) {
	    die "*** $0\n".
		"Uh oh, I have no children left, something is wrong!\n";
	}

	#
	# If the oldest has already expired, just kill it off right now, and go
	# back around the loop
	#
	my $waittime = ($oldest + $childwait) - time();

	#
	# Kill off the oldest if it gets too old while we're waiting.
	#
	my $childpid   = -1;
	my $exitstatus = -1;

	eval {
	    local $SIG{ALRM} = sub { die "alarm clock" };

	    if ($waittime <= 0) {
		print STDERR
		    "*** $0: timeout waiting for vnode: $oldestvnode\n";
		kill("TERM",$oldestpid);
	    }
	    else {
		alarm($waittime);
	    }
	    $childpid = wait();
	    alarm 0;
	    $exitstatus = $?;
	};
	if ($@) {
	    # Anything other than our own alarm is a real error.
	    die unless $@ =~ /alarm clock/;
	    next;
	}

	#
	# Another sanity check
	#
	if ($childpid < 0) {
	    die "*** $0\n".
		"wait() returned <0, something is wrong!\n";
	}

	#
	# Look up to see what vnode, etc. this was associated with - if we
	# don't know about this child, ignore it
	#
	my $aref = $child_vnodes{$childpid};
	next
	    unless (defined($aref) && @$aref);
	my ($nodeobj, $mode, $birthtime) = @$aref;
	my $vnode = $nodeobj->node_id();
	my $pnode = $nodeobj->phys_nodeid();
	$children--;
	delete $child_vnodes{$childpid};

	#
	# Look for setup failure, reported back through ssh.
	#
	if ($exitstatus) {
	    # Report the status saved right after wait(), not the live $?.
	    print STDERR "vnode $vnode $mode on $pnode returned $exitstatus.\n"
		if $debug;

	    if ($exitstatus == 256) {
		print STDERR "$vnode is not running sshd.\n"
		    if $debug;
	    }
	    elsif ($exitstatus == 15) {
		# Raw wait status 15: child was killed by SIGTERM.
		print STDERR "$vnode is wedged.\n"
		    if $debug;
	    }
	    elsif ($exitstatus >> 8 == 99) {
		print STDERR "$vnode did not allocate properly.\n"
		    if $debug;
	    }

	    push @failed_nodes, [$vnode, $pnode, $mode, $exitstatus];

	    warn("*** $0:\n".
		 "    Virtual node $vnode $mode failure!\n");
	}

	if ($nodeobj->isplabdslice()) {
	    #
	    # If the node was in the setup process, then mark its allocstate
	    # as down so os_setup knows not to bother waiting for it. DEAD
	    # is a temp state, different then DOWN. It indicates the node
	    # was never instantiated (important distinction for plab).
	    #
	    if ($exitstatus &&
		(($mode eq "setup") || ($mode eq "reboot"))) {
		$nodeobj->SetAllocState(TBDB_ALLOCSTATE_DEAD());
	    }
	}
    }
}

#
# In force mode, do not bother with this other stuff.
#
exit(scalar(@failed_nodes))
    if ($force && !$sendemail);

#
# Send mail to testbed-ops about failed nodes
#
if (@failed_nodes) {
    my $failed_lines = join("\n",map { join("\t",@{$_}) } @failed_nodes);
    SENDMAIL($TBOPS, "Virtual Node failure for $pid/$eid",
	     "The following virtual nodes failed: \n" .
	     "vnode\t\tpnode\tmode\texit status\n" .
	     $failed_lines);
}

if ($killmode) {
    print STDOUT "Vnode teardown finished.\n"
	if (!$quiet);
}
else {
    print STDOUT "Vnode setup initiated on all nodes ...\n"
	if (!$quiet);
}
exit(0);