Commit 4c6bf730 authored by Leigh Stoller's avatar Leigh Stoller

Checkpoint the shared pool daemon before I start bashing on it again.

parent 325dc67f
......@@ -38,7 +38,7 @@ SBIN_STUFF = resetvlans console_setup.proxy sched_reload named_setup \
newnode_reboot savelogs.proxy eventsys.proxy \
elabinelab snmpit.proxy panic node_attributes \
nfstrace plabinelab smbpasswd_setup smbpasswd_setup.proxy \
rmproj snmpit.proxynew snmpit.proxyv2
rmproj snmpit.proxynew snmpit.proxyv2 pool_daemon
ifeq ($(ISMAINSITE),1)
SBIN_STUFF += repos_daemon
endif
......
#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
# Copyright (c) 2009 University of Utah and the Flux Group.
# All rights reserved.
#
use strict;
use English;
use Getopt::Std;
use POSIX qw(tmpnam);
use Data::Dumper();
#
# Manage the pool of shared nodes.
#
sub usage()
{
print STDOUT "Usage: pool_daemon [-d]\n" .
"Use the -d option to prevent daemonization\n";
exit(-1);
}
my $optlist = "d";
my $debug = 0;
my $startup = 0;
#
# This should run as root.
#
if ($UID != 0) {
die("*** $0:\n".
" Only root can run this script!\n");
}
#
# Configure variables
#
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $BOSSNODE = "@BOSSNODE@";
my $batchlog = "$TB/log/poollog";
my $SWAPEXP = "$TB/bin/swapexp";
# Testbed Support library
use lib "@prefix@/lib";
use libdb;
use libtblog;
use event;
use libtestbed;
use NodeType;
use Experiment;
use User;
use OSinfo;
# We use tblog to determine why swapexp failed.
tblog_stop_capture();
# Locals
my $EVhandle;
# Prototypes.
sub SetupEventHandler();
sub fatal($);
sub notify($);
#
# Turn off line buffering on output (dots ...).
#
$| = 1;
#
# Untaint the path
#
$ENV{'PATH'} = "/bin:/usr/bin:";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
usage();
}
if (@ARGV != 0) {
usage();
}
if (defined($options{"d"})) {
$debug = $options{"d"};
}
# Go to ground.
if (! $debug) {
if (TBBackGround($batchlog)) {
exit(0);
}
}
print "Pool Daemon starting... pid $$, at ".`date`;
#
# Grab the shared node experiment. This should come from a sitevar.
# Or perhaps we want to manage multiple shared pools?
#
my $experiment = Experiment->Lookup(TBOPSPID(), "shared-nodes");
if (!defined($experiment)) {
fatal("Could not lookup shared node experiment. Exiting ...");
}
my $pid = $experiment->pid();
my $eid = $experiment->eid();
#
# We need this user for running swapexp below.
#
my $elabman = User->Lookup("elabman");
if (!defined($elabman)) {
fatal("Could not lookup elabman user. Exiting ...");
}
while (1) {
# Use a long period; we do not want the pool to change too fast.
if (!$startup) {
$startup++;
}
else {
sleep(120);
}
print "Pool Daemon running at ".`date`;
Node->FlushAll();
$experiment->Refresh() == 0
or fatal("Could not reload $experiment");
my @nodelist = $experiment->NodeList();
my %inuse = ();
my %tofree = ();
my $loaded = 0;
my $unloaded = 0;
my $newcount = 0;
#
# Look to see how each of the nodes is packed. This is
# advisory; we will not know for sure until tables locked
# in nfree and we can count the number of vnodes on it.
#
foreach my $node (@nodelist) {
my $reservation = $node->Reservation();
# Node released somewhere else.
next
if (!defined($reservation));
next
if (!$experiment->SameExperiment($reservation));
my $maxsharecount = $node->MaxShareCount();
# Transient error?
next
if ($maxsharecount < 0);
# Look for nodes with nothing on them.
my $vnodecount = $node->HasVirtualNodes();
# Transient error?
next
if ($vnodecount < 0);
if ($vnodecount == 0) {
print "$node no longer has virtual nodes on it\n";
$tofree{$node->node_id()} = $node;
next;
}
# Count up loaded vs. unloaded nodes.
my $factor = $maxsharecount / $vnodecount;
if ($factor < 0.5) {
$unloaded++;
}
else {
$loaded++;
}
$inuse{$node->vname()} = $node;
}
#
# Try to keep an unloaded machine available.
#
if (scalar(@nodelist) - $loaded <= 1) {
if (keys(%tofree)) {
# rescue one from the free list.
my $key = (keys(%tofree))[0];
delete($tofree{$key});
}
else {
$newcount++;
}
}
# But do not let all the nodes go free.
if (@nodelist && (scalar(@nodelist) == scalar(keys(%tofree)))) {
my $key = (keys(%tofree))[0];
delete($tofree{$key});
}
next
if (! (keys(%tofree) || $newcount));
#
# Generate a new NS file. Be nice to not have to this, but not
# having an NS file could confuse things for the web interface.
#
my $tmpfile = "/tmp/pool-$$.nsfile";
print "Generating a new NS file in $tmpfile\n";
if (!open(NS, ">$tmpfile")) {
notify("Could not create $tmpfile");
next;
}
print NS "# Auto generated by the pool daemon\n\n";
print NS "source tb_compat.tcl\n";
print NS "set ns [new Simulator]\n";
foreach my $node (@nodelist) {
next
if (exists($tofree{$node->node_id}));
my $vname = $node->vname();
my $osid = $node->def_boot_osid();
my $osinfo = OSinfo->Lookup($osid);
my $osname = $osinfo->osname();
print NS "set $vname [\$ns node]\n";
print NS "tb-set-node-os \$$vname $osname\n";
print NS "tb-set-node-sharingmode \$$vname \"shared_local\"\n";
}
while ($newcount) {
my $id = 1;
my $vname = "vhost${id}";
while (exists($inuse{$vname})) {
$id++;
$vname = "vhost${id}";
}
print NS "set $vname [\$ns node]\n";
print NS "tb-set-node-os \$$vname FEDORA8-OVZ-STD\n";
print NS "tb-set-node-sharingmode \$$vname \"shared_local\"\n";
$newcount--;
}
print NS "\$ns rtproto Static\n";
print NS "\$ns run\n";
close(NS);
chmod(0775, $tmpfile);
#
# Start a swapmod.
#
my $childpid = fork();
if ($childpid) {
print "Starting a swap modify. Child is $childpid.\n";
#
# Wait for the swapmod to complete.
#
waitpid($childpid, 0);
my $exitval = $?;
print "Swap modify done at " . `date`;
$experiment->LockDown(1);
if ($exitval) {
my $error_data = tblog_find_error();
print Dumper($error_data);
fatal("swapmod failed");
}
}
else {
if ($elabman->FlipTo($experiment->unix_gid())) {
fatal("Could not flipto $elabman");
}
$experiment->LockDown(0);
tblog_new_child_process();
exec("$SWAPEXP -q -w -n -s modify $pid $eid $tmpfile");
die("Could not exec $SWAPEXP\n");
}
}
#
# Subscribe to experiment state change events.
#
sub SetupEventHandler()
{
my $port = @BOSSEVENTPORT@;
my $URL = "elvin://localhost:$port";
# Connect to the event system, and subscribe the the events we want
$EVhandle = event_register($URL, 0);
if (!$EVhandle) {
fatal("Unable to register with event system");
}
my $tuple = address_tuple_alloc();
if (!$tuple) {
fatal("Could not allocate an address tuple");
}
%$tuple = ( objtype => libdb::TBDB_TBEVENT_CONTROL(),
objname => "pool_daemon",
host => $BOSSNODE,
);
if (!event_subscribe($EVhandle, \&EventHandler, $tuple)) {
fatal("Could not subscribe to events");
}
}
#
# Callback for above.
#
sub EventHandler($$$) {
my ($handle,$notification,undef) = @_;
my $objname = event_notification_get_objname($handle,$notification);
my $eventtype = event_notification_get_eventtype($handle,$notification);
print "$objname, $eventtype\n";
}
sub fatal($)
{
my ($msg) = @_;
SENDMAIL($TBOPS, "Pool Daemon Died", $msg, $TBOPS);
die($msg);
}
sub notify($)
{
my ($msg) = @_;
print "$msg\n";
SENDMAIL($TBOPS, "Pool Daemon Message", $msg, $TBOPS);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment