Commit 9edc2757 authored by Leigh B Stoller's avatar Leigh B Stoller

Add simple locking between the pool daemon and the mapper so

that the pool daemon does not run while the mapper is working.
This will only affect experiments that are using shared nodes
though, since the lock is way too coarse and basically serializes
the mapper. Since very few experiments use shared nodes at this time, it
will not really be a problem.

Also added TERM handling to avoid killing the daemon while it is
working, as when stopping testbed daemons to update the installation.
parent 9d051c7c
#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2009 University of Utah and the Flux Group.
# Copyright (c) 2000-2010 University of Utah and the Flux Group.
# All rights reserved.
#
use strict;
......@@ -284,6 +284,8 @@ sub AssignLoop()
TBDebugTimeStamp("mapper loop started");
while (1) {
my $gotlock = 0;
chat("Assign run $currentrun\n");
my $prefix = ($debug || $regression ? "$pid-$eid" : "$pid-$eid-$$");
......@@ -297,6 +299,28 @@ sub AssignLoop()
chat("Trying assign on an empty testbed.\n");
}
#
# Serialize with the pool daemon if using shared nodes.
# XXX When using shared nodes, only one can proceed at a
# time through assignment. This is okay for now since few
# experiments are using shared nodes. Eventually needs to be
# a barrier.
#
if ((!($impotent || $regression)) && $vtop->sharednodecount()) {
while (1) {
my $lock_result =
DBQueryWarn("select get_lock('pool_daemon', 10)");
fatal("DB Error tring to get pool_daemon lock")
if (!defined($lock_result));
($gotlock) = $lock_result->fetchrow_array();
last
if ($gotlock);
chat("Waiting for pool daemon lock ...\n");
}
}
#
# RunAssign returns 0 if successful.
# returns -1 if failure, but assign says to stop trying.
......@@ -305,6 +329,11 @@ sub AssignLoop()
#
my $retval = RunAssign($precheck, $prefix);
if ($gotlock) {
DBQueryWarn("select release_lock('pool_daemon')")
or fatal("Could not release the pool lock");
}
# Success!
last
if ($retval == 0);
......
......@@ -19,10 +19,11 @@ sub usage()
"Use the -d option to prevent daemonization\n";
exit(-1);
}
my $optlist = "dn";
my $optlist = "dne";
my $debug = 0;
my $impotent = 0;
my $startup = 0;
my $killme = 0;
my $nofree = 1;
#
# This should run as root.
......@@ -88,10 +89,13 @@ if (@ARGV != 0) {
usage();
}
if (defined($options{"d"})) {
$debug = $options{"d"};
$debug = 1;
}
if (defined($options{"n"})) {
$impotent = $options{"n"};
$impotent = 1;
}
if (defined($options{"e"})) {
$nofree = 1;
}
if (!$impotent && CheckDaemonRunning("pool_daemon")) {
......@@ -164,21 +168,26 @@ if (!defined($image)) {
exit(0);
}
while (1) {
#
# Add a handler for TERM since we really do not want this to be
# interrupted. Just set a flag that will cause it to exit at
# the next loop.
#
# SIGTERM handler: report that the signal arrived and raise the $killme
# flag. It does not exit by itself; the code that checks $killme decides
# when to stop.
sub sigterm() {
    print("Got a TERM signal; arranging to exit soon\n");
    $killme = 1;
}
$SIG{TERM} = \&sigterm;
while (!$killme) {
my $disabled;
# Use a long period; we do not want the pool to change too fast.
if (!$startup) {
$startup++;
}
else {
sleep(120);
}
print "Pool Daemon running at ".`date`;
if (! TBGetSiteVar("web/nologins", \$disabled) || $disabled) {
print " Skipping this loop cause of nologins\n";
next;
goto loop;
}
Node->FlushAll();
......@@ -196,6 +205,24 @@ while (1) {
my $minpoolsize = TBGetSiteVar("general/minpoolsize");
my $poolnodetype = TBGetSiteVar("general/poolnodetype");
#
# Serialize this part with the mapper.
#
if (!$impotent) {
while (1) {
my $lock_result =
DBQueryWarn("select get_lock('pool_daemon', 5)");
fatal("DB Error tring to get pool_daemon lock")
if (!defined($lock_result));
my ($gotlock) = $lock_result->fetchrow_array();
last
if ($gotlock);
print "Waiting for pool daemon lock ...\n";
}
}
#
# Look to see how each of the nodes is packed. This is
# advisory; we will not know for sure until tables locked
......@@ -220,7 +247,7 @@ while (1) {
next
if ($vnodecount < 0);
if ($vnodecount == 0) {
if ($vnodecount == 0 && !$nofree) {
print "$node no longer has virtual nodes on it.\n";
# Free the node unless we would go below the minpoolsize.
if (scalar(@nodelist) - scalar(keys(%tofree)) > $minpoolsize) {
......@@ -231,6 +258,7 @@ while (1) {
}
# Count up loaded vs. unloaded nodes.
my $factor = $maxsharecount / $vnodecount;
print "$node load factor is $factor\n";
if ($factor < 0.5) {
$unloaded++;
}
......@@ -248,14 +276,14 @@ while (1) {
my $key = (keys(%tofree))[0];
delete($tofree{$key});
}
elsif (scalar(@nodelist) < $maxpoolsize) {
elsif (scalar(@nodelist) < $maxpoolsize) {
$newcount++;
}
}
if (! (keys(%tofree) || $newcount)) {
exit(0)
if ($impotent);
next;
goto loop;
}
#
......@@ -268,7 +296,7 @@ while (1) {
if (!open(NS, ">$tmpfile")) {
notify("Could not create $tmpfile");
next;
goto loop;
}
print NS "# Auto generated by the pool daemon\n\n";
print NS "source tb_compat.tcl\n";
......@@ -311,7 +339,7 @@ while (1) {
chmod(0775, $tmpfile);
exit(0)
if ($impotent);
if ($impotent || $killme);
# Must do this each time before fork.
tblog_new_session();
......@@ -335,7 +363,7 @@ while (1) {
if ($error_data->{'cause'} eq "temp") {
print "Temporary resource shortage; try again later\n";
next;
goto loop;
}
fatal("swapmod failed");
}
......@@ -348,6 +376,12 @@ while (1) {
exec("$SWAPEXP -q -w -n -s modify $pid $eid $tmpfile");
die("Could not exec $SWAPEXP\n");
}
loop:
DBQueryWarn("select release_lock('pool_daemon')")
or fatal("Could not release the pool lock");
# Use a long period; we do not want the pool to change too fast.
sleep(120);
}
#
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment