From 63cfbd3af05e2869eb8ef9fd3b20bc379bef4ebe Mon Sep 17 00:00:00 2001 From: David Johnson <johnsond@flux.utah.edu> Date: Fri, 21 Sep 2007 21:20:05 +0000 Subject: [PATCH] Attempt to fix a problem in which the window size (max plab nodes being tested at once) gets negative and too many processes get forked. --- tbsetup/plab/plabmon_badpool.pm.in | 8 ++++---- tbsetup/plab/plabmon_goodpool.pm.in | 10 +++++----- tbsetup/plab/plabmonitord.in | 18 ++++++++++++++++-- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/tbsetup/plab/plabmon_badpool.pm.in b/tbsetup/plab/plabmon_badpool.pm.in index bc07ede28a..d9c55a60be 100644 --- a/tbsetup/plab/plabmon_badpool.pm.in +++ b/tbsetup/plab/plabmon_badpool.pm.in @@ -84,7 +84,7 @@ sub checknextnode($) { # Nothinig to check! if (!$pnode) { - return; + return 1; } # Grab the vnode for this pnode (service sliver vnode) @@ -92,7 +92,7 @@ sub checknextnode($) { if (!defined($vnode)) { print "Could not find vnode associated with $pnode!\n"; - return; + return 1; } print "Pool: $self->{'NAME'}: Testing node $pnode->{'name'} at ". @@ -110,7 +110,7 @@ sub checknextnode($) { $self->{'PENDING'}->{$pnode->{'name'}} = $pnode; $self->{'CHPID2POOL'}->{$chpid} = $self; - return + return 0; } # Worker process. @@ -214,7 +214,7 @@ sub processchild($$$) { if (!defined($pnode)) { print "Pool: $self->{'NAME'}: $chpid not found in pending list!\n"; - return 1; + return 0; } # Setup log entry prefix diff --git a/tbsetup/plab/plabmon_goodpool.pm.in b/tbsetup/plab/plabmon_goodpool.pm.in index 64afddf6cb..aca7ed61f0 100644 --- a/tbsetup/plab/plabmon_goodpool.pm.in +++ b/tbsetup/plab/plabmon_goodpool.pm.in @@ -101,7 +101,7 @@ sub checknextnode($) { # Nothing to check! if (!$pnode) { - return; + return 1; } # Grab a new sliver to test with @@ -113,14 +113,14 @@ sub checknextnode($) { 'nodeid' => $pnode->{'name'}); if (Node::CreateVnodes(\@vnodes, \%options)) { print "Failed to allocate vnode for $pnode->{'name'}!\n"; - return; + return 1; } my $vnode = $vnodes[0]; if (!defined($vnode)) { print "Could not create vnode associated with $pnode!\n"; - return; + return 1; } print "Pool: $self->{'NAME'}: Testing node $pnode->{'name'} at ". @@ -138,7 +138,7 @@ sub checknextnode($) { $self->{'PENDING'}->{$pnode->{'name'}} = $pnode; $self->{'CHPID2POOL'}->{$chpid} = $self; - return; + return 0; } # Worker process. @@ -242,7 +242,7 @@ sub processchild($$$) { if (!defined($pnode)) { print "Pool: $self->{'NAME'}: $chpid not found in pending list!\n"; - return 1; + return 0; } # Setup log entry prefix diff --git a/tbsetup/plab/plabmonitord.in b/tbsetup/plab/plabmonitord.in index c050d34de1..8b1c3bc9fd 100644 --- a/tbsetup/plab/plabmonitord.in +++ b/tbsetup/plab/plabmonitord.in @@ -219,8 +219,11 @@ while (1) { foreach my $pool (@allpools) { # if pool still has nodes to test, get them going. if ($pool->getnextchecktime() <= $now) { - $pool->checknextnode(); - $windowsize++; + # Only increment the window if we successfully launched a + # process. + if (!$pool->checknextnode()) { + $windowsize++; + } } } } @@ -256,6 +259,14 @@ while (1) { } } + # This is the best place to get an idea of running plabnode procs. + my @procs = `ps axwww | grep 'emulab-ops plab-' | wc -l`; + my $pcount = 0; + if (scalar(@procs) > 0) { + $procs[0] =~ /(\d+)/; + $pcount = $procs[0]; + } + # Look for expired processes. Calling checkexpiration on a pool # has the side effect of checking for ISUP (or ISUP expiration) for # any nodes pending thusly in the pool. The return value is the @@ -266,6 +277,9 @@ while (1) { $windowsize -= $numfinished; } + # Log diff between believed window size, and "actual" + print "winsize = $windowsize / runprocs = $pcount\n"; + # We may have just fired off a bunch of kills, so chill for a bit to # let things quiesce. sleep($CHILLTIME); -- GitLab