# Patch summary (leading text before the first "diff --git" header):
#  * plabmon_badpool / plabmon_goodpool: checknextnode() now returns an
#    explicit status instead of a bare return -- 1 on every early exit
#    (no pnode to check, vnode missing, or vnode creation failed) and 0
#    after the node has been recorded in PENDING / CHPID2POOL.
#  * plabmon_badpool / plabmon_goodpool: processchild() returns 0 rather
#    than 1 when the child pid is not found in the pending list.
#  * plabmonitord: the main loop bumps $windowsize only when
#    checknextnode() returns false (0), and logs the believed window size
#    next to a ps-derived count of running plab processes.
diff --git a/tbsetup/plab/plabmon_badpool.pm.in b/tbsetup/plab/plabmon_badpool.pm.in index bc07ede28a090f2b2a8d36a5d76ec1d799a3cf57..d9c55a60be215b1af103443e76cde4bc197f16b3 100644 --- a/tbsetup/plab/plabmon_badpool.pm.in +++ b/tbsetup/plab/plabmon_badpool.pm.in @@ -84,7 +84,7 @@ sub checknextnode($) { # Nothinig to check! if (!$pnode) { - return; + return 1; } # Grab the vnode for this pnode (service sliver vnode) @@ -92,7 +92,7 @@ sub checknextnode($) { if (!defined($vnode)) { print "Could not find vnode associated with $pnode!\n"; - return; + return 1; } print "Pool: $self->{'NAME'}: Testing node $pnode->{'name'} at ". @@ -110,7 +110,7 @@ sub checknextnode($) { $self->{'PENDING'}->{$pnode->{'name'}} = $pnode; $self->{'CHPID2POOL'}->{$chpid} = $self; - return + return 0; } # Worker process. @@ -214,7 +214,7 @@ sub processchild($$$) { if (!defined($pnode)) { print "Pool: $self->{'NAME'}: $chpid not found in pending list!\n"; - return 1; + return 0; } # Setup log entry prefix diff --git a/tbsetup/plab/plabmon_goodpool.pm.in b/tbsetup/plab/plabmon_goodpool.pm.in index 64afddf6cb9dcddf73d59005061684b6e3c0a3c4..aca7ed61f04799491d800f9ef499c29622762f14 100644 --- a/tbsetup/plab/plabmon_goodpool.pm.in +++ b/tbsetup/plab/plabmon_goodpool.pm.in @@ -101,7 +101,7 @@ sub checknextnode($) { # Nothing to check! if (!$pnode) { - return; + return 1; } # Grab a new sliver to test with @@ -113,14 +113,14 @@ sub checknextnode($) { 'nodeid' => $pnode->{'name'}); if (Node::CreateVnodes(\@vnodes, \%options)) { print "Failed to allocate vnode for $pnode->{'name'}!\n"; - return; + return 1; } my $vnode = $vnodes[0]; if (!defined($vnode)) { print "Could not create vnode associated with $pnode!\n"; - return; + return 1; } print "Pool: $self->{'NAME'}: Testing node $pnode->{'name'} at ". @@ -138,7 +138,7 @@ sub checknextnode($) { $self->{'PENDING'}->{$pnode->{'name'}} = $pnode; $self->{'CHPID2POOL'}->{$chpid} = $self; - return; + return 0; } # Worker process. 
@@ -242,7 +242,7 @@ sub processchild($$$) { if (!defined($pnode)) { print "Pool: $self->{'NAME'}: $chpid not found in pending list!\n"; - return 1; + return 0; } # Setup log entry prefix diff --git a/tbsetup/plab/plabmonitord.in b/tbsetup/plab/plabmonitord.in index c050d34de170729709cf0d7e78e4b0ee17cef76a..8b1c3bc9fd6d390b3f907b680a3bbf2ec3b9ab5e 100644 --- a/tbsetup/plab/plabmonitord.in +++ b/tbsetup/plab/plabmonitord.in @@ -219,8 +219,11 @@ while (1) { foreach my $pool (@allpools) { # if pool still has nodes to test, get them going. if ($pool->getnextchecktime() <= $now) { - $pool->checknextnode(); - $windowsize++; + # Only increment the window if we successfully launched a + # process. + if (!$pool->checknextnode()) { + $windowsize++; + } } } } @@ -256,6 +259,14 @@ while (1) { } } + # This is the best place to get an idea of running plabnode procs. + my @procs = `ps axwww | grep '[e]mulab-ops plab-' | wc -l`; + my $pcount = 0; + if (scalar(@procs) > 0 && $procs[0] =~ /(\d+)/) { + # [e] keeps grep from counting itself; $1 drops wc's padding/newline. + $pcount = $1; + } + # Look for expired processes. Calling checkexpiration on a pool # has the side effect of checking for ISUP (or ISUP expiration) for # any nodes pending thusly in the pool. The return value is the @@ -266,6 +277,9 @@ while (1) { $windowsize -= $numfinished; } + # Log diff between believed window size, and "actual" + print "winsize = $windowsize / runprocs = $pcount\n"; + # We may have just fired off a bunch of kills, so chill for a bit to # let things quiesce. sleep($CHILLTIME);