#!/usr/bin/perl -w
#
# Copyright (c) 2000-2004, 2008 University of Utah and the Flux Group.
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
#
use English;
use Getopt::Std;
use XML::Parser;

#
# Allowed drift (in seconds) between our clock and the gmond nodes.
# Metric data that appears to be at most this far in the future is
# still accepted.
#
my $STALESLOP = 1 * 60;

#
# Age (in seconds) at which we consider metric data stale.
# Configured via site variable.
#
my $STALEAGE;

#
# Print the command line synopsis on STDOUT and terminate the script
# with exit status -1.  Does not return.
#
sub usage()
{
    print STDOUT
	"Usage: plabmetrics [-d] [-n]\n";
    exit(-1);
}

# Forward declarations, so the prototypes are known before the first call.
sub InsertMetrics($%);
sub ProcessCoMonData($);
sub DownLoadURL($$);

# Command line option state.
my $optlist = "dn";	# option letters handed to getopts
my $debug   = 0;	# set by -d: print diagnostics
my $impotent= 0;	# set by -n: compute but do not touch the DB
my $mailit  = 1;	# when true, fatal() mails $TBOPS

#
# Only real root can call this.
#
if ($UID != 0) {
    print STDERR "You must be root to run this script!\n";
    exit(-1);
}

#
# Configure variables
#
my $TB		= "@prefix@";
my $TBOPS       = "@TBOPSEMAIL@";

#
# Turn off line buffering on output
#
$| = 1;

#
# Untaint the path
# 
$ENV{'PATH'} = '/bin:/usr/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

# Locals.  These are filled in from site variables (or built-in
# defaults) once the command line has been parsed.
my $LOADMETRIC;
my $MAXLOAD;
my $MINDISK;

# Stand-in value substituted for metrics CoMon reports as null
# (see null2inf).
my $INF = 99999;

# Columns that must all appear in the CoMon header line.
my %REQCOLUMNS = ('name'      => 1,
                  '1minload'  => 1,
                  '5minload'  => 1,
                  'diskused'  => 1,
                  'disksize'  => 1,
                  'lastcotop' => 1,
                   );

# Scratch file the CoMon table is downloaded into.
my $COMONDATAFILE = TBMakeLogname("comon-data");

# CoMon tabulator query; the URL-encoded select clause reads
# 'resptime > 0'.
my $COMONURL = "http://comon.cs.princeton.edu/status/tabulator.cgi".
    "?table=table_nodeview&format=formatspaces&select='resptime%20%3E%200'";

# hostname => node_id map, filled in from the DB before the CoMon data
# is processed.
my %host2nodeid = ();

# translates emulab keys to comon keys for inserts into the
# plab_comondata table (the real point is to keep track of which comon fields
# our table stores, of course)
my %comon_trans_table = (
    resptime   => 'resptime',
    uptime     => 'uptime',
    lastcotop  => 'lastcotop',
    date       => 'date',
    drift      => 'drift',
    cpuspeed   => 'cpuspeed',
    busycpu    => 'busycpu',
    syscpu     => 'syscpu',
    freecpu    => 'freecpu',
    '1minload' => '1minload',
    '5minload' => '5minload',
    numslices  => 'numslices',
    liveslices => 'liveslices',
    connmax    => 'connmax',
    connavg    => 'connavg',
    timermax   => 'timermax',
    timeravg   => 'timeravg',
    memsize    => 'memsize',
    memact     => 'memact',
    freemem    => 'freemem',
    swapin     => 'swapin',
    swapout    => 'swapout',
    diskin     => 'diskin',
    diskout    => 'diskout',
    gbfree     => 'gbfree',
    swapused   => 'swapused',
    bwlimit    => 'bwlimit',
    txrate     => 'txrate',
    rxrate     => 'rxrate',
);

# Be careful not to exit on transient error
$libdb::DBQUERY_MAXTRIES = 5;

#
# Parse command arguments. Once we return from getopts, all that should
# be left are the required arguments. This script takes none, so any
# leftover argument is a usage error.
#
my %options = ();
usage()
    if (!getopts($optlist, \%options));
usage()
    if (@ARGV);

$debug    = 1 if (defined($options{"d"}));
$impotent = 1 if (defined($options{"n"}));

#
# Set default values
#
# Each tunable is read from its site variable when that variable exists;
# otherwise a built-in default is used.
#

#
# Which load figure nodes are rated by.  The site variable uses the names
# load_one, load_five and load_fifteen; these are translated to the keys
# ProcessCoMonData() stores.  Any other value selects the default.
#
if (TBSiteVarExists("plab/load_metric")) {
    my %metric_for = ("load_one"     => "load_1min",
		      "load_five"    => "load_5min",
		      "load_fifteen" => "load_15min");
    my $sitemetric = TBGetSiteVar("plab/load_metric");

    # Exact-match lookup: an undefined or unrecognized value (including
    # one with stray trailing whitespace) leaves $LOADMETRIC undefined,
    # so the default below applies.
    $LOADMETRIC = $metric_for{$sitemetric}
	if (defined($sitemetric) && exists($metric_for{$sitemetric}));
}
if (!defined($LOADMETRIC)) {
    $LOADMETRIC = "load_15min";
}

#
# Load ceiling, clamped to the range 0 .. 1000.  InsertMetrics() treats
# exactly 1000 as "do not rate nodes by load".
#
if (TBSiteVarExists("plab/max_load")) {
    $MAXLOAD = TBGetSiteVar("plab/max_load");
    $MAXLOAD = 0.0 if $MAXLOAD <= 0.0;
    $MAXLOAD = 1000.0 if $MAXLOAD > 1000.0;
} else {
    $MAXLOAD = 5.0;
}

#
# Minimum percentage of disk that must be free, clamped to 0 .. 100.
#
if (TBSiteVarExists("plab/min_disk")) {
    $MINDISK = TBGetSiteVar("plab/min_disk");
    $MINDISK = 0 if $MINDISK < 0;
    $MINDISK = 100 if $MINDISK > 100;
} else {
    $MINDISK = 10;
}

#
# Stale age.  Zero disables the staleness cutoff in InsertMetrics().
# The value is scaled by 60 below, so the site variable is presumably
# given in minutes while $STALEAGE itself is in seconds.
#
if (TBSiteVarExists("plab/stale_age")) {
    $STALEAGE = TBGetSiteVar("plab/stale_age");
    $STALEAGE = 0 if $STALEAGE < 0;
} else {
    $STALEAGE = 60;
}
$STALEAGE *= 60;

# Debug banner showing the effective settings and the start time.
print "\n=== plabmetrics ".
    "(metric=$LOADMETRIC, maxload=$MAXLOAD, mindisk=$MINDISK) ".
    "running at " . `date`
    if $debug;

#
# Grab node telemetry from CoMon
#
if (DownLoadURL($COMONURL, $COMONDATAFILE)) {
    fatal("Failed to download CoMon data!");
}

#
# Grab the node list from the DB in one query, which we use later to
# map from the hostname we get from the CoMon output, to our node_id.
#
my $query_result =
    DBQueryFatal("select n.node_id, wa.hostname".
		 " from nodes as n ".
		 "left join node_types as nt on n.type=nt.type ".
                 "left join widearea_nodeinfo as wa on n.node_id = wa.node_id ".
		 "where nt.isremotenode=1 and nt.isvirtnode=0 ".
    		 "and nt.class='pcplabphys'");

#
# Create hostname map.
#
while (my ($node_id, $hostname) = $query_result->fetchrow_array()) {
    # widearea_nodeinfo is left-joined, so a node with no row there comes
    # back with a NULL (undef) hostname.  Such a node can never match a
    # CoMon hostname, so leave it out rather than storing an undef key.
    next
	if (!defined($hostname));
    $host2nodeid{$hostname} = $node_id;
}

#
# Run through the CoMon data file and insert metrics.
#
ProcessCoMonData($COMONDATAFILE);
# The downloaded table is scratch data; remove it once processed.
unlink($COMONDATAFILE);
exit(0);

#
# data helper funcs
#
sub isnull($) {
    # Returns 1 when the argument is undefined or is the literal string
    # 'null' or 'NULL' (no other capitalization counts); 0 otherwise.
    my ($value) = @_;

    return 1
        unless (defined($value));

    return ($value eq 'null' || $value eq 'NULL') ? 1 : 0;
}

sub null2inf($) {
    # Pass the argument through unchanged unless isnull() says it is
    # null, in which case $INF is returned in its place.
    my ($value) = @_;

    return isnull($value) ? $INF : $value;
}

#
# Run through each line of the comon data, extracting plab node metrics
# and inserting these into the DB.
#
# Argument: path of the downloaded CoMon table.  The first line is the
# whitespace-separated column header; every following line is one node.
# Rows whose field count differs from the header, and rows for hostnames
# not present in %host2nodeid, are skipped.  Everything else is handed to
# InsertMetrics().
#
# Calls fatal() (which does not return) if the file cannot be opened, is
# empty, or lacks any of the columns named in %REQCOLUMNS.
#
sub ProcessCoMonData($) {
    my $comonfile = shift;
    my %colpos = ();

    # Three-argument open with a lexical handle, so the file name can
    # never be interpreted as an open mode.
    open(my $comon, "<", $comonfile) or
        fatal("Can't open comon data file!");

    # Grab the column header line and find the position of all the columns
    # we care about.
    my $columnln = <$comon>;
    if (!defined($columnln)) {
        close($comon);
        fatal("CoMon data file is empty!");
    }
    chomp $columnln;
    my @columns = split(/\s+/, $columnln);
    my $colnum = 0;
    foreach my $column (@columns) {
        if (exists($REQCOLUMNS{$column})) {
            $colpos{$column} = $colnum;
        }
        $colnum++;
    }

    # Make sure all columns are present and accounted for
    if (scalar(keys %colpos) != scalar(keys %REQCOLUMNS)) {
        close($comon);
        fatal("Some columns were missing in CoMon data!");
    }

    while (my $row = <$comon>) {
        chomp $row;
        my @coldata = split(/\s+/, $row);
        next if (!@coldata or scalar(@coldata) != scalar(@columns));

        my $hostname = $coldata[$colpos{'name'}];
        if (!exists($host2nodeid{$hostname})) {
            print STDERR "*** WARNING: $hostname not known in ".
                "Emulab database!\n";
            next;
        }
        my $node_id = $host2nodeid{$hostname};

        # save off all the column data (header name => value in this row)
        my %metrics = ();
        @metrics{@columns} = @coldata;

        # save off custom data
        $metrics{'metricsage'} = null2inf($coldata[$colpos{'lastcotop'}]);
        $metrics{'load_1min'}  = null2inf($coldata[$colpos{'1minload'}]);
        $metrics{'load_5min'}  = null2inf($coldata[$colpos{'5minload'}]);
        # XXX: bah, no load15 data in CoMon output.
        $metrics{'load_15min'} = null2inf($coldata[$colpos{'5minload'}]);

        # Percentage of disk in use; unknown or nonsensical sizes are
        # treated as a full disk.
        my $disksize = $coldata[$colpos{'disksize'}];
        my $diskused = $coldata[$colpos{'diskused'}];
        if (isnull($disksize) or isnull($diskused) or $disksize < 1) {
            $metrics{'disk_used'} = 100;
        } else {
            $metrics{'disk_used'} = $diskused / $disksize * 100;
        }
        InsertMetrics($node_id, %metrics);
    }
    close($comon);
}

#
# Insert the metrics we care about. Called for each node.
#
# Arguments:
#   $nodeid  - node_id of the node the metrics belong to.
#   %metrics - metric name => value.  ProcessCoMonData() passes every
#              CoMon column plus the derived keys 'metricsage',
#              'load_1min', 'load_5min', 'load_15min' and 'disk_used'.
#
# Unless running impotent (-n), replaces the node's 'load' and 'disk' rows
# in node_features and its row in plab_comondata.  With -d or -n the
# computed load, scaled weight and disk weight are printed on STDERR.
#
sub InsertMetrics($%)
{
    my ($nodeid)   = shift;
    my (%metrics)  = @_;
    my $metricsage = $metrics{'metricsage'};
    my $localdebug = $debug;
    my $scaled;
    my $load;
    my $disk;

    #
    # See if we got any metric data.  If so, then check for stale data.
    # In the case where metric data appears to be in the future,
    # it may be clock skew, so allow a little slop.
    # Data that is rejected gets a load of 999, which scales to the
    # worst possible weight below.
    #
    if (!defined($metricsage)) {
	print "WARNING: $nodeid: no metric data, ignoring\n"
	    if $localdebug;
	$metrics{$LOADMETRIC} = 999;
	# Nothing further worth warning about for this node.
	$localdebug = 0;
    } elsif ($metricsage < 0) {
	if (-$metricsage > $STALESLOP) {
	    print "WARNING: $nodeid: metric data in the future, ignoring\n"
		if $localdebug;
	    $metrics{$LOADMETRIC} = 999;
	}
    } elsif ($STALEAGE == 0) {
	# Staleness cutoff disabled; just remark on very old data.
	if ($metricsage > 4 * 60 * 60) {
	    print "WARNING: $nodeid: metric data older than 4 hours, ".
		"using anyway\n"
		if $localdebug;
	}
    } elsif ($metricsage > $STALEAGE) {
	print "WARNING: $nodeid: stale metric data, ignoring\n"
	    if $localdebug;
	$metrics{$LOADMETRIC} = 999;
    }

    #
    # Make sure all the metrics we might need are defined
    #
    if (!defined($metrics{$LOADMETRIC})) {
	print "WARNING: $nodeid: no $LOADMETRIC metric\n"
	    if $localdebug;
	$metrics{$LOADMETRIC} = 999;
    }
    if (!defined($metrics{disk_used})) {
	print "WARNING: $nodeid: no disk_used metrics, assuming enough\n"
	    if $localdebug;
	$metrics{disk_used} = 0;
    }

    #
    # Load must be under MAXLOAD, favor those with lower load
    #
    $load = $metrics{$LOADMETRIC};
    if ($MAXLOAD == 1000) {
	$scaled = 0;
    } elsif ($MAXLOAD <= 0) {
	# The max_load site variable may legitimately be clamped to 0;
	# dividing by it would abort the whole run.  With a zero ceiling
	# any positive load is over the limit.
	$scaled = ($load > 0) ? 0.99 : 0;
    } else {
	$scaled = $load / $MAXLOAD;
    }

    # proper, valid feature weights have to be less than 1
    if ($scaled > 0.99) {
	$scaled = 0.99;
    }

    #
    # Plab people request that we not start jobs on nodes
    # with less than a certain amount of available disk space
    #
    if ((100.0 - $metrics{disk_used}) >= $MINDISK) {
	$disk = 0;
    } else {
	$disk = 0.9;
    }

    if ($debug || $impotent) {
	print STDERR "$nodeid $load $scaled $disk\n";
    }

    if (!$impotent) {
	DBQueryWarn("replace into node_features ".
		    " (node_id, feature, weight) ".
		    " values ('$nodeid', 'load', $scaled)");
	DBQueryWarn("replace into node_features ".
		    " (node_id, feature, weight) ".
		    " values ('$nodeid', 'disk', $disk)");
    }

    # finally, insert int/float values into the main comon data table
    if (!$impotent) {
	my @colnames  = ("node_id");
	my @colvalues = ("'$nodeid'");
	my @errors    = ();

	foreach my $ekn (keys(%comon_trans_table)) {
	    my $ckn = $comon_trans_table{$ekn};
	    my $val = $metrics{$ckn};

	    # Missing, empty and null values are all stored as -1.
	    if (!defined($val) || $val eq '' || $val =~ /null/i) {
		$val = -1;
	    }
	    # make sure we have an int or float; we accept nothing else.
	    if (!($val =~ /^\-?\d+(\.\d+)?$/)) {
		push @errors,"bad data in field ".$ckn.
		    ": '$val'";
	    }
	    # Use the checked value, never the raw metric: the values go
	    # into the statement unquoted.
	    push(@colnames, $ekn);
	    push(@colvalues, $val);
	}
	if (scalar(@errors) > 0) {
	    print STDERR "Errors during $nodeid:\n";
	    foreach my $er (@errors) {
		print STDERR "  $er\n";
	    }
	}
	else {
	    my $qstr = "replace into plab_comondata ".
		"(" . join(",", @colnames) . ") values (" .
		join(",", @colvalues) . ")";

	    DBQueryWarn($qstr);
	}
    }
}

#
# Download URL into a file.
#
# Arguments: the URL to fetch and the file to store it in.
#
# A child process execs /usr/local/bin/wget while the parent waits for
# it.  If the child has not finished after 120 seconds it is sent
# SIGTERM.
#
# Returns 0 when wget exited successfully and non-zero otherwise,
# including when the download timed out or the child could not be
# started.
#
sub DownLoadURL($$)
{
    my ($url, $tempfile) = @_;

    print STDERR "Downloading $url to $tempfile ...\n";

    #
    # Must prevent hangs ...
    #
    my $syspid = fork();

    if (!defined($syspid)) {
	# Without this check the parent itself would fall through to
	# the exec below and be replaced by wget.
	warn("*** Could not fork to download link data from web site: $!\n");
	return 1;
    }

    if ($syspid) {
	local $SIG{ALRM} = sub { kill("TERM", $syspid); };
	alarm 120;
	waitpid($syspid, 0);
	alarm 0;
	my $exitcode = $?;

	return 0
	    if ($exitcode == 0);

	# The low seven bits of the wait status hold the terminating
	# signal; 15 is the SIGTERM sent by the alarm handler above.
	warn("*** Timed out downloading link data from web site!\n")
	    if (($exitcode & 127) == 15);

	warn("*** Could not download link data from web site!\n");

	# A child killed by a signal has a zero exit-status byte, so
	# make sure any failure is reported as non-zero.
	return (($exitcode >> 8) || 1);
    }

    # Child: list-form exec, so no shell sees the URL.
    exec("/usr/local/bin/wget","-q","-O","$tempfile","$url");
    # Only reached when the exec itself failed.
    exit(1);
}

#
# Report an unrecoverable error and abort the script.
#
# Arguments: the error message.  A second argument is accepted for
# compatibility with existing callers but is not used.
#
# Mails the message to $TBOPS when $mailit is set, echoes it on STDOUT
# when debugging, removes the CoMon scratch file and then dies with the
# message.  Does not return.
#
sub fatal {
    my $msg = $_[0];

    if ($mailit) {
	SENDMAIL($TBOPS, "plabmetrics Failed", $msg);
    }
    print "$msg\n"
	if $debug;

    # Do not leave the downloaded table behind.
    unlink($COMONDATAFILE);

    die($msg);
}

#
# Report an error without aborting the script.
#
# Argument: the error message.  Mails it to $TBOPS when $mailit is set
# (the same switch fatal() honours); otherwise only notes that mail
# would have been sent.  The message is echoed on STDOUT when debugging.
#
sub nonfatal {
    my $msg = $_[0];

    if ($mailit) {
	SENDMAIL($TBOPS, "plabmetrics Failed", $msg);
    }
    else {
	print "Would send mail\n";
    }

    print "$msg\n"
	if $debug;
}