watchdog 19.2 KB
Newer Older
1
#!/usr/bin/perl -w
2
3
#
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2004 University of Utah and the Flux Group.
5
6
7
8
9
10
11
12
13
14
15
16
17
# All rights reserved.
#
use Getopt::Std;
use English;
use Errno;
use POSIX qw(strftime);

#
# The Emulab watchdog. Currently, not really much of a watchdog. Simply
# contacts tmcd to find out if it needs to do an update.
#
sub usage()
{
Mike Hibler's avatar
Mike Hibler committed
18
    print "Usage: watchdog [-dv] [start | stop]\n";
19
20
    exit(1);
}
Mike Hibler's avatar
Mike Hibler committed
21
my $optlist = "Fdv";
22
23
24
25
26
27
28
29
30
31
32
33
34
35

#
# Turn off line buffering on output
#
$| = 1;

# Drag in path stuff so we can find emulab stuff.
BEGIN { require "/etc/emulab/paths.pm"; import emulabpaths; }

#
# Load the OS independent support library. It will load the OS dependent
# library and initialize itself. 
# 
use libsetup;
36
use libtmcc;
37

Mike Hibler's avatar
Mike Hibler committed
38
39
40
41
42
43
44
45
# XXX should be in libsetup
my $svcslice    = "utah_elab_svc";
sub PLABSVC() {
    return (PLAB() &&
	(defined($ENV{'USER'}) && ($ENV{'USER'} eq $svcslice) ||
	 defined($ENV{'SUDO_USER'}) && ($ENV{'SUDO_USER'} eq $svcslice)));
}

46
# Locals
47
my $action	= "start";
Mike Hibler's avatar
Mike Hibler committed
48
my $logname	= "$LOGDIR/emulab-watchdog.log";
49
my $pidfile	= "/var/run/emulab-watchdog.pid";
Mike Hibler's avatar
Mike Hibler committed
50
my $rusagebin	= "$BINDIR/plabrusage";
51
52
my $keydir	= "/etc/ssh";
my @keylist     = ("ssh_host_key", "ssh_host_dsa_key", "ssh_host_rsa_key");
53
my $debug	= 0;
Mike Hibler's avatar
Mike Hibler committed
54
55
my $verbose     = 0;
my $updatefailed= 0;
56
my $driftfile;
Mike Hibler's avatar
Mike Hibler committed
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
my $lastdrift;
my $rusagestr;
my $curtime;

# tmcc retries
my $trytcp = 0;
my $maxretries  = 3;
my %retry;

# XXX testing
my $fakeit;
my %faketimes;

#
# Default interval values in seconds.
# Compatible with old, static watchdog.
#
74
75
# yeah yeah, all these hashes should be a hash of records or something.
#
Mike Hibler's avatar
Mike Hibler committed
76
77
78
79
80
81
my %iv = (
    check   => 0,
    isalive => ((REMOTE() == 1) ? (PLAB() ? 600 : 60) : (JAILED() ? 600 : 180)),
    drift   => (60 * 60 * 12),
    cvsup   => (60 * 60 * 12),
    rusage  => 0,
82
    hkeys   => 0,
Mike Hibler's avatar
Mike Hibler committed
83
84
85
86
87
88
89
90
);

my %funcs = (
    check   => \&setintervals,
    isalive => \&sendisalive,
    drift   => \&ntpdrift,
    cvsup   => \&runcvsup,
    rusage  => \&sendrusage,
91
92
93
94
95
96
97
98
    hkeys   => \&sendhkeys,     
);

my %immediate = (
    check   => 0,
    isalive => 1,
    drift   => 0,
    cvsup   => 0,
99
    rusage  => 1,
100
    hkeys   => 1,	 
Mike Hibler's avatar
Mike Hibler committed
101
);
102
103
104
105

#
# Forward declarations for prototype checking
#
Mike Hibler's avatar
Mike Hibler committed
106
107
108
109
110
sub setintervals($);
sub sendisalive($);
sub ntpdrift($);
sub runcvsup($);
sub sendrusage($);
111
sub sendhkeys($);
Mike Hibler's avatar
Mike Hibler committed
112
sub logmsg($);
113
114
115
116
117
118
119
120
121
122
123
124

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (defined($options{"d"})) {
    $debug = 1;
}
Mike Hibler's avatar
Mike Hibler committed
125
126
127
128
129
130
if (defined($options{"v"})) {
    $verbose = 1;
}
if (defined($options{"F"})) {
    $fakeit = 1;
}
131
if (@ARGV) {
132
133
134
135
136
    $action = $ARGV[0];

    if (@ARGV != 1 || ($action ne "start" && $action ne "stop")) {
	usage();
    }
137
138
139
140
141
142
143
144
145
146
}

#
# Must be root.
# 
if ($UID != 0) {
    die("*** $0:\n".
	"    Must be root to run this script!\n");
}

147
148
149
150
151
152
153
154
155
156
157
158
#
# For stop, look to see if the pid file exists. If so, kill it and exit.
#
if ($action eq "stop") {
    if (! -e $pidfile) {
	exit(0);
    }
    system("kill `cat $pidfile`");
    sleep(1);
    exit($? >> 8);
}

159
160
161
162
163
164
165
166
#
# Put this into the background and log its output. We *must* do this cause
# we do not want to halt the boot if the testbed is down!
# 
if (!$debug && TBBackGround($logname)) {
    #
    # Parent exits normally
    #
167
    select(undef, undef, undef, 0.25);
168
169
170
    exit(0);
}

171
172
173
174
175
176
177
178
#
# Write our pid into the pid file so we can be killed later. We must
# do this first so that we can be killed before we change the sig
# handlers.
#
system("echo '$PID' > $pidfile") == 0
    or die("Could not create $pidfile!");

179
180
181
182
183
184
185
186
187
#
# Setup a handler to catch TERM, and kill our process group.
#
my $pgrp = getpgrp(0);

sub handler () {
    $SIG{TERM} = 'IGNORE';
    $SIG{INT} = 'IGNORE';
    kill('TERM', -$pgrp);
188
    unlink($pidfile);
189
190
191
192
193
194
    sleep(5);
    exit(0);
}
$SIG{TERM} = \&handler;
$SIG{INT}  = \&handler;

195
196
197
#
# If jailed, get our jailname. 
#
198
if (JAILED() || PLAB()) {
Mike Hibler's avatar
Mike Hibler committed
199
    my $vnodeid = libsetup_getvnodeid();
200
201
202
    # Tell the tmcc library. Note that its actually been done via libsetup
    # but I duplicate it here to make it explicit.
    configtmcc("subnode", $vnodeid);
203
204
}

205
#
Mike Hibler's avatar
Mike Hibler committed
206
207
208
209
# XXX plab UDP calls sometimes fail with EINVAL when reading a reply,
#     combat this by forcing the last retry of a failing call to use TCP
#     in the plab service slice.  Maybe we should do this for all plab
#     slices...
Mike Hibler's avatar
Mike Hibler committed
210
211
# XXX ok, really bad idea.  This just causes all our tmcd processes to
#     get hung with open connections to flaky plab machines
212
#
Mike Hibler's avatar
Mike Hibler committed
213
#$trytcp = 1 if (PLABSVC());
214
215
216
217
218
219

#
# For sending back ntpdrift.
# 
if (-e "/etc/ntp.drift") {
    $driftfile = "/etc/ntp.drift";
Mike Hibler's avatar
Mike Hibler committed
220
} elsif (-e "/etc/ntp/drift") {
221
222
223
    $driftfile = "/etc/ntp/drift";
}

224
225
226
#
# Initial drift value, we only update if it changes
#
Mike Hibler's avatar
Mike Hibler committed
227
228
229
230
231
232
233
234
235
236
237
238
if (defined($driftfile)) {
    $lastdrift = `cat $driftfile`;
    chomp($lastdrift);
}

#
# Retry state for failed tmcc calls
#
$retry{check} = 0;
$retry{isalive} = 0;
$retry{drift} = 0;
$retry{rusage} = 0;
239
$retry{hkeys} = 0;
Mike Hibler's avatar
Mike Hibler committed
240
241
242
243
244
245
246

$curtime = time();
if ($fakeit) {
    logmsg("Faking it\n");
} else {
    logmsg("Dogging it\n");
}
247

248
#
Mike Hibler's avatar
Mike Hibler committed
249
250
251
252
253
254
255
256
257
# Set our initial interval values.
# This will queue the interval check.
#
my $firsttime = 1;
setintervals($curtime);
$firsttime = 0;

#
# Loop, sleeping and then processing events
258
259
# 
while (1) {
Mike Hibler's avatar
Mike Hibler committed
260
261
262
    my ($nexttime, $event);

    $curtime = time();
263

Mike Hibler's avatar
Mike Hibler committed
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
    qhead($nexttime, $event) == 0 or
	die("All timeouts disabled at $date!");

    while ($curtime >= $nexttime) {
	qpop($nexttime, $event);
	&$event($curtime);
	qhead($nexttime, $event) == 0 or
	    die("All timeouts disabled at $date!");
	$curtime = time();
    }

    sleep($nexttime - $curtime);
}

exit(0);

sub sendisalive($)
{
    my ($curtime) = @_;
283
    
Mike Hibler's avatar
Mike Hibler committed
284
285
286
287
288
289
290
    if ($fakeit) {
	my $delta = $curtime - $faketimes{isalive};
	$faketimes{isalive} = $curtime;
	logmsg("sendisalive at +$delta\n");
	qinsert($curtime + $iv{isalive}, \&sendisalive) if ($iv{isalive});
	return;
    }
291

Mike Hibler's avatar
Mike Hibler committed
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
    if ($verbose) {
	if ($retry{isalive} == 0) {
	    logmsg("isalive: sending\n");
	} else {
	    logmsg("isalive: resending, retry=$retry{isalive}\n");
	}
    }

    my %tmccargs = ();
    $tmccargs{timeout} = 3;
    $tmccargs{useudp} = 1
	if (!$trytcp || $retry{isalive} != $maxretries);

    my @tmccresults;
    if (tmcc(TMCCCMD_ISALIVE, undef, \@tmccresults, %tmccargs) != 0 ||
	scalar(@tmccresults) == 0) {
	#
	# Failed, schedule a retry using a backoff.
	#
	if ($retry{isalive} < $maxretries) {
	    my $nexttime = time() + (1 << $retry{isalive});
	    qinsert($nexttime, \&sendisalive);
	    $retry{isalive}++;
	    logmsg("isalive: failed ($?), retry $retry{isalive}\n");
	    return;
	}
	#
	# Failed miserably, just whine and reschedule at the normal time.
	#
	logmsg("isalive: failed ($?) after $maxretries attempts\n");
    } else {
	#
	# Success.  The format of the response is rather simple right now.
	# Note: if the update failed last time, run it no matter what.
	#
	logmsg("isalive: succeeded after $retry{isalive} retries\n")
	    if ($retry{isalive});
	if ($updatefailed ||
	    $tmccresults[0] =~ /^UPDATE=1$/) {
	    logmsg("isalive: running an account update\n");
	    system("$BINDIR/update -i -l");
	    $updatefailed = $?;
	    logmsg("isalive: update done\n");
	}
    }
337

338
    #
Mike Hibler's avatar
Mike Hibler committed
339
340
    # Set up for another interval.
    # Since the tmcc call and update can take awhile, we update curtime
341
    #
Mike Hibler's avatar
Mike Hibler committed
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
    $retry{isalive} = 0;
    $curtime = time();
    qinsert($curtime + $iv{isalive}, \&sendisalive)
	if ($iv{isalive});
}

sub setintervals($)
{
    my ($curtime) = @_;
    my $report = 0;

    if ($fakeit) {
	$iv{check} = 7;
	$iv{isalive} = 3;
	$iv{drift} = 9;
	$iv{cvsup} = 21;
	$iv{rusage} = 15;

	my $delta = $curtime - $faketimes{check};
	$faketimes{check} = $curtime;

	logmsg("setintervals at +$delta\n");
	qinsert($curtime + $iv{check}, \&setintervals) if ($iv{check});
	return;
    }

    if ($verbose) {
	if ($retry{check} == 0) {
	    logmsg("setintervals: fetching intervals\n");
	} else {
	    logmsg("setintervals: refetching intervals, retry=$retry{check}\n");
373
374
	}
    }
Mike Hibler's avatar
Mike Hibler committed
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403

    # XXX fake an argument to force request to TMCD and avoid the cache
    my $arg = "foo";

    my %tmccargs = ();
    $tmccargs{timeout} = 3;
    $tmccargs{useudp} = 1
	if (!$trytcp || $retry{check} != $maxretries);

    my @tmccresults;
    if (tmcc(TMCCCMD_WATCHDOGINFO, $arg, \@tmccresults, %tmccargs) != 0 ||
	scalar(@tmccresults) == 0) {
	#
	# Failed, schedule a retry using a backoff.
	#
	if ($retry{check} < $maxretries) {
	    my $nexttime = time() + (1 << $retry{check});
	    qinsert($nexttime, \&setintervals);
	    $retry{check}++;
	    logmsg("setintervals: failed ($?), retry $retry{check}\n");
	    return;
	}
	#
	# Failed miserably, just whine and reschedule at the normal time.
	#
	logmsg("setintervals: failed ($?) after $maxretries attempts, ".
	       "using current values\n");
	$report = 1;
    } else {
404
	#
Mike Hibler's avatar
Mike Hibler committed
405
	# Success.
406
	#
Mike Hibler's avatar
Mike Hibler committed
407
408
409
410
411
412
413
414
415
	logmsg("setintervals: succeeded after $retry{check} retries\n")
	    if ($retry{check});

	my %oiv;
	$oiv{check} = $iv{check};
	$oiv{isalive} = $iv{isalive};
	$oiv{drift} = $iv{drift};
	$oiv{cvsup} = $iv{cvsup};
	$oiv{rusage} = $iv{rusage};
416
	$oiv{hkeys} = $iv{hkeys};
Mike Hibler's avatar
Mike Hibler committed
417
418

	if ($tmccresults[0] =~
419
	    /INTERVAL=(-?\d+) ISALIVE=(-?\d+) NTPDRIFT=(-?\d+) CVSUP=(-?\d+) RUSAGE=(-?\d+) HOSTKEYS=(-?\d+)/) {
Mike Hibler's avatar
Mike Hibler committed
420
421
422
423
424
425
426
427
428
429
	    $iv{check} = $1
		if ($1 >= 0);
	    $iv{isalive} = $2
		if ($2 >= 0);
	    $iv{drift} = $3
		if ($3 >= 0);
	    $iv{cvsup} = $4
		if ($4 >= 0);
	    $iv{rusage} = $5
		if ($5 >= 0);
430
431
	    $iv{hkeys} = $6
		if ($6 >= 0);
Mike Hibler's avatar
Mike Hibler committed
432
433
434
	}

	#
435
	# MFS nodes only report isalive
Mike Hibler's avatar
Mike Hibler committed
436
	#
437
	if (MFS()) {
Mike Hibler's avatar
Mike Hibler committed
438
439
440
	    $iv{drift} = 0;
	    $iv{cvsup} = 0;
	    $iv{rusage} = 0;
441
	    $iv{hkeys} = 0;
Mike Hibler's avatar
Mike Hibler committed
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
	}

	foreach my $key (keys %iv) {
	    if ($firsttime || $iv{$key} != $oiv{$key}) {
		$report = 1;

		#
		# Special handling of ourselves:
		# warn if future checks are disabled.
		#
		if ($key eq "check") {
		    if ($iv{$key} == 0) {
			logmsg("setintervals: ".
			       "WARNING interval checks disabled!\n");
		    } else {
			logmsg("setintervals: scheduling $key\n");
		    }
		    next;
		}

		if ($iv{$key} == 0) {
		    logmsg("setintervals: descheduling $key\n");
		    qdelete($funcs{$key});
		} elsif ($firsttime || $oiv{$key} == 0) {
Mike Hibler's avatar
Mike Hibler committed
466
		    #
467
468
		    # Some commands need to be run at boottime,
		    # schedule their first run immediately.
Mike Hibler's avatar
Mike Hibler committed
469
		    #
470
		    if ($firsttime && $immediate{$key}) {
471
			logmsg("setintervals: scheduling $key now\n");
Mike Hibler's avatar
Mike Hibler committed
472
473
			qinsert($curtime, $funcs{$key});
		    } else {
474
			logmsg("setintervals: scheduling $key\n");
Mike Hibler's avatar
Mike Hibler committed
475
476
			qinsert($curtime + $iv{$key}, $funcs{$key});
		    }
Mike Hibler's avatar
Mike Hibler committed
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
		} else {
		    #
		    # To reschedule an already existing event,
		    # we recompute when it was last scheduled and
		    # add the new interval to that.  If the result
		    # is before the current time, we set it to the
		    # current time so it will trigger immediately.
		    #
		    my $ntime = qfind($funcs{$key});
		    if (defined($ntime)) {
			$ntime -= $oiv{$key};
			$ntime += $iv{$key};
			$ntime = $curtime
			    if ($ntime < $curtime);
		    } else {
			$ntime = $curtime;
		    }
		    logmsg("setintervals: rescheduling $key at $ntime ".
			   "(now=$curtime)\n");
		    qinsert($ntime, $funcs{$key});
		}
	    }
	}
    }
	
    if ($report) {
	logmsg("setintervals: check=$iv{check}, isalive=$iv{isalive}, ".
504
505
	       "drift=$iv{drift}, cvsup=$iv{cvsup}, rusage=$iv{rusage}, ".
	       "hostkeys=$iv{hkeys}\n");
Mike Hibler's avatar
Mike Hibler committed
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
    }

    #
    # Set up for another interval.
    # Since the tmcc call can take awhile, we update curtime
    #
    $retry{check} = 0;
    $curtime = time();
    qinsert($curtime + $iv{check}, \&setintervals)
	if ($iv{check});
}

sub ntpdrift($)
{
    my ($curtime) = @_;
    
    if ($fakeit) {
	my $delta = $curtime - $faketimes{drift};
	$faketimes{drift} = $curtime;
	logmsg("ntpdrift at +$delta\n");
	qinsert($curtime + $iv{drift}, \&ntpdrift) if ($iv{drift});
	return;
    }

    logmsg("ntpdrift: reporting NTP drift\n")
	if ($verbose);

    my $drift = `cat $driftfile`;
    chomp($drift);
    
    if ($drift ne $lastdrift && $drift =~ /^([-\d\.]*)$/) {
	logmsg("ntpdrift: updating NTP drift from $lastdrift to $drift\n");

	# Server also checks the value for sanity.
	tmcc(TMCCCMD_NTPDRIFT, $1, undef, ("timeout" => 3));
	$lastdrift = $drift;
542
    }
Mike Hibler's avatar
Mike Hibler committed
543
544
545

    qinsert($curtime + $iv{drift}, \&ntpdrift)
	if ($iv{drift});
546
547
548
}

#
Mike Hibler's avatar
Mike Hibler committed
549
550
# Do a cvsup to get updated software.
# XXX fork this off?
551
#
Mike Hibler's avatar
Mike Hibler committed
552
sub runcvsup($)
553
{
Mike Hibler's avatar
Mike Hibler committed
554
555
556
557
558
559
560
    my ($curtime) = @_;
    
    if ($fakeit) {
	my $delta = $curtime - $faketimes{cvsup};
	$faketimes{cvsup} = $curtime;
	logmsg("runcvsup at +$delta\n");
	qinsert($curtime + $iv{cvsup}, \&runcvsup) if ($iv{cvsup});
561
562
	return;
    }
Mike Hibler's avatar
Mike Hibler committed
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579

    logmsg("runcvsup: checking for software updates\n");

    system("$BINDIR/runcvsup.sh");

    logmsg("runcvsup: software updates done\n");

    # cvsup can take awhile so update curtime
    $curtime = time();

    qinsert($curtime + $iv{cvsup}, \&runcvsup)
	if ($iv{cvsup});
}

sub sendrusage($)
{
    my ($curtime) = @_;
580
    
Mike Hibler's avatar
Mike Hibler committed
581
582
583
584
585
586
587
    if ($fakeit) {
	my $delta = $curtime - $faketimes{rusage};
	$faketimes{rusage} = $curtime;
	logmsg("sendrusage at +$delta\n");
	qinsert($curtime + $iv{rusage}, \&sendrusage) if ($iv{rusage});
	return;
    }
588

Mike Hibler's avatar
Mike Hibler committed
589
590
591
592
593
594
595
    if ($verbose) {
	if ($retry{rusage} == 0) {
	    logmsg("rusage: sending\n");
	} else {
	    logmsg("rusage: resending, retry=$retry{rusage}\n");
	}
    }
596

Mike Hibler's avatar
Mike Hibler committed
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
    #
    # Collect the stats
    #
    if ($retry{rusage} == 0) {
	if (! -x $rusagebin) {
	    logmsg("rusage: no $rusagebin\n");
	    goto resched;
	}
	$rusagestr = `$rusagebin 2>>$LOGDIR/emulab-rusage.log`;
	if ($?) {
	    logmsg("rusage: $rusagebin failed ($?)\n");
	    goto resched;
	}
	chomp $rusagestr;
	if ($rusagestr !~ /LA1=[\d\.]+ LA5=[\d\.]+ LA15=[\d\.]+ DUSED=[\d\.]+/) {
	    logmsg("rusage: $rusagebin returns gobbledy-gook: $rusagestr\n");
	    goto resched;
	}
    }
    logmsg("rusage: sending: $rusagestr\n")
	if ($verbose);

619
620
621
622
623
624
625
626
    my %tmccargs = ();
    $tmccargs{timeout} = 3;
    $tmccargs{useudp} = 1
	if (!$trytcp || $retry{rusage} != $maxretries);

    my @tmccresults;
    if (tmcc(TMCCCMD_RUSAGE, $rusagestr, \@tmccresults, %tmccargs) != 0 ||
	scalar(@tmccresults) == 0) {
Mike Hibler's avatar
Mike Hibler committed
627
628
629
630
631
632
633
634
635
636
	#
	# Failed, schedule a retry using a backoff.
	#
	if ($retry{rusage} < $maxretries) {
	    my $nexttime = time() + (1 << $retry{rusage});
	    qinsert($nexttime, \&sendrusage);
	    $retry{rusage}++;
	    logmsg("rusage: failed ($?), retry $retry{rusage}\n");
	    return;
	}
637
	#
Mike Hibler's avatar
Mike Hibler committed
638
	# Failed miserably, just whine and reschedule at the normal time.
639
	#
640
	logmsg("rusage: failed ($?) after $maxretries attempts\n");
Mike Hibler's avatar
Mike Hibler committed
641
642
    } else {
	#
643
644
	# Success.  The format of the response is rather simple right now.
	# Note: if the update failed last time, run it no matter what.
Mike Hibler's avatar
Mike Hibler committed
645
	#
646
	logmsg("rusage: succeeded after $retry{rusage} retries\n")
Mike Hibler's avatar
Mike Hibler committed
647
	    if ($retry{rusage});
648
649
650
651
652
653
654
	if ($updatefailed ||
	    $tmccresults[0] =~ /^UPDATE=1$/) {
	    logmsg("rusage: running an account update\n");
	    system("$BINDIR/update -i -l");
	    $updatefailed = $?;
	    logmsg("rusage: update done\n");
	}
Mike Hibler's avatar
Mike Hibler committed
655
    }
656

Mike Hibler's avatar
Mike Hibler committed
657
658
659
660
661
662
663
664
665
666
resched:
    #
    # Set up for another interval.
    # Since the tmcc call and update can take awhile, we update curtime
    #
    $retry{rusage} = 0;
    $curtime = time();
    qinsert($curtime + $iv{rusage}, \&sendrusage)
	if ($iv{rusage});
}
667

668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
sub sendhkeys($)
{
    my ($curtime) = @_;
    
    if ($fakeit) {
	my $delta = $curtime - $faketimes{hkeys};
	$faketimes{hkeys} = $curtime;
	logmsg("sendhkeys at +$delta\n");
	qinsert($curtime + $iv{hkeys}, \&sendhkeys) if ($iv{hkeys});
	return;
    }

    if ($verbose) {
	if ($retry{hkeys} == 0) {
	    logmsg("hostkeys: sending\n");
	} else {
	    logmsg("hostkeys: resending, retry=$retry{hkeys}\n");
	}
    }

    if (! -d $keydir) {
	logmsg("hostkeys: no SSH key directory $keydir\n");
	$iv{hkeys} = 0;
	return;
    }

    my $arg="";
    foreach my $kname (@keylist) {
	my $kpath = "$keydir/$kname.pub";
	next
	    if (! -r $kpath);

	my $key = `cat $kpath`;
	logmsg("hostkeys: could not read keyfile $kpath\n")
	    if ($?);
	chomp($key);
	$kname =~ tr/a-z/A-Z/;

	$arg .= "$kname='$key' ";
    }

    if ($arg ne "") {
710
	print "hostkeys: $arg\n"
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
	    if ($debug);

	my %tmccargs = ();
	$tmccargs{timeout} = 3;
	# send these with TCP for now
	#$tmccargs{useudp} = 1 if (!$trytcp || $retry{hkeys} != $maxretries);

	if (tmcc(TMCCCMD_HOSTKEYS, $arg, undef, %tmccargs) != 0) {
	    #
	    # Failed, schedule a retry using a backoff.
	    #
	    if ($retry{hkeys} < $maxretries) {
		my $nexttime = time() + (1 << $retry{hkeys});
		qinsert($nexttime, \&sendhkeys);
		$retry{hkeys}++;
		logmsg("hostkeys: failed ($?), retry $retry{hkeys}\n");
		return;
	    }
	    #
	    # Failed miserably, just whine and reschedule at the normal time.
	    #
	    logmsg("hostkeys: failed ($?) after $maxretries attempts\n");
	} else {
	    logmsg("hostkeys: succeeded after $retry{hkeys} retries\n")
		if ($retry{hkeys});
	}
    }

    #
    # Set up for another interval.
    # Since the tmcc call and update can take awhile, we update curtime
    #
    $retry{hkeys} = 0;
    $curtime = time();
    qinsert($curtime + $iv{hkeys}, \&sendhkeys)
	if ($iv{hkeys});
}

Mike Hibler's avatar
Mike Hibler committed
749
750
751
sub logmsg($)
{
    my ($msg) = @_;
752

Mike Hibler's avatar
Mike Hibler committed
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
    print strftime("%b %e %H:%M:%S", localtime)." watchdog[$$]: $msg";
}

#
# The following are lifted from stated's TimeoutQueue package
# Replicated to avoid excess dependencies
#

@q = (); # The queue
%i = (); # The index

#
# qinsert($timeout,$obj) - returns 0
#   Insert an object.  Object must not already be in the list.
#
sub qinsert {
    my ($timeout, $obj) = @_;
    if (defined($i{$obj})) {
	# Already in there... take it out
	qdelete($obj);
    }
    my $loc = qsearch($timeout,0);
    my @l = ($timeout,$obj);
    splice(@q,$loc,0,\@l);
    $i{$obj} = $timeout;
    return 0;
}

#
# qdelete($obj)		 - returns 0, or 1 if not found
#   Delete an object
#
sub qdelete {
    my ($obj) = @_;
    if (!defined($i{$obj})) {
	return 1;
    }
    my $timeout = $i{$obj};
    my $n=qsearch($timeout,1);
    my $end = @q+0;
    while (1) {
	$o = ${$q[$n]}[1];
	if ($o eq $obj) {
	    splice(@q,$n,1);
	    last;
798
	}
Mike Hibler's avatar
Mike Hibler committed
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
	$n++;
	if ($n > $end) { return 1;}
    }
    delete $i{$obj};
    return 0;
}

#
# qhead(\$timeout,\$obj) - returns 0, or 1 if not found
#   Look at the head item
#
sub qhead {
    if (@q+0 == 0) { $_[0]=undef; $_[1]=undef; return 1; }
    $_[0] = ${$q[0]}[0];
    $_[1] = ${$q[0]}[1];
    return 0;
}

#
# qpop(\$timeout,\$obj)	 - returns 0, or 1 if empty
#   Remove and return the head item
#
sub qpop {
    if (@q+0 == 0) { $_[0]=undef; $_[1]=undef; return 1; }
    $_[0] = ${$q[0]}[0];
    $_[1] = ${$q[0]}[1];
    shift(@q);
    delete $i{$_[1]};
    return 0;
}

#
# qfind($obj)		 - returns timeout, or undef if not found
#   Find the timeout for an item
#
sub qfind {
    my ($obj) = @_;
    return $i{$obj};
}

#
# qsearch($timeout,$first) - returns index
#   Find the index in @q where ($first ? $timout starts : $timeout ends)
#
sub qsearch {
    my ($timeout,$first) = @_;
    return qbinsearch($timeout,0,@q+0,$first);
}

#
# qbinsearch($timeout,$min,$max,$first) - returns index
#   Find the index in @q where ($first ? $timout starts : $timeout ends)
#
sub qbinsearch {
    my ($timeout,$min,$max,$first) = @_;
    # Implement a binary search
    my $len = $max - $min;
    my $mid = $min + int($len/2);
    if ($len < 1) { return $mid; }
    my $val = ${$q[$mid]}[0];
    if ($first) {
	if ($val >= $timeout) { return qbinsearch($timeout,$min,$mid,$first); }
	else { return qbinsearch($timeout,$mid+1,$max,$first); }
    } else {
	if ($val > $timeout) { return qbinsearch($timeout,$min,$mid,$first); }
	else { return qbinsearch($timeout,$mid+1,$max,$first); }
865
866
    }
}