#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#
use Getopt::Std;
use English;
use Errno;
use POSIX qw(strftime);

#
# The Emulab watchdog. Currently, not really much of a watchdog. Simply
# contacts tmcd to find out if it needs to do an update.
#
sub usage()
{
    # Print the command line synopsis on stdout and terminate the script
    # with exit status 1; this never returns to the caller.
    print "Usage: watchdog [-d] [-t timeout] [start | stop]\n";
    exit(1);
}
# Option letters handed to getopts: -t takes a value, -d is a plain flag.
my $optlist = 't:d';

#
# Flush output after every write instead of buffering it.
#
$OUTPUT_AUTOFLUSH = 1;

# Pull in the emulab path definitions at compile time so that the emulab
# stuff can be found.
BEGIN {
    require "/etc/emulab/paths.pm";
    emulabpaths->import();
}

#
# Load the OS independent support library. It will load the OS dependent
# library and initialize itself.
#
use libsetup;
use libtmcc;

# Locals
my $action	= "start";		# Either "start" or "stop".
my $timeout	= (60 * 60 * 12);	# In seconds of course.
my $logname	= "$LOGDIR/emulab-watchdog.debug";
my $pidfile	= "/var/run/emulab-watchdog.pid";
my $debug	= 0;

#
# Seconds to wait between isalive reports. When REMOTE() is 1 the wait is
# 600 on plab and 60 otherwise; in every other case it is 600 when jailed
# and 180 otherwise.
#
my $isalivewait = ((REMOTE() == 1)
		   ? (PLAB()   ? 600 : 60)
		   : (JAILED() ? 600 : 180));
my $driftfile;				# Path of the ntp drift file, if any.
my $vnodeid;				# Only set when JAILED() or PLAB().

#
# Forward declarations for prototype checking
#
sub startisalive();

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (defined($options{"t"})) {
    $timeout = $options{"t"};

    # The value is handed to sleep(). Anything that is not a whole number
    # of seconds would be taken as zero and make the main loop spin.
    if ($timeout !~ /^\d+$/) {
	usage();
    }
}
if (defined($options{"d"})) {
    $debug = 1;
}
if (@ARGV) {
    $action = $ARGV[0];

    # At most one action word is accepted, and it must be start or stop.
    if (@ARGV != 1 || ($action ne "start" && $action ne "stop")) {
	usage();
    }
}

#
# Must be root. $UID is the English.pm name for the real uid of the process.
#
if ($UID != 0) {
    die("*** $0:\n".
	"    Must be root to run this script!\n");
}

#
# For stop, look to see if the pid file exists. If so, kill it and exit.
# The exit status is 0 when the TERM signal was delivered (or there was no
# pid file at all) and 1 otherwise.
#
if ($action eq "stop") {
    if (! -e $pidfile) {
	exit(0);
    }

    # Read the pid ourselves rather than handing the file to a shell, so
    # that an empty or garbled pid file is reported instead of being passed
    # along to kill.
    my $pidline = "";
    if (open(my $pidfh, "<", $pidfile)) {
	$pidline = <$pidfh>;
	close($pidfh);
    }

    # Insist on a positive number; pid 0 would signal our own process group.
    if (!defined($pidline) || $pidline !~ /^\s*([1-9]\d*)\s*$/) {
	print STDERR "*** $0:\n".
	    "    No valid pid found in $pidfile!\n";
	exit(1);
    }
    my $status = (kill('TERM', $1) ? 0 : 1);

    # Presumably this gives the daemon a moment to clean up before we
    # return to the caller.
    sleep(1);
    exit($status);
}

#
# Put this into the background and log its output. We *must* do this cause
# we do not want to halt the boot if the testbed is down!
#
# In debug mode (-d) this step is skipped and we stay in the foreground.
#
if (!$debug && TBBackGround($logname)) {
    #
    # Parent exits normally
    #
    exit(0);
}

#
# Write our pid into the pid file so we can be killed later. We must
# do this first so that we can be killed before we change the sig
# handlers.
#
# The file is written directly instead of through a shell; it holds the
# pid followed by a newline.
#
open(my $pidfh, ">", $pidfile)
    or die("Could not create $pidfile!");
print $pidfh "$PID\n";
close($pidfh)
    or die("Could not create $pidfile!");

#
# Setup a handler to catch TERM, and kill our process group.
#
my $pgrp = getpgrp(0);

sub handler () {
    # Ignore further TERM/INT before signalling: the kill below goes to our
    # whole process group, and that includes this process.
    $SIG{TERM} = 'IGNORE';
    $SIG{INT} = 'IGNORE';
    kill('TERM', -$pgrp);
    unlink($pidfile);

    # Presumably this leaves the rest of the group time to exit.
    sleep(5);
    exit(0);
}
$SIG{TERM} = \&handler;
$SIG{INT}  = \&handler;

#
# If jailed (or on plab), get our jailname.
#
if (JAILED() || PLAB()) {
    $vnodeid = libsetup_getvnodeid();

    # Tell the tmcc library. Note that it has actually been done via
    # libsetup, but I duplicate it here to make it explicit.
    configtmcc("subnode", $vnodeid);
}

#
# Launch the isalive daemon before settling into the main loop.
#
startisalive();

#
# Locate the ntp drift file, for sending back ntpdrift. The first
# candidate that exists wins; $driftfile stays undefined if neither does.
#
foreach my $candidate ("/etc/ntp.drift", "/etc/ntp/drift") {
    if (-e $candidate) {
	$driftfile = $candidate;
	last;
    }
}

#
# Loop! Each pass sleeps for $timeout seconds and then runs the periodic
# chores below.
#
while (1) {
    sleep($timeout);

    # %Y gives the full year, rather than gluing a fixed "20" onto %y.
    my $date = POSIX::strftime("%Y/%m/%d %H:%M:%S", localtime());

    print "Dogging it at $date\n";

    #
    # Run account update. Use immediate mode so that it exits right away
    # if the lock is taken (another update already running).
    #
    print "Looking for new Emulab accounts ...\n";
    system("update -i");

    # Skip all this stuff in jail.
    next
	if (JAILED() || PLAB());

    #
    # Send back ntpdrift info. Should move elsewhere.
    #
    if (!REMOTE() && !MFS() && defined($driftfile)) {
	my $drift = `cat $driftfile`;
	if ($drift =~ /^([-\d\.]*)$/) {
	    # Server also checks the value for sanity.
	    tmcc(TMCCCMD_NTPDRIFT, $1, undef, ("timeout" => 3));
	}
    }
    if (REMOTE()) {
	#
	# Do a cvsup to get updated software.
	#
	print "Looking for software updates ... \n";
	system("runcvsup.sh");
    }
}
exit(0);

#
# Fire off a child that does nothing but tell the boss we are alive.
#
# Forks once. The parent returns right away; the child loops forever,
# sending an isalive through tmcc and then sleeping $isalivewait seconds,
# and never returns. If the fork fails, a message is printed and the caller
# carries on without the keep alive child.
#
sub startisalive()
{
    my $mypid = fork();

    # fork returns undef on failure. Without this check the parent would
    # fall into the child's endless loop below and never return.
    if (!defined($mypid)) {
	print STDERR "*** $0:\n".
	    "    Could not fork the keep alive: $!\n";
	return;
    }
    if ($mypid) {
	return;
    }

    # Exit status of the last update run; nonzero forces a rerun.
    my $failed  = 0;

    print "Keep alive starting up ... \n";

    while (1) {
	#
	# Run tmcc in UDP mode.
	# Since it is UDP, we try it a couple of times if it fails.
	#
	my $retries = 3;

	while ($retries) {
	    my @tmccresults;

	    if (tmcc(TMCCCMD_ISALIVE, undef,
		     \@tmccresults, ("timeout" => 3, "useudp" => 1)) == 0 &&
		scalar(@tmccresults)) {
		# %Y gives the full year, rather than a fixed "20" plus %y.
		my $date = POSIX::strftime("%Y/%m/%d %H:%M:%S", localtime());

		#
		# The format of the response is rather simple right now.
		# If the update failed last time, run it no matter what.
		#
		if ($failed ||
		    $tmccresults[0] =~ /^UPDATE=1$/) {
		    print "Running an update at $date ...\n";
		    system("update -i -a");
		    $failed = $?;
		}
		last;
	    }
	    $retries--;
	}

	# $retries only reaches zero when every attempt above failed.
	if (!$retries) {
	    print "keep alive returned $?\n";
	}
	sleep($isalivewait);
    }
    exit(0);
}