node_update.in 7.74 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
#!/usr/bin/perl -wT
use English;
use Getopt::Std;

#
# Update mounts and accounts and anything else after changing the permissions
# for a node. This is intended to be invoked from the web interface after
# adding and/or subtracting pids from the experiment pid access list.
#
# XXX There is an inherent race condition with using this script. What if
# nodes are released while it is running?
12
13
14
#
# The output is all jumbled together since the updates are issued in parallel.
# Might be a pain when debugging. 
15
16
17
18
# 
sub usage()
{
    print STDOUT "Usage: node_update [-b] <pid> <eid>\n".
19
20
	"Update user accounts and NFS mounts on nodes in your project.\n".
	"Use -b to use batch operation (place in background, send email).\n";
21
22
    exit(-1);
}
23
my  $optlist = "be:";
24
25
26
27
28
29
30
31
32
33
  
#
# Configure variables
#
my $TB		= "@prefix@";
my $TESTMODE    = @TESTMODE@;
my $TBOPS       = "@TBOPSEMAIL@";
my $TBLOGS      = "@TBLOGSEMAIL@";

my $ssh		= "$TB/bin/sshtb -n";
34
my $sshremote	= "$TB/bin/sshremote -n";
35
36
my $expsetup    = "$TB/sbin/exports_setup";
my $batchmode   = 0;
37
my $maxchildren = 20;
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75

#
# Load the Testbed support stuff. 
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

# Turn off line buffering on output
$| = 1; 

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 2) {
    usage();
}
my $pid   = $ARGV[0];
my $eid   = $ARGV[1];
if (defined($options{"b"})) {
    $batchmode = 1;
}

#
# Untaint the arguments.
#
if ($pid =~ /^([-\@\w]+)$/) {
    $pid = $1;
}
76
77
78
else {
    die("*** Bad data in pid: $pid\n");
}	
79
80
81
if ($eid =~ /^([-\@\w]+)$/) {
    $eid = $1;
}
82
83
84
else {
    die("*** Bad data in eid: $eid\n");
}	
85
86
87
88
89
90
91
92
93
94
95
96
97

my $user_name;
my $user_email;
my $logname;
my %pids	= ();
my $failed	= 0;
my $dbuid;

#
# We don't want to run this script unless its the real version.
# That is, it must be setuid root. 
#
if ($EUID != 0) {
98
99
    die("*** $0:\n".
	"    Must be root! Maybe its a development version?\n");
100
101
102
103
104
105
}

#
# Verify actual user and get his DB uid.
#
if (! UNIX2DBUID($UID, \$dbuid)) {
106
107
    die("*** $0:\n".
	"    You do not exist in the Emulab Database.\n");
108
109
110
}

if (! UserDBInfo($dbuid, \$user_name, \$user_email)) {
111
112
    die("*** $0:\n".
        "    Cannot determine your name and email address.\n");
113
114
115
116
117
118
}

#
# Verify that this person is allowed to do this. Must be an admin type,
# the experiment creator or the project leader.
#
119
if ($UID && !TBAdmin()) {
120
121
122
123
    my $expt_leader = ExpLeader($pid, $eid);
    my $proj_leader = ProjLeader($pid);

    if (!$expt_leader || !$proj_leader) {
124
125
	die("*** $0:\n".
	    "    No such Experiment $eid or no such Project $pid\n");
126
127
128
    }

    if ($expt_leader ne $dbuid && $proj_leader ne $dbuid) {
129
130
	die("*** $0:\n".
	    "    You must be the experiment creator or the project leader\n");
131
132
133
    }
}

134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#
# We need to lock down the experiment during this. 
#
DBQueryFatal("lock tables experiments write");

if (TBExpLocked($pid, $eid)) {
    DBQueryWarn("unlock tables");
    die("*** $0:\n".
	"    Experiment $pid/$eid is in transition. Please try later!\n");
}

#
# A sanity check. Lets make sure the experiment is in the swapped in
# state so that we are not trying to update nodes that are still booting
# or swapping out, etc. 
#
if (ExpState($pid, $eid) ne EXPTSTATE_ACTIVE) {
    DBQueryWarn("unlock tables");
    die("*** $0:\n".
	"    The experiment $pid/$eid must be fully activated first!\n");
}
TBLockExp($pid, $eid);
DBQueryFatal("unlock tables");

158
159
160
161
#
# Batchmode (as from the web interface) goes to background and reports
# later via email.
# 
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
if ($batchmode) {
    #
    # Create a temporary name for a log file.
    #
    $logname = `mktemp /tmp/node_update-$pid-$eid.XXXXXX`;
    chop($logname);
    
    if (TBBackGround($logname)) {
	#
	# Parent exits normally
	#
	print STDOUT
	    "Node Update for $pid/$eid is now in progress.\n".
	    "You will be notified via email when the is complete.\n";
	exit(0);
    }
}

#
# Currently, we just need to update the mount points. The UID change because
# of PERL sillyness.
#
$UID = $EUID;
if (system("$expsetup")) {
    fatal("Exports Setup Failed");
}
# Give ops a chance to react.
sleep(2);

#
# Get the list of nodes that need to be "updated."
# 
my @nodes = ExpNodes($pid, $eid);
if (! @nodes) {
    fatal("No Nodes in the experiment");
}

#
200
201
202
203
# We want some overlap, but not too much since we could burn up
# a lot processes on wedged nodes. Issue a small number in parallel,
# and wait once we reach the limit for one to finish, before issuing
# the next one.
204
#
205
my $maxpids = 0;
206
foreach my $node ( @nodes ) {
207
    while ($maxpids >= $maxchildren) {
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
	my $thispid  = waitpid(-1, 0);
	my $thisnode = $pids{$thispid};
	
	if ($?) {
	    $failed++;
	    print STDERR "Update of node $thisnode failed!\n";
	}
	else {
	    print STDOUT "$thisnode updated ...\n";
	}

	delete($pids{$thispid});
	$maxpids--;
    }
    my $thispid = UpdateNode($node);
    $pids{$thispid} = $node;
    $maxpids++;
    sleep(1);
226
227
228
}

#
229
# Wait for any remaining children to exit before continuing.
230
#
231
232
foreach my $thispid ( keys(%pids) ) {
    my $node = $pids{$thispid};
233

234
    waitpid($thispid, 0);
235
236
237
238
239
    if ($?) {
	$failed++;
	print STDERR "Update of node $node failed!\n";
    }
    else {
240
	print STDOUT "$node updated ...\n";
241
242
243
    }
}

244
TBUnLockExp($pid, $eid);
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
NotifyUser("Node Update Complete", $failed);
if (defined($logname)) {
    unlink($logname);
}
exit($failed);

#
# Update a node in a child process. Return the pid to the parent so
# that it can wait on all the children later.
# 
sub UpdateNode {
    my($node) = @_;
    my($syspid, $mypid);

    print STDOUT "Updating $node ...\n";

261
262
263
264
265
266
    #
    # We need to know if its a remote or local node, so we know how
    # to update it. This info needs to be in the DB at some point. 
    #
    my($isremote) = TBIsNodeRemote($node);

267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
    $mypid = fork();
    if ($mypid) {
	return $mypid;
    }

    #
    # Run an ssh command in a child process, protected by an alarm to
    # ensure that the ssh is not hung up forever if the machine is in
    # some funky state.
    # 
    $syspid = fork();

    # Must change our real UID to root so that ssh will work.
    $UID = 0;
    
    if ($syspid) {
	local $SIG{ALRM} = sub { kill("TERM", $syspid); };
	alarm 15;
	waitpid($syspid, 0);
	alarm 0;

	print STDERR "update of $node returned $?.\n" if $debug;
    
	#
	# If either ssh is not running or it timed out,
	# send it a ping of death.
	# 
	if ($? == 256 || $? == 15) {
	    if ($? == 256) {
		print STDERR "$node is not running sshd.\n" if $debug;
	    } else {
		print STDERR "$node is wedged.\n" if $debug;
	    }
	    exit(-1);
	}
	exit(0);
    }
    else {
305
306
307
308
309
310
	if ($isremote) {
	    exec("$sshremote $node /usr/local/etc/testbed/update");
	}
	else {
	    exec("$ssh $node /etc/testbed/update");
	}
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
	exit(0);
    }
    exit(0);
}

sub NotifyUser($$)
{
    my($mesg, $iserr) = @_;
    my($subject, $from, $to, $hdrs);

    print STDOUT "$mesg\n";

    if (! $batchmode) {
	return;
    }

    if ($iserr) {
328
	$subject = "Node Update Failed $pid/$eid";
329
330
    }
    else {
331
	$subject = "Node Update Success $pid/$eid";
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
    }
    $from  = $TBOPS;
    $hdrs  = "Reply-To: $TBOPS";
    
    #
    # Message goes to user. If a failure, TBOPS also gets it, otherwise
    # it goes into the logs.
    #
    $to    = "$user_name <$user_email>";    
    
    if ($iserr) {
	$hdrs = "Cc: $TBOPS\n".
	        "$hdrs";
    }
    else {
	$hdrs = "Bcc: $TBLOGS\n".
	        "$hdrs";
    }

351
352
353
354
    #
    # Send a message to the testbed list. Append the logfile.
    #
    SENDMAIL($to, $subject, $mesg, $from, $hdrs, ($logname));
355
356
357
358
359
}

sub fatal($) {
    my($mesg) = @_;

360
    TBUnLockExp($pid, $eid);
361
362
363
364
365
366
367
    NotifyUser($mesg, 1);
    if (defined($logname)) {
	unlink($logname);
    }
    exit(1);
}