#!/usr/bin/perl -wT
use English;
use Getopt::Std;

#
# Update mounts and accounts and anything else after changing the permissions
# for a node. This is intended to be invoked from the web interface after
# adding and/or subtracting pids from the experiment pid access list.
#
# XXX There is an inherent race condition with using this script. What if
# nodes are released while it is running?
#
# The output is all jumbled together since the updates are issued in parallel.
# Might be a pain when debugging.
#
#
# Print the command line synopsis on STDOUT and terminate the script with
# a failure status. Never returns.
#
sub usage()
{
    print STDOUT "Usage: node_update [-b] <pid> <eid>\n".
	"Update user accounts and NFS mounts on nodes in your project.\n".
	"Use -b to use batch operation (place in background, send email).\n";
    exit(-1);
}
# Option string handed to getopts().
# NOTE(review): "e:" accepts an argument, but no code visible in this file
# reads $options{"e"} and usage() does not mention it -- confirm it is needed.
my  $optlist = "be:";
  
#
# Configure variables
#
# The @...@ tokens are placeholders that are substituted when this template
# is turned into the installed script.
#
my $TB		= "@prefix@";
my $TESTMODE    = @TESTMODE@;
my $TBOPS       = "@TBOPSEMAIL@";
my $TBLOGS      = "@TBLOGSEMAIL@";

my $ssh		= "$TB/bin/sshtb -n";
my $expsetup    = "$TB/sbin/exports_setup";
my $batchmode   = 0;
# Upper bound on the number of node updates running at the same time.
my $maxchildren = 20;
# UpdateNode() prints extra diagnostics when this is true. It was previously
# used without ever being declared; declare it here, off by default.
my $debug       = 0;

#
# Load the Testbed support stuff. 
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

# Scrub the environment: drop the variables that influence how a shell
# starts up, then pin PATH to a fixed set of directories.
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';

# Flush output after every write.
$| = 1; 

#
# Process the command line. getopts() consumes the switches, so exactly
# the two required arguments (pid and eid) must remain afterwards.
#
%options = ();
getopts($optlist, \%options)
    or usage();
usage()
    unless (@ARGV == 2);

my ($pid, $eid) = @ARGV;

$batchmode = 1
    if (defined($options{"b"}));

#
# Untaint the arguments. Only word characters, '-' and '@' are accepted;
# the value is replaced by the regex capture so that taint mode (-T)
# treats it as clean from here on.
#
if ($pid =~ /^([-\@\w]+)$/) {
    $pid = $1;
}
else {
    die("*** Bad data in pid: $pid\n");
}
if ($eid =~ /^([-\@\w]+)$/) {
    $eid = $1;
}
else {
    die("*** Bad data in eid: $eid\n");
}

# Name and email address of the invoking user; filled in by UserDBInfo().
my $user_name;
my $user_email;
# Log file used in batch mode; stays undefined otherwise.
my $logname;
# Maps the process id of each running update child to its node name.
my %pids	= ();
# Number of nodes whose update failed.
my $failed	= 0;
# Database uid of the invoking user; filled in by UNIX2DBUID().
my $dbuid;

#
# We don't want to run this script unless its the real version.
# That is, it must be setuid root. 
#
if ($EUID != 0) {
    die("*** $0:\n".
	"    Must be root! Maybe its a development version?\n");
}

#
# Verify actual user and get his DB uid.
#
if (! UNIX2DBUID($UID, \$dbuid)) {
    die("*** $0:\n".
	"    You do not exist in the Emulab Database.\n");
}

if (! UserDBInfo($dbuid, \$user_name, \$user_email)) {
    die("*** $0:\n".
        "    Cannot determine your name and email address.\n");
}

#
# Verify that this person is allowed to do this. Must be an admin type,
# the experiment creator or the project leader. A real UID of zero skips
# the check entirely.
#
if ($UID && !TBAdmin()) {
    my $expt_leader = ExpLeader($pid, $eid);
    my $proj_leader = ProjLeader($pid);

    # A false value from either lookup is treated as "no such
    # experiment/project".
    if (!$expt_leader || !$proj_leader) {
	die("*** $0:\n".
	    "    No such Experiment $eid or no such Project $pid\n");
    }

    if ($expt_leader ne $dbuid && $proj_leader ne $dbuid) {
	die("*** $0:\n".
	    "    You must be the experiment creator or the project leader\n");
    }
}

#
# The experiment must be locked down for the duration of the update. The
# experiments table is write-locked while the experiment lock is tested
# and taken, and released again on every path out of this section.
#
DBQueryFatal("lock tables experiments write");

my $refusal;
if (TBExpLocked($pid, $eid)) {
    $refusal = "    Experiment $pid/$eid is in transition. Please try later!\n";
}
elsif (ExpState($pid, $eid) ne EXPTSTATE_ACTIVE) {
    #
    # Sanity check: only a swapped in experiment may be updated, so that
    # we never touch nodes that are still booting or swapping out, etc.
    #
    $refusal = "    The experiment $pid/$eid must be fully activated first!\n";
}

if (defined($refusal)) {
    DBQueryWarn("unlock tables");
    die("*** $0:\n" . $refusal);
}

TBLockExp($pid, $eid);
DBQueryFatal("unlock tables");

#
# Batchmode (as from the web interface) goes to background and reports
# later via email.
#
if ($batchmode) {
    #
    # Create a temporary name for a log file. The output of mktemp is
    # checked and untainted: backtick output is tainted under -T, and
    # the name is later handed to unlink().
    #
    my $tmpname = `mktemp /tmp/node_update-$pid-$eid.XXXXXX`;
    chomp($tmpname)
	if (defined($tmpname));

    if ($? == 0 && defined($tmpname) &&
	$tmpname =~ m{^(/tmp/node_update-[-\@\w]+\.\w+)$}) {
	$logname = $1;
    }
    else {
	# The experiment is already locked at this point; release it
	# before giving up so it is not left stuck.
	TBUnLockExp($pid, $eid);
	die("*** $0:\n".
	    "    Could not create a temporary log file!\n");
    }
    
    if (TBBackGround($logname)) {
	#
	# Parent exits normally
	#
	print STDOUT
	    "Node Update for $pid/$eid is now in progress.\n".
	    "You will be notified via email when the update is complete.\n";
	exit(0);
    }
}

#
# For now the only global step is refreshing the mount points. The real
# UID is set to the effective UID first because of PERL sillyness.
#
$UID = $EUID;
fatal("Exports Setup Failed")
    if (system($expsetup) != 0);

# Give ops a chance to react.
sleep(2);

#
# Collect the nodes that need to be "updated"; an experiment without
# nodes is an error.
# 
my @nodes = ExpNodes($pid, $eid);
fatal("No Nodes in the experiment")
    unless (@nodes);

#
# We want some overlap, but not too much since we could burn up
# a lot processes on wedged nodes. Issue a small number in parallel,
# and wait once we reach the limit for one to finish, before issuing
# the next one.
#
# $maxpids counts the update children currently outstanding.
#
my $maxpids = 0;
foreach my $node ( @nodes ) {
    while ($maxpids >= $maxchildren) {
	# At the limit: block until any one child finishes.
	my $thispid  = waitpid(-1, 0);
	my $thisnode = $pids{$thispid};
	
	if ($?) {
	    $failed++;
	    print STDERR "Update of node $thisnode failed!\n";
	}
	else {
	    print STDOUT "$thisnode updated ...\n";
	}

	delete($pids{$thispid});
	$maxpids--;
    }
    my $thispid = UpdateNode($node);
    $pids{$thispid} = $node;
    $maxpids++;
    # Stagger the launches by a second.
    sleep(1);
}

#
# Wait for any remaining children to exit before continuing. A non-zero
# wait status counts the node as failed.
#
foreach my $thispid ( keys(%pids) ) {
    my $node = $pids{$thispid};

    waitpid($thispid, 0);
    if ($?) {
	$failed++;
	print STDERR "Update of node $node failed!\n";
    }
    else {
	print STDOUT "$node updated ...\n";
    }
}

#
# All done: release the experiment lock, report the outcome and remove
# the batch mode log file if there is one.
#
TBUnLockExp($pid, $eid);

NotifyUser("Node Update Complete", $failed);
if (defined($logname)) {
    unlink($logname);
}
# The exit status is the number of failed nodes. Only the low 8 bits of an
# exit status reach the caller, so clamp the count; otherwise 256 failures
# would be reported as success.
exit($failed > 255 ? 255 : $failed);

#
# Update a node in a child process. Return the pid to the parent so
# that it can wait on all the children later.
#
# The child forks once more: the grandchild execs the ssh command while
# the child watches it with a 15 second alarm. The child exits zero only
# when the ssh command itself exited zero; every other outcome (timeout,
# non-zero exit, death by signal, failure to fork or exec) yields a
# non-zero exit so the caller counts the node as failed.
#
# If the first fork fails, fatal() is called and the script terminates.
# 
sub UpdateNode {
    my($node) = @_;
    my($syspid, $mypid);

    print STDOUT "Updating $node ...\n";

    #
    # We need to know if its a remote or local node, so we know how
    # to update it. This info needs to be in the DB at some point. 
    #
    my($isremote) = TBIsNodeRemote($node);

    $mypid = fork();
    if (!defined($mypid)) {
	# Without this check a failed fork would send the main process
	# down the child path below, where it exits with the
	# experiment still locked.
	fatal("Could not fork a child to update $node: $!");
    }
    if ($mypid) {
	return $mypid;
    }

    #
    # Run an ssh command in a child process, protected by an alarm to
    # ensure that the ssh is not hung up forever if the machine is in
    # some funky state.
    # 
    $syspid = fork();
    if (!defined($syspid)) {
	# Do not fall through to the exec below: it would run without
	# the timeout protection.
	print STDERR "*** $0:\n".
	    "    Could not fork the ssh child for $node: $!\n";
	exit(-1);
    }

    # Must change our real UID to root so that ssh will work.
    $UID = 0;
    
    if ($syspid) {
	local $SIG{ALRM} = sub { kill("TERM", $syspid); };
	alarm 15;
	waitpid($syspid, 0);
	alarm 0;

	# Save the wait status before anything else can overwrite $?.
	my $status = $?;

	print STDERR "update of $node returned $status.\n" if $debug;
    
	#
	# Any non-zero status is a failure. Status 256 (exit code 1) and
	# status 15 (killed by SIGTERM, i.e. the alarm fired) get a more
	# specific diagnostic when debugging.
	# 
	if ($status) {
	    if ($status == 256) {
		print STDERR "$node is not running sshd.\n" if $debug;
	    }
	    elsif ($status == 15) {
		print STDERR "$node is wedged.\n" if $debug;
	    }
	    exit(-1);
	}
	exit(0);
    }
    else {
	my $updatecmd = ($isremote ?
			 "/usr/local/etc/testbed/update" :
			 "/etc/testbed/update");

	# exec only returns when it fails; report that as a failure
	# instead of exiting with a success status.
	exec("$ssh -host $node $updatecmd")
	    or print STDERR "*** $0:\n".
		"    Could not exec ssh for $node: $!\n";
	exit(1);
    }
}

#
# Report the outcome of the update.
#
# The message is always printed on STDOUT. In batch mode it is also
# mailed to the invoking user with the log file passed along to SENDMAIL.
#
#   $mesg  - text to print, and the body of the mail.
#   $iserr - true if the update failed. Selects the subject line and who
#            else gets a copy: Cc to TBOPS on failure, Bcc to TBLOGS on
#            success.
#
sub NotifyUser($$)
{
    my($mesg, $iserr) = @_;
    my($subject, $from, $to, $hdrs);

    print STDOUT "$mesg\n";

    if (! $batchmode) {
	return;
    }

    $from  = $TBOPS;

    #
    # Message goes to user. If a failure, TBOPS also gets it, otherwise
    # it goes into the logs.
    #
    $to    = "$user_name <$user_email>";    

    if ($iserr) {
	$subject = "Node Update Failed $pid/$eid";
	$hdrs    = "Cc: $TBOPS\n".
	           "Reply-To: $TBOPS";
    }
    else {
	$subject = "Node Update Success $pid/$eid";
	$hdrs    = "Bcc: $TBLOGS\n".
	           "Reply-To: $TBOPS";
    }

    #
    # Send the mail to the user; the logfile name is handed to SENDMAIL
    # as the last argument.
    #
    SENDMAIL($to, $subject, $mesg, $from, $hdrs, ($logname));
}

#
# Abort the update: release the experiment lock, report $mesg as a failure
# through NotifyUser(), remove the batch mode log file if there is one,
# and exit with status 1. Never returns.
#
sub fatal($) {
    my($mesg) = @_;

    TBUnLockExp($pid, $eid);
    NotifyUser($mesg, 1);
    if (defined($logname)) {
	unlink($logname);
    }
    exit(1);
}