node_update.in 7.74 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
#!/usr/bin/perl -wT
use English;
use Getopt::Std;

#
# Update mounts and accounts and anything else after changing the permissions
# for a node. This is intended to be invoked from the web interface after
# adding and/or subtracting pids from the experiment pid access list.
#
# XXX There is an inherent race condition with using this script. What if
# nodes are released while it is running?
12 13 14
#
# The output is all jumbled together since the updates are issued in parallel.
# Might be a pain when debugging. 
15 16 17 18
# 
sub usage()
{
    print STDOUT "Usage: node_update [-b] <pid> <eid>\n".
19 20
	"Update user accounts and NFS mounts on nodes in your project.\n".
	"Use -b to use batch operation (place in background, send email).\n";
21 22
    exit(-1);
}
23
my  $optlist = "be:";
24 25 26 27 28 29 30 31 32 33
  
#
# Configure variables
#
my $TB		= "@prefix@";
my $TESTMODE    = @TESTMODE@;
my $TBOPS       = "@TBOPSEMAIL@";
my $TBLOGS      = "@TBLOGSEMAIL@";

my $ssh		= "$TB/bin/sshtb -n";
34
my $sshremote	= "$TB/bin/sshremote -n";
35 36
my $expsetup    = "$TB/sbin/exports_setup";
my $batchmode   = 0;
37
my $maxchildren = 20;
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75

#
# Load the Testbed support stuff. 
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

# Turn off line buffering on output
$| = 1; 

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 2) {
    usage();
}
my $pid   = $ARGV[0];
my $eid   = $ARGV[1];
if (defined($options{"b"})) {
    $batchmode = 1;
}

#
# Untaint the arguments.
#
if ($pid =~ /^([-\@\w]+)$/) {
    $pid = $1;
}
76 77 78
else {
    die("*** Bad data in pid: $pid\n");
}	
79 80 81
if ($eid =~ /^([-\@\w]+)$/) {
    $eid = $1;
}
82 83 84
else {
    die("*** Bad data in eid: $eid\n");
}	
85 86 87 88 89 90 91 92 93 94 95 96 97

my $user_name;
my $user_email;
my $logname;
my %pids	= ();
my $failed	= 0;
my $dbuid;

#
# We don't want to run this script unless its the real version.
# That is, it must be setuid root. 
#
if ($EUID != 0) {
98 99
    die("*** $0:\n".
	"    Must be root! Maybe its a development version?\n");
100 101 102 103 104 105
}

#
# Verify actual user and get his DB uid.
#
if (! UNIX2DBUID($UID, \$dbuid)) {
106 107
    die("*** $0:\n".
	"    You do not exist in the Emulab Database.\n");
108 109 110
}

if (! UserDBInfo($dbuid, \$user_name, \$user_email)) {
111 112
    die("*** $0:\n".
        "    Cannot determine your name and email address.\n");
113 114 115 116 117 118
}

#
# Verify that this person is allowed to do this. Must be an admin type,
# the experiment creator or the project leader.
#
119
if ($UID && !TBAdmin()) {
120 121 122 123
    my $expt_leader = ExpLeader($pid, $eid);
    my $proj_leader = ProjLeader($pid);

    if (!$expt_leader || !$proj_leader) {
124 125
	die("*** $0:\n".
	    "    No such Experiment $eid or no such Project $pid\n");
126 127 128
    }

    if ($expt_leader ne $dbuid && $proj_leader ne $dbuid) {
129 130
	die("*** $0:\n".
	    "    You must be the experiment creator or the project leader\n");
131 132 133
    }
}

134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
#
# We need to lock down the experiment during this. 
#
DBQueryFatal("lock tables experiments write");

if (TBExpLocked($pid, $eid)) {
    DBQueryWarn("unlock tables");
    die("*** $0:\n".
	"    Experiment $pid/$eid is in transition. Please try later!\n");
}

#
# A sanity check. Lets make sure the experiment is in the swapped in
# state so that we are not trying to update nodes that are still booting
# or swapping out, etc. 
#
if (ExpState($pid, $eid) ne EXPTSTATE_ACTIVE) {
    DBQueryWarn("unlock tables");
    die("*** $0:\n".
	"    The experiment $pid/$eid must be fully activated first!\n");
}
TBLockExp($pid, $eid);
DBQueryFatal("unlock tables");

158 159 160 161
#
# Batchmode (as from the web interface) goes to background and reports
# later via email.
# 
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
if ($batchmode) {
    #
    # Create a temporary name for a log file.
    #
    $logname = `mktemp /tmp/node_update-$pid-$eid.XXXXXX`;
    chop($logname);
    
    if (TBBackGround($logname)) {
	#
	# Parent exits normally
	#
	print STDOUT
	    "Node Update for $pid/$eid is now in progress.\n".
	    "You will be notified via email when the is complete.\n";
	exit(0);
    }
}

#
# Currently, we just need to update the mount points. The UID change because
# of PERL sillyness.
#
$UID = $EUID;
if (system("$expsetup")) {
    fatal("Exports Setup Failed");
}
# Give ops a chance to react.
sleep(2);

#
# Get the list of nodes that need to be "updated."
# 
my @nodes = ExpNodes($pid, $eid);
if (! @nodes) {
    fatal("No Nodes in the experiment");
}

#
200 201 202 203
# We want some overlap, but not too much since we could burn up
# a lot processes on wedged nodes. Issue a small number in parallel,
# and wait once we reach the limit for one to finish, before issuing
# the next one.
204
#
205
my $maxpids = 0;
206
foreach my $node ( @nodes ) {
207
    while ($maxpids >= $maxchildren) {
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
	my $thispid  = waitpid(-1, 0);
	my $thisnode = $pids{$thispid};
	
	if ($?) {
	    $failed++;
	    print STDERR "Update of node $thisnode failed!\n";
	}
	else {
	    print STDOUT "$thisnode updated ...\n";
	}

	delete($pids{$thispid});
	$maxpids--;
    }
    my $thispid = UpdateNode($node);
    $pids{$thispid} = $node;
    $maxpids++;
    sleep(1);
226 227 228
}

#
229
# Wait for any remaining children to exit before continuing.
230
#
231 232
foreach my $thispid ( keys(%pids) ) {
    my $node = $pids{$thispid};
233

234
    waitpid($thispid, 0);
235 236 237 238 239
    if ($?) {
	$failed++;
	print STDERR "Update of node $node failed!\n";
    }
    else {
240
	print STDOUT "$node updated ...\n";
241 242 243
    }
}

244
TBUnLockExp($pid, $eid);
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
NotifyUser("Node Update Complete", $failed);
if (defined($logname)) {
    unlink($logname);
}
exit($failed);

#
# Update a node in a child process. Return the pid to the parent so
# that it can wait on all the children later.
# 
sub UpdateNode {
    my($node) = @_;
    my($syspid, $mypid);

    print STDOUT "Updating $node ...\n";

261 262 263 264 265 266
    #
    # We need to know if its a remote or local node, so we know how
    # to update it. This info needs to be in the DB at some point. 
    #
    my($isremote) = TBIsNodeRemote($node);

267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304
    $mypid = fork();
    if ($mypid) {
	return $mypid;
    }

    #
    # Run an ssh command in a child process, protected by an alarm to
    # ensure that the ssh is not hung up forever if the machine is in
    # some funky state.
    # 
    $syspid = fork();

    # Must change our real UID to root so that ssh will work.
    $UID = 0;
    
    if ($syspid) {
	local $SIG{ALRM} = sub { kill("TERM", $syspid); };
	alarm 15;
	waitpid($syspid, 0);
	alarm 0;

	print STDERR "update of $node returned $?.\n" if $debug;
    
	#
	# If either ssh is not running or it timed out,
	# send it a ping of death.
	# 
	if ($? == 256 || $? == 15) {
	    if ($? == 256) {
		print STDERR "$node is not running sshd.\n" if $debug;
	    } else {
		print STDERR "$node is wedged.\n" if $debug;
	    }
	    exit(-1);
	}
	exit(0);
    }
    else {
305 306 307 308 309 310
	if ($isremote) {
	    exec("$sshremote $node /usr/local/etc/testbed/update");
	}
	else {
	    exec("$ssh $node /etc/testbed/update");
	}
311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327
	exit(0);
    }
    exit(0);
}

sub NotifyUser($$)
{
    my($mesg, $iserr) = @_;
    my($subject, $from, $to, $hdrs);

    print STDOUT "$mesg\n";

    if (! $batchmode) {
	return;
    }

    if ($iserr) {
328
	$subject = "Node Update Failed $pid/$eid";
329 330
    }
    else {
331
	$subject = "Node Update Success $pid/$eid";
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350
    }
    $from  = $TBOPS;
    $hdrs  = "Reply-To: $TBOPS";
    
    #
    # Message goes to user. If a failure, TBOPS also gets it, otherwise
    # it goes into the logs.
    #
    $to    = "$user_name <$user_email>";    
    
    if ($iserr) {
	$hdrs = "Cc: $TBOPS\n".
	        "$hdrs";
    }
    else {
	$hdrs = "Bcc: $TBLOGS\n".
	        "$hdrs";
    }

351 352 353 354
    #
    # Send a message to the testbed list. Append the logfile.
    #
    SENDMAIL($to, $subject, $mesg, $from, $hdrs, ($logname));
355 356 357 358 359
}

sub fatal($) {
    my($mesg) = @_;

360
    TBUnLockExp($pid, $eid);
361 362 363 364 365 366 367
    NotifyUser($mesg, 1);
    if (defined($logname)) {
	unlink($logname);
    }
    exit(1);
}