create_image.in 21.6 KB
Newer Older
1
#!/usr/bin/perl -wT
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2 3
#
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2012 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
5 6
# All rights reserved.
#
7 8
use English;
use Getopt::Std;
9
use POSIX qw(setsid :sys_wait_h);
10
use File::Basename;
11

12
#
13
# Image Creation Tuneables.
14 15
#
# $maxwait	max wall clock time to allow, progress or not
16 17 18
#		Empirically we have observed about 1.6MB/sec on a pc850
#		for a Windows image (the slowest to create), so figuring
#		1.5MB/sec for a 6GB max image works out to around 72 minutes.
19 20 21 22
# $idlewait	max time to wait between periods of progress
# $checkwait	time between progress checks (must be int div of $idlewait)
# $reportwait	time between progress reports (must be multiple of $checkwait)
#
23 24 25 26
# $maximagesize	max size in bytes of an image.  This should really be in the
#		DB (per-testbed, per-project, per-user, per-something), and
#		not hardwired here.  In the meantime, we set this big and let
#		disk quotas do the dirty work of limiting size.
27
#
28
my $maxwait      = (72 * 60);	# 72 minutes, in seconds
29 30 31
my $idlewait     = ( 8 * 60);	# 8 minutes, in seconds
my $reportwait   = ( 2 * 60);	# 2 minutes, in seconds
my $checkwait    = 15;		# seconds between progress checks
32 33
my $maximagesize = (6 * 1024**3); # 6GB

34 35 36 37 38 39 40 41
#
# Create a disk image.
#
# XXX: Device file should come from DB.
#      Start/count slice computation is not generalized at all.
#
#
# Print the command line synopsis on STDERR and exit with a failure status.
# Never returns.
#
sub usage()
{
    my @synopsis = (
	"Usage: create_image [-wsN] [-p <pid>] <imagename> <node>\n",
	"switches and arguments:\n",
	"-w          - wait for image to be fully created\n",
	"-s          - use ssh instead of frisbee uploader\n",
	"-N          - use NFS (if available) instead of frisbee uploader\n",
	"-p <pid>    - project ID of the image; defaults to system project\n",
	"<imagename> - imagename to use\n",
	"<node>      - nodeid to create the image from\n",
    );

    # Emit the whole text as one string, exactly as a single print would.
    print(STDERR join("", @synopsis));
    exit(-1);
}
53
my $optlist  = "p:wsNd";
54
my $waitmode = 0;
55
my $usessh = 0;
56 57
my $usenfs = 0;
my $usefup = 1;
58 59 60 61 62 63

#
# Configure variables
#
my $TB		= "@prefix@";
my $TBOPS       = "@TBOPSEMAIL@";
64
my $TBLOGS      = "@TBLOGSEMAIL@";
65
my $BOSSIP	= "@BOSSNODE_IP@";
66
my $CONTROL     = "@USERNODE@";
67
my $NONFS	= @NOSHAREDFS@;
68 69 70 71 72 73 74

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
75
use libadminmfs;
76
use Experiment;
77
use Node;
78 79
use User;
use Image;
80
use Logfile;
81 82 83 84 85 86 87 88 89 90 91 92

#
# Turn off line buffering on output
#
$| = 1;

#
# Untaint the path
# 
$ENV{'PATH'} = "/bin:/sbin:/usr/bin:";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

93 94
sub cleanup();
sub fatal($);
95
sub check_progress($$);
96
sub run_with_ssh($$);
97

98 99
my $nodereboot	= "$TB/bin/node_reboot";
my $createimage = "/usr/local/bin/create-image";
100
my $friskiller  = "$TB/sbin/frisbeehelper";
101
my $osselect    = "$TB/bin/os_select";
102
my $checkquota  = "$TB/sbin/checkquota";
103
my $imagehash	= "$TB/bin/imagehash";
104
my $SHA1	= "/sbin/sha1";
105 106 107 108
my $def_devtype	= "ad";
my $def_devnum	= 0;
my $devtype;
my $devnum;
109 110
my $mereuser    = 0;
my $debug       = 0;
111
my $imagepid    = TB_OPSPID;
112 113
my $logfile;
my $oldlogfile;
114
my $needcleanup = 0;
115 116 117 118 119

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
120
my %options = ();
usage()
    unless (getopts($optlist, \%options));

# -w: stay attached until the image has been created.
$waitmode = 1
    if (defined($options{"w"}));

# -s: ssh transport; mutually exclusive with the other two transports.
if (defined($options{"s"})) {
    ($usessh, $usefup, $usenfs) = (1, 0, 0);
}

# -N: NFS transport; only possible when there is a shared filesystem.
if (defined($options{"N"})) {
    if ($NONFS) {
	print STDERR "NFS not available, cannot use -N\n";
	exit(1);
    }
    ($usenfs, $usefup, $usessh) = (1, 0, 0);
}

# -d: debug; this also cancels wait mode.
if (defined($options{"d"})) {
    ($debug, $waitmode) = (1, 0);
}

# Exactly two positional arguments must remain: <imagename> <node>.
usage()
    unless (@ARGV == 2);

148
my $imagename  = $ARGV[0];
149
my $node_id    = $ARGV[1];
150 151 152 153

#
# Untaint the arguments.
#
154 155
if ($node_id =~ /^([-\w]+)$/) {
    $node_id = $1;
156 157
}
else {
158
    die("*** $0:\n".
159
	"    Bad data in $node_id\n");
160 161
}

162
if ($imagename =~ /^([-\w\.\+]+)$/) {
163 164 165
    $imagename = $1;
}
else {
166 167
    die("*** $0:\n".
	"    Bad data in $imagename.\n");
168 169 170 171 172
}
    
if (defined($options{"p"})) {
    $imagepid = $options{"p"};
	
173
    if ($imagepid =~ /^([-\w\.]+)$/) {
174 175 176
	$imagepid = $1;
    }
    else {
177 178
	die("*** $0:\n".
	    "    Bad data in $imagepid.\n");
179
    }
180 181
}

182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201
#
# Reset default values from site variables if they exist.
#
my $tmp;

# Site variables for the two timeouts are multiplied by 60 before being
# stored, the size limit by 1024**3.
$maxwait = $tmp * 60
    if (TBGetSiteVar("images/create/maxwait", \$tmp));
$idlewait = $tmp * 60
    if (TBGetSiteVar("images/create/idlewait", \$tmp));
$maximagesize = $tmp * 1024**3
    if (TBGetSiteVar("images/create/maxsize", \$tmp));

# Keep the intervals ordered: checkwait <= reportwait <= idlewait <= maxwait.
if ($idlewait > $maxwait) {
    $idlewait = $maxwait;
}
if ($reportwait > $idlewait) {
    $reportwait = $idlewait;
}
if ($checkwait > $reportwait) {
    $checkwait = $reportwait;
}

202
#
203
# Verify user and get his DB uid and other info for later.
204
#
205 206 207
my $this_user = User->ThisUser();
if (! defined($this_user)) {
    tbdie("You ($UID) do not exist!");
208
}
209 210 211
my $user_uid   = $this_user->uid();
my $user_name  = $this_user->name();
my $user_email = $this_user->email();
212

213 214 215
# Check node and permission
my $node = Node->Lookup($node_id);
if (!defined($node)) {
216
    die("*** $0:\n".
217
	"    Invalid node name $node_id!\n");
218
}
219
if ($UID && ! $this_user->IsAdmin()) {
Leigh B. Stoller's avatar
Leigh B. Stoller committed
220 221
    $mereuser = 1;

222
    if (! $node->AccessCheck($this_user, TB_NODEACCESS_LOADIMAGE)) {
223 224
	die("*** $0:\n".
	    "    You do not have permission to create an image from $node\n");
225 226 227
    }
}

228 229 230 231
#
# Before doing anything else, check for overquota ... lets not waste
# our time. Make sure user sees the error by exiting with 1.
#
232
if (system("$checkquota $user_uid") != 0) {
233
    die("*** $0:\n".
234 235
	"    You are over your disk quota on $CONTROL; ".
	"please login there and cleanup!\n");
236 237
}

238 239 240 241 242
#
# We need the project id for test below. The target directory for the
# output file has to be the node project directory, since that is the
# directory that is going to be NFS mounted by default.
#
243 244
my $experiment = $node->Reservation();
if (!defined($experiment)) {
245
    die("*** $0:\n".
246
	"    Could not map $node to its experiment object!\n");
247
}
248 249
my $pid = $experiment->pid();
my $eid = $experiment->eid();
250

251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
#
# To avoid blowing a cavernous hole ("allow all TCP ports to boss")
# in the per-experiment firewall, we don't use the frisbee uploader if
# the node is firewalled.
# 
if ($usefup && $experiment->IsFirewalled()) {
    print "*** WARNING: $node_id is firewalled, not using Frisbee uploader\n";

    # Pick the replacement transport: ssh when there is no shared
    # filesystem, NFS otherwise.
    $usefup = 0;
    $usessh = ($NONFS ? 1 : 0);
    $usenfs = ($NONFS ? 0 : 1);
}

268 269 270 271 272
#
# Grab the imageid description from the DB. We do a permission check, but
# mostly to avoid hard to track errors that would result if the user picked
# the wrong one (which is likely to happen no matter what I do).
#
273 274
my $image = Image->Lookup($imagepid, $imagename);
if (!defined($image)) {
275 276
    die("*** $0:\n".
	"    No such image descriptor $imagename in project $imagepid!\n");
277
}
278
my $imageid = $image->imageid();
279

280
if ($mereuser &&
281
    ! $image->AccessCheck($this_user, TB_IMAGEID_ACCESS)) {
282 283
    die("*** $0:\n".
	"    You do not have permission to use imageid $imageid!\n");
284 285 286
}

#
287 288
# Make sure that the directory exists and is writeable for the user.
# We test this by creating the file. Its going to get wiped anyway.
289
#
290 291
my $filename = $image->path();
my $isglobal = $image->global();
292
my $usepath = 0;
293 294

#
295
# Redirect pathname for global images.
296
#
297
if ($isglobal && ($filename =~ /^\/usr\/testbed/)) {
Leigh B. Stoller's avatar
Leigh B. Stoller committed
298
    $filename = PROJROOT() . "/$pid/images/" . basename($filename);
299
    print "*** WARNING: Writing global descriptor to $filename instead!\n";
300 301 302 303 304 305 306 307
    #
    # XXX the Emulab config of the master server doesn't know this trick
    # so when it tries to lookup imageid emulab-ops/<whatever> it would
    # still map to /usr/testbed and fail because it cannot update images
    # outside of /{users,group,proj}. So we skirt the issue by passing
    # it the full path constructed here rather than the imageid.
    #
    $usepath = 1;
308
}
309

310 311 312 313 314 315 316 317
#
# Make sure real path is someplace that makes sense; remember that the
# image is created on the nodes, and it NFS mounts directories on ops.
# Writing the image to anyplace else is just going to break things.
#
# Use realpath to resolve any symlinks.
#
my $translated = `realpath $filename`;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
318
if ($translated =~ /^([-\w\.\/\+]+)$/) {
319 320 321 322 323 324
    $filename = $1;
}
else {
    die("*** $0:\n".
	"    Bad data returned by realpath: $translated\n");
}
325 326 327 328 329
# Make sure not a directory.
if (-d $filename) {
    die("*** $0:\n".
	"    $filename is a directory! Must be a plain file.\n");
}
330 331

#
332
# The file must reside in an allowed directory. Since this script
333 334 335
# runs as the caller, regular file permission checks ensure its a file
# the user is allowed to use. 
#
336
if (! TBValidUserDir($filename, 0)) {
337 338 339 340
    die("*** $0:\n".
	"    $filename does not resolve to an allowed directory!\n");
}

341 342 343 344
#
# Be sure to kill off running frisbee. If a node is trying to load that
# image, well tough. 
#
345 346 347 348 349
system("$friskiller -k $imageid");
if ($?) {
    die("*** $0:\n".
	"    Could not kill running frisbee for $imageid!\n");
}
350

351 352 353 354 355 356
#
# Start from an empty image file. Creating it here is also how we check
# that the target directory exists and is writable by the caller.
#
if (-e $filename) {
    unlink($filename) or
	die("*** $0:\n".
	    "    Could not delete $filename: $!\n");
}

# Three-argument open with a lexical handle: the pathname is never parsed
# for a mode, and no global bareword handle is left behind.
open(my $imagefh, ">", $filename) or
    die("*** $0:\n".
	"    Could not create $filename: $!\n");
close($imagefh) or
    die("*** $0:\n".
	"    Could not truncate $filename: $!\n");
363

364 365 366
#
# Get the disktype for this node
#
367 368
$node->disktype(\$devtype);
$node->bootdisk_unit(\$devnum);
369 370 371 372 373

$devtype = $def_devtype
    if (!defined($devtype));
$devnum = $def_devnum
    if (!defined($devnum));
374 375
my $device = "/dev/${devtype}${devnum}";

376 377 378 379
#
# Record when this image was updated, so that we can figure out which
# revision of the testbed image it was based off.
#
380
$image->MarkUpdateTime() == 0 or
381 382 383
    die("*** $0:\n".
	"    Could not mark the update time in $image\n");
    
384 385 386 387 388 389
#
# Okay, we want to build up a command line that will run the script on
# on the client node. We use the imageid description to determine what
# slice (or perhaps the entire disk) is going to be zipped up. We do not
# allow arbitrary combos of course. 
#
390 391
my $startslice = $image->loadpart();
my $loadlength = $image->loadlength();
392 393
my $command    = "$createimage ";

394 395 396
if ($usefup) {
    my $id = $usepath ? $filename : ($image->pid() . "/$imagename");
    $command .= " -S $BOSSIP -F $id";
397
}
398 399 400

if ($startslice || $loadlength == 1) {
    $command .= " -s $startslice";
401
}
402 403 404
$command .= " $device";

if ($usefup || $usessh) {
405 406 407
    $command .= " -";
} else {
    $command .= " $filename";
408 409 410 411 412
}

#
# Go to the background since this is going to take a while.
# 
413
if (!$debug) {
414
    $logfile = Logfile->Create($experiment->gid_idx());
415 416 417 418
    fatal("Could not create a logfile")
	if (!defined($logfile));
    # Mark it open since we are going to start using it right away.
    $logfile->Open();
419

420 421
    # Logfile becomes the current spew, but save off the old spew.
    $experiment->SetLogFile($logfile, \$oldlogfile);
422

423
    if (my $childpid = TBBackGround($logfile->filename())) {
424 425
	#
	# Parent exits normally, except if in waitmode. 
426
	#
427
	if (!$waitmode) {
428
	    print("Your image from $node_id is being created\n".
429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
		  "You will be notified via email when the image has been\n".
		  "completed, and you can load the image on another node.\n");
	    exit(0);
	}
	print("Waiting for image creation to complete\n");
	print("You may type ^C at anytime; you will be notified via email;\n".
	      "later; you will not actually interrupt image creation.\n");
	
	# Give child a chance to run.
	select(undef, undef, undef, 0.25);
	
	#
	# Reset signal handlers. User can now kill this process, without
	# stopping the child.
	#
	$SIG{TERM} = 'DEFAULT';
	$SIG{INT}  = 'DEFAULT';
	$SIG{QUIT} = 'DEFAULT';

448
	#
449 450 451 452 453 454
	# Wait until child exits or until user gets bored and types ^C.
	#
	waitpid($childpid, 0);
	
	print("Done. Exited with status: $?\n");
	exit($? >> 8);
455
    }
456 457
}

458 459 460 461 462 463 464 465
#
# When in waitmode, must put ourselves in another process group so that
# an interrupt to the parent will not have any effect on the backend.
#
if ($waitmode) {
    POSIX::setsid();
}

466 467 468 469 470
#
# From here on out, we should take care to clean up the DB, and
# reboot the source node.
#
$needcleanup = 1;
471

472
# Clear the bootlog; see below.
473
$node->ClearBootLog();
474

475 476 477 478 479 480 481 482 483
# check_progress state
my $runticks	 = 0;
my $maxticks	 = int($maxwait / $checkwait);
my $reportticks  = int($reportwait / $checkwait);
my $idleticks    = 0;
my $maxidleticks = int($idlewait / $checkwait);
my $lastsize     = 0;
my $result;

484
#
Russ Fish's avatar
typo.  
Russ Fish committed
485
# Reboot into admin mode and run the command.
486 487
# Note that without a shared FS, we just boot the node into the admin MFS
# and run the command via SSH, capturing the output.
488 489 490 491
#
my $me           = $0;
my %args         = ();
$args{'name'}    = $me;
492
$args{'prepare'} = 1;
493

494
if ($usessh) {
495 496 497 498 499 500 501 502 503
    #
    # Put the node in admin mode...
    #
    $args{'on'} = 1;
    $args{'clearall'} = 0;
    if (TBAdminMfsSelect(\%args, undef, $node_id)) {
	$result = "setupfailed";
	goto done;
    }
504

505 506 507 508 509 510 511 512 513 514 515 516 517 518 519
    #
    # ...boot it...
    #
    $args{'reboot'} = 1;
    $args{'retry'} = 0;
    $args{'wait'} = 1;
    my @failed = ();
    if (TBAdminMfsBoot(\%args, \@failed, $node_id)) {
	$result = "setupfailed";
	goto done;
    }

    #
    # ...execute command and wait!
    #
520 521
    $result = run_with_ssh($command, $filename);
    if ($result eq "setupfailed") {
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536
	goto done;
    }
} else {
    $args{'command'} = $command;
    $args{'timeout'} = $maxwait + $checkwait;
    $args{'pfunc'}     = \&check_progress;
    $args{'pinterval'} = $checkwait;

    my $retry = 1;
    while ($retry) {
	$retry = 0;
	if (TBAdminMfsRunCmd(\%args, undef, $node_id)) {
	    $result = "setupfailed"
		if (!defined($result));
	}
537 538 539
    }
}

540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558
#
# XXX woeful backward compat hack.
# The old client-side script will not recognize the -S and -F options
# we pass in and will exit(-1).  We detect that here and retry without
# the frisbee uploader, using ssh (no shared FS) or NFS instead.
#
if ($usefup && defined($result) && $result eq "255") {
    print STDERR "MFS does not support frisbee upload, falling back on ",
                 $NONFS ? "ssh" : "nfs", "...\n";

    #
    # Rebuild the command line without the uploader (-S/-F) options.
    #
    $command = "$createimage ";
    if ($startslice || $loadlength == 1) {
	$command .= " -s $startslice";
    }
    $command .= " $device";

    #
    # $usessh is never set while $usefup is set (every place that sets
    # one clears the other), so the output target must be chosen from
    # $NONFS, the same condition that selects the transport below.
    # Without a shared FS the image goes to stdout ("-") and is captured
    # into $filename on this side; with NFS the node writes $filename.
    #
    $command .= ($NONFS ? " -" : " $filename");

    # reset state for check_progress
    $usefup = 0;
    $runticks = 0;
    $idleticks = 0;
    $lastsize = 0;
    $result = undef;

    # Only the no-shared-FS case needs the ssh output redirected to a file.
    $result = run_with_ssh($command, $NONFS ? $filename : undef);
}

574 575
done:

576 577
if (! cleanup()) {
    fatal("Problem encountered while cleaning up!\n");
578 579 580
}

#
581 582
# If we timed out, if the result code was bad, or if the image size
# grew too large.
583
#
584 585
if ($result eq "setupfailed") {
    fatal("FAILED: Node setup failed ... \n");
586
}
587 588
if ($result eq "timeout") {
    fatal("FAILED: Timed out generating image ... \n");
589
}
590
if ($result eq "toobig") {
591 592
    fatal("FAILED: Maximum image size ($maximagesize bytes) exceeded ... \n");
}
593
if ($result != 0) {
Mike Hibler's avatar
Mike Hibler committed
594
    fatal("FAILED: Returned error code $result generating image ... \n");
595
}
596

597 598 599 600 601
#
# Everything worked, create the hash signature file.
#
my $sigdir;
($sigdir = $filename) =~ s/^(.*)\/[^\/]+$/$1\/sigs/;
602
mkdir($sigdir, 0770)
603 604 605 606 607
    if (! -d "$sigdir");

my $sigfilename;
($sigfilename = $filename) =~ s/^(.*)(\/[^\/]+$)/$1\/sigs$2.sig/;
my $swmsg = "";
608 609
if (! -x $imagehash ||
    system("$imagehash -c -o $sigfilename $filename") != 0) {
610 611 612 613 614 615 616
    warn("Could not create swapout signature file\n");
    $swmsg = "WARNING: could not create swapout signature file $sigfilename\n".
	     "       You will not be able to save disk state for this image\n";
} else {
    print("Swapout signature file created\n");
}

617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646
#
# Hash the file itself since we really want an integrity check
# on the image file.
#
my $hashfile = "${filename}.sha1";
my $filehash = `$SHA1 $filename`;
if ($?) {
    fatal("Could not generate sha1 hash of $filename");
}
if ($filehash =~ /^SHA1.*= (\w*)$/) {
    if ($isglobal && $usepath) {
	# The image was redirected away from its DB path, so the DB
	# record is left alone; the hash is only available in $hashfile.
	print "*** WARNING: Not updating SHA1 in DB record since the ".
	    "image was written to /proj!\n";
	print "    See $hashfile instead\n";
    }
    else {
	$image->SetHash($1) == 0
	    or fatal("Failed to set the hash for $image");
    }
}
else {
    fatal("Could not parse the sha1 hash: '$filehash'")
}

#
# Save the raw sha1 output next to the image. Any stale hash file is
# removed first so the new one is created fresh.
#
unlink($hashfile)
    if (-e $hashfile);
open(my $hashfh, ">", $hashfile) or
    fatal("Could not open $hashfile for writing: $!");
print $hashfh $filehash;
# Close the handle we actually opened (not the filename string) and check
# the result, since buffered write errors only show up at close time.
close($hashfh) or
    fatal("Could not close $hashfile: $!");

647
print "Image creation succeeded.\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
648
print "Image written to $filename.\n";
649
#      "Final size: " . (stat($filename))[7] . " bytes.\n";
650

651
# Append bootlog (which has prepare output)
652 653
my $bootlog;
if ($node->GetBootLog(\$bootlog) == 0) {
654 655
    print "\n\n";
    print "------------------ Prepare Output ----------------\n";
656
    print "$bootlog\n";
657 658
}

659
SENDMAIL("$user_name <$user_email>",
660 661
	 "Image Creation on $node_id Completed: $pid/$imagename",
	 "Image creation on $node_id has completed. As you requested, the\n".
662
	 "image has been written to $filename.\n".
663 664
	 "You may now os_load this image on other nodes in your experiment.\n".
	 "$swmsg",
665 666
	 "$user_name <$user_email>",
	 "Bcc: $TBLOGS",
667
	 defined($logfile) ? ($logfile->filename()) : ());
668

669
if (defined($logfile)) {
670
    # Close up the log file so the webpage stops.
671 672 673 674
    $logfile->Close();
    # And restore the original logfile as current spew.
    $experiment->SetLogFile($oldlogfile)
	if (defined($oldlogfile));
Mike Hibler's avatar
Mike Hibler committed
675
    $logfile->Delete(1);
676
}
677 678 679 680
exit 0;

#
# Take the node out of admin mode and reboot it.
# Returns 1 on success, 0 if either step reports failure.
#
sub cleanup ()
{
    # Clear the flag first so that fatal() does not call back in here.
    $needcleanup = 0;

    #
    # Turn admin mode back off and reboot back to the old OS
    #
    my %select_args = (
	'name'     => $me,
	'on'       => 0,
	'clearall' => 0,
    );
    if (TBAdminMfsSelect(\%select_args, undef, $node_id)) {
	print("*** $me:\n".
	      "    Could not turn admin mode off for $node_id!\n");
	return 0;
    }

    # Request the reboot with 'wait' off.
    my %boot_args = (
	'name'   => $me,
	'on'     => 0,
	'reboot' => 1,
	'wait'   => 0,
    );
    if (TBAdminMfsBoot(\%boot_args, undef, $node_id)) {
	print("*** $me:\n".
	      "    Failed to reboot $node_id on cleanup!\n");
	return 0;
    }

    return 1;
}

#
# Report a fatal error and exit.
#
# Prints $mesg, runs cleanup() if it is still pending, mails the message
# (with the logfile attached when there is one) to the user with a Cc to
# $TBOPS, tears down the logfile, and exits with -1. Never returns.
#
sub fatal($)
{
    my ($mesg) = @_;

    print "$mesg\n";

    if ($needcleanup && !cleanup()) {
        print "Encountered problems cleaning up!\n";
    }
    
    #
    # Send a message to the testbed list. 
    #
    SENDMAIL("$user_name <$user_email>",
	     "Image Creation Failure on $node_id: $pid/$imagename",
	     $mesg,
	     "$user_name <$user_email>",
	     "Cc: $TBOPS",
	     defined($logfile) ? ($logfile->filename()) : ());
    
    if (defined($logfile)) {
	# Fetch the path before the logfile object is deleted.
	my $logname = $logfile->filename();

	# Close up the log file so the webpage stops.
	$logfile->Close();
	# And restore the original logfile as current spew.
	$experiment->SetLogFile($oldlogfile)
	    if (defined($oldlogfile));
	$logfile->Delete();
	# This was mailed so no longer needed. A method call is not
	# interpolated inside a double-quoted string, so the path has to
	# be passed as a plain value for the file to actually be removed.
	unlink($logname)
	    if (defined($logname));
    }
    exit(-1);
}

743 744 745 746 747 748 749 750 751 752 753
#
# Check progress of image creation by periodically checking the image size.
#
# Called every $checkwait seconds.
# Reports progress every $reportwait seconds.
# Gives up after $idlewait seconds without a size change.
#
#
# Progress callback for image capture.
#
# Arguments: the first is ignored; the second ($statusp) is either undef
# or a hash reference indexed by node id, where any value other than
# "none" means the command on that node has finished.
#
# Returns 1 to keep waiting, 0 to stop. When stopping, $result holds the
# node's status, "timeout" or "toobig".
#
sub check_progress($$)
{
    my (undef, $statusp) = @_;

    if ($runticks == 0) {
	print "$node_id: started image capture, ".
	    "waiting up to " . int($maxwait/60) . " minutes\n";
    }

    #
    # XXX frisbee uploader uploads into a temporary file and then moves
    # it into place. So track that tmp file here.
    #
    my $fname = $filename;
    if ($usefup) {
	$fname .= ".tmp";
    }

    #
    # Command has finished for better or worse, record status and finish.
    #
    if (defined($statusp) && $statusp->{$node_id} ne "none") {
	$result = $statusp->{$node_id};
	print "$node_id: image capture has completed: status='$result'\n";
	return 0;
    }

    #
    # Has run too long
    #
    $runticks++;
    if ($runticks >= $maxticks) {
	$result = "timeout";
	print "$node_id: image capture has completed: timeout\n";
	return 0;
    }

    #
    # See if imagezip on the node is making progress.  If not, we need to
    # check the idle timer and timeout if we have taken too long.
    #
    # Also, check to see if the (somewhat arbitrary) maximum filesize has 
    # been exceeded.
    #
    my $cursize = (stat($fname))[7];
    if ($usefup && !defined($cursize)) {
	#
	# XXX avoid an ugly race.
	# When done, frisuploadd moves foo.tmp -> foo
	# If we didn't find foo.tmp, try foo now.
	#
	$fname =~ s/\.tmp$//;
	$cursize = (stat($fname))[7];
    }
    #
    # stat() yields nothing when the file does not exist (yet). Count that
    # as zero bytes so the comparisons below do not operate on undef and
    # the progress report prints a number.
    #
    $cursize = 0
	if (!defined($cursize));

    if ($cursize > $maximagesize) {
	$result = "toobig";
	print "$node_id: image capture has completed: image too big\n";
	return 0;
    }
    if ($cursize == $lastsize) {
	# No growth since the last check; count it against the idle limit.
	$idleticks++;
	if ($idleticks >= $maxidleticks) {
	    $result = "timeout";
	    print "$node_id: image capture has completed: idle timeout\n";
	    return 0;
	}
    } else {
	$idleticks = 0;
    }
    $lastsize = $cursize;
    
    # Periodic report, once every $reportticks checks.
    if (($runticks % $reportticks) == 0) {
	my $curtdiff = int($runticks * $checkwait / 60);
	print "$node_id: still waiting ...".
	    " it has been ". $curtdiff ." minutes.".
	    " Current image size: $cursize bytes.\n";
    }

    return 1;
}
830 831 832 833 834 835

#
# Run a command on the node through sshtb and wait for it to finish,
# calling check_progress() every $checkwait seconds.
#
#   $cmd     command line to run on $node_id
#   $output  local file that receives the command's stdout, or undef
#            to leave stdout alone
#
# Returns the exit code of the command (or the signal number if it died
# from a signal), -1 if waitpid fails, "setupfailed" if the fork fails,
# "timeout" when the time limit passes, or whatever check_progress()
# left in $result when it asked to stop.
#
# NOTE(review): when the loop stops because of a timeout or a progress
# failure, the forked ssh is not killed here - confirm that the node
# reboot done by cleanup() is what is relied on to end it.
#
sub run_with_ssh($$)
{
    my ($cmd,$output) = @_;
    my $stat = undef;

    $cmd = "$TB/bin/sshtb -n -host $node_id $cmd";
    if (defined($output)) {
	$cmd .= " > $output";
    }
    print STDERR "About to: '$cmd' as uid $UID\n" if ($debug);

    #
    # fork() returns undef on failure, never a negative number. Testing
    # for "< 0" would let a failed fork fall into the child branch below
    # (undef == 0) and make this process run the command and exit.
    #
    my $mypid = fork();
    if (!defined($mypid)) {
	print STDERR "*** $0:\n".
	    "    Could not fork to run ssh: $!\n";
	return "setupfailed";
    }

    #
    # Child. Just do it.
    #
    if ($mypid == 0) {
	my $childstat = 0;
	if (system($cmd)) {
	    $childstat = $?;
	}
	if ($childstat & 127) {
	    # died with a signal, return the signal
	    exit($childstat & 127);
	}
	exit($childstat >> 8);
    }

    #
    # Parent.  Wait for ssh to finish, reporting periodic progress
    # as TBAdminMfsRunCmd would do.
    #
    my $endtime = time() + $maxwait + $checkwait;
    while (1) {
	my $kid = waitpid($mypid, WNOHANG);
	# ssh finished
	if ($kid == $mypid) {
	    $stat = $?;
	    if ($stat & 127) {
		# died with a signal, return the signal
		$stat = $stat & 127;
	    } else {
		# else return the exit code
		$stat = $stat >> 8;
	    }
	    last;
	}

	# huh?
	if ($kid == -1) {
	    $stat = -1;
	    last;
	}

	# check on progress
	if (!check_progress(undef, undef)) {
	    $stat = $result;
	    last;
	}

	# wait for awhile
	sleep($checkwait);
	if (time() >= $endtime) {
	    $stat = "timeout";
	    last;
	}
    }

    return $stat;
}