#!/usr/bin/perl -wT # # EMULAB-COPYRIGHT # Copyright (c) 2000-2011 University of Utah and the Flux Group. # All rights reserved. # use English; use Getopt::Std; use POSIX qw(setsid :sys_wait_h); use File::Basename; # # Image Creation Tuneables. # # $maxwait max wall clock time to allow, progress or not # Empirically we have observed about 1.6MB/sec on a pc850 # for a Windows image (the slowest to create), so figuring # 1.5MB/sec for a 6GB max image works out to around 72 minutes. # $idlewait max time to wait between periods of progress # $checkwait time between progress checks (must be int div of $idlewait) # $reportwait time between progress reports (must be multiple of $checkwait) # # $maximagesize max size in bytes of an image. This should really be in the # DB (per-testbed, per-project, per-user, per-something), and # not hardwired here. In the meantime, we set this big and let # disk quotas do the dirty work of limiting size. # my $maxwait = (72 * 60); my $idlewait = ( 8 * 60); my $reportwait = ( 2 * 60); my $checkwait = 15; my $maximagesize = (6 * 1024**3); # 20GB # # Create a disk image. # # XXX: Device file should come from DB. # Start/count slice computation is not generalized at all. # sub usage() { print(STDERR "Usage: create_image [-wsN] [-p ] \n" . "switches and arguments:\n". "-w - wait for image to be fully created\n". "-s - use ssh instead of frisbee uploader\n". "-N - use NFS (if available) instead of frisbee uploader\n". "-p - project ID of the image; defaults to system project\n". " - imagename to use\n". " - nodeid to create the image from\n"); exit(-1); } my $optlist = "p:wsNd"; my $waitmode = 0; my $usessh = 0; my $usenfs = 0; my $usefup = 1; # # Configure variables # my $TB = "@prefix@"; my $TBOPS = "@TBOPSEMAIL@"; my $TBLOGS = "@TBLOGSEMAIL@"; my $BOSSIP = "@BOSSNODE_IP@"; my $CONTROL = "@USERNODE@"; my $NONFS = @NOSHAREDFS@; # # Testbed Support libraries # use lib "@prefix@/lib"; use libdb; use libtestbed; use libadminmfs; use Experiment; use Node; use User; use Image; use Logfile; # # Turn off line buffering on output # $| = 1; # # Untaint the path # $ENV{'PATH'} = "/bin:/sbin:/usr/bin:"; delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'}; sub cleanup(); sub fatal($); sub check_progress($$); sub run_with_ssh($$); my $nodereboot = "$TB/bin/node_reboot"; my $createimage = "/usr/local/bin/create-image"; my $friskiller = "$TB/sbin/frisbeehelper"; my $osselect = "$TB/bin/os_select"; my $checkquota = "$TB/sbin/checkquota"; my $imagehash = "$TB/bin/imagehash"; my $def_devtype = "ad"; my $def_devnum = 0; my $devtype; my $devnum; my $mereuser = 0; my $debug = 0; my $imagepid = TB_OPSPID; my $logfile; my $oldlogfile; my $needcleanup = 0; # # Parse command arguments. Once we return from getopts, all that should be # left are the required arguments. # my %options = (); if (! getopts($optlist, \%options)) { usage(); } if (defined($options{"w"})) { $waitmode = 1; } if (defined($options{"s"})) { $usessh = 1; $usefup = $usenfs = 0; } if (defined($options{"N"})) { if (!$NONFS) { $usenfs = 1; $usefup = $usessh = 0; } else { print STDERR "NFS not available, cannot use -N\n"; exit(1); } } if (defined($options{"d"})) { $debug = 1; $waitmode = 0; } if (@ARGV != 2) { usage(); } my $imagename = $ARGV[0]; my $node_id = $ARGV[1]; # # Untaint the arguments. # if ($node_id =~ /^([-\w]+)$/) { $node_id = $1; } else { die("*** $0:\n". " Bad data in $node_id\n"); } if ($imagename =~ /^([-\w\.\+]+)$/) { $imagename = $1; } else { die("*** $0:\n". " Bad data in $imagename.\n"); } if (defined($options{"p"})) { $imagepid = $options{"p"}; if ($imagepid =~ /^([-\w\.]+)$/) { $imagepid = $1; } else { die("*** $0:\n". " Bad data in $imagepid.\n"); } } # # Reset default values from site variables if they exist. # my $tmp; if (TBGetSiteVar("images/create/maxwait", \$tmp)) { $maxwait = $tmp * 60; } if (TBGetSiteVar("images/create/idlewait", \$tmp)) { $idlewait = $tmp * 60; } if (TBGetSiteVar("images/create/maxsize", \$tmp)) { $maximagesize = $tmp * 1024**3; } $idlewait = $maxwait if ($maxwait < $idlewait); $reportwait = $idlewait if ($idlewait < $reportwait); $checkwait = $reportwait if ($reportwait < $checkwait); # # Verify user and get his DB uid and other info for later. # my $this_user = User->ThisUser(); if (! defined($this_user)) { tbdie("You ($UID) do not exist!"); } my $user_uid = $this_user->uid(); my $user_name = $this_user->name(); my $user_email = $this_user->email(); # Check node and permission my $node = Node->Lookup($node_id); if (!defined($node)) { die("*** $0:\n". " Invalid node name $node_id!\n"); } if ($UID && ! $this_user->IsAdmin()) { $mereuser = 1; if (! $node->AccessCheck($this_user, TB_NODEACCESS_LOADIMAGE)) { die("*** $0:\n". " You do not have permission to create an image from $node\n"); } } # # Before doing anything else, check for overquota ... lets not waste # our time. Make sure user sees the error by exiting with 1. # if (system("$checkquota $user_uid") != 0) { die("*** $0:\n". " You are over your disk quota on $CONTROL; ". "please login there and cleanup!\n"); } # # We need the project id for test below. The target directory for the # output file has to be the node project directory, since that is the # directory that is going to be NFS mounted by default. # my $experiment = $node->Reservation(); if (!defined($experiment)) { die("*** $0:\n". " Could not map $node to its experiment object!\n"); } my $pid = $experiment->pid(); my $eid = $experiment->eid(); # # Grab the imageid description from the DB. We do a permission check, but # mostly to avoid hard to track errors that would result if the user picked # the wrong one (which is likely to happen no matter what I do). # my $image = Image->Lookup($imagepid, $imagename); if (!defined($image)) { die("*** $0:\n". " No such image descriptor $imagename in project $imagepid!\n"); } my $imageid = $image->imageid(); if ($mereuser && ! $image->AccessCheck($this_user, TB_IMAGEID_ACCESS)) { die("*** $0:\n". " You do not have permission to use imageid $imageid!\n"); } # # Make sure that the directory exists and is writeable for the user. # We test this by creating the file. Its going to get wiped anyway. # my $filename = $image->path(); my $isglobal = $image->global(); my $usepath = 0; # # Redirect pathname for global images. # if ($isglobal && ($filename =~ /^\/usr\/testbed/)) { $filename = PROJROOT() . "/$pid/images/" . basename($filename); print "*** WARNING: Writing global descriptor to $filename instead!\n"; # # XXX the Emulab config of the master server doesn't know this trick # so when it tries to lookup imageid emulab-ops/ it would # still map to /usr/testbed and fail because it cannot update images # outside of /{users,grouop,proj}. So we skirt the issue by passing # it the full path contructed here rather than the imageid. # $usepath = 1; } # # Make sure real path is someplace that makes sense; remember that the # image is created on the nodes, and it NFS mounts directories on ops. # Writing the image to anyplace else is just going to break things. # # Use realpath to resolve any symlinks. # my $translated = `realpath $filename`; if ($translated =~ /^([-\w\.\/\+]+)$/) { $filename = $1; } else { die("*** $0:\n". " Bad data returned by realpath: $translated\n"); } # Make sure not a directory. if (-d $filename) { die("*** $0:\n". " $filename is a directory! Must be a plain file.\n"); } # # The file must reside in an allowed directory. Since this script # runs as the caller, regular file permission checks ensure its a file # the user is allowed to use. # if (! TBValidUserDir($filename, 0)) { die("*** $0:\n". " $filename does not resolve to an allowed directory!\n"); } # # Be sure to kill off running frisbee. If a node is trying to load that # image, well tough. # system("$friskiller -k $imageid"); if ($?) { die("*** $0:\n". " Could not kill running frisbee for $imageid!\n"); } if (-e $filename) { unlink($filename) or die("*** $0:\n". " Could not delete $filename: $!\n"); } open(FILE, "> $filename") or die("*** $0:\n". " Could not create $filename: $!\n"); close(FILE) or die("*** $0:\n". " Could not truncate $filename: $!\n"); # # Get the disktype for this node # $node->disktype(\$devtype); $node->bootdisk_unit(\$devnum); $devtype = $def_devtype if (!defined($devtype)); $devnum = $def_devnum if (!defined($devnum)); my $device = "/dev/${devtype}${devnum}"; # # Record when this image was updated, so that we can figure out which # revision of the testbed image it was based off. # $image->MarkUpdateTime() == 0 or die("*** $0:\n". " Could not mark the update time in $image\n"); # # Okay, we want to build up a command line that will run the script on # on the client node. We use the imageid description to determine what # slice (or perhaps the entire disk) is going to be zipped up. We do not # allow arbitrary combos of course. # my $startslice = $image->loadpart(); my $loadlength = $image->loadlength(); my $command = "$createimage "; if ($usefup) { my $id = $usepath ? $filename : ($image->pid() . "/$imagename"); $command .= " -S $BOSSIP -F $id"; } if ($startslice || $loadlength == 1) { $command .= " -s $startslice"; } $command .= " $device"; if ($usefup || $usessh) { $command .= " -"; } else { $command .= " $filename"; } # # Go to the background since this is going to take a while. # if (!$debug) { $logfile = Logfile->Create($experiment->gid_idx()); fatal("Could not create a logfile") if (!defined($logfile)); # Mark it open since we are going to start using it right away. $logfile->Open(); # Logfile becomes the current spew, but save off the old spew. $experiment->SetLogFile($logfile, \$oldlogfile); if (my $childpid = TBBackGround($logfile->filename())) { # # Parent exits normally, except if in waitmode. # if (!$waitmode) { print("Your image from $node_id is being created\n". "You will be notified via email when the image has been\n". "completed, and you can load the image on another node.\n"); exit(0); } print("Waiting for image creation to complete\n"); print("You may type ^C at anytime; you will be notified via email;\n". "later; you will not actually interrupt image creation.\n"); # Give child a chance to run. select(undef, undef, undef, 0.25); # # Reset signal handlers. User can now kill this process, without # stopping the child. # $SIG{TERM} = 'DEFAULT'; $SIG{INT} = 'DEFAULT'; $SIG{QUIT} = 'DEFAULT'; # # Wait until child exits or until user gets bored and types ^C. # waitpid($childpid, 0); print("Done. Exited with status: $?\n"); exit($? >> 8); } } # # When in waitmode, must put ourselves in another process group so that # an interrupt to the parent will not have any effect on the backend. # if ($waitmode) { POSIX::setsid(); } # # From here on out, we should take care to clean up the DB, and # reboot the source node. # $needcleanup = 1; # Clear the bootlog; see below. $node->ClearBootLog(); # check_progress state my $runticks = 0; my $maxticks = int($maxwait / $checkwait); my $reportticks = int($reportwait / $checkwait); my $idleticks = 0; my $maxidleticks = int($idlewait / $checkwait); my $lastsize = 0; my $result; # # Reboot into admin mode and run the command. # Note that without a shared FS, we just boot the node into the admin MFS # and run the command via SSH, capturing the output. # my $me = $0; my %args = (); $args{'name'} = $me; $args{'prepare'} = 1; if ($usessh) { # # Put the node in admin mode... # $args{'on'} = 1; $args{'clearall'} = 0; if (TBAdminMfsSelect(\%args, undef, $node_id)) { $result = "setupfailed"; goto done; } # # ...boot it... # $args{'reboot'} = 1; $args{'retry'} = 0; $args{'wait'} = 1; my @failed = (); if (TBAdminMfsBoot(\%args, \@failed, $node_id)) { $result = "setupfailed"; goto done; } # # ...execute command and wait! # $result = run_with_ssh($command, $filename); if ($result eq "setupfailed") { goto done; } } else { $args{'command'} = $command; $args{'timeout'} = $maxwait + $checkwait; $args{'pfunc'} = \&check_progress; $args{'pinterval'} = $checkwait; my $retry = 1; while ($retry) { $retry = 0; if (TBAdminMfsRunCmd(\%args, undef, $node_id)) { $result = "setupfailed" if (!defined($result)); } } } # # XXX woeful backward compat hack. # The old client-side script will not recognize the -S and -F options # we pass in and will exit(-1). We detect that here and retry with # if ($usefup && $result eq "255") { print STDERR "MFS does not support frisbee upload, falling back on ", $NONFS ? "ssh" : "nfs", "...\n"; $command = "$createimage "; if ($startslice || $loadlength == 1) { $command .= " -s $startslice"; } $command .= " $device"; if ($usessh) { $command .= " -"; } else { $command .= " $filename"; } # reset state for check_progress $usefup = 0; $runticks = 0; $idleticks = 0; $lastsize = 0; $result = undef; if ($NONFS) { $result = run_with_ssh($command, $filename); } else { $result = run_with_ssh($command, undef); } } done: if (! cleanup()) { fatal("Problem encountered while cleaning up!\n"); } # # If we timed out, if the result code was bad, or if the image size # grew too large. # if ($result eq "setupfailed") { fatal("FAILED: Node setup failed ... \n"); } if ($result eq "timeout") { fatal("FAILED: Timed out generating image ... \n"); } if ($result eq "toobig") { fatal("FAILED: Maximum image size ($maximagesize bytes) exceeded ... \n"); } if ($result != 0) { fatal("FAILED: Returned error code $result generating image ... \n"); } # # Everything worked, create the hash signature file. # my $sigdir; ($sigdir = $filename) =~ s/^(.*)\/[^\/]+$/$1\/sigs/; mkdir($sigdir, 0770) if (! -d "$sigdir"); my $sigfilename; ($sigfilename = $filename) =~ s/^(.*)(\/[^\/]+$)/$1\/sigs$2.sig/; my $swmsg = ""; if (! -x $imagehash || system("$imagehash -c -o $sigfilename $filename") != 0) { warn("Could not create swapout signature file\n"); $swmsg = "WARNING: could not create swapout signature file $sigfilename\n". " You will not be able to save disk state for this image\n"; } else { print("Swapout signature file created\n"); } print "Image creation succeeded.\n"; print "Image written to $filename.\n"; # "Final size: " . (stat($filename))[7] . " bytes.\n"; # Append bootlog (which has prepare output) my $bootlog; if ($node->GetBootLog(\$bootlog) == 0) { print "\n\n"; print "------------------ Prepare Output ----------------\n"; print "$bootlog\n"; } SENDMAIL("$user_name <$user_email>", "Image Creation on $node_id Completed: $pid/$imagename", "Image creation on $node_id has completed. As you requested, the\n". "image has been written to $filename.\n". "You may now os_load this image on other nodes in your experiment.\n". "$swmsg", "$user_name <$user_email>", "Bcc: $TBLOGS", defined($logfile) ? ($logfile->filename()) : ()); if (defined($logfile)) { # Close up the log file so the webpage stops. $logfile->Close(); # And restore the original logfile as current spew. $experiment->SetLogFile($oldlogfile) if (defined($oldlogfile)); $logfile->Delete(1); } exit 0; sub cleanup () { $needcleanup = 0; # # Turn admin mode back off and reboot back to the old OS # my %args = (); $args{'name'} = $me; $args{'on'} = 0; $args{'clearall'} = 0; if (TBAdminMfsSelect(\%args, undef, $node_id)) { print("*** $me:\n". " Could not turn admin mode off for $node_id!\n"); return 0; } %args = (); $args{'name'} = $me; $args{'on'} = 0; $args{'reboot'} = 1; $args{'wait'} = 0; if (TBAdminMfsBoot(\%args, undef, $node_id)) { print("*** $me:\n". " Failed to reboot $node_id on cleanup!\n"); return 0; } return 1; } sub fatal($) { my($mesg) = $_[0]; print "$mesg\n"; if ($needcleanup && !cleanup()) { print "Encountered problems cleaning up!\n"; } # # Send a message to the testbed list. # SENDMAIL("$user_name <$user_email>", "Image Creation Failure on $node_id: $pid/$imagename", $mesg, "$user_name <$user_email>", "Cc: $TBOPS", defined($logfile) ? ($logfile->filename()) : ()); if (defined($logfile)) { # Close up the log file so the webpage stops. $logfile->Close(); # And restore the original logfile as current spew. $experiment->SetLogFile($oldlogfile) if (defined($oldlogfile)); $logfile->Delete(); # This was mailed so no longer needed. unlink("$logfile->filename()"); } exit(-1); } # # Check progress of image creation by periodically checking the image size. # # Called every $checkwait seconds. # Reports progress every $reportwait seconds. # Gives up after $idlewait seconds without a size change. # sub check_progress($$) { my (undef, $statusp) = @_; # # XXX frisbee uploader uploads into a temporary file and then moves # it into place. So track that tmp file here. # my $fname = $filename; if ($usefup) { $fname .= ".tmp"; } # # Command has finished for better or worse, record status and finish. # if (defined($statusp) && $statusp->{$node_id} ne "none") { $result = $statusp->{$node_id}; print "$node_id: image capture has completed\n"; return 0; } # # Has run too long # $runticks++; if ($runticks >= $maxticks) { $result = "timeout"; return 0; } # # See if imagezip on the node is making progress. If not, we need to # check the idle timer and timeout if we have taken too long. # # Also, check to see if the (somewhat arbitrary) maximum filesize has # been exceeded. # my $cursize = (stat($fname))[7]; if ($usefup && !defined($cursize)) { # # XXX avoid an ugly race. # When done, frisuploadd moves foo.tmp -> foo # If we didn't find foo.tmp, try foo now. # $fname =~ s/\.tmp$//; $cursize = (stat($fname))[7]; } if ($cursize > $maximagesize) { $result = "toobig"; return 0; } if ($cursize == $lastsize) { $idleticks++; if ($idleticks >= $maxidleticks) { $result = "timeout"; return 0; } } else { $idleticks = 0; } $lastsize = $cursize; if (($runticks % $reportticks) == 0) { my $curtdiff = int($runticks * $checkwait / 60); print "Still waiting ... its been ". $curtdiff ." minutes.". " Current image size: $cursize bytes.\n"; } return 1; } sub run_with_ssh($$) { my ($cmd,$output) = @_; my $stat = undef; $cmd = "$TB/bin/sshtb -n -host $node_id $cmd"; if (defined($output)) { $cmd .= " > $output"; } print STDERR "About to: '$cmd' as uid $UID\n" if ($debug); my $mypid = fork(); if ($mypid < 0) { return "setupfailed"; } # # Child. Just do it. # if ($mypid == 0) { my $stat = 0; if (system($cmd)) { $stat = $?; } if ($stat & 127) { # died with a signal, return the signal exit($stat & 127); } exit($stat >> 8); } # # Parent. Wait for ssh to finish, reporting periodic progress # as TBAdminMfsRunCmd would do. # my $endtime = time() + $maxwait + $checkwait; while (1) { my $kid = waitpid($mypid, &WNOHANG); # ssh finished if ($kid == $mypid) { $stat = $?; if ($stat & 127) { # died with a signal, return the signal $stat = $stat & 127; } else { # else return the exit code $stat = $stat >> 8; } last; } # huh? if ($kid == -1) { $stat = -1; last; } # check on progress if (!check_progress(undef, undef)) { $stat = $result; last; } # wait for awhile sleep($checkwait); if (time() >= $endtime) { $stat = "timeout"; last; } } return $stat; }