manage_instance.in 29.9 KB
Newer Older
1
2
#!/usr/bin/perl -w
#
3
# Copyright (c) 2000-2015 University of Utah and the Flux Group.
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
#
use English;
use strict;
use Getopt::Std;
use XML::Simple;
use Data::Dumper;
use CGI;
use POSIX ":sys_wait_h";
31
use POSIX qw(setsid close);
32
33
34
35
36
37

#
# Back-end script to manage APT profiles.
#
sub usage()
{
Leigh B Stoller's avatar
Leigh B Stoller committed
38
39
    print("Usage: manage_instance snapshot instance ".
	  "[-n node_id] [-i imagename] [-u node|all]\n");
40
    print("Usage: manage_instance consoleurl instance node\n");
41
    print("Usage: manage_instance extend instance [-f] seconds\n");
42
43
    print("Usage: manage_instance terminate instance\n");
    print("Usage: manage_instance refresh instance\n");
44
    print("Usage: manage_instance reboot instance node_id [node_id ...]\n");
45
    print("Usage: manage_instance reload instance node_id [node_id ...]\n");
46
    print("Usage: manage_instance monitor instance\n");
47
    print("Usage: manage_instance lockdown instance set|clear user|admin\n");
48
49
    exit(-1);
}
50
my $optlist     = "dt:";
51
52
my $debug       = 0;
my $webtask_id;
53
my $webtask;
54
55
56
57
58
59
60
61

#
# Configure variables
#
my $TB		= "@prefix@";
my $TBOPS       = "@TBOPSEMAIL@";
my $QUICKVM     = "$TB/sbin/protogeni/quickvm";

62
# Debugging
Leigh B Stoller's avatar
Leigh B Stoller committed
63
my $usemydevtree = 0;
64

65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#
# Untaint the path
#
$ENV{'PATH'} = "$TB/bin:$TB/sbin:/bin:/usr/bin:/usr/bin:/usr/sbin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Turn off line buffering on output
#
$| = 1;

#
# Load the Testbed support stuff.
#
use lib "@prefix@/lib";
use EmulabConstants;
use emdb;
use emutil;
83
use libtestbed;
84
85
86
87
use User;
use Project;
use APT_Profile;
use APT_Instance;
88
use APT_Geni;
89
90
use GeniXML;
use GeniHRN;
91
92
93
use Genixmlrpc;
use GeniResponse;
use GeniSlice;
94
use WebTask;
95
use EmulabFeatures;
96
97
98
99

# Protos
sub fatal($);
sub DoSnapshot();
100
sub DoConsole();
101
102
103
sub DoTerminate();
sub DoExtend();
sub DoRefresh();
104
sub DoReboot();
105
sub DoReload();
106
sub DoLockdown();
107
sub StartMonitor();
108
109
110
111
112
113
114
115
116
117
118
119

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (defined($options{"t"})) {
    $webtask_id = $options{"t"};
}
120
121
122
123
if (defined($options{"d"})) {
    $debug++;
}
if (@ARGV < 2) {
124
125
    usage();
}
126
my $action   = shift(@ARGV);
127
128
my $uuid     = shift(@ARGV);
my $instance = APT_Instance->Lookup($uuid);
129
130
131
if (!defined($instance)) {
    $instance = APT_Instance->LookupBySlice($uuid);
}
132
133
134
135
if (!defined($instance)) {
    fatal("No such instance $uuid");
}

136
if ($action eq "snapshot") {
137
138
    DoSnapshot();
}
139
140
141
142
if ($action eq "extend") {
    DoExtend();
}
elsif ($action eq "consoleurl") {
143
144
    DoConsole()
}
145
146
147
148
149
150
elsif ($action eq "terminate") {
    DoTerminate()
}
elsif ($action eq "refresh") {
    DoRefresh()
}
151
152
153
elsif ($action eq "reboot") {
    DoReboot()
}
154
155
156
elsif ($action eq "reload") {
    DoReload()
}
157
158
159
elsif ($action eq "monitor") {
    StartMonitor()
}
160
161
162
elsif ($action eq "lockdown") {
    DoLockdown()
}
163
164
165
else {
    usage();
}
166
167
168
169
170
171
172
exit(0);

#
# Take a snapshot. Implies a single node instance, for now.
#
sub DoSnapshot()
{
173
174
175
176
177
    my $errmsg;
    my $logfile;
    my $errcode        = 1;
    my $needunlock     = 0;
    my $old_status     = $instance->status();
178
179
180
    my $node_id;
    my $imagename;
    my $update_profile;
181
    my $update_prepare = 0;
182

183
    my $optlist = "n:i:u:U";
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
    my %options = ();
    if (! getopts($optlist, \%options)) {
	usage();
    }
    if (defined($options{"n"})) {
	$node_id = $options{"n"};
    }
    if (defined($options{"i"})) {
	$imagename = $options{"i"};
    }
    if (defined($options{"u"})) {
	$update_profile = $options{"u"};
	if ($update_profile !~ /^(node|all)$/) {
	    usage();
	}
    }
200
201
202
    if (defined($options{"U"})) {
	$update_prepare = 1;
    }
203
204
205
206
207
208
209
210
    
    if ($old_status ne "ready") {
	fatal("Instance must be in the ready state to take a snapshot");
    }
    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
	fatal("No slice for quick VM: $uuid");
    }
211
    my $authority = $instance->GetGeniAuthority();
212
213
    my $cmurl     = $authority->url();
    $cmurl =~ s/protogeni/protogeni\/stoller/ if ($usemydevtree);
214
    
Leigh B Stoller's avatar
Leigh B Stoller committed
215
216
217
218
219
220
221
222
    # The web interface (and in the future the xmlrpc interface) sets this.
    my $this_user = User->ImpliedUser();
    if (! defined($this_user)) {
	$this_user = User->ThisUser();
	if (!defined($this_user)) {
	    fatal("You ($UID) do not exist!");
	}
    }
223
    
224
    #
225
    # Might be a clone (manage_profile).
226
    #
227
    my $sliver_urn;
228
    
229
    my $profile = APT_Profile->Lookup($instance->profile_id());
230
231
232
    if (!defined($profile)) {
	fatal("Could not lookup profile for instance");
    }
233
234
235
236
237
238
239
240
    my $project = Project->Lookup($profile->pid_idx());
    if (!defined($project)) {
	fatal("Could not lookup project for profile");
    }
    
    if (defined($node_id)) {
	if (!defined($imagename)) {
	    $imagename = $profile->name() . "." . $node_id;
241
	}
242
243
    }
    else {
244
245
	if (!defined($imagename)) {
	    $imagename = $profile->name();
246
	}
247
248
249
250
251
252
253
254
255
    }
    
    #
    # Sanity check to make sure there is just one node. 
    #
    my $manifest = GeniXML::Parse($instance->manifest());
    if (! defined($manifest)) {
	fatal("Could not parse manifest");
    }
256
    my $node;
257
    my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
258
    if (!defined($node_id)) {
259
	# We snapshot the one node in the instance.
260
261
262
	if (@nodes != 1) {
	    fatal("Too many nodes (> 1) to snapshot");
	}
263
	($node)     = @nodes;
264
	$sliver_urn = GeniXML::GetSliverId($node);
265
    }
266
    else {
267
268
	foreach my $ref (@nodes) {
	    my $client_id = GeniXML::GetVirtualId($ref);
269
	    if ($node_id eq $client_id) {
270
271
		$sliver_urn = GeniXML::GetSliverId($ref);
		$node = $ref;
272
273
274
275
276
		last;
	    }
	}
	if (!defined($sliver_urn)) {
	    fatal("Could not find node '$node_id' in manifest");
277
278
279
	}
    }

280
    #
281
282
283
    # We are not going to allow this if the instance is on a different
    # cluster then where the image was originally created, since otherwise
    # the image provenancewill look like spaghetti. 
284
    #
285
    if (defined($update_profile)) {
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
	my $diskref = GeniXML::GetDiskImage($node);
	if (defined($diskref)) {
	    my $image_url = GeniXML::GetText("url", $diskref);
	    if (defined($image_url)) {
		require URI;

		# Get the hostname for the image URL.
		my $uri = URI->new($image_url);
		if (!defined($uri)) {
		    fatal("Could not parse $image_url");
		}
		my $image_host = $uri->host();

		# Get the hostname for the authority.
		$uri = URI->new($authority->url());
		if (!defined($uri)) {
		    fatal("Could not parse authority URL");
		}
		my $authority_host = $uri->host();

		# Compare domains.
		$image_host =~ s/^([^.]+\.)//;
		$authority_host =~ s/^([^.]+\.)//;
	
		if ($image_host ne $authority_host) {
		    $errmsg  = "Not allowed to take a snapshot on this cluster";
		    $errcode = 1;
		    goto bad;
		}
	    }
	}
    }
    if ($slice->Lock()) {
	fatal("Slice is busy, cannot lock it");
    }
    $needunlock = 1;
322
323
324
325
326
327
328
329
330
331
332

    #
    # Create the webtask object, but AFTER locking the slice so we do
    # not destroy one in use.
    #
    if (defined($webtask_id)) {
	$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
	# Convenient.
	$webtask->AutoStore(1);
    }

333
334
335
336
337
338
339
340
341
342
343
344
345
346
    my $geniuser  = $instance->GetGeniUser();
    my $context   = APT_Geni::GeniContext();
    if (! (defined($geniuser) && defined($authority) &&
	   defined($slice) && defined($context))) {
	$errmsg = "Internal error getting instance objects";
	goto bad;
    }
    my ($slice_credential, $speaksfor_credential) =
	APT_Geni::GenCredentials($slice, $geniuser);
    if (! (defined($speaksfor_credential) && defined($slice_credential))) {
	$errmsg = "Internal error getting credentials";
	goto bad;
    }
    $instance->SetStatus("imaging");
347

348
349
350
351
    my $args = {
	"slice_urn"   => $slice->urn(),
	"imagename"   => $imagename,
	"sliver_urn"  => $sliver_urn,
352
	"global"      => 1,
353
354
355
	"credentials" => [$slice_credential->asString(),
			  $speaksfor_credential->asString()],
    };
356
357
358
    if ($update_prepare) {
	$args->{'update_prepare'} = 1;
    }
359
360
    #
    # This returns pretty fast, and then the imaging takes place in
361
    # the background at the aggregate. 
362
    #
363
    my $response = Genixmlrpc::CallMethod($cmurl,
364
365
366
367
368
					  $context, "CreateImage", $args);
    if (!defined($response)) {
	$errmsg = "Internal error creating image";
	$instance->SetStatus($old_status);
	goto bad;
369
    }
370
371
372
373
374
375
376
377
378
379
380
381
382
383
    if ($response->code() != GENIRESPONSE_SUCCESS) {
	$errmsg = "Could not create image: " . $response->output() . "\n";
	$instance->SetStatus($old_status);
	goto bad;
    }
    my ($image_urn, $image_url,
	$version_urn, $version_url) = @{ $response->value() };
    if (!defined($version_urn)) {
	$version_urn = $image_urn;
	$version_url = $image_url
    }
    if (defined($webtask)) {
	$webtask->image_urn($version_urn);
	$webtask->image_url($version_url);
384
385
    }
    else {
386
	print "$image_urn,$image_url\n";
387
388
389
390
391
392
    }

    #
    # Exit and leave child to poll.
    #
    if (! $debug) {
393
394
395
396
        $logfile = TBMakeLogname("snapshot");
	
	if (my $childpid = TBBackGround($logfile)) {
	    # Parent exits normally, web interface watches.
397
398
399
400
401
	    exit(0);
	}
	# Let parent exit;
	sleep(2);
    }
402
403
404
405
406
407
408
409
    # Bind the process id. This is important when the caller is
    # manage_profile, doing a clone.
    $webtask->SetProcessID($PID)
	if (defined($webtask));

    #
    # Poll for a reasonable amount of time.
    #
410
411
    my $seconds  = 1200;
    my $interval = 5;
412
    my $ready    = 0;
413
    my $sliver_ready = 0;
414
415
416
417
418
419
    my $failed   = 0;
    # do not want this in the args for calls below.
    delete($args->{'sliver_urn'});
    delete($args->{'imagename'});
    # But we need this;
    $args->{'image_urn'} = $image_urn;
420

421
    while ($seconds > 0) {
422
423
	sleep($interval);
	$seconds -= $interval;
424
425
    
	my $response =
426
	    Genixmlrpc::CallMethod($cmurl, $context, "SliverStatus", $args);
427
428
429
430
431
432
433
434
435

	if ($response->code() != GENIRESPONSE_SUCCESS &&
	    $response->code() != GENIRESPONSE_BUSY) {
	    $errmsg = "Sliverstatus failed: ". $response->output() . "\n";
	    $failed = 1;
	    last;
	}
	next
	    if ($response->code() == GENIRESPONSE_BUSY);
436

437
438
	my $blob = $response->value();
	if (defined($webtask)) {
439
440
441
442
443
444
445
446
447
448
449
	    # Always stick in full status for web interface.
	    my $statusblob = {};
	    foreach my $urn (keys(%{$blob->{'details'}})) {
		my $details = $blob->{'details'}->{$urn};
		my $node_id = $details->{'client_id'};
		$statusblob->{$node_id} = $details;
		# Special for imaging status display
		if ($urn eq $sliver_urn) {
		    $webtask->state($details->{'state'});
		    $webtask->rawstate($details->{'rawstate'});
		}
450
	    }
451
	    $webtask->sliverstatus($statusblob)
452
453
454
455
456
	}
	if ($blob->{'status'} eq "failed") {
	    $failed = 1;
	    last;
	}
457
458
459
460
	elsif ($blob->{'status'} eq "ready") {
	    $sliver_ready = 1;
	}
	
461
462
463
464
	#
	# We are watching for the image status to report ready or failed.
	#
	$response =
465
	    Genixmlrpc::CallMethod($cmurl, $context, "ImageInfo", $args);
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490

	if ($response->code() != GENIRESPONSE_SUCCESS &&
	    $response->code() != GENIRESPONSE_BUSY) {
	    $errmsg = "Imageinfo failed: ". $response->output() . "\n";
	    $failed = 1;
	    last;
	}
	next
	    if ($response->code() == GENIRESPONSE_BUSY);

	$blob = $response->value();
	if (defined($webtask)) {
	    $webtask->image_size($blob->{'size'}) 	
		if (exists($blob->{'size'}));
	    $webtask->image_status($blob->{'status'})
		if (exists($blob->{'status'}));
	}
	if ($blob->{'status'} eq "ready") {
	    $ready = 1;
	    last;
	}
	elsif ($blob->{'status'} eq "failed") {
	    $failed = 1;
	    last;
	}
491
    }
492
493
494
495
496
    if ($failed) {
	$errmsg = "Imaging failed"
	    if (!defined($errmsg));
	$errcode = 1;
	goto bad;
497
    }
498
499
500
501
502
    elsif (!$ready) {
	$errmsg  = "Imaging timed out";
	$errcode = 60;
	goto bad;
    }
503
    elsif (defined($update_profile)) {
504
505
506
507
508
509
	#
	# If successful, we create a new version of the profile and
	# update the rspec to reflect the new image version. Note
	# that we expect the CM is doing image versioning, so do not
	# bother to check if the image version is actually new.
	#
510
511
512
513
514
	my $doversions =
	    EmulabFeatures->FeatureEnabled("APT_ProfileVersions",
					   $this_user, $project);

	if ($doversions) {
515
516
517
518
519
520
521
	    $profile = $profile->NewVersion($this_user);
	    if (!defined($profile)) {
		print STDERR "Could not create new profile version\n";
		$webtask->Exited(70)
		    if (defined($webtask));
		exit(1);
	    }
522
	}
523
524
	$profile->UpdateDiskImage($node_id, $version_url,
				  ($update_profile eq "all" ? 1 : 0));
525
    }
526
527
528
529
530
531
532
533
    $instance->SetStatus("ready");
    # We garbage collect these later, so anyone waiting has a chance
    # to see the exit status
    $webtask->Exited(0)
	if (defined($webtask));
    $slice->UnLock();
    unlink($logfile)
	if (defined($logfile));
534
    exit(0);
535
  bad:
536
537
538
539
540
541
542
543
544
545
546
    if ($sliver_ready) {
	#
	# If the sliver comes back ready in spite of the imaging failure,
	# then change the instance back to ready. User will already know
	# that the imaging failed.
	#
	$instance->SetStatus("ready");
    }
    else {
	$instance->SetStatus("imaging-failed");
    }
547
548
549
550
551
    print STDERR "$errmsg\n";
    if (defined($errmsg)) {
	$webtask->Exited($errcode);
	$webtask->output($errmsg);
    }
552
553
    $slice->UnLock()
	if ($needunlock);
554
555
556
557
558
559
560
561
562
563
    if (defined($logfile)) {
	$instance->Brand()->SendEmail($instance->Brand()->OpsEmailAddress(),
				      "Snapshot failed",
				      "Error taking snapshot of $instance:\n\n".
				      "$errmsg\n",
				      $instance->Brand()->OpsEmailAddress(),
				      undef, $logfile);
	unlink($logfile);
    }
    exit($errcode);
564
}
565
566
567
568
569
570

#
# Ask the console URL for a node in an instance.
#
sub DoConsole()
{
571
    usage()
572
573
	if (!@ARGV);
    my $node_id = shift(@ARGV);
574
575
576
577
578
579

    if (defined($webtask_id)) {
	$webtask = WebTask->LookupOrCreate(undef, $webtask_id);
	if (!defined($webtask)) {
	    fatal("Could not lookup/create webtask for $webtask_id");
	}
580
581
	# Convenient.
	$webtask->AutoStore(1);
582
    }
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
    
    #
    # Sanity check to make sure the node is really in the rspec, since
    # we need its sliver urn to ask for the console url.
    #
    my $sliver_urn;
    my $manifest = GeniXML::Parse($instance->manifest());
    if (! defined($manifest)) {
	fatal("Could not parse manifest");
    }
    my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
    foreach my $node (@nodes) {
	my $client_id = GeniXML::GetVirtualId($node);
	if ($node_id eq $client_id) {
	    $sliver_urn = GeniXML::GetSliverId($node);
	}
    }
    if (!defined($sliver_urn)) {
	fatal("Could not find node '$node_id' in manifest");
    }
Leigh B Stoller's avatar
Leigh B Stoller committed
603
    my $response = $instance->ConsoleInfo($sliver_urn);
604
    if (!defined($response)) {
Leigh B Stoller's avatar
Leigh B Stoller committed
605
606
607
608
609
610
611
612
	fatal("RPC Error calling ConsoleInfo");
    }
    if ($response->code() != GENIRESPONSE_SUCCESS) {
	$response = $instance->ConsoleURL($sliver_urn);
	if (!defined($response)) {
	    fatal("RPC Error calling ConsoleURL");
	}
	if ($response->code() != GENIRESPONSE_SUCCESS) {
Leigh B Stoller's avatar
Leigh B Stoller committed
613
614
615
	    if ($response->value()) {
		fatal($response->output());
	    }
616
617
	    fatal("Server returned error: " .
		  GENIRESPONSE_STRING($response->code));
Leigh B Stoller's avatar
Leigh B Stoller committed
618
619
620
621
622
623
624
625
626
627
628
629
	}
    }
    my $url;
    my $pswd;
	
    if (ref($response->value())) {
	$url  = $response->value()->{'url'};
	$pswd = $response->value()->{'password'}
	    if (exists($response->value()->{'password'}));
    }
    else {
	$url = $response->value();
630
    }
631
    if (defined($webtask)) {
Leigh B Stoller's avatar
Leigh B Stoller committed
632
633
634
635
636
637
638
	if ($response->code()) {
	    $webtask->output($response->output());
	}
	else {
	    $webtask->url($url);
	    $webtask->password($pswd) if (defined($pswd));
	}
639
	$webtask->Exited($response->code());
Leigh B Stoller's avatar
Leigh B Stoller committed
640
	exit($response->code());
641
642
643
644
645
    }
    # For command line operation too.
    if ($response->code()) {
	fatal($response->output());
    }
Leigh B Stoller's avatar
Leigh B Stoller committed
646
647
    print $url . "\n";
    print $pswd . "\n" if (defined($pswd));
648
649
    exit(0);
}
650

651
652
653
654
655
656
657
658
659
660
#
# Terminate
#
sub DoTerminate()
{
    my $errmsg;
    my $logfile;
    
    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
661
662
663
664
	#
	# No slice (typically) means we never got far enough to the
	# get the sliver created on the backend cluster.
	#
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
	goto killit;
    }
    #
    # Lock the slice in case it is doing something else, like taking
    # a disk image.
    #
    if ($slice->Lock()) {
	fatal("Slice is busy, cannot lock it");
    }
    # So we can communicate errors back to the web interface.
    if (defined($webtask_id)) {
	$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
	if (!defined($webtask)) {
	    $slice->UnLock();
	    fatal("Could not lookup/create webtask for $webtask_id");
	}
	$webtask->AutoStore(1);
    }
    my $old_status = $instance->status();
    $instance->SetStatus("terminating");

    #
    # Exit and let caller poll for status.
    #
    if (defined($webtask) && !$debug) {
        $logfile = TBMakeLogname("terminate");
	
	if (my $childpid = TBBackGround($logfile)) {
	    my $status = 0;
	    #
	    # Wait a couple of seconds to see if there is going to be an
	    # immediate error. Then return and let it continue to run. This
	    # allows the web server to see quick errors. Later errors will
	    # have to be emailed. 
	    #
	    sleep(3);
	    my $foo = waitpid($childpid, &WNOHANG);
	    if ($foo) {
		$status = $? >> 8;
	    }
	    exit($status);
	}
    }
    my $response = $instance->Terminate();
    if (!defined($response)) {
	$errmsg = "RPC Error calling Terminate";
	goto bad;
    }

    # SEARCHFAILED is success.
    if ($response->code() != GENIRESPONSE_SUCCESS &&
	$response->code() != GENIRESPONSE_SEARCHFAILED) {
	if ($response->code() == GENIRESPONSE_BUSY) {
	    $errmsg = "Slice was busy for too long; try again later?";
	    goto bad;
	}
	$errmsg = "Could not delete slice: ". $response->output();
	goto bad;
    }
    $slice->Delete();
    $instance->RecordHistory();
  killit:
    $instance->Delete();
    $webtask->Exited(0)
	if (defined($webtask));
    unlink($logfile)
	if (defined($logfile));
    exit(0);
  bad:
    print STDERR $errmsg . "\n";
    $instance->SetStatus($old_status);
    $slice->UnLock();
    if (defined($webtask)) {
	$webtask->output($errmsg);
	$webtask->Exited(1);
    }
    if (defined($logfile)) {
	$instance->Brand()->SendEmail($instance->Brand()->OpsEmailAddress(),
				      "Unable to terminate instance $uuid",
				      "Error terminating $instance:\n\n".
				      "$errmsg\n",
				      $instance->Brand()->OpsEmailAddress(),
				      undef, $logfile);
	unlink($logfile);
    }
    exit(1);
}

#
# Extend.
#
sub DoExtend()
{
758
759
    my $force = 0;
    
760
761
    usage()
	if (!@ARGV);
762
763
764
765
766
767
768
769
770
771

    if (@ARGV == 2) {
	my $arg = shift(@ARGV);
	if ($arg eq "-f") {
	    $force = 1;
	}
	else {
	    usage();
	}
    }
772
773
774
775
776
    my $seconds = shift(@ARGV);
    if ($seconds !~ /^\d*$/) {
	usage();
    }

777
    if ($instance->status() eq "failed" && !$force) {
778
779
780
781
782
783
	fatal("Cannot extend failed instance!");
    }
    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
	fatal("No slice for instance!");
    }
784
785
786
787
788
789
    # The web interface (and in the future the xmlrpc interface) sets this.
    my $this_user = User->ImpliedUser();
    if (! defined($this_user)) {
	$this_user = User->ThisUser();
    }

790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
    #
    # Lock the slice in case it is doing something else, like taking
    # a disk image.
    #
    if ($slice->Lock()) {
	fatal("Slice is busy, cannot lock it");
    }
    # Save in case of error.
    my $oldexpires = $slice->expires();

    # Need to update slice before creating new credential. 
    $slice->AddToExpiration($seconds);
    my $new_expires = $slice->ExpirationGMT();
    
    my $response = $instance->Extend($new_expires);
    if (!defined($response)) {
	$slice->UnLock();
	fatal("Internal error calling Extend");
    }
    if ($response->code() != GENIRESPONSE_SUCCESS) {
	# Reset back to original expiration, sorry.
	$slice->SetExpiration($oldexpires);
	$slice->UnLock();
	# This is something the user should see.
814
815
	if ($response->code() == GENIRESPONSE_REFUSED ||
	    $response->code() == GENIRESPONSE_BUSY) {
816
817
818
819
820
821
	    print STDERR $response->output() . "\n";
	    # For web interface.
	    exit(1);
	}
	fatal("Failed to extend slice: ". $response->output())
    }
822
823
824
825
826
827
828
829
830
831
    if (defined($this_user) && $this_user->IsAdmin() &&
	($seconds / (24 * 60 * 60)) > 10) {
	if (DoLockdownInternal("set", "admin")) {
	    SENDMAIL($TBOPS,
		     "Failed to lock down APT Instance",
		     "Failed to lock down $instance\n".
		     $instance->webURL() . "\n",
		     $TBOPS);
	}
    }
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
    $slice->UnLock();
    exit(0);
}

#
# Refresh; ask the aggregate for status and set the instance status
# accordingly.
#
sub DoRefresh()
{
    my $errmsg;
    
    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
	print STDERR "No slice for instance\n";
	goto killit;
    }
849
    
850
851
852
853
854
    #
    # Lock the slice in case it is doing something else, like taking
    # a disk image.
    #
    if ($slice->Lock()) {
855
856
857
858
859
860
861
862
863
864
865
	$errmsg = "Experiment is busy, cannot lock it. Please try again later";
	goto bad;
    }
    #
    # Create the webtask object, but AFTER locking the slice so we do
    # not destroy one in use.
    #
    if (defined($webtask_id)) {
	$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
	# Convenient.
	$webtask->AutoStore(1);
866
867
868
869
870
871
872
873
874
875
    }

    my $response = $instance->SliceStatus();
    if (!defined($response)) {
	$errmsg = "RPC Error calling SliceStatus";
	goto bad;
    }

    if ($response->code() != GENIRESPONSE_SUCCESS) {
	if ($response->code() == GENIRESPONSE_SEARCHFAILED) {
876
877
	    $errmsg = "Slice is gone";
	    goto bad;
878
879
	}
	if ($response->code() == GENIRESPONSE_BUSY) {
880
	    $errmsg = "Slice is busy; try again later";
881
882
883
884
885
886
887
888
889
890
891
892
	    goto bad;
	}
	$errmsg = "Could not get status: ". $response->output();
	goto bad;
    }
    my $blob = $response->value();
    if ($blob->{'status'} eq "ready") {
	$instance->SetStatus("ready");
    }
    elsif ($blob->{'status'} eq "failed") {
	$instance->SetStatus("failed");
    }
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
    #
    # Convert to something smaller, with info the web interface
    # cares about. 
    #
    my $statusblob = {};
    foreach my $urn (keys(%{$blob->{'details'}})) {
	my $details = $blob->{'details'}->{$urn};
	my $node_id = $details->{'client_id'};
	$statusblob->{$node_id} = $details;
    }
    if (defined($webtask)) {
	$webtask->sliverstatus($statusblob);
    }
    if ($debug) {
	print STDERR Dumper($statusblob);
    }
    $slice->UnLock();
    exit(0);
  killit:
    $instance->RecordHistory();
    $instance->Delete();
    exit(0);
  bad:
    $slice->UnLock();
    print STDERR $errmsg . "\n";
    if (defined($webtask)) {
	$webtask->output($errmsg);
	$webtask->Exited(1);
    }
    exit(1);
}

#
926
# Reboot or Reload nodes.
927
#
928
sub DoRebootOrReload($)
929
{
930
    my ($which) = @_;
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
    my $errmsg;
    my @slivers = ();
    
    usage()
	if (!@ARGV);

    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
	print STDERR "No slice for instance\n";
	goto killit;
    }

    my $manifest = GeniXML::Parse($instance->manifest());
    if (! defined($manifest)) {
	fatal("Could not parse manifest");
    }
    my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
    foreach my $node (@nodes) {
	my $client_id = GeniXML::GetVirtualId($node);
	if (grep {$_ eq $client_id} @ARGV) {
	    my $sliver_urn = GeniXML::GetSliverId($node);
	    if (!defined($sliver_urn)) {
		fatal("No sliver id for $client_id");
	    }
	    push(@slivers, $sliver_urn);
	}
    }
    
    #
    # Lock the slice in case it is doing something else, like taking
    # a disk image.
    #
    if ($slice->Lock()) {
	$errmsg = "Experiment is busy, cannot lock it. Please try again later";
	goto bad;
    }

    #
    # Create the webtask object, but AFTER locking the slice so we do
    # not destroy one in use.
    #
    if (defined($webtask_id)) {
	$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
	# Convenient.
	$webtask->AutoStore(1);
    }

978
    my $response = $instance->SliverAction($which, @slivers);
979
    if (!defined($response)) {
980
	$errmsg = "RPC Error calling SliverAction";
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
	goto bad;
    }

    if ($response->code() != GENIRESPONSE_SUCCESS) {
	if ($response->code() == GENIRESPONSE_SEARCHFAILED) {
	    $errmsg = "Slice is gone";
	    goto bad;
	}
	if ($response->code() == GENIRESPONSE_BUSY) {
	    $errmsg = "Experiment is busy; try again later";
	    goto bad;
	}
	$errmsg = $response->output();
	goto bad;
    }
996
    $slice->UnLock();
997
998
999
1000
1001
1002
1003
1004
1005
    if (defined($webtask)) {
	$webtask->Exited(0);
    }
    #
    # Start the monitor so the web interface will see when the node
    # has actually come back up.
    #
    # XXX This will not return unless a monitor is already running.
    StartMonitor();
1006
1007
1008
1009
1010
1011
1012
1013
    exit(0);
  killit:
    $instance->RecordHistory();
    $instance->Delete();
    exit(0);
  bad:
    $slice->UnLock();
    print STDERR $errmsg . "\n";
1014
1015
1016
1017
    if (defined($webtask)) {
	$webtask->output($errmsg);
	$webtask->Exited(1);
    }
1018
1019
    exit(1);
}
1020
1021
sub DoReboot() { return DoRebootOrReload("reboot"); }
sub DoReload() { return DoRebootOrReload("reload"); }
1022

1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
#
# Start up the monitor for an instance. Only one though.
#
sub StartMonitor()
{
    my $logfile;
    my $needunlock = 0;
    
    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
	fatal("No slice for instance");
    }
    if ($instance->monitor_pid()) {
	my $pid = $instance->monitor_pid();
	if (kill(0, $pid)) {
	    print STDERR "Monitor already running ($pid). ".
		"Kill it before starting a new one.\n";
	    exit(0);
	}
	$instance->Update({"monitor_pid" => 0});
    }
    if (!$debug) {
	$logfile = TBMakeLogname("aptmonitor");
	if (TBBackGround($logfile)) {
	    exit(0);
	}
    }
    $instance->Update({"monitor_pid" => '$PID'});
    
    #
    # Need a TERM handler to clean things up.
    #
    my $handler = sub {
	unlink($logfile)
	    if (defined($logfile));
	$slice->UnLock()
	    if ($needunlock);
	exit(0);
    };
    local $SIG{TERM} = $handler;
    if ($debug) {
	local $SIG{INT} = $handler;
    }
    my $seconds  = 1500;
    my $interval = 15;
    # Shorten default timeout now.
    Genixmlrpc->SetTimeout(60);

    while ($seconds > 0) {
	sleep($interval);
	$seconds -= $interval;

	#
	# Lock the slice in case it is doing something else, like taking
	# a disk image. Just skip this turn.
	#
	next
	    if ($slice->Lock());
	$needunlock = 1;

	if (defined($webtask_id)) {
	    $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
	    # Convenient.
	    $webtask->AutoStore(1);
	}
	my $response = $instance->SliceStatus();
	if (!defined($response)) {
	    print STDERR "RPC Error calling SliceStatus\n";
	    goto skip;
	}
	if (($response->code() != GENIRESPONSE_SUCCESS &&
	     $response->code() != GENIRESPONSE_BUSY)) {
	    print STDERR "SliverStatus failed";
	    print STDERR ": " . $response->output() . "\n";
	    if (defined($webtask)) {
		if ($response->output() =~ /read timeout/) {
		    $webtask->output("Lost contact with the aggregate. " .
				     "Possibly a network failure, ".
				     "please try again later.");
		}
		else {
		    $webtask->output($response->output());
		}
	    }
	    $slice->UnLock();
	    $needunlock = 0;
	    last;
	}
	goto skip
	    if ($response->code() == GENIRESPONSE_BUSY);
    
	my $blob = $response->value();
	#
	# Convert to something smaller, with info the web interface
	# cares about. 
	#
	my $statusblob = {};
	foreach my $urn (keys(%{$blob->{'details'}})) {
	    my $details = $blob->{'details'}->{$urn};
	    my $node_id = $details->{'client_id'};
	    $statusblob->{$node_id} = $details;
	}
	if ($debug) {
	    print STDERR Dumper($statusblob);
	}
	$webtask->sliverstatus($statusblob)
	    if (defined($webtask));
	#
	# We poll until the status goes ready, to avoid continuous polling
	# for dozens of instances. Needs more thought.
	#
	if ($blob->{'status'} eq "ready") {
	    $slice->UnLock();
	    $needunlock = 0;
	    last;
	}
      skip:
	$slice->UnLock();
	$needunlock = 0;
    }
    unlink($logfile)
	if (defined($logfile));
    exit(0);
}

1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
#
# Experiment lockdown.
#
sub DoLockdownInternal($$)
{
    my ($setclr,$which) = @_;
    
    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
	fatal("No slice for instance");
    }
    if ($which eq "all") {
	if ($instance->SetLockdown("user", ($setclr eq "clear" ? 1 : 0))) {
	    print STDERR "Could not update instance lockdown\n";
	    return -1
	}
	$which = "admin"
    }
    if ($instance->SetLockdown($which, ($setclr eq "clear" ? 1 : 0))) {
	print STDERR "Could not update instance lockdown\n";
	return -1
    }
    my $clear = ($instance->admin_lockdown() ||
		 $instance->user_lockdown() ? 0 : 1);
    
    #
    # Have to set/clear the lockdown on the local slice.
    #
    if ($slice->SetLockdown($clear)) {
	print STDERR "Could not update slice lockdown\n";
	return -1
    }
    #
    # And tell the backend cluster to lockdown the slice.
    #
    my $response = $instance->Lockdown($clear);
    if (!defined($response)) {
	print STDERR "RPC Error calling Lockdown\n";
	# Clear this so we do not think it is locked down for real.
	$slice->SetLockdown(0);
	return -1;
    }
    if ($response->code() != GENIRESPONSE_SUCCESS) {
	print STDERR "Could not lockdown sliver: ". $response->output() . "\n";
	# Clear this so we do not think it is locked down for real.
	$slice->SetLockdown(0);
	return -1;
    }
    return 0;
}
sub DoLockdown()
{
    usage()
	if (@ARGV != 2);
    
    my $setclr = shift(@ARGV);
    my $which  = shift(@ARGV);

    fatal("Must specify either 'admin' or 'user'")
	if ($which !~ /^(admin|user|all)$/);
    fatal("Must specify either 'set' or 'clear'")
	if ($setclr !~ /^(set|clear)$/);

    if (DoLockdownInternal($setclr, $which)) {
	fatal("Could not lockdown instance!");
    }
    exit(0);
}

1217
1218
1219
1220
sub fatal($)
{
    my ($mesg) = @_;

1221
1222
1223
1224
    if (defined($webtask)) {
	$webtask->output($mesg);
	$webtask->code(-1);
    }
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
    print STDERR "*** $0:\n".
	         "    $mesg\n";
    # Exit with negative status so web interface treats it as system error.
    exit(-1);
}

sub escapeshellarg($)
{
    my ($str) = @_;

    $str =~ s/[^[:alnum:]]/\\$&/g;
    return $str;
}