nseswap.in 6.43 KB
Newer Older
1 2 3 4
#!/usr/bin/perl -wT

#
# EMULAB-COPYRIGHT
5
# Copyright (c) 2000-2003, 2007 University of Utah and the Flux Group.
6 7 8
# All rights reserved.
#

9
use Fcntl ':flock';
10 11 12
use English;
use Getopt::Std;
use Socket;
13
use IO::Handle;     # thousands of lines just for autoflush :-(
14 15 16 17 18 19 20 21 22 23 24
    
#
# In an experiment with simulated nodes, if some instance of the
# simulator (nse) on a physical node is unable to track real time, the
# node requests that the experiment be re-mapped and run with a more
# conservative co-location factor. This script implements the above
# functionality.
#
sub usage()
{
    # Show the synopsis on STDOUT, then terminate with status -1;
    # this routine never returns to its caller.
    my $synopsis = "Usage: nseswap [-v] pid eid <eventargs>\n";

    print STDOUT $synopsis;
    exit(-1);
}
29
# Option letters handed to getopts(); only -v is listed.
my $optlist = "v";

#
# Configure variables. The @...@ markers are presumably placeholders
# that are filled in when this template is configured.
#
my $TB       = "@prefix@";
my $TBOPS    = "@TBOPSEMAIL@";
my $TBLOGS   = "@TBLOGSEMAIL@";
my $CONTROL  = "@USERNODE@";
my $TESTMODE = @TESTMODE@;

#
# Forward declaration; the body is at the end of the file.
#
sub swapout_on_max_retries();

#
# Locals
#
my ($pid, $eid, $eventargs);    # Filled in from the command line.
my $simhost     = "";           # Set from SIMHOST=... in the event args.
my $max_retries = 100;          # Limit tested by swapout_on_max_retries().
my $verbose     = 1;            # On by default, so -v changes nothing.
48 49 50 51 52 53 54

sub printdb ($)
{
    # Debug output helper: write the single message argument to the
    # currently selected output handle, but only when $verbose is set.
    my ($message) = @_;

    print $message if ($verbose);
}
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72

#
# Flush the selected output handle after every write.
#
$OUTPUT_AUTOFLUSH = 1;

#
# Untaint the environment: drop the variables that can change how a
# shell behaves and install a fixed search path.
#
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
$ENV{'PATH'} = "$TB/bin:$TB/sbin:/bin:/usr/bin:/sbin:/usr/sbin";

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
73
use User;
74

75 76 77 78 79 80 81 82 83 84 85 86 87
#
# Parse command arguments. Once we return from getopts, all that should
# be left are the three required arguments.
#
%options = ();
getopts($optlist, \%options)
    or usage();
$verbose = 1
    if (defined($options{"v"}));

usage()
    unless (@ARGV == 3);

($pid, $eid, $eventargs) = @ARGV;

#
# Untaint the arguments. Each one must consist solely of word
# characters, '-', '@' and '.'; the captured copy replaces the original.
#
die("Tainted argument $pid!\n")
    unless ($pid =~ /^([-\@\w.]+)$/);
$pid = $1;

die("Tainted argument $eid!\n")
    unless ($eid =~ /^([-\@\w.]+)$/);
$eid = $1;

111 112 113 114 115 116 117 118
#
# Verify the invoking user; the object is needed later for the
# notification mail.
#
my $this_user = User->ThisUser();
tbdie("You ($UID) do not exist!")
    unless (defined($this_user));

#
# The experiment must exist; its index is stored in $exptidx.
#
my $exptidx;
TBExptIDX($pid, $eid, \$exptidx)
    or die("*** $0:\n".
           "    No such experiment $pid/$eid!");

#
# Pull the reporting simhost out of the event arguments. The capture
# also untaints the value; $simhost stays empty when there is no match.
#
my $simhost_pattern = q(SIMHOST=([-\w]+));
$simhost = $1
    if ($eventargs =~ /$simhost_pattern/);

my $lockfile = "/var/tmp/$pid-$eid-nseswap-lockfile";
	       
131
TBDebugTimeStampsOn();

#
# Find the node reserved under the reporting simhost's vname.
#
my $reserved_query =
    DBQueryFatal("select node_id from reserved where pid='$pid' and ".
                 "eid='$eid' and vname='$simhost'");

unless ($reserved_query->numrows) {
    # print warning in some log
    print STDERR "*** $0: \"$simhost\" is not in the reserved table\n";
    exit(1);
}
my ($node_id) = $reserved_query->fetchrow_array();

#
# Update the DB with info from the NSESWAP event: flag the violation
# on that node while holding a write lock on the reserved table.
#
DBQueryFatal("lock tables reserved write");
DBQueryFatal("update reserved set simhost_violation='1' ".
             "where node_id='$node_id' ".
             "and pid='$pid' and eid='$eid'");
printdb("node:$node_id simhost=$simhost simhost_violation set\n");
DBQueryFatal("unlock tables");

#
# For every vname that v2pmap places on this node, store a node weight:
# twice the stored value when a row exists, 2 otherwise.
#
my $vnode_query =
    DBQueryFatal("select vname from v2pmap where ".
                 "pid='$pid' and eid='$eid' and node_id='$node_id'");

while (my ($vname) = $vnode_query->fetchrow_array()) {
    my $weight_query =
        DBQueryFatal("select nodeweight from ".
                     "virt_simnode_attributes where ".
                     "pid='$pid' and eid='$eid' and ".
                     "vname='$vname'");

    my $nodeweight = 2;
    if ($weight_query->numrows) {
        my ($stored_weight) = $weight_query->fetchrow_array();
        $nodeweight = $stored_weight * 2;
    }

    DBQueryFatal("replace into virt_simnode_attributes ".
                 "(exptidx,pid,eid,vname,nodeweight) values ".
                 "('$exptidx','$pid','$eid','$vname','$nodeweight')");
}

#
# We need to serialize this script since multiple pnodes
# could be reporting an error where nse can't keep up.
# The first pnode that caused this will run and wait for
# a little while to see if there are other pnodes reporting
# errors. Eventually, the first nseswap will cause re-swapin
# of the experiment. The subsequent nseswap scripts will just
# update the DB and be done with it.
#
umask(002);

# Fail through die() with the OS error text: no fatal() routine is
# defined in this file, so calling one here would only hide the real
# reason the open failed.
open(LOCK, ">>", $lockfile)
    or die("*** $0:\n".
           "    Couldn't open $lockfile: $!\n");

# Non-blocking: an instance that cannot take the lock skips the reswap
# and falls straight through to the exit below.
if (flock(LOCK, LOCK_EX|LOCK_NB)) {

    # Forces a swap-out and exits the script once the retry limit has
    # been reached; otherwise returns and we carry on.
    swapout_on_max_retries();

    my $qr = DBQueryFatal("select sim_reswap_count from experiments ".
                          "where eid='$eid' and pid='$pid'");
    my ($sim_reswap_count) = $qr->fetchrow_array();

    # Wait while holding the lock so that other pnodes that cannot
    # track real time get a chance to record their violations before
    # the reswap: 60 seconds on the first reswap, 300 afterwards.
    if ($sim_reswap_count == 0) {
        sleep(60);
    } else {
        sleep(300);
    }

    if ($verbose) {
        # In verbose mode the lock file doubles as a log: one line per
        # simhost listing its node, the vnodes mapped to it and whether
        # it reported a violation.
        LOCK->autoflush(1);
        print LOCK "################ Reswap Count:$sim_reswap_count " .
                   "################\n";
        $qr = DBQueryFatal("select vname,node_id,simhost_violation from reserved ".
                           "where pid='$pid' and eid='$eid' " .
                           "and erole='simhost' order by vname");
        while (my ($vname, $node_id, $violation) = $qr->fetchrow_array()) {
            my $qr2 = DBQueryFatal("select vname from v2pmap " .
                                   "where pid='$pid' and eid='$eid' " .
                                   "and node_id='$node_id' order by vname");
            print LOCK "vname:$vname node_id:$node_id numvnodes:" . $qr2->numrows() .
                       " " . ($violation ? "violation; " : " ; ");
            while (my ($vnode) = $qr2->fetchrow_array()) {
                print LOCK "$vnode ";
            }
            print LOCK "\n";
        }
        print LOCK "################################################ \n";
    }

    DBQueryFatal("update experiments set sim_reswap_count=sim_reswap_count+1 ".
                 "where eid='$eid' and pid='$pid'");

    # Do a swap modify. The list form of system() runs swapexp without
    # a shell; a failure is reported instead of being silently dropped.
    if (system("swapexp", "-w", "-e", "-r", "-s", "modify", $pid, $eid) != 0) {
        print STDERR "*** $0:\n".
                     "    swapexp modify of $pid/$eid failed (status $?)\n";
    }
}

#
# Close the lock file. Exiting releases it, but might as well.
#
close(LOCK);
exit(0);
231 232 233 234 235 236 237 238 239 240

#
# Give up on re-mapping once the experiment has been re-swapped too
# often.
#
# Reads sim_reswap_count for $pid/$eid from the experiments table and
# returns (nothing) while it is below $max_retries. Otherwise it mails
# the invoking user via SENDMAIL, runs a forced "swapexp ... out",
# sleeps ten seconds and terminates the script with exit status 2.
#
sub swapout_on_max_retries() {

    my $count_query =
        DBQueryFatal("select sim_reswap_count from experiments where eid='$eid' ".
                     "and pid='$pid'");

    my ($sim_reswap_count) = $count_query->fetchrow_array();

    # A missing row or NULL counter means no reswap has been recorded;
    # treat it as zero instead of comparing an undefined value.
    $sim_reswap_count = 0
        if (!defined($sim_reswap_count));

    return
        if ($sim_reswap_count < $max_retries);

    my $user_name  = $this_user->name();
    my $user_email = $this_user->email();

    my $message =
        "Experiment $pid/$eid reached max retries:$max_retries ".
        "trying to re-map\n".
        "simulated nodes. Forcibly swapping out the experiment\n";

    SENDMAIL("$user_name <$user_email>",
             "Experiment $pid/$eid Swapping out",
             $message,
             $TBOPS,
             "Bcc: $TBLOGS");

    # The list form of system() runs swapexp without a shell. Report a
    # failed swap-out rather than ignoring it; the exit status below is
    # the same either way.
    if (system("swapexp", "-f", "-s", "out", $pid, $eid) != 0) {
        print STDERR "*** $0:\n".
                     "    Forced swapout of $pid/$eid failed (status $?)\n";
    }
    sleep(10);
    exit(2);
}