Commit 92ff875a authored by Leigh B. Stoller's avatar Leigh B. Stoller
Browse files

A set of changes to make swapmod work on jailed nodes (note, swapmod

does not yet work with remote virtual nodes; that will take even more
work).

Added a new allocstate called RES_TEARDOWN. assign_wrapper no longer
deallocates unused nodes, but rather moves them into the new state for
the wrapper (tbswap) to deal with. That is because deleted vnodes need
to be torn down, since it is possible that the node on which they were
living will not be deallocated (say, if there are other vnodes on
it). We do not want to be doing that from assign_wrapper, so tbswap
looks for those nodes.

Made vnode_setup allocstate aware in the same way that os_setup is;
do not reboot vnodes or try to set up vnodes when they are already in
the RES_READY state, as they will be when doing a swapmod. In
addition, if os_setup is going to reboot the underlying physnode, move
the vnodes on that node into RES_READY too, since they will be set up
automatically when the physnode reboots. Might need an interim state
here, for correctness.
parent ffcdfeca
...@@ -103,6 +103,7 @@ use Exporter; ...@@ -103,6 +103,7 @@ use Exporter;
TBDB_ALLOCSTATE_RES_INIT_DIRTY TBDB_ALLOCSTATE_RES_INIT_CLEAN TBDB_ALLOCSTATE_RES_INIT_DIRTY TBDB_ALLOCSTATE_RES_INIT_CLEAN
TBDB_ALLOCSTATE_RES_REBOOT_DIRTY TBDB_ALLOCSTATE_RES_REBOOT_CLEAN TBDB_ALLOCSTATE_RES_REBOOT_DIRTY TBDB_ALLOCSTATE_RES_REBOOT_CLEAN
TBDB_ALLOCSTATE_RES_READY TBDB_ALLOCSTATE_UNKNOWN TBDB_ALLOCSTATE_RES_READY TBDB_ALLOCSTATE_UNKNOWN
TBDB_ALLOCSTATE_RES_TEARDOWN
TBDB_STATS_PRELOAD TBDB_STATS_START TBDB_STATS_TERMINATE TBDB_STATS_PRELOAD TBDB_STATS_START TBDB_STATS_TERMINATE
TBDB_STATS_SWAPIN TBDB_STATS_SWAPOUT TBDB_STATS_SWAPMODIFY TBDB_STATS_SWAPIN TBDB_STATS_SWAPOUT TBDB_STATS_SWAPMODIFY
...@@ -442,6 +443,7 @@ sub TBDB_ALLOCSTATE_RES_REBOOT_CLEAN() { "RES_REBOOT_CLEAN"; } ...@@ -442,6 +443,7 @@ sub TBDB_ALLOCSTATE_RES_REBOOT_CLEAN() { "RES_REBOOT_CLEAN"; }
sub TBDB_ALLOCSTATE_RES_INIT_DIRTY() { "RES_INIT_DIRTY"; } sub TBDB_ALLOCSTATE_RES_INIT_DIRTY() { "RES_INIT_DIRTY"; }
sub TBDB_ALLOCSTATE_RES_INIT_CLEAN() { "RES_INIT_CLEAN"; } sub TBDB_ALLOCSTATE_RES_INIT_CLEAN() { "RES_INIT_CLEAN"; }
sub TBDB_ALLOCSTATE_RES_READY() { "RES_READY"; } sub TBDB_ALLOCSTATE_RES_READY() { "RES_READY"; }
sub TBDB_ALLOCSTATE_RES_TEARDOWN() { "RES_TEARDOWN"; }
sub TBDB_ALLOCSTATE_UNKNOWN() { "UNKNOWN"; }; sub TBDB_ALLOCSTATE_UNKNOWN() { "UNKNOWN"; };
sub TBDB_TBCONTROL_RESET { "RESET"; } sub TBDB_TBCONTROL_RESET { "RESET"; }
......
...@@ -259,6 +259,15 @@ my %nobwshaping = (); ...@@ -259,6 +259,15 @@ my %nobwshaping = ();
# max_concurrent. # max_concurrent.
my %virtnodeosids = (); my %virtnodeosids = ();
# Virtual nodes that the user has requested be "fixed" to a specific
# physical node.
my %fixed_nodes = ();
# Support for experiment modify.
my %alreadyAllocated = ();
my %reserved_pnodes = ();
my %reserved_vnodes = ();
# #
# This is for stats gathering. It might duplicate other stuff, but # This is for stats gathering. It might duplicate other stuff, but
# thats okay. # thats okay.
...@@ -332,24 +341,42 @@ my $remotecount = 0; ...@@ -332,24 +341,42 @@ my $remotecount = 0;
my $virtcount = 0; my $virtcount = 0;
my $virtnode_id = 0; my $virtnode_id = 0;
my %alreadyAllocated = (); #
# If updating (modify), find out which nodes need to be fixed in the new
# assign run. virtnodes complicate this!
#
if ($updating) { if ($updating) {
printdb "Fixing previously allocated nodes.\n"; printdb "Fixing previously allocated nodes.\n";
$result = $result =
DBQueryFatal("SELECT vname, node_id ". DBQueryFatal("select r.vname,r.node_id,n.phys_nodeid ".
"FROM reserved ". " from reserved as r ".
"WHERE pid='$pid' AND eid='$eid'"); "left join nodes as n on n.node_id=r.node_id ".
"where r.pid='$pid' and r.eid='$eid'");
while (($vname,$reserved) = $result->fetchrow_array) { while (($vname,$reserved,$physnode) = $result->fetchrow_array) {
$reserved_nodes{$vname} = $reserved; #
$fixed_nodes{$vname} = $reserved; # If a virtnode, we need the underlying physnode, since thats
$alreadyAllocated{$reserved} = "unused"; # what we fix it to. But we still need to remember which virtnodes
# are allocated for later.
#
# WIDEAREA nodes are going to break.
#
if ($reserved ne $physnode) {
$reserved_vnodes{$vname} = $reserved;
$reserved_pnodes{$vname} = $physnode;
$fixed_nodes{$vname} = $physnode;
$alreadyAllocated{$physnode} = "unused";
$alreadyAllocated{$reserved} = "unused";
}
else {
$reserved_pnodes{$vname} = $reserved;
$fixed_nodes{$vname} = $reserved;
$alreadyAllocated{$reserved} = "unused";
}
} }
$result->finish; $result->finish;
} }
printdb "Loading virt_nodes.\n"; printdb "Loading virt_nodes.\n";
$result = $result =
DBQueryFatal("select distinct vn.vname,vn.ips,vn.type,vn.fixed,vn.osname,". DBQueryFatal("select distinct vn.vname,vn.ips,vn.type,vn.fixed,vn.osname,".
...@@ -954,7 +981,7 @@ foreach $fixed (keys(%fixed_nodes)) { ...@@ -954,7 +981,7 @@ foreach $fixed (keys(%fixed_nodes)) {
if (!$isremotenode{$fixed} && if (!$isremotenode{$fixed} &&
(exists $nodes{$fixed} || exists $delaynodes{$fixed}) ) { (exists $nodes{$fixed} || exists $delaynodes{$fixed}) ) {
print TOPFILE "fix-node $fixed $fixed_nodes{$fixed}\n"; print TOPFILE "fix-node $fixed $fixed_nodes{$fixed}\n";
if ($reserved_nodes{$fixed}) { $reused_count++; } if ($reserved_pnodes{$fixed}) { $reused_count++; }
} }
} }
...@@ -1097,15 +1124,58 @@ while (1) { ...@@ -1097,15 +1124,58 @@ while (1) {
# #
# Look at virtual node being mapped to node; # Look at virtual node being mapped to node;
# if it wasn't in the previous map, mark node for reboot. # if it wasn't in the previous map, mark node for reboot.
# #
if (! exists $reserved_nodes{$virtual} || if ($alreadyAllocated{$physical} eq "reboot") {
$reserved_nodes{$virtual} ne $physical || # No changes once it goes into reboot.
$alreadyAllocated{$physical} eq "reboot") { ;
$alreadyAllocated{$physical} = "reboot";
} else {
$alreadyAllocated{$physical} = "reused";
} }
} else { elsif ($isvirtnode{$virtual}) {
#
# A new virtual node on an existing physical node
# does not force the physnode to be rebooted; we can
# set up a new virtnode on it without a reboot. If it is
# an existing virtual node on the same physnode, then mark
# both as reused; no need to reboot either. If the
# virtnode has moved here from someplace else, no
# reboot of the physnode either, but obviously the
# vnode will be released and a new one allocated. What
# we cannot determine is if it is just a renamed node
# (which would require a reboot of the virtual
# node).
#
if (!exists($reserved_pnodes{$virtual})) {
$alreadyAllocated{$physical} = "reused";
}
elsif ($reserved_pnodes{$virtual} eq $physical) {
my $reserved = $reserved_vnodes{$virtual};
$alreadyAllocated{$reserved} = "reused";
$alreadyAllocated{$physical} = "reused";
}
else {
$alreadyAllocated{$physical} = "reused";
}
}
else {
#
# If a new virtual node mapped to this physnode (maybe
# even the luser changed the name of the node), or if an
# existing virtual node moved to this physnode, must
# reboot the physnode. Else, the physnode is being
# reused as is, and no need to mess with it. If the
# user requested reboot, that will be handled outside
# of this script.
#
if (!exists($reserved_pnodes{$virtual}) ||
$reserved_pnodes{$virtual} ne $physical) {
$alreadyAllocated{$physical} = "reboot";
}
else {
$alreadyAllocated{$physical} = "reused";
}
}
}
else {
# #
# This is a new node; we'll have to reserve it. # This is a new node; we'll have to reserve it.
# #
...@@ -1206,28 +1276,30 @@ while (1) { ...@@ -1206,28 +1276,30 @@ while (1) {
TBDebugTimeStamp("reserving finished"); TBDebugTimeStamp("reserving finished");
my %tolose = (); #
# Release phys and virt nodes no longer needed. They are marked
# for teardown. They need to be freed by SOMEONE, currently the
# wrapper (tbswap), since this only happens when in update mode
# (swapmod).
#
foreach $node (keys(%alreadyAllocated)) { foreach $node (keys(%alreadyAllocated)) {
if ($alreadyAllocated{$node} eq "unused") { if ($alreadyAllocated{$node} eq "unused") {
# #
# Node was used in previous incarnation, but not any more. # Node was used in previous incarnation, but not any more.
# #
$tolose{$node} = 1; TBSetNodeAllocState($node,
} elsif ($alreadyAllocated{$node} eq "reboot") { TBDB_ALLOCSTATE_RES_TEARDOWN());
}
elsif ($alreadyAllocated{$node} eq "reboot") {
# #
# Node is being reused, but for a different purpose, so # Node is being reused, but for a different purpose, so
# it should be rebooted. # it should be rebooted.
# #
TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_INIT_DIRTY() ); TBSetNodeAllocState($node,
} TBDB_ALLOCSTATE_RES_INIT_DIRTY());
}
if ((keys %tolose) > 0) {
if (system("nfree $pid $eid " . join(" ", keys(%tolose)))) {
print "Failed to free no-longer-needed nodes!";
} }
} }
last; last;
} }
} }
...@@ -1354,13 +1426,14 @@ if (scalar(keys(%isremotenode))) { ...@@ -1354,13 +1426,14 @@ if (scalar(keys(%isremotenode))) {
# #
if ($updating) { if ($updating) {
foreach $node (keys(%alreadyAllocated)) { foreach $node (keys(%alreadyAllocated)) {
my $result = DBQueryFatal("select nt.control_net ". my $result = DBQueryFatal("select nt.control_net,nt.isvirtnode ".
"from nodes as n ". "from nodes as n ".
"left join node_types as nt ". "left join node_types as nt ".
"on nt.type=n.type ". "on nt.type=n.type ".
"where n.node_id='$node'"); "where n.node_id='$node' and ".
" nt.isvirtnode=0");
my ($control) = $result->fetchrow_array(); my ($control,$isvirtnode) = $result->fetchrow_array();
my $pred = "1"; my $pred = "1";
...@@ -1370,10 +1443,12 @@ if ($updating) { ...@@ -1370,10 +1443,12 @@ if ($updating) {
DBQueryFatal("update interfaces set IP='' " . DBQueryFatal("update interfaces set IP='' " .
"where node_id='$node' and $pred"); "where node_id='$node' and $pred");
# Clean the veth_interfaces table for this node too.
DBQueryFatal("delete from veth_interfaces where node_id='$node'");
} }
} }
# #
# VIRTNODES HACK: Local virtnodes have to be mapped now. This is a little # VIRTNODES HACK: Local virtnodes have to be mapped now. This is a little
# hokey in that the virtnodes just need to be allocated from the pool that # hokey in that the virtnodes just need to be allocated from the pool that
...@@ -1382,50 +1457,104 @@ if ($updating) { ...@@ -1382,50 +1457,104 @@ if ($updating) {
# #
foreach my $pnode (keys(%virtnodes)) { foreach my $pnode (keys(%virtnodes)) {
my @vlist = @{$virtnodes{$pnode}}; my @vlist = @{$virtnodes{$pnode}};
my $numvs = @vlist;
my @plist = (); my @plist = ();
my @oplist = ();
my @ovlist = ();
# #
# Run avail to get the list of virtnodes on the phys node. We already # If updating, need to watch for nodes that are already reserved.
# know there are enough, since assign knows that. # We save that info in oplist/ovlist, and build a new vlist for
# avail, of just the nodes we need in this run.
# #
my $num = @vlist; if ($updating) {
my @newvlist = ();
my @delvlist = ();
foreach my $vnode (@vlist) {
if (!defined($reserved_vnodes{$vnode})) {
# A new vnode on pnode to allocate.
push(@newvlist, $vnode);
next;
}
if ($reserved_pnodes{$vnode} ne $pnode) {
# A vnode moved. Its new to this pnode.
print "$vnode has moved from $reserved_pnodes{$vnode} ".
"to $pnode!\n";
push(@newvlist, $vnode);
next;
}
print "Asking avail for $num vnodes on $pnode\n"; # Push already allocated p/v onto lists for later.
push(@oplist, $reserved_vnodes{$vnode});
open(AVAIL,"$TBROOT/sbin/avail virtonly=$pnode rand limit=$num |") push(@ovlist, $vnode);
or fatal(1, "*** $0:\n". }
" avail failed\n"); # These are the new nodes we need to allocate
@vlist = @newvlist;
$numvs = scalar(@vlist);
while (<AVAIL>) { if (@oplist) {
next print "Reusing vnodes @oplist\n";
if (! /^\|/); }
next }
if (/node_id/);
if ($_ =~ /^\|([-a-zA-Z0-9]+)\s*\|(\w+)\s*\|(\w+)\s*\|$/) { #
push(@plist, $1); # Still need to allocate some virtnodes?
#
if ($numvs) {
#
# Run avail to get the list of virtnodes on the phys node. We
# already know there are enough, since assign knows that.
#
print "Asking avail for $numvs vnodes on $pnode\n";
open(AVAIL,"$TBROOT/sbin/avail virtonly=$pnode rand limit=$numvs |")
or fatal(1, "*** $0:\n".
" avail failed\n");
while (<AVAIL>) {
next
if (! /^\|/);
next
if (/node_id/);
if ($_ =~ /^\|([-a-zA-Z0-9]+)\s*\|(\w+)\s*\|(\w+)\s*\|$/) {
push(@plist, $1);
}
else {
fatal(1, "*** $0:\n".
" Bad line from avail: $_\n");
}
} }
else { close(AVAIL);
# Sanity check.
if (scalar(@vlist) != scalar(@plist)) {
fatal(1, "*** $0:\n". fatal(1, "*** $0:\n".
" Bad line from avail: $_\n"); " Could not map some virtual nodes on $pnode\n");
} }
}
close(AVAIL);
if (scalar(@vlist) != scalar(@plist)) { #
fatal(1, "*** $0:\n". # Try to allocate. Note, if this fails we are done for. Okay for now
"Could not map some virtual nodes on $pnode\n"); # since it is never the case that it should fail!
#
print "Reserving ($pnode) @plist ...\n";
if (system("nalloc $pid $eid @plist")) {
fatal(1, "*** $0:\n".
" Failed to reserve @plist (on $pnode)\n");
}
} }
# if ($updating) {
# Try to allocate. Note, if this fails we are done for. Okay for now #
# since it is never the case that it should fail! # Append the lists we created above, so that we get all of them
# # in the loop below.
print "Reserving ($pnode) @plist ...\n"; #
if (system("nalloc $pid $eid @plist")) { @plist = (@plist, @oplist);
fatal(1, "*** $0:\n". @vlist = (@vlist, @ovlist);
"Failed to reserve @plist (on $pnode)\n");
} }
while (@plist) { while (@plist) {
my $physical = pop(@plist); my $physical = pop(@plist);
my $virtual = pop(@vlist); my $virtual = pop(@vlist);
...@@ -1439,9 +1568,19 @@ foreach my $pnode (keys(%virtnodes)) { ...@@ -1439,9 +1568,19 @@ foreach my $pnode (keys(%virtnodes)) {
TBIsNodeVirtual($physical, \$jailflag); TBIsNodeVirtual($physical, \$jailflag);
$jailed{$virtual} = $jailflag; $jailed{$virtual} = $jailflag;
# Virtual nodes are always clean. Also prevents errors elsewhere. #
TBSetNodeAllocState($physical, TBDB_ALLOCSTATE_RES_INIT_CLEAN()); # New virtual nodes are always clean. Old ones stay in whatever
# state they were in so that os_setup/vnode_setup know they
# need to reboot them.
#
if (!defined($reserved_vnodes{$virtual})) {
TBSetNodeAllocState($physical, TBDB_ALLOCSTATE_RES_INIT_CLEAN());
}
elsif ($reserved_vnodes{$virtual} ne $physical) {
# Node has moved! Nuts!
TBSetNodeAllocState($physical, TBDB_ALLOCSTATE_RES_INIT_DIRTY());
}
} }
} }
...@@ -2461,8 +2600,7 @@ sub InitPnode($pnode, $vnode) ...@@ -2461,8 +2600,7 @@ sub InitPnode($pnode, $vnode)
# #
# A local pnode hosting jails. Set the vname to something useful. # A local pnode hosting jails. Set the vname to something useful.
# #
$vname = "vhost-" . $virtnode_id; $vname = newvhostvname($pnode);
$virtnode_id++;
DBQueryFatal("update reserved set vname='$vname' " . DBQueryFatal("update reserved set vname='$vname' " .
"where node_id='$pnode'"); "where node_id='$pnode'");
...@@ -2836,3 +2974,36 @@ sub NewVethIface($$;$) ...@@ -2836,3 +2974,36 @@ sub NewVethIface($$;$)
return "veth" . $query_result->insertid; return "veth" . $query_result->insertid;
} }
#
# Give me a new name for a pnode hosting jails. We have to watch for names
# that were made up previously (say, if this is an update). Not allowed to
# reuse names of course. We do not mark nodes as hosting, so have to infer
# this from reserved_pnodes. I'm sure there is a better way to do this.
#
sub newvhostvname($)
{
    my $host = shift;

    #
    # On an update the pnode may already carry a name from the previous
    # run. Entries that also appear in reserved_vnodes are virtual node
    # mappings, not names for the physical host, so they are ignored.
    #
    for my $oldname (keys(%reserved_pnodes)) {
	return $oldname
	    if (!exists($reserved_vnodes{$oldname}) &&
		$host eq $reserved_pnodes{$oldname});
    }

    #
    # Otherwise generate "vhost-N" names from the running counter until
    # we find one that was not handed out in the previous run. The
    # counter is advanced for every candidate, taken or not.
    #
    my $candidate;
    do {
	$candidate = "vhost-" . $virtnode_id;
	$virtnode_id++;
    } while (defined($reserved_pnodes{$candidate}));

    return $candidate;
}
...@@ -82,6 +82,7 @@ my @row; ...@@ -82,6 +82,7 @@ my @row;
# #
my %reloads = (); my %reloads = ();
my %reboots = (); my %reboots = ();
my %willreboot = ();
my $doautoload = 1; my $doautoload = 1;
my $dolastload = 1; my $dolastload = 1;
...@@ -426,14 +427,26 @@ foreach my $vnode (keys(%vnodes)) { ...@@ -426,14 +427,26 @@ foreach my $vnode (keys(%vnodes)) {
if (!defined($pnodevcount{$pnode})); if (!defined($pnodevcount{$pnode}));
$pnodevcount{$pnode}++; $pnodevcount{$pnode}++;
$vnode2pnode{$vnode} = $pnode; $vnode2pnode{$vnode} = $pnode;
if (!defined($nodes{$pnode})) { if (!exists($nodes{$pnode})) {
# #
# Typical on remote nodes; we do not allocate the underlying # Typical on remote nodes; we do not allocate the underlying
# phys node to the experiment. # phys node to the experiment.
# #
next; next;
} }
#
# Set the allocstate for the local vnode. Used by vnode_setup to
# determine if a reboot is required. If the underlying physnode is
# going to be rebooted, move the vnode into the RES_READY state, since
# by definition, when the node reboots the vnode is going to get
# set up, and so vnode_setup should not do anything. Might need an
# intermediate state here, but not sure yet.
#
if (exists($reboots{$pnode})) {
TBSetNodeAllocState($vnode, TBDB_ALLOCSTATE_RES_READY());
}
# Nothing else to do for local jail nodes at this time ... # Nothing else to do for local jail nodes at this time ...
} }
...@@ -446,6 +459,21 @@ if (!$TESTMODE) { ...@@ -446,6 +459,21 @@ if (!$TESTMODE) {
my $count = 0; my $count = 0;
my $cmd; my $cmd;
foreach my $imageid ( keys(%reloads) ) {
my @list = @{ $reloads{$imageid} };
foreach my $node (@list) {
TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_RELOAD() );
$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_RELOAD();
# No point in rebooting, obviously.
delete $reboots{$node};
}
sleep(5);
$pids{"$os_load -m $imageid @list"} =
ForkCmd("$os_load -m $imageid @list");
}
if (keys(%reboots)) { if (keys(%reboots)) {
foreach my $node (keys(%reboots)) { foreach my $node (keys(%reboots)) {
if ($nodeAllocStates{$node} eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) { if ($nodeAllocStates{$node} eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
...@@ -461,19 +489,6 @@ if (!$TESTMODE) { ...@@ -461,19 +489,6 @@ if (!$TESTMODE) {
$pids{$cmd} = ForkCmd($cmd); $pids{$cmd} = ForkCmd($cmd);
} }
foreach my $imageid ( keys(%reloads) ) {
my @list = @{ $reloads{$imageid} };
foreach my $node (@list) {
TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_RELOAD() );
$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_RELOAD();
}
sleep(5);
$pids{"$os_load -m $imageid @list"} =
ForkCmd("$os_load -m $imageid @list");
}
foreach $cmd ( keys(%pids) ) { foreach $cmd ( keys(%pids) ) {
my $pid = $pids{$cmd}; my $pid = $pids{$cmd};
...@@ -637,10 +652,11 @@ elsif (@vnodelist) { ...@@ -637,10 +652,11 @@ elsif (@vnodelist) {
my $node = shift(@vnodelist); my $node = shift(@vnodelist);
my $pnode = $vnode2pnode{$node}; my $pnode = $vnode2pnode{$node};
my $wstart = $waitstart{$node};