Commit 895a44f6 authored by Leigh B. Stoller's avatar Leigh B. Stoller

Largish rework of nfree. Started out that I just wanted to map the

default OSID from the node_types table, to a specific OSID from the
partition table on the actual node. This is to avoid setting the boot
OSID to RHL_STD when the node is released, which causes a boot
failure. Okay, so I added a library routine to do this (yanked out of
os_setup where I did the code originally). This would solve most of
the problems, except where there was no OS loaded that would satisfy
the mapping, in which case the user must have done an os_load, and now
that auto schedules a reload. Anyway, seemed like this should work.
Ha! Mysql locking is downright dumb; all tables used within a lock
region must be locked. nfree was already locking 9 tables, and in
order to call out to library routines (which might use anything) I
would have to lock the world, which is not actually possible anyway.
Why all this locking in nfree in the first place? The idea is that
there is a race between releasing the node from reserved, and cleaning
up all those tables (interfaces, delays, nodes, etc). We don't want to
free a node, and have it get allocated to another experiment before
the cleanup is done, since that would mess up the state of the node.
The solution (albeit a crufty one) was to lock just the reserved table
(which guards against multiple people trying to nfree the same node at
the same time) and switch the reservation out of the pid,eid and into
a holding reservation. This effectively removes the node from the
user's control, but keeps it reserved. Then I unlock the reserved
table. With that done, I can clean up all those tables without any
locking, since the node is still reserved. After cleanup, I can either
delete the reservation, or move it to the next reserve or reload
reservation if those were pending. No locking is needed at this point
since single table changes are atomic (and nalloc locks reserved
anyway). Okay, so now we sit back and see if this was a good idea.
parent 7cd86ce9
......@@ -59,6 +59,10 @@ use Exporter;
TB_DEFAULT_RELOADTYPE TB_RELOADTYPE_FRISBEE TB_RELOADTYPE_NETDISK
TB_EXPTPRIORITY_LOW TB_EXPTPRIORITY_HIGH
TB_ASSIGN_TOOFEWNODES
TBAdmin TBProjAccessCheck TBNodeAccessCheck TBOSIDAccessCheck
TBImageIDAccessCheck TBExptAccessCheck ExpLeader MarkNodeDown
SetNodeBootStatus OSFeatureSupported IsShelved NodeidToExp
......@@ -66,7 +70,7 @@ use Exporter;
DBQuoteSpecial UNIX2DBUID ExpState SetExpState ProjLeader
ExpNodes DBDateTime DefaultImageID GroupLeader TBGroupUnixInfo
TBValidNodeLogType TBValidNodeName TBSetNodeLogEntry
TBSetSchedReload
TBSetSchedReload MapNodeOSID
);
# Must come after package declaration!
......@@ -200,6 +204,13 @@ sub TB_RELOADTYPE_NETDISK() { "netdisk"; }
sub TB_RELOADTYPE_FRISBEE() { "frisbee"; }
sub TB_DEFAULT_RELOADTYPE() { TB_RELOADTYPE_NETDISK; }
# Experiment priorities.
sub TB_EXPTPRIORITY_LOW() { 0; }
sub TB_EXPTPRIORITY_HIGH() { 20; }
# Assign exit status for too few nodes.
sub TB_ASSIGN_TOOFEWNODES() { 2; }
#
# We should list all of the DB limits.
#
......@@ -806,7 +817,20 @@ sub ExpNodes($$)
return ();
}
while (@row = $query_result->fetchrow_array()) {
push(@nodes, $row[0]);
$node = $row[0];
#
# Taint check. I do not understand this sillyness, but if I
# taint check these node names, I avoid warnings throughout.
#
if ($node =~ /^([\w]+)$/) {
$node = $1;
push(@nodes, $node);
}
else {
print "*** $0: WARNING: Bad node name: $node.\n";
}
}
return @nodes;
}
......@@ -1262,5 +1286,72 @@ sub MapNumericUID($)
return $name;
}
#
# Map a generic OSID to a specific OSID for the actual node in question.
# The intent is that, for example, RHL-STD needs to be mapped to the
# specific version of RHL that is loaded on the machine. This bit of code
# does that mapping, returning 0 if no mapping could be made.
#
# usage: MapNodeOSID(char *node, char *osid)
#        Return the osid unchanged if it is already loaded on the node.
#        Return the osid of a loaded OS that shares the same OS name
#        if the given osid is generic (has no version).
#        Return 0 for all errors and if mapping not possible.
#
sub MapNodeOSID($$)
{
    my ($node, $osid) = @_;

    #
    # Nothing can be mapped without both a node and an OSID. Callers may
    # hand us an undefined value (e.g. a column from a query that matched
    # no row), so bail out before interpolating it into the queries below.
    #
    if (!defined($node) || $node eq "" ||
	!defined($osid) || $osid eq "") {
	return 0;
    }

    #
    # See if this OSID is actually loaded on the machine.
    #
    my $p_result =
	DBQueryWarn("select * from partitions ".
		    "where node_id='$node' and osid='$osid'");
    if (!$p_result) {
	return 0;
    }
    if ($p_result->numrows) {
	# Already loaded on the node; no mapping required.
	return $osid;
    }

    #
    # Get OSID info.
    #
    my $osid_result =
	DBQueryWarn("select * from os_info where osid='$osid'");
    if (!$osid_result || $osid_result->numrows == 0) {
	return 0;
    }
    my %osid_row = $osid_result->fetchhash();

    #
    # If its a specific Version, and its not loaded on the machine,
    # nothing to do.
    #
    if (defined($osid_row{'version'}) && $osid_row{'version'} ne "") {
	return 0;
    }

    #
    # Try to map from a generic name to the specific name of the OS
    # that *is* loaded. The join picks os_info entries (o1) that sit in
    # one of the node's partitions and share their OS column with the
    # requested osid (o2).
    #
    my $o_result =
	DBQueryWarn("select o1.* from os_info as o1 ".
		    "left join partitions as p on o1.osid=p.osid ".
		    "left join os_info as o2 on o2.OS=o1.OS ".
		    "where p.node_id='$node' and o2.osid='$osid'");
    if (!$o_result || $o_result->numrows == 0) {
	return 0;
    }

    # Only the first matching row is consulted.
    my %o_row  = $o_result->fetchhash();
    my $n_osid = $o_row{'osid'};

    return $n_osid;
}
1;
#!/usr/bin/perl
#!/usr/bin/perl -wT
use English;
#
# nfree - Takes physical node names, and frees them from the experiment they
......@@ -7,45 +8,70 @@
# if this node should be re-reserved into another experiment and/or reloaded,
# rather than being put back into the pool of free nodes
#
#
# Print the command line synopsis and exit by way of die().
#
sub usage ()
{
    my @help = (
	"Usage: nfree <pid> <eid> [<node> <node> <...>]\n",
	"Releases all nodes in the specified experiment.\n",
	"If nodes are listed, nfree releases only those nodes.\n",
    );

    die(join("", @help));
}
#
# Configure variables
#
my $TB = "@prefix@";
use lib '@prefix@/lib';
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
if (@ARGV < 2) {
die("Usage: nfree <pid> <eid> [<node> <node> <...>]\n".
"Releases all nodes in the specified experiment. If nodes are listed,\n".
"nfree releases only the listed nodes.\n");
}
my $consetup = "$TB/libexec/console_setup";
my $reloadpid = "emulab-ops";
my $pendingeid = "reloadpending";
my $reloadeid = "reloading";
my $error = 0;
my $consetup="$TB/libexec/console_setup";
my $exportsetup="$TB/sbin/exports_setup";
my $os_load = "$TB/bin/os_load -r ";
my $reloadpid="emulab-ops";
my $reload_pendingeid="reloadpending";
my $reloadeid="reloading";
my @node_names=();
my @nodes;
my @freed_nodes=();
my $error = 0;
#
# Turn off line buffering on output
#
$| = 1;
#
# Untaint the path
#
$ENV{'PATH'} = "/bin:/usr/bin:/sbin:/usr/sbin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
if (@ARGV < 2) {
usage();
}
my $pid = shift;
my $eid = shift;
use strict;
use English;
#
# Turn off line buffering on output
# Untaint args.
#
$| = 1;
if ($pid =~ /^([-\@\w]+)$/) {
$pid = $1;
}
else {
die("Bad data in pid: $pid.");
}
if ($eid =~ /^([-\@\w]+)$/) {
$eid = $1;
}
else {
die("Bad data in eid: $eid.");
}
#
# Make sure that the experiment actually exists
# NOTE: project permissions checking is done later, on an individual
# node basis.
#
if (!ExpState($pid,$eid)) {
die("There is no experiment '$eid' in project '$pid'.\n");
}
......@@ -54,219 +80,234 @@ if (!ExpState($pid,$eid)) {
# Make sure the user has the ability to modify this experiment
#
if (!TBExptAccessCheck($UID, $pid, $eid, TB_EXPT_MODIFY)) {
die("You don't have sufficient access to modify '$eid' in project '$pid'.\n");
die("You do not have permission to modify '$eid' in project '$pid'.\n");
}
######################################################################
# Step 1 - Free nodes
#
# Find nodes that can be freed at this time, and do so. Nodes which
# are awaiting reloads and which have been scheduled to be reserved
# to another experiment, are put into lists so that they can be
# handled later
######################################################################
# Make a list of nodes given on the command line
foreach my $n (@ARGV) {
# Shark hack
if ($n =~ /(sh\d+)/ ) {
#
# It's a shark - do the whole shelf if its not done already.
# Make a list of nodes given on the command line, or get the whole list from
# the DB if none provided.
#
if (@ARGV) {
foreach my $n (@ARGV) {
#
my $shelf = $1;
if ( ! (join(",",@node_names) =~ /,$shelf-\d,/)) {
# Shelf hasn't been done yet...
foreach my $n ( 1 .. 8 ) {
push(@node_names,"$shelf-$n");
}
# Taint check first! Solves silly perl problems.
#
if ($n =~ /^([\w]+)$/) {
$n = $1;
}
else {
die("*** $0:\n".
" Bad node name: $n.\n");
}
# End shark hack
} else {
# its not a shark - just add it in...
push(@node_names,"$n");
# Shark hack
if ($n =~ /(sh\d+)/ ) {
#
# It's a shark - do the whole shelf if its not done already.
#
my $shelf = $1;
if ( ! (join(",", @nodes) =~ /,$shelf-\d,/)) {
# Shelf hasn't been done yet...
foreach my $n ( 1 .. 8 ) {
push(@nodes, "$shelf-$n");
}
}
# End shark hack
}
else {
# its not a shark - just add it in...
push(@nodes, $n);
}
}
}
else {
print "Releasing all nodes from experiment '$eid' in project '$pid'.\n";
@nodes = ExpNodes($pid, $eid);
}
######################################################################
# Step 1
#
# Lock all of the tables we'll be reading, so that we get a consistent
# view of the current state
# See what nodes need to be freed, and then lock them down by moving
# them to a holding reservation.
#
DBQueryFatal("lock tables nodes write, node_types read, " .
"scheduled_reloads read, interfaces write, delays write, " .
"reserved write, last_reservation write, current_reloads write, " .
"next_reserve read");
# We lock just the reserved table. This prevents races between multiple
# invocations of nfree trying to free the same node. Rather than lock
# a zillion tables, move the reservation into a holding pattern. This
# effectively prevents someone else from freeing the same nodes, and from
# someone else allocating the nodes until we are done cleaning things up.
#
# If no nodes were given on the command line, fill the list with all nodes
# in the experiment.
# NOTE: My reason for not wanting to lock all those tables (9 in the
# original version) is that library calls will fail since mysql locking
# requires that every table used within the locked area, be locked.
# Of course, who knows what tables the library uses, so thats a silly
# way to go.
#
if (@node_names == 0) {
print "Releasing all nodes from experiment '$eid' in project '$pid'...\n";
push @node_names,ExpNodes($pid,$eid);
}
######################################################################
#
# Form a unique temporary EID. I want this to be unique in case something
# goes wrong, and they get left in the DB.
#
my $lockedeid = "nfree-locked-$PID";
my @reloads = ();
my %reserves= ();
foreach my $n (@node_names) {
DBQueryFatal("lock tables reserved write");
foreach my $n (@nodes) {
#
# Check to make sure they have acutally reserved the nodes.
# Check to make sure they have actually reserved the nodes.
#
my $result = DBQueryFatal("select * from reserved where node_id='$n' ".
"and eid='$eid' and pid='$pid'");
my $result =
DBQueryFatal("select * from reserved where node_id='$n' ".
"and eid='$eid' and pid='$pid'");
if ($result->numrows == 0) {
print "Node '$n' is not reserved by your experiment.\n";
$error++;
next;
} else {
push(@freed_nodes,$n);
}
#
# Move to locked reservation. Node is not free, but is no longer
# owned by the pid/eid, so cannot be mucked with.
#
if (! DBQueryWarn("update reserved " .
"set vname=NULL, pid='$reloadpid', eid='$lockedeid' ".
"where node_id='$n'")) {
print "*** WARNING: Error locking down node $n!\n";
next;
}
push(@freed_nodes, $n);
}
DBQueryFatal("unlock tables");
######################################################################
# Step 2
#
# Go through the list of nodes we successfully locked down, and clean
# up the node state (nodes, delays, interfaces, etc). Once that is done,
# move them to whatever new reservations are pending, or free the node.
#
######################################################################
foreach my $n (@freed_nodes) {
#
# This little sillyness is for disk reloading. Kill the last reservation.
#
DBQueryWarn("delete from last_reservation where node_id='$n'") || $error++;
#
# If the node has a reloads entry, change the reservation so that the
# reload_daemon will pick it up.
# Clean out all delays
#
$result = DBQueryFatal("select node_id,image_id from scheduled_reloads " .
"where node_id='$n'");
if ( $result->numrows() > 0 ) {
my @row = $result->fetchrow();
my $image_id = $row[1];
my $reload_type = $row[2];
print "Adding scheduled reload for $n to the list.\n";
push(@reloads,$n);
} else {
#
# If the node has a next_reserve entry, change the reservation
#
my $result = DBQueryFatal("select node_id,pid,eid from next_reserve ".
"where node_id='$n'");
if ( $result->num_rows() > 0 ) {
#
# Add the reservation to a list to be taken care of later
#
my ($node, $next_pid, $next_eid) = $result->fetchrow_array();
$reserves{$node} = "$next_pid:$next_eid";
} else {
#
# No reloads or reservation changes, so really free the node
#
# This little sillyness is for disk reloading. Remember the last
# project a node was reserved into.
#
DBQuery("insert into last_reservation values ('$n', '$pid')");
print "Releasing node '$n'...";
if (DBQueryWarn("delete from reserved " .
"where node_id='$n' and eid='$eid'")) {
"Succeeded.\n";
} else {
$error++;
}
}
}
DBQueryWarn("delete from delays where node_id='$n'") || $error++;
#
# Find the control net interface for this node type, as well as some
# of the default values for its node type
# of the default values for its node type.
#
$result = DBQueryFatal("select control_net,osid,node_types.pxe_boot_path " .
"from node_types " .
"left join nodes on nodes.type=node_types.type " .
"where node_id='$n'");
my $result =
DBQueryFatal("select control_net,osid,node_types.pxe_boot_path " .
"from node_types " .
"left join nodes on nodes.type=node_types.type " .
"where node_id='$n'");
my ($control, $osid, $pxe_boot_path) = $result->fetchrow_array();
if (! ($n =~ /sh\d+/)) { # If its not a shark
# Clean out all IPs except the control net
#
# Clean up interfaces by clearing IPs and/or aliases.
#
if (! ($n =~ /sh\d+/)) {
# Its not a shark, so clean out all IPs except the control net.
DBQueryWarn("update interfaces set IP='' " .
"where node_id='$n' and card!='$control'") || $error++;
} else {
# Shark hack
# it is a shark, so clear out the alias(es)
"where node_id='$n' and card!='$control'") || $error++;
}
else {
# XXX Shark Hack!
DBQueryWarn("update interfaces set IPalias='' ".
"where node_id='$n'") || $error++;
# End shark hack
}
#
# Clean out all delays
# Map the default OSID to something that is actually loaded on the
# machine. I think this is a hack, but its the best I can come up
# with right now.
#
DBQueryWarn("delete from delays where node_id='$n'") || $error++;
my $mapped_osid = MapNodeOSID($n, $osid);
if ($mapped_osid) {
$osid = $mapped_osid;
}
#
# And clean out various tidbits from the nodes table.
#
DBQueryWarn("update nodes set def_boot_osid='$osid', def_boot_cmd_line='',".
"def_boot_path='',startupcmd='',rpms='',deltas='',tarballs='',".
"pxe_boot_path='$pxe_boot_path', next_pxe_boot_path='' ".
"where node_id='$n'") || $error++;
# Clean up the nodes table so that its in a moderately clean state.
#
DBQueryWarn("update nodes set def_boot_osid='$osid',def_boot_cmd_line='',".
"def_boot_path='',startupcmd='',rpms='',deltas='', ".
"tarballs='',pxe_boot_path='$pxe_boot_path', ".
"next_pxe_boot_path='' where node_id='$n'") || $error++;
#
# Clean out the current_reloads table
# Clean out the current_reloads table (a just in case measure).
#
DBQueryWarn("delete from current_reloads where node_id='$n'") || $error++;
}
DBQueryFatal("unlock tables");
######################################################################
# Step 2 - Set up reserves and reloads
#
# If any nodes were awaiting reloads, put them in the proper
# experiment, and issue an os_load, if appropriate. If any nodes were
# set up for simple reservations, we take care of that in this stage
# too.
######################################################################
if ((@reloads > 0) || (keys %reserves > 0)) {
#
# Lock tables to maintain consistency
# Now its safe to change the reservation.
#
print "Locking tables.\n";
DBQueryFatal("lock tables nodes read, node_types read, scheduled_reloads read, ".
"interfaces write, reserved write, next_reserve write");
#
# Take care of reloads by putting them into a special experiment,
# which is processed by the reload_daemon
# If the node has a reloads entry, change the reservation so that the
# reload_daemon will pick it up.
#
foreach my $n ( @reloads ) {
#
# Change reservation (don't delete or we'll get races)
#
print "Changing reservation for $n to $reloadpid/$reload_pendingeid...\n";
$result =
DBQueryFatal("select node_id,image_id from scheduled_reloads " .
"where node_id='$n'");
if ($result->numrows()) {
print "Moving $n to $reloadpid/$pendingeid.\n";
DBQueryWarn("update reserved set ".
"pid='$reloadpid',eid='$reload_pendingeid',vname=NULL ".
"where node_id='$n'") || $error++;
"pid='$reloadpid',eid='$pendingeid' ".
"where node_id='$n'") || $error++;
next;
}
#
# Handle scheduled reservations
# If the node has a next_reserve entry, change the reservation.
#
foreach my $n ( keys %reserves ) {
my ($next_pid,$next_eid) = split (":",$reserves{$n});
#
# Change reservation (don't delete or we'll get races)
#
print "Changing reservation for $n to $next_pid/$next_eid...\n";
$result =
DBQueryFatal("select node_id,pid,eid from next_reserve ".
"where node_id='$n'");
if ($result->num_rows()) {
my ($node, $next_pid, $next_eid) = $result->fetchrow_array();
print "Moving $n to $next_pid/$next_eid.\n";
DBQueryWarn("update reserved set pid='$next_pid',eid='$next_eid'," .
"vname=NULL where node_id='$n'") || $error++;
DBQueryWarn("delete from next_reserve where node_id='$n'") || $error++;
}
"vname=NULL where node_id='$n'")
|| $error++;
DBQueryWarn("delete from next_reserve where node_id='$n'")
|| $error++;
next;
}
#
# Done, so we can now unlock tables
# No reloads or reservation changes, so really free the node
#
print "Unlocking tables.\n";
DBQueryFatal("unlock tables");
# This little sillyness is for disk reloading. Remember the last
# project a node was reserved into.
#
DBQuery("insert into last_reservation values ('$n', '$pid')");
print "Releasing node '$n' ... ";
if (DBQueryWarn("delete from reserved where node_id='$n'")) {
print "Succeeded.\n";
}
else {
print "Failed!\n";
$error++;
}
}
######################################################################
......@@ -274,6 +315,13 @@ if ((@reloads > 0) || (keys %reserves > 0)) {
#
# Using a list of freed nodes built earlier, run consetup to reset
# their serial consoles.
#
# NOTE: While it may seem like a race to do this after releasing the
# reservation, it really is not. Even if the node is allocated again
# console_setup looks at the current reservation and does the right
# thing, and since nalloc locks the reserved table, ordering will be
# preserved.
#
######################################################################
if (@freed_nodes) {
......@@ -299,8 +347,8 @@ if (@freed_nodes) {
#
# Finally, run the actual command
#
system("$consetup @conlist") == 0 or
print STDERR "WARNING: $consetup @conlist failed!\n";
system("$consetup @conlist") == 0 ||
print STDERR "WARNING: $consetup @conlist failed!\n";
}
exit($error);
......