Commit 8608423b authored by David Johnson's avatar David Johnson

Setup IMQ in openvz containers so that shaped lans have ingress shaping.

Of course, attaching tc filters via IMQ doesn't seem to work inside
openvz...
parent 84eeccb8
......@@ -22,6 +22,7 @@ use Exporter;
ixpsetup libsetup_refresh gettopomap getfwconfig gettiptunnelconfig
gettraceconfig genhostsfile getmotelogconfig calcroutes fakejailsetup
getlocalevserver genvnodesetup getgenvnodeconfig stashgenvnodeconfig
getlinkdelayconfig
TBDebugTimeStamp TBDebugTimeStampsOn
......@@ -736,6 +737,81 @@ sub getifconfig($;$)
return 0;
}
#
# Fetch the linkdelay configuration from tmcc and return it parsed.
#
# Arguments:
#   $rptr    - array reference; on return it holds one hash reference per
#              well-formed LINKDELAY line, keyed by the field names listed
#              in @fields below.
#   $nocache - optional; when true, "nocache" is passed along to tmcc().
#
# Returns 0 on success (lines that fail to parse are warned about and
# skipped), or -1 with @$rptr emptied if the tmcc call fails.
#
sub getlinkdelayconfig($;$)
{
    my ($rptr,$nocache) = @_;
    my @tmccresults = ();
    my %tmccopts    = ();

    $tmccopts{"nocache"} = 1
        if ($nocache);

    if (tmcc(TMCCCMD_LINKDELAYS, undef, \@tmccresults, %tmccopts) < 0) {
        warn("*** WARNING: Could not get linkdelay config from server!\n");
        @$rptr = ();
        return -1;
    }

    # Hash keys, in the same order as the capture groups in the pattern.
    my @fields = qw(IFACE TYPE LINKNAME VNODE INET MASK
                    PIPE DELAY BW PLR
                    RPIPE RDELAY RBW RPLR
                    RED LIMIT
                    MAXTHRESH MINTHRESH WEIGHT
                    LINTERM QINBYTES BYTES
                    MEANPSIZE WAIT SETBIT
                    DROPTAIL GENTLE);

    # The pattern is assembled from pieces purely for readability.
    my $pat = join('',
        q(LINKDELAY IFACE=([\d\w]+) TYPE=(simplex|duplex) ),
        q(LINKNAME=([-\d\w]+) VNODE=([-\d\w]+) ),
        q(INET=([0-9.]*) MASK=([0-9.]*) ),
        q(PIPE=(\d+) DELAY=([\d\.]+) BW=(\d+) PLR=([\d\.]+) ),
        q(RPIPE=(\d+) RDELAY=([\d\.]+) RBW=(\d+) RPLR=([\d\.]+) ),
        q(RED=(\d) LIMIT=(\d+) ),
        q(MAXTHRESH=(\d+) MINTHRESH=(\d+) WEIGHT=([\d\.]+) ),
        q(LINTERM=(\d+) QINBYTES=(\d+) BYTES=(\d+) ),
        q(MEANPSIZE=(\d+) WAIT=(\d+) SETBIT=(\d+) ),
        q(DROPTAIL=(\d+) GENTLE=(\d+)));

    my @parsed = ();
    foreach my $line (@tmccresults) {
        # In list context a successful match yields all capture groups.
        my @values = ($line =~ /^$pat/);

        if (!@values) {
            warn "*** WARNING: Bad linkdelay line: $line\n";
            next;
        }

        my %entry = ();
        @entry{@fields} = @values;
        push(@parsed, \%entry);
    }

    @$rptr = @parsed;
    return 0;
}
#
# Read the topomap and return something.
#
......
......@@ -84,6 +84,9 @@ if (! -e $TC && -e "/sbin/tc") { # If we hacked iproute rpm
$TC = "/sbin/tc";
}
my $IPTABLES = "/usr/local/sbin/iptables"; # This is the working version!
if (! -e $IPTABLES && -e "/sbin/iptables") { # If we hacked iptables rpm
$IPTABLES = "/sbin/iptables";
}
my $IFCONFIG = "/sbin/ifconfig";
my $MODPROBE = "/sbin/modprobe";
my $SYSCTL = "/sbin/sysctl";
......@@ -107,6 +110,23 @@ if ($reinstall || $uninstall) {
if ($uninstall);
}
#
# Find available imq devices if we're inside a non-fully virtualized container.
# This is necessary because the imq devices will have been bound into our
# container -- we can't do anything until that happens.
# If we're not in a container, they might not exist yet, so we use the old icky
# way -- modprobe'ing 10 and making assumptions :-(.
#
# $contained - set to 1 when running inside an openvz container.
# @imqdevs   - names of the imq devices visible under /sys/class/net.
# $imqidx    - index of the next entry of @imqdevs to hand out.
#
my $contained = 0;
my @imqdevs = ();
my $imqidx = 0;
if (GENVNODE() && GENVNODETYPE() eq 'openvz') {
    $contained = 1;
    opendir(my $netdir,'/sys/class/net')
        or die "could not open /sys/class/net: $!";
    #
    # readdir() returns entries in no particular order; sort by device
    # number so that the device handed to each link is the same on every run.
    #
    @imqdevs = sort { substr($a,3) <=> substr($b,3) }
               grep { /^imq\d+$/ } readdir($netdir);
    closedir($netdir);
}
#
# Update the delays configuration. Also run the commands to make
# the changes.
......@@ -462,10 +482,12 @@ sub LinkDelaySetup()
# Figure out how we're going to flush iproute2+tc!
# print DEL "ipfw -f flush\n";
print DEL "modprobe imq numdevs=10\n";
print DEL "sysctl -w net.core.rmem_max=8388608\n";
print DEL "sysctl -w net.core.wmem_max=8388608\n";
print DEL "sysctl -w net.core.netdev_max_backlog=2048\n";
if (!$contained) {
print DEL "modprobe imq numdevs=10\n";
print DEL "sysctl -w net.core.rmem_max=8388608\n";
print DEL "sysctl -w net.core.wmem_max=8388608\n";
print DEL "sysctl -w net.core.netdev_max_backlog=2048\n";
}
foreach $delay (@delays) {
my $pat = q(LINKDELAY IFACE=([\d\w]+) TYPE=(simplex|duplex) );
......@@ -583,7 +605,7 @@ sub LinkDelaySetup()
# XXX: temporarily select between delay, plr, and [g]red
# until they become classful queues.
print DEL "ifconfig $iface txqueuelen $queue\n";
print DEL "$IFCONFIG $iface txqueuelen $queue\n";
print DEL "$TC qdisc add dev $iface handle $pipeno root ";
print DEL "plr $plr\n";
......@@ -601,22 +623,29 @@ sub LinkDelaySetup()
}
$iface =~ /\D+(\d+)/;
my $imqnum = $1;
my $imqdev = "imq$1";
my $imqnum = $1;
if ($contained) {
$imqdev = $imqdevs[$imqidx++];
if ($imqdev =~ /\D+(\d+)/) {
$imqnum = $1;
}
}
if ($type eq "duplex") {
print DEL "$TC qdisc add dev imq${imqnum} handle $pipeno ";
print DEL "$TC qdisc add dev $imqdev handle $pipeno ";
print DEL "root plr $rplr\n";
print DEL "$TC qdisc add dev imq${imqnum} handle ";
print DEL "$TC qdisc add dev $imqdev handle ";
print DEL "". ($pipeno+10) ." parent ${pipeno}:1 ";
print DEL "delay usecs $rdelay reset_time 1\n";
print DEL "$TC qdisc add dev imq${imqnum} handle ";
print DEL "$TC qdisc add dev $imqdev handle ";
print DEL "". ($pipeno+20) ." parent ". ($pipeno+10) .":1 ";
print DEL "htb default 1\n";
if ($rbandw != 0) {
print DEL "$TC class add dev imq${imqnum} classid ";
print DEL "$TC class add dev $imqdev classid ";
print DEL "". ($pipeno+20) .":1 parent ". ($pipeno+20) ." ";
print DEL "htb rate ${rbandw} ceil ${rbandw}\n";
}
......@@ -624,7 +653,7 @@ sub LinkDelaySetup()
print DEL "$IPTABLES -t mangle -A PREROUTING -i $iface ";
print DEL "-j IMQ --todev $imqnum\n";
print DEL "ifconfig imq${imqnum} up\n";
print DEL "$IFCONFIG $imqdev up\n";
#
# *** From FreeBSD version:
......@@ -646,7 +675,7 @@ sub LinkDelaySetup()
#}
if ($type eq "duplex") {
print MAP "$linkname duplex $vnode $vnode $iface imq${imqnum} ".
print MAP "$linkname duplex $vnode $vnode $iface $imqdev ".
"$pipeno $rpipeno\n";
}
else {
......@@ -671,7 +700,10 @@ sub LinkDelaySetup()
}
$checkreplace = 1;
}
if ($checkreplace) {
#
# Only replace the kernel if we're not in a VM, doh!
#
if ($checkreplace && !(GENVNODE() && GENVNODETYPE() eq 'openvz')) {
checkkernel($kernel);
}
return 0;
......
......@@ -48,6 +48,16 @@ use Data::Dumper;
require "/etc/emulab/paths.pm"; import emulabpaths;
use libvnode;
#
# Turn off line buffering on output
#
$| = 1;
#
# Load the OS independent support library. It will load the OS dependent
# library and initialize itself.
#
my $defaultImage = "emulab-default";
sub VZSTAT_RUNNING() { return "running"; }
......@@ -194,7 +204,7 @@ sub vz_rootPreConfig {
# and configuring them as necessary.
#
sub vz_rootPreConfigNetwork {
my ($node_ifs,$node_ifsets) = @_;
my ($node_ifs,$node_ifsets,$node_lds) = @_;
# figure out what bridges we need to make:
# we need a bridge for each physical iface that is a multiplex pipe,
......@@ -263,6 +273,77 @@ sub vz_rootPreConfigNetwork {
}
}
#
# Figure out how many IMQ devices we need: one per duplex linkdelay entry,
# summed over every node in $node_lds.
#
my $imqnum = 0;
foreach my $node (keys(%$node_lds)) {
    foreach my $ldc (@{$node_lds->{$node}}) {
        if ($ldc->{"TYPE"} eq 'duplex') {
            ++$imqnum;
        }
    }
}
my $numchanged = 0;
#
# Find out how many devices we configured last time (there is no
# way to tell how many there are if some have already been dedicated to
# running containers, so we have to check by writing a file...)
#
my $nidf = "${emulabpaths::VARDIR}/numimqdevs";
my $imqmapf = "${emulabpaths::VARDIR}/imqmap";
my $oldimqnum;
if (-e $nidf) {
    open(FD,"<",$nidf) or die "could not open $nidf for read: $!\n";
    $oldimqnum = <FD>;
    close(FD);
    # An empty file leaves $oldimqnum undefined, which is treated below the
    # same as "never configured".
    if (defined($oldimqnum)) {
        chomp($oldimqnum);
        # Only a count that DIFFERS from last time requires a reload.
        if ($oldimqnum ne "$imqnum") {
            $numchanged = 1;
        }
    }
}
#
# XXX: we have to rmmod the imq module to change the number of allocated
# devices, ugh! So, all the VMs had better be stopped before we do this!
#
if (!defined($oldimqnum) || $numchanged) {
    # NOTE(review): the result of the unload is not checked; if it fails
    # because devices are still in use, the old device count stays in effect.
    if (!system('lsmod | grep -q imq')) {
        system("$MODPROBE -r imq");
    }
    if ($imqnum) {
        mysystem("$MODPROBE imq numdevs=$imqnum");
        mysystem("$MODPROBE ipt_IMQ");
        #
        # This is ugly -- we write a map file that tells
        # vz_vnodePreConfigExpNetwork which container gets which imq devs.
        # Each line is "<node>\t<comma separated imq devices>".
        #
        open(FD,">",$imqmapf)
            or die "could not open ${emulabpaths::VARDIR}/imqmap for write: $!\n";
        my $ii = 0;
        foreach my $node (sort(keys(%$node_lds))) {
            my @nidevs = ();
            foreach my $ldc (@{$node_lds->{$node}}) {
                if ($ldc->{"TYPE"} eq 'duplex') {
                    push(@nidevs,"imq$ii");
                    ++$ii;
                }
            }
            print FD "$node\t" . join(',',@nidevs) . "\n";
        }
        close(FD);
    }
    elsif (-e $imqmapf) {
        # No imq devices are needed any more; a map left over from an
        # earlier run would name devices that no longer exist.
        unlink($imqmapf)
            or warn "*** WARNING: could not remove stale $imqmapf: $!\n";
    }
    #
    # Record the new count last, so that it is only remembered once the
    # module and the map file actually reflect it (presumably mysystem()
    # is fatal on failure -- confirm).
    #
    open(FD,">",$nidf) or die "could not open $nidf for write: $!\n";
    print FD "$imqnum\n";
    close(FD);
}
return 0;
}
......@@ -349,6 +430,49 @@ sub vz_vnodeKill {
}
#
# Reconcile the network devices bound into a container with the imq map.
#
# Arguments:
#   $vnode_id - vnode name; matched against the first column of the imq map
#               and passed to vzctl.
#   $vmid     - container id; used to locate /etc/vz/conf/$vmid.conf.
#
# Devices listed for this vnode in ${emulabpaths::VARDIR}/imqmap but absent
# from the container's NETDEV setting are added; devices in NETDEV that the
# map does not list are removed; devices present in both are left alone.
# Always returns 0.
#
sub vz_vnodePreConfig {
    my ($vnode_id,$vmid) = @_;

    #
    # Devices the map file says this vnode should have.
    #
    my %wanted = ();
    my $mapfile = "${emulabpaths::VARDIR}/imqmap";
    if (-e $mapfile) {
        open(my $mapfh,'<',$mapfile)
            or die "could not open ${emulabpaths::VARDIR}/imqmap: $!\n";
        while (my $line = <$mapfh>) {
            chomp($line);
            if ($line =~ /^([^\s]+)\s+([^\s]+)$/ && $1 eq $vnode_id) {
                foreach my $dev (split(/,/,$2)) {
                    $wanted{$dev} = 1;
                }
            }
        }
        close($mapfh);
    }

    #
    # Devices already bound into the container according to its config file.
    # The file is parsed here rather than through a shell pipeline, where
    # the backreference in the sed expression would be interpolated by perl
    # before sed ever saw it. A missing or unreadable config simply means
    # nothing is mapped yet.
    #
    my %existing = ();
    my $conffile = "/etc/vz/conf/$vmid.conf";
    if (open(my $conffh,'<',$conffile)) {
        while (my $line = <$conffh>) {
            if ($line =~ /^\s*NETDEV="([^"]*)"/) {
                # A later NETDEV assignment overrides an earlier one.
                %existing = ();
                foreach my $dev (split(/,/,$1)) {
                    $existing{$dev} = 1
                        if ($dev ne '');
                }
            }
        }
        close($conffh);
    }

    #
    # Remove what should no longer be there, then add what is missing.
    # NOTE(review): as before, any NETDEV entry not in the map is removed,
    # not just imq devices -- confirm nothing else relies on NETDEV.
    #
    foreach my $dev (sort(keys(%existing))) {
        next
            if (exists($wanted{$dev}));
        mysystem("$VZCTL set $vnode_id --netdev_del $dev --save");
    }
    foreach my $dev (sort(keys(%wanted))) {
        next
            if (exists($existing{$dev}));
        mysystem("$VZCTL set $vnode_id --netdev_add $dev --save");
    }

    return 0;
}
......@@ -539,7 +663,7 @@ sub vz_vnodePreConfigControlNetwork {
# Preconfigures experimental interfaces in the vnode before its first boot.
#
sub vz_vnodePreConfigExpNetwork {
my ($vnode_id,$vmid,$ifs) = @_;
my ($vnode_id,$vmid,$ifs,$lds) = @_;
my $elabifs = "";
my %netif_strs = ();
......
......@@ -219,6 +219,7 @@ if (tmcc(TMCCCMD_VNODELIST, undef, \@tmccresults) < 0) {
}
my %ifconfigs = ();
my %ldconfigs = ();
my %vmconfigs = ();
foreach my $str (@tmccresults) {
if ($str =~ /^VNODEID=([-\w]+) JAILED=(\d)$/) {
......@@ -245,6 +246,14 @@ foreach my $str (@tmccresults) {
if (getifconfig(\@ifc));
$ifconfigs{$vid} = \@ifc;
#
# Now the linkdelay config:
#
my @ldc = ();
die "getlinkdelayconfig($vid): $!"
if (getlinkdelayconfig(\@ldc));
$ldconfigs{$vid} = \@ldc;
#
# Now the vmconfig:
#
......@@ -364,7 +373,8 @@ elsif ($action eq 'boot') {
TBDebugTimeStamp("starting $vmtype rootPreConfigNetwork")
if ($debug);
$libops{GENVNODETYPE()}{'rootPreConfigNetwork'}->(\%ifconfigs,{});
$libops{GENVNODETYPE()}{'rootPreConfigNetwork'}->(\%ifconfigs,{},
\%ldconfigs);
TBDebugTimeStamp("finished $vmtype rootPreConfigNetwork")
if ($debug);
......@@ -413,7 +423,8 @@ elsif ($action eq 'boot') {
# OP: exp net preconfig
next if (safeLibOp($vnode,'vnodePreConfigExpNetwork',1,1,
$vnode,$vmid,$ifconfigs{$vnode}));
$vnode,$vmid,
$ifconfigs{$vnode},$ldconfigs{$vnode}));
next if (safeLibOp($vnode,'vnodeConfigResources',1,1,$vnode,$vmid));
next if (safeLibOp($vnode,'vnodeConfigDevices',1,1,$vnode,$vmid));
......@@ -459,7 +470,8 @@ elsif ($action eq 'boot') {
# OP: exp net preconfig
next if (safeLibOp($vnode,'vnodePreConfigExpNetwork',1,1,
$vnode,$vmid,$ifconfigs{$vnode}));
$vnode,$vmid,
$ifconfigs{$vnode},$ldconfigs{$vnode}));
next if (safeLibOp($vnode,'vnodeConfigResources',1,1,$vnode,$vmid));
next if (safeLibOp($vnode,'vnodeConfigDevices',1,1,$vnode,$vmid));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment