Commit 6904db1d authored by Mike Hibler

Deal with some DHCP issues related to the nfe driver for Nvidia NICs.

The Nvidia NICs on the PRObE machines will occasionally hang when dhclient
is running. Gary Sandine came up with a work-around to keep things moving.
This is my "re-interpretation" of his fix (i.e., if it doesn't work, don't
blame him!)
parent c86aa1b1
......@@ -115,20 +115,22 @@ if [ -n "$new_routers" ]; then
fi
fi
#
# We have observed problems where changing the speed/duplex of a link
# leaves DNS a little wonky. So we whack on it til it responds so that
# the sethostname script won't fail.
#
if [ "$new_network_number" = "10.200.1.0" ]; then
    # At most three probes; the pause after a failed probe grows 0, 1, 2 seconds.
    for i in 0 1 2; do
        # Stop as soon as the tmcc bossinfo request succeeds.
        $BINDIR/tmcc bossinfo >/dev/null 2>&1 && break
        echo "`date`: ${interface}: waiting for DNS.." >>$LOGDIR/dhclient-exit.log 2>&1
        sleep $i
    done
fi
##
## This is now handled in sethostname.
##
## We have observed problems where changing the speed/duplex of a link
## leaves DNS a little wonky. So we whack on it til it responds so that
## the sethostname script won't fail.
##
#if [ "$new_network_number" = "10.200.1.0" ]; then
# for i in 0 1 2; do
# if `$BINDIR/tmcc bossinfo >/dev/null 2>&1`; then
# break
# fi
# echo "`date`: ${interface}: waiting for DNS.." >>$LOGDIR/dhclient-exit.log 2>&1
# sleep $i
# done
#fi
#
# See if the Testbed configuration software wants to change the hostname.
......
#!/usr/bin/perl -wT
#
# Copyright (c) 2000-2004, 2007 University of Utah and the Flux Group.
# Copyright (c) 2000-2012 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -72,9 +72,23 @@ if (! ($curname =~ /.+/)) {
}
#
# We assume that we are in the same domain as our boss node
# We assume that we are in the same domain as our boss node.
#
# Note: tmccbossname can fail if the DNS is not responding properly.
# Sometimes this is transient and is caused by the control net interface
# getting tweaked (down/up'ed, speed/duplex changed) during the DHCP process.
# Hence we wait a couple of seconds and retry if this happens.
#
# First attempt; a false result is treated as a (possibly transient) DNS
# failure and retried below.
my $bossname = tmccbossname();
if (!$bossname) {
    # Up to three more attempts, one second apart.
    foreach my $attempt (1 .. 3) {
        print STDERR "Waiting for DNS...\n";
        sleep(1);
        $bossname = tmccbossname();
        last if ($bossname);
    }
    # Fall back to the empty string only when every retry failed, so that a
    # name obtained by a retry is kept and the domain match that follows
    # never sees an undefined value.
    $bossname = ""
        if (!$bossname);
}
if ($bossname =~ /^[^\.]+\.(.*)$/) {
$domain = ".$1";
}
......
......@@ -36,6 +36,29 @@ if [ ! -d "$ELAB_BOOTDIR" ]; then
fi
fi
#
# XXX hack for PRObE and nfe interfaces.
# The nfe driver apparently has issues and can fail to get an address.
# Gary Sandine found that taking the interface down and back up again can
# unstick things (if you are persistent enough).
#
cnet_nfe_hack()
{
    #
    # Arguments: the names of the nfe interfaces to reset.
    #
    # Each pass takes every listed interface down and back up, then waits
    # progressively longer (5 up to 30 seconds) before checking whether
    # $ELAB_BOOTDIR/controlif exists. That file is created elsewhere
    # (presumably by the dhclient hooks -- confirm); its appearance is
    # taken to mean a control interface was found.
    #
    # All progress messages go to stdout so the caller can redirect them.
    # The exit status is always 0; callers re-check the controlif file.
    #
    for i in 5 10 15 20 20 20 30 30 30; do
        echo "`date`: nfe_hack: taking interfaces down and up ..."
        for _if in "$@"; do
            ifconfig "$_if" down
            sleep 2
            ifconfig "$_if" up
        done
        sleep $i
        if [ -e "$ELAB_BOOTDIR/controlif" ]; then
            echo "`date`: nfe_hack: worked!"
            return 0
        fi
    done

    # Every attempt was used up; record that in the log instead of
    # ending silently.
    echo "`date`: nfe_hack: giving up, control interface still not found"
    return 0
}
# this is a separate function so we can redirect all the output below
cnet_dhcp()
{
......@@ -47,9 +70,10 @@ cnet_dhcp()
# installed.
#
if [ -x /usr/local/sbin/dhclient ]; then
echo "Using dhclient port..."
echo "`date`: Using dhclient port..."
dhclient_program="/usr/local/sbin/dhclient"
${dhclient_program} ${dhclient_flags} $*
echo "`date`: $dhclient_program returned $?"
else
echo "Using default dhclient..."
......@@ -68,6 +92,8 @@ cnet_dhcp()
cnet_start()
{
_nfe=""
rm -f $ELAB_BOOTDIR/controlif
#
......@@ -90,6 +116,10 @@ cnet_start()
;;
ath*)
;;
nfe*)
_nfe="$_nfe $_if"
_ifs="$_ifs $_if"
;;
*)
_ifs="$_ifs $_if"
;;
......@@ -147,9 +177,26 @@ cnet_start()
echo "Emulab looking for control net among: $_ifs ..."
cnet_dhcp $_ifs >$ELAB_LOGDIR/netif-emulab.log 2>&1
if [ -e $ELAB_BOOTDIR/controlif ]; then
echo "Emulab control net is `cat $ELAB_BOOTDIR/controlif`"
_cif=`cat $ELAB_BOOTDIR/controlif`
echo "Emulab control net is $_cif"
else
#
# XXX hack for nfe device on PRObE nodes. The PRObE
# nfe interfaces can get hung, but a down/up seems to
# get things moving again. cnet_nfe_hack cycles the
# interfaces down/up and then waits for the controlif
# file to show up.
#
if [ -n "$_nfe" ]; then
echo "Engaging control net nfe hack on: $_nfe ..."
cnet_nfe_hack $_nfe >>$ELAB_LOGDIR/netif-emulab.log 2>&1
fi
if [ -e $ELAB_BOOTDIR/controlif ]; then
_cif=`cat $ELAB_BOOTDIR/controlif`
echo "Emulab control net is $_cif"
else
echo "*** No Emulab control net found!"
fi
fi
else
#
......
......@@ -36,6 +36,29 @@ if [ ! -d "$ELAB_BOOTDIR" ]; then
fi
fi
#
# XXX hack for PRObE and nfe interfaces.
# The nfe driver apparently has issues and can fail to get an address.
# Gary Sandine found that taking the interface down and back up again can
# unstick things (if you are persistent enough).
#
cnet_nfe_hack()
{
    #
    # Arguments: the names of the nfe interfaces to reset.
    #
    # Each pass takes every listed interface down and back up, then waits
    # progressively longer (5 up to 30 seconds) before checking whether
    # $ELAB_BOOTDIR/controlif exists. That file is created elsewhere
    # (presumably by the dhclient hooks -- confirm); its appearance is
    # taken to mean a control interface was found.
    #
    # All progress messages go to stdout so the caller can redirect them.
    # The exit status is always 0; callers re-check the controlif file.
    #
    for i in 5 10 15 20 20 20 30 30 30; do
        echo "`date`: nfe_hack: taking interfaces down and up ..."
        for _if in "$@"; do
            ifconfig "$_if" down
            sleep 2
            ifconfig "$_if" up
        done
        sleep $i
        if [ -e "$ELAB_BOOTDIR/controlif" ]; then
            echo "`date`: nfe_hack: worked!"
            return 0
        fi
    done

    # Every attempt was used up; record that in the log instead of
    # ending silently.
    echo "`date`: nfe_hack: giving up, control interface still not found"
    return 0
}
# this is a separate function so we can redirect all the output below
cnet_dhcp()
{
......@@ -47,9 +70,10 @@ cnet_dhcp()
# installed.
#
if [ -x /usr/local/sbin/dhclient ]; then
echo "Using dhclient port..."
echo "`date`: Using dhclient port..."
dhclient_program="/usr/local/sbin/dhclient"
${dhclient_program} ${dhclient_flags} $*
echo "`date`: $dhclient_program returned $?"
else
echo "Using default dhclient..."
......@@ -68,6 +92,8 @@ cnet_dhcp()
cnet_start()
{
_nfe=""
rm -f $ELAB_BOOTDIR/controlif
#
......@@ -85,8 +111,15 @@ cnet_start()
case $_if in
lo*|gif*|faith*|tun*|plip*|usbus*)
;;
# XXX skip Intel 10Gb for now; they require huge numbers of mbufs
ix[0-9])
;;
ath*)
;;
nfe*)
_nfe="$_nfe $_if"
_ifs="$_ifs $_if"
;;
*)
_ifs="$_ifs $_if"
;;
......@@ -144,9 +177,26 @@ cnet_start()
echo "Emulab looking for control net among: $_ifs ..."
cnet_dhcp $_ifs >$ELAB_LOGDIR/netif-emulab.log 2>&1
if [ -e $ELAB_BOOTDIR/controlif ]; then
echo "Emulab control net is `cat $ELAB_BOOTDIR/controlif`"
_cif=`cat $ELAB_BOOTDIR/controlif`
echo "Emulab control net is $_cif"
else
#
# XXX hack for nfe device on PRObE nodes. The PRObE
# nfe interfaces can get hung, but a down/up seems to
# get things moving again. cnet_nfe_hack cycles the
# interfaces down/up and then waits for the controlif
# file to show up.
#
if [ -n "$_nfe" ]; then
echo "Engaging control net nfe hack on: $_nfe ..."
cnet_nfe_hack $_nfe >>$ELAB_LOGDIR/netif-emulab.log 2>&1
fi
if [ -e $ELAB_BOOTDIR/controlif ]; then
_cif=`cat $ELAB_BOOTDIR/controlif`
echo "Emulab control net is $_cif"
else
echo "*** No Emulab control net found!"
fi
fi
else
#
......
......@@ -36,6 +36,29 @@ if [ ! -d "$ELAB_BOOTDIR" ]; then
fi
fi
#
# XXX hack for PRObE and nfe interfaces.
# The nfe driver apparently has issues and can fail to get an address.
# Gary Sandine found that taking the interface down and back up again can
# unstick things (if you are persistent enough).
#
cnet_nfe_hack()
{
    #
    # Arguments: the names of the nfe interfaces to reset.
    #
    # Each pass takes every listed interface down and back up, then waits
    # progressively longer (5 up to 30 seconds) before checking whether
    # $ELAB_BOOTDIR/controlif exists. That file is created elsewhere
    # (presumably by the dhclient hooks -- confirm); its appearance is
    # taken to mean a control interface was found.
    #
    # All progress messages go to stdout so the caller can redirect them.
    # The exit status is always 0; callers re-check the controlif file.
    #
    for i in 5 10 15 20 20 20 30 30 30; do
        echo "`date`: nfe_hack: taking interfaces down and up ..."
        for _if in "$@"; do
            ifconfig "$_if" down
            sleep 2
            ifconfig "$_if" up
        done
        sleep $i
        if [ -e "$ELAB_BOOTDIR/controlif" ]; then
            echo "`date`: nfe_hack: worked!"
            return 0
        fi
    done

    # Every attempt was used up; record that in the log instead of
    # ending silently.
    echo "`date`: nfe_hack: giving up, control interface still not found"
    return 0
}
# this is a separate function so we can redirect all the output below
cnet_dhcp()
{
......@@ -47,9 +70,10 @@ cnet_dhcp()
# installed.
#
if [ -x /usr/local/sbin/dhclient ]; then
echo "Using dhclient port..."
echo "`date`: Using dhclient port..."
dhclient_program="/usr/local/sbin/dhclient"
${dhclient_program} ${dhclient_flags} $*
echo "`date`: $dhclient_program returned $?"
else
echo "Using default dhclient..."
......@@ -68,6 +92,8 @@ cnet_dhcp()
cnet_start()
{
_nfe=""
rm -f $ELAB_BOOTDIR/controlif
#
......@@ -90,6 +116,10 @@ cnet_start()
;;
ath*)
;;
nfe*)
_nfe="$_nfe $_if"
_ifs="$_ifs $_if"
;;
*)
_ifs="$_ifs $_if"
;;
......@@ -147,9 +177,26 @@ cnet_start()
echo "Emulab looking for control net among: $_ifs ..."
cnet_dhcp $_ifs >$ELAB_LOGDIR/netif-emulab.log 2>&1
if [ -e $ELAB_BOOTDIR/controlif ]; then
echo "Emulab control net is `cat $ELAB_BOOTDIR/controlif`"
_cif=`cat $ELAB_BOOTDIR/controlif`
echo "Emulab control net is $_cif"
else
#
# XXX hack for nfe device on PRObE nodes. The PRObE
# nfe interfaces can get hung, but a down/up seems to
# get things moving again. cnet_nfe_hack cycles the
# interfaces down/up and then waits for the controlif
# file to show up.
#
if [ -n "$_nfe" ]; then
echo "Engaging control net nfe hack on: $_nfe ..."
cnet_nfe_hack $_nfe >>$ELAB_LOGDIR/netif-emulab.log 2>&1
fi
if [ -e $ELAB_BOOTDIR/controlif ]; then
_cif=`cat $ELAB_BOOTDIR/controlif`
echo "Emulab control net is $_cif"
else
echo "*** No Emulab control net found!"
fi
fi
else
#
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment