Commit 6cc8cf46 authored by Dan Reading's avatar Dan Reading
Browse files

Checknode should be ready for prime-time

* remove check for gather arg, use TESTINFO from tmcc hwinfo
  instead. Only call tmcc once. Let non-MFS boot collect data.
* tmcc hwinfo not return a TESTINFO line. Handle the LOGDIR, COLLECT
  and CHECK indicators
* Due to variable scoping, make sure everything runs in the shell.
  Do not 'exit' from called programs. Source called programs instead
  of starting new shells.
* Nothing wrong with the which() function I put in since the
  MFS/busybox installation does not have one, but if we have a real
  which(1) program use it.
* output format changes and programming space changes
* remove check for gather arg, use TESTINFO from tmcc hwinfo
  instead. Only call tmcc once. Let non-MFS boot collect data.
* tmcc hwinfo not returning a TESTINFO line. Handle the LOGDIR, COLLECT
  and CHECK indicators
* Due to variable scoping, make sure everything runs in the shell.
* Do not 'exit' from called programs. Source called programs instead of
  starting new shells.
* Nothing wrong with the which() function I put in since the MFS/busybox
  installation does not have one, but if we have a real which(1) program
  use it.
* call sub-programs by sourcing not by starting new shell.
* oops mfsmode is not set until initialize is run. also set -e -u to
  catch this type of thing.

Checknode tests once again run in standard images and can collect
data.

* All tests can now be "sourced" from gatherinv to ensure everything
  runs in a single shell.
* All tests can also be called standalone.
* MFS mode and non-MFS mode honor the CHECK and COLLECT flags.
* Create, if necessary, a new project save directory.
* This will be needed once every-time checknodes is run in a new project.
  path is /proj/{pid}/nodecheck.
* If script can't run quit with a 'return 0' so the rest of the checks
  can continue.

Change filename gatherinv -> checknode

* Can't run in MFS mode if not in the emulab-ops pid
parent fd556121
......@@ -91,7 +91,7 @@ nodecheck-install: dir-install
$(INSTALL) -m 755 $(SRCDIR)/checknode/diskcheck $(BINDIR)/diskcheck
$(INSTALL) -m 755 $(SRCDIR)/checknode/memcheck $(BINDIR)/memcheck
$(INSTALL) -m 755 $(SRCDIR)/checknode/niccheck $(BINDIR)/niccheck
$(INSTALL) -m 755 $(SRCDIR)/checknode/gatherinv $(BINDIR)/gatherinv
$(INSTALL) -m 755 $(SRCDIR)/checknode/checknode $(BINDIR)/checknode
$(INSTALL) -m 755 $(SRCDIR)/checknode/rc.nodecheck $(BINDIR)/rc/rc.checknode
common-script-install: dir-install
......
#! /bin/bash
#
# Copyright (c) 2013 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
# This file is part of the Emulab network testbed software.
#
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this file. If not, see <http://www.gnu.org/licenses/>.
#
# }}}
#
# gatherinv now has two duties: gather/collect in MFS mode,
# and run in normal mode so sub-programs can be called by sourcing,
# letting everything run in the same shell. Sub-programs should
# also be allowed to run standalone.
# pull in the helpers this script calls but does not define itself
# (e.g. initialize, readtmcinfo, getfromtb)
source checkutils.sh
# treat the expansion of an unset variable as an error
set -u
# leave the script as soon as a command fails
set -e
# Run the node checks in this shell by sourcing each ${name}check program
# from $BINDIR, handing it $LOGDIR/nodecheck.log as its first argument.
# The time check is only included when check_flag is non-zero.
# Always returns 0.
nodecheck_main() {
    if (( $check_flag )) ; then
        checks="time cpu mem nic disk"
    else
        checks="cpu mem nic disk"
    fi
    # sourced, not executed, so the checks share this shell's variables
    for i in $checks ; do
        . $BINDIR/${i}check $LOGDIR/nodecheck.log
    done
    return 0
}
# Gather the inventory: source the disk, cpu, mem and nic check programs
# from $BINDIR (in that order) with $LOGDIR/nodecheck.log as argument.
# Collection is done even when collect_flag is zero; in that case a
# remark is printed first. Always returns 0.
gatherinv_main() {
    echo -n ' Gathering Inventory..'
    if (( ! $collect_flag )) ; then
        echo "Hmmm tmcc says not to collect. Doing so anyway."
    fi
    checks="disk cpu mem nic"
    # sourced, not executed, so the checks share this shell's variables
    for i in $checks ; do
        . $BINDIR/${i}check $LOGDIR/nodecheck.log
    done
    return 0
}
# Store the results of this run under ${projdir}/$host and clean up /tmp.
#
# Collect phase (collect_flag non-zero):
#   - re-reads /tmp/nodecheck.log.tb and writes it back out in tmcc hwinfo
#     format plus the full inventory listing, each tagged with the running
#     kernel and the mode/TESTINFO line
#   - creates ${projdir} (its parent must show up in df output) and the
#     per-host .tbdb/.full/.diff/.tmcc directories
#   - migrates the old full/diff/tbdb directory layout to the dot-directories
#   - copies both files in under the timestamp and refreshes the
#     $host, node and full symlinks
# Check phase (check_flag non-zero):
#   - saves the current "tmcc hwinfo" output and links it as tmcc
#   - compares the local inventory against tmcc and stores the diff report
#     (or an empty marker file when there is no difference)
#
# Globals read : collect_flag check_flag projdir host mfsmode BINDIR
# Globals set  : timestamp owd
# Helpers used (defined outside this block): readtmcinfo printtmcinfo
#   printhwinv getfromtb comparetmcinfo
# Returns 0; exits 1 when ${projdir} is unavailable and cannot be created.
postprocessing()
{
    local hostdir parent mounted sub entry

    # Remember the starting directory up front: the check phase needs it
    # even when the collect phase (which used to be the only place it was
    # set) is skipped.
    owd=$PWD
    timestamp=$(date +%y%m%d%H%M%S)

    if (( $collect_flag )) ; then
        hostdir="${projdir}/$host"

        # read in the newly found info into hwinv array
        readtmcinfo /tmp/nodecheck.log.tb hwinv
        # print it back out in tmcc hwinfo format
        printtmcinfo > /tmp/nodecheck.log.tb.new
        # note what kernel we are running
        printf "# KERNEL %s\n" "$(uname -svrm)" >> /tmp/nodecheck.log.tb.new
        # add which mode we are running as
        printf "# ismfs=%s %s\n" "$mfsmode" "$(getfromtb TESTINFO)" >> /tmp/nodecheck.log.tb.new

        # print the full listing of found inventory
        printhwinv > /tmp/nodecheck.log.inv
        printf "# KERNEL %s\n" "$(uname -svrm)" >> /tmp/nodecheck.log.inv
        printf "# ismfs=%s %s\n" "$mfsmode" "$(getfromtb TESTINFO)" >> /tmp/nodecheck.log.inv

        # make sure projdir is available
        if [ ! -d "${projdir}" ] ; then
            # parent directory of projdir, without its trailing slash
            parent="${projdir%"${projdir##*/}"}"
            if [[ "${parent:=./}" != "/" ]] ; then
                parent="${parent%?}"
            fi
            # "no match" from grep is an answer, not a failure: without the
            # '|| true' set -e ends the script before the message below
            mounted=$(df | grep "$parent" || true)
            if [ -z "$mounted" ] ; then
                echo "OHhh all this work for nothing $projdir not mounted can't save info"
                exit 1
            fi
            # same reasoning: let the explicit test report a failed mkdir
            mkdir "${projdir}" || true
            if [ ! -d "${projdir}" ] ; then
                echo "Was not able to make $projdir...whats up with that"
                exit 1
            fi
            chmod 755 "$projdir"
        fi

        # start XXX
        # decided to change names again, rename if old names -- XXX remove
        # this after all nodes have run the new code.
        # Takes into consideration that an old directory structure may have
        # been added to a new directory structure.
        if [ -d "$hostdir" ] ; then
            cd "$hostdir"
            for entry in "$host.full" "$host.diff" ; do
                if [[ -L $entry ]] ; then
                    rm "$entry"
                fi
            done
            for sub in full diff tbdb ; do
                if [ -d "$sub" ] ; then
                    if [ ! -d ".$sub" ] ; then
                        mkdir ".$sub"
                        chmod g+x ".$sub"
                    fi
                    # word-splitting of the listing is intentional here
                    for entry in $(ls -A "$sub") ; do
                        mv "$sub/$entry" ".$sub"
                    done
                    rmdir "$sub"
                fi
            done
            cd "$owd"
        fi
        # end XXX

        # have needed dirs ?
        for sub in "$hostdir" "$hostdir/.tbdb" "$hostdir/.full" "$hostdir/.diff" "$hostdir/.tmcc" ; do
            if [ ! -d "$sub" ] ; then
                mkdir -p "$sub"
                chmod g+x "$sub"
            fi
        done

        # copy over the files including timestamps
        cp /tmp/nodecheck.log.tb.new "$hostdir/.tbdb/$timestamp"
        cp /tmp/nodecheck.log.inv "$hostdir/.full/$timestamp"

        # remove old symlinks and make new ones
        cd "$hostdir"
        for entry in "$host" node full tmcc diff ; do
            if [[ -L $entry ]] ; then
                rm -f "$entry"
            fi
        done
        ln -s ".tbdb/${timestamp}" "$hostdir/$host"
        ln -s ".tbdb/${timestamp}" "$hostdir/node"
        ln -s ".full/${timestamp}" "$hostdir/full"
        cd "$owd"

        # make sure no sudo is needed for read
        chmod g+r "$hostdir/.tbdb/${timestamp}" "$hostdir/.full/${timestamp}"
    fi

    if (( $check_flag )) ; then
        hostdir="${projdir}/$host"
        cd "$hostdir"
        # save what tmcc says right now (run it directly; wrapping it in
        # $( ) would try to execute whatever it printed)
        "$BINDIR/tmcc" hwinfo > "$hostdir/.tmcc/$timestamp"
        # the collect phase normally removed the old link; a leftover one
        # (collection skipped) must not make ln fail
        if [[ -L tmcc ]] ; then
            rm -f tmcc
        fi
        ln -s ".tmcc/$timestamp" "$hostdir/tmcc"
        cd "$owd"

        # Test what was found locally against what is in the database
        readtmcinfo tmcc hwinvcopy                  # info from tmcc.bin hwinv
        # if testing can do something like readtmcinfo ${projdir}/test
        readtmcinfo /tmp/nodecheck.log.inv hwinv    # full local listing into hwinv
        # diff for local stuff not in tbdb
        comparetmcinfo /tmp/nodecheck.diff          # file for output

        # if we ended up with a diff file handle it
        if [ -s /tmp/nodecheck.diff ] ; then
            {
                # header
                printf "\nDiff Report for %s @ %s\nKernel %s\n" "$host" "$(date)" "$(uname -srvm)"
                echo "------------------------------------------------------------------"
                # body
                cat /tmp/nodecheck.diff
            } >> "$hostdir/.diff/${timestamp}"
            # link it
            cd "$hostdir"
            if [[ -L diff ]] ; then
                rm -f diff
            fi
            ln -s ".diff/${timestamp}" "$hostdir/diff"
            cd "$owd"
        else
            # no diff maybe it got better, put a marker out
            cp /dev/null "$hostdir/.diff/${timestamp}"
        fi
        chmod g+r "$hostdir/.diff/${timestamp}"
    fi

    # rm -f plus an explicit return: ending on '[[ -f x ]] && rm x' made the
    # function return 1 whenever the last file was absent, which under
    # set -e terminated the calling script with an error.
    rm -f /tmp/nodecheck.log.inv /tmp/nodecheck.log.tb.new /tmp/nodecheck.diff
    return 0
}
# Script entry: set up, run the gather pass when mfsmode is non-zero or the
# check pass otherwise, then store the results.
# "$@" is quoted so arguments containing whitespace reach the callees intact.
initialize "$@"
# if/else instead of 'A && B || C': with the latter a failing gather pass
# would fall through and run the check pass as well.
if (( $mfsmode )) ; then
    gatherinv_main "$@"
else
    nodecheck_main "$@"
fi
postprocessing
......@@ -21,46 +21,75 @@
# }}}
#
#exit on unbound var
set -u
#exit on any error
set -e
#only source this file once
if [ "${checkutils+"beenhere"}" == "beenhere" ] ; then
return 0
else
checkutils="sourced"
if [ -z ${BASH_VERSINFO[0]} -o ${BASH_VERSINFO[0]} -lt 4 ] ; then
echo "Need at least BASH version 4 to run nodecheck tests or to Collect Inventory, Not running checks"
exit 0
fi
if [ -z "${BINDIR-""}" -a -f "/etc/emulab/paths.sh" ]; then
source /etc/emulab/paths.sh
fi
# Global Vars
# the bash syntax ${var-1} means: use var if set else use nothing
[[ -z "${NOSM-}" ]] && declare NOSM="echo" #do nothing command
[[ -z "${host-}" ]] && declare host #emulab hostname
[[ -z "${failed-}" ]] && declare failed="" #major falure to be commicated to user
[[ -z "${os-}" ]] && declare os="" #[Linux|FreeBSD] for now
[[ -z "${todo_exit-}" ]] && declare -a todo_exit
[[ -z "${hwinv[hwinvidx]-}" ]] && declare -A hwinv["hwinvidx"]="" # hwinv from tmcc
[[ -z "${hwinvcopy[hwinvidx]-}" ]] && declare -A hwinvcopy["hwinvidx"]="" # a copy of hwinv from tmcc
[[ -z "${tmccinfo[hwinvidx]-}" ]] && declare -A tmccinfo["hwinvidx"]="" # info from tmcc hwinfo
[[ -z "${collect_flag-}" ]] && declare -i collect_flag # from tmcc hwinfo
[[ -z "${check_flag-}" ]] && declare -i check_flag # from tmcc hwinfo
[[ -z "${projdir-}" ]] && declare projdir # from tmcc hwinfo
[[ -z "${errexit_val-}" ]] && declare errexit_val # holding var for set values, ie -e
[[ -z "${mfsmode-}" ]] && declare -i mfsmode=0 #are we running in a MFS?
# PathNames
[[ -z "${logfile-}" ]] && declare logfile # output log
[[ -z "${logfile4tb-}" ]] && declare -r logfile4tb="/tmp/nodecheck.log.tb" # for data to saved in perm storage
[[ -z "${tmplog-}" ]] && declare -r tmplog="/tmp/.$$tmp.log"
[[ -z "${logout-}" ]] && declare -r logout="/tmp/.$$logout.log" # temperary logging while building inventory"
[[ -z "${tmpout-}" ]] && declare -r tmpout="/tmp/.$$tempout.log" # ditto
# DEBUG
[[ -z "${DEBUG-}" ]] && declare -ir DEBUG=0 # Some debugging if set
# One-time setup: turns on -u/-e, detects MFS mode, makes sure BINDIR and
# LOGDIR are set, runs inithostname / initlogs / inittestinfo, installs the
# ERR trap and finally marks itself done through the exported initdone.
# Arguments are passed on unchanged to initlogs.
# A second call is a no-op (prints a notice when DEBUG is non-zero).
# NOTE(review): this body appears to contain both the pre- and post-change
# lines of a diff (MFS detection is present twice, globals are declared both
# here and at file level) -- confirm which lines belong to the current file.
initialize () {
#exit on unbound var
set -u
#exit on any error
set -e
#call only once
if [ "${initdone-uninit}" != "uninit" ] ; then
(( $DEBUG )) && printf "Attempt to call twice %s:%s called from %s\n" $FUNCNAME $LINENO "$(caller)"
return 0
fi
# NOTE(review): in bash, declare inside a function makes the name local
# (unless -g is given), so assignments below would not reach a global
# mfsmode -- confirm this line is still wanted.
declare -i mfsmode="" #are we running in a MFS
# the marker file /etc/emulab/ismfs decides the mode
if [ -f /etc/emulab/ismfs ] ; then
mfsmode=1
else
mfsmode=0
fi
# no BINDIR yet: take the paths from paths.sh or fall back to fixed defaults
if [ -z "${BINDIR-""}" ] ; then
if [ -f "/etc/emulab/paths.sh" ]; then
source /etc/emulab/paths.sh
else
export BINDIR=/usr/local/etc/emulab
export LOGDIR=/var/tmp
fi
fi
# same MFS detection as above (duplicate)
if [ -f /etc/emulab/ismfs ] ; then
mfsmode=1
else
mfsmode=0
fi
declare errext_val # holding var for set value, ie -e
inithostname
initlogs $@
inittestinfo
# Global Vars
# NOTE(review): declared with plain declare inside the function, see the
# note on mfsmode above.
declare NOSM="echo" #do nothing command
declare host #emulab hostname
declare failed="" #major failure to be communicated to user
declare os="" #[Linux|FreeBSD] for now
declare -a todo_exit
declare -A hwinv # hwinv from tmcc
declare -A hwinvcopy # a copy of hwinv from tmcc
declare -A tmccinfo # info from tmcc hwinfo
#trap 'err_report $FUNCNAME:$LINENO' ERR
# report the line number whenever the ERR trap fires
trap 'err_report $LINENO' ERR
#declare -A tcm_out # hwinv for output
#declare -A tcm_inv # what we have discovered
# declare -p todo_exit
# mark as done; exported, so child processes see it as well
initdone="done"
export initdone
return 0
}
# any command causes exit if -e option set
# including a grep just used so see if some string is in a file
......@@ -75,10 +104,8 @@ restore_e() {
# give some indication of exit on ERR trap
# Print where an ERR trap fired.
#   $1  location text supplied by the trap command
# The second line adds what the caller builtin reports for the call site.
err_report() {
echo "TRAP ERR at $1"
echo "TRAP ERR at Caller Line $(caller)"
}
#trap 'err_report $FUNCNAME:$LINENO' ERR
trap 'err_report $LINENO' ERR
# read info from tmcc or a file. Copy into one of the three global arrays
......@@ -496,15 +523,20 @@ printhwinv() {
# which is not in busybox and not a bash builtin
# Shell stand-in for which(1): prints the path of the command named in $1.
# NOTE(review): as written the PATH scan below ends in an unconditional
# 'return 1', so the /usr/bin/which branch that follows it is never reached.
# This looks like the old and the new body of a diff shown back to back --
# confirm which one is the current implementation.
which() {
# turn PATH into a space separated list and print the first hit for $1
mypath=$PATH
mypath=${mypath//:/ }
for i in $mypath ; do
if [ -e $i/$1 ] ; then
echo $i/$1
return 0
fi
done
return 1
if [ -x /usr/bin/which ] ; then
# have real which, use it
# (returns 0 whatever /usr/bin/which itself reported)
/usr/bin/which $@
return 0
else
# no which(1) installed: same PATH scan as above, but without an explicit
# failure status when nothing is found
mypath=$PATH
mypath=${mypath//:/ }
for i in $mypath ; do
if [ -e $i/$1 ] ; then
echo $i/$1
return 0
fi
done
fi
}
inithostname() {
......@@ -546,13 +578,15 @@ findSmartctl() {
# Run every command line stored in the todo_exit array, in order.
# Installed as the EXIT trap by add_on_exit.
# Each entry is word-split and executed as a simple command. A failing entry
# does not keep the remaining ones from running. Always returns 0.
on_exit() {
    local cmd
    # (( $DEBUG )) && echo "EXIT on_exit $(caller)"
    # the ${arr[@]+...} form keeps an empty array from tripping set -u on
    # older bash versions
    for cmd in ${todo_exit[@]+"${todo_exit[@]}"} ; do
        # run the entry itself; the former $($cmd) additionally tried to
        # execute whatever the command printed
        $cmd || true
    done
    return 0
}
add_on_exit() {
local nex=${#todo_exit[*]}
local -i nex=${#todo_exit[*]}
todo_exit[$nex]="$@"
if [[ $nex -eq 0 ]]; then
trap on_exit EXIT
......@@ -560,29 +594,64 @@ add_on_exit() {
return 0
}
# setup logging
# setup logging $1 is local log file if not set default to /tmp file
# $2 is the collect file, if not set then no collection is done
# Set up the log files for a run.
#   $1  local log file (default /tmp/nodecheck.log)
#   $2  collect file; a placeholder name is used when it is not given
# Creates/truncates the temporary files and registers their removal with
# add_on_exit. A second call is a no-op (notice printed when DEBUG is set).
# NOTE(review): several statements appear twice in slightly different form
# (assignment+create followed by create only); this looks like old and new
# diff lines shown together -- confirm which are current. Also note that the
# tmpout name used here (/tmp/.$$tmpout.log) would differ from a file-level
# declaration using /tmp/.$$tempout.log.
initlogs () {
# the following syntax lets us test if a positional arg is set before we try and use it
# need if running with -u set.
#call only once
if [ "${initlogdone-notdone}" != "notdone" ] ; then
(( $DEBUG )) && printf "Attempt to call twice %s:%s called from %s\n" $FUNCNAME $LINENO "$(caller)"
return 0
fi
# the following bash syntax lets us test if a positional arg is set
# before we try and use it
# needed if running with -u set.
# It means use $1 if set else use a default path
logfile=${1-"/tmp/nodecheck.log"}
# need to have inittestinfo run, help programmer out
[[ "${collect_flag-undef}" = "undef" ]] && inittestinfo
# this file is only used in gather mode
# and should have been created in gatherinv
# set the name so it can be tested for
logfile4tb=${2-".$$no4tb"}
# only when collecting: create the collect file empty and schedule its removal
(( $collect_flag )) && { cat /dev/null > $logfile4tb ; add_on_exit "rm -f $logfile4tb" ; }
tmplog=/tmp/.$$tmp.log ; cat /dev/null > ${tmplog} # create and truncate
cat /dev/null > ${tmplog} # create and truncate
add_on_exit "rm -f $tmplog"
logout=/tmp/.$$logout.log ; cp /dev/null ${logout} # make it exist
cp /dev/null ${logout} # make it exist
add_on_exit "rm -f $logout"
tmpout=/tmp/.$$tmpout.log ; cp /dev/null ${tmpout}
cp /dev/null ${tmpout}
add_on_exit "rm -f $tmpout"
# mark as done; exported, so child processes see it as well
initlogdone="done"
export initlogdone
return 0
}
# Derive collect_flag, check_flag and projdir from the TESTINFO entry of the
# tmccinfo array (read through readtmcinfo first when the array has not been
# filled yet).
#   collect_flag  first word of the 3rd '='-separated field
#   check_flag    first word of the 4th '='-separated field
#   projdir       text between the first pair of double quotes; only taken
#                 (and required to be an absolute path) when collect_flag is
#                 non-zero, otherwise set to ""
# A missing TESTINFO entry yields both flags 0.
# A second call is a no-op (notice printed when DEBUG is non-zero).
# Returns 0; exits 1 when collecting is requested with a non-absolute path.
inittestinfo () {
    local testinfo

    #call only once
    if [ "${inittestdone-notdone}" != "notdone" ] ; then
        if (( ${DEBUG-0} )) ; then
            printf "Attempt to call twice %s:%s called from %s\n" "$FUNCNAME" "$LINENO" "$(caller)"
        fi
        return 0
    fi

    # if tmccinfo array not set then read it in
    if [[ -z "${tmccinfo["hwinvidx"]-}" ]] ; then
        readtmcinfo tmcc tmccinfo
    fi

    # '-' default: tmcc may not have returned a TESTINFO line at all, which
    # must not trip set -u
    testinfo=${tmccinfo["TESTINFO"]-}
    collect_flag=$(printf '%s\n' "$testinfo" | awk -F = '{print $3}' | awk '{print $1}')
    check_flag=$(printf '%s\n' "$testinfo" | awk -F = '{print $4}' | awk '{print $1}')
    # nothing parsed means "off"
    [[ -n "$collect_flag" ]] || collect_flag=0
    [[ -n "$check_flag" ]] || check_flag=0

    if (( $collect_flag )) ; then
        projdir=$(printf '%s\n' "$testinfo" | awk -F \" '{print $2}')
        # the path is only validated when it is going to be used; before,
        # collect=0 (empty projdir) ran into this error as well
        if [[ "${projdir:0:1}" != '/' ]] ; then
            printf "%s():collect is set but invaild path given |%s|" "$FUNCNAME" "$projdir"
            exit 1
        fi
    else
        projdir=""
    fi

    inittestdone="done"
    export inittestdone
    return 0
}
getdrivenames() {
# use smartctl if exits
# use scan of disk devices
......@@ -639,6 +708,91 @@ getdrivenames() {
return 0
}
# Helper for getfromtb: print one field of every numbered unit record.
#   $1  tmccinfo key holding the "<key> UNITS=<count>" line (e.g. DISKINFO)
#   $2  key prefix of the numbered records (e.g. DISKUNIT -> DISKUNIT0, ...)
#   $3  name of the KEY=value pair to print (e.g. SN)
# Each value is printed without '=' and '"' characters, followed by a space.
_getfromtb_unitfield() {
    local infokey=$1 unitkey=$2 field=$3
    local -i units=0 n=0 elm=0
    local count record objval obj val
    local -a parts

    count=${tmccinfo[$infokey]}
    units=${count#"$infokey UNITS="}
    for ((n=0; n<units; n++)) ; do
        record=${tmccinfo[${unitkey}${n}]}
        # deliberately unquoted: split the record into its words
        parts=(${record})
        # word 0 is the record tag itself, the pairs start at 1
        for ((elm=1; elm<${#parts[*]}; elm++)) ; do
            objval=${parts[$elm]}
            [[ -z $objval ]] && continue    # no tuple
            obj=${objval%%=*}
            val=${objval##*=}
            [[ -z $val ]] && continue       # no value
            if [ "$obj" = "$field" ] ; then
                val=${val//=/}
                val=${val//\"/}
                printf "%s " "$val"
            fi
        done
    done
    return 0
}

# Print the requested piece of hardware info from the tmccinfo array.
#   $1  type of info:
#       TESTINFO | CPUINFO   the entry without its leading "<type> "
#       NETINFO | DISKINFO   the entry without "<type> ", "UNITS" and '='
#                            (leaves the unit count)
#       MEMINFO              the entry without "MEMINFO SIZE="
#       DISKUNIT             serial numbers (SN) of all disk units
#       NETUNIT              IDs (ID) of all network units
# Prints nothing and returns 0 when tmccinfo has no (non-empty) entry for
# the request. Exits 1 on an unknown type.
# All working variables are local, so a caller's loop variables (i, n, ...)
# are left alone.
getfromtb() {
    local info=$1
    local s

    case $info in
        TESTINFO | CPUINFO )
            # make sure that tmccinfo does have the info requested.
            [[ -z "${tmccinfo[$info]-}" ]] && return 0
            # take off the info
            s=${tmccinfo[$info]}
            s=${s/$info }
            printf "%s" "$s"
            ;;
        NETINFO | DISKINFO )
            [[ -z "${tmccinfo[$info]-}" ]] && return 0
            s=${tmccinfo[$info]}
            s=${s//=/}
            s=${s/UNITS}
            s=${s/$info }
            printf "%s" "$s"
            ;;
        MEMINFO )
            [[ -z "${tmccinfo[$info]-}" ]] && return 0
            s=${tmccinfo[$info]}
            s=${s/$info SIZE=}
            printf "%s" "$s"
            ;;
        DISKUNIT )
            [[ -z "${tmccinfo[${info}0]-}" ]] && return 0
            # only returning serial numbers
            _getfromtb_unitfield DISKINFO DISKUNIT SN
            ;;
        NETUNIT )
            [[ -z "${tmccinfo[${info}0]-}" ]] && return 0
            # only return ID
            _getfromtb_unitfield NETINFO NETUNIT ID
            ;;
        * ) printf "ibinfo what is this in my case statment |%s|\n" "$info" ; exit 1
            ;;
    esac
    return 0
}
# The timesys function terminates its script unless it terminates earlier on its own
# args: max_time output_file command command_args
# does not work....
......@@ -663,3 +817,4 @@ timesys() {
} > $out 2>&1
}
return 0
......@@ -25,12 +25,9 @@
echo -n ' Cpucheck..'
source checkutils.sh
#source getfromtb.sh
if [ -z "$BOOTDIR" ]
then
BOOTDIR=/var/emulab/boot
fi
x=$(caller)
[[ "${x/NULL}" = "$x" ]] && declare -ri cpucheck_standalone=0 || declare -ri cpucheck_standalone=1
declare arch="" failed="" s=""
declare -i sockets=0 cores_socket=0 threads_core=0 mhz=0 err=0
......@@ -38,46 +35,26 @@ declare -i hyperthread=0 x64bit=0 hwvirt=0
declare p1="" p2="" p3=""
declare phy="" cid=""
os=`uname`
host=`hostname`
if [ -e "$BOOTDIR/realname" ]; then
host=`cat $BOOTDIR/realname`
fi
### setup logging
##if [ $1 ] ; then
## logfile=$1
##else
## logfile="/tmp/nodecheck.log"
##fi
##tmplog=/tmp/.$$.log
##cat /dev/null > ${tmplog}
initlogs $@
#set +x
#exit on unbound var
set -u
finish() {
echo "cpucheck `date`" >> ${logfile}
cat ${tmplog} >> ${logfile}
echo -n "Cpucheck `date`: " >> ${logfile}
# cat ${tmplog} >> ${logfile}
summary="Arch:$arch Sockets:$sockets Cores_socket:$cores_socket Threads_core:$th\
reads_core Mhz:$mhz HT:${hyperthread} 64bit:${x64bit} HV:${hwvirt}"
if [ -z "${failed}" ]
then
echo -n "Arch:$arch Sockets:$sockets Cores_socket:$cores_socket Threads_core:$threads_core Mhz:$mhz F:${hyperthread}${x64bit}${hwvirt} "
echo -n "$summary"
echo "$summary" >> ${logfile}
echo "OK"
else
echo "$failed"
exit 1
(( $cpucheck_standalone )) && exit 1 || return 1
fi