#!/bin/sh
#
# Copyright (c) 2000-2012 University of Utah and the Flux Group.
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
#
# Reboot when done, unless the script was invoked with exactly one
# argument and that argument is "-noreboot".
#
case "$#,$1" in
1,-noreboot)
    reboot=0
    ;;
*)
    reboot=1
    ;;
esac

#
# Amount of memory in MB to leave for everyone else in the system.  If you
# get out-of-memory or vm_pager errors while running frisbee, increase this.
#
RESIDMEM=32

# Take BINDIR/BOOTDIR/ETCDIR from paths.sh when it is readable; otherwise
# point all three at /etc/testbed.
if [ -r /etc/emulab/paths.sh ]; then
	. /etc/emulab/paths.sh
else
	BINDIR=/etc/testbed
	BOOTDIR=/etc/testbed
	ETCDIR=/etc/testbed
fi

# Behave a little different on widearea nodes.
# isrem is 1 when the marker file $ETCDIR/isrem exists, 0 otherwise.
isrem=0
if [ -e "$ETCDIR/isrem" ]; then
    isrem=1
fi

#
# Update the MBR of the given disk to the indicated "version."
#
# XXX this is somewhat of a hack right now.  We recognize two
# versions of the MBR:
#	v1 (partition 1 size 6281352)
#	v2 (partition 1 size 12305790)
# Currently we only install a new MBR if the existing one is the
# wrong size, just in case the user has customized the boot program.
#
tweakmbr() {
    # Arguments:
    #   $1 - disk name relative to /dev (e.g. "ad0")
    #   $2 - MBR version wanted; the boot block is copied from
    #        /etc/emulab/mbr<version>.dd
    #   $3 - optional; 1 means overwrite an MBR whose partition 1 size is
    #        not one of the recognized values.  Defaults to 0 so that a
    #        missing argument no longer produces a "[" syntax error.
    # Returns non-zero only when the final write to the disk fails.
    _DSK=$1
    _NEW=$2
    _ALWAYS=${3:-0}

    # Make sure the disk is readable at all before deciding anything.
    dd if="/dev/$_DSK" of=/dev/null bs=512 count=1 2>/dev/null || {
	echo "WARNING: could not read from $_DSK, MBR not changed"
	return
    }

    # Size of partition 1 as reported by "fdisk -s"; empty if there is
    # no partition 1 line in the output.
    _size=`fdisk -s "$_DSK" 2>/dev/null | sed -n -e 's/^ *1: *[0-9][0-9]* *\([0-9][0-9]*\).*$/\1/p'`

    # The trailing "s" keeps the case word non-empty when _size is empty.
    case "${_size}s" in
    6281352s)
	_CUR=1
	;;
    12305790s)
	_CUR=2
	;;
    s)
	# special case: no part1 so probably no MBR at all, make sure we install
	echo "Found no MBR on $_DSK, installing version $_NEW"
	_CUR=1000000
	;;
    *)
	if [ "$_ALWAYS" -eq 1 ]; then
	    echo "WARNING: overwriting unknown MBR on $_DSK with version $_NEW"
	    _CUR=1000000
	else
	    echo "WARNING: custom MBR on $_DSK, not changed"
	    return
	fi
	;;
    esac

    # Nothing to do when the disk already carries the requested version.
    if [ "$_CUR" = "$_NEW" ]; then
	return
    fi

    # now set it if we can
    if [ ! -r "/etc/emulab/mbr${_NEW}.dd" ]; then
	echo "WARNING: cannot find MBR version $_NEW, not installed"
	return
    fi

    echo "Installing MBR version $_NEW ..."
    dd if="/etc/emulab/mbr${_NEW}.dd" of="/dev/$_DSK" bs=512 count=1 || {
	echo "WARNING: failed to write MBR version $_NEW to $_DSK"
	return 1
    }
}

find_disks() {
    # Print, space separated on one line, the disk devices named in the
    # boot-time dmesg output.
    #
    # Arguments:
    #   $1 - optional path of the dmesg file to scan; defaults to
    #        /var/run/dmesg.boot
    #
    # A line counts when it looks like "<name><unit>: <digits>MB ...".
    # The substitution consumes the whole line so that only the device
    # name is emitted; otherwise the rest of the line would be word-split
    # by the "for" below and stray words such as "data" could match the
    # da*/ad* patterns.
    _DMESG=${1:-/var/run/dmesg.boot}
    _DISKS=""
    for d in `sed -n 's/^\([a-z]*[0-9][0-9]*\): [0-9][0-9]*MB.*$/\1/p' "$_DMESG"`; do
	case $d in
	ad*|da*|ar*|aacd*|amrd*|mfid*|mfisyspd*)
	    _DISKS="$_DISKS $d"
	    ;;
	esac
    done

    # Unquoted on purpose: collapses the leading separator.
    echo $_DISKS
}

#
# Function to zero all potential superblocks in the DOS partitions that
# could interfere with the OSes on the image being loaded.
#
# FreeBSD 4 or 5 goes out of its way to make this hard.  In FBSD4, we
# cannot overwrite the beginning of partitions that have a legit superblock.
# In FBSD5, DOS partitions that have a zero type cannot even be accessed.
# So we have to use the whole-disk special file using offsets extracted
# via fdisk.
#
zapsuperblocks() {
    # Arguments:
    #   $1 - disk name relative to /dev (e.g. "ad0")
    #
    # Writes 16 zeroed blocks (dd's default block size) at the start
    # offset of every partition listed by "fdisk -s", going through the
    # whole-disk device.  Failures are reported but are not fatal.
    _DSK=$1

    #
    # Note we are not overly concerned about the consequences of misparsing
    # the fdisk output.  If we whack random blocks, it doesn't hurt anything.
    #
    offs=`fdisk -s "$_DSK" 2>/dev/null | sed -n -e 's/^[ 0-9]*: *\([0-9]*\).*$/\1/p'`

    # No partition lines at all: nothing to invalidate.
    if [ x"$offs" = x ]; then
	return
    fi

    # printf rather than "echo -n": the latter is not portable across shells.
    printf '%s' "Invalidating old potential superblocks on $_DSK: "
    for off in $offs; do
	printf '%s ' "$off"
	dd if=/dev/zero of="/dev/${_DSK}" oseek="$off" count=16 >/dev/null 2>&1 || {
	    echo "WARNING: failed to invalidate $off"
	}
    done
    echo ""

    return
}

#
# Function to load a single image on a disk
#
loadone() {
    # Arguments:
    #   $1 - one loadinfo line: whitespace-separated KEY=VALUE settings
    #   $2 - zero-based index of this image within the current run
    # Globals read:    BINDIR, BOOTDIR, BOSSIP, RESIDMEM, isrem, FIRSTMBR
    # Globals written: the per-image settings below (DISK in particular is
    #                  left set for the caller) and FIRSTMBR when $2 is 0.
    # Returns: 0 when the image was written and slicefix was run,
    #          1 otherwise.
    _LOADINFO=$1
    _NUM=$2

    echo "Loading image #$_NUM"

    #
    # Start from a clean slate.  Everything here is a shell global, so
    # values computed for a previous image (slice arguments, local image
    # file, ...) would otherwise leak into this one.
    #
    ADDR=""
    SERVER=""
    PART=""
    PARTOS=""
    DISK=""
    ZFILL=""
    ACPI=""
    ASF=""
    NOCLFLUSH=""
    MBRVERS=""
    PREPARE=""
    VGAONLY=""
    IMAGEID=""
    KEEPALIVE=""
    SLICE=""
    PTYPE=""
    MEMARGS=""
    imagefile=""
    isurl=0
    ispath=0

    #
    # Parse the KEY=VALUE words.  Only the keys listed in the case arm are
    # accepted.  The value is assigned verbatim (minus one pair of
    # surrounding quotes) rather than handing the whole word to eval, so
    # shell metacharacters in a value (e.g. "&" in a URL) are never
    # executed.  The eval below only ever sees a whitelisted variable name.
    #
    for parm in $_LOADINFO; do
        case $parm in
        ADDR=*|PART=*|PARTOS=*|SERVER=*|DISK=*|ZFILL=*|ACPI=*|\
        NOCLFLUSH=*|MBRVERS=*|ASF=*|PREPARE=*|VGAONLY=*|IMAGEID=*|\
        KEEPALIVE=*)
            _key=${parm%%=*}
            _val=${parm#*=}
            case $_val in
            \"*\")
                _val=${_val#\"}
                _val=${_val%\"}
                ;;
            \'*\')
                _val=${_val#\'}
                _val=${_val%\'}
                ;;
            esac
            eval "$_key=\$_val"
            ;;
        *)
            echo "WARNING: bad loadinfo parameter \"$parm\" ignored"
            ;;
        esac
    done

    #
    # Assign defaults where needed.
    #
    SERVER=${SERVER:-$BOSSIP}
    PART=${PART:-'0'}
    PARTOS=${PARTOS:-'unknown'}
    DISK=${DISK:-'ad0'}
    ZFILL=${ZFILL:-'0'}
    ACPI=${ACPI:-'unknown'}
    ASF=${ASF:-'unknown'}
    NOCLFLUSH=${NOCLFLUSH:-'unknown'}
    VGAONLY=${VGAONLY:-'unknown'}
    MBRVERS=${MBRVERS:-'1'}
    PREPARE=${PREPARE:-'0'}

    #
    # XXX If KEEPALIVE is not explicitly set, attempt to intuit a value.
    #
    # It appears that FreeBSD 8.x's IGMP v3 implementation doesn't
    # properly send V2 reports when it is connected to a V2-only querier
    # (switch). It insists on sending V3 reports even when the default
    # version is set to 2. So if we detect that we have the newer IGMP
    # implementation, we will use the V2 keep alive mechanism in the
    # frisbee client.
    #
    if [ -z "$KEEPALIVE" ]; then
        igmpversion=`sysctl -n net.inet.igmp.default_version 2>/dev/null`
        if [ "$igmpversion"x != "x" ]; then
            echo "WARNING: possible IGMP issues; using frisbee keep alive timer"
            KEEPALIVE=30
        else
            KEEPALIVE=0
        fi
    fi

    #
    # One of ADDR or IMAGEID must be set.  IMAGEID wins when both are.
    #
    if [ x"$IMAGEID" != x ]; then
        ADDR=""
        # IMAGEID=pid,gid,imagename
        # An explicit "%s" format keeps a "%" in the data from being
        # interpreted by awk's printf.
        pid=`printf '%s\n' "$IMAGEID" | awk -F, '{ printf "%s", $1 }'`
        name=`printf '%s\n' "$IMAGEID" | awk -F, '{ printf "%s", $3 }'`
        IMAGEID="$pid/$name"
    elif [ x"$ADDR" = x ]; then
        echo "Unable to get imageid or address for loading image"
        return 1
    fi

    #
    # Slice images: tell frisbee which slice to write and, for the OSes
    # we know about, which DOS partition type to set.
    #
    if [ "$PART" != "0" ]; then
        SLICE="-s $PART"
        case $PARTOS in
        FreeBSD)
            PTYPE=165
            ;;
        OpenBSD)
            PTYPE=166
            ;;
        Fedora|Linux)
            PTYPE=131
            ;;
        *)
            ;;
        esac
        if [ -n "$PTYPE" ]; then
            SLICE="$SLICE -D $PTYPE"
        fi
    fi

    #
    # set memory limits:
    #	allow $RESIDMEM MB for non-frisbee stuff
    #	split remaining memory (min of 2MB) between network/disk buffering
    #
    _resid=${RESIDMEM:-32}
    HOSTMEM=`sysctl -n hw.usermem`
    case $HOSTMEM in
    ''|*[!0-9]*)
        # sysctl gave us nothing usable; skip the limits entirely
        HOSTMEM=0
        ;;
    esac
    HOSTMEM=$(($HOSTMEM / 1048576))
    if [ "$HOSTMEM" -ge $(($_resid + 2)) ]; then
        HOSTMEM=$(($HOSTMEM - $_resid))
        KBYTES=$(($HOSTMEM * 1024))
        DATASEGSZ=`ulimit -d`
        case $DATASEGSZ in
        ''|*[!0-9]*)
            # "unlimited" or otherwise non-numeric: nothing to clamp to
            ;;
        *)
            if [ "$KBYTES" -gt "$DATASEGSZ" ]; then
                KBYTES=$DATASEGSZ
                HOSTMEM=$(($KBYTES / 1024))
                echo "WARNING: kernel limits buffering to $HOSTMEM MB"
            fi
            ;;
        esac
        ulimit -v $KBYTES

        # Let the client split up the memory
        MEMARGS="-M $HOSTMEM"
    fi

    #
    # Make sure the necessary device files exist (only necessary on
    # FreeBSD 4.x).  Note that we create partition files for all slices,
    # not just slice 1, for the benefit of the slicefix script.
    #
    if [ -x /dev/MAKEDEV ] && [ ! -e "/dev/$DISK" ]; then
        (cd /dev; ./MAKEDEV $DISK ${DISK}s2a ${DISK}s3a ${DISK}s4a)
    fi

    #
    # Work out where the image comes from: an http(s) URL, a local path,
    # a "mcastaddr:port" pair, or (no ADDR) an image id handed to the
    # frisbee master server.
    #
    if [ x"$ADDR" != x ]; then
        case $ADDR in
        http://*|https://*)
            echo "Need to download $ADDR"

            isurl=1
            if [ ! -d /images ]; then
                echo "Need to create or mount /images directory!"
                return 1
            fi

            #
            # This needs a lot more work ...
            #
            # NOTE(review): the download below uses "wget -P /images",
            # which presumably stores only the final path component,
            # while this keeps the full URL path -- confirm for URLs
            # that contain directories.
            #
            imagefile=`printf '%s\n' "$ADDR" | sed -e 's,^http[s]*://[^/]*/,,'`
            imagefile="/images/$imagefile"
            ;;
        /*)
            ispath=1

            if [ ! -e "$ADDR" ]; then
                echo "$ADDR does not exist!"
                return 1
            fi
            imagefile="$ADDR"
            ;;
        *)
            PORT=`printf '%s\n' "$ADDR" | awk -F: '{ printf "%s", $2 }'`
            MCAST=`printf '%s\n' "$ADDR" | awk -F: '{ printf "%s", $1 }'`
            if [ -e "$BOOTDIR/myip" ]; then
                MCASTIF="-i `cat "$BOOTDIR/myip"`"
            else
                MCASTIF=""
            fi
            MCASTADDR="-m $MCAST -p $PORT"
            IMAGEID="$MCASTIF $MCASTADDR"
            ;;
        esac
    else
        #
        # Note: if you want to use broadcast rather than multicast as
        # the distribution method, add "-X bcast" to the IMAGEID= below.
        #
        IMAGEID="-B 30 -F $IMAGEID"
    fi

    #
    # ZFILL==1: use frisbee
    # ZFILL==2: separate disk-wipe pass (not yet implemented)
    #
    if [ "$ZFILL" != "0" ]; then
        ZFILL="-z"
    else
        ZFILL=""
    fi

    if [ "$KEEPALIVE" != "0" ]; then
        KA="-K $KEEPALIVE"
    else
        KA=""
    fi

    #
    # Make sure the write-cache is enabled on SCSI disks.  It makes a
    # huge difference.  We don't worry about data corruption in the
    # case of a crash, because we will just reload the disk again anyway
    # in that situation.
    #
    turncacheoff=0
    case $DISK in
    da*)
        if [ -x "$BINDIR/camwce" ] && "$BINDIR/camwce" on "$DISK"; then
            turncacheoff=1
        fi
        ;;
    esac

    #
    # For slice images, ensure that the MBR is the correct version
    # and replace if not.  Later images are only checked against the
    # MBR version of the first one.
    #
    if [ "$_NUM" -eq 0 ]; then
        if [ "$PART" != "0" ]; then
            tweakmbr "$DISK" "$MBRVERS" "$PREPARE"
        fi
        FIRSTMBR=$MBRVERS
    elif [ "$FIRSTMBR" != "$MBRVERS" ]; then
        echo "MBR Mismatch: First MBR is \"$FIRSTMBR\" while image #$_NUM is \"$MBRVERS\""
    fi

    #
    # If a remote node and we have a URL, make sure that we have a place
    # to put it. Done after the MBR tweak of course. Then download the URL.
    #
    if [ "$isrem" -eq 1 ] && [ "$isurl" -eq 1 ]; then
        echo "Downloading image '$ADDR' to /images directory ..."
        "$BINDIR/mkextrafs.pl" -c -s 4 -r "$DISK" /images || {
            # XXX run growdisk to ensure we have a partition in the MBR
            "$BINDIR/growdisk" -vW "/dev/$DISK" >/dev/null 2>&1
            "$BINDIR/mkextrafs.pl" -n -f -s 4 -r "$DISK" /images || {
                echo "Could not create /images partition"
                return 1
            }
        }
        wget -nv -N -P /images "$ADDR"
        wstat=$?
        if [ "$wstat" -ne 0 ]; then
            echo "wget failed, status $wstat"
            return 1
        fi
        echo "wget succeeded getting the image"
    fi

    #
    # If not zeroing the disk and we are loading a full disk image
    # we need to ensure that we at least invalidate any old superblocks
    # that might leak through (most likely in partition 4 which isn't
    # touched by our current image).  We do this before running frisbee
    # so that any legit filesystems loaded from the image work.
    #
    # Since we do it before frisbee, we are counting on the current
    # MBR being the same as the MBR being layed down.  While not
    # a reasonable assumption in general, it mostly works in our
    # environment and at least won't hurt anything if not true.
    #
    if [ "$PREPARE" -eq 1 ] ||
       { [ "$isrem" -eq 0 ] && [ x"$ZFILL" = x ] && [ "$PART" = "0" ]; }; then
        zapsuperblocks "$DISK"
    fi

    #
    # Write the image.  The option variables are left unquoted on purpose:
    # each holds zero or more whitespace-separated arguments.
    #
    if [ x"$imagefile" != x ]; then
        # A whole-disk image (PART=0) goes to the disk device itself;
        # there is no slice 0 device to write to.
        if [ "$PART" = "0" ]; then
            _target="/dev/$DISK"
        else
            _target="/dev/${DISK}s${PART}"
        fi
        echo "Running /usr/local/bin/imageunzip -o -O -W 32 $ZFILL $imagefile $_target"
        /usr/local/bin/imageunzip -o -O -W 32 $ZFILL "$imagefile" "$_target"
    else
        echo "Running $BINDIR/frisbee -S $SERVER $MEMARGS $KA $ZFILL $SLICE $IMAGEID /dev/$DISK at `date`"
        "$BINDIR/frisbee" -S $SERVER $MEMARGS $KA $ZFILL $SLICE $IMAGEID "/dev/$DISK"
    fi
    fstat=$?

    #
    # If we mounted a partition from the disk to store the image,
    # we must unmount it now so that slicefix and others don't fail
    # due to an in-use partition.
    #
    if [ "$isrem" -eq 1 ] && [ "$isurl" -eq 1 ]; then
        umount /images || {
            echo "WARNING: could not unmount /images"
        }
    fi

    #
    # Turn the cache back off if we turned it on.
    # Is this sufficient to ensure the cache gets flushed?
    #
    if [ "$turncacheoff" -eq 1 ]; then
        "$BINDIR/camwce" off "$DISK"
    fi

    case $fstat in
    0)
        echo "Adjusting slice-related files"
        export SLICEFIX_ACPI=$ACPI
        export SLICEFIX_ASF=$ASF
        export SLICEFIX_NOCLFLUSH=$NOCLFLUSH
        export SLICEFIX_VGAONLY=$VGAONLY
        "$BINDIR/slicefix" "$PART" "$DISK"
        echo "Image #$_NUM load complete at `date`"
        return 0
        ;;
    *)
        echo "Frisbee run failed, status $fstat"
        ;;
    esac

    return 1
}

# Report that reload setup has started, then fetch the boss and node
# status information used by the rest of the script.
"$BINDIR/tmcc" state RELOADSETUP

BOSSINFO=`"$BINDIR/tmcc" bossinfo`
STATUS=`"$BINDIR/tmcc" status`

# The second whitespace-separated field of the bossinfo reply is used as
# the boss address.
BOSSIP=`echo $BOSSINFO | awk '{ print $2 }'`

# Step the clock from boss, but only when we actually got an address.
if [ -x /usr/sbin/ntpdate ] && [ -n "$BOSSIP" ]; then
	/usr/sbin/ntpdate -b "$BOSSIP" >/dev/null 2>&1
fi

# Enable IPoD
if [ -r "$BINDIR/rc.ipod" ]; then
    . "$BINDIR/rc.ipod"
fi

#
# Assign each line (one image) to one of the positional parameters.
# This is done by setting IFS to a newline and using set.
# XXX there must be a better way to do this!
#
# Split the loadinfo reply on newlines only, one positional parameter per
# line.  Globbing is switched off for the duration so that a "?" or "*"
# inside a line can never be expanded against the filesystem.
OIFS="$IFS"
IFS='
'
set -f
set -- `$BINDIR/tmcc loadinfo`
set +f
IFS="$OIFS"
if [ "$1"x = x ]; then
    echo "No load information for node"
    exit 1
fi

"$BINDIR/tmcc" state RELOADING

# HACK ALERT: If we're reloading we need to zap the superblocks and
# MBRs of any other disks in the system.  This is to prevent Linux from
# finding an old filesystem with the same label or UUID and mounting
# that instead of the right one.  We skip the disks that are mounted
# and the disk we're going to write to.
# DOUBLE HACK ALERT: Changed this to zap all disks to avoid having
# to figure out what the other disks are when loading multiple images.
# Since a new MBR will be laid down anyway there is no harm in doing
# this as long as we are sure we are in the reloading experiment.
# Only act when the status reply says the node is allocated to
# emulab-ops/reloading.
case $STATUS in
	*ALLOCATED=emulab-ops/reloading*)
		disks=`find_disks`
		for d in $disks; do
			#[ $d = $DISK ] && continue
			# Leave alone any disk with a mounted filesystem.
			if mount | grep "^/dev/$d" > /dev/null; then
				continue
			fi
			zapsuperblocks "$d"
			echo "Invalidating MBR on $d"
			# Zero the first 16 sectors of the disk.
			dd if=/dev/zero of="/dev/$d" bs=512 count=16
		done
		;;
esac

#
# Load each image in turn.
# If a load fails, we exit non-zero so that the rc script will drop into
# single-user mode.  If all loads succeed we either reboot or continue with
# the rc script as desired by the caller.
#
# Each positional parameter holds the loadinfo line of one image; NUM is
# the zero-based index handed to loadone.
NUM=0
while [ "$1"x != x ]; do
    loadone "$1" $NUM || {
	echo "Failed to load disk, dropping to login prompt at `date`"
	exit 1
    }
    shift
    NUM=$(($NUM + 1))
done
echo "Frisbee run(s) finished"

# DISK is whatever the last loadone call left it set to.
echo "Resizing final disk partition"
"$BINDIR/growdisk" -vW "/dev/$DISK"

#
# If requested to reboot, do so.
#
# Note: there is a race condition with stated here.
# If we reboot immediately after sending RELOADDONE,
# it is possible that, under heavy server load, we will
# finish the reboot and reach the bootinfo stage before
# stated gets and processes our RELOADDONE.  So now we
# wait around after sending the RELOADDONE.  stated should
# force us to reboot when the transition takes place.
# For backward compatibility we use a new state: RELOADDONEV2.
# For paranoia we just wait around for a while and then
# reboot anyway, just in case stated's reboot fails for
# some reason.
#
if [ $reboot -eq 1 ]; then
    # Announce completion with the V2 state, then give the server a
    # grace period to reboot us: 30 seconds on remote nodes, 240 otherwise.
    $BINDIR/tmcc state RELOADDONEV2
    echo "Waiting for server to reboot us ..."
    grace=240
    [ $isrem -eq 1 ] && grace=30
    sleep $grace

    # The server never did it; reboot on our own and linger while the
    # reboot takes effect.
    echo "No response from server, rebooting myself ..."
    /sbin/reboot
    sleep 100
else
    # Caller asked us not to reboot: report the plain done state.
    $BINDIR/tmcc state RELOADDONE
fi

echo "Frisbee finished"

exit 0