rc.frisbee 14.5 KB
Newer Older
1
#!/bin/sh
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2
3
#
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2012 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
5
6
# All rights reserved.
#
7
8
9
10
11
12
# Optional flag argument says "do not reboot"
#
# Reboot after loading unless the one and only argument is "-noreboot".
reboot=1
case "$#:$1" in
"1:-noreboot")
    reboot=0
    ;;
esac
Leigh B. Stoller's avatar
Leigh B. Stoller committed
13

14
15
16
17
18
19
#
# Amount of memory in MB to leave for everyone else in the system.  If you
# get out-of-memory or vm_pager error while running frisbee, increase this.
#
RESIDMEM=32

#
# Use the installed client path definitions when they are readable,
# otherwise fall back to keeping everything under /etc/testbed.
#
if [ -r /etc/emulab/paths.sh ]; then
    . /etc/emulab/paths.sh
else
    BINDIR=/etc/testbed
    BOOTDIR=/etc/testbed
    ETCDIR=/etc/testbed
fi

# Behave a little different on widearea nodes.
# The presence of $ETCDIR/isrem is what marks a node as remote.
isrem=0
if [ -e "$ETCDIR/isrem" ]; then
    isrem=1
fi
33

34
35
36
37
38
39
40
41
42
43
44
45
46
#
# Update the MBR of the given disk to the indicated "version."
#
# XXX this is somewhat of a hack right now.  We recognize two
# versions of the MBR:
#	v1 (partition 1 size 6281352)
#	v2 (partition 1 size 12305790)
# Currently we only install a new MBR if the existing one is the
# wrong size, just in case the user has customized the boot program.
#
tweakmbr() {
    # Arguments:
    #   $1 - disk name relative to /dev (e.g. ad0)
    #   $2 - MBR version that should be on the disk
    #   $3 - optional; when 1, an MBR whose partition 1 size is not one of
    #        the recognized sizes is overwritten too (default 0)
    # The replacement boot sector is read from /etc/emulab/mbr<version>.dd.
    # Problems are reported as WARNING lines on stdout.
    _DSK=$1
    _NEW=$2
    _ALWAYS=${3:-0}

    # Make sure the disk is readable at all before considering a rewrite
    dd if="/dev/$_DSK" of=/dev/null bs=512 count=1 2>/dev/null || {
        echo "WARNING: could not read from $_DSK, MBR not changed"
        return
    }

    # The size of partition 1 identifies which MBR version is installed
    _size=$(fdisk -s "$_DSK" 2>/dev/null |
        sed -n -e 's/^ *1: *[0-9][0-9]* *\([0-9][0-9]*\).*$/\1/p')
    case "${_size}s" in
    6281352s)
        _CUR=1
        ;;
    12305790s)
        _CUR=2
        ;;
    s)
        # special case: no part1 so probably no MBR at all, make sure we install
        echo "Found no MBR on $_DSK, installing version $_NEW"
        _CUR=1000000
        ;;
    *)
        if [ "$_ALWAYS" -eq 1 ]; then
            echo "WARNING: overwriting unknown MBR on $_DSK with version $_NEW"
            _CUR=1000000
        else
            echo "WARNING: custom MBR on $_DSK, not changed"
            return
        fi
        ;;
    esac

    # Nothing to do when the wanted version is already in place
    if [ "$_CUR" = "$_NEW" ]; then
        return
    fi

    # now set it if we can
    if [ ! -r "/etc/emulab/mbr${_NEW}.dd" ]; then
        echo "WARNING: cannot find MBR version $_NEW, not installed"
        return
    fi

    echo "Installing MBR version $_NEW ..."
    dd if="/etc/emulab/mbr${_NEW}.dd" of="/dev/$_DSK" bs=512 count=1 || {
        echo "WARNING: failed to install MBR version $_NEW on $_DSK"
        return 1
    }
}

92
#
# Print, space-separated on one line, the disks found in the boot-time
# kernel messages.  A disk is a line of the form "<name><unit>: <n>MB ..."
# whose name starts with one of the driver prefixes listed below.
#
# Arguments:
#   $1 - optional file to scan (default /var/run/dmesg.boot)
#
find_disks() {
    _DMESG=${1:-/var/run/dmesg.boot}
    _DISKS=""

    if [ ! -r "$_DMESG" ]; then
        echo "WARNING: cannot read $_DMESG, no disks found" >&2
        echo ""
        return
    fi

    # The trailing ".*$" drops the rest of the line so that only the
    # device name survives; otherwise words from the description could
    # match the prefixes below and be reported as disks.
    for d in $(sed -n 's/^\([a-z]*[0-9][0-9]*\): [0-9][0-9]*MB.*$/\1/p' "$_DMESG"); do
        case $d in
        ad*|da*|ar*|aacd*|amrd*|mfid*)
            _DISKS="$_DISKS $d"
            ;;
        esac
    done

    # Deliberately unquoted: collapses the leading separator
    echo $_DISKS
}

103
104
105
106
107
108
109
110
111
112
113
#
# Function to zero all potential superblocks in the DOS partitions that
# could interfere with the OSes on the image being loaded.
#
# FreeBSD 4 or 5 goes out of its way to make this hard.  In FBSD4, we
# cannot overwrite the beginning of partitions that have a legit superblock.
# In FBSD5, DOS partitions that have a zero type cannot even be accessed.
# So we have to use the whole-disk special file using offsets extracted
# via fdisk.
#
zapsuperblocks() {
    # Arguments:
    #   $1 - disk name relative to /dev (e.g. ad0)
    # Writes 16 blocks of zeros at the start offset of every partition
    # that "fdisk -s" lists for the disk, going through the whole-disk
    # device.  Progress and warnings go to stdout.
    _DSK=$1

    # Without a disk name there is nothing sensible to operate on
    if [ -z "$_DSK" ]; then
        return 0
    fi

    #
    # Note we are not overly concerned about the consequences of misparsing
    # the fdisk output.  If we whack random blocks, it doesn't hurt anything.
    #
    offs=$(fdisk -s "$_DSK" 2>/dev/null |
        sed -n -e 's/^[ 0-9]*: *\([0-9]*\).*$/\1/p')

    if [ -z "$offs" ]; then
        return
    fi

    echo -n "Invalidating old potential superblocks on $_DSK: "
    for off in $offs; do
        echo -n "$off "
        dd if=/dev/zero of="/dev/${_DSK}" oseek="$off" count=16 >/dev/null 2>&1 || {
            echo "WARNING: failed to invalidate $off"
        }
    done
    echo ""

    return
}

138
139
140
141
#
# Function to load a single image on a disk
#
loadone() {
    # Arguments:
    #   $1 - one image's load information: whitespace-separated KEY=VALUE
    #        words (see the list of accepted keys below)
    #   $2 - zero-based index of this image in the load sequence
    # Reads the script-level BINDIR, BOOTDIR, BOSSIP, RESIDMEM and isrem.
    # Leaves the parsed parameters (DISK, PART, MBRVERS, ...) and FIRSTMBR
    # set at script level; the caller uses DISK afterwards.
    # Returns 0 when the image was written and slicefix was run, 1 otherwise.
    _LOADINFO=$1
    _NUM=$2

    echo "Loading image #$_NUM"

    # Parse dem args
    ADDR=""
    SERVER=""
    PART=""
    PARTOS=""
    DISK=""
    ZFILL=""
    ACPI=""
    ASF=""
    NOCLFLUSH=""
    MBRVERS=""
    PREPARE=""
    VGAONLY=""
    IMAGEID=""
    KEEPALIVE=""

    # Derived per-image state.  This must start out clean on every call,
    # otherwise a slice spec or image file computed for an earlier image
    # would silently be reused for this one.
    SLICE=""
    PTYPE=""
    MEMARGS=""
    imagefile=""
    isurl=0
    ispath=0

    for parm in $_LOADINFO; do
        case $parm in
        ADDR=*|\
        PART=*|\
        PARTOS=*|\
        SERVER=*|\
        DISK=*|\
        ZFILL=*|\
        ACPI=*|\
        NOCLFLUSH=*|\
        MBRVERS=*|\
        ASF=*|\
        PREPARE=*|\
        VGAONLY=*|\
        IMAGEID=*|\
        KEEPALIVE=*)
            # XXX need to parse better, eval is dangerous!
            # NOTE(review): the value is evaluated as shell text, so shell
            # metacharacters in it are interpreted.  Quoting the argument
            # only keeps it from being glob-expanded first.
            eval "$parm"
            ;;
        *)
            echo "WARNING: bad loadinfo parameter \"$parm\" ignored"
            ;;
        esac
    done

    #
    # Assign defaults where needed.
    #
    SERVER=${SERVER:-$BOSSIP}
    PART=${PART:-'0'}
    PARTOS=${PARTOS:-'unknown'}
    DISK=${DISK:-'ad0'}
    ZFILL=${ZFILL:-'0'}
    ACPI=${ACPI:-'unknown'}
    ASF=${ASF:-'unknown'}
    NOCLFLUSH=${NOCLFLUSH:-'unknown'}
    VGAONLY=${VGAONLY:-'unknown'}
    MBRVERS=${MBRVERS:-'1'}
    PREPARE=${PREPARE:-'0'}

    #
    # XXX If KEEPALIVE is not explicitly set, attempt to intuit a value.
    #
    # It appears that FreeBSD 8.x's IGMP v3 implementation doesn't
    # properly send V2 reports when it is connected to a V2-only querier
    # (switch). It insists on sending V3 reports even when the default
    # version is set to 2. So if we detect that we have the newer IGMP
    # implementation, we will use the V2 keep alive mechanism in the
    # frisbee client.
    #
    if [ -z "$KEEPALIVE" ]; then
        igmpversion=$(sysctl -n net.inet.igmp.default_version 2>/dev/null)
        if [ -n "$igmpversion" ]; then
            echo "WARNING: possible IGMP issues; using frisbee keep alive timer"
            KEEPALIVE=30
        else
            KEEPALIVE=0
        fi
    fi

    #
    # One of ADDR or IMAGEID must be set.
    #
    if [ -n "$IMAGEID" ]; then
        ADDR=""
        # IMAGEID=pid,gid,imagename
        # (explicit "%s" so the field is never used as the printf format)
        pid=$(echo "$IMAGEID" | awk -F, '{ printf "%s", $1 }')
        name=$(echo "$IMAGEID" | awk -F, '{ printf "%s", $3 }')
        IMAGEID="$pid/$name"
    elif [ -z "$ADDR" ]; then
        echo "Unable to get imageid or address for loading image"
        return 1
    fi

    # For a slice image, tell frisbee which slice and which DOS type to set
    if [ "$PART" != "0" ]; then
        SLICE="-s $PART"
        case $PARTOS in
        FreeBSD)
            SLICE="$SLICE -D 165"
            PTYPE=165
            ;;
        OpenBSD)
            SLICE="$SLICE -D 166"
            PTYPE=166
            ;;
        Fedora|Linux)
            SLICE="$SLICE -D 131"
            PTYPE=131
            ;;
        *)
            ;;
        esac
    fi

    #
    # set memory limits:
    #	allow $RESIDMEM MB for non-frisbee stuff
    #	split remaining memory (min of 2MB) between network/disk buffering
    #
    HOSTMEM=$(sysctl -n hw.usermem)
    HOSTMEM=$(expr ${HOSTMEM:-0} / 1048576)
    if [ "$HOSTMEM" -ge "$(expr $RESIDMEM + 2)" ]; then
        HOSTMEM=$(expr $HOSTMEM - $RESIDMEM)
        # Despite its name this is MB * 1024; it is compared with and
        # passed to ulimit below.
        BYTES=$(expr $HOSTMEM \* 1024)
        DATASEGSZ=$(ulimit -d)
        case $DATASEGSZ in
        unlimited|"")
            # no data segment cap to honor
            ;;
        *)
            if [ "$BYTES" -gt "$DATASEGSZ" ]; then
                BYTES=$DATASEGSZ
                HOSTMEM=$(expr $BYTES / 1024)
                echo "WARNING: kernel limits buffering to $HOSTMEM MB"
            fi
            ;;
        esac
        ulimit -v $BYTES

        ## For GaTech we use more memory for disks since the disks are so slow
        #NETMEM=`expr $HOSTMEM \* 1 / 3`
        #DISKMEM=`expr $HOSTMEM \* 2 / 3`
        #MEMARGS="-C $NETMEM -W $DISKMEM"

        # For Utah, we let the client split up the memory
        # (50/50, but no more chunk buffers than there are chunks in the image)
        MEMARGS="-M $HOSTMEM"
    fi

    #
    # Make sure the necessary device files exist (only necessary on
    # FreeBSD 4.x).  Note that we create partition files for all slices,
    # not just slice 1, for the benefit of the slicefix script.
    #
    if [ -x /dev/MAKEDEV ] && [ ! -e "/dev/$DISK" ]; then
        (cd /dev; ./MAKEDEV $DISK ${DISK}s2a ${DISK}s3a ${DISK}s4a)
    fi

    if [ -n "$ADDR" ]; then
        # Classify the address; anything containing "http" is a URL,
        # otherwise a leading "/" means a local path, otherwise it is
        # taken to be a "mcastaddr:port" pair.
        case $ADDR in
        *http*)
            isurl=1
            ;;
        /*)
            ispath=1
            ;;
        esac

        if [ $isurl -eq 1 ]; then
            echo "Need to download $ADDR"

            if [ ! -d /images ]; then
                echo "Need to create or mount /images directory!"
                return 1
            fi

            #
            # This needs a lot more work ...
            #
            imagefile=$(echo "$ADDR" | sed -e 's,^http[s]*://[^/]*/,,')
            imagefile="/images/$imagefile"
        elif [ $ispath -eq 1 ]; then
            if [ ! -e "$ADDR" ]; then
                echo "$ADDR does not exist!"
                return 1
            fi
            imagefile="$ADDR"
        else
            PORT=$(echo "$ADDR" | awk -F: '{ printf "%s", $2 }')
            MCAST=$(echo "$ADDR" | awk -F: '{ printf "%s", $1 }')
            if [ -e "$BOOTDIR/myip" ]; then
                MCASTIF="-i $(cat "$BOOTDIR/myip")"
            else
                MCASTIF=""
            fi
            MCASTADDR="-m $MCAST -p $PORT"
            IMAGEID="$MCASTIF $MCASTADDR"
        fi
    else
        #
        # Note: if you want to use broadcast rather than multicast as
        # the distribution method, add "-X bcast" to the IMAGEID= below.
        #
        IMAGEID="-B 30 -F $IMAGEID"
    fi

    #
    # ZFILL==1: use frisbee
    # ZFILL==2: separate disk-wipe pass (not yet implemented)
    #
    if [ "$ZFILL" != "0" ]; then
        ZFILL="-z"
    else
        ZFILL=""
    fi

    if [ "$KEEPALIVE" != "0" ]; then
        KA="-K $KEEPALIVE"
    else
        KA=""
    fi

    #
    # Make sure the write-cache is enabled on SCSI disks.  It makes a
    # huge difference.  We don't worry about data corruption in the
    # case of a crash, because we will just reload the disk again anyway
    # in that situation.
    #
    turncacheoff=0
    case $DISK in
    da*)
        if [ -x "$BINDIR/camwce" ] && "$BINDIR/camwce" on "$DISK"; then
            turncacheoff=1
        fi
        ;;
    esac

    #
    # For slice images, ensure that the MBR is the correct version
    # and replace if not.
    #
    if [ "$_NUM" -eq 0 ]; then
        if [ "$PART" != "0" ]; then
            tweakmbr "$DISK" "$MBRVERS" "$PREPARE"
        fi
        FIRSTMBR=$MBRVERS
    else
        if [ "$FIRSTMBR" != "$MBRVERS" ]; then
            echo "MBR Mismatch: First MBR is \"$FIRSTMBR\" while image #$_NUM is \"$MBRVERS\""
        fi
    fi

    #
    # If a remote node and we have a URL, make sure that we have a place
    # to put it. Done after the MBR tweak of course. Then download the URL.
    #
    if [ "$isrem" -eq 1 ] && [ $isurl -eq 1 ]; then
        echo "Downloading image '$ADDR' to /images directory ..."
        "$BINDIR/mkextrafs.pl" -c -s 4 -r "$DISK" /images || {
            # XXX run growdisk to ensure we have a partition in the MBR
            "$BINDIR/growdisk" -vW "/dev/$DISK" >/dev/null 2>&1
            "$BINDIR/mkextrafs.pl" -n -f -s 4 -r "$DISK" /images || {
                echo "Could not create /images partition"
                return 1
            }
        }
        wget -nv -N -P /images "$ADDR"
        wstat=$?
        case $wstat in
        0)
            echo "wget succeeded getting the image"
            ;;
        *)
            echo "wget failed, status $wstat"
            return 1
            ;;
        esac
    fi

    #
    # If not zeroing the disk and we are loading a full disk image
    # we need to ensure that we at least invalidate any old superblocks
    # that might leak through (most likely in partition 4 which isn't
    # touched by our current image).  We do this before running frisbee
    # so that any legit filesystems loaded from the image work.
    #
    # Since we do it before frisbee, we are counting on the current
    # MBR being the same as the MBR being laid down.  While not
    # a reasonable assumption in general, it mostly works in our
    # environment and at least won't hurt anything if not true.
    #
    if [ "$PREPARE" -eq 1 ] ||
       { [ "$isrem" -eq 0 ] && [ -z "$ZFILL" ] && [ "$PART" = "0" ]; }; then
        zapsuperblocks "$DISK"
    fi

    # The option variables below are intentionally unquoted: each holds
    # zero or more separate command-line words.
    if [ -n "$imagefile" ]; then
        echo "Running /usr/local/bin/imageunzip -o -O -W 32 $ZFILL $imagefile /dev/${DISK}s${PART}"
        /usr/local/bin/imageunzip -o -O -W 32 $ZFILL $imagefile /dev/${DISK}s${PART}
    else
        echo "Running $BINDIR/frisbee -S $SERVER $MEMARGS $KA $ZFILL $SLICE $IMAGEID /dev/$DISK at $(date)"
        $BINDIR/frisbee -S $SERVER $MEMARGS $KA $ZFILL $SLICE $IMAGEID /dev/$DISK
    fi
    fstat=$?

    #
    # If we mounted a partition from the disk to store the image,
    # we must unmount it now so that slicefix and others don't fail
    # due to an in-use partition.
    #
    if [ "$isrem" -eq 1 ] && [ $isurl -eq 1 ]; then
        umount /images || {
            echo "WARNING: could not unmount /images"
        }
    fi

    #
    # Turn the cache back off if we turned it on.
    # Is this sufficient to ensure the cache gets flushed?
    #
    if [ $turncacheoff -eq 1 ]; then
        "$BINDIR/camwce" off "$DISK"
    fi

    case $fstat in
    0)
        echo "Adjusting slice-related files"
        # slicefix picks these settings up from its environment
        export SLICEFIX_ACPI=$ACPI
        export SLICEFIX_ASF=$ASF
        export SLICEFIX_NOCLFLUSH=$NOCLFLUSH
        export SLICEFIX_VGAONLY=$VGAONLY
        "$BINDIR/slicefix" "$PART" "$DISK"
        echo "Image #$_NUM load complete at $(date)"
        return 0
        ;;
    *)
        echo "Frisbee run failed, status $fstat"
        ;;
    esac
    return 1
}

# Report that reload setup has started, then fetch boss and node status
"$BINDIR/tmcc" state RELOADSETUP

BOSSINFO=$("$BINDIR/tmcc" bossinfo)
STATUS=$("$BINDIR/tmcc" status)

# The second word of the bossinfo reply is used as the boss address
BOSSIP=$(echo $BOSSINFO | awk '{ print $2 }')

# Step the clock from boss when possible; failure here is not fatal
if [ -x /usr/sbin/ntpdate ] && [ -n "$BOSSIP" ]; then
    /usr/sbin/ntpdate -b "$BOSSIP" >/dev/null 2>&1
fi

# Enable IPoD
if [ -r "$BINDIR/rc.ipod" ]; then
    . "$BINDIR/rc.ipod"
fi
494
495

#
Mike Hibler's avatar
Mike Hibler committed
496
497
# Assign each line (one image) to one of the positional parameters.
# This is done by setting IFS to a newline and using set.
498
499
# XXX there must be a better way to do this!
#
Mike Hibler's avatar
Mike Hibler committed
500
OIFS="$IFS"
IFS='
'
# Pathname expansion is switched off while splitting so that a line
# containing glob characters (e.g. a URL with "?") is kept verbatim.
set -f
set -- $("$BINDIR/tmcc" loadinfo)
set +f
IFS="$OIFS"
if [ -z "$1" ]; then
    echo "No load information for node"
    exit 1
fi

"$BINDIR/tmcc" state RELOADING

512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
# HACK ALERT: If we're reloading we need to zap the superblocks and
# MBRs of any other disks in the system.  This is to prevent Linux from
# finding an old filesystem with the same label or UUID and mounting
# that instead of the right one.  We skip the disks that are mounted
# and the disk we're going to write to.
# DOUBLE HACK ALERT: Changed this to zap all disks to avoid having
# to figure out what the other disks are when loading multiple images.
# Since a new MBR will be laid down anyway there is no harm in doing
# this as long as we are sure we are in the reloading experiment.
case $STATUS in
*ALLOCATED=emulab-ops/reloading*)
    disks=$(find_disks)
    for d in $disks; do
        #[ $d = $DISK ] && continue
        # Skip a disk that has anything mounted from it.  The character
        # after the name must be a non-digit so that, e.g., a mounted
        # ad10 does not cause ad1 to be skipped.
        mount | grep "^/dev/${d}[^0-9]" >/dev/null && continue
        zapsuperblocks "$d"
        echo "Invalidating MBR on $d"
        dd if=/dev/zero of="/dev/$d" bs=512 count=16
    done
    ;;
esac

534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
#
# Load each image in turn.
# If a load fails, we exit non-zero so that the rc script will drop into
# single-user mode.  If all loads succeed we either reboot or continue with
# the rc script as desired by the caller.
#
# Walk the positional parameters (one image each), numbering from zero;
# the first failed load ends the script with a non-zero status.
NUM=0
until [ "$1"x = x ]; do
    if ! loadone "$1" $NUM; then
        echo "Failed to load disk, dropping to login prompt at `date`"
        exit 1
    fi
    shift
    NUM=$(($NUM + 1))
done
549
echo "Frisbee run(s) finished"

# DISK is whatever the last loadone call left it set to
echo "Resizing final disk partition"
"$BINDIR/growdisk" -vW "/dev/$DISK"
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586

#
# If requested to reboot, do so.
#
# Note: there is a race condition with stated here.
# If we reboot immediately after sending RELOADDONE,
# it is possible that, under heavy server load, we will
# finish the reboot and reach the bootinfo stage before
# stated gets and processes our RELOADDONE.  So now we
# wait around after sending the RELOADDONE.  stated should
# force us to reboot when the transition takes place.
# For backward compatibility we use a new state: RELOADDONEV2.
# For paranoia we just wait around for awhile and then
# reboot anyway, just in case stated's reboot fails for
# some reason.
#
if [ $reboot -eq 1 ]; then
    $BINDIR/tmcc state RELOADDONEV2
    echo "Waiting for server to reboot us ..."
    # Widearea nodes wait 30 seconds for the server, everyone else 240
    _grace=240
    if [ $isrem -eq 1 ]; then
        _grace=30
    fi
    sleep $_grace
    echo "No response from server, rebooting myself ..."
    /sbin/reboot
    # Linger so nothing below runs while the reboot takes effect
    sleep 100
else
    # Caller asked us not to reboot: just report completion
    $BINDIR/tmcc state RELOADDONE
fi

echo "Frisbee finished"

exit 0