#!/bin/sh
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2007 University of Utah and the Flux Group.
# All rights reserved.
#

#
# Optional flag argument says "do not reboot"
#
# reboot=1 means reboot once all images are loaded; a single "-noreboot"
# argument clears it.
#
reboot=1
if [ $# -eq 1 ] && [ "$1" = "-noreboot" ]; then
    reboot=0
fi

#
# Amount of memory in MB to leave for everyone else in the system.  If you
# get out-of-memory or vm_pager error while running frisbee, increase this.
#
RESIDMEM=32

#
# Pick up the standard path variables if the client side provides them,
# otherwise fall back to the old /etc/testbed layout.
#
if [ -r /etc/emulab/paths.sh ]; then
	. /etc/emulab/paths.sh
else
	BINDIR=/etc/testbed
	BOOTDIR=/etc/testbed
	ETCDIR=/etc/testbed
fi

# Behave a little different on widearea nodes.
isrem=0
if [ -e "$ETCDIR/isrem" ]; then
    isrem=1
fi

#
# Update the MBR of the given disk to the indicated "version."
#
# Usage: tweakmbr <disk> <version>
#	<disk>		disk name relative to /dev
#	<version>	MBR version wanted; the replacement is read from
#			/etc/emulab/mbr<version>.dd
#
# XXX this is somewhat of a hack right now.  We recognize two
# versions of the MBR:
#	v1 (partition 1 size 6281352)
#	v2 (partition 1 size 12305790)
# Currently we only install a new MBR if the existing one is the
# wrong size, just in case the user has customized the boot program.
#
# Every problem is reported as a WARNING on stdout and leaves the disk
# untouched.  Sets the globals _DSK, _NEW, _size and _CUR.
#
tweakmbr() {
    _DSK=$1
    _NEW=$2

    # Probe sector 0 first; if we cannot even read it, do not try to write it.
    dd if="/dev/$_DSK" of=/dev/null bs=512 count=1 2>/dev/null || {
	echo "WARNING: could not read from $_DSK, MBR not changed"
	return
    }

    # Size of partition 1 as reported in the "fdisk -s" summary table.
    _size=`fdisk -s "$_DSK" 2>/dev/null | sed -n -e 's/^ *1: *[0-9][0-9]* *\([0-9][0-9]*\).*$/\1/p'`
    case "${_size}s" in
    6281352s)
	_CUR=1
	;;
    12305790s)
	_CUR=2
	;;
    s)
	# special case: no part1 so probably no MBR at all, make sure we install
	echo "Found no MBR on $_DSK, installing version $_NEW"
	_CUR=1000000
	;;
    *)
	echo "WARNING: custom MBR on $_DSK, not changed"
	return
	;;
    esac

    # Quoted so that a missing version argument cannot yield a malformed test.
    if [ "$_CUR" = "$_NEW" ]; then
	return
    fi

    # now set it if we can
    if [ ! -r "/etc/emulab/mbr${_NEW}.dd" ]; then
	echo "WARNING: cannot find MBR version $_NEW, not installed"
	return
    fi

    echo "Installing MBR version $_NEW ..."
    dd if="/etc/emulab/mbr${_NEW}.dd" of="/dev/$_DSK" bs=512 count=1
}

#
# Print, on one line separated by spaces, the disks listed in the boot
# messages.
#
# Usage: find_disks [dmesg-file]
#	dmesg-file	file to scan (default /var/run/dmesg.boot)
#
# A line counts when it looks like "<name><unit>: <size>MB ..." and the
# device name starts with ad, da, ar or aacd.  The rest of the line is
# dropped before the name filter is applied, so words from the device
# description can never be mistaken for disk names.
#
find_disks() {
	local dmesg disks d

	dmesg=${1:-/var/run/dmesg.boot}

	for d in `sed -n 's/^\([a-z]*[0-9][0-9]*\): [0-9][0-9]*MB.*$/\1/p' "$dmesg"`; do
		case $d in
		ad*|da*|ar*|aacd*)
			disks="$disks $d"
			;;
		esac
	done

	# Unquoted on purpose: collapses the leading separator.
	echo $disks
}

#
# Function to zero all potential superblocks in the DOS partitions that
# could interfere with the OSes on the image being loaded.
#
# Usage: zapsuperblocks <disk>
#	<disk>	disk name relative to /dev
#
# FreeBSD 4 or 5 goes out of its way to make this hard.  In FBSD4, we
# cannot overwrite the beginning of partitions that have a legit superblock.
# In FBSD5, DOS partitions that have a zero type cannot even be accessed.
# So we have to use the whole-disk special file using offsets extracted
# via fdisk.
#
# Does nothing (silently) if fdisk reports no partitions.  A failed write
# is reported as a WARNING and the remaining offsets are still processed.
#
zapsuperblocks() {
    local DSK offs off

    DSK=$1

    #
    # Note we are not overly concerned about the consequences of misparsing
    # the fdisk output.  If we whack random blocks, it doesn't hurt anything.
    #
    offs=`fdisk -s "$DSK" 2>/dev/null | sed -n -e 's/^[ 0-9]*: *\([0-9]*\).*$/\1/p'`

    if [ -z "$offs" ]; then
        return
    fi

    printf '%s' "Invalidating old potential superblocks on $DSK: "
    for off in $offs; do
        printf '%s ' "$off"
	# Zero 16 blocks starting at the partition's start offset.
	dd if=/dev/zero of="/dev/${DSK}" oseek=$off count=16 >/dev/null 2>&1 || {
	    echo "WARNING: failed to invalidate $off"
	}
    done
    echo ""

    return
}

#
# Print the value of <key> from a loadinfo line of the form
# "KEY=value KEY=value ...".  Prints nothing if the key is absent, so
# that the caller's ${VAR:-default} fallbacks take effect.
#
# Usage: loadinfo_get <loadinfo-line> <key>
#
loadinfo_get() {
    printf '%s\n' "$1" | sed -n -e "s/.*$2=\([^[:space:]]*\).*/\1/p"
}

#
# Function to load a single image on a disk
#
# Usage: loadone <loadinfo-line> <image-number>
#
# Keys read from the loadinfo line (default when absent):
#	ADDR	where the image comes from: an http(s) URL, an absolute
#		path, or <mcast-addr>:<port> for frisbee (required)
#	PART	slice to load, 0 for the whole disk (0)
#	PARTOS	FreeBSD, OpenBSD, Fedora or Linux; selects the -D slice type
#	DISK	target disk (ad0)
#	ZFILL	non-zero passes -z to the loader (0)
#	ACPI	exported to slicefix as SLICEFIX_ACPI (unknown)
#	ASF	exported to slicefix as SLICEFIX_ASF (unknown)
#	MBRVERS	MBR version handed to tweakmbr (1)
#	PREPARE	1 forces zapsuperblocks before loading (0)
#
# Globals read: RESIDMEM, BINDIR, BOOTDIR, LOADIP, isrem, and FIRSTMBR
# (set by the call for image #0, compared by later calls).
#
# Returns 0 if the image was loaded, 1 otherwise.
#
loadone() {
    LOADINFO=$1
    NUM=$2

    echo "Loading image #$NUM"

    #
    # These are only assigned on some paths below.  Clear them so that
    # nothing leaks from the previous image when loading several in a row.
    #
    SLICE=""
    PTYPE=""
    MEMARGS=""
    MCASTIF=""
    MCASTADDR=""
    imagefile=""
    isurl=0
    ispath=0

    ADDRESS=`loadinfo_get "$LOADINFO" ADDR`
    PARTITION=`loadinfo_get "$LOADINFO" PART`
    PARTITION=${PARTITION:-'0'}
    PARTOS=`loadinfo_get "$LOADINFO" PARTOS`
    DISK=`loadinfo_get "$LOADINFO" DISK`
    DISK=${DISK:-'ad0'}
    ZFILL=`loadinfo_get "$LOADINFO" ZFILL`
    ZFILL=${ZFILL:-'0'}
    ACPI=`loadinfo_get "$LOADINFO" ACPI`
    ACPI=${ACPI:-'unknown'}
    ASF=`loadinfo_get "$LOADINFO" ASF`
    ASF=${ASF:-'unknown'}
    MBR=`loadinfo_get "$LOADINFO" MBRVERS`
    MBR=${MBR:-'1'}
    PREPARE=`loadinfo_get "$LOADINFO" PREPARE`
    PREPARE=${PREPARE:-'0'}

    if [ "$PARTITION" != "0" ]; then
	SLICE="-s $PARTITION"
	case "$PARTOS" in
	FreeBSD)
	    SLICE="$SLICE -D 165"
	    PTYPE=165
	    ;;
	OpenBSD)
	    SLICE="$SLICE -D 166"
	    PTYPE=166
	    ;;
	Fedora|Linux)
	    SLICE="$SLICE -D 131"
	    PTYPE=131
	    ;;
	*)
	    ;;
	esac
    fi

    #
    # set memory limits:
    #	allow $RESIDMEM MB for non-frisbee stuff
    #	split remaining memory (min of 2MB) between network/disk buffering
    #
    HOSTMEM=`sysctl -n hw.usermem`
    HOSTMEM=`expr ${HOSTMEM:-0} / 1048576`
    if [ "$HOSTMEM" -ge `expr $RESIDMEM + 2` ]; then
	HOSTMEM=`expr $HOSTMEM - $RESIDMEM`
	# Despite the name this is MB * 1024, the unit compared against
	# "ulimit -d" and handed to "ulimit -v" below.
	BYTES=`expr $HOSTMEM \* 1024`
	DATASEGSZ=`ulimit -d`
	# "unlimited" is not a number; there is nothing to clamp to then.
	if [ "$DATASEGSZ" != "unlimited" ] && [ "$BYTES" -gt "$DATASEGSZ" ]; then
	    BYTES=$DATASEGSZ
	    HOSTMEM=`expr $BYTES / 1024`
	    echo "WARNING: kernel limits buffering to $HOSTMEM MB"
	fi
	ulimit -v $BYTES

	## For GaTech we use more memory for disks since the disks are so slow
	#NETMEM=`expr $HOSTMEM \* 1 / 3`
	#DISKMEM=`expr $HOSTMEM \* 2 / 3`
	#MEMARGS="-C $NETMEM -W $DISKMEM"

	# For Utah, we let the client split up the memory
	# (50/50, but no more chunk buffers than there are chunks in the image)
	MEMARGS="-M $HOSTMEM"
    fi

    #
    # Make sure the necessary device files exist (only necessary on FreeBSD 4.x)
    # Note that we create partition files for all slices, not just slice 1,
    # for the benefit of the slicefix script.
    #
    if [ -x /dev/MAKEDEV ] && [ ! -e "/dev/$DISK" ]; then
	(cd /dev; ./MAKEDEV $DISK ${DISK}s2a ${DISK}s3a ${DISK}s4a)
    fi

    if [ -z "$ADDRESS" ]; then
	echo "Unable to get address for loading image"
	return 1
    fi

    #
    # Work out where the image comes from.
    #
    case "$ADDRESS" in
    http://*|https://*)
	echo "Need to download $ADDRESS"

	isurl=1
	if [ ! -d /images ]; then
	    echo "Need to create or mount /images directory!"
	    return 1
	fi

	#
	# This needs a lot more work ...
	#
	# NOTE(review): the image is expected at /images/<URL path>, but
	# the wget below is run with -P /images and no -x -- confirm the
	# two agree for URLs that contain subdirectories.
	#
	imagefile=`echo "$ADDRESS" | sed -e 's,^http[s]*://[^/]*/,,'`
	imagefile="/images/$imagefile"
	;;
    /*)
	ispath=1

	if [ ! -e "$ADDRESS" ]; then
	    echo "$ADDRESS does not exist!"
	    return 1
	fi
	imagefile="$ADDRESS"
	;;
    *)
	# <mcast-addr>:<port>
	PORT=`echo "$ADDRESS" | awk -F: '{ print $2 }'`
	MCAST=`echo "$ADDRESS" | awk -F: '{ print $1 }'`
	if [ -e "$BOOTDIR/myip" ]; then
	    MCASTIF="-i `cat $BOOTDIR/myip`"
	fi
	MCASTADDR="-m $MCAST -p $PORT"
	;;
    esac

    #
    # ZFILL==1: use frisbee
    # ZFILL==2: separate disk-wipe pass (not yet implemented)
    #
    if [ "$ZFILL" != "0" ]; then
	ZFILL="-z"
    else
	ZFILL=""
    fi

    #
    # Make sure the write-cache is enabled on SCSI disks.  It makes a
    # huge difference.  We don't worry about data corruption in the
    # case of a crash, because we will just reload the disk again anyway
    # in that situation.
    #
    turncacheoff=0
    case "$DISK" in
    da*)
	if [ -x "$BINDIR/camwce" ] && $BINDIR/camwce on $DISK; then
	    turncacheoff=1
	fi
	;;
    esac

    #
    # For slice images, ensure that the MBR is the correct version
    # and replace if not.
    #
    if [ "$NUM" -eq 0 ]; then
	if [ "$PARTITION" != "0" ]; then
	    tweakmbr "$DISK" "$MBR"
	fi
	FIRSTMBR=$MBR
    elif [ "$FIRSTMBR" != "$MBR" ]; then
	echo "MBR Mismatch: First MBR is \"$FIRSTMBR\" while image #$NUM is \"$MBR\""
    fi

    #
    # If a remote node and we have a URL, make sure that we have a place
    # to put it. Done after the MBR tweak of course. Then download the URL.
    #
    if [ "${isrem:-0}" -eq 1 ] && [ $isurl -eq 1 ]; then
	echo "Downloading image '$ADDRESS' to /images directory ..."
	$BINDIR/mkextrafs.pl -c -s 4 -r $DISK /images || {
	    $BINDIR/mkextrafs.pl -n -f -s 4 -r $DISK /images || {
		echo "Could not create /images partition"
		return 1
	    }
	}
	wget -nv -N -P /images "$ADDRESS"
	wstat=$?
	case $wstat in
	0)
	    echo "wget succeeded getting the image"
	    ;;
	*)
	    echo "wget failed, status $wstat"
	    return 1
	    ;;
	esac
    fi

    #
    # If not zeroing the disk and we are loading a full disk image
    # we need to ensure that we at least invalidate any old superblocks
    # that might leak through (most likely in partition 4 which isn't
    # touched by our current image).  We do this before running frisbee
    # so that any legit filesystems loaded from the image work.
    #
    # Since we do it before frisbee, we are counting on the current
    # MBR being the same as the MBR being laid down.  While not
    # a reasonable assumption in general, it mostly works in our
    # environment and at least won't hurt anything if not true.
    #
    if [ "$PREPARE" = "1" ] ||
       { [ "${isrem:-0}" -eq 0 ] && [ -z "$ZFILL" ] && [ "$PARTITION" = "0" ]; }
    then
	zapsuperblocks "$DISK"
    fi

    if [ -n "$imagefile" ]; then
	# A whole-disk image goes to the disk itself, not to "slice 0".
	if [ "$PARTITION" = "0" ]; then
	    target="/dev/$DISK"
	else
	    target="/dev/${DISK}s${PARTITION}"
	fi
	echo "Running /usr/local/bin/imageunzip -o -O -W 32 $ZFILL $imagefile $target"
	/usr/local/bin/imageunzip -o -O -W 32 $ZFILL "$imagefile" "$target"
    else
	echo "Running $BINDIR/frisbee $LOADIP $MEMARGS $ZFILL $SLICE $MCASTIF $MCASTADDR /dev/$DISK at `date`"
	$BINDIR/frisbee $LOADIP $MEMARGS $ZFILL $SLICE $MCASTIF $MCASTADDR /dev/$DISK
    fi
    fstat=$?

    #
    # If we mounted a partition from the disk to store the image,
    # we must unmount it now so that slicefix and others don't fail
    # due to an in-use partition.
    #
    if [ "${isrem:-0}" -eq 1 ] && [ $isurl -eq 1 ]; then
	umount /images || {
	    echo "WARNING: could not unmount /images"
	}
    fi

    #
    # Turn the cache back off if we turned it on.
    # Is this sufficient to ensure the cache gets flushed?
    #
    if [ $turncacheoff -eq 1 ]; then
	$BINDIR/camwce off $DISK
    fi

    if [ "$fstat" -ne 0 ]; then
	echo "Frisbee run failed, status $fstat"
	return 1
    fi

    echo "Resizing final disk partition"
    $BINDIR/growdisk -vW /dev/$DISK
    echo "Adjusting slice-related files"
    export SLICEFIX_ACPI=$ACPI
    export SLICEFIX_ASF=$ASF
    $BINDIR/slicefix $PARTITION $DISK
    echo "Image #$NUM load complete at `date`"
    return 0
}

# Report that reload setup has started, then fetch what we need from boss.
$BINDIR/tmcc state RELOADSETUP

BOSSINFO=`$BINDIR/tmcc bossinfo`
STATUS=`$BINDIR/tmcc status`

# For testing purposes.
#BOSSINFO='boss.emulab.net 155.101.128.70'
#LOADINFO='ADDR=234.5.6.69:4444'

# Second field of the bossinfo reply is the IP address.
BOSSIP=`echo $BOSSINFO | awk '{ print $2 }'`

# XXX should be part of loadinfo
# Never emit a bare "-S": frisbee would take its next option as the server.
if [ -n "$BOSSIP" ]; then
    LOADIP="-S $BOSSIP"
else
    echo "WARNING: could not determine boss IP address"
    LOADIP=""
fi

if [ -n "$BOSSIP" ] && [ -x /usr/sbin/ntpdate ]; then
	/usr/sbin/ntpdate -b $BOSSIP >/dev/null 2>&1
fi

# Enable IPoD
if [ -r "$BINDIR/rc.ipod" ]; then
    . $BINDIR/rc.ipod
fi

#
# Break the load info into lines by setting IFS to a newline.
# XXX there must be a better way to do this!
#
# Each line of the reply becomes one positional parameter ($1, $2, ...),
# i.e. one image to load.  Globbing is switched off around the split so
# that a reply containing wildcard characters is taken literally.
#
OIFS=$IFS
IFS='
'
set -f
set -- `$BINDIR/tmcc loadinfo`
set +f
IFS=$OIFS
if [ -z "$1" ]; then
    echo "No load information for node"
    exit 1
fi

$BINDIR/tmcc state RELOADING

# HACK ALERT: If we're reloading we need to zap the superblocks and
# MBRs of any other disks in the system.  This is to prevent Linux from
# finding an old filesystem with the same label or UUID and mounting
# that instead of the right one.  We skip the disks that are mounted
# and the disk we're going to write to.
# DOUBLE HACK ALERT: Changed this to zap all disks to avoid having
# to figure out what the other disks are when loading multiple images.
# Since a new MBR will be laid down anyway there is no harm in doing
# this as long as we are sure we are in the reloading experiment.
case $STATUS in
	*ALLOCATED=emulab-ops/reloading*)
		disks=`find_disks`
		for d in $disks; do
			#[ $d = $DISK ] && continue
			# Skip a disk with anything mounted from it.  The
			# trailing [^0-9] keeps "ad1" from matching "ad10".
			mount | grep "^/dev/${d}[^0-9]" > /dev/null && continue
			zapsuperblocks $d
			echo "Invalidating MBR on $d"
			dd if=/dev/zero of=/dev/$d bs=512 count=16
		done
		;;
esac

#
# Walk the positional parameters (one loadinfo line each) and load the
# images in order, numbering them from 0.
# A failed load makes us exit non-zero so that the rc script will drop
# into single-user mode.  When every load succeeds we fall through and
# either reboot or return to the rc script, whichever the caller asked for.
#
NUM=0
until [ "$1"x = x ]; do
    if ! loadone "$1" $NUM; then
	echo "Failed to load disk, dropping to login prompt at `date`"
	exit 1
    fi
    shift
    NUM=$(($NUM + 1))
done

echo "Frisbee run finished"

#
# Reboot if that is what the caller wanted.
#
# There is a race with stated here: if we reboot right after sending
# RELOADDONE, a heavily loaded server may not have processed it by the
# time we come back up and reach the bootinfo stage.  So we send the
# newer RELOADDONEV2 state (a distinct name, for backward compatibility)
# and then just wait; stated should force the reboot when the transition
# takes place.  Out of paranoia we only wait for a bounded time and then
# reboot ourselves, in case stated's reboot fails for some reason.
#
if [ $reboot -eq 1 ]; then
    $BINDIR/tmcc state RELOADDONEV2
    echo "Waiting for server to reboot us ..."
    # Widearea nodes wait 30 seconds, everyone else 240.
    _rebootwait=240
    if [ $isrem -eq 1 ]; then
	_rebootwait=30
    fi
    sleep $_rebootwait
    echo "No response from server, rebooting myself ..."
    /sbin/reboot
    sleep 100
else
    $BINDIR/tmcc state RELOADDONE
fi

echo "Frisbee finished"

exit 0