Commit 2f3b1c67 authored by Mike Hibler
Browse files

Add TRIM support to clientside of frisbee MFS.

Currently this is done using the -E (erase) option to zapdisk to
erase a partition or the entire disk.

We run zapdisk before starting a frisbee load. Ultimately, I want
TRIM support in frisbee proper (to erase rather than skip or zero
free blocks), but erasing the entire disk might be a better option
anyway (frisbee itself will benefit from the TRIMed disk).

Notes:

We only trim the disk when a node is in the reloading experiment. Didn't
want to take a chance on having a user reload a partition on their disk and
wind up getting the entire disk erased. That should not happen, but I didn't
want to think through all the possibilities right now.

Only works on FreeBSD right now. Linux MFS will still need to be modified.
Just didn't want to track down how Linux does block erase right now.

The clientside here is different than what is used on the Moonshots.
On the Linux-based Moonshot MFS, we TRIM in slicefix by running fstrim
on the root FS we just laid down. This used to be a good alternative
back when the root FS occupied the entire disk, but now it won't work
as well, since we will only trim blocks from the first 6GB. Need to switch
this over to the zapdisk (or frisbee) based TRIM.

The server infrastructure on the Moonshots is the same: tmcc passes a
TRIM=1 flag in "loadinfo" when the disk should be trimmed. So when to
TRIM is based on the time since the last TRIM, and can now be specified
site-wide, per-nodetype, or per-node.

Currently, slicefix (x86 or ARM) doesn't do anything to the FS laid
down to ensure that TRIM is done during the lifetime of the FS. Both
FreeBSD and Linux have such options that can be set when the FS is
created or added with tunefs. We need to think about this for mkextrafs
and local blockstores as well.
parent f927aba6
/*
* Copyright (c) 2016 University of Utah and the Flux Group.
*
* {{{EMULAB-LICENSE
*
* This file is part of the Emulab network testbed software.
*
* This file is free software: you can redistribute it and/or modify it
* under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or (at
* your option) any later version.
*
* This file is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
* License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this file. If not, see <http://www.gnu.org/licenses/>.
*
* }}}
*/
/*
* Support for disk erase operations (e.g., TRIM).
*/
#include <sys/types.h>
#include <sys/ioctl.h>
#ifdef __FreeBSD__
#include <sys/disk.h>
#endif
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/* Alignment unit (bytes) used for I/O buffers and for offset/length checks. */
#define SECSIZE 512
/* Round pointer p up to the next SECSIZE boundary; yields a void *. */
#define SECALIGN(p) (void *)(((uintptr_t)(p) + (SECSIZE-1)) & ~(SECSIZE-1))
/* True if p (a pointer or an integer value) is a multiple of SECSIZE. */
#define ISSECALIGNED(p) (((uintptr_t)(p) & (SECSIZE-1)) == 0)
/* Erase granularity (bytes) reported by erasebsize() when DIOCGDELETE exists. */
#define ERASEMINSIZE 4096
/* Upper bound (bytes) on the zero-filled buffer written by zeroit(). */
#define ZEROSIZE (256*1024)
/* Cached erase block size; -1 means "not yet determined". */
static off_t ebsize = -1;
/*
 * NOTE(review): erased/zeroed are not referenced anywhere in the visible
 * part of this file; presumably intended as running byte counts -- confirm.
 */
static off_t erased = 0;
static off_t zeroed = 0;
/* Forward declaration: overwrite [offset, offset+count) with zeros. */
static int zeroit(int fd, off_t offset, off_t count);
/*
 * Return the erase block size in bytes.
 *
 * The value is computed on the first call and cached in ebsize.
 * A return of 0 means no erase operation was compiled in
 * (DIOCGDELETE is not defined).
 */
off_t
erasebsize(void)
{
	/* Already determined by an earlier call. */
	if (ebsize >= 0)
		return ebsize;

#ifdef DIOCGDELETE
	/* XXX this seems to be the minimum */
	ebsize = ERASEMINSIZE;
#else
	ebsize = 0;
#endif
	return ebsize;
}
#ifdef DIOCGDELETE
/*
 * Erase (TRIM) the byte range [offset, offset+ecount) of the device open
 * on fd using the DIOCGDELETE ioctl.
 *
 * offset and ecount must be multiples of SECSIZE (asserted).  Only whole
 * erase blocks (see erasebsize()) inside the range are passed to the
 * ioctl; any leading or trailing partial block is overwritten with zeros
 * instead.  If the range does not contain a single complete erase block,
 * the whole range is zeroed.
 *
 * If erase is unavailable or the ioctl fails, the entire range is zeroed
 * when zeroonfail is non-zero; otherwise -1 is returned.
 *
 * Returns 0 on success, -1 on failure.
 */
int
erasedata(int fd, off_t offset, off_t ecount, int zeroonfail)
{
	off_t args[2];
	off_t toff, tend, tcnt;
	off_t bsize = erasebsize();

	if (bsize == 0) {
		if (zeroonfail)
			return zeroit(fd, offset, ecount);
		return -1;
	}

	/*
	 * Check alignment/length of what the caller gave us.
	 * (Must test the arguments; toff is not yet set here.)
	 */
	assert(ISSECALIGNED(offset));
	assert(ISSECALIGNED(ecount));

	/*
	 * Shrink the range inward to erase-block boundaries:
	 * round the start up and the end down.
	 */
	toff = offset;
	tend = offset + ecount;
	if ((toff % bsize) != 0)
		toff = ((toff + bsize-1) / bsize) * bsize;
	if ((tend % bsize) != 0)
		tend = (tend / bsize) * bsize;

	/*
	 * No complete erase block in the range: everything is "excess",
	 * so treat it like the partial edge blocks below and zero it
	 * rather than issuing an ioctl with a zero or negative length.
	 */
	if (tend <= toff)
		return zeroit(fd, offset, ecount);
	tcnt = tend - toff;

	/* DIOCGDELETE takes { byte offset, byte length }. */
	args[0] = toff;
	args[1] = tcnt;
	if (ioctl(fd, DIOCGDELETE, args) < 0) {
		fprintf(stderr,
			"DIOCGDELETE of [%lld-%lld] failed (%d)\n",
			(long long)toff,
			(long long)(toff + tcnt - 1), errno);
		if (zeroonfail)
			return zeroit(fd, offset, ecount);
		return -1;
	}

	/*
	 * Take care of leading and trailing blocks we could not erase.
	 */
	if (toff > offset && zeroit(fd, offset, toff - offset))
		return -1;
	if (tend < offset + ecount &&
	    zeroit(fd, tend, offset + ecount - tend))
		return -1;

	return 0;
}
#else
/*
 * Fallback used when DIOCGDELETE is not defined: no erase operation is
 * available, so always report failure.
 *
 * NOTE(review): zeroonfail is ignored here, whereas the DIOCGDELETE
 * variant falls back to writing zeros when it is set -- confirm whether
 * that difference is intended.
 */
int
erasedata(int fd, off_t offset, off_t ecount, int zeroonfail)
{
	/* Nothing is done with the arguments in this build. */
	(void)fd;
	(void)offset;
	(void)ecount;
	(void)zeroonfail;

	return -1;
}
#endif
/*
 * Overwrite count bytes of fd, starting at byte offset, with zeros.
 *
 * Seeks to offset, then writes from a SECSIZE-aligned zero buffer of at
 * most ZEROSIZE bytes until count bytes have been written.  A short
 * write is treated as an error.
 *
 * Returns 0 on success (including count <= 0, where nothing is written),
 * -1 on seek, allocation or write failure.
 */
static int
zeroit(int fd, off_t offset, off_t count)
{
	char *rawbuf, *buf;
	size_t bsize;
	int err = 0;

	if (lseek(fd, offset, SEEK_SET) < 0) {
		perror("lseek to write zeros");
		return -1;
	}

	/* Nothing to write; also keeps a negative count out of malloc(). */
	if (count <= 0)
		return 0;

	/* Buffer is the smaller of the request and ZEROSIZE. */
	if (count < (off_t)ZEROSIZE)
		bsize = (size_t)count;
	else
		bsize = ZEROSIZE;

	/* Over-allocate by SECSIZE so the buffer can be aligned. */
	rawbuf = malloc(bsize + SECSIZE);
	if (rawbuf == NULL) {
		fprintf(stderr, "Could not allocated zero buffer\n");
		return -1;
	}
	buf = SECALIGN(rawbuf);
	memset(buf, 0, bsize);

	while (count > 0) {
		size_t wsize;
		ssize_t cc;

		if (count < (off_t)bsize)
			wsize = (size_t)count;
		else
			wsize = bsize;

		cc = write(fd, buf, wsize);
		if (cc < 0 || (size_t)cc != wsize) {
			fprintf(stderr, "Could not write zeros\n");
			err = -1;
			break;
		}
		count -= (off_t)wsize;
	}

	/* Free the pointer malloc() returned, not the aligned one. */
	free(rawbuf);
	return err;
}
#
# Copyright (c) 2000-2015 University of Utah and the Flux Group.
# Copyright (c) 2000-2016 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -54,14 +54,17 @@ include $(TESTBED_SRCDIR)/GNUmakerules
CFLAGS = -O -g $(LDSTATIC) $(PARTFLAGS)
zapdisk: zapdisk.o disksize.o $(PARTLIBS)
$(CC) $(CFLAGS) zapdisk.o disksize.o $(PARTLIBS) -o zapdisk
zapdisk: zapdisk.o disksize.o erase.o $(PARTLIBS)
$(CC) $(CFLAGS) zapdisk.o disksize.o erase.o $(PARTLIBS) -o zapdisk
cp zapdisk zapdisk.debug
strip zapdisk
disksize.o: $(IZSRCDIR)/disksize.c
$(CC) -c $(CFLAGS) -o disksize.o $<
erase.o: $(IZSRCDIR)/erase.c
$(CC) -c $(CFLAGS) -o erase.o $<
$(PARTLIBS):
@$(MAKE) -C $(OBJDIR)/os/imagezip partlibs
......
/*
* Copyright (c) 2005-2015 University of Utah and the Flux Group.
* Copyright (c) 2005-2016 University of Utah and the Flux Group.
*
* {{{EMULAB-LICENSE
*
......@@ -55,6 +55,7 @@ static int verbose = 0;
static int pnum = 0;
static int bootblocks = 0;
static int superblocks = 0;
static int erase = 0;
static int doit = 0;
static char *diskname;
......@@ -71,10 +72,11 @@ static void
usage(void)
{
fprintf(stderr, "usage: "
"zapdisk [-BS] <diskdev>\n"
"zapdisk [-BSE] [-Z] <diskdev>\n"
" -p <pnum> operate only on the given partition\n"
" -B zap MBR/GPT and partition boot programs\n"
" -S zap possible superblocks in all partitions\n"
" -E erase (TRIM) the partition or disk\n"
" -Z really do the zap and don't just talk about it\n"
" <diskdev> disk special file to operate on\n");
exit(1);
......@@ -89,7 +91,7 @@ main(int argc, char **argv)
int ismbr;
int gotbb = 0;
while ((ch = getopt(argc, argv, "p:vBSZ")) != -1)
while ((ch = getopt(argc, argv, "p:vBSEZ")) != -1)
switch(ch) {
case 'Z':
doit++;
......@@ -106,6 +108,9 @@ main(int argc, char **argv)
case 'S':
superblocks++;
break;
case 'E':
erase++;
break;
case '?':
default:
usage();
......@@ -115,8 +120,8 @@ main(int argc, char **argv)
if (argc < 1)
usage();
if (!bootblocks && !superblocks) {
fprintf(stderr, "Must specify one or both of -B and -S\n");
if (!bootblocks && !superblocks && !erase) {
fprintf(stderr, "Must specify either -E or one or both of -B and -S\n");
usage();
}
diskname = argv[0];
......@@ -164,6 +169,45 @@ main(int argc, char **argv)
printf("%s: has MBR\n", diskname);
}
#endif
if (erase) {
iz_lba start;
iz_size size;
extern uint64_t getdisksize(int);
extern int erasedata(int, off_t, off_t, int);
if (!gotbb && pnum > 0) {
fprintf(stderr, "%s: No valid MBR/GPT,"
" cannot erase partitions\n", diskname);
exit(1);
}
if (pnum == 0) {
start = 0;
size = getdisksize(fd);
} else {
start = diskinfo.slices[pnum-1].offset;
size = diskinfo.slices[pnum-1].size;
}
if (!doit) {
printf("%s: would erase sectors [%lu-%lu]\n",
diskname, (unsigned long)start,
(unsigned long)start + size - 1);
exit(0);
}
if (verbose)
printf("%s: erasing sectors [%lu-%lu]\n",
diskname, (unsigned long)start,
(unsigned long)start + size - 1);
if (erasedata(fd, (off_t)start * secsize,
(off_t)size * secsize, 0)) {
fprintf(stderr,
"%s: Could not erase sectors [%lu-%lu]\n",
diskname, (unsigned long)start,
(unsigned long)start + size - 1);
exit(1);
}
exit(0);
}
if (!gotbb) {
/* lack of a valid partition table is ok */
if (superblocks)
......
#!/bin/sh
#
# Copyright (c) 2000-2015 University of Utah and the Flux Group.
# Copyright (c) 2000-2016 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -204,15 +204,15 @@ loadone() {
echo "Loading image #$_NUM"
echo " LOADINFO=\"$_LOADINFO\""
ADDR=`getvar ADDR "$_LOADINFO"`;
SERVER=`getvar SERVER "$_LOADINFO" $BOSSIP`;
PARTOS=`getvar PARTOS "$_LOADINFO" unknown`;
ZFILL=`getvar ZFILL "$_LOADINFO" 0`;
MBRVERS=`getvar MBRVERS "$_LOADINFO" 1`;
PREPARE=`getvar PREPARE "$_LOADINFO" 0`;
IMAGEID=`getvar IMAGEID "$_LOADINFO"`;
KEEPALIVE=`getvar KEEPALIVE "$_LOADINFO"`;
OSVERSION=`getvar OSVERSION "$_LOADINFO" 0`;
ADDR=`getvar ADDR "$_LOADINFO"`
SERVER=`getvar SERVER "$_LOADINFO" $BOSSIP`
PARTOS=`getvar PARTOS "$_LOADINFO" unknown`
ZFILL=`getvar ZFILL "$_LOADINFO" 0`
MBRVERS=`getvar MBRVERS "$_LOADINFO" 1`
PREPARE=`getvar PREPARE "$_LOADINFO" 0`
IMAGEID=`getvar IMAGEID "$_LOADINFO"`
KEEPALIVE=`getvar KEEPALIVE "$_LOADINFO"`
OSVERSION=`getvar OSVERSION "$_LOADINFO" 0`
#
# XXX If KEEPALIVE is not explicitly set, attempt to intuit a value.
......@@ -533,7 +533,14 @@ fixone() {
$BINDIR/tmcc state RELOADSETUP
BOSSINFO=`$BINDIR/tmcc bossinfo`
STATUS=`$BINDIR/tmcc status`
# See if we are in the reloading experiment
INRELOADING=0
case `$BINDIR/tmcc status` in
*ALLOCATED=emulab-ops/reloading*)
INRELOADING=1
;;
esac
BOSSIP=`echo $BOSSINFO | awk '{ print $2 }'`
......@@ -567,17 +574,47 @@ fi
$BINDIR/tmcc state RELOADING
#
# Handle disk TRIMing.
#
# The TRIM= attribute will be the same for each line of the image,
# so we only need look at the first line. We also only want to do this
# once before loading the first image!
#
# If PREPARE is set, we TRIM the entire disk, even for a partition load.
# This will smoke the MBR, but it doesn't matter in the prepare case since
# the MBR would be overwritten anyway.
#
TRIM=`getvar TRIM "$1" 0`
if [ $TRIM -ne 0 ]; then
_DISK=`getvar DISK "$1" ad0`
_PART=`getvar PART "$1" 0`
_PREPARE=`getvar PREPARE "$1" 0`
_PARG=""
if [ $INRELOADING -eq 0 -a $_PART -ne 0 -a $_PREPARE -eq 0 ]; then
_PARG="-p $_PART"
fi
echo "`date`: Erasing /dev/$_DISK..."
if [ ! -x "$BINDIR/zapdisk" ] || ! zapdisk -v -EZ $_PARG /dev/$_DISK; then
echo "`date`: WARNING: Erase of /dev/$_DISK failed"
else
echo "`date`: Erase of /dev/$_DISK done"
fi
fi
#
# HACK ALERT: If we're reloading we need to zap the superblocks and
# MBRs of any other disks in the system. This is to prevent Linux from
# finding an old filesystem with the same label or UUID and mounting
# that instead of the right one. We skip the disks that are mounted
# and the disk we're going to write to.
#
# DOUBLE HACK ALERT: Changed this to zap all disks to avoid having
# to figure out what the other disks are when loading multiple images.
# Since a new MBR will be laid down anyway there is no harm in doing
# this as long as we are sure we are in the reloading experiment.
case $STATUS in
*ALLOCATED=emulab-ops/reloading*)
#
if [ $INRELOADING -eq 1 ]; then
disks=`find_disks`
for d in $disks; do
#[ $d = $DISK ] && continue
......@@ -600,8 +637,7 @@ case $STATUS in
if [ -x $BINDIR/rc.nodecheck ]; then
$BINDIR/rc.nodecheck boot
fi
;;
esac
fi
#
# Load each image in turn.
......
#!/bin/sh
#
# Copyright (c) 2000-2015 University of Utah and the Flux Group.
# Copyright (c) 2000-2016 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -181,6 +181,7 @@ if [ $CONSOLE = "unknown" ]; then
echo "WARNING: console type not set, assuming \"sio1\""
CONSOLE=sio1
fi
# DOM0MEM is optional
DOM0MEM=$SLICEFIX_DOM0MEM
......@@ -260,7 +261,7 @@ dofreebsd() {
#
# This is clearly not a solution.
#
vers=`uname -v | sed -e 's/FreeBSD \([0-9]\).*/\1/'`
vers=`uname -v | sed -e 's/FreeBSD \([0-9][0-9]*\).*/\1/'`
# see if there is a root ('a') partition on this BSD slice
`disklabel ${disk}s${part} 2>&1 | grep -s -E '^[ ]+a:' >/dev/null` || {
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment