Commit bf3b4dfc authored by Mike Hibler's avatar Mike Hibler

Working well enough that I can start doing some serious testing.

To that end, added a new "imagerezip" program which reads in an image
and writes it back out. Right now, this is just to test that reading
and writing images via the new library produces the same result as the
"classic" tools. But it might be useful in the future to repack images
with a different compression level or a different signature (hash function
or blocksize).

Also checked in a hack to imageunzip that I did for Xing (the -I option) to just
produce a stream of the uncompressed image data (it ignores skips; i.e., it
does not seek on the output device to put the data in the correct place).

Unfortunately, I cannot just compare raw images because, even when trying to
replicate the chunk-packing strategy of the original code, the new library
packs a bit more data into each chunk. I am not sure why, but as a result I
have to compare old/new images by dumping their contents and by checking the
generated signatures.
parent d5ffec9a
......@@ -127,7 +127,7 @@ endif
# Necessary sometimes.
#PTHREADCFLAGS += -DCONDVARS_WORK
PROGS = imagezip imageunzip imagedump imagedelta
PROGS = imagezip imageunzip imagedump imagedelta imagerezip
CFLAGS = $(SUBDIRCFLAGS) -I$(SRCDIR) $(LDSTATIC)
LIBS = -lz
ZIPCFLAGS = $(CFLAGS) -Wall
......@@ -266,6 +266,12 @@ imagedelta: imagedelta.o version.o $(NDZLIBS)
imagedelta.o: imagedelta.c
$(CC) -c $(CFLAGS) -o imagedelta.o $<
imagerezip: imagerezip.o version.o $(NDZLIBS)
$(CC) $(CFLAGS) imagerezip.o version.o $(NDZLIBS) -o imagerezip
imagerezip.o: imagerezip.c
$(CC) -c $(CFLAGS) -o imagerezip.o $<
sizetest: disksize.c
$(CC) -DTEST $< -o sizetest
......@@ -284,9 +290,10 @@ endif
imagezip.o: sliceinfo.h imagehdr.h global.h range.h hashmap/hashmap.h
imageunzip.o: imagehdr.h
imagehash.o: imagehdr.h imagehash.h
imagedelta.o: imagehdr.h libndz/libndz.h
imagedelta.o: imagehdr.h imagehash.h libndz/libndz.h
imagerezip.o: imagehdr.h imagehash.h libndz/libndz.h
version.c: imagezip.c imageunzip.c imagedump.c imagedelta.c
version.c: imagezip.c imageunzip.c imagedump.c imagedelta.c imagerezip.c
echo >$@ "char build_info[] = \"Built `date +%d-%b-%Y` by `id -nu`@`hostname | sed 's/\..*//'`:`pwd`\";"
install: $(addprefix $(INSTALL_BINDIR)/, $(PROGS))
......
......@@ -90,12 +90,6 @@ struct fileinfo {
struct ndz_rangemap *map, *sigmap;
} ndz1, ndz2, delta;
struct hashdata {
ndz_chunkno_t chunkno;
uint32_t hashlen;
uint8_t hash[HASH_MAXSIZE];
};
static int usesigfiles = 0;
static int forcesig = 0;
static int debug = 0;
......@@ -152,7 +146,7 @@ verifyfunc(struct ndz_rangemap *imap, struct ndz_range *range, void *arg)
* identify those cases...
*/
if (srange->data) {
struct hashdata *hd = (struct hashdata *)srange->data;
struct ndz_hashdata *hd = (struct ndz_hashdata *)srange->data;
if (HASH_CHUNKDOESSPAN(hd->chunkno)) {
/*
......@@ -247,6 +241,11 @@ openofile(char *file, struct fileinfo *info)
{
int sigfd;
if (strcmp(file, "-") == 0) {
fprintf(stderr, "Cannot output to stdout yet\n");
exit(1);
}
info->ndz = ndz_open(file, 1);
if (info->ndz == NULL) {
perror(file);
......@@ -359,8 +358,16 @@ initnewchunk(struct chunkstate *cstate)
/*
* Iterator for ranges in the delta map.
* Read and chunkify the data from the full image, hashing the data as
* we go.
*
* Read and chunkify the data from the full image to produce the delta.
*
* If we have a signature file for the "source" full image, then we don't
* need to do any hashing as the delta signature will be identical
* (signatures always cover the entire image).
*
* If we have no signature, or we are changing the hash block size or
* hash algorithm, or we just want to validate the original signature,
* then we hash the data as we go.
*/
static int
chunkify(struct ndz_rangemap *mmap, struct ndz_range *range, void *arg)
......@@ -368,16 +375,20 @@ chunkify(struct ndz_rangemap *mmap, struct ndz_range *range, void *arg)
struct chunkstate *cstate = arg;
ndz_addr_t rstart = range->start;
ndz_size_t rsize = range->end + 1 - rstart, sc;
uint32_t offset, hsize;
size_t rbytes;
uint32_t roffset, hstart, hsize;
size_t hbytes;
ssize_t cc;
unsigned char hashbuf[HASH_MAXSIZE], *hash;
struct ndz_hashdata *hdata;
struct ndz_range *hrange;
#ifdef CHUNKIFY_DEBUG
fprintf(stderr, "chunkify [%lu-%lu]:\n", range->start, range->end);
#endif
/*
* First call. Initialize the state we are going to carry through
* with us via the iterator argument.
*/
if (cstate->chunkobj == NULL) {
cstate->chunkdatabuf = malloc(hashblksize * delta.ndz->sectsize);
if (cstate->chunkdatabuf == NULL) {
......@@ -391,23 +402,41 @@ chunkify(struct ndz_rangemap *mmap, struct ndz_range *range, void *arg)
cstate->curregion->start = rstart;
}
offset = rstart % hashblksize;
/*
* Process the range in units of the hash blocksize.
*
* We always break image data ranges at hash blocksize boundaries but
* note that data ranges and hash block ranges don't necessarily align.
* A data range might span multiple hash ranges or there might be
* multiple data ranges in the same hash range. In the latter case,
* we could simplify things by joining data ranges within the same hash
* range with zero-filled extra blocks so that we always had full hash
* ranges, but that would make images larger and result in writing
* extra data we don't have to when the image is deployed. Instead,
* we just create small hash ranges covering only the data itself.
*/
roffset = rstart % hashblksize;
while (rsize > 0) {
if (offset) {
hsize = hashblksize - offset;
uint32_t pstart, psize;
int spanschunk;
size_t bufoff;
hstart = rstart;
if (roffset) {
hsize = hashblksize - roffset;
if (hsize > rsize)
hsize = rsize;
offset = 0;
roffset = 0;
} else if (rsize > hashblksize)
hsize = hashblksize;
else
hsize = rsize;
#ifdef CHUNKIFY_DEBUG
fprintf(stderr, " [%lu-%lu]: ", rstart, rstart + hsize - 1);
fprintf(stderr, " [%u-%u]: ", hstart, hstart + hsize - 1);
#endif
/* XXX read/decompress data range */
sc = ndz_readdata(ndz2.ndz, cstate->chunkdatabuf, hsize, rstart);
sc = ndz_readdata(ndz2.ndz, cstate->chunkdatabuf, hsize, hstart);
if (sc != hsize) {
fprintf(stderr, "%s: unexpected read return %ld (instead of %u)\n",
ndz_filename(ndz2.ndz), (long)sc, hsize);
......@@ -415,105 +444,176 @@ chunkify(struct ndz_rangemap *mmap, struct ndz_range *range, void *arg)
}
/*
* See if we have an existing hash for the hash block
* Fetch or compute the hash value.
*/
rbytes = hsize * delta.ndz->sectsize;
hrange = ndz_rangemap_lookup(ndz2.sigmap, rstart, NULL);
assert(delta.ndz->hashcurentry < delta.ndz->hashentries);
hdata = &delta.ndz->hashdata[delta.ndz->hashcurentry++];
hdata->hashlen = hashlen;
hbytes = hsize * delta.ndz->sectsize;
hrange = ndz_rangemap_lookup(ndz2.sigmap, hstart, NULL);
if (hrange && hrange->data &&
hrange->start == rstart && hrange->end == rstart + hsize - 1) {
struct hashdata *hd = (struct hashdata *)hrange->data;
hash = hd->hash;
hrange->start == hstart && hrange->end == hstart + hsize - 1) {
struct ndz_hashdata *hd = (struct ndz_hashdata *)hrange->data;
memcpy(hdata->hash, hd->hash, hashlen);
#ifdef CHUNKIFY_DEBUG
fprintf(stderr, " found hash=%s\n", ndz_hash_dump(hash, hashlen));
fprintf(stderr, " found hash=%s\n",
ndz_hash_dump(hdata->hash, hashlen));
#endif
#if 1
/* sanity check */
ndz_hash_data(delta.ndz, cstate->chunkdatabuf, rbytes, hashbuf);
{
unsigned char hbuf[HASH_MAXSIZE];
ndz_hash_data(delta.ndz, cstate->chunkdatabuf, hbytes, hbuf);
#ifdef CHUNKIFY_DEBUG
fprintf(stderr, " computed hash=%s\n", ndz_hash_dump(hashbuf, hashlen));
fprintf(stderr, " computed hash=%s\n",
ndz_hash_dump(hbuf, hashlen));
#endif
if (memcmp(hash, hashbuf, hashlen)) {
fprintf(stderr, "*** [%lu-%lu]: hash does not compare!\n",
rstart, rstart + hsize - 1);
if (memcmp(hdata->hash, hbuf, hashlen)) {
fprintf(stderr, "*** [%u-%u]: hash does not compare!\n",
hstart, hstart + hsize - 1);
}
}
#endif
} else {
ndz_hash_data(delta.ndz, cstate->chunkdatabuf, rbytes, hashbuf);
hash = hashbuf;
ndz_hash_data(delta.ndz, cstate->chunkdatabuf, hbytes,
hdata->hash);
#ifdef CHUNKIFY_DEBUG
fprintf(stderr, " no hash found\n");
#endif
}
/*
* If there is not enough room for this range in the current chunk,
* write it out and start a new one.
* At this point we have a range of data ([hstart - hstart+hsize-1])
* of a specific size (hsize or hbytes) which we have hashed
* (hdata->hash). Now we compress and write it out to the new image
* file. This is complicated significantly by the fact that it might
* not all fit in the current chunk. If there is not enough room for
* this range in the current chunk, we split it and write what we can.
*
* This is complicated even further by our conservative algorithm
* for filling chunks, which is basically: if the amount of
* uncompressed data exceeds the amount of space left for the
* compressed data (plus a little slop in case it expands instead),
* then we stop. This is an iterative process since, most likely,
* the compressed data will be significantly smaller than the
* uncompressed data.
*/
if (rbytes > ndz_chunk_left(cstate->chunkobj)) {
bufoff = 0;
spanschunk = 0;
pstart = hstart;
psize = hsize;
while (psize > 0) {
uint32_t wsize;
size_t wbytes, chunkremaining;
chunkremaining = ndz_chunk_left(cstate->chunkobj);
if (chunkremaining < delta.ndz->sectsize) {
/* switch to new chunk */
#ifdef CHUNKIFY_DEBUG
fprintf(stderr, " chunk %u done, starting new one\n", cstate->chunkno);
fprintf(stderr, " chunk %u full (%lu bytes), writing...\n",
cstate->chunkno,
(unsigned long)ndz_chunk_datasize(cstate->chunkobj));
#endif
/* finalize the header */
cstate->header->size = ndz_chunk_datasize(cstate->chunkobj);
cstate->header->regioncount = (cstate->curregion - cstate->region + 1);
cstate->header->lastsect = rstart;
/* finalize the header */
cstate->header->size = ndz_chunk_datasize(cstate->chunkobj);
cstate->header->regioncount =
(cstate->curregion - cstate->region + 1);
/* XXX should always be zero */
if (cstate->chunkno == 0)
cstate->header->firstsect = 0;
cstate->header->lastsect = pstart;
/* and write it */
if (ndz_chunk_flush(cstate->chunkobj, 1) != 0) {
fprintf(stderr, "Error writing compressed data\n");
return 1;
}
cstate->chunkno++;
if (initnewchunk(cstate) != 0)
return 1;
cstate->header->firstsect = pstart;
cstate->curregion->start = pstart;
/* and write it */
if (ndz_chunk_flush(cstate->chunkobj, 1) != 0) {
fprintf(stderr, "Error writing compressed data\n");
return 1;
/* keep track if this hash range spans chunks */
if (psize < hsize)
spanschunk++;
chunkremaining = ndz_chunk_left(cstate->chunkobj);
assert(psize <= chunkremaining / delta.ndz->sectsize);
}
cstate->chunkno++;
if (initnewchunk(cstate) != 0)
return 1;
cstate->header->firstsect = rstart;
cstate->curregion->start = rstart;
}
assert(rbytes <= ndz_chunk_left(cstate->chunkobj));
/* write up to chunkremaining (truncated to sectorsize) bytes */
wsize = psize;
wbytes = wsize * delta.ndz->sectsize;
if (wbytes > chunkremaining) {
wsize = (chunkremaining / delta.ndz->sectsize);
wbytes = wsize * delta.ndz->sectsize;
}
assert(wsize > 0);
/*
* Append the hashed range to the current chunk.
*/
#ifdef CHUNKIFY_DEBUG
fprintf(stderr, " appending to chunk %u\n", cstate->chunkno);
fprintf(stderr, " appending %u sectors to chunk %u "
"(%ld bytes available)\n",
wsize, cstate->chunkno,
ndz_chunk_left(cstate->chunkobj));
#endif
cc = ndz_chunk_append(cstate->chunkobj, cstate->chunkdatabuf, rbytes);
if (cc < 0) {
fprintf(stderr, "Error compressing data\n");
return 1;
}
assert(cc == rbytes);
/* append to the current region or create a new one */
if (cstate->curregion->start + cstate->curregion->size == rstart)
cstate->curregion->size += hsize;
else {
cstate->curregion++;
cstate->curregion->start = rstart;
cstate->curregion->size = hsize;
}
cc = ndz_chunk_append(cstate->chunkobj,
cstate->chunkdatabuf + bufoff, wbytes);
if (cc < 0) {
fprintf(stderr, "Error compressing data\n");
return 1;
}
assert(cc == wbytes);
/* XXX add range/hashinfo to new sigmap */
/* append to the current region or create a new one */
if (cstate->curregion->start + cstate->curregion->size == pstart) {
cstate->curregion->size += wsize;
#ifdef CHUNKIFY_DEBUG
fprintf(stderr, " adjust range entry to [%u-%u]\n",
cstate->curregion->start,
cstate->curregion->start+cstate->curregion->size-1);
#endif
} else {
cstate->curregion++;
cstate->curregion->start = pstart;
cstate->curregion->size = wsize;
#ifdef CHUNKIFY_DEBUG
fprintf(stderr, " new range entry [%u-%u]\n",
cstate->curregion->start,
cstate->curregion->start+cstate->curregion->size-1);
#endif
}
#if 0
bufoff += wbytes;
pstart += wsize;
psize -= wsize;
chunkremaining = ndz_chunk_left(cstate->chunkobj);
}
/*
* If no hash was given, we have to compute it
* At this point we have written out the entire hash range.
* Add it to the hash map, recording the chunk(s) that it belongs to.
*/
if ((hash = rhash) == NULL) {
if (hash_range(rstart, hsize, hashbuf)) {
fprintf(stderr, "Error hashing image data\n");
return 1;
}
hash = hashbuf;
}
if (addhash(hinfop, rstart, hsize, hash) != 0) {
fprintf(stderr, "Out of memory for new hash map\n");
if (spanschunk)
hdata->chunkno = HASH_CHUNKSETSPAN(cstate->chunkno-1);
else
hdata->chunkno = cstate->chunkno;
#ifdef CHUNKIFY_DEBUG
fprintf(stderr, " write hash entry [%u-%u], chunk %u",
hstart, hstart + hsize - 1, HASH_CHUNKNO(hdata->chunkno));
if (HASH_CHUNKDOESSPAN(hdata->chunkno))
fprintf(stderr, "-%u", HASH_CHUNKNO(hdata->chunkno) + 1);
fprintf(stderr, "\n");
#endif
cc = ndz_rangemap_alloc(delta.sigmap, hstart, hsize, (void *)hdata);
if (cc) {
fprintf(stderr, "Could not add hashmap entry\n");
return 1;
}
#endif
rstart += hsize;
rsize -= hsize;
......@@ -524,19 +624,17 @@ chunkify(struct ndz_rangemap *mmap, struct ndz_range *range, void *arg)
*/
if (range->next == NULL) {
#ifdef CHUNKIFY_DEBUG
fprintf(stderr, " final chunk %u done\n", cstate->chunkno);
fprintf(stderr, " final chunk %u done (%lu bytes)\n",
cstate->chunkno,
(unsigned long)ndz_chunk_datasize(cstate->chunkobj));
#endif
/* finalize the header */
cstate->header->size = ndz_chunk_datasize(cstate->chunkobj);
cstate->header->regioncount = (cstate->curregion - cstate->region + 1);
/*
* XXX not right, need to use the last sector of the ndz2 map.
* But I'm not sure it is set correct and it doesn't really matter
* since we will never be zeroing when loading a delta image!
*/
cstate->header->lastsect = range->end + 1;
/* XXX */
cstate->header->lastsect = delta.ndz->maphi;
/* and write it */
if (ndz_chunk_flush(cstate->chunkobj, 1) != 0) {
......@@ -577,6 +675,7 @@ main(int argc, char **argv)
fprintf(stderr, "Invalid hash block size\n");
usage();
}
hashblksize /= 512;
break;
case 'D':
if (strcmp(optarg, "md5") == 0)
......@@ -653,12 +752,28 @@ main(int argc, char **argv)
argv[1], ndz2.ndz->hashtype, ndz2.ndz->hashblksize);
exit(1);
}
#if 1
/* XXX just duplicate image 2 */
{
struct ndz_rangemap *foo;
foo = ndz_rangemap_init(NDZ_LOADDR, NDZ_HIADDR-NDZ_LOADDR);
delta.map = ndz_compute_delta(foo, ndz2.sigmap);
}
#else
delta.map = ndz_compute_delta(ndz1.sigmap, ndz2.sigmap);
#endif
if (delta.map == NULL) {
fprintf(stderr, "Could not compute delta for %s and %s\n",
argv[0], argv[1]);
exit(1);
}
/*
* Delta map has same range as full image.
* XXX doesn't belong here.
*/
delta.ndz->maplo = ndz2.ndz->maplo;
delta.ndz->maphi = ndz2.ndz->maphi;
#if 1
printf("==== Delta hash ");
ndz_hashmap_dump(delta.map, (debug==0));
......@@ -693,6 +808,23 @@ main(int argc, char **argv)
exit(1);
}
/*
* Initialize signature file info for delta map.
* XXX doesn't belong here.
*/
delta.ndz->hashmap = delta.sigmap;
delta.ndz->hashdata = calloc(ndz2.ndz->hashentries,
sizeof(struct ndz_hashdata));
if (delta.ndz->hashdata == NULL) {
fprintf(stderr, "%s: could not allocate hashdata for delta image\n",
argv[2]);
exit(1);
}
delta.ndz->hashtype = hashtype;
delta.ndz->hashblksize = hashblksize;
delta.ndz->hashentries = ndz2.ndz->hashentries;
delta.ndz->hashcurentry = 0;
/*
* If there is anything in the resulting delta, produce an image!
*/
......@@ -700,14 +832,19 @@ main(int argc, char **argv)
struct chunkstate *cstate = calloc(1, sizeof(*cstate));
assert(cstate != NULL);
delta.ndz->hashtype = hashtype;
delta.ndz->hashblksize = hashblksize;
if (ndz_rangemap_iterate(delta.map, chunkify, cstate) != 0) {
fprintf(stderr, "%s: error while creating new delta image\n",
argv[2]);
exit(1);
}
free(cstate);
/* write the new sigfile */
if (ndz_writehashinfo(delta.ndz, delta.sigfile) != 0) {
fprintf(stderr, "%s: could not write signature file %s\n",
argv[2], delta.sigfile);
}
ndz_close(ndz2.ndz);
ndz_close(delta.ndz);
} else {
......
/*
* Copyright (c) 2000-2014 University of Utah and the Flux Group.
* Copyright (c) 2000-2015 University of Utah and the Flux Group.
*
* {{{EMULAB-LICENSE
*
......
This diff is collapsed.
......@@ -120,6 +120,7 @@ static int imageversion = 1;
static int dots = 0;
static int dotcol;
static int directio = 0;
static int ignoreskips = 0;
static struct timeval stamp;
#ifndef FRISBEE
static int infd;
......@@ -598,7 +599,7 @@ main(int argc, char *argv[])
#ifdef NOTHREADS
nothreads = 1;
#endif
while ((ch = getopt(argc, argv, "vdhs:zp:oOnFD:W:Cr:Na:ck:eu:f")) != -1)
while ((ch = getopt(argc, argv, "vdhs:zp:oOnFD:W:Cr:Na:ck:eu:fI")) != -1)
switch(ch) {
#ifdef FAKEFRISBEE
case 'F':
......@@ -722,6 +723,11 @@ main(int argc, char *argv[])
has_id = 1;
break;
/* ignore skipped ranges on output (which must be stdout) */
case 'I':
ignoreskips++;
break;
case 'h':
case '?':
default:
......@@ -769,6 +775,19 @@ main(int argc, char *argv[])
else if (argc == 2 && strcmp(argv[1], "-")) {
int flags;
/*
* This option is really a hack so we can compare the
* content of two images regardless of how they got split
* into chunks; i.e., for cases where we cannot just compare
* the image files directly. So we force them to use stdout
* to make it painfully obvious that this is not a useful
* option!
*/
if (ignoreskips) {
fprintf(stderr, "Must output to stdout with -I\n");
exit(1);
}
/*
* XXX perform seek and MBR checks before we truncate
* the output file. If they have their input/output
......@@ -824,7 +843,10 @@ main(int argc, char *argv[])
* we cannot really handle slice mode, we must always zero fill
* (cannot skip free space) and we cannot use pwrite.
*/
if (lseek(outfd, (off_t)0, SEEK_SET) < 0) {
if (ignoreskips) {
dofill = 0;
seekable = 0;
} else if (lseek(outfd, (off_t)0, SEEK_SET) < 0) {
if (slice) {
fprintf(stderr, "Output file is not seekable, "
"cannot specify a slice\n");
......@@ -1850,7 +1872,7 @@ writedata(off_t offset, size_t size, void *buf)
#endif
if (seekable) {
cc = pwrite(outfd, buf, size, offset);
} else if (offset == nextwriteoffset) {
} else if (offset == nextwriteoffset || ignoreskips) {
cc = write(outfd, buf, size);
} else {
fprintf(stderr, "Non-contiguous write @ %lld (should be %lld)\n",
......
1. Useful datums:
A. Why don't the old tools and the new library produce identical chunking?
This is because of an obscure block grouping difference that would have
been too obnoxious to replicate in the new code.
In the old code, each range is broken down into blocks of at most 128K
(uncompressed), 128K being the size of its read buffer. It then performs a
check to see whether that up-to-128K block would fit in the space remaining
in the current chunk (subblockleft-1024). If not, it further reduces the size
of the block to what remains.
The new code does not start checking for a "chunk fit" until it has broken
the data down into hash blocks of at most 64K. It then uses the same criteria
for fitting, but because it does not have the 128K division point that the
old imagezip does, the chunking can be slightly different.
2. Useful testing tools:
A. compare two signature files that should be identical:
imagehash -CR -o foo.ndz.sig > /tmp/1
imagehash -CR -o Nfoo.ndz.sig > /tmp/2
diff /tmp/[12]
The -C option is needed so that imagehash will not output chunk numbers; the
new routines and the old tools do not chunk data in precisely the same way.
B. checking the data content of two images that should be identical
(make sure you have lots of disk space!):
imageunzip -I foo.ndz - > /tmp/1
imageunzip -I Nfoo.ndz - > /tmp/2
cmp /tmp/[12]
......@@ -399,11 +399,14 @@ ndz_chunk_flush(ndz_chunk_t chobj, int withheader)
ssize_t
ndz_chunk_left(ndz_chunk_t chobj)
{
ssize_t remain;
struct ndz_chunk *chunk = (struct ndz_chunk *)chobj;
if (chunk == NULL)
return -1;