lguest.c 57.8 KB
Newer Older
Rusty Russell's avatar
Rusty Russell committed
1 2 3 4 5 6
/*P:100
 * This is the Launcher code, a simple program which lays out the "physical"
 * memory for the new Guest by mapping the kernel image and the virtual
 * devices, then opens /dev/lguest to tell the kernel about the Guest and
 * control it.
:*/
7 8 9 10 11 12 13 14 15 16
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <err.h>
#include <stdint.h>
#include <stdlib.h>
#include <elf.h>
#include <sys/mman.h>
17
#include <sys/param.h>
18 19 20
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
21
#include <sys/eventfd.h>
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
#include <fcntl.h>
#include <stdbool.h>
#include <errno.h>
#include <ctype.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <time.h>
#include <netinet/in.h>
#include <net/if.h>
#include <linux/sockios.h>
#include <linux/if_tun.h>
#include <sys/uio.h>
#include <termios.h>
#include <getopt.h>
#include <zlib.h>
38 39
#include <assert.h>
#include <sched.h>
40 41
#include <limits.h>
#include <stddef.h>
42
#include <signal.h>
43
#include "linux/lguest_launcher.h"
44 45 46 47
#include "linux/virtio_config.h"
#include "linux/virtio_net.h"
#include "linux/virtio_blk.h"
#include "linux/virtio_console.h"
Rusty Russell's avatar
Rusty Russell committed
48
#include "linux/virtio_rng.h"
49
#include "linux/virtio_ring.h"
50
#include "asm/bootparam.h"
Rusty Russell's avatar
Rusty Russell committed
51
/*L:110
 * We can ignore the 42 include files we need for this program, but I do want
 * to draw attention to the use of kernel-style types.
 *
 * As Linus said, "C is a Spartan language, and so should your naming be."  I
 * like these abbreviations, so we define them here.  Note that u64 is always
 * unsigned long long, which works on all Linux systems: this means that we can
 * use %llu in printf for any u64.
 */
60 61 62 63
/* 64-bit unsigned; spelled as unsigned long long so %llu is always right. */
typedef unsigned long long u64;
/* Exact-width unsigned aliases built on the <stdint.h> types. */
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
64
/*:*/
65 66 67 68 69 70

#define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
#define BRIDGE_PFX "bridge:"
#ifndef SIOCBRADDIF
#define SIOCBRADDIF	0x89a2		/* add interface to bridge      */
#endif
71 72
/* We can have up to 256 pages for devices. */
#define DEVICE_PAGES 256
Rusty Russell's avatar
Rusty Russell committed
73 74
/* This will occupy 3 pages: it must be a power of 2. */
#define VIRTQUEUE_NUM 256
75

Rusty Russell's avatar
Rusty Russell committed
76 77 78 79
/*L:120
 * verbose is both a global flag and a macro.  The C preprocessor allows
 * this, and although I wouldn't recommend it, it works quite nicely here.
 */
80 81 82
/* The flag: when true, verbose(...) prints; when false it does nothing. */
static bool verbose;
/* The macro: printf-style arguments, forwarded only if the flag is set. */
#define verbose(...) \
	do { if (verbose) printf(__VA_ARGS__); } while (0)
83 84
/*:*/

85 86 87 88
/* The pointer to the start of guest memory. */
static void *guest_base;
/* The maximum guest physical address allowed, and maximum possible. */
static unsigned long guest_limit, guest_max;
89 90
/* The /dev/lguest file descriptor. */
static int lguest_fd;
91

92 93 94
/* a per-cpu variable indicating whose vcpu is currently running */
static unsigned int __thread cpu_id;

95
/* This is our list of devices. */
96
struct device_list {
97 98 99 100 101 102
	/* Counter to assign interrupt numbers. */
	unsigned int next_irq;

	/* Counter to print out convenient device numbers. */
	unsigned int device_num;

103
	/* The descriptor page for the devices. */
104 105
	u8 *descpage;

106
	/* A single linked list of devices. */
107
	struct device *dev;
Rusty Russell's avatar
Rusty Russell committed
108
	/* And a pointer to the last device for easy append. */
109
	struct device *lastdev;
110 111
};

112 113 114
/* The list of Guest devices, based on command line arguments. */
static struct device_list devices;

115
/* One Guest device: its descriptor, its queues and its private state. */
struct device {
	/* Next device in the device_list chain. */
	struct device *next;

	/* The device's descriptor, as mapped into the Guest. */
	struct lguest_device_desc *desc;

	/*
	 * Launcher-side copies: the values in desc cannot be trusted once the
	 * Guest has booted, so these are used instead.
	 */
	unsigned int feature_len;
	unsigned int num_vq;

	/* The name of this device, for --verbose. */
	const char *name;

	/* Any queues attached to this device. */
	struct virtqueue *vq;

	/* Is it operational? */
	bool running;

	/* Does the Guest want an interrupt on empty? */
	bool irq_on_empty;

	/* Device-specific data. */
	void *priv;
};

143
/* The virtqueue structure describes a queue attached to a device. */
144
struct virtqueue {
145 146 147 148 149 150 151 152 153 154 155 156 157 158
	struct virtqueue *next;

	/* Which device owns me. */
	struct device *dev;

	/* The configuration for this queue. */
	struct lguest_vqconfig config;

	/* The actual ring of buffers. */
	struct vring vring;

	/* Last available index we saw. */
	u16 last_avail_idx;

159 160 161
	/* How many are used since we sent last irq? */
	unsigned int pending_used;

162 163
	/* Eventfd where Guest notifications arrive. */
	int eventfd;
Rusty Russell's avatar
Rusty Russell committed
164

165 166 167
	/* Function for the thread which is servicing this virtqueue. */
	void (*service)(struct virtqueue *vq);
	pid_t thread;
168 169
};

Balaji Rao's avatar
Balaji Rao committed
170 171 172
/* Remember the arguments to the program so we can "reboot" */
static char **main_args;

173 174 175
/* The original tty settings to restore on exit. */
static struct termios orig_term;

Rusty Russell's avatar
Rusty Russell committed
176 177
/*
 * We have to be careful with barriers: our devices are all run in separate
 * threads and so we need to make sure that changes visible to the Guest happen
 * in precise order.
 *
 * wmb() orders stores against later stores.  x86 never reorders two stores,
 * so all we have to do is stop the compiler from moving them.
 *
 * mb() orders everything against everything, including an earlier store
 * against a later load.  x86 *can* let the load overtake the store, so a
 * compiler barrier is not enough here: we need a real fence, which
 * __sync_synchronize() provides (and it stops compiler reordering as well).
 */
#define wmb() __asm__ __volatile__("" : : : "memory")
#define mb() __sync_synchronize()
183

Rusty Russell's avatar
Rusty Russell committed
184 185
/*
 * Convert an iovec element to the given type.
186 187 188 189 190 191
 *
 * This is a fairly ugly trick: we need to know the size of the type and
 * alignment requirement to check the pointer is kosher.  It's also nice to
 * have the name of the type in case we report failure.
 *
 * Typing those three things all the time is cumbersome and error prone, so we
Rusty Russell's avatar
Rusty Russell committed
192 193
 * have a macro which sets them all up and passes to the real function.
 */
194 195 196 197 198 199 200 201 202 203 204 205 206
#define convert(iov, type) \
	((type *)_convert((iov), sizeof(type), __alignof__(type), #type))

/*
 * The worker behind convert().  The iovec element must be exactly "size"
 * bytes long and its base must be a multiple of "align"; if either test
 * fails we exit with a message which mentions "name".  Otherwise the base
 * pointer is handed back.
 */
static void *_convert(struct iovec *iov, size_t size, size_t align,
		      const char *name)
{
	void *base = iov->iov_base;
	size_t length = iov->iov_len;

	if (length != size)
		errx(1, "Bad iovec size %zu for %s", length, name);
	if ((unsigned long)base % align != 0)
		errx(1, "Bad alignment %p for %s", base, name);
	return base;
}

207 208 209
/* Wrapper for the last available index.  Makes it easier to change. */
#define lg_last_avail(vq)	((vq)->last_avail_idx)

Rusty Russell's avatar
Rusty Russell committed
210 211 212 213
/*
 * The virtio configuration space is defined to be little-endian.  x86 is
 * little-endian too, but it's nice to be explicit so we have these helpers.
 */
214 215 216 217 218
/* CPU order to little-endian: the value is passed through unchanged. */
#define cpu_to_le16(v16) (v16)
#define cpu_to_le32(v32) (v32)
#define cpu_to_le64(v64) (v64)
/* Little-endian to CPU order: likewise passed through unchanged. */
#define le16_to_cpu(v16) (v16)
#define le32_to_cpu(v32) (v32)
#define le64_to_cpu(v64) (v64)
220

Rusty Russell's avatar
Rusty Russell committed
221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247
/* True when none of the num_iov elements has any bytes left in it. */
static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
{
	const struct iovec *cur = iov;
	const struct iovec *end = iov + num_iov;

	while (cur != end) {
		/* One non-empty element is enough to answer "no". */
		if (cur->iov_len != 0)
			return false;
		cur++;
	}
	return true;
}

/*
 * Take len bytes from the front of this iovec.
 *
 * Each element in turn has its base advanced and its length reduced until
 * len bytes have been accounted for.  The caller must not ask for more bytes
 * than the iovec holds: that trips the assert.
 */
static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
{
	unsigned int i;

	/* Once len hits zero the remaining elements are left untouched. */
	for (i = 0; i < num_iov && len; i++) {
		unsigned int used;

		used = iov[i].iov_len < len ? iov[i].iov_len : len;
		/* Step in bytes via char *: void * arithmetic is not standard C. */
		iov[i].iov_base = (char *)iov[i].iov_base + used;
		iov[i].iov_len -= used;
		len -= used;
	}
	assert(len == 0);
}

Rusty Russell's avatar
Rusty Russell committed
248 249 250 251
/* The device virtqueue descriptors are followed by feature bitmasks. */
static u8 *get_feature_bits(struct device *dev)
{
	return (u8 *)(dev->desc + 1)
252
		+ dev->num_vq * sizeof(struct lguest_vqconfig);
Rusty Russell's avatar
Rusty Russell committed
253 254
}

Rusty Russell's avatar
Rusty Russell committed
255 256 257 258 259 260
/*L:100
 * The Launcher code itself takes us out into userspace, that scary place where
 * pointers run wild and free!  Unfortunately, like most userspace programs,
 * it's quite boring (which is why everyone likes to hack on the kernel!).
 * Perhaps if you make up an Lguest Drinking Game at this point, it will get
 * you through this section.  Or, maybe not.
 *
 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
 * memory and stores it in "guest_base".  In other words, Guest physical ==
 * Launcher virtual with an offset.
 *
 * This can be tough to get your head around, but usually it just means that we
 * use these trivial conversion functions when the Guest gives us its
 * "physical" addresses:
 */
270 271 272 273 274 275 276 277 278 279
static void *from_guest_phys(unsigned long addr)
{
	/* Guest physical address -> Launcher pointer: add the base, in bytes. */
	char *base = guest_base;

	return base + addr;
}

static unsigned long to_guest_phys(const void *addr)
{
	/* Launcher pointer -> Guest physical address: byte offset from base. */
	const char *base = guest_base;
	const char *target = addr;

	return (target - base);
}

280 281 282 283
/*L:130
 * Loading the Kernel.
 *
 * We start with a couple of simple helper routines.  open_or_die() avoids
 * error-checking code cluttering the callers:
 */
286 287 288 289 290 291 292 293
static int open_or_die(const char *name, int flags)
{
	int fd;

	fd = open(name, flags);
	if (fd < 0) {
		/* err() reports errno for us and exits with status 1. */
		err(1, "Failed to open %s", name);
	}
	return fd;
}

294 295
/*
 * map_zeroed_pages() takes a number of pages and returns the address of a
 * fresh mapping of /dev/zero that long; it exits if the mapping fails.
 */
static void *map_zeroed_pages(unsigned int num)
{
	void *mapping;
	int zero_fd;

	zero_fd = open_or_die("/dev/zero", O_RDONLY);

	/*
	 * The mapping is private (ie. if we write to a page, we get our own
	 * copy of it).
	 */
	mapping = mmap(NULL, getpagesize() * num,
		       PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE,
		       zero_fd, 0);
	if (mapping == MAP_FAILED)
		err(1, "Mmapping %u pages of /dev/zero", num);

	/* The mapping stays in place after the fd has been closed. */
	close(zero_fd);

	return mapping;
}

/*
 * Get some more pages for a device: the current guest_limit becomes the start
 * of the new pages and guest_limit moves up by num pages.
 */
static void *get_pages(unsigned int num)
{
	unsigned long old_limit = guest_limit;

	guest_limit = old_limit + num * getpagesize();
	/* The limit may never go beyond guest_max. */
	if (guest_limit > guest_max)
		errx(1, "Not enough memory for devices");
	return from_guest_phys(old_limit);
}

Rusty Russell's avatar
Rusty Russell committed
329 330
/*
 * This routine is used to load the kernel or initrd.  It tries mmap, but if
331
 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
Rusty Russell's avatar
Rusty Russell committed
332 333
 * it falls back to reading the memory in.
 */
334 335 336 337
static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
{
	void *mapped;
	ssize_t nread;

	/*
	 * We map writable even though some segments are marked read-only.
	 * The kernel really wants to be writable: it patches its own
	 * instructions.
	 *
	 * MAP_PRIVATE means that the page won't be copied until a write is
	 * done to it.  This allows us to share untouched memory between
	 * Guests.
	 */
	mapped = mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
		      MAP_FIXED|MAP_PRIVATE, fd, offset);
	if (mapped == MAP_FAILED) {
		/*
		 * Mapping failed, so read the bytes in instead.  pread does a
		 * seek and a read in one shot: saves a few lines.
		 */
		nread = pread(fd, addr, len, offset);
		if (nread != len)
			err(1, "Reading offset %lu len %lu gave %zi",
			    offset, len, nread);
	}
}

Rusty Russell's avatar
Rusty Russell committed
357 358
/*
 * This routine takes an open vmlinux image, which is in ELF, and maps it into
 * the Guest memory.  ELF = Executable and Linkable Format, which is the format
 * used by all modern binaries on Linux including the kernel.
 *
 * The ELF headers give *two* addresses: a physical address, and a virtual
 * address.  We use the physical address; the Guest will map itself to the
 * virtual address.
 *
 * We return the starting address.
 */
368
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
	unsigned int i;

	/*
	 * Sanity checks on the main ELF header: an x86 executable with a
	 * reasonable number of correctly-sized program headers.
	 *
	 * This has to happen before anything is sized from e_phnum: the
	 * program header array below lives on the stack, so a zero or a huge
	 * count must be rejected first.
	 */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	/*
	 * An ELF executable contains an ELF header and a number of "program"
	 * headers which indicate which parts ("segments") of the program to
	 * load where.  e_phnum is now known to be sane, so this array is at
	 * most 64k.
	 */
	Elf32_Phdr phdr[ehdr->e_phnum];

	/* We read in all the program headers at once: */
	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	/*
	 * Try all the headers: there are usually only three.  A read-only one,
	 * a read-write one, and a "note" section which we don't load.
	 */
	for (i = 0; i < ehdr->e_phnum; i++) {
		/* If this isn't a loadable segment, we ignore it */
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %i: size %i addr %p\n",
			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

		/* We map this section of the file at its physical address. */
		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
		       phdr[i].p_offset, phdr[i].p_filesz);
	}

	/* The entry point is given in the ELF header. */
	return ehdr->e_entry;
}

Rusty Russell's avatar
Rusty Russell committed
416 417 418 419
/*L:150
 * A bzImage, unlike an ELF file, is not meant to be loaded.  You're supposed
 * to jump into it and it will unpack itself.  We used to have to perform some
 * hairy magic because the unpacking code scared me.
420
 *
Rusty Russell's avatar
Rusty Russell committed
421 422
 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
 * a small patch to jump over the tricky bits in the Guest, so now we just read
Rusty Russell's avatar
Rusty Russell committed
423 424
 * the funky header so we know where in the file to load, and away we go!
 */
425
static unsigned long load_bzimage(int fd)
426
{
427
	struct boot_params boot;
Rusty Russell's avatar
Rusty Russell committed
428 429 430 431
	int r;
	/* Modern bzImages get loaded at 1M. */
	void *p = from_guest_phys(0x100000);

Rusty Russell's avatar
Rusty Russell committed
432 433 434 435
	/*
	 * Go back to the start of the file and read the header.  It should be
	 * a Linux boot header (see Documentation/x86/i386/boot.txt)
	 */
Rusty Russell's avatar
Rusty Russell committed
436
	lseek(fd, 0, SEEK_SET);
437
	read(fd, &boot, sizeof(boot));
Rusty Russell's avatar
Rusty Russell committed
438

439 440
	/* Inside the setup_hdr, we expect the magic "HdrS" */
	if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
Rusty Russell's avatar
Rusty Russell committed
441 442
		errx(1, "This doesn't look like a bzImage to me");

443 444
	/* Skip over the extra sectors of the header. */
	lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
Rusty Russell's avatar
Rusty Russell committed
445 446 447 448 449

	/* Now read everything into memory. in nice big chunks. */
	while ((r = read(fd, p, 65536)) > 0)
		p += r;

450 451
	/* Finally, code32_start tells us where to enter the kernel. */
	return boot.hdr.code32_start;
452 453
}

Rusty Russell's avatar
Rusty Russell committed
454 455
/*L:140
 * Loading the kernel is easy when it's a "vmlinux", but most kernels
Rusty Russell's avatar
Rusty Russell committed
456
 * come wrapped up in the self-decompressing "bzImage" format.  With a little
Rusty Russell's avatar
Rusty Russell committed
457 458
 * work, we can load those, too.
 */
459
static unsigned long load_kernel(int fd)
{
	Elf32_Ehdr hdr;
	bool is_elf;

	/* Pull in enough of the file to cover an ELF header. */
	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		err(1, "Reading kernel");

	/* An ELF file starts with the magic "\177ELF". */
	is_elf = (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0);

	/* Anything which isn't ELF is assumed to be a bzImage. */
	return is_elf ? map_elf(fd, &hdr) : load_bzimage(fd);
}

Rusty Russell's avatar
Rusty Russell committed
475 476
/*
 * This is a trivial little helper to align pages.  Andi Kleen hated it because
477 478 479
 * it calls getpagesize() twice: "it's dumb code."
 *
 * Kernel guys get really het up about optimization, even when it's not
Rusty Russell's avatar
Rusty Russell committed
480 481
 * necessary.  I leave this code as a reaction against that.
 */
482 483
static inline unsigned long page_align(unsigned long addr)
{
	/* Push the address up to just short of one more page... */
	unsigned long bumped = addr + getpagesize()-1;

	/* ...then knock off the offset-within-page bits. */
	return bumped & ~(getpagesize()-1);
}

Rusty Russell's avatar
Rusty Russell committed
488 489 490 491 492
/*L:180
 * An "initial ram disk" is a disk image loaded into memory along with the
 * kernel which the kernel can use to boot from without needing any drivers.
 * Most distributions now use this as standard: the initrd contains the code to
 * load the appropriate driver modules for the current machine.
493 494
 *
 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
Rusty Russell's avatar
Rusty Russell committed
495 496
 * kernels.  He sent me this (and tells me when I break it).
 */
497 498 499 500 501 502 503
static unsigned long load_initrd(const char *name, unsigned long mem)
{
	struct stat info;
	unsigned long aligned_len;
	int initrd_fd = open_or_die(name, O_RDONLY);

	/* fstat() is how we find out the size of the file. */
	if (fstat(initrd_fd, &info) < 0)
		err(1, "fstat() on initrd '%s'", name);

	/*
	 * The initrd goes at the top of memory.  mmap wants a page-aligned
	 * address, so the size is rounded up to place it, although only the
	 * real file size is mapped.
	 */
	aligned_len = page_align(info.st_size);
	map_at(initrd_fd, from_guest_phys(mem - aligned_len), 0, info.st_size);

	/* The file descriptor is no longer needed once that is done. */
	close(initrd_fd);
	verbose("mapped initrd %s size=%lu @ %p\n", name, aligned_len,
		(void*)mem-aligned_len);

	/* We return the (rounded-up) initrd size. */
	return aligned_len;
}
Rusty Russell's avatar
Rusty Russell committed
524
/*:*/
525

Rusty Russell's avatar
Rusty Russell committed
526 527 528 529
/*
 * Simple routine to roll all the commandline arguments together with spaces
 * between them.
 */
530 531 532 533 534
static void concat(char *dst, char *args[])
{
	char *out = dst;
	char **arg;

	for (arg = args; *arg; arg++) {
		size_t n = strlen(*arg);

		/* Every argument after the first is preceded by one space. */
		if (arg != args)
			*out++ = ' ';
		memcpy(out, *arg, n);
		out += n;
	}
	/* Terminate; this also covers the case of no arguments at all. */
	*out = '\0';
}

Rusty Russell's avatar
Rusty Russell committed
546 547
/*L:185
 * This is where we actually tell the kernel to initialize the Guest.  We
Rusty Russell's avatar
Rusty Russell committed
548
 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
549
 * the base of Guest "physical" memory, the top physical page to allow and the
Rusty Russell's avatar
Rusty Russell committed
550 551
 * entry point for the Guest.
 */
552
static void tell_kernel(unsigned long start)
553
{
554 555
	unsigned long args[] = { LHREQ_INITIALIZE,
				 (unsigned long)guest_base,
556
				 guest_limit / getpagesize(), start };
557 558
	verbose("Guest: %p - %p (%#lx)\n",
		guest_base, guest_base + guest_limit, guest_limit);
559 560
	lguest_fd = open_or_die("/dev/lguest", O_RDWR);
	if (write(lguest_fd, args, sizeof(args)) < 0)
561 562
		err(1, "Writing to /dev/lguest");
}
563
/*:*/
564

Rusty Russell's avatar
Rusty Russell committed
565
/*L:200
 * Device Handling.
 *
 * When the Guest gives us a buffer, it sends an array of addresses and sizes.
 * We need to make sure it's not trying to reach into the Launcher itself, so
 * we have a convenient routine which checks it and exits with an error message
 * if something funny is going on:
 */
573 574 575
static void *_check_pointer(unsigned long addr, unsigned int size,
			    unsigned int line)
{
Rusty Russell's avatar
Rusty Russell committed
576 577 578 579
	/*
	 * We have to separately check addr and addr+size, because size could
	 * be huge and addr + size might wrap around.
	 */
580
	if (addr >= guest_limit || addr + size >= guest_limit)
581
		errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
Rusty Russell's avatar
Rusty Russell committed
582 583 584 585
	/*
	 * We return a pointer for the caller's convenience, now we know it's
	 * safe to use.
	 */
586
	return from_guest_phys(addr);
587
}
588
/* A macro which transparently hands the line number to the real function. */
589 590
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)

Rusty Russell's avatar
Rusty Russell committed
591 592
/*
 * Each buffer in the virtqueues is actually a chain of descriptors.  This
Rusty Russell's avatar
Rusty Russell committed
593
 * function returns the next descriptor in the chain, or vq->vring.num if we're
Rusty Russell's avatar
Rusty Russell committed
594 595
 * at the end.
 */
596 597
static unsigned next_desc(struct vring_desc *desc,
			  unsigned int i, unsigned int max)
598 599 600 601
{
	unsigned int next;

	/* If this descriptor says it doesn't chain, we're done. */
602 603
	if (!(desc[i].flags & VRING_DESC_F_NEXT))
		return max;
604 605

	/* Check they're not leading us off end of descriptors. */
606
	next = desc[i].next;
607 608 609
	/* Make sure compiler knows to grab that: we don't want it changing! */
	wmb();

610
	if (next >= max)
611 612 613 614 615
		errx(1, "Desc next is %u", next);

	return next;
}

Rusty Russell's avatar
Rusty Russell committed
616 617 618 619
/*
 * This actually sends the interrupt for this virtqueue, if we've used a
 * buffer.
 */
620 621 622 623
static void trigger_irq(struct virtqueue *vq)
{
	unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };

624 625 626 627 628
	/* Don't inform them if nothing used. */
	if (!vq->pending_used)
		return;
	vq->pending_used = 0;

629 630 631 632 633 634 635
	/* If they don't want an interrupt, don't send one... */
	if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
		/* ... unless they've asked us to force one on empty. */
		if (!vq->dev->irq_on_empty
		    || lg_last_avail(vq) != vq->vring.avail->idx)
			return;
	}
636 637 638 639 640 641

	/* Send the Guest an interrupt tell them we used something up. */
	if (write(lguest_fd, buf, sizeof(buf)) != 0)
		err(1, "Triggering irq %i", vq->config.irq);
}

Rusty Russell's avatar
Rusty Russell committed
642
/*
Rusty Russell's avatar
Rusty Russell committed
643
 * This looks in the virtqueue for the first available buffer, and converts
644 645 646 647
 * it to an iovec for convenient access.  Since descriptors consist of some
 * number of output then some number of input descriptors, it's actually two
 * iovecs, but we pack them into one and note how many of each there were.
 *
Rusty Russell's avatar
Rusty Russell committed
648
 * This function waits if necessary, and returns the descriptor number found.
Rusty Russell's avatar
Rusty Russell committed
649
 */
650 651 652
static unsigned wait_for_vq_desc(struct virtqueue *vq,
				 struct iovec iov[],
				 unsigned int *out_num, unsigned int *in_num)
653
{
654 655
	unsigned int i, head, max;
	struct vring_desc *desc;
656 657
	u16 last_avail = lg_last_avail(vq);

Rusty Russell's avatar
Rusty Russell committed
658
	/* There's nothing available? */
659 660 661
	while (last_avail == vq->vring.avail->idx) {
		u64 event;

Rusty Russell's avatar
Rusty Russell committed
662 663 664 665
		/*
		 * Since we're about to sleep, now is a good time to tell the
		 * Guest about what we've used up to now.
		 */
666 667
		trigger_irq(vq);

668 669 670
		/* OK, now we need to know about added descriptors. */
		vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;

Rusty Russell's avatar
Rusty Russell committed
671 672 673 674
		/*
		 * They could have slipped one in as we were doing that: make
		 * sure it's written, then check again.
		 */
675 676 677 678 679 680
		mb();
		if (last_avail != vq->vring.avail->idx) {
			vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
			break;
		}

681 682 683
		/* Nothing new?  Wait for eventfd to tell us they refilled. */
		if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
			errx(1, "Event read failed?");
684 685 686

		/* We don't need to be notified again. */
		vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
687
	}
688 689

	/* Check it isn't doing very strange things with descriptor numbers. */
690
	if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
691
		errx(1, "Guest moved used index from %u to %u",
692
		     last_avail, vq->vring.avail->idx);
693

Rusty Russell's avatar
Rusty Russell committed
694 695 696 697
	/*
	 * Grab the next descriptor number they're advertising, and increment
	 * the index we've seen.
	 */
698 699
	head = vq->vring.avail->ring[last_avail % vq->vring.num];
	lg_last_avail(vq)++;
700 701 702 703 704 705 706 707

	/* If their number is silly, that's a fatal mistake. */
	if (head >= vq->vring.num)
		errx(1, "Guest says index %u is available", head);

	/* When we start there are none of either input nor output. */
	*out_num = *in_num = 0;

708 709
	max = vq->vring.num;
	desc = vq->vring.desc;
710
	i = head;
711

Rusty Russell's avatar
Rusty Russell committed
712 713 714 715
	/*
	 * If this is an indirect entry, then this buffer contains a descriptor
	 * table which we handle as if it's any normal descriptor chain.
	 */
716 717 718 719 720 721 722 723 724
	if (desc[i].flags & VRING_DESC_F_INDIRECT) {
		if (desc[i].len % sizeof(struct vring_desc))
			errx(1, "Invalid size for indirect buffer table");

		max = desc[i].len / sizeof(struct vring_desc);
		desc = check_pointer(desc[i].addr, desc[i].len);
		i = 0;
	}

725 726
	do {
		/* Grab the first descriptor, and check it's OK. */
727
		iov[*out_num + *in_num].iov_len = desc[i].len;
728
		iov[*out_num + *in_num].iov_base
729
			= check_pointer(desc[i].addr, desc[i].len);
730
		/* If this is an input descriptor, increment that count. */
731
		if (desc[i].flags & VRING_DESC_F_WRITE)
732 733
			(*in_num)++;
		else {
Rusty Russell's avatar
Rusty Russell committed
734 735 736 737
			/*
			 * If it's an output descriptor, they're all supposed
			 * to come before any input descriptors.
			 */
738 739 740 741 742 743
			if (*in_num)
				errx(1, "Descriptor has out after in");
			(*out_num)++;
		}

		/* If we've got too many, that implies a descriptor loop. */
744
		if (*out_num + *in_num > max)
745
			errx(1, "Looped descriptor");
746
	} while ((i = next_desc(desc, i, max)) != max);
747

748
	return head;
749 750
}

Rusty Russell's avatar
Rusty Russell committed
751
/*
Rusty Russell's avatar
Rusty Russell committed
752 753 754
 * After we've used one of their buffers, we tell the Guest about it.  Sometime
 * later we'll want to send them an interrupt using trigger_irq(); note that
 * wait_for_vq_desc() does that for us if it has to wait.
Rusty Russell's avatar
Rusty Russell committed
755
 */
756
static void add_used(struct virtqueue *vq, unsigned int head, int len)
757
{
758 759
	struct vring_used_elem *used;

Rusty Russell's avatar
Rusty Russell committed
760 761 762 763
	/*
	 * The virtqueue contains a ring of used buffers.  Get a pointer to the
	 * next entry in that used ring.
	 */
764 765 766 767 768 769
	used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
	used->id = head;
	used->len = len;
	/* Make sure buffer is written before we update index. */
	wmb();
	vq->vring.used->idx++;
770
	vq->pending_used++;
771 772
}

773
/* The combo meal deal: record the used buffer, then call trigger_irq(). */
static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
{
	/* First mark the buffer as used... */
	add_used(vq, head, len);

	/* ...then let trigger_irq() decide whether the Guest gets an irq. */
	trigger_irq(vq);
}

Rusty Russell's avatar
Rusty Russell committed
780 781 782
/*
 * The Console
 *
Rusty Russell's avatar
Rusty Russell committed
783 784
 * We associate some data with the console for our exit hack.
 */
785
struct console_abort {
	/* Number of consecutive ^C presses seen so far. */
	int count;
	/* Time at which the first of them arrived. */
	struct timeval start;
};

792
/* This is the routine which handles console input (ie. stdin). */
793
static void console_input(struct virtqueue *vq)
794 795
{
	int len;
796
	unsigned int head, in_num, out_num;
797 798
	struct console_abort *abort = vq->dev->priv;
	struct iovec iov[vq->vring.num];
799

Rusty Russell's avatar
Rusty Russell committed
800
	/* Make sure there's a descriptor available. */
801
	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
802
	if (out_num)
803
		errx(1, "Output buffers in console in queue?");
804

Rusty Russell's avatar
Rusty Russell committed
805
	/* Read into it.  This is where we usually wait. */
806
	len = readv(STDIN_FILENO, iov, in_num);
807
	if (len <= 0) {
808
		/* Ran out of input? */
809
		warnx("Failed to get console input, ignoring console.");
Rusty Russell's avatar
Rusty Russell committed
810 811 812 813
		/*
		 * For simplicity, dying threads kill the whole Launcher.  So
		 * just nap here.
		 */
814 815
		for (;;)
			pause();
816 817
	}

Rusty Russell's avatar
Rusty Russell committed
818
	/* Tell the Guest we used a buffer. */
819
	add_used_and_trigger(vq, head, len);
820

Rusty Russell's avatar
Rusty Russell committed
821 822
	/*
	 * Three ^C within one second?  Exit.
823
	 *
824 825 826
	 * This is such a hack, but works surprisingly well.  Each ^C has to
	 * be in a buffer by itself, so they can't be too fast.  But we check
	 * that we get three within about a second, so they can't be too
Rusty Russell's avatar
Rusty Russell committed
827 828
	 * slow.
	 */
829
	if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
830
		abort->count = 0;
831 832
		return;
	}
833

834 835 836 837 838 839 840 841 842 843 844
	abort->count++;
	if (abort->count == 1)
		gettimeofday(&abort->start, NULL);
	else if (abort->count == 3) {
		struct timeval now;
		gettimeofday(&now, NULL);
		/* Kill all Launcher processes with SIGINT, like normal ^C */
		if (now.tv_sec <= abort->start.tv_sec+1)
			kill(0, SIGINT);
		abort->count = 0;
	}
845 846
}

847 848
/* This is the routine which handles console output (ie. stdout). */
static void console_output(struct virtqueue *vq)
849
{
850 851 852
	unsigned int head, out, in;
	struct iovec iov[vq->vring.num];

Rusty Russell's avatar
Rusty Russell committed
853
	/* We usually wait in here, for the Guest to give us something. */
854 855 856
	head = wait_for_vq_desc(vq, iov, &out, &in);
	if (in)
		errx(1, "Input buffers in console output queue?");
Rusty Russell's avatar
Rusty Russell committed
857 858

	/* writev can return a partial write, so we loop here. */
859 860 861 862 863
	while (!iov_empty(iov, out)) {
		int len = writev(STDOUT_FILENO, iov, out);
		if (len <= 0)
			err(1, "Write to stdout gave %i", len);
		iov_consume(iov, out, len);
864
	}
Rusty Russell's avatar
Rusty Russell committed
865 866 867 868 869

	/*
	 * We're finished with that buffer: if we're going to sleep,
	 * wait_for_vq_desc() will prod the Guest with an interrupt.
	 */
870
	add_used(vq, head, 0);
871 872
}

Rusty Russell's avatar
Rusty Russell committed
873 874 875 876
/*
 * The Network
 *
 * Handling output for network is also simple: we get all the output buffers
 * and write them to /dev/net/tun.
 */
879 880 881 882 883
/* Private state for a network device; the service routines find it via dev->priv. */
struct net_info {
	/* File descriptor for the tun device which packets are read from and written to. */
	int tunfd;
};

static void net_output(struct virtqueue *vq)
884
{
885 886
	struct net_info *net_info = vq->dev->priv;
	unsigned int head, out, in;
887
	struct iovec iov[vq->vring.num];
888

Rusty Russell's avatar
Rusty Russell committed
889
	/* We usually wait in here for the Guest to give us a packet. */
890 891 892
	head = wait_for_vq_desc(vq, iov, &out, &in);
	if (in)
		errx(1, "Input buffers in net output queue?");
Rusty Russell's avatar
Rusty Russell committed
893 894 895 896
	/*
	 * Send the whole thing through to /dev/net/tun.  It expects the exact
	 * same format: what a coincidence!
	 */
897 898
	if (writev(net_info->tunfd, iov, out) < 0)
		errx(1, "Write to tun failed?");
Rusty Russell's avatar
Rusty Russell committed
899 900 901 902 903

	/*
	 * Done with that one; wait_for_vq_desc() will send the interrupt if
	 * all packets are processed.
	 */
904
	add_used(vq, head, 0);
905 906
}

Rusty Russell's avatar
Rusty Russell committed
907 908 909 910 911 912
/*
 * Handling network input is a bit trickier, because I've tried to optimize it.
 *
 * First we have a helper routine which tells us if reading from this file
 * descriptor (ie. the /dev/net/tun device) will block:
 */
913 914 915 916 917 918 919 920 921
static bool will_block(int fd)
{
	/* A zeroed timeout makes select() return immediately: a pure poll. */
	struct timeval no_wait = { .tv_sec = 0, .tv_usec = 0 };
	fd_set readable;
	int ready;

	FD_ZERO(&readable);
	FD_SET(fd, &readable);
	ready = select(fd + 1, &readable, NULL, NULL, &no_wait);

	/* Exactly one ready descriptor means a read would not block. */
	if (ready == 1)
		return false;
	return true;
}

Rusty Russell's avatar
Rusty Russell committed
922 923 924 925 926
/*
 * This handles packets coming in from the tun device to our Guest.  Like all
 * service routines, it gets called again as soon as it returns, so you don't
 * see a while(1) loop here.
 */
927
static void net_input(struct virtqueue *vq)
928 929
{
	int len;
930 931 932 933
	unsigned int head, out, in;
	struct iovec iov[vq->vring.num];
	struct net_info *net_info = vq->dev->priv;

Rusty Russell's avatar
Rusty Russell committed
934 935 936 937
	/*
	 * Get a descriptor to write an incoming packet into.  This will also
	 * send an interrupt if they're out of descriptors.
	 */
938 939 940
	head = wait_for_vq_desc(vq, iov, &out, &in);
	if (out)
		errx(1, "Output buffers in net input queue?");
941

Rusty Russell's avatar
Rusty Russell committed
942 943 944 945
	/*
	 * If it looks like we'll block reading from the tun device, send them
	 * an interrupt.
	 */
946 947 948
	if (vq->pending_used && will_block(net_info->tunfd))
		trigger_irq(vq);

Rusty Russell's avatar
Rusty Russell committed
949 950 951 952
	/*
	 * Read in the packet.  This is where we normally wait (when there's no
	 * incoming network traffic).
	 */
953
	len = readv(net_info->tunfd, iov, in);
954
	if (len <= 0)
955
		err(1, "Failed to read from tun.");
Rusty Russell's avatar
Rusty Russell committed
956 957 958 959 960

	/*
	 * Mark that packet buffer as used, but don't interrupt here.  We want
	 * to wait until we've done as much work as we can.
	 */
961
	add_used(vq, head, len);
962
}
Rusty Russell's avatar
Rusty Russell committed
963
/*:*/
964

Rusty Russell's avatar
Rusty Russell committed
965
/* This is the helper to create threads: run the service routine in a loop. */
966 967 968
static int do_thread(void *_vq)
{
	struct virtqueue *vq = _vq;
969

970 971 972 973
	for (;;)
		vq->service(vq);
	return 0;
}
974

Rusty Russell's avatar
Rusty Russell committed
975 976 977 978
/*
 * Signal handler for a dying child: take down our entire process group with
 * SIGTERM.  As a side effect, the shell restores the console for us!
 * The signal number we were invoked with is not used.
 */
static void kill_launcher(int signal)
{
	/* A pid of 0 addresses every process in our own process group. */
	const pid_t our_group = 0;

	kill(our_group, SIGTERM);
}

984
static void reset_device(struct device *dev)
985
{
986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010
	struct virtqueue *vq;

	verbose("Resetting device %s\n", dev->name);

	/* Clear any features they've acked. */
	memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);

	/* We're going to be explicitly killing threads, so ignore them. */
	signal(SIGCHLD, SIG_IGN);

	/* Zero out the virtqueues, get rid of their threads */
	for (vq = dev->vq; vq; vq = vq->next) {
		if (vq->thread != (pid_t)-1) {
			kill(vq->thread, SIGTERM);
			waitpid(vq->thread, NULL, 0);
			vq->thread = (pid_t)-1;
		}
		memset(vq->vring.desc, 0,
		       vring_size(vq->config.num, LGUEST_VRING_ALIGN));
		lg_last_avail(vq) = 0;
	}
	dev->running = false;

	/* Now we care if threads die. */
	signal(SIGCHLD, (void *)kill_launcher);
1011 1012
}

Rusty Russell's avatar
Rusty Russell committed
1013 1014 1015
/*L:216
 * This actually creates the thread which services the virtqueue for a device.
 */
1016
static void create_thread(struct virtqueue *vq)
Rusty Russell's avatar