/*P:100 This is the Launcher code, a simple program which lays out the
 * "physical" memory for the new Guest by mapping the kernel image and
 * the virtual devices, then opens /dev/lguest to tell the kernel
 * about the Guest and control it. :*/
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <err.h>
#include <stdint.h>
#include <stdlib.h>
#include <elf.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdbool.h>
#include <errno.h>
#include <ctype.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <time.h>
#include <netinet/in.h>
#include <net/if.h>
#include <linux/sockios.h>
#include <linux/if_tun.h>
#include <sys/uio.h>
#include <termios.h>
#include <getopt.h>
#include <zlib.h>
#include <assert.h>
#include <sched.h>
#include <limits.h>
#include <stddef.h>
#include <signal.h>
#include "linux/lguest_launcher.h"
#include "linux/virtio_config.h"
#include "linux/virtio_net.h"
#include "linux/virtio_blk.h"
#include "linux/virtio_console.h"
#include "linux/virtio_rng.h"
#include "linux/virtio_ring.h"
#include "asm/bootparam.h"
/*L:110 We can ignore the 39 include files we need for this program, but I do
 * want to draw attention to the use of kernel-style types.
 *
 * As Linus said, "C is a Spartan language, and so should your naming be."  I
 * like these abbreviations, so we define them here.  Note that u64 is always
 * unsigned long long, which works on all Linux systems: this means that we can
 * use %llu in printf for any u64. */
typedef unsigned long long u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
/*:*/

#define PAGE_PRESENT 0x7 	/* Present, RW, User */
#define BRIDGE_PFX "bridge:"
#ifndef SIOCBRADDIF
#define SIOCBRADDIF	0x89a2		/* add interface to bridge      */
#endif
/* We can have up to 256 pages for devices. */
#define DEVICE_PAGES 256

/* This will occupy 3 pages: it must be a power of 2. */
#define VIRTQUEUE_NUM 256

/*L:120 verbose is both a global flag and a macro.  The C preprocessor allows
 * this, and although I wouldn't recommend it, it works quite nicely here. */
static bool verbose;
#define verbose(args...) \
	do { if (verbose) printf(args); } while(0)
/*:*/

/* The pointer to the start of guest memory. */
static void *guest_base;
/* The maximum guest physical address allowed, and maximum possible. */
static unsigned long guest_limit, guest_max;

/* The /dev/lguest file descriptor. */
static int lguest_fd;

/* a per-cpu variable indicating whose vcpu is currently running */
static unsigned int __thread cpu_id;

/* This is our list of devices. */
struct device_list
{
	/* Counter to assign interrupt numbers. */
	unsigned int next_irq;

	/* Counter to print out convenient device numbers. */
	unsigned int device_num;

	/* The descriptor page for the devices. */
	u8 *descpage;

	/* A single linked list of devices. */
	struct device *dev;

	/* And a pointer to the last device for easy append and also for
	 * configuration appending. */
	struct device *lastdev;
};

/* The list of Guest devices, based on command line arguments. */
static struct device_list devices;

/* The device structure describes a single device. */
struct device
{
	/* The linked-list pointer. */
	struct device *next;

	/* The device's descriptor, as mapped into the Guest. */
	struct lguest_device_desc *desc;

	/* We can't trust desc values once Guest has booted: we use these. */
	unsigned int feature_len;
	unsigned int num_vq;

	/* The name of this device, for --verbose. */
	const char *name;

	/* Any queues attached to this device */
	struct virtqueue *vq;

	/* Is it operational */
	bool running;

	/* Device-specific data. */
	void *priv;
};

/* The virtqueue structure describes a queue attached to a device. */
struct virtqueue
{
	struct virtqueue *next;

	/* Which device owns me. */
	struct device *dev;

	/* The configuration for this queue. */
	struct lguest_vqconfig config;

	/* The actual ring of buffers. */
	struct vring vring;

	/* Last available index we saw. */
	u16 last_avail_idx;

	/* How many are used since we sent last irq? */
	unsigned int pending_used;

	/* Eventfd where Guest notifications arrive. */
	int eventfd;

	/* Function for the thread which is servicing this virtqueue. */
	void (*service)(struct virtqueue *vq);
	pid_t thread;
};

/* Remember the arguments to the program so we can "reboot" */
static char **main_args;

/* The original tty settings to restore on exit. */
static struct termios orig_term;

/* We have to be careful with barriers: our devices are all run in separate
 * threads and so we need to make sure that changes visible to the Guest happen
 * in precise order. */
#define wmb() __asm__ __volatile__("" : : : "memory")
#define mb() __asm__ __volatile__("" : : : "memory")

/* Convert an iovec element to the given type.
 *
 * This is a fairly ugly trick: we need to know the size of the type and
 * alignment requirement to check the pointer is kosher.  It's also nice to
 * have the name of the type in case we report failure.
 *
 * Typing those three things all the time is cumbersome and error prone, so we
 * have a macro which sets them all up and passes to the real function. */
#define convert(iov, type) \
	((type *)_convert((iov), sizeof(type), __alignof__(type), #type))

static void *_convert(struct iovec *iov, size_t size, size_t align,
		      const char *name)
{
	if (iov->iov_len != size)
		errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
	if ((unsigned long)iov->iov_base % align != 0)
		errx(1, "Bad alignment %p for %s", iov->iov_base, name);
	return iov->iov_base;
}

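/* Purely as an illustration (not code the Launcher runs here): a device
 * service routine which has already gathered descriptors into an iovec
 * might use convert() on the first output element like so, where the
 * struct is just a plausible example type:
 *
 *	struct virtio_blk_outhdr *out;
 *	out = convert(&iov[0], struct virtio_blk_outhdr);
 *
 * If the element's size or alignment doesn't match the type, we exit
 * rather than trust the Guest's layout. */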
/* Wrapper for the last available index.  Makes it easier to change. */
#define lg_last_avail(vq)	((vq)->last_avail_idx)

/* The virtio configuration space is defined to be little-endian.  x86 is
 * little-endian too, but it's nice to be explicit so we have these helpers. */
#define cpu_to_le16(v16) (v16)
#define cpu_to_le32(v32) (v32)
#define cpu_to_le64(v64) (v64)
#define le16_to_cpu(v16) (v16)
#define le32_to_cpu(v32) (v32)
#define le64_to_cpu(v64) (v64)

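/* An aside, offered tentatively: if this Launcher were ever built for a
 * big-endian host, these would have to really swap bytes, e.g. something
 * like "#define cpu_to_le16(v16) (htole16(v16))" using the <endian.h>
 * helpers.  On little-endian x86 the identity definitions above suffice. */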
/* Is this iovec empty? */
static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
{
	unsigned int i;

	for (i = 0; i < num_iov; i++)
		if (iov[i].iov_len)
			return false;
	return true;
}

/* Take len bytes from the front of this iovec. */
static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
{
	unsigned int i;

	for (i = 0; i < num_iov; i++) {
		unsigned int used;

		used = iov[i].iov_len < len ? iov[i].iov_len : len;
		iov[i].iov_base += used;
		iov[i].iov_len -= used;
		len -= used;
	}
	assert(len == 0);
}

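/* A worked example of iov_consume() with made-up sizes: given elements of
 * 10 and 20 bytes and len = 15, the first element is used up entirely
 * (iov_base advances by 10, iov_len drops to 0) and the second loses 5
 * bytes from its front, leaving 15.  The assert() catches a caller trying
 * to consume more than the iovec actually holds. */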
/* The device virtqueue descriptors are followed by feature bitmasks. */
static u8 *get_feature_bits(struct device *dev)
{
	return (u8 *)(dev->desc + 1)
		+ dev->num_vq * sizeof(struct lguest_vqconfig);
}

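/* To make that pointer arithmetic concrete, the per-device layout in the
 * descriptor page, as this file uses it, is:
 *
 *	struct lguest_device_desc
 *	struct lguest_vqconfig (one per virtqueue)
 *	u8 features[feature_len]	(what we offer)
 *	u8 features[feature_len]	(what the Guest acked)
 *	u8 config[config_len]
 *
 * so get_feature_bits() lands on the first feature array. */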
/*L:100 The Launcher code itself takes us out into userspace, that scary place
 * where pointers run wild and free!  Unfortunately, like most userspace
 * programs, it's quite boring (which is why everyone likes to hack on the
 * kernel!).  Perhaps if you make up an Lguest Drinking Game at this point, it
 * will get you through this section.  Or, maybe not.
 *
 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
 * memory and stores it in "guest_base".  In other words, Guest physical ==
 * Launcher virtual with an offset.
 *
 * This can be tough to get your head around, but usually it just means that we
 * use these trivial conversion functions when the Guest gives us its
 * "physical" addresses: */
static void *from_guest_phys(unsigned long addr)
{
	return guest_base + addr;
}

static unsigned long to_guest_phys(const void *addr)
{
	return (addr - guest_base);
}

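/* For example (made-up numbers): if mmap handed us guest_base ==
 * 0x7f0000000000, then Guest "physical" address 0x100000 is Launcher
 * virtual address 0x7f0000100000, and to_guest_phys() simply subtracts
 * the base again. */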
/*L:130
 * Loading the Kernel.
 *
 * We start with a couple of simple helper routines.  open_or_die() avoids
 * error-checking code cluttering the callers: */
static int open_or_die(const char *name, int flags)
{
	int fd = open(name, flags);
	if (fd < 0)
		err(1, "Failed to open %s", name);
	return fd;
}

/* map_zeroed_pages() takes a number of pages. */
static void *map_zeroed_pages(unsigned int num)
{
	int fd = open_or_die("/dev/zero", O_RDONLY);
	void *addr;

	/* We use a private mapping (ie. if we write to the page, it will be
	 * copied). */
	addr = mmap(NULL, getpagesize() * num,
		    PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
	if (addr == MAP_FAILED)
		err(1, "Mmaping %u pages of /dev/zero", num);
	close(fd);

	return addr;
}

/* Get some more pages for a device. */
static void *get_pages(unsigned int num)
{
	void *addr = from_guest_phys(guest_limit);

	guest_limit += num * getpagesize();
	if (guest_limit > guest_max)
		errx(1, "Not enough memory for devices");
	return addr;
}

/* This routine is used to load the kernel or initrd.  It tries mmap, but if
 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
 * it falls back to reading the memory in. */
static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
{
	ssize_t r;

	/* We map writable even though some of the segments are marked read-only.
	 * The kernel really wants to be writable: it patches its own
	 * instructions.
	 *
	 * MAP_PRIVATE means that the page won't be copied until a write is
	 * done to it.  This allows us to share untouched memory between
	 * Guests. */
	if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
		 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
		return;

	/* pread does a seek and a read in one shot: saves a few lines. */
	r = pread(fd, addr, len, offset);
	if (r != len)
		err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
}

/* This routine takes an open vmlinux image, which is in ELF, and maps it into
 * the Guest memory.  ELF = Executable and Linkable Format, the format used
 * by all modern binaries on Linux including the kernel.
 *
 * The ELF headers give *two* addresses: a physical address, and a virtual
 * address.  We use the physical address; the Guest will map itself to the
 * virtual address.
 *
 * We return the starting address. */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
	Elf32_Phdr phdr[ehdr->e_phnum];
	unsigned int i;

	/* Sanity checks on the main ELF header: an x86 executable with a
	 * reasonable number of correctly-sized program headers. */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	/* An ELF executable contains an ELF header and a number of "program"
	 * headers which indicate which parts ("segments") of the program to
	 * load where. */

	/* We read in all the program headers at once: */
	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	/* Try all the headers: there are usually only three.  A read-only one,
	 * a read-write one, and a "note" section which we don't load. */
	for (i = 0; i < ehdr->e_phnum; i++) {
		/* If this isn't a loadable segment, we ignore it */
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %i: size %i addr %p\n",
			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

		/* We map this section of the file at its physical address. */
		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
		       phdr[i].p_offset, phdr[i].p_filesz);
	}

	/* The entry point is given in the ELF header. */
	return ehdr->e_entry;
}
/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded.  You're
 * supposed to jump into it and it will unpack itself.  We used to have to
 * perform some hairy magic because the unpacking code scared me.
 *
 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
 * a small patch to jump over the tricky bits in the Guest, so now we just read
 * the funky header so we know where in the file to load, and away we go! */
static unsigned long load_bzimage(int fd)
{
	struct boot_params boot;
	int r;
	/* Modern bzImages get loaded at 1M. */
	void *p = from_guest_phys(0x100000);

	/* Go back to the start of the file and read the header.  It should be
	 * a Linux boot header (see Documentation/x86/i386/boot.txt) */
	lseek(fd, 0, SEEK_SET);
	read(fd, &boot, sizeof(boot));

	/* Inside the setup_hdr, we expect the magic "HdrS" */
	if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
		errx(1, "This doesn't look like a bzImage to me");

	/* Skip over the extra sectors of the header. */
	lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);

	/* Now read everything into memory, in nice big chunks. */
	while ((r = read(fd, p, 65536)) > 0)
		p += r;

	/* Finally, code32_start tells us where to enter the kernel. */
	return boot.hdr.code32_start;
}
/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
 * come wrapped up in the self-decompressing "bzImage" format.  With a little
 * work, we can load those, too. */
static unsigned long load_kernel(int fd)
{
	Elf32_Ehdr hdr;

	/* Read in the first few bytes. */
	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		err(1, "Reading kernel");

	/* If it's an ELF file, it starts with "\177ELF" */
	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
		return map_elf(fd, &hdr);

	/* Otherwise we assume it's a bzImage, and try to load it. */
	return load_bzimage(fd);
}

/* This is a trivial little helper to align pages.  Andi Kleen hated it because
 * it calls getpagesize() twice: "it's dumb code."
 *
 * Kernel guys get really het up about optimization, even when it's not
 * necessary.  I leave this code as a reaction against that. */
static inline unsigned long page_align(unsigned long addr)
{
	/* Add upwards and truncate downwards. */
	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
}

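/* For example, with 4096-byte pages page_align(0x1234) == 0x2000 and
 * page_align(0x2000) == 0x2000: adding PAGE_SIZE-1 and masking off the
 * low bits rounds up without disturbing already-aligned addresses. */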
/*L:180 An "initial ram disk" is a disk image loaded into memory along with
 * the kernel which the kernel can use to boot from without needing any
 * drivers.  Most distributions now use this as standard: the initrd contains
 * the code to load the appropriate driver modules for the current machine.
 *
 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
 * kernels.  He sent me this (and tells me when I break it). */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
	int ifd;
	struct stat st;
	unsigned long len;

	ifd = open_or_die(name, O_RDONLY);
	/* fstat() is needed to get the file size. */
	if (fstat(ifd, &st) < 0)
		err(1, "fstat() on initrd '%s'", name);

	/* We map the initrd at the top of memory, but mmap wants it to be
	 * page-aligned, so we round the size up for that. */
	len = page_align(st.st_size);
	map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
	/* Once a file is mapped, you can close the file descriptor.  It's a
	 * little odd, but quite useful. */
	close(ifd);
	verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);

	/* We return the initrd size. */
	return len;
}
/*:*/

/* Simple routine to roll all the commandline arguments together with spaces
 * between them. */
static void concat(char *dst, char *args[])
{
	unsigned int i, len = 0;

	for (i = 0; args[i]; i++) {
		if (i) {
			strcat(dst+len, " ");
			len++;
		}
		strcpy(dst+len, args[i]);
		len += strlen(args[i]);
	}
	/* In case it's empty. */
	dst[len] = '\0';
}

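/* For example (hypothetical arguments): { "root=/dev/vda", "console=hvc0",
 * NULL } becomes the single string "root=/dev/vda console=hvc0", suitable
 * for handing to the Guest as its kernel command line. */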
/*L:185 This is where we actually tell the kernel to initialize the Guest.  We
 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
 * the base of Guest "physical" memory, the top physical page to allow and the
 * entry point for the Guest. */
static void tell_kernel(unsigned long start)
{
	unsigned long args[] = { LHREQ_INITIALIZE,
				 (unsigned long)guest_base,
				 guest_limit / getpagesize(), start };
	verbose("Guest: %p - %p (%#lx)\n",
		guest_base, guest_base + guest_limit, guest_limit);
	lguest_fd = open_or_die("/dev/lguest", O_RDWR);
	if (write(lguest_fd, args, sizeof(args)) < 0)
		err(1, "Writing to /dev/lguest");
}
/*:*/

/*
 * Device Handling.
 *
 * When the Guest gives us a buffer, it sends an array of addresses and sizes.
 * We need to make sure it's not trying to reach into the Launcher itself, so
 * we have a convenient routine which checks it and exits with an error message
 * if something funny is going on:
 */
static void *_check_pointer(unsigned long addr, unsigned int size,
			    unsigned int line)
{
	/* We have to separately check addr and addr+size, because size could
	 * be huge and addr + size might wrap around. */
	if (addr >= guest_limit || addr + size >= guest_limit)
		errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
	/* We return a pointer for the caller's convenience, now we know it's
	 * safe to use. */
	return from_guest_phys(addr);
}
/* A macro which transparently hands the line number to the real function. */
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)

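/* Why we check addr as well as addr + size, with made-up numbers: if
 * guest_limit is 0x4000000 and the Guest hands us addr = ~0UL with a small
 * size, addr + size wraps around to a tiny value which would pass the
 * second test on its own; the first test catches the bogus addr. */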
/* Each buffer in the virtqueues is actually a chain of descriptors.  This
 * function returns the next descriptor in the chain, or vq->vring.num if we're
 * at the end. */
static unsigned next_desc(struct virtqueue *vq, unsigned int i)
{
	unsigned int next;

	/* If this descriptor says it doesn't chain, we're done. */
	if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
		return vq->vring.num;

	/* Check they're not leading us off end of descriptors. */
	next = vq->vring.desc[i].next;
	/* Make sure compiler knows to grab that: we don't want it changing! */
	wmb();

	if (next >= vq->vring.num)
		errx(1, "Desc next is %u", next);

	return next;
}

/* This actually sends the interrupt for this virtqueue */
static void trigger_irq(struct virtqueue *vq)
{
	unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };

	/* Don't inform them if nothing used. */
	if (!vq->pending_used)
		return;
	vq->pending_used = 0;

	/* If they don't want an interrupt, don't send one, unless empty. */
	if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
	    && lg_last_avail(vq) != vq->vring.avail->idx)
		return;

	/* Send the Guest an interrupt to tell them we used something up. */
	if (write(lguest_fd, buf, sizeof(buf)) != 0)
		err(1, "Triggering irq %i", vq->config.irq);
}

/* This looks in the virtqueue for the first available buffer, and converts
 * it to an iovec for convenient access.  Since descriptors consist of some
 * number of output then some number of input descriptors, it's actually two
 * iovecs, but we pack them into one and note how many of each there were.
 *
 * This function returns the descriptor number found. */
static unsigned wait_for_vq_desc(struct virtqueue *vq,
				 struct iovec iov[],
				 unsigned int *out_num, unsigned int *in_num)
{
	unsigned int i, head;
	u16 last_avail = lg_last_avail(vq);

	while (last_avail == vq->vring.avail->idx) {
		u64 event;

		/* OK, tell Guest about progress up to now. */
		trigger_irq(vq);

		/* OK, now we need to know about added descriptors. */
		vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;

		/* They could have slipped one in as we were doing that: make
		 * sure it's written, then check again. */
		mb();
		if (last_avail != vq->vring.avail->idx) {
			vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
			break;
		}

		/* Nothing new?  Wait for eventfd to tell us they refilled. */
		if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
			errx(1, "Event read failed?");

		/* We don't need to be notified again. */
		vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
	}

	/* Check it isn't doing very strange things with descriptor numbers. */
	if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
		errx(1, "Guest moved used index from %u to %u",
		     last_avail, vq->vring.avail->idx);

	/* Grab the next descriptor number they're advertising, and increment
	 * the index we've seen. */
	head = vq->vring.avail->ring[last_avail % vq->vring.num];
	lg_last_avail(vq)++;

	/* If their number is silly, that's a fatal mistake. */
	if (head >= vq->vring.num)
		errx(1, "Guest says index %u is available", head);

	/* When we start there are none of either input nor output. */
	*out_num = *in_num = 0;

	i = head;
	do {
		/* Grab the first descriptor, and check it's OK. */
		iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
		iov[*out_num + *in_num].iov_base
			= check_pointer(vq->vring.desc[i].addr,
					vq->vring.desc[i].len);
		/* If this is an input descriptor, increment that count. */
		if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
			(*in_num)++;
		else {
			/* If it's an output descriptor, they're all supposed
			 * to come before any input descriptors. */
			if (*in_num)
				errx(1, "Descriptor has out after in");
			(*out_num)++;
		}

		/* If we've got too many, that implies a descriptor loop. */
		if (*out_num + *in_num > vq->vring.num)
			errx(1, "Looped descriptor");
	} while ((i = next_desc(vq, i)) != vq->vring.num);

	return head;
}

/* After we've used one of their buffers, we tell them about it.  We'll then
 * want to send them an interrupt, using trigger_irq(). */
static void add_used(struct virtqueue *vq, unsigned int head, int len)
{
	struct vring_used_elem *used;

	/* The virtqueue contains a ring of used buffers.  Get a pointer to the
	 * next entry in that used ring. */
	used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
	used->id = head;
	used->len = len;
	/* Make sure buffer is written before we update index. */
	wmb();
	vq->vring.used->idx++;
	vq->pending_used++;
}

/* And here's the combo meal deal.  Supersize me! */
static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
{
	add_used(vq, head, len);
	trigger_irq(vq);
}

/*
 * The Console
 *
 * We associate some data with the console for our exit hack. */
struct console_abort
{
	/* How many times have they hit ^C? */
	int count;
	/* When did they start? */
	struct timeval start;
};

/* This is the routine which handles console input (ie. stdin). */
static void console_input(struct virtqueue *vq)
{
	int len;
	unsigned int head, in_num, out_num;
	struct console_abort *abort = vq->dev->priv;
	struct iovec iov[vq->vring.num];

	/* Make sure there's a descriptor waiting. */
	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
	if (out_num)
		errx(1, "Output buffers in console in queue?");

	/* Read it in. */
	len = readv(STDIN_FILENO, iov, in_num);
	if (len <= 0) {
		/* Ran out of input? */
		warnx("Failed to get console input, ignoring console.");
		/* For simplicity, dying threads kill the whole Launcher.  So
		 * just nap here. */
		for (;;)
			pause();
	}

	add_used_and_trigger(vq, head, len);

	/* Three ^C within one second?  Exit.
	 *
	 * This is such a hack, but works surprisingly well.  Each ^C has to
	 * be in a buffer by itself, so they can't be too fast.  But we check
	 * that we get three within about a second, so they can't be too
	 * slow. */
	if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
		abort->count = 0;
		return;
	}

	abort->count++;
	if (abort->count == 1)
		gettimeofday(&abort->start, NULL);
	else if (abort->count == 3) {
		struct timeval now;
		gettimeofday(&now, NULL);
		/* Kill all Launcher processes with SIGINT, like normal ^C */
		if (now.tv_sec <= abort->start.tv_sec+1)
			kill(0, SIGINT);
		abort->count = 0;
	}
}

/* This is the routine which handles console output (ie. stdout). */
static void console_output(struct virtqueue *vq)
{
	unsigned int head, out, in;
	struct iovec iov[vq->vring.num];

	head = wait_for_vq_desc(vq, iov, &out, &in);
	if (in)
		errx(1, "Input buffers in console output queue?");
	while (!iov_empty(iov, out)) {
		int len = writev(STDOUT_FILENO, iov, out);
		if (len <= 0)
			err(1, "Write to stdout gave %i", len);
		iov_consume(iov, out, len);
	}
	add_used(vq, head, 0);
}

/*
 * The Network
 *
 * Handling output for network is also simple: we get all the output buffers
 * and write them to /dev/net/tun.
 */
struct net_info {
	int tunfd;
};

static void net_output(struct virtqueue *vq)
{
	struct net_info *net_info = vq->dev->priv;
	unsigned int head, out, in;
	struct iovec iov[vq->vring.num];

	head = wait_for_vq_desc(vq, iov, &out, &in);
	if (in)
		errx(1, "Input buffers in net output queue?");
	if (writev(net_info->tunfd, iov, out) < 0)
		errx(1, "Write to tun failed?");
	add_used(vq, head, 0);
}

/* Will reading from this file descriptor block? */
static bool will_block(int fd)
{
	fd_set fdset;
	struct timeval zero = { 0, 0 };
	FD_ZERO(&fdset);
	FD_SET(fd, &fdset);
	return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
}

/* This is where we handle packets coming in from the tun device to our
 * Guest. */
static void net_input(struct virtqueue *vq)
{
	int len;
	unsigned int head, out, in;
	struct iovec iov[vq->vring.num];
	struct net_info *net_info = vq->dev->priv;

	head = wait_for_vq_desc(vq, iov, &out, &in);
	if (out)
		errx(1, "Output buffers in net input queue?");

	/* Deliver interrupt now, since we're about to sleep. */
	if (vq->pending_used && will_block(net_info->tunfd))
		trigger_irq(vq);

	len = readv(net_info->tunfd, iov, in);
	if (len <= 0)
		err(1, "Failed to read from tun.");
	add_used(vq, head, len);
}

/* This is the helper to create threads. */
static int do_thread(void *_vq)
{
	struct virtqueue *vq = _vq;

	for (;;)
		vq->service(vq);
	return 0;
}

/* When a child dies, we kill our entire process group with SIGTERM.  This
 * also has the side effect that the shell restores the console for us! */
static void kill_launcher(int signal)
{
	kill(0, SIGTERM);
}

static void reset_device(struct device *dev)
{
	struct virtqueue *vq;

	verbose("Resetting device %s\n", dev->name);

	/* Clear any features they've acked. */
	memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);

	/* We're going to be explicitly killing threads, so ignore them. */
	signal(SIGCHLD, SIG_IGN);

	/* Zero out the virtqueues, get rid of their threads */
	for (vq = dev->vq; vq; vq = vq->next) {
		if (vq->thread != (pid_t)-1) {
			kill(vq->thread, SIGTERM);
			waitpid(vq->thread, NULL, 0);
			vq->thread = (pid_t)-1;
		}
		memset(vq->vring.desc, 0,
		       vring_size(vq->config.num, LGUEST_VRING_ALIGN));
		lg_last_avail(vq) = 0;
	}
	dev->running = false;

	/* Now we care if threads die. */
	signal(SIGCHLD, (void *)kill_launcher);
}

static void create_thread(struct virtqueue *vq)
{
	/* Create stack for thread and run it.  Since the stack grows
	 * downwards, we point the stack pointer to the end of this
	 * region. */
	char *stack = malloc(32768);
	unsigned long args[] = { LHREQ_EVENTFD,
				 vq->config.pfn*getpagesize(), 0 };

	/* Create a zero-initialized eventfd. */
	vq->eventfd = eventfd(0, 0);
	if (vq->eventfd < 0)
		err(1, "Creating eventfd");
	args[2] = vq->eventfd;

	/* Attach an eventfd to this virtqueue: it will go off
	 * when the Guest does an LHCALL_NOTIFY for this vq. */
	if (write(lguest_fd, &args, sizeof(args)) != 0)
		err(1, "Attaching eventfd");

	/* CLONE_VM: because it has to access the Guest memory, and
	 * SIGCHLD so we get a signal if it dies. */
	vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
	if (vq->thread == (pid_t)-1)
		err(1, "Creating clone");
	/* We close our local copy, now the child has it. */
	close(vq->eventfd);
}

static void start_device(struct device *dev)
{
	unsigned int i;
	struct virtqueue *vq;

	verbose("Device %s OK: offered", dev->name);
	for (i = 0; i < dev->feature_len; i++)
		verbose(" %02x", get_feature_bits(dev)[i]);
	verbose(", accepted");
	for (i = 0; i < dev->feature_len; i++)
		verbose(" %02x", get_feature_bits(dev)
			[dev->feature_len+i]);

	for (vq = dev->vq; vq; vq = vq->next) {
		if (vq->service)
			create_thread(vq);
	}
	dev->running = true;
}

static void cleanup_devices(void)
{
	struct device *dev;

	for (dev = devices.dev; dev; dev = dev->next)
		reset_device(dev);

	/* If we saved off the original terminal settings, restore them now. */
	if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
		tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
}

/* When the Guest tells us they updated the status field, we handle it. */
static void update_device_status(struct device *dev)
{
	/* A zero status is a reset, otherwise it's a set of flags. */
	if (dev->desc->status == 0)
		reset_device(dev);
	else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
		warnx("Device %s configuration FAILED", dev->name);
		if (dev->running)
			reset_device(dev);
	} else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
		if (!dev->running)
			start_device(dev);
	}
}

/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
static void handle_output(unsigned long addr)
{
	struct device *i;

	/* Check each device. */
	for (i = devices.dev; i; i = i->next) {
		struct virtqueue *vq;

		/* Notifications to device descriptors update device status. */
		if (from_guest_phys(addr) == i->desc) {
			update_device_status(i);
			return;
		}

		/* Devices *can* be used before status is set to DRIVER_OK. */
		for (vq = i->vq; vq; vq = vq->next) {
			if (addr != vq->config.pfn*getpagesize())
				continue;
			if (i->running)
				errx(1, "Notification on running %s", i->name);
			start_device(i);
			return;
		}
	}

	/* Early console write is done using notify on a nul-terminated string
	 * in Guest memory. */
	if (addr >= guest_limit)
		errx(1, "Bad NOTIFY %#lx", addr);

	write(STDOUT_FILENO, from_guest_phys(addr),
	      strnlen(from_guest_phys(addr), guest_limit - addr));
}

/*L:190
 * Device Setup
 *
 * All devices need a descriptor so the Guest knows it exists, and a "struct
 * device" so the Launcher can keep track of it.  We have common helper
 * routines to allocate and manage them.
 */

/* The layout of the device page is a "struct lguest_device_desc" followed by a
 * number of virtqueue descriptors, then two sets of feature bits, then an
 * array of configuration bytes.  This routine returns the configuration
 * pointer. */
static u8 *device_config(const struct device *dev)
{
	return (void *)(dev->desc + 1)
		+ dev->num_vq * sizeof(struct lguest_vqconfig)
		+ dev->feature_len * 2;
}

/* This routine allocates a new "struct lguest_device_desc" from the descriptor
 * table page just above the Guest's normal memory.  It returns a pointer to
 * that descriptor. */
static struct lguest_device_desc *new_dev_desc(u16 type)
{
	struct lguest_device_desc d = { .type = type };
	void *p;

	/* Figure out where the next device config is, based on the last one. */
	if (devices.lastdev)
		p = device_config(devices.lastdev)
			+ devices.lastdev->desc->config_len;
	else
		p = devices.descpage;

	/* We only have one page for all the descriptors. */
	if (p + sizeof(d) > (void *)devices.descpage + getpagesize())
		errx(1, "Too many devices");

	/* p might not be aligned, so we memcpy in. */
	return memcpy(p, &d, sizeof(d));
}

/* Each device descriptor is followed by the description of its virtqueues.  We
 * specify how many descriptors the virtqueue is to have. */
static void add_virtqueue(struct device *dev, unsigned int num_descs,
			  void (*service)(struct virtqueue *))
{
	unsigned int pages;
	struct virtqueue **i, *vq = malloc(sizeof(*vq));
	void *p;

	/* First we need some memory for this virtqueue. */
	pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1)
		/ getpagesize();
	p = get_pages(pages);

	/* Initialize the virtqueue */
	vq->next = NULL;
	vq->last_avail_idx = 0;
	vq->dev = dev;
	vq->service = service;
	vq->thread = (pid_t)-1;

	/* Initialize the configuration. */
	vq->config.num = num_descs;
	vq->config.irq = devices.next_irq++;
	vq->config.pfn = to_guest_phys(p) / getpagesize();

	/* Initialize the vring. */
	vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN);

	/* Append virtqueue to this device's descriptor.  We use
	 * device_config() to get the end of the device's current virtqueues;
	 * we check that we haven't added any config or feature information
	 * yet, otherwise we'd be overwriting them. */
	assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
	memcpy(device_config(dev), &vq->config, sizeof(vq->config));
	dev->num_vq++;
	dev->desc->num_vq++;

	verbose("Virtqueue page %#lx\n", to_guest_phys(p));

	/* Add to tail of list, so dev->vq is first vq, dev->vq->next is
	 * second.  */
	for (i = &dev->vq; *i; i = &(*i)->next);
	*i = vq;
}

/* The first half of the feature bitmask is for us to advertise features.  The
 * second half is for the Guest to accept features. */
static void add_feature(struct device *dev, unsigned bit)
{
	u8 *features = get_feature_bits(dev);

	/* We can't extend the feature bits once we've added config bytes */
	if (dev->desc->feature_len <= bit / CHAR_BIT) {
		assert(dev->desc->config_len == 0);
		dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1;
	}

	features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
}

/* This routine sets the configuration fields for an existing device's
 * descriptor.  It only works for the last device, but that's OK because that's
 * how we use it. */
static void set_config(struct device *dev, unsigned len, const void *conf)
{
	/* Check we haven't overflowed our single page. */
	if (device_config(dev) + len > devices.descpage + getpagesize())
		errx(1, "Too many devices");

	/* Copy in the config information, and store the length. */
	memcpy(device_config(dev), conf, len);
	dev->desc->config_len = len;
}

/* This routine does all the creation and setup of a new device, including
 * calling new_dev_desc() to allocate the descriptor and device memory.
 *
 * See what I mean about userspace being boring? */
static struct device *new_device(const char *name, u16 type)
{
	struct device *dev = malloc(sizeof(*dev));

	/* Now we populate the fields one at a time. */
	dev->desc = new_dev_desc(type);
	dev->name = name;
	dev->vq = NULL;
	dev->feature_len = 0;
	dev->num_vq = 0;
	dev->running = false;

	/* Append to device list.  Prepending to a single-linked list is
	 * easier, but the user expects the devices to be arranged on the bus
	 * in command-line order.  The first network device on the command line
	 * is eth0, the first block device /dev/vda, etc. */
	if (devices.lastdev)
		devices.lastdev->next = dev;
	else
		devices.dev = dev;
	devices.lastdev = dev;

	return dev;
}

/* Our first setup routine is the console.  It's a fairly simple device, but
 * UNIX tty handling makes it uglier than it could be. */
static void setup_console(void)
{
	struct device *dev;

	/* If we can save the initial standard input settings... */
	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
		struct termios term = orig_term;
		/* Then we turn off echo, line buffering and ^C etc.  We want a
		 * raw input stream to the Guest. */
		term.c_lflag &= ~(ISIG|ICANON|ECHO);
		tcsetattr(STDIN_FILENO, TCSANOW, &term);
	}

	dev = new_device("console", VIRTIO_ID_CONSOLE);

	/* We store the console state in dev->priv, and initialize it. */
	dev->priv = malloc(sizeof(struct console_abort));
	((struct console_abort *)dev->priv)->count = 0;

	/* The console needs two virtqueues: the input then the output.  When
	 * they put something in the input queue, we make sure we're listening to
	 * stdin.  When they put something in the output queue, we write it to
	 * stdout. */
	add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
	add_virtqueue(dev, VIRTQUEUE_NUM, console_output);

	verbose("device %u: console\n", ++devices.device_num);
}
/*:*/

/*M:010 Inter-guest networking is an interesting area.  Simplest is to have a
 * --sharenet=<name> option which opens or creates a named pipe.  This can be
 * used to send packets to another guest in a 1:1 manner.