/*
 * LCD core VMX functions 
 *
 * Based on KVM and Dune.
 *
 * Authors:
 *
 *   Weibin Sun <wbsun@flux.utah.edu>
 *   Charlie Jacobsen <charlesj@cs.utah.edu>
 *
 */

#include <asm/virtext.h>
#include <asm/vmx.h>
#include <uapi/asm/vmx.h>
#include <asm/desc.h>
#include <asm/lcd-domains-arch.h>

#include <linux/bitmap.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/tboot.h>
#include <linux/slab.h>
#include <linux/kmsg_dump.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/kernel.h>

/* DEBUGGING -------------------------------------------------- */

#define LCD_ARCH_ERR(msg...) __lcd_arch_err(__FILE__, __LINE__, msg)
static inline void __lcd_arch_err(char *file, int lineno, char *fmt, ...)
{
	va_list args;
	printk(KERN_ERR "lcd-vmx: %s:%d: error: ", file, lineno);
	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);
}
#define LCD_ARCH_MSG(msg...) __lcd_arch_msg(__FILE__, __LINE__, msg)
static inline void __lcd_arch_msg(char *file, int lineno, char *fmt, ...)
{
	va_list args;
	printk(KERN_INFO "lcd-vmx: %s:%d: note: ", file, lineno);
	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);
}
#define LCD_ARCH_WARN(msg...) __lcd_arch_warn(__FILE__, __LINE__, msg)
static inline void __lcd_arch_warn(char *file, int lineno, char *fmt, ...)
{
	va_list args;
	printk(KERN_WARNING "lcd-vmx: %s:%d: warning: ", file, lineno);
	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);
}

/* VMX DATA STRUCTURES -------------------------------------------------- */

struct vmx_vmcs_config {
	int size;
	int order;
	u32 revision_id;
	u32 pin_based_exec_controls;
	u32 primary_proc_based_exec_controls;
	u32 secondary_proc_based_exec_controls;
	u32 vmexit_controls;
	u32 vmentry_controls;
};

struct vmx_capability {
	u32 ept;
	u32 vpid;
};

/*
 * Declared in inline assembly in vmx_enter
 */
extern const unsigned long vmx_return;

/* SHARED / PERCPU VARS -------------------------------------------------- */

static struct kmem_cache *lcd_arch_thread_cache;

static struct vmx_vmcs_config vmcs_config;
static struct vmx_capability vmx_capability;

static atomic_t vmx_enable_failed;
static DEFINE_PER_CPU(int, vmx_enabled);
static DEFINE_PER_CPU(struct lcd_arch_vmcs *, vmxon_area);

static struct {
	DECLARE_BITMAP(bitmap, VMX_NR_VPIDS);
	spinlock_t lock;
} vpids;

static DEFINE_PER_CPU(struct lcd_arch_thread *, local_lcd_arch_thread);

static unsigned long *msr_bitmap;

/* DEBUGGING --------------------------------------------------*/

/**
 * Prints the vmx controls, the lower and upper bounds on the controls,
 * and tries to find the bits that were rejected.
 *
 * Useful for debugging vmcs setup.
 */
static void print_vmx_controls(u32 controls, u32 mask, u32 msr)
{
	u32 msr_low;
	u32 msr_high;
	u32 bad_high;
	u32 bad_low;
	int i;

	/*
	 * See doc in adjust_vmx_controls
	 */

	rdmsr(msr, msr_low, msr_high);

	LCD_ARCH_MSG("  MSR LOW:             0x%08x\n", msr_low);
	LCD_ARCH_MSG("  ATTEMPTED CONTROLS:  0x%08x\n", controls);
	LCD_ARCH_MSG("  MSR HIGH:            0x%08x\n", msr_high);
	LCD_ARCH_MSG("  RESERVED BIT MASK:   0x%08x\n", mask);

	/*
	 * For each bit, if the reserved mask is not set *and* the msr high
	 * bit is not set, then the control bit should not be set.
	 */
	bad_high = ~msr_high & ~mask & controls;
	for (i = 0; i < 32; i++) {
		if (bad_high & 1)
			LCD_ARCH_MSG("  Control bit %d should be 0.\n", i);
		bad_high >>= 1;
	}

	/*
	 * For each bit, if the reserved mask is not set *and* the msr low
	 * bit is set, then the control bit should be set.
	 */
	bad_low = msr_low & ~mask & ~controls;
	for (i = 0; i < 32; i++) {
		if (bad_low & 1)
			LCD_ARCH_MSG("  Control bit %d should be 1.\n", i);
		bad_low >>= 1;
	}

	LCD_ARCH_MSG("See Intel SDM V3 24.{6,7,8,9} and Appendix A\n");
}

/* INVEPT / INVVPID --------------------------------------------------*/

static inline bool cpu_has_vmx_invvpid_single(void)
{
	return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invvpid_global(void)
{
	return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invept_context(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invept_global(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}

static inline bool cpu_has_vmx_ept_ad_bits(void)
{
	return vmx_capability.ept & VMX_EPT_AD_BIT;
}

static inline void __invept(int ext, u64 eptp)
{
	u8 error;

	struct {
		u64 eptp, gpa;
	} operand = {eptp, 0};

	asm volatile (ASM_VMX_INVEPT "; setna %0"
                /* CF==1 or ZF==1 --> rc = -1 */
                : "=qm"(error) : "a" (&operand), "c" (ext) : "cc", "memory");
	if (error)
		LCD_ARCH_ERR("ext=%d, eptp=0x%llx\n", ext, eptp);
}

/**
 * Invalidates all mappings associated with eptp's.
 */
static inline void invept_global_context(void)
{
	if (cpu_has_vmx_invept_global())
		__invept(VMX_EPT_EXTENT_GLOBAL, 0);
}

/**
 * Invalidates all mappings associated with eptp, and possibly
 * others.
 */
static inline void invept_single_context(u64 eptp)
{
	if (cpu_has_vmx_invept_context())
		__invept(VMX_EPT_EXTENT_CONTEXT, eptp);
	else
		invept_global_context();
}

static inline void __invvpid(int ext, u16 vpid)
{
	u8 error;

	struct {
		u64 vpid : 16;
		u64 rsvd : 48;
		u64 addr;
	} operand = { vpid, 0, 0 };

	asm volatile (ASM_VMX_INVVPID "; setna %0"
                /* CF==1 or ZF==1 --> rc = -1 */
		: "=qm"(error) : "a"(&operand), "c"(ext) : "cc", "memory");
	if (error)
		LCD_ARCH_ERR("ext=%d, vpid=0x%hx\n", ext, vpid);
}

/**
 * Invalidates all mappings associated with vpid's other than
 * vpid = 0 (the host).
 */
static inline void invvpid_global_context(void)
{
	if (cpu_has_vmx_invvpid_global())
		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0);
}

/**
 * Invalidates all mappings associated with vpid.
 */
static inline void invvpid_single_context(u16 vpid)
{
	/*
	 * Don't invalidate host mappings
	 */
	if (vpid == 0)
		return;

	if (cpu_has_vmx_invvpid_single())
		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid);
	else
		invvpid_global_context();
}
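
/*
 * Usage sketch (illustrative; this file itself calls the global
 * variants from vmx_enable): after modifying or tearing down a guest's
 * address space, stale cached translations would be flushed with
 *
 *   invvpid_single_context(vpid);   (combined mappings tagged with vpid)
 *   invept_single_context(eptp);    (guest-physical mappings for eptp)
 *
 * where vpid and eptp identify the guest's address space.
 */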

/* VMCS READ / WRITE --------------------------------------------------*/

/**
 * Takes vmcs from any state to {inactive, clear, not current}
 */
static void vmcs_clear(struct lcd_arch_vmcs *vmcs)
{
	u64 hpa = __pa(vmcs);
	u8 error;

	asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
                : "=qm"(error) : "a"(&hpa), "m"(hpa)
                : "cc", "memory");
	if (error)
		LCD_ARCH_ERR("vmclear fail: %p/%llx\n",	vmcs, hpa);
}

/**
 * Takes vmcs to {active, current} on cpu. Any vmcs reads and writes
 * will affect this vmcs.
 */
static void vmcs_load(struct lcd_arch_vmcs *vmcs)
{
	u64 hpa = __pa(vmcs);
	u8 error;

	asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
                : "=qm"(error) : "a"(&hpa), "m"(hpa)
                : "cc", "memory");
	if (error)
		LCD_ARCH_ERR("vmptrld %p/%llx failed\n", vmcs, hpa);
}
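
/*
 * Usage sketch: per the SDM's rules for software use of the VMCS, a
 * vmcs should be initialized with vmclear before its first vmptrld on
 * a cpu:
 *
 *   vmcs_clear(vmcs);
 *   vmcs_load(vmcs);
 *
 * After vmcs_load, the vmcs_readN / vmcs_writeN helpers below operate
 * on this (now current) vmcs.
 */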

static __always_inline unsigned long vmcs_readl(unsigned long field)
{
	unsigned long value;

	asm volatile (ASM_VMX_VMREAD_RDX_RAX
                : "=a"(value) : "d"(field) : "cc");
	return value;
}

static __always_inline u16 vmcs_read16(unsigned long field)
{
	return vmcs_readl(field);
}

static __always_inline u32 vmcs_read32(unsigned long field)
{
	return vmcs_readl(field);
}

static __always_inline u64 vmcs_read64(unsigned long field)
{
	return vmcs_readl(field);
}

static noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	LCD_ARCH_ERR("reg %lx value %lx (err %d)\n",
		field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
	dump_stack();
}

static void vmcs_writel(unsigned long field, unsigned long value)
{
	u8 error;

	asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
                : "=q"(error) : "a"(value), "d"(field) : "cc");
	if (unlikely(error))
		vmwrite_error(field, value);
}

static void vmcs_write16(unsigned long field, u16 value)
{
	vmcs_writel(field, value);
}

static void vmcs_write32(unsigned long field, u32 value)
{
	vmcs_writel(field, value);
}

static void vmcs_write64(unsigned long field, u64 value)
{
	vmcs_writel(field, value);
}
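
/*
 * Usage sketch: with a current vmcs (see vmcs_load above), vm exit
 * information fields can be read back after an exit, e.g.:
 *
 *   u32 reason         = vmcs_read32(VM_EXIT_REASON);
 *   unsigned long qual = vmcs_readl(EXIT_QUALIFICATION);
 *
 * The field encodings (VM_EXIT_REASON, EXIT_QUALIFICATION, ...) come
 * from <asm/vmx.h> / <uapi/asm/vmx.h>.
 */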

/* VMCS SETUP --------------------------------------------------*/

/**
 * Frees vmcs memory.
 */
static void vmx_free_vmcs(struct lcd_arch_vmcs *vmcs)
{
	free_pages((unsigned long)vmcs, vmcs_config.order);
}

/**
 * Allocates memory for a vmcs on cpu, and sets the
 * revision id.
 */
static struct lcd_arch_vmcs *vmx_alloc_vmcs(int cpu)
{
	int node;
	struct page *pages;
	struct lcd_arch_vmcs *vmcs;

	node = cpu_to_node(cpu);
	pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
	if (!pages)
		return NULL;
	vmcs = page_address(pages);
	memset(vmcs, 0, vmcs_config.size);

	vmcs->revision_id = vmcs_config.revision_id;

	return vmcs;
}
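
/*
 * Usage sketch: allocation and free come in pairs; e.g., the per-cpu
 * vmxon regions set up in lcd_arch_init are created with
 *
 *   struct lcd_arch_vmcs *buf = vmx_alloc_vmcs(cpu);
 *
 * and later released via vmx_free_vmcs(buf) in vmx_free_vmxon_areas.
 */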

/* VMX ON/OFF --------------------------------------------------*/

static inline void __vmxon(hpa_t addr)
{
	u64 paddr;
	
	paddr = hpa_val(addr);
	asm volatile (ASM_VMX_VMXON_RAX
                : : "a"(&paddr), "m"(paddr)
                : "memory", "cc");
}

static inline void __vmxoff(void)
{
	asm volatile (ASM_VMX_VMXOFF : : : "cc");
}

/**
 * Helper for vmx_enable. A few more low-level checks and
 * settings, and then turns on vmx.
 */
static int __vmx_enable(struct lcd_arch_vmcs *vmxon_buf)
{
	hpa_t a;
	u64 old;
	u64 test_bits;

	a = va2hpa(vmxon_buf);

	/*
	 * Intel SDM V3 23.7
	 */

	/*
	 * We can't use vmx if someone else is
	 */
	if (read_cr4() & X86_CR4_VMXE)
		return -EBUSY;
	write_cr4(read_cr4() | X86_CR4_VMXE);

	/*
	 * Set MSR_IA32_FEATURE_CONTROL
	 */

	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
	test_bits = FEATURE_CONTROL_LOCKED;
	test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
	if (tboot_enabled())
		test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;

	if ((old & test_bits) != test_bits) {
		/* enable and lock */
		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
	}

	/*
	 * Turn on vmx
	 */
	__vmxon(a);


	return 0;
}

/**
 * Turn on vmx on calling cpu, using per cpu vmxon_area.
 *
 * unused is ignored (NULL is passed in lcd_arch_init).
 *
 * Important: Assumes preemption is disabled (it will be
 * if called via on_each_cpu).
 */
static void vmx_enable(void *unused)
{
	int ret;
	struct lcd_arch_vmcs *vmxon_buf;

	vmxon_buf = __get_cpu_var(vmxon_area);
	
	/*
	 * Turn on vmx
	 */
	ret = __vmx_enable(vmxon_buf);
	if (ret)
		goto failed;

	/*
	 * Flush TLB and caches of any old VPID and EPT
	 * mappings.
	 */
	invvpid_global_context();
	invept_global_context();

	__get_cpu_var(vmx_enabled) = 1;

	LCD_ARCH_MSG("VMX enabled on CPU %d\n",	raw_smp_processor_id());
	return;

failed:
	atomic_inc(&vmx_enable_failed);
	LCD_ARCH_ERR("failed to enable VMX, err = %d\n", ret);
	return;
}

/**
 * Turns off vmx on calling cpu.
 *
 * unused is ignored (NULL is passed in lcd_arch_init).
 *
 * Important: Assumes preemption is disabled. (It will
 * be if called from on_each_cpu.)
 */
static void vmx_disable(void *unused)
{
	if (__get_cpu_var(vmx_enabled)) {
		__vmxoff();
		write_cr4(read_cr4() & ~X86_CR4_VMXE);
		__get_cpu_var(vmx_enabled) = 0;
	}
}

/**
 * Frees any vmxon areas allocated for cpu's.
 */
static void vmx_free_vmxon_areas(void)
{
	int cpu;
	for_each_possible_cpu(cpu) {
		if (per_cpu(vmxon_area, cpu)) {
			vmx_free_vmcs(per_cpu(vmxon_area, cpu));
			per_cpu(vmxon_area, cpu) = NULL;
		}
	}
}

/* VMX SETTINGS --------------------------------------------------*/

/**
 * Clears the read and write bits for an msr in the msr bitmap, so
 * that the vm can access the msr without triggering a vm exit.
 */
static void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
{
	int sz;
	sz = sizeof(unsigned long);

	/*
	 * Intel SDM V3 24.6.9 (MSR-Bitmap Addresses).
	 *
	 * The bitmap is 4KBs:
	 *
	 *  -- bitmap + 0KB (0x000) = read bitmap for low MSRs
	 *  -- bitmap + 1KB (0x400) = read bitmap for high MSRs
	 *  -- bitmap + 2KB (0x800) = write bitmap for low MSRs
	 *  -- bitmap + 3KB (0xc00) = write bitmap for high MSRs
	 *
	 * We have to divide by the size of an unsigned long to get
	 * the correct pointer offset.
	 */
	if (msr <= 0x1fff) {
		/*
		 * Low MSR
		 */
		__clear_bit(msr, msr_bitmap + 0x000 / sz); /* read  */
		__clear_bit(msr, msr_bitmap + 0x800 / sz); /* write */
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		/*
		 * High MSR
		 */
		msr &= 0x1fff;
		__clear_bit(msr, msr_bitmap + 0x400 / sz); /* read  */
		__clear_bit(msr, msr_bitmap + 0xc00 / sz); /* write */
	}
}
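
/*
 * Usage sketch: lcd_arch_init uses this to let a vm access MSR_FS_BASE
 * and MSR_GS_BASE without exiting. A high msr such as
 * MSR_KERNEL_GS_BASE (0xc0000102; shown for illustration only, it is
 * not unintercepted in this file) would land in the 1KB (read) and
 * 3KB (write) quarters of the bitmap:
 *
 *   vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE);
 */
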
/**
 * Checks and sets basic vmcs settings (vmxon region size, etc.)
 */
static int vmcs_config_basic_settings(struct vmx_vmcs_config *vmcs_conf)
{
	u32 msr_low;
	u32 msr_high;

	/*
	 * Read and store basic vmcs settings.
	 *
	 * Intel SDM V3 Appendix A
	 */

	rdmsr(MSR_IA32_VMX_BASIC, msr_low, msr_high);

	/*
	 * VMCS size is never greater than 4KBs
	 */
	if ((msr_high & 0x1fff) > PAGE_SIZE)
		return -EIO;

	/* 
	 * 64-bit CPUs always have VMX_BASIC_MSR[48] == 0. Controls
	 * physical address width.
	 */
	if (msr_high & (1u<<16))
		return -EIO;

	/*
	 * Require Write-Back (WB) memory type for VMCS accesses.
	 */
	if (((msr_high >> 18) & 15) != 6)
		return -EIO;

	vmcs_conf->size  = msr_high & 0x1fff;
	vmcs_conf->order = get_order(vmcs_conf->size);
	vmcs_conf->revision_id = msr_low;
	return 0;
}
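
/*
 * Worked example (hypothetical msr value): msr_high = 0x00181000
 * decodes as size = 0x181000 & 0x1fff = 0x1000 (4 KBs, so order 0),
 * bit 16 (bit 48 of the full msr) clear, and memory type
 * (0x181000 >> 18) & 15 = 6 (Write-Back), so all checks above pass.
 */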

/**
 * Returns 0 if the controls are allowed, non-zero otherwise. If
 * successful, controls is updated with reserved bits properly
 * set. The negation of the reserved mask is used to ignore
 * reserved bits during the `checking' process.
 */
static int adjust_vmx_controls(u32 *controls, u32 reserved_mask, u32 msr)
{
	u32 msr_low;
	u32 msr_high;
	u32 controls_copy;
	
	/*
	 * Make sure the desired controls are possible. In the pin-based
	 * exec, primary and secondary exec, vmentry, and vmexit MSRs:
	 * 
	 * -- The low word contains the *minimum required* bits that must
	 *    be set to 1 (i.e., if the bit in the low msr is 1, the vmx
	 *    control bit must be 1).
	 *
	 * -- The high word contains the *maximum allowed* bits that can
	 *    be set to 1 (i.e., if the bit in the high msr is 0, the vmx
	 *    control must be 0).
	 *
	 * If these conditions aren't met, vmentry fails. Some of these
	 * bits are reserved, so a mask is used to ensure we're only
	 * checking those bits we care about.
	 *
	 * See Intel SDM V3 Appendix A.
	 */

	rdmsr(msr, msr_low, msr_high);

	controls_copy = *controls;

	/*
	 * (msr high bit not set, and not a reserved bit) ==> ctrl bit not set
	 */
	if (~msr_high & ~reserved_mask & controls_copy)
		return -1;

	/*
	 * (msr low bit set, and not a reserved bit) ==> ctrl bit set
	 */
	if (msr_low & ~reserved_mask & ~controls_copy)
		return -1;

	controls_copy &= msr_high;
	controls_copy |= msr_low;

	*controls = controls_copy;
	return 0;
}
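
/*
 * Worked example (hypothetical msr values, reserved_mask = 0): if
 * rdmsr gives msr_low = 0x16 and msr_high = 0x7f, then *controls = 0x01
 * is rejected, since bits 1, 2, and 4 are required but missing; while
 * *controls = 0x17 passes both checks and is returned as
 * (0x17 & 0x7f) | 0x16 = 0x17.
 */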


/**
 * Populates default settings in vmcs_conf for
 * vm entries, vm exits, vm execution (e.g., interrupt handling),
 * etc. for all lcd types.
 */
static int setup_vmcs_config(struct vmx_vmcs_config *vmcs_conf)
{
	u32 pin_based_exec_controls;
	u32 primary_proc_based_exec_controls;
	u32 secondary_proc_based_exec_controls;
	u32 vmexit_controls;
	u32 vmentry_controls;

	/*
	 * Basic VMX Configuration
	 */
	if (vmcs_config_basic_settings(vmcs_conf) < 0)
		return -EIO;

	/*
	 * VMX Execution Controls (Intel SDM V3 24.6)
	 */
	
	/*
	 * Pin Based Execution Controls (exceptions, nmi's, ...)
	 * 
	 * -- external interrupts and nmi's cause vm exit.
	 */
	pin_based_exec_controls = PIN_BASED_EXT_INTR_MASK | 
		PIN_BASED_NMI_EXITING;
	if (adjust_vmx_controls(&pin_based_exec_controls,
					PIN_BASED_RESERVED_MASK,
					MSR_IA32_VMX_PINBASED_CTLS) < 0) {
		LCD_ARCH_ERR("pin based exec controls not allowed\n");
		print_vmx_controls(pin_based_exec_controls,
				PIN_BASED_RESERVED_MASK,
				MSR_IA32_VMX_PINBASED_CTLS);
		return -EIO;
	}

	/*
	 * Primary Processor Execution Controls
	 *
	 * -- HLT Exit
	 * -- Invalidate PG Exit
	 * -- MWAIT Exit
	 * -- RDPMC Exit
	 * -- L/S CR8 Exit
	 * -- L/S CR3 Exit   / required by emulab machines :(
	 * -- MOV DR Exit
	 * -- Unconditional I/O Exit (no I/O bitmap)
	 * -- Use MSR Bitmaps
	 * -- MONITOR Exit
	 * -- Activate Secondary Proc Exec Controls
	 *
	 * Note: TSC offsetting and TPR Shadowing are not set. We are
	 * currently not virtualizing access to the TPR.
	 */
	primary_proc_based_exec_controls = CPU_BASED_HLT_EXITING |
		CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING |
		CPU_BASED_RDPMC_EXITING |
		CPU_BASED_CR8_LOAD_EXITING |
		CPU_BASED_CR8_STORE_EXITING |
		CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
		CPU_BASED_MOV_DR_EXITING |
		CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_MSR_BITMAPS |
		CPU_BASED_MONITOR_EXITING |
		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	if (adjust_vmx_controls(&primary_proc_based_exec_controls,
					CPU_BASED_RESERVED_MASK,
					MSR_IA32_VMX_PROCBASED_CTLS)) {
		LCD_ARCH_ERR("primary proc based exec ctrls not allowed\n");
		print_vmx_controls(primary_proc_based_exec_controls,
				CPU_BASED_RESERVED_MASK,
				MSR_IA32_VMX_PROCBASED_CTLS);
		return -EIO;
	}

	/*
	 * Secondary Processor Execution Controls
	 *
	 * -- Enable EPT
	 * -- Enable RDTSCP
	 * -- Enable VPID
	 * -- WBINVD Exit
	 *
	 * Note: Unrestricted guest and INVPCID not available on
	 * emulab machines.
	 */
	secondary_proc_based_exec_controls = SECONDARY_EXEC_ENABLE_EPT |
		SECONDARY_EXEC_RDTSCP |
		SECONDARY_EXEC_ENABLE_VPID |
		SECONDARY_EXEC_WBINVD_EXITING;
	if (adjust_vmx_controls(&secondary_proc_based_exec_controls,
					SECONDARY_EXEC_RESERVED_MASK,
					MSR_IA32_VMX_PROCBASED_CTLS2) < 0) {
		LCD_ARCH_ERR("secondary proc based exec ctls not allowed\n");
		print_vmx_controls(secondary_proc_based_exec_controls,
				SECONDARY_EXEC_RESERVED_MASK,
				MSR_IA32_VMX_PROCBASED_CTLS2);
		return -EIO;
	}

	/*
	 * Remember the EPT and VPID capabilities
	 */
	rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
		vmx_capability.ept, vmx_capability.vpid);


	/*
	 * VM Exit Controls (Intel SDM V3 24.7)
	 *
	 * -- Host Address Space (host in 64-bit mode on vm exit)
	 * -- Acknowledge interrupts on vm exit
	 * -- Save / load IA-32 EFER MSR on exit
	 * -- Save debug controls    / needed for emulab machines
	 */
	vmexit_controls = VM_EXIT_HOST_ADDR_SPACE_SIZE |
		VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_LOAD_IA32_EFER |
		VM_EXIT_SAVE_DEBUG_CONTROLS;
	if (adjust_vmx_controls(&vmexit_controls, 
					VM_EXIT_RESERVED_MASK,
					MSR_IA32_VMX_EXIT_CTLS) < 0) {
		LCD_ARCH_ERR("vmexit controls not allowed\n");
		
		print_vmx_controls(vmexit_controls,
				VM_EXIT_RESERVED_MASK,
				MSR_IA32_VMX_EXIT_CTLS);
		return -EIO;
	}

	/*
	 * VM Entry Controls (Intel SDM V3 24.8)
	 *
	 * -- IA-32E Mode inside guest
	 * -- Load IA-32 EFER MSR on entry
	 * -- Load debug controls  / needed on emulab
	 */
	vmentry_controls = VM_ENTRY_IA32E_MODE |
		VM_ENTRY_LOAD_IA32_EFER |
		VM_ENTRY_LOAD_DEBUG_CONTROLS;
	if (adjust_vmx_controls(&vmentry_controls,
					VM_ENTRY_RESERVED_MASK,
					MSR_IA32_VMX_ENTRY_CTLS) < 0) {
		LCD_ARCH_ERR("vm entry controls not allowed\n");
		
		print_vmx_controls(vmentry_controls,
				VM_ENTRY_RESERVED_MASK,
				MSR_IA32_VMX_ENTRY_CTLS);
		
		return -EIO;
	}


	vmcs_conf->pin_based_exec_controls = pin_based_exec_controls;
	vmcs_conf->primary_proc_based_exec_controls =
		primary_proc_based_exec_controls;
	vmcs_conf->secondary_proc_based_exec_controls = 
		secondary_proc_based_exec_controls;
	vmcs_conf->vmexit_controls = vmexit_controls;
	vmcs_conf->vmentry_controls = vmentry_controls;

	return 0;
}

/* VMX INIT / EXIT -------------------------------------------------- */

static void lcd_arch_tests(void);

int lcd_arch_init(void)
{
	int ret;
	int cpu;

	/*
	 * Check For VMX Features
	 */

	if (!cpu_has_vmx()) {
		LCD_ARCH_ERR("CPU does not support VMX\n");
		return -EIO;
	}

	if (setup_vmcs_config(&vmcs_config) < 0)
		return -EIO;

	/*
	 * Set up default MSR bitmap
	 */

	msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
	if (!msr_bitmap) {
		ret = -ENOMEM;
		goto failed1;
	}

	memset(msr_bitmap, 0xff, PAGE_SIZE);
	vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE);
	vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE);

	/*
	 * Initialize VPID bitmap spinlock
	 */
	spin_lock_init(&vpids.lock);

	/*
	 * VPID 0 is reserved for host. See INVVPID instruction.
	 */
	set_bit(0, vpids.bitmap); 

	/*
	 * Allocate vmxon buffers for each cpu. A vmxon buffer is
	 * (currently) the same size as a vmcs, so we can re-use
	 * the vmx_alloc_vmcs routine.
	 */

	for_each_possible_cpu(cpu) {
		struct lcd_arch_vmcs *vmxon_buf;

		vmxon_buf = vmx_alloc_vmcs(cpu);
		if (!vmxon_buf) {
			ret = -ENOMEM;
			goto failed1;
		}

		per_cpu(vmxon_area, cpu) = vmxon_buf;
	}

	/*
	 * Turn on vmx on each cpu
	 *
	 * Note: on_each_cpu disables preemption
	 */

	atomic_set(&vmx_enable_failed, 0);
	if (on_each_cpu(vmx_enable, NULL, 1)) {
		LCD_ARCH_ERR("timeout waiting for VMX mode enable.\n");
		ret = -EIO;
		goto failed1; /* sadly we can't totally recover */
	}

	if (atomic_read(&vmx_enable_failed)) {
		ret = -EBUSY;
		goto failed2;
	}

	/*
	 * Init lcd_arch_thread cache (used instead of plain kmalloc since
	 * these structs need to be properly aligned)
	 */
	lcd_arch_thread_cache = kmem_cache_create("lcd_arch_thread", 
						sizeof(struct lcd_arch_thread),
						__alignof__(struct lcd_arch_thread),
						0, NULL);
	if (!lcd_arch_thread_cache) {
		LCD_ARCH_ERR("failed to set up kmem cache\n");
		ret = -ENOMEM;
		goto failed3;
	}

	/*
	 * Run tests
	 */
	lcd_arch_tests();

	return 0;

failed3:
failed2:
	on_each_cpu(vmx_disable, NULL, 1);
failed1:
	vmx_free_vmxon_areas();
	free_page((unsigned long)msr_bitmap);
	return ret;
}

void lcd_arch_exit(void)
{
	on_each_cpu(vmx_disable, NULL, 1);
	vmx_free_vmxon_areas();
	free_page((unsigned long)msr_bitmap);
	kmem_cache_destroy(lcd_arch_thread_cache);
}

module_init(lcd_arch_init);
module_exit(lcd_arch_exit);

/* VMX EPT -------------------------------------------------- */

/**
 * PAGE_SHIFT is assumed to be 12.
 */
#define VMX_EPTE_ADDR_MASK PAGE_MASK
#define VMX_EPT_ALL_MASK (VMX_EPT_READABLE_MASK | \
                          VMX_EPT_WRITABLE_MASK | \
			  VMX_EPT_EXECUTABLE_MASK)
static inline hpa_t vmx_epte_hpa(lcd_arch_epte_t epte)
{
	return __hpa(((u64)epte) & VMX_EPTE_ADDR_MASK);
}
static inline hva_t vmx_epte_hva(lcd_arch_epte_t epte)
{
	return hpa2hva(vmx_epte_hpa(epte));
}
static inline lcd_arch_epte_t *vmx_epte_dir_hva(lcd_arch_epte_t epte)
{
	return (lcd_arch_epte_t *)hva_val(vmx_epte_hva(epte));
}
static inline int vmx_epte_present(lcd_arch_epte_t epte)
{
	return (int)(((u64)epte) & VMX_EPT_ALL_MASK);
}
/*
 * level 0 (PML4) = bits 47:39 (9 bits)
 * level 1 (PDPT) = bits 38:30 (9 bits)
 * level 2 (PD)   = bits 29:21 (9 bits)
 * level 3 (PT)   = bits 20:12 (9 bits)
 */
static inline int vmx_ept_idx(gpa_t a, int lvl)
{
	/* we right shift by the correct amount, then mask off 9 bits */
	return (int)(((gpa_val(a)) >> (12 + 9 * (3 - lvl))) & ((1 << 9) - 1));
}
static inline u64 vmx_ept_offset(gpa_t a)
{
	return gpa_val(a) & ~(PAGE_MASK);
}
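
/*
 * Worked example: for gpa_val(a) = (1UL << 39) | (2UL << 30) |
 * (3UL << 21) | (4UL << 12) | 0x5, the walk decomposes as
 *
 *   vmx_ept_idx(a, 0) = 1    (PML4 index, bits 47:39)
 *   vmx_ept_idx(a, 1) = 2    (PDPT index, bits 38:30)
 *   vmx_ept_idx(a, 2) = 3    (PD index,   bits 29:21)
 *   vmx_ept_idx(a, 3) = 4    (PT index,   bits 20:12)
 *   vmx_ept_offset(a) = 0x5  (offset, bits 11:0)
 */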

enum vmx_epte_mts {
	VMX_EPTE_MT_UC = 0, /* uncachable */
	VMX_EPTE_MT_WC = 1, /* write combining */
	VMX_EPTE_MT_WT = 4, /* write through */
	VMX_EPTE_MT_WP = 5, /* write protected */
	VMX_EPTE_MT_WB = 6, /* write back */
};

/**
 * Sets address in epte along with default access settings. Since
 * we are using a page walk length of 4, epte's at all levels have
 * the `size' bit (bit 7) set to 0. Page table entries (entries at the final
 * level) have the IPAT (ignore page attribute table) and EPT MT (memory
 * type) bits set. Paging levels are zero-indexed:
 *
 *  0 = PML4 entry
 *  1 = PDPTE entry
 *  2 = Page Directory entry
 *  3 = Page Table entry
 *
 *  See Intel SDM V3 Figure 28-1 and 28.2.2.
 */
static void vmx_epte_set(lcd_arch_epte_t *epte, hpa_t a, int level)
{
	/*
	 * zero out epte, and set
	 */
	*epte = 0;
	*epte = (hpa_val(a) & VMX_EPTE_ADDR_MASK) | VMX_EPT_ALL_MASK;
	if (level == 3) {
		/*
		 * Page table entry. Set EPT memory type to write back
		 * and ignore page attribute table.
		 */
		*epte |= (VMX_EPT_IPAT_BIT |
			(VMX_EPTE_MT_WB << VMX_EPT_MT_EPTE_SHIFT));
	}
}

int lcd_arch_ept_walk(struct lcd_arch *lcd, gpa_t a, int create,
		lcd_arch_epte_t **epte_out)
{
	int i;
	lcd_arch_epte_t *dir;
	u64 idx;
	hva_t page;

	dir = lcd->ept.root;

	/*
	 * Walk plm4 -> pdpt -> pd. Each step uses 9 bits
	 * of the gpa.
	 */
	for (i = 0; i < LCD_ARCH_EPT_WALK_LENGTH - 1; i++) {

		idx = vmx_ept_idx(a, i);

		if (!vmx_epte_present(dir[idx])) {
			if (!create) {
				LCD_ARCH_ERR("attempted lookup for unmapped gpa %lx, create was not allowed\n",
1050
					gpa_val(a));
				return -ENOENT;
			}
			/*
			 * Get host virtual addr of fresh page, and
			 * set the epte's addr to the host physical addr
			 */
			page = __hva(__get_free_page(GFP_KERNEL));
			if (!hva_val(page)) {
				LCD_ARCH_ERR("alloc failed\n");
				return -ENOMEM;
			}
			memset(hva2va(page), 0, PAGE_SIZE);
			vmx_epte_set(&dir[idx], hva2hpa(page), i);
		}

		dir = (lcd_arch_epte_t *) hva2va(vmx_epte_hva(dir[idx]));
	}

	/*
	 * dir points to page table (level 3)
	 */
	*epte_out = &dir[vmx_ept_idx(a, 3)];
	return 0;
}
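
/*
 * Usage sketch: mapping a single guest physical page by hand
 * (presumably what lcd_arch_ept_map below boils down to):
 *
 *   lcd_arch_epte_t *epte;
 *   int ret = lcd_arch_ept_walk(lcd, ga, 1, &epte);
 *   if (ret)
 *           return ret;
 *   lcd_arch_ept_set(epte, ha);
 */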

void lcd_arch_ept_set(lcd_arch_epte_t *epte, hpa_t a)
{
	vmx_epte_set(epte, a, 3);
}

int lcd_arch_ept_unset(lcd_arch_epte_t *epte)
{
	*epte = 0;
	return 0;
}

hpa_t lcd_arch_ept_hpa(lcd_arch_epte_t *epte)
{
	return vmx_epte_hpa(*epte);
}

int lcd_arch_ept_map(struct lcd_arch *lcd, gpa_t ga, hpa_t ha,
				int create, int overwrite)
{