/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "irq.h"
#include "mmu.h"
#include "cpuid.h"

#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/ftrace_event.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include "kvm_cache_regs.h"
#include "x86.h"

#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/perf_event.h>
#include <asm/kexec.h>

#include "trace.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
	____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_FEATURE_MATCH(X86_FEATURE_VMX),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);

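/*
 * Module parameters. Each knob below gates an optional VT-x feature
 * (VPID, flexpriority, EPT, unrestricted guest, EPT A/D bits, APICv,
 * shadow VMCS, nested VMX, ...); most default to enabled and are
 * read-only at runtime. On a typical build they appear under
 * /sys/module/kvm_intel/parameters/ (module name assumed here).
 */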
static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

static bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

static bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

static bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly vmm_exclusive = 1;
module_param(vmm_exclusive, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

static bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 * use VMX instructions.
 */
97
static bool __read_mostly nested = 0;
98
99
module_param(nested, bool, S_IRUGO);

100
101
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
102
103
#define KVM_VM_CR0_ALWAYS_ON						\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
104
105
106
107
#define KVM_CR4_GUEST_OWNED_BITS				      \
	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
	 | X86_CR4_OSXMMEXCPT)

108
109
110
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

111
112
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

113
114
115
116
/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicate if ple enabled.
117
 *             According to test, this time is usually smaller than 128 cycles.
118
119
120
121
122
123
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer SDM volume 3b section 21.6.13 & 22.1.3.
 */
124
#define KVM_VMX_DEFAULT_PLE_GAP    128
125
126
127
128
129
130
131
#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
module_param(ple_gap, int, S_IRUGO);

static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);

extern const ulong vmx_return;

#define NR_AUTOLOAD_MSRS 8
#define VMCS02_POOL_SIZE 1

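/*
 * Layout of a hardware VMCS region as the CPU sees it: a revision
 * identifier, an abort code, and implementation-specific data. The
 * required size and the revision id to use are reported by
 * MSR_IA32_VMX_BASIC (cached below in vmcs_config).
 */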
struct vmcs {
	u32 revision_id;
	u32 abort;
	char data[0];
};

/*
 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
 * loaded on this CPU (so we can clear them if the CPU goes down).
 */
struct loaded_vmcs {
	struct vmcs *vmcs;
	int cpu;
	int launched;
	struct list_head loaded_vmcss_on_cpu_link;
};

struct shared_msr_entry {
	unsigned index;
	u64 data;
	u64 mask;
};

/*
 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
 * More than one of these structures may exist, if L1 runs multiple L2 guests.
 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
 * underlying hardware which will be used to run L2.
 * This structure is packed to ensure that its layout is identical across
 * machines (necessary for live migration).
 * If there are changes in this struct, VMCS12_REVISION must be changed.
 */
typedef u64 natural_width;
struct __packed vmcs12 {
	/* According to the Intel spec, a VMCS region must start with the
	 * following two fields. Then follow implementation-specific data.
	 */
	u32 revision_id;
	u32 abort;

	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
	u32 padding[7]; /* room for future expansion */

	u64 io_bitmap_a;
	u64 io_bitmap_b;
	u64 msr_bitmap;
	u64 vm_exit_msr_store_addr;
	u64 vm_exit_msr_load_addr;
	u64 vm_entry_msr_load_addr;
	u64 tsc_offset;
	u64 virtual_apic_page_addr;
	u64 apic_access_addr;
	u64 ept_pointer;
	u64 guest_physical_address;
	u64 vmcs_link_pointer;
	u64 guest_ia32_debugctl;
	u64 guest_ia32_pat;
	u64 guest_ia32_efer;
	u64 guest_ia32_perf_global_ctrl;
	u64 guest_pdptr0;
	u64 guest_pdptr1;
	u64 guest_pdptr2;
	u64 guest_pdptr3;
	u64 host_ia32_pat;
	u64 host_ia32_efer;
	u64 host_ia32_perf_global_ctrl;
	u64 padding64[8]; /* room for future expansion */
	/*
	 * To allow migration of L1 (complete with its L2 guests) between
	 * machines of different natural widths (32 or 64 bit), we cannot have
	 * unsigned long fields with no explicit size. We use u64 (aliased
	 * natural_width) instead. Luckily, x86 is little-endian.
	 */
	natural_width cr0_guest_host_mask;
	natural_width cr4_guest_host_mask;
	natural_width cr0_read_shadow;
	natural_width cr4_read_shadow;
	natural_width cr3_target_value0;
	natural_width cr3_target_value1;
	natural_width cr3_target_value2;
	natural_width cr3_target_value3;
	natural_width exit_qualification;
	natural_width guest_linear_address;
	natural_width guest_cr0;
	natural_width guest_cr3;
	natural_width guest_cr4;
	natural_width guest_es_base;
	natural_width guest_cs_base;
	natural_width guest_ss_base;
	natural_width guest_ds_base;
	natural_width guest_fs_base;
	natural_width guest_gs_base;
	natural_width guest_ldtr_base;
	natural_width guest_tr_base;
	natural_width guest_gdtr_base;
	natural_width guest_idtr_base;
	natural_width guest_dr7;
	natural_width guest_rsp;
	natural_width guest_rip;
	natural_width guest_rflags;
	natural_width guest_pending_dbg_exceptions;
	natural_width guest_sysenter_esp;
	natural_width guest_sysenter_eip;
	natural_width host_cr0;
	natural_width host_cr3;
	natural_width host_cr4;
	natural_width host_fs_base;
	natural_width host_gs_base;
	natural_width host_tr_base;
	natural_width host_gdtr_base;
	natural_width host_idtr_base;
	natural_width host_ia32_sysenter_esp;
	natural_width host_ia32_sysenter_eip;
	natural_width host_rsp;
	natural_width host_rip;
	natural_width paddingl[8]; /* room for future expansion */
	u32 pin_based_vm_exec_control;
	u32 cpu_based_vm_exec_control;
	u32 exception_bitmap;
	u32 page_fault_error_code_mask;
	u32 page_fault_error_code_match;
	u32 cr3_target_count;
	u32 vm_exit_controls;
	u32 vm_exit_msr_store_count;
	u32 vm_exit_msr_load_count;
	u32 vm_entry_controls;
	u32 vm_entry_msr_load_count;
	u32 vm_entry_intr_info_field;
	u32 vm_entry_exception_error_code;
	u32 vm_entry_instruction_len;
	u32 tpr_threshold;
	u32 secondary_vm_exec_control;
	u32 vm_instruction_error;
	u32 vm_exit_reason;
	u32 vm_exit_intr_info;
	u32 vm_exit_intr_error_code;
	u32 idt_vectoring_info_field;
	u32 idt_vectoring_error_code;
	u32 vm_exit_instruction_len;
	u32 vmx_instruction_info;
	u32 guest_es_limit;
	u32 guest_cs_limit;
	u32 guest_ss_limit;
	u32 guest_ds_limit;
	u32 guest_fs_limit;
	u32 guest_gs_limit;
	u32 guest_ldtr_limit;
	u32 guest_tr_limit;
	u32 guest_gdtr_limit;
	u32 guest_idtr_limit;
	u32 guest_es_ar_bytes;
	u32 guest_cs_ar_bytes;
	u32 guest_ss_ar_bytes;
	u32 guest_ds_ar_bytes;
	u32 guest_fs_ar_bytes;
	u32 guest_gs_ar_bytes;
	u32 guest_ldtr_ar_bytes;
	u32 guest_tr_ar_bytes;
	u32 guest_interruptibility_info;
	u32 guest_activity_state;
	u32 guest_sysenter_cs;
	u32 host_ia32_sysenter_cs;
	u32 vmx_preemption_timer_value;
	u32 padding32[7]; /* room for future expansion */
	u16 virtual_processor_id;
	u16 guest_es_selector;
	u16 guest_cs_selector;
	u16 guest_ss_selector;
	u16 guest_ds_selector;
	u16 guest_fs_selector;
	u16 guest_gs_selector;
	u16 guest_ldtr_selector;
	u16 guest_tr_selector;
	u16 host_es_selector;
	u16 host_cs_selector;
	u16 host_ss_selector;
	u16 host_ds_selector;
	u16 host_fs_selector;
	u16 host_gs_selector;
	u16 host_tr_selector;
};

/*
 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
 */
#define VMCS12_REVISION 0x11e57ed0

/*
 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
 * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
 * current implementation, 4K are reserved to avoid future complications.
 */
#define VMCS12_SIZE 0x1000

/* Used to remember the last vmcs02 used for some recently used vmcs12s */
struct vmcs02_list {
	struct list_head list;
	gpa_t vmptr;
	struct loaded_vmcs vmcs02;
};

/*
 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
 */
struct nested_vmx {
	/* Has the level1 guest done vmxon? */
	bool vmxon;

	/* The guest-physical address of the current VMCS L1 keeps for L2 */
	gpa_t current_vmptr;
	/* The host-usable pointer to the above */
	struct page *current_vmcs12_page;
	struct vmcs12 *current_vmcs12;
	struct vmcs *current_shadow_vmcs;
	/*
	 * Indicates if the shadow vmcs must be updated with the
	 * data held by vmcs12
	 */
	bool sync_shadow_vmcs;

	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
	struct list_head vmcs02_pool;
	int vmcs02_num;
	u64 vmcs01_tsc_offset;
	/* L2 must run next, and mustn't decide to exit to L1. */
	bool nested_run_pending;
	/*
	 * Guest pages referred to in vmcs02 with host-physical pointers, so
	 * we must keep them pinned while L2 runs.
	 */
	struct page *apic_access_page;
};

#define POSTED_INTR_ON  0
/* Posted-Interrupt Descriptor */
struct pi_desc {
	u32 pir[8];     /* Posted interrupt requested */
	u32 control;	/* bit 0 of control is outstanding notification bit */
	u32 rsvd[7];
} __aligned(64);

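/*
 * Accessors for the posted-interrupt descriptor above: pir[] is a
 * 256-bit bitmap with one bit per interrupt vector, and POSTED_INTR_ON
 * is the outstanding-notification bit in 'control'. The helpers use
 * atomic test-and-set/clear so senders and the local CPU can race
 * safely on the same descriptor.
 */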
static bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
	return test_and_set_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
{
	return test_and_clear_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}

struct vcpu_vmx {
	struct kvm_vcpu       vcpu;
	unsigned long         host_rsp;
	u8                    fail;
	u8                    cpl;
	bool                  nmi_known_unmasked;
	u32                   exit_intr_info;
	u32                   idt_vectoring_info;
	ulong                 rflags;
	struct shared_msr_entry *guest_msrs;
	int                   nmsrs;
	int                   save_nmsrs;
	unsigned long	      host_idt_base;
#ifdef CONFIG_X86_64
	u64 		      msr_host_kernel_gs_base;
	u64 		      msr_guest_kernel_gs_base;
#endif
	/*
	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
	 * non-nested (L1) guest, it always points to vmcs01. For a nested
	 * guest (L2), it points to a different VMCS.
	 */
	struct loaded_vmcs    vmcs01;
	struct loaded_vmcs   *loaded_vmcs;
	bool                  __launched; /* temporary, used in vmx_vcpu_run */
	struct msr_autoload {
		unsigned nr;
		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
	} msr_autoload;
	struct {
		int           loaded;
		u16           fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64
		u16           ds_sel, es_sel;
#endif
		int           gs_ldt_reload_needed;
		int           fs_reload_needed;
	} host_state;
	struct {
		int vm86_active;
		ulong save_rflags;
		struct kvm_segment segs[8];
	} rmode;
	struct {
		u32 bitmask; /* 4 bits per segment (1 bit per field) */
		struct kvm_save_segment {
			u16 selector;
			unsigned long base;
			u32 limit;
			u32 ar;
		} seg[8];
	} segment_cache;
	int vpid;
	bool emulation_required;

	/* Support for vnmi-less CPUs */
	int soft_vnmi_blocked;
	ktime_t entry_time;
	s64 vnmi_blocked_time;
	u32 exit_reason;

	bool rdtscp_enabled;

	/* Posted interrupt descriptor */
	struct pi_desc pi_desc;

	/* Support for a guest hypervisor (nested VMX) */
	struct nested_vmx nested;
};

enum segment_cache_field {
	SEG_FIELD_SEL = 0,
	SEG_FIELD_BASE = 1,
	SEG_FIELD_LIMIT = 2,
	SEG_FIELD_AR = 3,

	SEG_FIELD_NR = 4
};

static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_vmx, vcpu);
}

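/*
 * Helpers for building the VMCS-field-encoding -> vmcs12-offset table
 * below. FIELD() maps a single encoding to the offset of the backing
 * member in struct vmcs12; FIELD64() additionally maps the *_HIGH
 * companion encoding to the upper 32 bits at offset + 4.
 */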
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name)	[number] = VMCS12_OFFSET(name)
#define FIELD64(number, name)	[number] = VMCS12_OFFSET(name), \
				[number##_HIGH] = VMCS12_OFFSET(name)+4

static const unsigned long shadow_read_only_fields[] = {
	/*
	 * We do NOT shadow fields that are modified when L0
	 * traps and emulates any vmx instruction (e.g. VMPTRLD,
	 * VMXON...) executed by L1.
	 * For example, VM_INSTRUCTION_ERROR is read
	 * by L1 if a vmx instruction fails (part of the error path).
	 * Note the code assumes this logic. If for some reason
	 * we start shadowing these fields then we need to
	 * force a shadow sync when L0 emulates vmx instructions
	 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
	 * by nested_vmx_failValid)
	 */
	VM_EXIT_REASON,
	VM_EXIT_INTR_INFO,
	VM_EXIT_INSTRUCTION_LEN,
	IDT_VECTORING_INFO_FIELD,
	IDT_VECTORING_ERROR_CODE,
	VM_EXIT_INTR_ERROR_CODE,
	EXIT_QUALIFICATION,
	GUEST_LINEAR_ADDRESS,
	GUEST_PHYSICAL_ADDRESS
};
static const int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static const unsigned long shadow_read_write_fields[] = {
	GUEST_RIP,
	GUEST_RSP,
	GUEST_CR0,
	GUEST_CR3,
	GUEST_CR4,
	GUEST_INTERRUPTIBILITY_INFO,
	GUEST_RFLAGS,
	GUEST_CS_SELECTOR,
	GUEST_CS_AR_BYTES,
	GUEST_CS_LIMIT,
	GUEST_CS_BASE,
	GUEST_ES_BASE,
	CR0_GUEST_HOST_MASK,
	CR0_READ_SHADOW,
	CR4_READ_SHADOW,
	TSC_OFFSET,
	EXCEPTION_BITMAP,
	CPU_BASED_VM_EXEC_CONTROL,
	VM_ENTRY_EXCEPTION_ERROR_CODE,
	VM_ENTRY_INTR_INFO_FIELD,
	VM_ENTRY_INSTRUCTION_LEN,
	VM_ENTRY_EXCEPTION_ERROR_CODE,
	HOST_FS_BASE,
	HOST_GS_BASE,
	HOST_FS_SELECTOR,
	HOST_GS_SELECTOR
};
static const int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static const unsigned short vmcs_field_to_offset_table[] = {
	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
	FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
	FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
	FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
	FIELD(HOST_ES_SELECTOR, host_es_selector),
	FIELD(HOST_CS_SELECTOR, host_cs_selector),
	FIELD(HOST_SS_SELECTOR, host_ss_selector),
	FIELD(HOST_DS_SELECTOR, host_ds_selector),
	FIELD(HOST_FS_SELECTOR, host_fs_selector),
	FIELD(HOST_GS_SELECTOR, host_gs_selector),
	FIELD(HOST_TR_SELECTOR, host_tr_selector),
	FIELD64(IO_BITMAP_A, io_bitmap_a),
	FIELD64(IO_BITMAP_B, io_bitmap_b),
	FIELD64(MSR_BITMAP, msr_bitmap),
	FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
	FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
	FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
	FIELD64(TSC_OFFSET, tsc_offset),
	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
	FIELD64(EPT_POINTER, ept_pointer),
	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
	FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
	FIELD64(GUEST_PDPTR0, guest_pdptr0),
	FIELD64(GUEST_PDPTR1, guest_pdptr1),
	FIELD64(GUEST_PDPTR2, guest_pdptr2),
	FIELD64(GUEST_PDPTR3, guest_pdptr3),
	FIELD64(HOST_IA32_PAT, host_ia32_pat),
	FIELD64(HOST_IA32_EFER, host_ia32_efer),
	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
	FIELD(EXCEPTION_BITMAP, exception_bitmap),
	FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
	FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
	FIELD(CR3_TARGET_COUNT, cr3_target_count),
	FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
	FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
	FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
	FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
	FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
	FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
	FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
	FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
	FIELD(TPR_THRESHOLD, tpr_threshold),
	FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
	FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
	FIELD(VM_EXIT_REASON, vm_exit_reason),
	FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
	FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
	FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
	FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
	FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
	FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
	FIELD(GUEST_ES_LIMIT, guest_es_limit),
	FIELD(GUEST_CS_LIMIT, guest_cs_limit),
	FIELD(GUEST_SS_LIMIT, guest_ss_limit),
	FIELD(GUEST_DS_LIMIT, guest_ds_limit),
	FIELD(GUEST_FS_LIMIT, guest_fs_limit),
	FIELD(GUEST_GS_LIMIT, guest_gs_limit),
	FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
	FIELD(GUEST_TR_LIMIT, guest_tr_limit),
	FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
	FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
	FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
	FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
	FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
	FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
	FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
	FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
	FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
	FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
	FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
	FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
	FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
	FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
	FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
	FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
	FIELD(CR4_READ_SHADOW, cr4_read_shadow),
	FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
	FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
	FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
	FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
	FIELD(EXIT_QUALIFICATION, exit_qualification),
	FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
	FIELD(GUEST_CR0, guest_cr0),
	FIELD(GUEST_CR3, guest_cr3),
	FIELD(GUEST_CR4, guest_cr4),
	FIELD(GUEST_ES_BASE, guest_es_base),
	FIELD(GUEST_CS_BASE, guest_cs_base),
	FIELD(GUEST_SS_BASE, guest_ss_base),
	FIELD(GUEST_DS_BASE, guest_ds_base),
	FIELD(GUEST_FS_BASE, guest_fs_base),
	FIELD(GUEST_GS_BASE, guest_gs_base),
	FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
	FIELD(GUEST_TR_BASE, guest_tr_base),
	FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
	FIELD(GUEST_IDTR_BASE, guest_idtr_base),
	FIELD(GUEST_DR7, guest_dr7),
	FIELD(GUEST_RSP, guest_rsp),
	FIELD(GUEST_RIP, guest_rip),
	FIELD(GUEST_RFLAGS, guest_rflags),
	FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
	FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
	FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
	FIELD(HOST_CR0, host_cr0),
	FIELD(HOST_CR3, host_cr3),
	FIELD(HOST_CR4, host_cr4),
	FIELD(HOST_FS_BASE, host_fs_base),
	FIELD(HOST_GS_BASE, host_gs_base),
	FIELD(HOST_TR_BASE, host_tr_base),
	FIELD(HOST_GDTR_BASE, host_gdtr_base),
	FIELD(HOST_IDTR_BASE, host_idtr_base),
	FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
	FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
	FIELD(HOST_RSP, host_rsp),
	FIELD(HOST_RIP, host_rip),
};
static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);

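/*
 * Translate a VMCS field encoding into a byte offset within struct
 * vmcs12, or -1 if the field is not backed by the table above.
 */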
static inline short vmcs_field_to_offset(unsigned long field)
{
	if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
		return -1;
	return vmcs_field_to_offset_table[field];
}

static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.current_vmcs12;
}

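/*
 * Map an L1 guest-physical address to a pinned host page. Returns NULL
 * on failure; pages obtained here must be released again with
 * nested_release_page() or nested_release_page_clean().
 */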
static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
{
	struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
	if (is_error_page(page))
		return NULL;

	return page;
}

static void nested_release_page(struct page *page)
{
	kvm_release_page_dirty(page);
}

static void nested_release_page_clean(struct page *page)
{
	kvm_release_page_clean(page);
}

static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DEFINE_PER_CPU(struct desc_ptr, host_gdt);

static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static unsigned long *vmx_msr_bitmap_legacy_x2apic;
static unsigned long *vmx_msr_bitmap_longmode_x2apic;
static unsigned long *vmx_vmread_bitmap;
static unsigned long *vmx_vmwrite_bitmap;

static bool cpu_has_load_ia32_efer;
static bool cpu_has_load_perf_global_ctrl;

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

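/*
 * VMX configuration chosen at module load time: the VMCS size, order
 * and revision id plus the pin-based, CPU-based, secondary, VM-exit
 * and VM-entry control bits the host will program into every VMCS.
 */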
static struct vmcs_config {
	int size;
	int order;
	u32 revision_id;
	u32 pin_based_exec_ctrl;
	u32 cpu_based_exec_ctrl;
	u32 cpu_based_2nd_exec_ctrl;
	u32 vmexit_ctrl;
	u32 vmentry_ctrl;
} vmcs_config;

static struct vmx_capability {
	u32 ept;
	u32 vpid;
} vmx_capability;

#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {                                   \
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,		   	\
		.limit = GUEST_##seg##_LIMIT,		   	\
		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

static u64 host_efer;

static void ept_save_pdptrs(struct kvm_vcpu *vcpu);

/*
 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
 * away by decrementing the array size.
 */
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)

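/*
 * The is_*() predicates below decode a VM_EXIT_INTR_INFO value: they
 * check the valid bit, the interruption type and (for exceptions) the
 * vector in a single comparison.
 */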
static inline bool is_page_fault(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool is_no_device(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool is_invalid_opcode(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool is_external_interrupt(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}

static inline bool is_machine_check(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool cpu_has_vmx_msr_bitmap(void)
{
	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
}

static inline bool cpu_has_vmx_tpr_shadow(void)
{
	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}

static inline bool vm_need_tpr_shadow(struct kvm *kvm)
{
	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
}

static inline bool cpu_has_secondary_exec_ctrls(void)
{
	return vmcs_config.cpu_based_exec_ctrl &
		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
}

static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
}

static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
}

static inline bool cpu_has_vmx_apic_register_virt(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_APIC_REGISTER_VIRT;
}

static inline bool cpu_has_vmx_virtual_intr_delivery(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
}

static inline bool cpu_has_vmx_posted_intr(void)
{
	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
}

static inline bool cpu_has_vmx_apicv(void)
{
	return cpu_has_vmx_apic_register_virt() &&
		cpu_has_vmx_virtual_intr_delivery() &&
		cpu_has_vmx_posted_intr();
}

static inline bool cpu_has_vmx_flexpriority(void)
{
	return cpu_has_vmx_tpr_shadow() &&
		cpu_has_vmx_virtualize_apic_accesses();
}

static inline bool cpu_has_vmx_ept_execute_only(void)
{
	return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
}

static inline bool cpu_has_vmx_eptp_uncacheable(void)
{
	return vmx_capability.ept & VMX_EPTP_UC_BIT;
}

static inline bool cpu_has_vmx_eptp_writeback(void)
{
	return vmx_capability.ept & VMX_EPTP_WB_BIT;
}

static inline bool cpu_has_vmx_ept_2m_page(void)
{
	return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
}

static inline bool cpu_has_vmx_ept_1g_page(void)
{
	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}

static inline bool cpu_has_vmx_ept_4levels(void)
{
	return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
}

static inline bool cpu_has_vmx_ept_ad_bits(void)
{
	return vmx_capability.ept & VMX_EPT_AD_BIT;
}

static inline bool cpu_has_vmx_invept_context(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invept_global(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}

static inline bool cpu_has_vmx_invvpid_single(void)
{
	return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invvpid_global(void)
{
	return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_ept(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_EPT;
}

static inline bool cpu_has_vmx_unrestricted_guest(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_UNRESTRICTED_GUEST;
}

static inline bool cpu_has_vmx_ple(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}

static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
	return flexpriority_enabled && irqchip_in_kernel(kvm);
}

static inline bool cpu_has_vmx_vpid(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_VPID;
}

static inline bool cpu_has_vmx_rdtscp(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_RDTSCP;
}

static inline bool cpu_has_vmx_invpcid(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_INVPCID;
}

static inline bool cpu_has_virtual_nmis(void)
{
	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}

static inline bool cpu_has_vmx_wbinvd_exit(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_WBINVD_EXITING;
}

static inline bool cpu_has_vmx_shadow_vmcs(void)
{
	u64 vmx_msr;
	rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
	/* check if the cpu supports writing r/o exit information fields */
	if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
		return false;

	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_SHADOW_VMCS;
}

static inline bool report_flexpriority(void)
{
	return flexpriority_enabled;
}

static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
{
	return vmcs12->cpu_based_vm_exec_control & bit;
}

static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
{
	return (vmcs12->cpu_based_vm_exec_control &
			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
		(vmcs12->secondary_vm_exec_control & bit);
}

static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
	struct kvm_vcpu *vcpu)
{
	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
}

static inline bool is_exception(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
}

static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
			struct vmcs12 *vmcs12,
			u32 reason, unsigned long qualification);

static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	for (i = 0; i < vmx->nmsrs; ++i)
		if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
			return i;
	return -1;
}

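/*
 * Wrappers around the INVVPID and INVEPT instructions. Each builds the
 * 128-bit descriptor expected by the instruction on the stack and
 * crashes (ud2) on VMfail instead of returning an error.
 */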
static inline void __invvpid(int ext, u16 vpid, gva_t gva)
{
    struct {
	u64 vpid : 16;
	u64 rsvd : 48;
	u64 gva;
    } operand = { vpid, 0, gva };

    asm volatile (__ex(ASM_VMX_INVVPID)
		  /* CF==1 or ZF==1 --> rc = -1 */
		  "; ja 1f ; ud2 ; 1:"
		  : : "a"(&operand), "c"(ext) : "cc", "memory");
}

static inline void __invept(int ext, u64 eptp, gpa_t gpa)
{
	struct {
		u64 eptp, gpa;
	} operand = {eptp, gpa};

	asm volatile (__ex(ASM_VMX_INVEPT)
			/* CF==1 or ZF==1 --> rc = -1 */
			"; ja 1f ; ud2 ; 1:\n"
			: : "a" (&operand), "c" (ext) : "cc", "memory");
}

static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = __find_msr_index(vmx, msr);
	if (i >= 0)
		return &vmx->guest_msrs[i];
	return NULL;
}

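/*
 * vmcs_clear()/vmcs_load() below wrap VMCLEAR and VMPTRLD on the
 * physical address of a VMCS; "setna" captures the VMfail condition
 * from the flags and a failure is only reported via printk.
 */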
static void vmcs_clear(struct vmcs *vmcs)
{
	u64 phys_addr = __pa(vmcs);
	u8 error;

	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
		      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
		      : "cc", "memory");
	if (error)
		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
		       vmcs, phys_addr);
}

static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
{
	vmcs_clear(loaded_vmcs->vmcs);
	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}

static void vmcs_load(struct vmcs *vmcs)
{
	u64 phys_addr = __pa(vmcs);
	u8 error;

	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
			: "cc", "memory");
	if (error)
		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
		       vmcs, phys_addr);
}

#ifdef CONFIG_KEXEC
/*
 * This bitmap is used to indicate whether the vmclear
 * operation is enabled on all cpus. All disabled by
 * default.
 */
static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;

static inline void crash_enable_local_vmclear(int cpu)
{
	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static inline void crash_disable_local_vmclear(int cpu)
{
	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static inline int crash_local_vmclear_enabled(int cpu)
{
	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static void crash_vmclear_local_loaded_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	if (!crash_local_vmclear_enabled(cpu))
		return;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link)
		vmcs_clear(v->vmcs);
}
#else
static inline void crash_enable_local_vmclear(int cpu) { }
static inline void crash_disable_local_vmclear(int cpu) { }
#endif /* CONFIG_KEXEC */

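/*
 * __loaded_vmcs_clear() runs on the CPU that owns a loaded VMCS (see
 * loaded_vmcs_clear(), which uses smp_call_function_single): it drops
 * the VMCS from that CPU's loaded_vmcss_on_cpu list and VMCLEARs it.
 */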
static void __loaded_vmcs_clear(void *arg)
{
	struct loaded_vmcs *loaded_vmcs = arg;
	int cpu = raw_smp_processor_id();

	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;
	crash_disable_local_vmclear(cpu);
	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

	/*
	 * we should ensure that updating loaded_vmcs->loaded_vmcss_on_cpu_link
	 * happens before setting loaded_vmcs->cpu to -1, which is done in
	 * loaded_vmcs_init. Otherwise, another cpu could see cpu == -1 first
	 * and then add the vmcs to the percpu list before it is deleted.
	 */
	smp_wmb();

	loaded_vmcs_init(loaded_vmcs);
	crash_enable_local_vmclear(cpu);
}

static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
	int cpu = loaded_vmcs->cpu;

	if (cpu != -1)
		smp_call_function_single(cpu,
			 __loaded_vmcs_clear, loaded_vmcs, 1);
}

static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
{
	if (vmx->vpid == 0)
		return;

	if (cpu_has_vmx_invvpid_single())
		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
}

static inline void vpid_sync_vcpu_global(void)
{
	if (cpu_has_vmx_invvpid_global())
		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
}

static inline void vpid_sync_context(struct vcpu_vmx *vmx)
{
	if (cpu_has_vmx_invvpid_single())
		vpid_sync_vcpu_single(vmx);
	else
		vpid_sync_vcpu_global();
}

static inline void ept_sync_global(void)
{
	if (cpu_has_vmx_invept_global())
		__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
}

static inline void ept_sync_context(u64 eptp)
{
	if (enable_ept) {
		if (cpu_has_vmx_invept_context())
			__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
		else
			ept_sync_global();
	}
}

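/*
 * VMCS accessors. All reads go through VMREAD on the current VMCS;
 * vmcs_read64() issues two 32-bit reads on 32-bit hosts, and the
 * write side funnels everything through vmcs_writel().
 */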
static __always_inline unsigned long vmcs_readl(unsigned long field)
{
	unsigned long value;

	asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
		      : "=a"(value) : "d"(field) : "cc");
	return value;
}

static __always_inline u16 vmcs_read16(unsigned long field)
{
	return vmcs_readl(field);
}

static __always_inline u32 vmcs_read32(unsigned long field)
{
	return vmcs_readl(field);
}

static __always_inline u64 vmcs_read64(unsigned long field)
{
#ifdef CONFIG_X86_64
	return vmcs_readl(field);
#else
	return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
#endif
}

static noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
	       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
	dump_stack();
}

static void vmcs_writel(unsigned long field, unsigned long value)
{
	u8 error;

	asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
		       : "=q"(error) : "a"(value), "d"(field) : "cc");
	if (unlikely(error))
		vmwrite_error(field, value);
}

static void vmcs_write16(unsigned long field, u16 value)
{
	vmcs_writel(field, value);
}

static void vmcs_write32(unsigned long field, u32 value)
{
	vmcs_writel(field, value);
}

static void vmcs_write64(unsigned long field, u64 value)
{
	vmcs_writel(field, value);
#ifndef CONFIG_X86_64
	asm volatile ("");
	vmcs_writel(field+1, value >> 32);
#endif
}

static void vmcs_clear_bits(unsigned long field, u32 mask)
{
	vmcs_writel(field, vmcs_readl(field) & ~mask);
}

static void vmcs_set_bits(unsigned long field, u32 mask)
{
	vmcs_writel(field, vmcs_readl(field) | mask);
}

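/*
 * Segment cache handling: vcpu_vmx caches selector, base, limit and AR
 * bytes read from the VMCS in segment_cache, with one valid bit per
 * (segment, field) pair in 'bitmask'. Clearing the bitmask invalidates
 * the whole cache.
 */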
static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
{
	vmx->segment_cache.bitmask = 0;
}

static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{