/*
 * Core of Xen paravirt_ops implementation.
 *
 * This file contains the xen_paravirt_ops structure itself, and the
 * implementations for:
 * - privileged instructions
 * - interrupt flags
 * - segment operations
 * - booting and setup
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/preempt.h>
#include <linux/hardirq.h>
#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/start_kernel.h>
#include <linux/sched.h>
#include <linux/kprobes.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
#include <linux/console.h>
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/memblock.h>

#include <xen/xen.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvm.h>
#include <xen/hvc-console.h>
#include <xen/acpi.h>

#include <asm/paravirt.h>
#include <asm/apic.h>
#include <asm/page.h>
#include <asm/xen/pci.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/msr-index.h>
#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
#include <asm/mwait.h>
#include <asm/pci_x86.h>

#ifdef CONFIG_ACPI
#include <linux/acpi.h>
#include <asm/acpi.h>
#include <acpi/pdc_intel.h>
#include <acpi/processor.h>
#include <xen/interface/platform.h>
#endif

#include "xen-ops.h"
#include "mmu.h"
#include "smp.h"
#include "multicalls.h"

EXPORT_SYMBOL_GPL(hypercall_page);

DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);

enum xen_domain_type xen_domain_type = XEN_NATIVE;
EXPORT_SYMBOL_GPL(xen_domain_type);

unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
EXPORT_SYMBOL(machine_to_phys_mapping);
unsigned long  machine_to_phys_nr;
EXPORT_SYMBOL(machine_to_phys_nr);

struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);

struct shared_info xen_dummy_shared_info;

void *xen_initial_gdt;

RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);

__read_mostly int xen_have_vector_callback;
EXPORT_SYMBOL_GPL(xen_have_vector_callback);

/*
 * Point at some empty memory to start with. We map the real shared_info
 * page as soon as fixmap is up and running.
 */
struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;

/*
 * Flag to determine whether vcpu info placement is available on all
 * VCPUs.  We assume it is to start with, and then set it to zero on
 * the first failure.  This is because it can succeed on some VCPUs
 * and not others, since it can involve hypervisor memory allocation,
 * or because the guest failed to guarantee all the appropriate
 * constraints on all VCPUs (ie buffer can't cross a page boundary).
 *
 * Note that any particular CPU may be using a placed vcpu structure,
 * but we can only optimise if they all are.
 *
 * 0: not available, 1: available
 */
static int have_vcpu_info_placement = 1;

struct tls_descs {
	struct desc_struct desc[3];
};

/*
 * Updating the 3 TLS descriptors in the GDT on every task switch is
 * surprisingly expensive so we avoid updating them if they haven't
 * changed.  Since Xen writes different descriptors than the one
 * passed in the update_descriptor hypercall we keep shadow copies to
 * compare against.
 */
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
static void clamp_max_cpus(void)
{
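	/*
	 * Without vcpu_info placement, only the vcpu_info slots embedded
	 * in the shared_info page are usable, so don't try to bring up
	 * more than MAX_VIRT_CPUS vcpus.
	 */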
#ifdef CONFIG_SMP
	if (setup_max_cpus > MAX_VIRT_CPUS)
		setup_max_cpus = MAX_VIRT_CPUS;
#endif
}

static void xen_vcpu_setup(int cpu)
{
	struct vcpu_register_vcpu_info info;
	int err;
	struct vcpu_info *vcpup;

	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	if (cpu < MAX_VIRT_CPUS)
		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];

	if (!have_vcpu_info_placement) {
		if (cpu >= MAX_VIRT_CPUS)
			clamp_max_cpus();
		return;
	}

	vcpup = &per_cpu(xen_vcpu_info, cpu);
	info.mfn = arbitrary_virt_to_mfn(vcpup);
	info.offset = offset_in_page(vcpup);

	/* Check to see if the hypervisor will put the vcpu_info
	   structure where we want it, which allows direct access via
	   a percpu-variable. */
	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);

	if (err) {
		printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
		have_vcpu_info_placement = 0;
		clamp_max_cpus();
	} else {
		/* This cpu is using the registered vcpu info, even if
		   later ones fail to. */
		per_cpu(xen_vcpu, cpu) = vcpup;
	}
}

/*
 * On restore, set the vcpu placement up again.
 * If it fails, then we're in a bad state, since
 * we can't back out from using it...
 */
void xen_vcpu_restore(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		bool other_cpu = (cpu != smp_processor_id());

		if (other_cpu &&
		    HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
			BUG();

		xen_setup_runstate_info(cpu);

		if (have_vcpu_info_placement)
			xen_vcpu_setup(cpu);

		if (other_cpu &&
		    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
			BUG();
	}
}

static void __init xen_banner(void)
{
	unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
	struct xen_extraversion extra;
	HYPERVISOR_xen_version(XENVER_extraversion, &extra);

	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
	       pv_info.name);
	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
	       version >> 16, version & 0xffff, extra.extraversion,
	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}
/* Check if running on Xen version (major, minor) or later */
bool
xen_running_on_version_or_later(unsigned int major, unsigned int minor)
{
	unsigned int version;

	if (!xen_domain())
		return false;

	version = HYPERVISOR_xen_version(XENVER_version, NULL);
	if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
		((version >> 16) > major))
		return true;
	return false;
}

#define CPUID_THERM_POWER_LEAF 6
#define APERFMPERF_PRESENT 0

static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;

static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask;
static __read_mostly unsigned int cpuid_leaf5_ecx_val;
static __read_mostly unsigned int cpuid_leaf5_edx_val;

static void xen_cpuid(unsigned int *ax, unsigned int *bx,
		      unsigned int *cx, unsigned int *dx)
{
	unsigned maskebx = ~0;
	unsigned maskecx = ~0;
	unsigned maskedx = ~0;
	unsigned setecx = 0;

	/*
	 * Mask out inconvenient features, to try and disable as many
	 * unsupported kernel subsystems as possible.
	 */
	switch (*ax) {
	case 1:
		maskecx = cpuid_leaf1_ecx_mask;
		setecx = cpuid_leaf1_ecx_set_mask;
		maskedx = cpuid_leaf1_edx_mask;
		break;

	case CPUID_MWAIT_LEAF:
		/* Synthesize the values.. */
		*ax = 0;
		*bx = 0;
		*cx = cpuid_leaf5_ecx_val;
		*dx = cpuid_leaf5_edx_val;
		return;

	case CPUID_THERM_POWER_LEAF:
		/* Disabling APERFMPERF for kernel usage */
		maskecx = ~(1 << APERFMPERF_PRESENT);
		break;

	case 0xb:
		/* Suppress extended topology stuff */
		maskebx = 0;
		break;
	}

	asm(XEN_EMULATE_PREFIX "cpuid"
		: "=a" (*ax),
		  "=b" (*bx),
		  "=c" (*cx),
		  "=d" (*dx)
		: "0" (*ax), "2" (*cx));

	*bx &= maskebx;
	*cx &= maskecx;
	*cx |= setecx;
	*dx &= maskedx;
}

static bool __init xen_check_mwait(void)
{
#ifdef CONFIG_ACPI
	struct xen_platform_op op = {
		.cmd			= XENPF_set_processor_pminfo,
		.u.set_pminfo.id	= -1,
		.u.set_pminfo.type	= XEN_PM_PDC,
	};
	uint32_t buf[3];
	unsigned int ax, bx, cx, dx;
	unsigned int mwait_mask;

	/* We need to determine whether it is OK to expose the MWAIT
	 * capability to the kernel to harvest deeper than C3 states from ACPI
	 * _CST using the processor_harvest_xen.c module. For this to work, we
	 * need to gather the MWAIT_LEAF values (which the cstate.c code
	 * checks against). The hypervisor won't expose the MWAIT flag because
	 * it would break backwards compatibility; so we will find out directly
	 * from the hardware and hypercall.
	 */
	if (!xen_initial_domain())
		return false;

	/*
	 * When running on a platform older than Xen 4.2, do not expose
	 * MWAIT, to avoid the risk of loading the native ACPI PAD driver.
	 */
	if (!xen_running_on_version_or_later(4, 2))
		return false;

	ax = 1;
	cx = 0;

	native_cpuid(&ax, &bx, &cx, &dx);

	mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
		     (1 << (X86_FEATURE_MWAIT % 32));

	if ((cx & mwait_mask) != mwait_mask)
		return false;

	/* We need to emulate the MWAIT_LEAF and for that we need both
	 * ecx and edx. The hypercall provides only partial information.
	 */

	ax = CPUID_MWAIT_LEAF;
	bx = 0;
	cx = 0;
	dx = 0;

	native_cpuid(&ax, &bx, &cx, &dx);

	/* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
	 * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
	 */
	buf[0] = ACPI_PDC_REVISION_ID;
	buf[1] = 1;
	buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);

	set_xen_guest_handle(op.u.set_pminfo.pdc, buf);

	if ((HYPERVISOR_dom0_op(&op) == 0) &&
	    (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
		cpuid_leaf5_ecx_val = cx;
		cpuid_leaf5_edx_val = dx;
	}
	return true;
#else
	return false;
#endif
}
static void __init xen_init_cpuid_mask(void)
{
	unsigned int ax, bx, cx, dx;
	unsigned int xsave_mask;

	cpuid_leaf1_edx_mask =
		~((1 << X86_FEATURE_MTRR) |  /* disable MTRR */
		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */

	if (!xen_initial_domain())
		cpuid_leaf1_edx_mask &=
			~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
			  (1 << X86_FEATURE_ACPI));  /* disable ACPI */
	ax = 1;
	cx = 0;
	xen_cpuid(&ax, &bx, &cx, &dx);

	xsave_mask =
		(1 << (X86_FEATURE_XSAVE % 32)) |
		(1 << (X86_FEATURE_OSXSAVE % 32));

	/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
	if ((cx & xsave_mask) != xsave_mask)
		cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
	if (xen_check_mwait())
		cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));
}

static void xen_set_debugreg(int reg, unsigned long val)
{
	HYPERVISOR_set_debugreg(reg, val);
}

static unsigned long xen_get_debugreg(int reg)
{
	return HYPERVISOR_get_debugreg(reg);
}

static void xen_end_context_switch(struct task_struct *next)
{
	xen_mc_flush();
	paravirt_end_context_switch(next);
}

static unsigned long xen_store_tr(void)
{
	return 0;
}

/*
 * Set the page permissions for a particular virtual address.  If the
 * address is a vmalloc mapping (or other non-linear mapping), then
 * find the linear mapping of the page and also set its protections to
 * match.
 */
static void set_aliased_prot(void *v, pgprot_t prot)
{
	int level;
	pte_t *ptep;
	pte_t pte;
	unsigned long pfn;
	struct page *page;

	ptep = lookup_address((unsigned long)v, &level);
	BUG_ON(ptep == NULL);

	pfn = pte_pfn(*ptep);
	page = pfn_to_page(pfn);

	pte = pfn_pte(pfn, prot);

	if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
		BUG();

	if (!PageHighMem(page)) {
		void *av = __va(PFN_PHYS(pfn));

		if (av != v)
			if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
				BUG();
	} else
		kmap_flush_unused();
}

static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
	int i;

	for (i = 0; i < entries; i += entries_per_page)
		set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
}

static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
{
	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
	int i;

	for (i = 0; i < entries; i += entries_per_page)
		set_aliased_prot(ldt + i, PAGE_KERNEL);
}

static void xen_set_ldt(const void *addr, unsigned entries)
{
	struct mmuext_op *op;
	struct multicall_space mcs = xen_mc_entry(sizeof(*op));

	trace_xen_cpu_set_ldt(addr, entries);

	op = mcs.args;
	op->cmd = MMUEXT_SET_LDT;
	op->arg1.linear_addr = (unsigned long)addr;
	op->arg2.nr_ents = entries;

	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_load_gdt(const struct desc_ptr *dtr)
{
	unsigned long va = dtr->address;
	unsigned int size = dtr->size + 1;
	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
	unsigned long frames[pages];
	int f;

	/*
	 * A GDT can be up to 64k in size, which corresponds to 8192
	 * 8-byte entries, or 16 4k pages..
	 */

	BUG_ON(size > 65536);
	BUG_ON(va & ~PAGE_MASK);

	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
		int level;
		pte_t *ptep;
		unsigned long pfn, mfn;
		void *virt;

		/*
		 * The GDT is per-cpu and is in the percpu data area.
		 * That can be virtually mapped, so we need to do a
		 * page-walk to get the underlying MFN for the
		 * hypercall.  The page can also be in the kernel's
		 * linear range, so we need to RO that mapping too.
		 */
		ptep = lookup_address(va, &level);
		BUG_ON(ptep == NULL);

		pfn = pte_pfn(*ptep);
		mfn = pfn_to_mfn(pfn);
		virt = __va(PFN_PHYS(pfn));

		frames[f] = mfn;

		make_lowmem_page_readonly((void *)va);
		make_lowmem_page_readonly(virt);
	}

	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
		BUG();
}

/*
 * load_gdt for early boot, when the gdt is only mapped once
 */
static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
{
	unsigned long va = dtr->address;
	unsigned int size = dtr->size + 1;
	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
	unsigned long frames[pages];
	int f;

	/*
	 * A GDT can be up to 64k in size, which corresponds to 8192
	 * 8-byte entries, or 16 4k pages..
	 */

	BUG_ON(size > 65536);
	BUG_ON(va & ~PAGE_MASK);

	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
		pte_t pte;
		unsigned long pfn, mfn;

		pfn = virt_to_pfn(va);
		mfn = pfn_to_mfn(pfn);

		pte = pfn_pte(pfn, PAGE_KERNEL_RO);

		if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
			BUG();

		frames[f] = mfn;
	}

	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
		BUG();
}

static inline bool desc_equal(const struct desc_struct *d1,
			      const struct desc_struct *d2)
{
	return d1->a == d2->a && d1->b == d2->b;
}

static void load_TLS_descriptor(struct thread_struct *t,
				unsigned int cpu, unsigned int i)
{
	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
	struct desc_struct *gdt;
	xmaddr_t maddr;
	struct multicall_space mc;

	if (desc_equal(shadow, &t->tls_array[i]))
		return;

	*shadow = t->tls_array[i];

	gdt = get_cpu_gdt_table(cpu);
	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
	mc = __xen_mc_entry(0);

	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}

static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
	/*
	 * XXX sleazy hack: If we're being called in a lazy-cpu zone
	 * and lazy gs handling is enabled, it means we're in a
	 * context switch, and %gs has just been saved.  This means we
	 * can zero it out to prevent faults on exit from the
	 * hypervisor if the next process has no %gs.  Either way, it
	 * has been saved, and the new value will get loaded properly.
	 * This will go away as soon as Xen has been modified to not
	 * save/restore %gs for normal hypercalls.
	 *
	 * On x86_64, this hack is not used for %gs, because gs points
	 * to KERNEL_GS_BASE (and uses it for PDA references), so we
	 * must not zero %gs on x86_64
	 *
	 * For x86_64, we need to zero %fs, otherwise we may get an
	 * exception between the new %fs descriptor being loaded and
	 * %fs being effectively cleared at __switch_to().
	 */
	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
		lazy_load_gs(0);
#else
		loadsegment(fs, 0);
#endif
	}

	xen_mc_batch();

	load_TLS_descriptor(t, cpu, 0);
	load_TLS_descriptor(t, cpu, 1);
	load_TLS_descriptor(t, cpu, 2);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

#ifdef CONFIG_X86_64
static void xen_load_gs_index(unsigned int idx)
{
	if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
		BUG();
}
#endif

static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
				const void *ptr)
{
	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
	u64 entry = *(u64 *)ptr;

	trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);

	preempt_disable();

	xen_mc_flush();
	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
		BUG();

	preempt_enable();
}

static int cvt_gate_to_trap(int vector, const gate_desc *val,
			    struct trap_info *info)
{
	unsigned long addr;

	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
		return 0;

	info->vector = vector;

	addr = gate_offset(*val);
#ifdef CONFIG_X86_64
	/*
	 * Look for known traps using IST, and substitute them
	 * appropriately.  The debugger ones are the only ones we care
	 * about.  Xen will handle faults like double_fault, so we
	 * should never see them.  Warn if there's an unexpected
	 * IST-using fault handler.
	 */
	if (addr == (unsigned long)debug)
		addr = (unsigned long)xen_debug;
	else if (addr == (unsigned long)int3)
		addr = (unsigned long)xen_int3;
	else if (addr == (unsigned long)stack_segment)
		addr = (unsigned long)xen_stack_segment;
	else if (addr == (unsigned long)double_fault ||
		 addr == (unsigned long)nmi) {
		/* Don't need to handle these */
		return 0;
#ifdef CONFIG_X86_MCE
	} else if (addr == (unsigned long)machine_check) {
		/*
		 * When the Xen hypervisor injects a vMCE into the guest,
		 * use the native MCE handler to handle it.
		 */
		;
#endif
	} else {
		/* Some other trap using IST? */
		if (WARN_ON(val->ist != 0))
			return 0;
	}
#endif	/* CONFIG_X86_64 */
	info->address = addr;

	info->cs = gate_segment(*val);
	info->flags = val->dpl;
	/* interrupt gates clear IF */
	if (val->type == GATE_INTERRUPT)
		info->flags |= 1 << 2;

	return 1;
}

/* Locations of each CPU's IDT */
static DEFINE_PER_CPU(struct desc_ptr, idt_desc);

/* Set an IDT entry.  If the entry is part of the current IDT, then
   also update Xen. */
static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
{
	unsigned long p = (unsigned long)&dt[entrynum];
	unsigned long start, end;

	trace_xen_cpu_write_idt_entry(dt, entrynum, g);

	preempt_disable();

	start = __this_cpu_read(idt_desc.address);
	end = start + __this_cpu_read(idt_desc.size) + 1;

	xen_mc_flush();

	native_write_idt_entry(dt, entrynum, g);

	if (p >= start && (p + 8) <= end) {
		struct trap_info info[2];

		info[1].address = 0;

		if (cvt_gate_to_trap(entrynum, g, &info[0]))
			if (HYPERVISOR_set_trap_table(info))
				BUG();
	}

	preempt_enable();
}

static void xen_convert_trap_info(const struct desc_ptr *desc,
				  struct trap_info *traps)
{
	unsigned in, out, count;

	count = (desc->size+1) / sizeof(gate_desc);
	BUG_ON(count > 256);

	for (in = out = 0; in < count; in++) {
		gate_desc *entry = (gate_desc*)(desc->address) + in;

		if (cvt_gate_to_trap(in, entry, &traps[out]))
			out++;
	}
	traps[out].address = 0;
}

void xen_copy_trap_info(struct trap_info *traps)
{
	const struct desc_ptr *desc = &__get_cpu_var(idt_desc);

	xen_convert_trap_info(desc, traps);
}

/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
   hold a spinlock to protect the static traps[] array (static because
   it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct desc_ptr *desc)
{
	static DEFINE_SPINLOCK(lock);
	static struct trap_info traps[257];

	trace_xen_cpu_load_idt(desc);

	spin_lock(&lock);

	__get_cpu_var(idt_desc) = *desc;

	xen_convert_trap_info(desc, traps);

	xen_mc_flush();
	if (HYPERVISOR_set_trap_table(traps))
		BUG();

	spin_unlock(&lock);
}

/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
   they're handled differently. */
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
				const void *desc, int type)
{
	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);

	preempt_disable();

	switch (type) {
	case DESC_LDT:
	case DESC_TSS:
		/* ignore */
		break;

	default: {
		xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);

		xen_mc_flush();
		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
			BUG();
	}

	}

	preempt_enable();
}

/*
 * Version of write_gdt_entry for use at early boot-time needed to
 * update an entry as simply as possible.
 */
static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
					    const void *desc, int type)
{
	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);

	switch (type) {
	case DESC_LDT:
	case DESC_TSS:
		/* ignore */
		break;

	default: {
		xmaddr_t maddr = virt_to_machine(&dt[entry]);

		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
			dt[entry] = *(struct desc_struct *)desc;
	}

	}
}

static void xen_load_sp0(struct tss_struct *tss,
			 struct thread_struct *thread)
{
	struct multicall_space mcs;

	mcs = xen_mc_entry(0);
	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_set_iopl_mask(unsigned mask)
{
	struct physdev_set_iopl set_iopl;

	/* Force the change at ring 0. */
	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}

static void xen_io_delay(void)
{
}

#ifdef CONFIG_X86_LOCAL_APIC
static unsigned long xen_set_apic_id(unsigned int x)
{
	WARN_ON(1);
	return x;
}
static unsigned int xen_get_apic_id(unsigned long x)
{
	return ((x)>>24) & 0xFFu;
}
static u32 xen_apic_read(u32 reg)
{
	struct xen_platform_op op = {
		.cmd = XENPF_get_cpuinfo,
		.interface_version = XENPF_INTERFACE_VERSION,
		.u.pcpu_info.xen_cpuid = 0,
	};
	int ret = 0;

	/* Shouldn't need this as APIC is turned off for PV, and we only
	 * get called on the bootup processor. But just in case. */
	if (!xen_initial_domain() || smp_processor_id())
		return 0;

	if (reg == APIC_LVR)
		return 0x10;

	if (reg != APIC_ID)
		return 0;

	ret = HYPERVISOR_dom0_op(&op);
	if (ret)
		return 0;

	return op.u.pcpu_info.apic_id << 24;
}

static void xen_apic_write(u32 reg, u32 val)
{
	/* Warn to see if there's any stray references */
	WARN_ON(1);
}

static u64 xen_apic_icr_read(void)
{
	return 0;
}

static void xen_apic_icr_write(u32 low, u32 id)
{
	/* Warn to see if there's any stray references */
	WARN_ON(1);
}

static void xen_apic_wait_icr_idle(void)
{
        return;
}

static u32 xen_safe_apic_wait_icr_idle(void)
{
        return 0;
}

static void set_xen_basic_apic_ops(void)
{
	apic->read = xen_apic_read;
	apic->write = xen_apic_write;
	apic->icr_read = xen_apic_icr_read;
	apic->icr_write = xen_apic_icr_write;
	apic->wait_icr_idle = xen_apic_wait_icr_idle;
	apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
	apic->set_apic_id = xen_set_apic_id;
	apic->get_apic_id = xen_get_apic_id;

#ifdef CONFIG_SMP
	apic->send_IPI_allbutself = xen_send_IPI_allbutself;
	apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
	apic->send_IPI_mask = xen_send_IPI_mask;
	apic->send_IPI_all = xen_send_IPI_all;
	apic->send_IPI_self = xen_send_IPI_self;
#endif
}

#endif

static void xen_clts(void)
{
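	/*
	 * Writing CR0 would trap in a PV guest, so clear the TS bit by
	 * queueing an fpu_taskswitch hypercall through the multicall batch.
	 */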
	struct multicall_space mcs;

	mcs = xen_mc_entry(0);

	MULTI_fpu_taskswitch(mcs.mc, 0);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static DEFINE_PER_CPU(unsigned long, xen_cr0_value);

static unsigned long xen_read_cr0(void)
{
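	/*
	 * Reading CR0 would trap to the hypervisor, so keep a per-cpu
	 * cached copy and only fall back to native_read_cr0() the first
	 * time through.
	 */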
	unsigned long cr0 = this_cpu_read(xen_cr0_value);

	if (unlikely(cr0 == 0)) {
		cr0 = native_read_cr0();
		this_cpu_write(xen_cr0_value, cr0);
	}

	return cr0;
}

static void xen_write_cr0(unsigned long cr0)
{
	struct multicall_space mcs;

	this_cpu_write(xen_cr0_value, cr0);

	/* Only pay attention to cr0.TS; everything else is
	   ignored. */
	mcs = xen_mc_entry(0);

	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_write_cr4(unsigned long cr4)
{
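	/*
	 * A PV guest has no control over global pages or large pages;
	 * those CR4 bits are owned by the hypervisor, so strip them
	 * before writing the rest of CR4 natively.
	 */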
	cr4 &= ~X86_CR4_PGE;
	cr4 &= ~X86_CR4_PSE;

	native_write_cr4(cr4);
}
#ifdef CONFIG_X86_64
static inline unsigned long xen_read_cr8(void)
{
	return 0;
}
static inline void xen_write_cr8(unsigned long val)
{
	BUG_ON(val);
}
#endif
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
	int ret;

	ret = 0;

	switch (msr) {
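	/*
	 * The segment-base MSRs cannot be written directly from a PV
	 * guest; route them through the set_segment_base hypercall.
	 */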
#ifdef CONFIG_X86_64
		unsigned which;
		u64 base;

	case MSR_FS_BASE:		which = SEGBASE_FS; goto set;
	case MSR_KERNEL_GS_BASE:	which = SEGBASE_GS_USER; goto set;
	case MSR_GS_BASE:		which = SEGBASE_GS_KERNEL; goto set;

	set:
		base = ((u64)high << 32) | low;
		if (HYPERVISOR_set_segment_base(which, base) != 0)
			ret = -EIO;
		break;
#endif

	case MSR_STAR:
	case MSR_CSTAR:
	case MSR_LSTAR:
	case MSR_SYSCALL_MASK:
	case MSR_IA32_SYSENTER_CS:
	case MSR_IA32_SYSENTER_ESP:
	case MSR_IA32_SYSENTER_EIP:
		/* Fast syscall setup is all done in hypercalls, so
		   these are all ignored.  Stub them out here to stop
		   Xen console noise. */
		break;

	case MSR_IA32_CR_PAT:
		if (smp_processor_id() == 0)
			xen_set_pat(((u64)high << 32) | low);
		break;

	default:
		ret = native_write_msr_safe(msr, low, high);
	}

	return ret;
}

void xen_setup_shared_info(void)
{
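	/*
	 * Switch HYPERVISOR_shared_info from the dummy page to the real
	 * one: mapped through the fixmap for PV guests, or addressed
	 * directly via __va() when the guest is auto-translated.
	 */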
	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		set_fixmap(FIX_PARAVIRT_BOOTMAP,
			   xen_start_info->shared_info);

		HYPERVISOR_shared_info =
			(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
	} else
		HYPERVISOR_shared_info =
			(struct shared_info *)__va(xen_start_info->shared_info);

#ifndef CONFIG_SMP
	/* In UP this is as good a place as any to set up shared info */
	xen_setup_vcpu_info_placement();
#endif

	xen_setup_mfn_list_list();
}

/* This is called once we have the cpu_possible_mask */
void xen_setup_vcpu_info_placement(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		xen_vcpu_setup(cpu);

	/* xen_vcpu_setup managed to place the vcpu_info within the
	   percpu area for all cpus, so make use of it */
	if (have_vcpu_info_placement) {
		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
		pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
	}
}

static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
			  unsigned long addr, unsigned len)
{
	char *start, *end, *reloc;
	unsigned ret;

	start = end = reloc = NULL;

#define SITE(op, x)							\
	case PARAVIRT_PATCH(op.x):					\
	if (have_vcpu_info_placement) {					\
		start = (char *)xen_##x##_direct;			\
		end = xen_##x##_direct_end;				\
		reloc = xen_##x##_direct_reloc;				\
	}								\
	goto patch_site

	switch (type) {
		SITE(pv_irq_ops, irq_enable);
		SITE(pv_irq_ops, irq_disable);
		SITE(pv_irq_ops, save_fl);
		SITE(pv_irq_ops, restore_fl);
#undef SITE

	patch_site:
		if (start == NULL || (end-start) > len)
			goto default_patch;

		ret = paravirt_patch_insns(insnbuf, len, start, end);

		/* Note: because reloc is assigned from something that
		   appears to be an array, gcc assumes it's non-null,
		   but doesn't know its relationship with start and
		   end. */
		if (reloc > start && reloc < end) {
			int reloc_off = reloc - start;
			long *relocp = (long *)(insnbuf + reloc_off);
			long delta = start - (char *)addr;

			*relocp += delta;
		}
		break;

	default_patch:
	default:
		ret = paravirt_patch_default(type, clobbers, insnbuf,
					     addr, len);
		break;
	}

	return ret;
}

static const struct pv_info xen_info __initconst = {
	.paravirt_enabled = 1,
	.shared_kernel_pmd = 0,

#ifdef CONFIG_X86_64
	.extra_user_64bit_cs = FLAT_USER_CS64,
#endif

	.name = "Xen",
};

static const struct pv_init_ops xen_init_ops __initconst = {
	.patch = xen_patch,
};

static const struct pv_cpu_ops xen_cpu_ops __initconst = {
	.cpuid = xen_cpuid,

	.set_debugreg = xen_set_debugreg,
	.get_debugreg = xen_get_debugreg,

	.clts = xen_clts,

	.read_cr0 = xen_read_cr0,
	.write_cr0 = xen_write_cr0,

	.read_cr4 = native_read_cr4,
	.read_cr4_safe = native_read_cr4_safe,
	.write_cr4 = xen_write_cr4,

#ifdef CONFIG_X86_64
	.read_cr8 = xen_read_cr8,
	.write_cr8 = xen_write_cr8,
#endif

	.wbinvd = native_wbinvd,

	.read_msr = native_read_msr_safe,
	.write_msr = xen_write_msr_safe,

	.read_tsc = native_read_tsc,
	.read_pmc = native_read_pmc,

	.read_tscp = native_read_tscp,

	.iret = xen_iret,
	.irq_enable_sysexit = xen_sysexit,
#ifdef CONFIG_X86_64
	.usergs_sysret32 = xen_sysret32,
	.usergs_sysret64 = xen_sysret64,
#endif

	.load_tr_desc = paravirt_nop,
	.set_ldt = xen_set_ldt,
	.load_gdt = xen_load_gdt,
	.load_idt = xen_load_idt,
	.load_tls = xen_load_tls,
#ifdef CONFIG_X86_64
	.load_gs_index = xen_load_gs_index,
#endif

	.alloc_ldt = xen_alloc_ldt,
	.free_ldt = xen_free_ldt,

	.store_gdt = native_store_gdt,
	.store_idt = native_store_idt,
	.store_tr = xen_store_tr,

	.write_ldt_entry = xen_write_ldt_entry,
	.write_gdt_entry = xen_write_gdt_entry,
	.write_idt_entry = xen_write_idt_entry,
	.load_sp0 = xen_load_sp0,

	.set_iopl_mask = xen_set_iopl_mask,
	.io_delay = xen_io_delay,

	/* Xen takes care of %gs when switching to usermode for us */
	.swapgs = paravirt_nop,

	.start_context_switch =