/*
 * VMI specific paravirt-ops implementation
 *
 * Copyright (C) 2005, VMware, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Send feedback to zach@vmware.com
 *
 */

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <asm/vmi.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/apicdef.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/timer.h>
#include <asm/vmi_time.h>
#include <asm/kmap_types.h>
#include <asm/setup.h>

/* Convenient for calling VMI functions indirectly in the ROM */
typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);

#define call_vrom_func(rom,func) \
   (((VROMFUNC *)(rom->func))())

#define call_vrom_long_func(rom,func,arg) \
   (((VROMLONGFUNC *)(rom->func)) (arg))
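
/*
 * For example, probing the ROM's init entry in activate_vmi() is written as
 *
 *	call_vrom_func(vmi_rom, vmi_init)
 *
 * which expands to (((VROMFUNC *)(vmi_rom->vmi_init))()), i.e. an indirect
 * call (using the regparm convention declared above) through the entry
 * stored in the vrom_header.
 */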

static struct vrom_header *vmi_rom;
static int disable_pge;
static int disable_pse;
static int disable_sep;
static int disable_tsc;
static int disable_mtrr;
static int disable_noidle;
static int disable_vmi_timer;

/* Cached VMI operations */
static struct {
	void (*cpuid)(void /* non-c */);
	void (*_set_ldt)(u32 selector);
	void (*set_tr)(u32 selector);
	void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
	void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
	void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
	void (*set_kernel_stack)(u32 selector, u32 sp0);
	void (*allocate_page)(u32, u32, u32, u32, u32);
	void (*release_page)(u32, u32);
	void (*set_pte)(pte_t, pte_t *, unsigned);
	void (*update_pte)(pte_t *, unsigned);
	void (*set_linear_mapping)(int, void *, u32, u32);
	void (*_flush_tlb)(int);
	void (*set_initial_ap_state)(int, int);
	void (*halt)(void);
	void (*set_lazy_mode)(int mode);
} vmi_ops;

/* Cached VMI timer operations */
struct vmi_timer_ops vmi_timer_ops;

/*
 * VMI patching routines.
 */
#define MNEM_CALL 0xe8
#define MNEM_JMP  0xe9
#define MNEM_RET  0xc3

#define IRQ_PATCH_INT_MASK 0
#define IRQ_PATCH_DISABLE  5

static inline void patch_offset(void *insnbuf,
				unsigned long ip, unsigned long dest)
{
	*(unsigned long *)(insnbuf+1) = dest-ip-5;
}

static unsigned patch_internal(int call, unsigned len, void *insnbuf,
			       unsigned long ip)
{
	u64 reloc;
	struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
	reloc = call_vrom_long_func(vmi_rom, get_reloc,	call);
	switch(rel->type) {
		case VMI_RELOCATION_CALL_REL:
			BUG_ON(len < 5);
			*(char *)insnbuf = MNEM_CALL;
			patch_offset(insnbuf, ip, (unsigned long)rel->eip);
			return 5;

		case VMI_RELOCATION_JUMP_REL:
			BUG_ON(len < 5);
			*(char *)insnbuf = MNEM_JMP;
			patch_offset(insnbuf, ip, (unsigned long)rel->eip);
			return 5;

		case VMI_RELOCATION_NOP:
			/* obliterate the whole thing */
			return 0;

		case VMI_RELOCATION_NONE:
			/* leave native code in place */
			break;

		default:
			BUG();
	}
	return len;
}
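
/*
 * Example: if the ROM answers VMI_CALL_DisableInterrupts with a
 * VMI_RELOCATION_CALL_REL relocation, the (at least 5 byte) call site at
 * 'ip' is rewritten as
 *
 *	insnbuf[0]    = MNEM_CALL (0xe8)
 *	insnbuf[1..4] = rel->eip - ip - 5   (32-bit relative displacement)
 *
 * and 5 is returned; nop padding of any remaining bytes is done for us.
 */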

/*
 * Apply patch if appropriate, return length of new instruction
 * sequence.  The callee does nop padding for us.
 */
static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
			  unsigned long ip, unsigned len)
{
	switch (type) {
		case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
			return patch_internal(VMI_CALL_DisableInterrupts, len,
					      insns, ip);
		case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
			return patch_internal(VMI_CALL_EnableInterrupts, len,
					      insns, ip);
		case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
			return patch_internal(VMI_CALL_SetInterruptMask, len,
					      insns, ip);
		case PARAVIRT_PATCH(pv_irq_ops.save_fl):
			return patch_internal(VMI_CALL_GetInterruptMask, len,
					      insns, ip);
		case PARAVIRT_PATCH(pv_cpu_ops.iret):
			return patch_internal(VMI_CALL_IRET, len, insns, ip);
		case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
			return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
		default:
			break;
	}
	return len;
}

/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
                               unsigned int *cx, unsigned int *dx)
{
	int override = 0;
	if (*ax == 1)
		override = 1;
        asm volatile ("call *%6"
                      : "=a" (*ax),
                        "=b" (*bx),
                        "=c" (*cx),
                        "=d" (*dx)
                      : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
	if (override) {
		if (disable_pse)
			*dx &= ~X86_FEATURE_PSE;
		if (disable_pge)
			*dx &= ~X86_FEATURE_PGE;
		if (disable_sep)
			*dx &= ~X86_FEATURE_SEP;
		if (disable_tsc)
			*dx &= ~X86_FEATURE_TSC;
		if (disable_mtrr)
			*dx &= ~X86_FEATURE_MTRR;
	}
}

static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
{
	if (gdt[nr].a != new->a || gdt[nr].b != new->b)
		write_gdt_entry(gdt, nr, new, 0);
}

static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
{
	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
	vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
	vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
	vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
}

static void vmi_set_ldt(const void *addr, unsigned entries)
{
	unsigned cpu = smp_processor_id();
	struct desc_struct desc;

	pack_descriptor(&desc, (unsigned long)addr,
			entries * sizeof(struct desc_struct) - 1,
			DESC_LDT, 0);
	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
	vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
}

static void vmi_set_tr(void)
{
	vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
}

static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
{
	u32 *idt_entry = (u32 *)g;
	vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
}

static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
				const void *desc, int type)
{
	u32 *gdt_entry = (u32 *)desc;
	vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
}

static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
				const void *desc)
{
	u32 *ldt_entry = (u32 *)desc;
	vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
}

static void vmi_load_sp0(struct tss_struct *tss,
				   struct thread_struct *thread)
{
	tss->x86_tss.sp0 = thread->sp0;

	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
	if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
		tss->x86_tss.ss1 = thread->sysenter_cs;
		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
	}
	vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
}

static void vmi_flush_tlb_user(void)
{
	vmi_ops._flush_tlb(VMI_FLUSH_TLB);
}

static void vmi_flush_tlb_kernel(void)
{
	vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
}

/* Stub to do nothing at all; used for delays and unimplemented calls */
static void vmi_nop(void)
{
}

#ifdef CONFIG_HIGHPTE
static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
{
	void *va = kmap_atomic(page, type);

	/*
	 * Internally, the VMI ROM must map virtual addresses to physical
	 * addresses for processing MMU updates.  By the time MMU updates
	 * are issued, this information is typically already lost.
	 * Fortunately, the VMI provides a cache of mapping slots for active
	 * page tables.
	 *
	 * We use slot zero for the linear mapping of physical memory, and
	 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
	 *
	 *  args:                 SLOT                 VA    COUNT PFN
	 */
	BUG_ON(type != KM_PTE0 && type != KM_PTE1);
	vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));

	return va;
}
#endif

static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
{
	vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}

static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
{
 	/*
	 * This call comes in very early, before mem_map is setup.
	 * It is called only for swapper_pg_dir, which already has
	 * data on it.
	 */
	vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}

static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
{
	vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
}

static void vmi_release_pte(unsigned long pfn)
{
	vmi_ops.release_page(pfn, VMI_PAGE_L1);
}

static void vmi_release_pmd(unsigned long pfn)
{
	vmi_ops.release_page(pfn, VMI_PAGE_L2);
}

/*
 * We use the pgd_free hook for releasing the pgd page:
 */
static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;

	vmi_ops.release_page(pfn, VMI_PAGE_L2);
}

/*
 * Helper macros for MMU update flags.  We can defer updates until a flush
 * or page invalidation only if the update is to the current address space
 * (otherwise, there is no flush).  We must check against init_mm, since
 * this could be a kernel update, which usually passes init_mm, although
 * sometimes this check can be skipped if we know the particular function
 * is only called on user mode PTEs.  We could change the kernel to pass
 * current->active_mm here, but in particular, I was unsure if changing
 * mm/highmem.c to do this would still be correct on other architectures.
 */
#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm ||    \
                                       (!mustbeuser && (mm) == &init_mm))
#define vmi_flags_addr(mm, addr, level, user)                           \
        ((level) | (is_current_as(mm, user) ?                           \
                (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
#define vmi_flags_addr_defer(mm, addr, level, user)                     \
        ((level) | (is_current_as(mm, user) ?                           \
                (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
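
/*
 * For example, a user PTE update against the current address space yields
 *
 *	vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)
 *		== VMI_PAGE_PT | VMI_PAGE_CURRENT_AS | (addr & VMI_PAGE_VA_MASK)
 *
 * while an update against a foreign mm collapses to just VMI_PAGE_PT.
 * vmi_flags_addr_defer() additionally sets VMI_PAGE_DEFER, but only in the
 * current-address-space case, per the rule described above.
 */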

static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
}

static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
}

static void vmi_set_pte(pte_t *ptep, pte_t pte)
{
	/* XXX because of set_pmd_pte, this can be called on PT or PD layers */
	vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
}

static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
	vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
}

static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
#ifdef CONFIG_X86_PAE
	const pte_t pte = { .pte = pmdval.pmd };
#else
	const pte_t pte = { pmdval.pud.pgd.pgd };
#endif
	vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
}

#ifdef CONFIG_X86_PAE

static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
{
	/*
	 * XXX This is called from set_pmd_pte, but at both PT
	 * and PD layers so the VMI_PAGE_PT flag is wrong.  But
	 * it is only called for large page mapping changes,
	 * the Xen backend doesn't support large pages, and the
	 * ESX backend doesn't depend on the flag.
	 */
	set_64bit((unsigned long long *)ptep,pte_val(pteval));
	vmi_ops.update_pte(ptep, VMI_PAGE_PT);
}

static void vmi_set_pud(pud_t *pudp, pud_t pudval)
{
	/* Um, eww */
	const pte_t pte = { .pte = pudval.pgd.pgd };
	vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
}

static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	const pte_t pte = { .pte = 0 };
	vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
}

static void vmi_pmd_clear(pmd_t *pmd)
{
	const pte_t pte = { .pte = 0 };
	vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
}
#endif

#ifdef CONFIG_SMP
static void __devinit
vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
		     unsigned long start_esp)
{
	struct vmi_ap_state ap;

	/* Default everything to zero.  This is fine for most GPRs. */
	memset(&ap, 0, sizeof(struct vmi_ap_state));

	ap.gdtr_limit = GDT_SIZE - 1;
	ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);

	ap.idtr_limit = IDT_ENTRIES * 8 - 1;
	ap.idtr_base = (unsigned long) idt_table;

	ap.ldtr = 0;

	ap.cs = __KERNEL_CS;
	ap.eip = (unsigned long) start_eip;
	ap.ss = __KERNEL_DS;
	ap.esp = (unsigned long) start_esp;

	ap.ds = __USER_DS;
	ap.es = __USER_DS;
	ap.fs = __KERNEL_PERCPU;
	ap.gs = __KERNEL_STACK_CANARY;

	ap.eflags = 0;

#ifdef CONFIG_X86_PAE
	/* efer should match BSP efer. */
	if (cpu_has_nx) {
		unsigned l, h;
		rdmsr(MSR_EFER, l, h);
		ap.efer = (unsigned long long) h << 32 | l;
	}
#endif

	ap.cr3 = __pa(swapper_pg_dir);
	/* Protected mode, paging, AM, WP, NE, MP. */
	ap.cr0 = 0x80050023;
	ap.cr4 = mmu_cr4_features;
	vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
}
#endif

static void vmi_start_context_switch(struct task_struct *prev)
{
	paravirt_start_context_switch(prev);
	vmi_ops.set_lazy_mode(2);
}

static void vmi_end_context_switch(struct task_struct *next)
{
	vmi_ops.set_lazy_mode(0);
	paravirt_end_context_switch(next);
}

static void vmi_enter_lazy_mmu(void)
{
	paravirt_enter_lazy_mmu();
	vmi_ops.set_lazy_mode(1);
}

static void vmi_leave_lazy_mmu(void)
{
	vmi_ops.set_lazy_mode(0);
	paravirt_leave_lazy_mmu();
}

static inline int __init check_vmi_rom(struct vrom_header *rom)
{
	struct pci_header *pci;
	struct pnp_header *pnp;
	const char *manufacturer = "UNKNOWN";
	const char *product = "UNKNOWN";
	const char *license = "unspecified";

	if (rom->rom_signature != 0xaa55)
		return 0;
	if (rom->vrom_signature != VMI_SIGNATURE)
		return 0;
	if (rom->api_version_maj != VMI_API_REV_MAJOR ||
	    rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
		printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
				rom->api_version_maj,
				rom->api_version_min);
		return 0;
	}

	/*
	 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
	 * the PCI header and device type to make sure this is really a
	 * VMI device.
	 */
	if (!rom->pci_header_offs) {
		printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
		return 0;
	}

	pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
	if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
	    pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
		/* Allow it to run... anyways, but warn */
		printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
	}

	if (rom->pnp_header_offs) {
		pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
		if (pnp->manufacturer_offset)
			manufacturer = (const char *)rom+pnp->manufacturer_offset;
		if (pnp->product_offset)
			product = (const char *)rom+pnp->product_offset;
	}

	if (rom->license_offs)
		license = (char *)rom+rom->license_offs;

	printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
		manufacturer, product,
		rom->api_version_maj, rom->api_version_min,
		pci->rom_version_maj, pci->rom_version_min);

	/* Don't allow BSD/MIT here for now because we don't want to end up
	   with any binary only shim layers */
	if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
		printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
			license);
		return 0;
	}

	return 1;
}

/*
 * Probe for the VMI option ROM
 */
static inline int __init probe_vmi_rom(void)
{
	unsigned long base;

	/* VMI ROM is in option ROM area, check signature */
	for (base = 0xC0000; base < 0xE0000; base += 2048) {
		struct vrom_header *romstart;
		romstart = (struct vrom_header *)isa_bus_to_virt(base);
		if (check_vmi_rom(romstart)) {
			vmi_rom = romstart;
			return 1;
		}
	}
	return 0;
}

/*
 * VMI setup common to all processors
 */
void vmi_bringup(void)
{
 	/* We must establish the lowmem mapping for MMU ops to work */
	if (vmi_ops.set_linear_mapping)
		vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
}

/*
 * Return a pointer to a VMI function or NULL if unimplemented
 */
static void *vmi_get_function(int vmicall)
{
	u64 reloc;
	const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
	reloc = call_vrom_long_func(vmi_rom, get_reloc,	vmicall);
	BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
	if (rel->type == VMI_RELOCATION_CALL_REL)
		return (void *)rel->eip;
	else
		return NULL;
}

/*
 * Helper macro for making the VMI paravirt-ops fill code readable.
 * For unimplemented operations, fall back to default, unless nop
 * is returned by the ROM.
 */
#define para_fill(opname, vmicall)				\
do {								\
	reloc = call_vrom_long_func(vmi_rom, get_reloc,		\
				    VMI_CALL_##vmicall);	\
	if (rel->type == VMI_RELOCATION_CALL_REL)		\
		opname = (void *)rel->eip;			\
	else if (rel->type == VMI_RELOCATION_NOP)		\
		opname = (void *)vmi_nop;			\
	else if (rel->type != VMI_RELOCATION_NONE)		\
		printk(KERN_WARNING "VMI: Unknown relocation "	\
				    "type %d for " #vmicall"\n",\
					rel->type);		\
} while (0)
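
/*
 * For example,
 *
 *	para_fill(pv_cpu_ops.clts, CLTS);
 *
 * asks the ROM for the VMI_CALL_CLTS relocation: a CALL_REL relocation
 * points pv_cpu_ops.clts directly at the ROM entry, a NOP relocation points
 * it at vmi_nop, and VMI_RELOCATION_NONE leaves the native default in place.
 */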

/*
 * Helper macro for making the VMI paravirt-ops fill code readable.
 * For cached operations which do not match the VMI ROM ABI and must
 * go through a translation stub.  Ignore NOPs, since it is not clear
 * a NOP VMI function corresponds to a NOP paravirt-op when the
 * functions are not in 1-1 correspondence.
 */
#define para_wrap(opname, wrapper, cache, vmicall)		\
do {								\
	reloc = call_vrom_long_func(vmi_rom, get_reloc,		\
				    VMI_CALL_##vmicall);	\
	BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);		\
	if (rel->type == VMI_RELOCATION_CALL_REL) {		\
		opname = wrapper;				\
		vmi_ops.cache = (void *)rel->eip;		\
	}							\
} while (0)
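
/*
 * For example,
 *
 *	para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
 *
 * installs the Linux-ABI wrapper (pv_cpu_ops.cpuid = vmi_cpuid) and caches
 * the raw ROM entry point (vmi_ops.cpuid = rel->eip) for the wrapper to
 * call through.
 */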

/*
 * Activate the VMI interface and switch into paravirtualized mode
 */
static inline int __init activate_vmi(void)
{
	short kernel_cs;
	u64 reloc;
	const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;

	if (call_vrom_func(vmi_rom, vmi_init) != 0) {
		printk(KERN_ERR "VMI ROM failed to initialize!\n");
		return 0;
	}
	savesegment(cs, kernel_cs);

	pv_info.paravirt_enabled = 1;
	pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
	pv_info.name = "vmi [deprecated]";

	pv_init_ops.patch = vmi_patch;

	/*
	 * Many of these operations are ABI compatible with VMI.
	 * This means we can fill in the paravirt-ops with direct
	 * pointers into the VMI ROM.  If the calling convention for
	 * these operations changes, this code needs to be updated.
	 *
	 * Exceptions
	 *  CPUID paravirt-op uses pointers, not the native ISA
	 *  halt has no VMI equivalent; all VMI halts are "safe"
	 *  no MSR support yet - just trap and emulate.  VMI uses the
	 *    same ABI as the native ISA, but Linux wants exceptions
	 *    from bogus MSR read / write handled
	 *  rdpmc is not yet used in Linux
	 */

	/* CPUID is special, so very special it gets wrapped like a present */
	para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);

	para_fill(pv_cpu_ops.clts, CLTS);
	para_fill(pv_cpu_ops.get_debugreg, GetDR);
	para_fill(pv_cpu_ops.set_debugreg, SetDR);
	para_fill(pv_cpu_ops.read_cr0, GetCR0);
	para_fill(pv_mmu_ops.read_cr2, GetCR2);
	para_fill(pv_mmu_ops.read_cr3, GetCR3);
	para_fill(pv_cpu_ops.read_cr4, GetCR4);
	para_fill(pv_cpu_ops.write_cr0, SetCR0);
	para_fill(pv_mmu_ops.write_cr2, SetCR2);
	para_fill(pv_mmu_ops.write_cr3, SetCR3);
	para_fill(pv_cpu_ops.write_cr4, SetCR4);

	para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
	para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
	para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
	para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);

	para_fill(pv_cpu_ops.wbinvd, WBINVD);
	para_fill(pv_cpu_ops.read_tsc, RDTSC);

	/* The following we emulate with trap and emulate for now */
	/* paravirt_ops.read_msr = vmi_rdmsr */
	/* paravirt_ops.write_msr = vmi_wrmsr */
	/* paravirt_ops.rdpmc = vmi_rdpmc */

	/* TR interface doesn't pass TR value, wrap */
	para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);

	/* LDT is special, too */
	para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);

	para_fill(pv_cpu_ops.load_gdt, SetGDT);
	para_fill(pv_cpu_ops.load_idt, SetIDT);
	para_fill(pv_cpu_ops.store_gdt, GetGDT);
	para_fill(pv_cpu_ops.store_idt, GetIDT);
	para_fill(pv_cpu_ops.store_tr, GetTR);
	pv_cpu_ops.load_tls = vmi_load_tls;
	para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
		  write_ldt_entry, WriteLDTEntry);
	para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
		  write_gdt_entry, WriteGDTEntry);
	para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
		  write_idt_entry, WriteIDTEntry);
	para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
	para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
	para_fill(pv_cpu_ops.io_delay, IODelay);

	para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
		  set_lazy_mode, SetLazyMode);
	para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
		  set_lazy_mode, SetLazyMode);

	para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
		  set_lazy_mode, SetLazyMode);
	para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
		  set_lazy_mode, SetLazyMode);

	/* user and kernel flush are just handled with different flags to FlushTLB */
	para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
	para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
	para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);

	/*
	 * Until a standard flag format can be agreed on, we need to
	 * implement these as wrappers in Linux.  Get the VMI ROM
	 * function pointers for the two backend calls.
	 */
#ifdef CONFIG_X86_PAE
	vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
	vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
#else
	vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
	vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
#endif

	if (vmi_ops.set_pte) {
		pv_mmu_ops.set_pte = vmi_set_pte;
		pv_mmu_ops.set_pte_at = vmi_set_pte_at;
		pv_mmu_ops.set_pmd = vmi_set_pmd;
#ifdef CONFIG_X86_PAE
		pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
		pv_mmu_ops.set_pud = vmi_set_pud;
		pv_mmu_ops.pte_clear = vmi_pte_clear;
		pv_mmu_ops.pmd_clear = vmi_pmd_clear;
#endif
	}

	if (vmi_ops.update_pte) {
		pv_mmu_ops.pte_update = vmi_update_pte;
		pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
	}

	vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
	if (vmi_ops.allocate_page) {
		pv_mmu_ops.alloc_pte = vmi_allocate_pte;
		pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
		pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
	}

	vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
	if (vmi_ops.release_page) {
		pv_mmu_ops.release_pte = vmi_release_pte;
		pv_mmu_ops.release_pmd = vmi_release_pmd;
		pv_mmu_ops.pgd_free = vmi_pgd_free;
	}

	/* Set linear is needed in all cases */
	vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
#ifdef CONFIG_HIGHPTE
	if (vmi_ops.set_linear_mapping)
		pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
#endif

	/*
	 * These MUST always be patched.  Don't support indirect jumps
	 * through these operations, as the VMI interface may use either
	 * a jump or a call to get to these operations, depending on
	 * the backend.  They are performance critical anyway, so requiring
	 * a patch is not a big problem.
	 */
	pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
	pv_cpu_ops.iret = (void *)0xbadbab0;

#ifdef CONFIG_SMP
	para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
#endif

#ifdef CONFIG_X86_LOCAL_APIC
	para_fill(apic->read, APICRead);
	para_fill(apic->write, APICWrite);
#endif

	/*
	 * Check for VMI timer functionality by probing for a cycle frequency method
	 */
	reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
	if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
		vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
		vmi_timer_ops.get_cycle_counter =
			vmi_get_function(VMI_CALL_GetCycleCounter);
		vmi_timer_ops.get_wallclock =
			vmi_get_function(VMI_CALL_GetWallclockTime);
		vmi_timer_ops.wallclock_updated =
			vmi_get_function(VMI_CALL_WallclockUpdated);
		vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
		vmi_timer_ops.cancel_alarm =
			 vmi_get_function(VMI_CALL_CancelAlarm);
		x86_init.timers.timer_init = vmi_time_init;
#ifdef CONFIG_X86_LOCAL_APIC
		x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
		x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
#endif
		pv_time_ops.sched_clock = vmi_sched_clock;
		x86_platform.calibrate_tsc = vmi_tsc_khz;
		x86_platform.get_wallclock = vmi_get_wallclock;
		x86_platform.set_wallclock = vmi_set_wallclock;

		/* We have true wallclock functions; disable CMOS clock sync */
		no_sync_cmos_clock = 1;
	} else {
		disable_noidle = 1;
		disable_vmi_timer = 1;
	}

	para_fill(pv_irq_ops.safe_halt, Halt);

	/*
	 * Alternative instruction rewriting doesn't happen soon enough
	 * to convert VMI_IRET to a call instead of a jump; so we have
	 * to do this before IRQs get reenabled.  Fortunately, it is
	 * idempotent.
	 */
	apply_paravirt(__parainstructions, __parainstructions_end);

	vmi_bringup();

	return 1;
}

#undef para_fill

void __init vmi_init(void)
{
	if (!vmi_rom)
		probe_vmi_rom();
	else
		check_vmi_rom(vmi_rom);

	/* In case probing for or validating the ROM failed, bail */
	if (!vmi_rom)
		return;

	reserve_top_address(-vmi_rom->virtual_top);

#ifdef CONFIG_X86_IO_APIC
	/* This is virtual hardware; timer routing is wired correctly */
	no_timer_check = 1;
#endif
}

void __init vmi_activate(void)
{
	unsigned long flags;

	if (!vmi_rom)
		return;

	local_irq_save(flags);
	activate_vmi();
	local_irq_restore(flags & X86_EFLAGS_IF);
}

static int __init parse_vmi(char *arg)
{
	if (!arg)
		return -EINVAL;

	if (!strcmp(arg, "disable_pge")) {
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
		disable_pge = 1;
	} else if (!strcmp(arg, "disable_pse")) {
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
		disable_pse = 1;
	} else if (!strcmp(arg, "disable_sep")) {
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
		disable_sep = 1;
	} else if (!strcmp(arg, "disable_tsc")) {
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
		disable_tsc = 1;
	} else if (!strcmp(arg, "disable_mtrr")) {
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
		disable_mtrr = 1;
	} else if (!strcmp(arg, "disable_timer")) {
		disable_vmi_timer = 1;
		disable_noidle = 1;
	} else if (!strcmp(arg, "disable_noidle"))
		disable_noidle = 1;
	return 0;
}

early_param("vmi", parse_vmi);