/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "iodev.h"

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>

#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>

#include "coalesced_mmio.h"
#include "async_pf.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/*
 * Ordering of locks:
 *
 * 		kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_RAW_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count = 0;
static atomic_t hardware_enable_failed;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

struct dentry *kvm_debugfs_dir;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

static bool largepages_enabled = true;

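/*
 * Returns true when @pfn is not backed by a valid, non-reserved struct page,
 * i.e. when it refers to MMIO or otherwise reserved memory.
 */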
bool kvm_is_mmio_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn)) {
		int reserved;
		struct page *tail = pfn_to_page(pfn);
		struct page *head = compound_trans_head(tail);
		reserved = PageReserved(head);
		if (head != tail) {
			/*
			 * "head" is not a dangling pointer
			 * (compound_trans_head takes care of that)
			 * but the hugepage may have been split
			 * from under us (and we may not hold a
			 * reference count on the head page so it can
			 * be reused before we run PageReserved), so
			 * we have to check PageTail before returning
			 * what we just read.
			 */
			smp_rmb();
			if (PageTail(tail))
				return reserved;
		}
		return PageReserved(tail);
	}

	return true;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
int vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	if (mutex_lock_killable(&vcpu->mutex))
		return -EINTR;
	if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
		/* The thread running this VCPU changed. */
		struct pid *oldpid = vcpu->pid;
		struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
		rcu_assign_pointer(vcpu->pid, newpid);
		synchronize_rcu();
		put_pid(oldpid);
	}
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
	return 0;
}

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}

static void ack_flush(void *_completed)
{
}

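/*
 * Post @req on every vcpu of @kvm and IPI the CPUs that are currently
 * running a vcpu in guest mode so they notice the request promptly.
 * Returns false only if no remote CPU needed to be kicked.
 */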
static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	int i, cpu, me;
	cpumask_var_t cpus;
	bool called = true;
	struct kvm_vcpu *vcpu;

	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

	me = get_cpu();
	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_make_request(req, vcpu);
		cpu = vcpu->cpu;

		/* Set ->requests bit before we read ->mode */
		smp_mb();

		if (cpus != NULL && cpu != -1 && cpu != me &&
		      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
			cpumask_set_cpu(cpu, cpus);
	}
	if (unlikely(cpus == NULL))
		smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
	else if (!cpumask_empty(cpus))
		smp_call_function_many(cpus, ack_flush, NULL, 1);
	else
		called = false;
	put_cpu();
	free_cpumask_var(cpus);
	return called;
}

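/*
 * Request a TLB flush on every vcpu; kvm->tlbs_dirty is reset to zero only
 * if it has not been bumped again since it was sampled here.
 */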
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	long dirty_count = kvm->tlbs_dirty;

	smp_mb();
	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

void kvm_make_mclock_inprogress_request(struct kvm *kvm)
{
	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
}

void kvm_make_update_eoibitmap_request(struct kvm *kvm)
{
	make_all_cpus_request(kvm, KVM_REQ_EOIBITMAP);
}

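/*
 * Common vcpu setup: initialize the mutex, wait queue and async-PF state,
 * allocate the shared vcpu->run page, then let the architecture code finish
 * the job in kvm_arch_vcpu_init().
 */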
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
	init_waitqueue_head(&vcpu->wq);
	kvm_async_pf_vcpu_init(vcpu);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;

	r = kvm_arch_vcpu_init(vcpu);
	if (r < 0)
		goto fail_free_run;
	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	put_pid(vcpu->pid);
	kvm_arch_vcpu_uninit(vcpu);
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush, idx;

	/*
	 * When ->invalidate_page runs, the linux pte has been zapped
	 * already but the page is still allocated until
	 * ->invalidate_page returns. So if we increase the sequence
	 * here the kvm page fault will notice if the spte can't be
	 * established because the page is going to be freed. If
	 * instead the kvm page fault establishes the spte before
	 * ->invalidate_page runs, kvm_unmap_hva will release it
	 * before returning.
	 *
	 * The sequence increase only needs to be seen at spin_unlock
	 * time, and not at spin_lock time.
	 *
	 * Increasing the sequence after the spin_unlock would be
	 * unsafe because the kvm page fault could then establish the
	 * pte after kvm_unmap_hva returned, without noticing the page
	 * is going to be freed.
	 */
	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	kvm->mmu_notifier_seq++;
	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	kvm_set_spte_hva(kvm, address, pte);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
	need_tlb_flush |= kvm->tlbs_dirty;
	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_notifier_retry().
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	young = kvm_age_hva(kvm, address);
	if (young)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	young = kvm_test_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

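/* Initialize each slot's id and the id_to_index table to the identity map. */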
static void kvm_init_memslots_id(struct kvm *kvm)
{
	int i;
	struct kvm_memslots *slots = kvm->memslots;

	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
		slots->id_to_index[i] = slots->memslots[i].id = i;
}

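/*
 * Allocate and initialize a new VM: architecture state, hardware
 * virtualization (via hardware_enable_all), memslots, I/O buses, locks and
 * the MMU notifier, then link the VM into the global vm_list.
 */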
static struct kvm *kvm_create_vm(unsigned long type)
{
	int r, i;
	struct kvm *kvm = kvm_arch_alloc_vm();

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_nodisable;

	r = hardware_enable_all();
	if (r)
		goto out_err_nodisable;

#ifdef CONFIG_HAVE_KVM_IRQCHIP
	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

	r = -ENOMEM;
	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
	if (!kvm->memslots)
		goto out_err_nosrcu;
	kvm_init_memslots_id(kvm);
	if (init_srcu_struct(&kvm->srcu))
		goto out_err_nosrcu;
	for (i = 0; i < KVM_NR_BUSES; i++) {
		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
					GFP_KERNEL);
		if (!kvm->buses[i])
			goto out_err;
	}

	spin_lock_init(&kvm->mmu_lock);
	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	atomic_set(&kvm->users_count, 1);

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err;

	raw_spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	raw_spin_unlock(&kvm_lock);

	return kvm;

out_err:
	cleanup_srcu_struct(&kvm->srcu);
out_err_nosrcu:
	hardware_disable_all();
out_err_nodisable:
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm->buses[i]);
	kfree(kvm->memslots);
	kvm_arch_free_vm(kvm);
	return ERR_PTR(r);
}

/*
 * Avoid using vmalloc for a small buffer.
 * Should not be used when the size is statically known.
 */
void *kvm_kvzalloc(unsigned long size)
{
	if (size > PAGE_SIZE)
		return vzalloc(size);
	else
		return kzalloc(size, GFP_KERNEL);
}

void kvm_kvfree(const void *addr)
{
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
}

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	kvm_kvfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}

/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		kvm_destroy_dirty_bitmap(free);

	kvm_arch_free_memslot(free, dont);

	free->npages = 0;
}

void kvm_free_physmem(struct kvm *kvm)
{
	struct kvm_memslots *slots = kvm->memslots;
	struct kvm_memory_slot *memslot;

	kvm_for_each_memslot(memslot, slots)
		kvm_free_physmem_slot(memslot, NULL);

	kfree(kvm->memslots);
}

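/*
 * Called when the last reference to a VM is dropped: unlink it from
 * vm_list, tear down irq routing, I/O buses, shadow MMU state and memslots,
 * then release the mm and hardware-enable references taken at creation.
 */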
static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_arch_sync_events(kvm);
	raw_spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	raw_spin_unlock(&kvm_lock);
	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++)
		kvm_io_bus_destroy(kvm->buses[i]);
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
	kvm_arch_flush_shadow_all(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_free_physmem(kvm);
	cleanup_srcu_struct(&kvm->srcu);
	kvm_arch_free_vm(kvm);
	hardware_disable_all();
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	atomic_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
	if (atomic_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);


static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See x86's kvm_vm_ioctl_get_dirty_log() for why this is needed.
 */
static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
{
#ifndef CONFIG_S390
	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);

	memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes);
	if (!memslot->dirty_bitmap)
		return -ENOMEM;

#endif /* !CONFIG_S390 */
	return 0;
}

static int cmp_memslot(const void *slot1, const void *slot2)
{
	struct kvm_memory_slot *s1, *s2;

	s1 = (struct kvm_memory_slot *)slot1;
	s2 = (struct kvm_memory_slot *)slot2;

	if (s1->npages < s2->npages)
		return 1;
	if (s1->npages > s2->npages)
		return -1;

	return 0;
}

/*
 * Sort the memslots based on their size, so that the larger slots
 * get a better fit.
 */
static void sort_memslots(struct kvm_memslots *slots)
{
	int i;

	sort(slots->memslots, KVM_MEM_SLOTS_NUM,
	      sizeof(struct kvm_memory_slot), cmp_memslot, NULL);

	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
		slots->id_to_index[slots->memslots[i].id] = i;
}

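/*
 * Install @new into @slots (re-sorting if the slot's size changed) and bump
 * the memslots generation number.
 */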
void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
		     u64 last_generation)
{
	if (new) {
		int id = new->id;
		struct kvm_memory_slot *old = id_to_memslot(slots, id);
		unsigned long npages = old->npages;

		*old = *new;
		if (new->npages != npages)
			sort_memslots(slots);
	}

	slots->generation = last_generation + 1;
}

static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
{
	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

#ifdef KVM_CAP_READONLY_MEM
	valid_flags |= KVM_MEM_READONLY;
#endif

	if (mem->flags & ~valid_flags)
		return -EINVAL;

	return 0;
}

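/*
 * Publish @slots as the active memslot array and wait for all SRCU readers
 * of the old array to drain; the old array is returned to the caller for
 * freeing.
 */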
static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
		struct kvm_memslots *slots, struct kvm_memory_slot *new)
{
	struct kvm_memslots *old_memslots = kvm->memslots;

	update_memslots(slots, new, kvm->memslots->generation);
	rcu_assign_pointer(kvm->memslots, slots);
	synchronize_srcu_expedited(&kvm->srcu);
	return old_memslots;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding kvm->slots_lock for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	struct kvm_memory_slot *slot;
	struct kvm_memory_slot old, new;
	struct kvm_memslots *slots = NULL, *old_memslots;
	enum kvm_mr_change change;

	r = check_memory_region_flags(mem);
	if (r)
		goto out;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	/* We can read the guest memory with __xxx_user() later on. */
	if ((mem->slot < KVM_USER_MEM_SLOTS) &&
	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
	     !access_ok(VERIFY_WRITE,
			(void __user *)(unsigned long)mem->userspace_addr,
			mem->memory_size)))
		goto out;
	if (mem->slot >= KVM_MEM_SLOTS_NUM)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	slot = id_to_memslot(kvm->memslots, mem->slot);
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	r = -EINVAL;
	if (npages > KVM_MEM_MAX_NR_PAGES)
		goto out;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *slot;

	new.id = mem->slot;
	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	r = -EINVAL;
	if (npages) {
		if (!old.npages)
			change = KVM_MR_CREATE;
		else { /* Modify an existing slot. */
			if ((mem->userspace_addr != old.userspace_addr) ||
			    (npages != old.npages) ||
			    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
				goto out;

			if (base_gfn != old.base_gfn)
				change = KVM_MR_MOVE;
			else if (new.flags != old.flags)
				change = KVM_MR_FLAGS_ONLY;
			else { /* Nothing to change. */
				r = 0;
				goto out;
			}
		}
	} else if (old.npages) {
		change = KVM_MR_DELETE;
	} else /* Modify a non-existent slot: disallowed. */
		goto out;

	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
		/* Check for overlaps */
		r = -EEXIST;
		kvm_for_each_memslot(slot, kvm->memslots) {
			if ((slot->id >= KVM_USER_MEM_SLOTS) ||
			    (slot->id == mem->slot))
				continue;
			if (!((base_gfn + npages <= slot->base_gfn) ||
			      (base_gfn >= slot->base_gfn + slot->npages)))
				goto out;
		}
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;
	if (change == KVM_MR_CREATE) {
		new.userspace_addr = mem->userspace_addr;

		if (kvm_arch_create_memslot(&new, npages))
			goto out_free;
	}

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		if (kvm_create_dirty_bitmap(&new) < 0)
			goto out_free;
	}

	if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
		r = -ENOMEM;
		slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
				GFP_KERNEL);
		if (!slots)
			goto out_free;
		slot = id_to_memslot(slots, mem->slot);
		slot->flags |= KVM_MEMSLOT_INVALID;

		old_memslots = install_new_memslots(kvm, slots, NULL);

		/* slot was deleted or moved, clear iommu mapping */
		kvm_iommu_unmap_pages(kvm, &old);
		/* From this point no new shadow pages pointing to a deleted,
		 * or moved, memslot will be created.
		 *
		 * validation of sp->gfn happens in:
		 * 	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
		 * 	- kvm_is_visible_gfn (mmu_check_roots)
		 */
		kvm_arch_flush_shadow_memslot(kvm, slot);
		slots = old_memslots;
	}

	r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
	if (r)
		goto out_slots;

	r = -ENOMEM;
	/*
	 * We can re-use the old_memslots from above, the only difference
	 * from the currently installed memslots is the invalid flag.  This
	 * will get overwritten by update_memslots anyway.
	 */
	if (!slots) {
		slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
				GFP_KERNEL);
		if (!slots)
			goto out_free;
	}

	/*
	 * IOMMU mapping:  New slots need to be mapped.  Old slots need to be
	 * un-mapped and re-mapped if their base changes.  Since base change
	 * unmapping is handled above with slot deletion, mapping alone is
	 * needed here.  Anything else the iommu might care about for existing
	 * slots (size changes, userspace addr changes and read-only flag
	 * changes) is disallowed above, so any other attribute changes getting
	 * here can be skipped.
	 */
	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
		r = kvm_iommu_map_pages(kvm, &new);
		if (r)
			goto out_slots;
	}

	/* actual memory is freed via old in kvm_free_physmem_slot below */
	if (change == KVM_MR_DELETE) {
		new.dirty_bitmap = NULL;
		memset(&new.arch, 0, sizeof(new.arch));
	}

	old_memslots = install_new_memslots(kvm, slots, &new);

	kvm_arch_commit_memory_region(kvm, mem, &old, change);

	kvm_free_physmem_slot(&old, &new);
	kfree(old_memslots);

	return 0;

out_slots:
	kfree(slots);
out_free:
	kvm_free_physmem_slot(&new, &old);
out:
	return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

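/*
 * Locked wrapper around __kvm_set_memory_region(): takes kvm->slots_lock for
 * the duration of the memslot update.
 */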
int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem);
	mutex_unlock(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   struct kvm_userspace_memory_region *mem)
{
	if (mem->slot >= KVM_USER_MEM_SLOTS)
		return -EINVAL;
	return kvm_set_memory_region(kvm, mem);
}

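/*
 * Copy the dirty bitmap of the memslot selected by @log to userspace and
 * report via @is_dirty whether any bit in it is set.
 */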
int kvm_get_dirty_log(struct kvm *kvm,
			struct kvm_dirty_log *log, int *is_dirty)
{
	struct kvm_memory_slot *memslot;
	int r, i;
	unsigned long n;
	unsigned long any = 0;

	r = -EINVAL;
	if (log->slot >= KVM_USER_MEM_SLOTS)
		goto out;

	memslot = id_to_memslot(kvm->memslots, log->slot);
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	n = kvm_dirty_bitmap_bytes(memslot);

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

	if (any)
		*is_dirty = 1;

	r = 0;
out:
	return r;
}

bool kvm_largepages_enabled(void)
{
	return largepages_enabled;
}

void kvm_disable_largepages(void)
{
	largepages_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);