/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

unsigned long num_physpages;
/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void * high_memory;

EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);
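
/*
 * Usage note (illustrative, not part of the original file): booting with
 * "norandmaps" on the kernel command line runs disable_randmaps() above and
 * forces randomize_va_space to 0, which has the same effect as writing 0 to
 * the kernel.randomize_va_space sysctl at runtime, e.g.
 *
 *	echo 0 > /proc/sys/kernel/randomize_va_space
 */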


/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);
	pgd_clear(pgd);
}

void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}

void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token);
	tlb->mm->nr_ptes--;
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long start;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */
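	/*
	 * Worked example (hypothetical numbers; assumes 4K pages and a 2M
	 * PMD_SIZE): freeing addr = 0x00601000 .. end = 0x00800000 with
	 * floor = 0x00700000 and ceiling = 0 first rounds addr down to
	 * 0x00600000, which is below floor, so addr is bumped up by
	 * PMD_SIZE to 0x00800000; the "addr > end - 1" test then returns
	 * early, because no whole pmd-aligned span above floor lies inside
	 * the range being freed.
	 */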

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	start = addr;
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and vmtruncate before freeing pgtables
		 */
		anon_vma_unlink(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				anon_vma_unlink(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
	pgtable_t new = pte_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	spin_lock(&mm->page_table_lock);
	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	if (new)
		pte_free(mm, new);
	return 0;
}
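
/*
 * Ordering sketch for the smp_wmb() in __pte_alloc() above (an illustrative
 * simplification; assumes the usual lockless page-table walk):
 *
 *	CPU 0 (__pte_alloc)              CPU 1 (lockless walker)
 *	initialize the new pte page      load the pmd entry
 *	smp_wmb()                        walk into the pte page it points to
 *	pmd_populate(mm, pmd, new)
 *
 * If CPU 1 observes the newly populated pmd, the write barrier guarantees it
 * also observes the initialized pte page; the walker's loads are
 * data-dependent, which is why no explicit read barrier is needed except on
 * alpha, as the comment inside __pte_alloc() notes.
 */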

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
	if (file_rss)
		add_mm_counter(mm, file_rss, file_rss);
	if (anon_rss)
		add_mm_counter(mm, anon_rss, anon_rss);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud = pud_offset(pgd, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
				"BUG: Bad page map: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	printk(KERN_ALERT
		"BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		current->comm,
		(long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page) {
		printk(KERN_ALERT
		"page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
		page, (void *)page->flags, page_count(page),
		page_mapcount(page), page->mapping, page->index);
	}
	printk(KERN_ALERT
		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	/*
	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
	 */
	if (vma->vm_ops)
		print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
				(unsigned long)vma->vm_ops->fault);
	if (vma->vm_file && vma->vm_file->f_op)
		print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
				(unsigned long)vma->vm_file->f_op->mmap);
	dump_stack();
	add_taint(TAINT_BAD_PAGE);
}

static inline int is_cow_mapping(unsigned int flags)
{
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
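
/*
 * Example (illustrative, hypothetical mapping): a MAP_PRIVATE,
 * PROT_READ|PROT_WRITE mapping has VM_MAYWRITE set but not VM_SHARED, so
 * is_cow_mapping() is true for it; a MAP_SHARED, PROT_READ|PROT_WRITE
 * mapping has both VM_SHARED and VM_MAYWRITE set, so the test fails and the
 * mapping is never treated as COW.
 */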

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
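
/*
 * Worked example of the VM_PFNMAP linearity rule above (hypothetical
 * numbers, 4K pages): a remap_pfn_range() vma with vm_start = 0x40000000
 * and vm_pgoff = 0x100 maps pfn 0x100 at 0x40000000, pfn 0x101 at
 * 0x40001000, and so on.  A pte at addr = 0x40002000 whose pfn is 0x102
 * matches vm_pgoff + ((addr - vm_start) >> PAGE_SHIFT) and is therefore
 * treated as special (NULL returned); a COWed replacement page would carry
 * some unrelated pfn, fail the test, and be treated as a normal page.
 */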

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

static inline void
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		if (!pte_file(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			swap_duplicate(entry);
			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
						 &src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both parent
				 * and child to be set to read.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page, vma, addr);
		rss[!!PageAnon(page)]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
}

static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[2];

again:
	rss[1] = rss[0] = 0;
	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map_nested(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap_nested(src_pte - 1);
	add_mm_rss(dst_mm, rss[0], rss[1]);
	pte_unmap_unlock(dst_pte - 1, dst_ptl);
	cond_resched();
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
		if (!vma->anon_vma)
			return 0;
	}

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(is_pfn_mapping(vma))) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_vma_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_start(src_mm, addr, end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_end(src_mm,
						  vma->vm_start, end);
	return ret;
}

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	pte_t *pte;
	spinlock_t *ptl;
	int file_rss = 0;
	int anon_rss = 0;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			(*zap_work)--;
			continue;
		}

		(*zap_work) -= PAGE_SIZE;

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
				/*
				 * Each page->index must be checked when
				 * invalidating or truncating nonlinear.
				 */
				if (details->nonlinear_vma &&
				    (page->index < details->first_index ||
				     page->index > details->last_index))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (unlikely(details) && details->nonlinear_vma
			    && linear_page_index(details->nonlinear_vma,
						addr) != page->index)
				set_pte_at(mm, addr, pte,
					   pgoff_to_pte(page->index));
			if (PageAnon(page))
				anon_rss--;
			else {
				if (pte_dirty(ptent))
					set_page_dirty(page);
				if (pte_young(ptent) &&
				    likely(!VM_SequentialReadHint(vma)))
					mark_page_accessed(page);
				file_rss--;
			}
			page_remove_rmap(page);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			tlb_remove_page(tlb, page);
			continue;
		}
		/*
		 * If details->check_mapping, we leave swap entries;
		 * if details->nonlinear_vma, we leave file entries.
		 */
		if (unlikely(details))
			continue;
		if (pte_file(ptent)) {
			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
				print_bad_pte(vma, addr, ptent, NULL);
		} else if
		  (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

	add_mm_rss(mm, file_rss, anon_rss);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd)) {
			(*zap_work)--;
			continue;
		}
		next = zap_pte_range(tlb, vma, pmd, addr, next,
						zap_work, details);
	} while (pmd++, addr = next, (addr != end && *zap_work > 0));

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud)) {
			(*zap_work)--;
			continue;
		}
		next = zap_pmd_range(tlb, vma, pud, addr, next,
						zap_work, details);
	} while (pud++, addr = next, (addr != end && *zap_work > 0));

	return addr;
}

static unsigned long unmap_page_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	if (details && !details->check_mapping && !details->nonlinear_vma)
		details = NULL;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			(*zap_work)--;
			continue;
		}
		next = zap_pud_range(tlb, vma, pgd, addr, next,
						zap_work, details);
	} while (pgd++, addr = next, (addr != end && *zap_work > 0));
	tlb_end_vma(tlb, vma);

	return addr;
}

#ifdef CONFIG_PREEMPT
# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
#else
/* No preempt: go for improved straight-line efficiency */
# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
#endif
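
/*
 * Illustrative arithmetic (assuming 4K pages): with CONFIG_PREEMPT the zap
 * batch is 8 * PAGE_SIZE = 32KB, so unmap_vmas() below processes at most 8
 * pages before finishing the mmu_gather and checking whether it should
 * reschedule; without preemption the batch is 1024 * PAGE_SIZE = 4MB,
 * trading latency for straight-line throughput.
 */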

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlbp: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Returns the end address of the unmapping (restart addr if interrupted).
 *
 * Unmap all pages in the vma list.
 *
 * We aim to not hold locks for too long (for scheduling latency reasons).
 * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
 * return the ending mmu_gather to the caller.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
unsigned long unmap_vmas(struct mmu_gather **tlbp,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr, unsigned long *nr_accounted,
		struct zap_details *details)
{
	long zap_work = ZAP_BLOCK_SIZE;
	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
	int tlb_start_valid = 0;
	unsigned long start = start_addr;
	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
	int fullmm = (*tlbp)->fullmm;
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
		unsigned long end;

		start = max(vma->vm_start, start_addr);
		if (start >= vma->vm_end)
			continue;
		end = min(vma->vm_end, end_addr);
		if (end <= vma->vm_start)
			continue;

		if (vma->vm_flags & VM_ACCOUNT)
			*nr_accounted += (end - start) >> PAGE_SHIFT;

		if (unlikely(is_pfn_mapping(vma)))
			untrack_pfn_vma(vma, 0, 0);

		while (start != end) {
			if (!tlb_start_valid) {
				tlb_start = start;
				tlb_start_valid = 1;
			}

			if (unlikely(is_vm_hugetlb_page(vma))) {
				/*
				 * It is undesirable to test vma->vm_file as it
				 * should be non-null for valid hugetlb area.
				 * However, vm_file will be NULL in the error
				 * cleanup path of do_mmap_pgoff. When
				 * hugetlbfs ->mmap method fails,
				 * do_mmap_pgoff() nullifies vma->vm_file
				 * before calling this function to clean up.
				 * Since no pte has actually been setup, it is
				 * safe to do nothing in this case.
				 */
				if (vma->vm_file) {
					unmap_hugepage_range(vma, start, end, NULL);
					zap_work -= (end - start) /
					pages_per_huge_page(hstate_vma(vma));
				}

				start = end;
			} else
				start = unmap_page_range(*tlbp, vma,
						start, end, &zap_work, details);

			if (zap_work > 0) {
				BUG_ON(start != end);
				break;
			}

			tlb_finish_mmu(*tlbp, tlb_start, start);

			if (need_resched() ||
				(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
				if (i_mmap_lock) {
					*tlbp = NULL;
					goto out;
				}
				cond_resched();
			}

			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
			tlb_start_valid = 0;
			zap_work = ZAP_BLOCK_SIZE;
		}
	}
out:
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
	return start;	/* which is now the end (or restart) address */
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 */
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather *tlb;
	unsigned long end = address + size;
	unsigned long nr_accounted = 0;

	lru_add_drain();
	tlb = tlb_gather_mmu(mm, 0);
	update_hiwater_rss(mm);
	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
	if (tlb)
		tlb_finish_mmu(tlb, address, end);
	return end;
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
	    		!(vma->vm_flags & VM_PFNMAP))
		return -1;
	zap_page_range(vma, address, size, NULL);
	return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
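
/*
 * Usage sketch (hypothetical driver code, not taken from this file): a
 * driver that set up a VM_PFNMAP mapping with remap_pfn_range() can tear
 * the ptes down again before the vma goes away, e.g.
 *
 *	static void my_drv_unmap(struct vm_area_struct *vma)
 *	{
 *		zap_vma_ptes(vma, vma->vm_start,
 *			     vma->vm_end - vma->vm_start);
 *	}
 *
 * The range must lie entirely inside the vma and the vma must be VM_PFNMAP,
 * otherwise zap_vma_ptes() returns -1 without doing anything.
 */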

/*
 * Do a quick page-table lookup for a single page.
 */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
		goto out;
	}

	page = NULL;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto no_page_table;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		goto no_page_table;
	if (pud_huge(*pud)) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
		goto out;
	}
	if (unlikely(pud_bad(*pud)))
		goto no_page_table;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		goto no_page_table;
	if (pmd_huge(*pmd)) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
		goto out;
	}
	if (unlikely(pmd_bad(*pmd)))
		goto no_page_table;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);

	pte = *ptep;
	if (!pte_present(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;
	page = vm_normal_page(vma, address, pte);
	if (unlikely(!page))
		goto bad_page;

	if (flags & FOLL_GET)
		get_page(page);
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		mark_page_accessed(page);
	}
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return page;

bad_page:
	pte_unmap_unlock(ptep, ptl);
	return ERR_PTR(-EFAULT);

no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return page;
	/* Fall through to ZERO_PAGE handling */
no_page_table:
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate page tables.
	 */
	if (flags & FOLL_ANON) {
		page = ZERO_PAGE(0);
		if (flags & FOLL_GET)
			get_page(page);
		BUG_ON(flags & FOLL_WRITE);
	}
	return page;
}

/* Can we do the FOLL_ANON optimization? */
static inline int use_zero_page(struct vm_area_struct *vma)
{
	/*
	 * We don't want to optimize FOLL_ANON for make_pages_present()
	 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
	 * we want to get the page from the page tables to make sure
	 * that we serialize and update with any other user of that
	 * mapping.
	 */
	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
		return 0;
	/*
	 * And if we have a fault routine, it's not an anonymous region.
	 */
	return !vma->vm_ops || !vma->vm_ops->fault;
}

int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long start, int len, int flags,
		struct page **pages, struct vm_area_struct **vmas)
{
	int i;
	unsigned int vm_flags = 0;
	int write = !!(flags & GUP_FLAGS_WRITE);
	int force = !!(flags & GUP_FLAGS_FORCE);
	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
	int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);

	if (len <= 0)
		return 0;
	/*
	 * Require read or write permissions.
	 * If 'force' is set, we only require the "MAY" flags.
	 */
	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	i = 0;

	do {
		struct vm_area_struct *vma;
		unsigned int foll_flags;

		vma = find_extend_vma(mm, start);
		if (!vma && in_gate_area(tsk, start)) {
			unsigned long pg = start & PAGE_MASK;
			struct vm_area_struct *gate_vma = get_gate_vma(tsk);
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;
			pte_t *pte;

			/* user gate pages are read-only */
			if (!ignore && write)
				return i ? : -EFAULT;
			if (pg > TASK_SIZE)
				pgd = pgd_offset_k(pg);
			else
				pgd = pgd_offset_gate(mm, pg);
			BUG_ON(pgd_none(*pgd));
			pud = pud_offset(pgd, pg);
			BUG_ON(pud_none(*pud));
			pmd = pmd_offset(pud, pg);
			if (pmd_none(*pmd))
				return i ? : -EFAULT;
			pte = pte_offset_map(pmd, pg);
			if (pte_none(*pte)) {
				pte_unmap(pte);
				return i ? : -EFAULT;
			}
			if (pages) {
				struct page *page = vm_normal_page(gate_vma, start, *pte);
				pages[i] = page;
				if (page)
					get_page(page);
			}
			pte_unmap(pte);
			if (vmas)
				vmas[i] = gate_vma;
			i++;
			start += PAGE_SIZE;
			len--;
			continue;
		}

		if (!vma ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    (!ignore && !(vm_flags & vma->vm_flags)))
			return i ? : -EFAULT;

		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &len, i, write);
			continue;
		}

		foll_flags = FOLL_TOUCH;
		if (pages)
			foll_flags |= FOLL_GET;
		if (!write && use_zero_page(vma))
			foll_flags |= FOLL_ANON;

		do {
			struct page *page;

			/*
			 * If we have a pending SIGKILL, don't keep faulting
			 * pages and potentially allocating memory, unless
			 * current is handling munlock--e.g., on exit. In
			 * that case, we are not allocating memory.  Rather,
			 * we're only unlocking already resident/mapped pages.
			 */
			if (unlikely(!ignore_sigkill &&
					fatal_signal_pending(current)))
				return i ? i : -ERESTARTSYS;
