/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>

#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void * high_memory;

EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);

#if defined(SPLIT_RSS_COUNTING)

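/*
 * Fold this task's cached rss_stat deltas back into the mm's counters
 * and reset the per-task cache.
 */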
void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}
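/*
 * Fast-path RSS update: when the current task owns @mm the delta goes into
 * the cheap task-local cache, otherwise fall back to the shared atomic
 * counter.
 */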

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */

#ifdef HAVE_GENERIC_MMU_GATHER
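/*
 * Move tlb->active on to the next batch, allocating a fresh one if needed
 * (bounded by MAX_GATHER_BATCH_COUNT).  Returns 0 when no further pages
 * can be queued and the caller must flush first.
 */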

static int tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return 1;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return 0;

	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return 0;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr   = 0;
	batch->max  = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return 1;
}

/* tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for page-table
 *	tear-down from @mm. The @fullmm argument is used when @mm is without
 *	users and we're going to destroy the full address space (exit/execve).
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
{
	tlb->mm = mm;

	/* Is it from 0 to ~0? */
	tlb->fullmm     = !(start | (end+1));
	tlb->need_flush_all = 0;
	tlb->start	= start;
	tlb->end	= end;
	tlb->need_flush = 0;
	tlb->local.next = NULL;
	tlb->local.nr   = 0;
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->active     = &tlb->local;
	tlb->batch_count = 0;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
#endif
}

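/*
 * Flush the TLB for the range gathered so far; the gathered pages
 * themselves are freed separately by tlb_flush_mmu_free().
 */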
static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
	tlb->need_flush = 0;
	tlb_flush(tlb);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif
}

static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
		batch->nr = 0;
	}
	tlb->active = &tlb->local;
}

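/*
 * Flush the TLB and free every page gathered so far.  Does nothing unless
 * a flush is actually pending (tlb->need_flush).
 */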
void tlb_flush_mmu(struct mmu_gather *tlb)
{
	if (!tlb->need_flush)
		return;
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}

/* tlb_finish_mmu
 *	Called at the end of the shootdown operation to free up any resources
 *	that were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
	struct mmu_gather_batch *batch, *next;

	tlb_flush_mmu(tlb);

	/* keep the page table cache within bounds */
	check_pgt_cache();

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

/* __tlb_remove_page
 *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
 *	handling the additional races in SMP caused by other CPUs caching valid
 *	mappings in their TLBs. Returns the number of free page slots left.
 *	When out of page slots we must call tlb_flush_mmu().
 */
int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->need_flush);

	batch = tlb->active;
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return 0;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, page);

	return batch->max - batch->nr;
}

#endif /* HAVE_GENERIC_MMU_GATHER */

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

/*
 * See the comment near struct mmu_table_batch.
 */

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->need_flush = 1;

	/*
	 * When there are fewer than two users of this mm there cannot be a
	 * concurrent page-table walk.
	 */
	if (atomic_read(&tlb->mm->mm_users) < 2) {
		__tlb_remove_table(table);
		return;
	}

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

#endif /* CONFIG_HAVE_RCU_TABLE_FREE */

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	atomic_long_dec(&tlb->mm->nr_ptes);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}

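/*
 * Allocate a new page-table page and install it in @pmd, unless another
 * thread populated the pmd first or a transparent huge page is being
 * split (in which case we wait for the split to finish).
 */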
int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long address)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm, address);
	int wait_split_huge_page;
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	ptl = pmd_lock(mm, pmd);
	wait_split_huge_page = 0;
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		atomic_long_inc(&mm->nr_ptes);
		pmd_populate(mm, pmd, new);
		new = NULL;
	} else if (unlikely(pmd_trans_splitting(*pmd)))
		wait_split_huge_page = 1;
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	if (wait_split_huge_page)
		wait_split_huge_page(vma->anon_vma, pmd);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	} else
		VM_BUG_ON(pmd_trans_splitting(*pmd));
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

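/*
 * RSS deltas for a page-table copy are accumulated in a stack-local vector
 * and folded into the mm's counters in one go by add_mm_rss_vec().
 */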
static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud = pud_offset(pgd, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
				"BUG: Bad page map: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	printk(KERN_ALERT
		"BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		current->comm,
		(long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	printk(KERN_ALERT
		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	/*
	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
	 */
	if (vma->vm_ops)
		printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
		       vma->vm_ops->fault);
	if (vma->vm_file)
		printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
		       vma->vm_file->f_op->mmap);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte) || pte_numa(pte)))
			goto check_pfn;
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (!is_zero_pfn(pfn))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	if (is_zero_pfn(pfn))
		return NULL;

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		if (!pte_file(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
						 &src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			if (likely(!non_swap_entry(entry)))
				rss[MM_SWAPENTS]++;
			else if (is_migration_entry(entry)) {
				page = migration_entry_to_page(entry);

				if (PageAnon(page))
					rss[MM_ANONPAGES]++;
				else
					rss[MM_FILEPAGES]++;

				if (is_write_migration_entry(entry) &&
				    is_cow_mapping(vm_flags)) {
					/*
					 * COW mappings require pages in both
					 * parent and child to be set to read.
					 */
					make_migration_entry_read(&entry);
					pte = swp_entry_to_pte(entry);
					if (pte_swp_soft_dirty(*src_pte))
						pte = pte_swp_mksoft_dirty(pte);
					set_pte_at(src_mm, addr, src_pte, pte);
				}
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page);
		if (PageAnon(page))
			rss[MM_ANONPAGES]++;
		else
			rss[MM_FILEPAGES]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

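/*
 * Copy one pte range from the parent to the child, batching rss updates and
 * periodically dropping the page-table locks to keep latencies down.
 */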
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*src_pmd)) {
			int err;
			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;