/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>

#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void * high_memory;

EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;

EXPORT_SYMBOL(zero_pfn);

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);


#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */
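
/*
 * Illustrative sketch (not from the source): with SPLIT_RSS_COUNTING, fault
 * paths update the cheap per-task cache via inc_mm_counter_fast() /
 * dec_mm_counter_fast(), and the cached deltas are folded back into the mm
 * roughly once per TASK_RSS_EVENTS_THRESH faults:
 *
 *	inc_mm_counter_fast(mm, MM_ANONPAGES);	// buffered in current->rss_stat
 *	...
 *	check_sync_rss_stat(current);		// occasionally calls sync_mm_rss()
 *
 * Readers that need exact values (e.g. add_mm_rss_vec() below) call
 * sync_mm_rss() first so the buffered deltas are not lost.
 */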

#ifdef HAVE_GENERIC_MMU_GATHER

static int tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return 1;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return 0;

	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return 0;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr   = 0;
	batch->max  = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return 1;
}

/* tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for page-table
 *	tear-down from @mm. The @fullmm argument is used when @mm is without
 *	users and we're going to destroy the full address space (exit/execve).
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
{
	tlb->mm = mm;

	/* Is it from 0 to ~0? */
	tlb->fullmm     = !(start | (end+1));
	tlb->need_flush_all = 0;
	tlb->local.next = NULL;
	tlb->local.nr   = 0;
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->active     = &tlb->local;
	tlb->batch_count = 0;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
#endif

	__tlb_reset_range(tlb);
}
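
/*
 * Illustrative usage sketch (an assumption, mirroring zap_page_range()
 * further below): callers bracket a teardown with a gather so page frees
 * and TLB flushes can be batched:
 *
 *	struct mmu_gather tlb;
 *
 *	tlb_gather_mmu(&tlb, mm, start, end);
 *	... clear ptes, handing freed pages to __tlb_remove_page() ...
 *	tlb_finish_mmu(&tlb, start, end);
 *
 * Passing start = 0 and end = ~0UL marks the gather as fullmm, the
 * exit/execve case mentioned in the comment above.
 */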

static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
	if (!tlb->end)
		return;

	tlb_flush(tlb);
	mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif
	__tlb_reset_range(tlb);
}

static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
		batch->nr = 0;
	}
	tlb->active = &tlb->local;
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}

/* tlb_finish_mmu
 *	Called at the end of the shootdown operation to free up any resources
 *	that were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
	struct mmu_gather_batch *batch, *next;

	tlb_flush_mmu(tlb);

	/* keep the page table cache within bounds */
	check_pgt_cache();

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

/* __tlb_remove_page
 *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
 *	handling the additional races in SMP caused by other CPUs caching valid
 *	mappings in their TLBs. Returns the number of free page slots left.
 *	When out of page slots we must call tlb_flush_mmu().
 */
int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);

	batch = tlb->active;
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return 0;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, page);

	return batch->max - batch->nr;
}

#endif /* HAVE_GENERIC_MMU_GATHER */
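
/*
 * Illustrative sketch of the __tlb_remove_page() contract (an assumption,
 * mirroring zap_pte_range() further below): a return value of 0 means the
 * gather ran out of batch slots, so the caller must flush before queueing
 * more pages:
 *
 *	if (unlikely(!__tlb_remove_page(tlb, page))) {
 *		// stop filling, flush via tlb_flush_mmu(), then continue
 *		...
 *	}
 */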

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

/*
 * See the comment near struct mmu_table_batch.
 */

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	/*
	 * When there's less than two users of this mm there cannot be a
	 * concurrent page-table walk.
	 */
	if (atomic_read(&tlb->mm->mm_users) < 2) {
		__tlb_remove_table(table);
		return;
	}

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

#endif /* CONFIG_HAVE_RCU_TABLE_FREE */

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	atomic_long_dec(&tlb->mm->nr_ptes);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}
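
/*
 * Illustrative sketch (an assumption, based on the exit_mmap()/unmap_region()
 * callers in mm/mmap.c): teardown happens in two passes, first the ptes via
 * unmap_vmas(), then the now-empty page tables via free_pgtables():
 *
 *	struct mmu_gather tlb;
 *
 *	tlb_gather_mmu(&tlb, mm, 0, -1);
 *	unmap_vmas(&tlb, vma, 0, -1);
 *	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 *	tlb_finish_mmu(&tlb, 0, -1);
 */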

int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long address)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm, address);
	int wait_split_huge_page;
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	ptl = pmd_lock(mm, pmd);
	wait_split_huge_page = 0;
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		atomic_long_inc(&mm->nr_ptes);
		pmd_populate(mm, pmd, new);
		new = NULL;
	} else if (unlikely(pmd_trans_splitting(*pmd)))
		wait_split_huge_page = 1;
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	if (wait_split_huge_page)
		wait_split_huge_page(vma->anon_vma, pmd);
	return 0;
}
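
/*
 * Illustrative sketch (an assumption, based on the pte_alloc_map*() macros
 * in include/linux/mm.h): callers only take the slow path above when the
 * pmd is still empty, roughly:
 *
 *	#define pte_alloc_map(mm, vma, pmd, address)			\
 *		((unlikely(pmd_none(*(pmd))) &&				\
 *		  __pte_alloc(mm, vma, pmd, address)) ?			\
 *			NULL : pte_offset_map(pmd, address))
 */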

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	} else
		VM_BUG_ON(pmd_trans_splitting(*pmd));
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud = pud_offset(pgd, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
				"BUG: Bad page map: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	printk(KERN_ALERT
		"BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		current->comm,
		(long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	printk(KERN_ALERT
		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	/*
	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
	 */
	pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (!is_zero_pfn(pfn))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
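
/*
 * Worked example (illustrative, not from the source): a driver that calls
 * remap_pfn_range(vma, vma->vm_start, pfn_base, size, prot) ends up with
 * vma->vm_pgoff == pfn_base, so for every address in the vma
 *
 *	vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * equals the pfn installed there and vm_normal_page() returns NULL (the pte
 * is special). If a private mapping later COWs a page, the anonymous
 * replacement page's pfn no longer satisfies that identity, so it is treated
 * as a normal, refcounted page.
 */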

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (likely(!non_swap_entry(entry))) {
			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
							&src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			rss[MM_SWAPENTS]++;
		} else if (is_migration_entry(entry)) {
			page = migration_entry_to_page(entry);

			if (PageAnon(page))
				rss[MM_ANONPAGES]++;
			else
				rss[MM_FILEPAGES]++;

			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both
				 * parent and child to be set to read.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(*src_pte))
					pte = pte_swp_mksoft_dirty(pte);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page);
		if (PageAnon(page))
			rss[MM_ANONPAGES]++;
		else
			rss[MM_FILEPAGES]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*src_pmd)) {
			int err;
			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
			!vma->anon_vma)
		return 0;

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(vma->vm_flags);
	mmun_start = addr;
	mmun_end   = end;
	if (is_cow)
		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
						    mmun_end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow)
		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
}
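
/*
 * Illustrative sketch (an assumption, based on dup_mmap() in kernel/fork.c):
 * fork() walks the parent's vma list and calls copy_page_range() for each
 * vma, relying on the early return above to skip mappings that can simply
 * be refilled by page faults in the child:
 *
 *	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
 *		...
 *		retval = copy_page_range(mm, oldmm, mpnt);
 *		if (retval)
 *			goto out;
 *	}
 */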

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			continue;
		}

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else {
				if (pte_dirty(ptent)) {
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
				rss[MM_FILEPAGES]--;
			}
			page_remove_rmap(page);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(!__tlb_remove_page(tlb, page))) {
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}
		/* If details->check_mapping, we leave swap entries. */
		if (unlikely(details))
			continue;

		entry = pte_to_swp_entry(ptent);
		if (!non_swap_entry(entry))
			rss[MM_SWAPENTS]--;
		else if (is_migration_entry(entry)) {
			struct page *page;

			page = migration_entry_to_page(entry);

			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else
				rss[MM_FILEPAGES]--;
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu_free(tlb);

		if (addr != end)
			goto again;
	}

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
#ifdef CONFIG_DEBUG_VM
				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
						__func__, addr, end,
						vma->vm_start,
						vma->vm_end);
					BUG();
				}
#endif
				split_huge_page_pmd(vma, addr, pmd);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		}
		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_sem in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
	} while (pud++, addr = next, addr != end);

	return addr;
}

static void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	if (details && !details->check_mapping)
		details = NULL;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_pud_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
}


static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (vma->vm_file)
		uprobe_munmap(vma, start, end);

	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of mmap_region. When
			 * hugetlbfs ->mmap method fails,
			 * mmap_region() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
			if (vma->vm_file) {
				i_mmap_lock_write(vma->vm_file->f_mapping);
				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				i_mmap_unlock_write(vma->vm_file->f_mapping);
			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * Caller must protect the VMA list
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = start + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, start, end);
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, end, details);
	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);
}
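
/*
 * Illustrative sketch (an assumption, based on the madvise(MADV_DONTNEED)
 * path in mm/madvise.c): a typical caller hands the whole span to
 * zap_page_range() and lets the mmu_gather machinery batch the TLB flush
 * and the page frees:
 *
 *	zap_page_range(vma, start, end - start, NULL);
 *
 * details == NULL means "zap everything", including swap entries; truncation
 * paths instead pass a zap_details with check_mapping set so private COWed
 * pages survive.
 */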

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = address + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, address, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, address, end);
	unmap_single_vma(&tlb, vma, address, end, details);
	mmu_notifier_invalidate_range_end(mm, address, end);
	tlb_finish_mmu(&tlb, address, end);
}
