memory.c 105 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
Hugh Dickins's avatar
Hugh Dickins committed
48
#include <linux/ksm.h>
Linus Torvalds's avatar
Linus Torvalds committed
49
#include <linux/rmap.h>
50
#include <linux/export.h>
51
#include <linux/delayacct.h>
Linus Torvalds's avatar
Linus Torvalds committed
52
#include <linux/init.h>
53
#include <linux/writeback.h>
54
#include <linux/memcontrol.h>
Andrea Arcangeli's avatar
Andrea Arcangeli committed
55
#include <linux/mmu_notifier.h>
56 57 58
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
59
#include <linux/gfp.h>
60
#include <linux/migrate.h>
Andy Shevchenko's avatar
Andy Shevchenko committed
61
#include <linux/string.h>
62
#include <linux/dma-debug.h>
63
#include <linux/debugfs.h>
64
#include <linux/userfaultfd_k.h>
Linus Torvalds's avatar
Linus Torvalds committed
65

66
#include <asm/io.h>
Linus Torvalds's avatar
Linus Torvalds committed
67 68 69 70 71 72
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

73 74
#include "internal.h"

75 76
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
77 78
#endif

79
#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;		/* highest valid index into mem_map[] */
struct page *mem_map;			/* flat array of struct page for all of memory */

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, then end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void * high_memory;

EXPORT_SYMBOL(high_memory);

99 100 101 102 103 104 105 106 107 108 109 110
/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

/*
 * Boot-time handler for the "norandmaps" kernel parameter: turn off
 * all address-space randomization.
 */
static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

119
/* PFN of the global zero page, cached for fast is_zero_pfn() checks. */
unsigned long zero_pfn __read_mostly;
/* Highest PFN for which a struct page exists; used to validate ptes. */
unsigned long highest_memmap_pfn __read_mostly;

EXPORT_SYMBOL(zero_pfn);

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);
133

KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
134

135 136
#if defined(SPLIT_RSS_COUNTING)

/*
 * Fold current's cached per-task RSS deltas into @mm's shared counters
 * and reset the cache.  Only current's own rss_stat is consulted.
 */
void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

/*
 * Add @val to counter @member of @mm.  When @mm is current's own mm the
 * delta is accumulated in the cheap per-task cache; otherwise fall back
 * to updating the shared mm counter directly.
 */
static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	/* Only current may flush its own cached deltas. */
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
171
#else /* SPLIT_RSS_COUNTING */

/* No per-task cache: update the shared mm counters directly. */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

/* Nothing to flush when RSS counting is not split. */
static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */

#ifdef HAVE_GENERIC_MMU_GATHER

/*
 * Make a fresh page batch available as tlb->active: reuse the next batch
 * already on the chain if there is one, otherwise allocate a new one.
 * Returns false when no batch could be obtained (allocation failure or
 * MAX_GATHER_BATCH_COUNT reached); the caller must then flush.
 */
static bool tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return true;
	}

	/* Bound the number of batches held before forcing a flush. */
	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return false;

	/* GFP_NOWAIT: we are in page-table teardown and must not sleep. */
	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return false;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr   = 0;
	batch->max  = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return true;
}

/* tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for page-table
 *	tear-down from @mm. The @fullmm argument is used when @mm is without
 *	users and we're going to destroy the full address space (exit/execve).
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
{
	tlb->mm = mm;

	/* Is it from 0 to ~0? */
	tlb->fullmm     = !(start | (end+1));
	tlb->need_flush_all = 0;
	/* The embedded batch is the first active one; more may be chained. */
	tlb->local.next = NULL;
	tlb->local.nr   = 0;
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->active     = &tlb->local;
	tlb->batch_count = 0;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
#endif

	__tlb_reset_range(tlb);
}

237
/*
 * Flush the TLB for the range gathered so far and notify secondary MMUs.
 * A no-op when nothing has been gathered (tlb->end == 0).
 */
static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
	if (!tlb->end)
		return;

	tlb_flush(tlb);
	mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif
	__tlb_reset_range(tlb);
}

/*
 * Free the pages queued in all batches.  Must run after the TLB flush,
 * so no CPU can still hold a stale translation to a freed page.
 */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
		batch->nr = 0;
	}
	tlb->active = &tlb->local;
}

261 262 263 264 265 266
/* Flush the TLB for the gathered range, then free the gathered pages. */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}

267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
/* tlb_finish_mmu
 *	Called at the end of the shootdown operation to free up any resources
 *	that were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
	struct mmu_gather_batch *batch, *next;

	tlb_flush_mmu(tlb);

	/* keep the page table cache within bounds */
	check_pgt_cache();

	/* Release the extra batch pages allocated by tlb_next_batch(). */
	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

/* __tlb_remove_page
 *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
 *	handling the additional races in SMP caused by other CPUs caching valid
 *	mappings in their TLBs. Returns the number of free page slots left.
 *	When out of page slots we must call tlb_flush_mmu().
 */
int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
	struct mmu_gather_batch *batch;

	/* A range must have been gathered before pages are queued to free. */
	VM_BUG_ON(!tlb->end);

	batch = tlb->active;
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		/* Current batch full: chain a new one or report no slots. */
		if (!tlb_next_batch(tlb))
			return 0;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, page);

	return batch->max - batch->nr;
}

#endif /* HAVE_GENERIC_MMU_GATHER */

313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385
#ifdef CONFIG_HAVE_RCU_TABLE_FREE

/*
 * See the comment near struct mmu_table_batch.
 */

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	/*
	 * When there's less then two users of this mm there cannot be a
	 * concurrent page-table walk.
	 */
	if (atomic_read(&tlb->mm->mm_users) < 2) {
		__tlb_remove_table(table);
		return;
	}

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

386
#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
387

Linus Torvalds's avatar
Linus Torvalds committed
388 389 390 391
/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	atomic_long_dec(&tlb->mm->nr_ptes);
}

401 402 403
/*
 * Free the pte tables under one pud for [addr, end), then free the pmd
 * table itself if [floor, ceiling) shows no other range still uses it.
 */
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	/*
	 * Only free the pmd table when the whole PUD-aligned span it covers
	 * lies inside [floor, ceiling) — see comment in free_pgd_range().
	 */
	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

435 436 437
/*
 * Free the pmd levels under one pgd for [addr, end), then free the pud
 * table itself if [floor, ceiling) shows no other range still uses it.
 */
static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	/*
	 * Only free the pud table when the whole PGDIR-aligned span it covers
	 * lies inside [floor, ceiling) — see comment in free_pgd_range().
	 */
	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

529
/*
 * Walk the vma list and free the page tables each vma's range used,
 * batching adjacent non-hugetlb vmas into a single free_pgd_range() call.
 */
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}

564 565
/*
 * Allocate a pte table and install it in @pmd, racing safely against
 * other threads populating the same pmd.  Returns 0 on success or
 * -ENOMEM if the pte page could not be allocated.
 */
int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long address)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm, address);
	int wait_split_huge_page;
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	ptl = pmd_lock(mm, pmd);
	wait_split_huge_page = 0;
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		atomic_long_inc(&mm->nr_ptes);
		pmd_populate(mm, pmd, new);
		new = NULL;
	} else if (unlikely(pmd_trans_splitting(*pmd)))
		wait_split_huge_page = 1;
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	if (wait_split_huge_page)
		wait_split_huge_page(vma->anon_vma, pmd);
	return 0;
}

604
/*
 * Kernel-address-space counterpart of __pte_alloc(): allocate a pte
 * table for init_mm and install it in @pmd under page_table_lock.
 * Returns 0 on success or -ENOMEM.
 */
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	} else
		VM_BUG_ON(pmd_trans_splitting(*pmd));
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
624 625 626 627 628 629
static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
630
{
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
631 632
	int i;

633
	if (current->mm == mm)
634
		sync_mm_rss(mm);
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
635 636 637
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
638 639
}

640
/*
641 642 643
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
644 645 646
 *
 * The calling function must still handle the error.
 */
647 648
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
649
{
650 651 652 653 654
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud = pud_offset(pgd, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
655 656 657 658 659 660 661 662 663 664 665 666 667 668
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
669 670
			printk(KERN_ALERT
				"BUG: Bad page map: %lu messages suppressed\n",
671 672 673 674 675 676 677
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;
678 679 680 681

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

682 683
	printk(KERN_ALERT
		"BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
684 685
		current->comm,
		(long long)pte_val(pte), (long long)pmd_val(*pmd));
686
	if (page)
687
		dump_page(page, "bad pte");
688
	printk(KERN_ALERT
689 690 691 692 693
		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	/*
	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
	 */
694 695 696 697 698
	pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
699
	dump_stack();
700
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
701 702
}

703
/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		/* Special pte without a special vma: report it, except zero page. */
		if (!is_zero_pfn(pfn))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			/* VM_PFNMAP: apply the remap_pfn_range() linearity rule. */
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}

Linus Torvalds's avatar
Linus Torvalds committed
800 801 802 803 804 805
/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 *
 * Returns 0 on success, or the swap entry value when swap_duplicate()
 * needs a swap count continuation — the caller must add it and retry.
 */

static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (likely(!non_swap_entry(entry))) {
			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
							&src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			rss[MM_SWAPENTS]++;
		} else if (is_migration_entry(entry)) {
			page = migration_entry_to_page(entry);

			if (PageAnon(page))
				rss[MM_ANONPAGES]++;
			else
				rss[MM_FILEPAGES]++;

			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both
				 * parent and child to be set to read.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(*src_pte))
					pte = pte_swp_mksoft_dirty(pte);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	/* Take page and rmap references for the child's mapping. */
	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page);
		if (PageAnon(page))
			rss[MM_ANONPAGES]++;
		else
			rss[MM_FILEPAGES]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

888
/*
 * Copy one pte page's worth of mappings from @src_mm to @dst_mm, holding
 * both pte locks.  Drops the locks periodically to bound latency, and
 * retries from "again" when interrupted or when a swap count
 * continuation had to be allocated.  Returns 0 or -ENOMEM.
 */
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	/* copy_one_pte() hit a full swap map: add a continuation, retry. */
	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}

/*
 * Copy one pud entry's worth of page tables from @src_mm to @dst_mm
 * at the pmd level, for fork().
 *
 * Allocates the destination pmd page if needed, then walks every pmd
 * entry covering [addr, end).  Transparent huge pmds are copied whole
 * via copy_huge_pmd(); ordinary pmds descend into copy_pte_range().
 *
 * Returns 0 on success or -ENOMEM on allocation failure.
 */
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*src_pmd)) {
			int err;
			/* a huge pmd always maps exactly HPAGE_PMD_SIZE */
			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/*
			 * err > 0: the pmd stopped being huge under us
			 * (e.g. it was split); fall through and copy it
			 * as an ordinary pte page.
			 */
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

/*
 * Copy one pgd entry's worth of page tables from @src_mm to @dst_mm
 * at the pud level, for fork().
 *
 * Allocates the destination pud page if needed, then walks every pud
 * entry covering [addr, end), descending into copy_pmd_range() for
 * each populated entry.
 *
 * Returns 0 on success or -ENOMEM on allocation failure.
 */
static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);

	for (;;) {
		next = pud_addr_end(addr, end);
		/* Empty (or corrupt-and-cleared) entries need no copy. */
		if (!pud_none_or_clear_bad(src_pud)) {
			if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
					   vma, addr, next))
				return -ENOMEM;
		}
		dst_pud++;
		src_pud++;
		addr = next;
		if (addr == end)
			break;
	}
	return 0;
}

/*
 * Duplicate the user page tables of @vma from @src_mm into @dst_mm,
 * called at fork() time for each VMA being inherited.
 *
 * Returns 0 on success, -ENOMEM on page-table allocation failure, or
 * the error from track_pfn_copy() for VM_PFNMAP mappings.
 */
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
			!vma->anon_vma)
		return 0;

	/* hugetlb mappings have their own page-table layout and copier */
	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(vma->vm_flags);
	mmun_start = addr;
	mmun_end   = end;
	if (is_cow)
		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
						    mmun_end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	/* must pair with the _start call above, even on the error path */
	if (is_cow)
		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
}

1073
/*
 * Tear down the user ptes covering [addr, end) under @pmd, accumulating
 * freed pages and TLB invalidations into @tlb for batched flushing.
 *
 * @details, when non-NULL with a check_mapping, restricts the zap to
 * ptes whose page belongs to that address_space (used by
 * unmap_shared_mapping_pages(): unmap shared but keep private COWed
 * pages, and leave swap entries alone).
 *
 * Runs with the pte page-table lock held and lazy-mmu mode entered;
 * may drop the lock, flush, and restart ("again:") when the mmu_gather
 * batch fills up or dirty TLB entries must be flushed before the lock
 * is released.  Returns the address it stopped at (== end when done).
 */
static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;		/* flush TLB before dropping ptl */
	int rss[NR_MM_COUNTERS];	/* per-counter deltas, applied in bulk */
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			continue;
		}

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else {
				if (pte_dirty(ptent)) {
					/*
					 * Dirty pte: the TLB must be flushed
					 * before ptl is dropped, or a
					 * concurrent writer could redirty the
					 * page after set_page_dirty().
					 */
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
				rss[MM_FILEPAGES]--;
			}
			page_remove_rmap(page);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(!__tlb_remove_page(tlb, page))) {
				/* batch full: flush and restart past here */
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}
		/* If details->check_mapping, we leave swap entries. */
		if (unlikely(details))
			continue;

		/* Not present and not none: a swap or migration entry. */
		entry = pte_to_swp_entry(ptent);
		if (!non_swap_entry(entry))
			rss[MM_SWAPENTS]--;
		else if (is_migration_entry(entry)) {
			struct page *page;

			page = migration_entry_to_page(entry);

			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else
				rss[MM_FILEPAGES]--;
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu_free(tlb);

		if (addr != end)
			goto again;
	}

	return addr;
}

1185
/*
 * Tear down the pmd entries under @pud covering [addr, end).
 *
 * Transparent huge pmds are either zapped whole via zap_huge_pmd() or,
 * for a partial-range zap, split first so the pte-level code can do a
 * partial unmap.  Ordinary pmds descend into zap_pte_range().
 * Returns the address the zap advanced to (== end when done).
 */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
#ifdef CONFIG_DEBUG_VM
				/* splitting requires mmap_sem held */
				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
						__func__, addr, end,
						vma->vm_start,
						vma->vm_end);
					BUG();
				}
#endif
				split_huge_page_pmd(vma, addr, pmd);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		}
		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_sem in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

1229
/*
 * Tear down the pud entries under @pgd covering [addr, end),
 * descending into zap_pmd_range() for each populated entry.
 * Returns the address the zap advanced to (== end when done).
 */
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud = pud_offset(pgd, addr);
	unsigned long next;

	for (;;) {
		next = pud_addr_end(addr, end);
		/* Only populated entries have pmds to tear down. */
		if (!pud_none_or_clear_bad(pud))
			next = zap_pmd_range(tlb, vma, pud, addr, next,
					     details);
		pud++;
		addr = next;
		if (addr == end)
			break;
	}

	return addr;
}

1248 1249 1250 1251
static void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
Linus Torvalds's avatar
Linus Torvalds committed
1252 1253 1254 1255
{
	pgd_t *pgd;
	unsigned long next;

1256
	if (details && !details->check_mapping)
Linus Torvalds's avatar
Linus Torvalds committed
1257 1258 1259 1260 1261 1262 1263
		details = NULL;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
1264
		if (pgd_none_or_clear_bad(pgd))
Linus Torvalds's avatar
Linus Torvalds committed
1265
			continue;
1266 1267
		next = zap_pud_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
Linus Torvalds's avatar
Linus Torvalds committed
1268 1269
	tlb_end_vma(tlb, vma);
}
1270

1271 1272 1273

static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
1274
		unsigned long end_addr,
1275 1276 1277 1278