/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex	(while writing or truncating, not reading or faulting)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
 *       mapping->i_mmap_mutex
 *         anon_vma->rwsem
 *           mm->page_table_lock or pte_lock
 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
 *             swap_lock (in swap_duplicate, swap_info_get)
 *               mmlist_lock (in mmput, drain_mmlist and others)
 *               mapping->private_lock (in __set_page_dirty_buffers)
 *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *               bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                 sb_lock (within inode_lock in fs/fs-writeback.c)
 *                 mapping->tree_lock (widely used, in set_page_dirty,
 *                           in arch-dependent flush_dcache_mmap_lock,
 *                           within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/backing-dev.h>

#include <asm/tlbflush.h>

#include "internal.h"

static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
	struct anon_vma *anon_vma;

	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
	if (anon_vma) {
		atomic_set(&anon_vma->refcount, 1);
		/*
		 * Initialise the anon_vma root to point to itself. If called
		 * from fork, the root will be reset to the parent's anon_vma.
		 */
		anon_vma->root = anon_vma;
	}

	return anon_vma;
}

static inline void anon_vma_free(struct anon_vma *anon_vma)
{
	VM_BUG_ON(atomic_read(&anon_vma->refcount));

	/*
	 * Synchronize against page_lock_anon_vma_read() such that
	 * we can safely hold the lock without the anon_vma getting
	 * freed.
	 *
	 * Relies on the full mb implied by the atomic_dec_and_test() from
	 * put_anon_vma() against the acquire barrier implied by
	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
	 *
	 * page_lock_anon_vma_read()	VS	put_anon_vma()
	 *   down_read_trylock()		  atomic_dec_and_test()
	 *   LOCK				  MB
	 *   atomic_read()			  rwsem_is_locked()
	 *
	 * LOCK should suffice since the actual taking of the lock must
	 * happen _before_ what follows.
	 */
	might_sleep();
	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
		anon_vma_lock_write(anon_vma);
		anon_vma_unlock_write(anon_vma);
	}

	kmem_cache_free(anon_vma_cachep, anon_vma);
}

static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}

static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}

static void anon_vma_chain_link(struct vm_area_struct *vma,
				struct anon_vma_chain *avc,
				struct anon_vma *anon_vma)
{
	avc->vma = vma;
	avc->anon_vma = anon_vma;
	list_add(&avc->same_vma, &vma->anon_vma_chain);
	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}

/**
 * anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, but if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
 * and that may actually touch the spinlock even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * This must be called with the mmap_sem held for reading.
 */
int anon_vma_prepare(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	struct anon_vma_chain *avc;

	might_sleep();
	if (unlikely(!anon_vma)) {
		struct mm_struct *mm = vma->vm_mm;
		struct anon_vma *allocated;

		avc = anon_vma_chain_alloc(GFP_KERNEL);
		if (!avc)
			goto out_enomem;

		anon_vma = find_mergeable_anon_vma(vma);
		allocated = NULL;
		if (!anon_vma) {
			anon_vma = anon_vma_alloc();
			if (unlikely(!anon_vma))
				goto out_enomem_free_avc;
			allocated = anon_vma;
		}

		anon_vma_lock_write(anon_vma);
		/* page_table_lock to protect against threads */
		spin_lock(&mm->page_table_lock);
		if (likely(!vma->anon_vma)) {
			vma->anon_vma = anon_vma;
			anon_vma_chain_link(vma, avc, anon_vma);
			allocated = NULL;
			avc = NULL;
		}
		spin_unlock(&mm->page_table_lock);
		anon_vma_unlock_write(anon_vma);

		if (unlikely(allocated))
			put_anon_vma(allocated);
		if (unlikely(avc))
			anon_vma_chain_free(avc);
	}
	return 0;

 out_enomem_free_avc:
	anon_vma_chain_free(avc);
 out_enomem:
	return -ENOMEM;
}
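/*
 * Example (editor's illustrative sketch, not part of the original file):
 * an anonymous-fault path is expected to attach the anon_vma before it
 * inserts a brand new page, roughly along these lines.  The helper names
 * below are only for demonstration of the calling convention.
 *
 *	if (unlikely(anon_vma_prepare(vma)))
 *		return VM_FAULT_OOM;
 *	page = alloc_zeroed_user_highpage_movable(vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *	...
 *	page_add_new_anon_rmap(page, vma, address);
 *
 * As noted above, the call must be made with mmap_sem held for reading.
 */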

/*
 * This is a useful helper function for locking the anon_vma root as
 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 * have the same vma.
 *
 * Such anon_vma's should have the same root, so you'd expect to see
 * just a single mutex_lock for the whole traversal.
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
	struct anon_vma *new_root = anon_vma->root;
	if (new_root != root) {
		if (WARN_ON_ONCE(root))
			up_write(&root->rwsem);
		root = new_root;
		down_write(&root->rwsem);
	}
	return root;
}

static inline void unlock_anon_vma_root(struct anon_vma *root)
{
	if (root)
		up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
	struct anon_vma_chain *avc, *pavc;
	struct anon_vma *root = NULL;

	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma;

		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!avc)) {
			unlock_anon_vma_root(root);
			root = NULL;
			avc = anon_vma_chain_alloc(GFP_KERNEL);
			if (!avc)
				goto enomem_failure;
		}
		anon_vma = pavc->anon_vma;
		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_chain_link(dst, avc, anon_vma);
	}
	unlock_anon_vma_root(root);
	return 0;

 enomem_failure:
	unlink_anon_vmas(dst);
	return -ENOMEM;
}

/*
 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 * the corresponding VMA in the parent process is attached to.
 * Returns 0 on success, non-zero on failure.
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
	struct anon_vma_chain *avc;
	struct anon_vma *anon_vma;

	/* Don't bother if the parent process has no anon_vma here. */
	if (!pvma->anon_vma)
		return 0;

	/*
	 * First, attach the new VMA to the parent VMA's anon_vmas,
	 * so rmap can find non-COWed pages in child processes.
	 */
	if (anon_vma_clone(vma, pvma))
		return -ENOMEM;

	/* Then add our own anon_vma. */
	anon_vma = anon_vma_alloc();
	if (!anon_vma)
		goto out_error;
	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_error_free_anon_vma;

	/*
	 * The root anon_vma's spinlock is the lock actually used when we
	 * lock any of the anon_vmas in this anon_vma tree.
	 */
	anon_vma->root = pvma->anon_vma->root;
	/*
	 * With refcounts, an anon_vma can stay around longer than the
	 * process it belongs to. The root anon_vma needs to be pinned until
	 * this anon_vma is freed, because the lock lives in the root.
	 */
	get_anon_vma(anon_vma->root);
	/* Mark this anon_vma as the one where our new (COWed) pages go. */
	vma->anon_vma = anon_vma;
	anon_vma_lock_write(anon_vma);
	anon_vma_chain_link(vma, avc, anon_vma);
	anon_vma_unlock_write(anon_vma);

	return 0;

 out_error_free_anon_vma:
	put_anon_vma(anon_vma);
 out_error:
	unlink_anon_vmas(vma);
	return -ENOMEM;
}

void unlink_anon_vmas(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc, *next;
	struct anon_vma *root = NULL;

	/*
	 * Unlink each anon_vma chained to the VMA.  This list is ordered
	 * from newest to oldest, ensuring the root anon_vma gets freed last.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);

		/*
		 * Leave empty anon_vmas on the list - we'll need
		 * to free them outside the lock.
		 */
		if (RB_EMPTY_ROOT(&anon_vma->rb_root))
			continue;

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
	unlock_anon_vma_root(root);

	/*
	 * Iterate the list once more, it now only contains empty and unlinked
	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
	 * needing to write-acquire the anon_vma->root->rwsem.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		put_anon_vma(anon_vma);

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
}

static void anon_vma_ctor(void *data)
{
	struct anon_vma *anon_vma = data;

	init_rwsem(&anon_vma->rwsem);
	atomic_set(&anon_vma->refcount, 0);
	anon_vma->rb_root = RB_ROOT;
}

void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 *
 * Since there is no serialization whatsoever against page_remove_rmap()
 * the best this function can do is return a locked anon_vma that might
 * have been relevant to this page.
 *
 * The page might have been remapped to a different anon_vma or the anon_vma
 * returned may already be freed (and even reused).
 *
 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 * ensure that any anon_vma obtained from the page will still be valid for as
 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 *
 * All users of this function must be very careful when walking the anon_vma
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
 * that the anon_vma pointer from page->mapping is valid if there is a
 * mapcount, we can dereference the anon_vma after observing those.
 */
struct anon_vma *page_get_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	/*
	 * If this page is still mapped, then its anon_vma cannot have been
	 * freed.  But if it has been unmapped, we have no security against the
	 * anon_vma structure being freed and reused (for another anon_vma:
	 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
	 * above cannot corrupt).
	 */
	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}
out:
	rcu_read_unlock();

	return anon_vma;
}

/*
 * Similar to page_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with page_get_anon_vma() and then block on the mutex.
 */
struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	struct anon_vma *root_anon_vma;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	root_anon_vma = ACCESS_ONCE(anon_vma->root);
	if (down_read_trylock(&root_anon_vma->rwsem)) {
		/*
		 * If the page is still mapped, then this anon_vma is still
		 * its anon_vma, and holding the mutex ensures that it will
		 * not go away, see anon_vma_free().
		 */
		if (!page_mapped(page)) {
			up_read(&root_anon_vma->rwsem);
			anon_vma = NULL;
		}
		goto out;
	}

	/* trylock failed, we got to sleep */
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}

	/* we pinned the anon_vma, it's safe to sleep */
	rcu_read_unlock();
	anon_vma_lock_read(anon_vma);

	if (atomic_dec_and_test(&anon_vma->refcount)) {
		/*
		 * Oops, we held the last refcount, release the lock
		 * and bail -- can't simply use put_anon_vma() because
		 * we'll deadlock on the anon_vma_lock_write() recursion.
		 */
		anon_vma_unlock_read(anon_vma);
		__put_anon_vma(anon_vma);
		anon_vma = NULL;
	}

	return anon_vma;

out:
	rcu_read_unlock();
	return anon_vma;
}

void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
	anon_vma_unlock_read(anon_vma);
}
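/*
 * Example (editor's illustrative sketch, not part of the original file):
 * callers walking the reverse map of an anonymous page are expected to
 * bracket the walk with the lock/unlock pair above, e.g.
 *
 *	anon_vma = page_lock_anon_vma_read(page);
 *	if (!anon_vma)
 *		return ret;
 *	... iterate the interval tree in anon_vma->rb_root ...
 *	page_unlock_anon_vma_read(anon_vma);
 *
 * A NULL return means the page was no longer mapped (or not anonymous),
 * so there is nothing to walk.
 */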

/*
 * At what user virtual address is page expected in @vma?
 */
static inline unsigned long
__vma_address(struct page *page, struct vm_area_struct *vma)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

	if (unlikely(is_vm_hugetlb_page(vma)))
		pgoff = page->index << huge_page_order(page_hstate(page));

	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
}

inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
	unsigned long address = __vma_address(page, vma);

	/* page should be within @vma mapping range */
	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);

	return address;
}

/*
 * At what user virtual address is page expected in vma?
 * Caller should check the page is actually part of the vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	unsigned long address;
	if (PageAnon(page)) {
		struct anon_vma *page__anon_vma = page_anon_vma(page);
		/*
		 * Note: swapoff's unuse_vma() is more efficient with this
		 * check, and needs it to match anon_vma when KSM is active.
		 */
		if (!vma->anon_vma || !page__anon_vma ||
		    vma->anon_vma->root != page__anon_vma->root)
			return -EFAULT;
	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
		if (!vma->vm_file ||
		    vma->vm_file->f_mapping != page->mapping)
			return -EFAULT;
	} else
		return -EFAULT;
	address = __vma_address(page, vma);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
		return -EFAULT;
	return address;
}
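/*
 * Example (editor's illustrative sketch, not part of the original file):
 * callers treat the -EFAULT return as "not mapped in this vma", e.g.
 *
 *	unsigned long address = page_address_in_vma(page, vma);
 *	if (address == -EFAULT)
 *		continue;	/- page does not belong to this vma -/
 */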

pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		pmd = NULL;
out:
	return pmd;
}

/*
 * Check that @page is mapped at @address into @mm.
 *
 * If @sync is false, page_check_address may perform a racy check to avoid
 * the page table lock when the pte is not present (helpful when reclaiming
 * highly shared pages).
 *
 * On success returns with pte mapped and locked.
 */
pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
			  unsigned long address, spinlock_t **ptlp, int sync)
{
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;

	if (unlikely(PageHuge(page))) {
		/* when pud is not present, pte will be NULL */
		pte = huge_pte_offset(mm, address);
		if (!pte)
			return NULL;

		ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
		goto check;
	}

	pmd = mm_find_pmd(mm, address);
	if (!pmd)
		return NULL;

	if (pmd_trans_huge(*pmd))
		return NULL;

	pte = pte_offset_map(pmd, address);
	/* Make a quick check before getting the lock */
	if (!sync && !pte_present(*pte)) {
		pte_unmap(pte);
		return NULL;
	}

	ptl = pte_lockptr(mm, pmd);
check:
	spin_lock(ptl);
	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
		*ptlp = ptl;
		return pte;
	}
	pte_unmap_unlock(pte, ptl);
	return NULL;
}

/**
 * page_mapped_in_vma - check whether a page is really mapped in a VMA
 * @page: the page to test
 * @vma: the VMA to test
 *
 * Returns 1 if the page is mapped into the page tables of the VMA, 0
 * if the page is not mapped into the page tables of this VMA.  Only
 * valid for normal file or anonymous VMAs.
 */
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
{
	unsigned long address;
	pte_t *pte;
	spinlock_t *ptl;

	address = __vma_address(page, vma);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
		return 0;
	pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
	if (!pte)			/* the page is not in this mm */
		return 0;
	pte_unmap_unlock(pte, ptl);

	return 1;
}

struct page_referenced_arg {
	int mapcount;
	int referenced;
	unsigned long vm_flags;
	struct mem_cgroup *memcg;
};
/*
 * arg: page_referenced_arg will be passed
 */
static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
			unsigned long address, void *arg)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	int referenced = 0;
	struct page_referenced_arg *pra = arg;

	if (unlikely(PageTransHuge(page))) {
		pmd_t *pmd;

		/*
		 * rmap might return false positives; we must filter
		 * these out using page_check_address_pmd().
		 */
		pmd = page_check_address_pmd(page, mm, address,
					     PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
		if (!pmd)
			return SWAP_AGAIN;

		if (vma->vm_flags & VM_LOCKED) {
			spin_unlock(ptl);
			pra->vm_flags |= VM_LOCKED;
			return SWAP_FAIL; /* To break the loop */
		}

		/* go ahead even if the pmd is pmd_trans_splitting() */
		if (pmdp_clear_flush_young_notify(vma, address, pmd))
			referenced++;
		spin_unlock(ptl);
	} else {
		pte_t *pte;

		/*
		 * rmap might return false positives; we must filter
		 * these out using page_check_address().
		 */
		pte = page_check_address(page, mm, address, &ptl, 0);
		if (!pte)
			return SWAP_AGAIN;

		if (vma->vm_flags & VM_LOCKED) {
			pte_unmap_unlock(pte, ptl);
			pra->vm_flags |= VM_LOCKED;
			return SWAP_FAIL; /* To break the loop */
		}

		if (ptep_clear_flush_young_notify(vma, address, pte)) {
			/*
			 * Don't treat a reference through a sequentially read
			 * mapping as such.  If the page has been used in
			 * another mapping, we will catch it; if this other
			 * mapping is already gone, the unmap path will have
			 * set PG_referenced or activated the page.
			 */
			if (likely(!(vma->vm_flags & VM_SEQ_READ)))
				referenced++;
		}
		pte_unmap_unlock(pte, ptl);
	}

	if (referenced) {
		pra->referenced++;
		pra->vm_flags |= vma->vm_flags;
	}

	pra->mapcount--;
	if (!pra->mapcount)
		return SWAP_SUCCESS; /* To break the loop */

	return SWAP_AGAIN;
}

static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
{
	struct page_referenced_arg *pra = arg;
	struct mem_cgroup *memcg = pra->memcg;

	if (!mm_match_cgroup(vma->vm_mm, memcg))
		return true;

	return false;
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 * @memcg: target memory cgroup
 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page,
		    int is_locked,
		    struct mem_cgroup *memcg,
		    unsigned long *vm_flags)
{
	int ret;
	int we_locked = 0;
	struct page_referenced_arg pra = {
		.mapcount = page_mapcount(page),
		.memcg = memcg,
	};
	struct rmap_walk_control rwc = {
		.rmap_one = page_referenced_one,
		.arg = (void *)&pra,
		.anon_lock = page_lock_anon_vma_read,
	};

	*vm_flags = 0;
	if (!page_mapped(page))
		return 0;

	if (!page_rmapping(page))
		return 0;

	if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
		we_locked = trylock_page(page);
		if (!we_locked)
			return 1;
	}

	/*
	 * If we are reclaiming on behalf of a cgroup, skip
	 * counting on behalf of references from different
	 * cgroups
	 */
	if (memcg) {
		rwc.invalid_vma = invalid_page_referenced_vma;
	}

	ret = rmap_walk(page, &rwc);
	*vm_flags = pra.vm_flags;

	if (we_locked)
		unlock_page(page);

	return pra.referenced;
}
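/*
 * Example (editor's illustrative sketch, not part of the original file):
 * page reclaim is the expected caller, using the returned count and the
 * collected vm_flags to decide what to do with the page, e.g.
 *
 *	unsigned long vm_flags;
 *	int referenced = page_referenced(page, is_locked, memcg, &vm_flags);
 *
 *	if (vm_flags & VM_LOCKED)
 *		... cull the mlocked page rather than reclaiming it ...
 *	else if (referenced)
 *		... keep the page on the active list ...
 */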

static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
			    unsigned long address, void *arg)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte;
	spinlock_t *ptl;
	int ret = 0;
	int *cleaned = arg;

	pte = page_check_address(page, mm, address, &ptl, 1);
	if (!pte)
		goto out;

	if (pte_dirty(*pte) || pte_write(*pte)) {
		pte_t entry;

		flush_cache_page(vma, address, pte_pfn(*pte));
		entry = ptep_clear_flush(vma, address, pte);
		entry = pte_wrprotect(entry);
		entry = pte_mkclean(entry);
		set_pte_at(mm, address, pte, entry);
		ret = 1;
	}

	pte_unmap_unlock(pte, ptl);

	if (ret) {
		mmu_notifier_invalidate_page(mm, address);
		(*cleaned)++;
	}
out:
	return SWAP_AGAIN;
}

static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
	if (vma->vm_flags & VM_SHARED)
		return false;

	return true;
}

int page_mkclean(struct page *page)
{
	int cleaned = 0;
	struct address_space *mapping;
	struct rmap_walk_control rwc = {
		.arg = (void *)&cleaned,
		.rmap_one = page_mkclean_one,
		.invalid_vma = invalid_mkclean_vma,
	};

	BUG_ON(!PageLocked(page));

	if (!page_mapped(page))
		return 0;

	mapping = page_mapping(page);
	if (!mapping)
		return 0;

	rmap_walk(page, &rwc);

	return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);

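/*
 * Example (editor's illustrative sketch, not part of the original file):
 * page_mkclean() suits callers about to start writeback that need every
 * pte mapping the page write-protected and cleaned, roughly:
 *
 *	lock_page(page);
 *	if (page_mkclean(page))
 *		set_page_dirty(page);
 *	unlock_page(page);
 *
 * Any pte dirty bits are transferred to the page's dirty state this way.
 */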
/**
 * page_move_anon_rmap - move a page to our anon_vma
 * @page:	the page to move to our anon_vma
 * @vma:	the vma the page belongs to
 * @address:	the user virtual address mapped
 *
 * When a page belongs exclusively to one process after a COW event,
 * that page can be moved into the anon_vma that belongs to just that
 * process, so the rmap code will not search the parent or sibling
 * processes.
 */
void page_move_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON(!anon_vma);
	VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
}

/**
 * __page_set_anon_rmap - set up new anonymous rmap
 * @page:	Page to add to rmap	
 * @vma:	VM area to add page to.
 * @address:	User virtual address of the mapping	
 * @exclusive:	the page is exclusively owned by the current process
 */
static void __page_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;

	/*
	 * If the page isn't exclusively mapped into this vma,
	 * we must use the _oldest_ possible anon_vma for the
	 * page mapping!
	 */
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 */
static void __page_check_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.
	 *
	 * We have exclusion against page_add_anon_rmap because the caller
	 * always holds the page locked, except if called from page_dup_rmap,
	 * in which case the page is already known to be setup.
	 *
	 * We have exclusion against page_add_new_anon_rmap because those pages
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to page_add_new_anon_rmap.
	 */
	BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
	BUG_ON(page->index != linear_page_index(vma, address));
#endif
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 *
 * The caller needs to hold the pte lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that PageAnon is not being upgraded racily to PageKsm
 * (but PageKsm is never downgraded to PageAnon).
 */
void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	do_page_add_anon_rmap(page, vma, address, 0);
}

/*
 * Special version of the above for do_swap_page, which often runs
 * into pages that are exclusively owned by the current process.
 * Everybody else should continue to use page_add_anon_rmap above.
 */
void do_page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	int first = atomic_inc_and_test(&page->_mapcount);
	if (first) {
		/*
		 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
		 * these counters are not modified in interrupt context, and
		 * pte lock(a spinlock) is held, which implies preemption
		 * disabled.
		 */
		if (PageTransHuge(page))
			__inc_zone_page_state(page,
					      NR_ANON_TRANSPARENT_HUGEPAGES);
		__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
				hpage_nr_pages(page));
	}
	if (unlikely(PageKsm(page)))
		return;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	/* address might be in next vma when migration races vma_adjust */
	if (first)
		__page_set_anon_rmap(page, vma, address, exclusive);
	else
		__page_check_anon_rmap(page, vma, address);
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 * Page does not have to be locked.
 */
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	SetPageSwapBacked(page);
	atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
	if (PageTransHuge(page))
		__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
	__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
			hpage_nr_pages(page));
	__page_set_anon_rmap(page, vma, address, 1);

	VM_BUG_ON_PAGE(PageLRU(page), page);
	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL