Commit b291f000 authored by Nick Piggin, committed by Linus Torvalds

mlock: mlocked pages are unevictable



Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.

This is achieved through various strategies:

1) add yet another page flag--PG_mlocked--to indicate that
   the page is locked for efficient testing in vmscan and,
   optionally, the fault path.  This allows early culling of
   unevictable pages, preventing them from getting to
   page_referenced()/try_to_unmap().  Also allows separate
   accounting of mlock'd pages, as Nick's original patch
   did.

   Note:  Nick's original mlock patch used a PG_mlocked
   flag.  I had removed this in favor of the PG_unevictable
   flag + an mlock_count [new page struct member].  I
   restored the PG_mlocked flag to eliminate the new
   count field.

2) add the mlock/unevictable infrastructure to mm/mlock.c,
   with internal APIs in mm/internal.h.  This is a rework
   of Nick's original patch to these files, taking into
   account that mlocked pages are now kept on unevictable
   LRU list.

3) update vmscan.c:page_evictable() to check PageMlocked()
   and, if a vma is passed in, the vma's vm_flags.  Note that
   the vma will only be passed in for new pages in the fault
   path, and then only if the "cull unevictable pages in fault
   path" patch is included.  (A sketch of this check follows
   the list below.)

4) add try_to_munlock() to rmap.c to walk a page's rmap and
   ClearPageMlocked() if no other vmas have it mlocked.
   Reuses as much of try_to_unmap() as possible.  This
   effectively replaces the use of one of the lru list links
   as an mlock count.  If this mechanism lets pages in mlocked
   vmas leak through w/o PG_mlocked set [I don't know that it
   does], we should catch them later in try_to_unmap().  One
   hopes this will be rare, as it will be relatively expensive.
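
For reference, a minimal sketch of the page_evictable() culling check
described in (3), assuming the fault-path culling patch is applied.  This
is an illustration of the mechanism, not part of this patch's diff;
mapping_unevictable() comes from the earlier unevictable-LRU work and
is_mlocked_vma() from mm/internal.h below.

   /* Sketch only: early culling of mlocked pages in vmscan. */
   int page_evictable(struct page *page, struct vm_area_struct *vma)
   {
           if (mapping_unevictable(page_mapping(page)))
                   return 0;       /* e.g. SHM_LOCK'd shared memory */

           if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
                   return 0;       /* mlocked: keep off the reclaim scan */

           return 1;               /* evictable */
   }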

Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>

splitlru: introduce __get_user_pages():

  The new munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
  because the current get_user_pages() cannot grab PROT_NONE pages and
  therefore PROT_NONE pages cannot be munlocked.
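
  As an illustration (not part of the diff), a hypothetical munlock-side
  helper could pin pages in a possibly PROT_NONE range like this; only
  __get_user_pages() and the GUP_FLAGS_* flags below come from this patch:

    /* Hypothetical caller sketch; error handling omitted. */
    static int munlock_pin_range(struct mm_struct *mm, unsigned long start,
                                 int nr_pages, struct page **pages)
    {
            int flags = GUP_FLAGS_IGNORE_VMA_PERMISSIONS; /* allow PROT_NONE */

            return __get_user_pages(current, mm, start, nr_pages,
                                    flags, pages, NULL);
    }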

[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and several comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 89e004ea
......@@ -131,6 +131,11 @@ extern unsigned int kobjsize(const void *objp);
#define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ)
#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ)
/*
* special vmas that are non-mergable, non-mlock()able
*/
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
......
......@@ -96,6 +96,7 @@ enum pageflags {
PG_swapbacked, /* Page is backed by RAM/swap */
#ifdef CONFIG_UNEVICTABLE_LRU
PG_unevictable, /* Page is "unevictable" */
PG_mlocked, /* Page is vma mlocked */
#endif
#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
PG_uncached, /* Page has been mapped as uncached */
......@@ -232,7 +233,17 @@ PAGEFLAG_FALSE(SwapCache)
#ifdef CONFIG_UNEVICTABLE_LRU
PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
TESTCLEARFLAG(Unevictable, unevictable)
#define MLOCK_PAGES 1
PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
TESTSCFLAG(Mlocked, mlocked)
#else
#define MLOCK_PAGES 0
PAGEFLAG_FALSE(Mlocked)
SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked)
PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable)
SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable)
__CLEARPAGEFLAG_NOOP(Unevictable)
......@@ -354,15 +365,17 @@ static inline void __ClearPageTail(struct page *page)
#endif /* !PAGEFLAGS_EXTENDED */
#ifdef CONFIG_UNEVICTABLE_LRU
#define __PG_UNEVICTABLE (1 << PG_unevictable)
#define __PG_UNEVICTABLE (1 << PG_unevictable)
#define __PG_MLOCKED (1 << PG_mlocked)
#else
#define __PG_UNEVICTABLE 0
#define __PG_UNEVICTABLE 0
#define __PG_MLOCKED 0
#endif
#define PAGE_FLAGS (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \
1 << PG_buddy | 1 << PG_writeback | \
1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
__PG_UNEVICTABLE)
__PG_UNEVICTABLE | __PG_MLOCKED)
/*
* Flags checked in bad_page(). Pages on the free list should not have
......
......@@ -117,6 +117,19 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
*/
int page_mkclean(struct page *);
#ifdef CONFIG_UNEVICTABLE_LRU
/*
* called in munlock()/munmap() path to check for other vmas holding
* the page mlocked.
*/
int try_to_munlock(struct page *);
#else
static inline int try_to_munlock(struct page *page)
{
return 0; /* a.k.a. SWAP_SUCCESS */
}
#endif
#else /* !CONFIG_MMU */
#define anon_vma_init() do {} while (0)
......@@ -140,5 +153,6 @@ static inline int page_mkclean(struct page *page)
#define SWAP_SUCCESS 0
#define SWAP_AGAIN 1
#define SWAP_FAIL 2
#define SWAP_MLOCK 3
#endif /* _LINUX_RMAP_H */
......@@ -61,6 +61,10 @@ static inline unsigned long page_order(struct page *page)
return page_private(page);
}
extern int mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
extern void munlock_vma_pages_all(struct vm_area_struct *vma);
#ifdef CONFIG_UNEVICTABLE_LRU
/*
* unevictable_migrate_page() called only from migrate_page_copy() to
......@@ -79,6 +83,65 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
}
#endif
#ifdef CONFIG_UNEVICTABLE_LRU
/*
* Called only in fault path via page_evictable() for a new page
* to determine if it's being mapped into a LOCKED vma.
* If so, mark page as mlocked.
*/
static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
{
VM_BUG_ON(PageLRU(page));
if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
return 0;
SetPageMlocked(page);
return 1;
}
/*
* must be called with vma's mmap_sem held for read, and page locked.
*/
extern void mlock_vma_page(struct page *page);
/*
* Clear the page's PageMlocked(). This can be useful in a situation where
* we want to unconditionally remove a page from the pagecache -- e.g.,
* on truncation or freeing.
*
* It is legal to call this function for any page, mlocked or not.
* If called for a page that is still mapped by mlocked vmas, all we do
* is revert to lazy LRU behaviour -- semantics are not broken.
*/
extern void __clear_page_mlock(struct page *page);
static inline void clear_page_mlock(struct page *page)
{
if (unlikely(TestClearPageMlocked(page)))
__clear_page_mlock(page);
}
/*
* mlock_migrate_page - called only from migrate_page_copy() to
* migrate the Mlocked page flag
*/
static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
if (TestClearPageMlocked(page))
SetPageMlocked(newpage);
}
#else /* CONFIG_UNEVICTABLE_LRU */
static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
{
return 0;
}
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
#endif /* CONFIG_UNEVICTABLE_LRU */
/*
* FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
......@@ -148,4 +211,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
#endif /* CONFIG_SPARSEMEM */
#define GUP_FLAGS_WRITE 0x1
#define GUP_FLAGS_FORCE 0x2
#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int flags,
struct page **pages, struct vm_area_struct **vmas);
#endif
......@@ -64,6 +64,8 @@
#include "internal.h"
#include "internal.h"
#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
......@@ -1129,12 +1131,17 @@ static inline int use_zero_page(struct vm_area_struct *vma)
return !vma->vm_ops || !vma->vm_ops->fault;
}
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int flags,
struct page **pages, struct vm_area_struct **vmas)
{
int i;
unsigned int vm_flags;
unsigned int vm_flags = 0;
int write = !!(flags & GUP_FLAGS_WRITE);
int force = !!(flags & GUP_FLAGS_FORCE);
int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
if (len <= 0)
return 0;
......@@ -1158,7 +1165,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
if (write) /* user gate pages are read-only */
/* user gate pages are read-only */
if (!ignore && write)
return i ? : -EFAULT;
if (pg > TASK_SIZE)
pgd = pgd_offset_k(pg);
......@@ -1190,8 +1199,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
continue;
}
if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
|| !(vm_flags & vma->vm_flags))
if (!vma ||
(vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
(!ignore && !(vm_flags & vma->vm_flags)))
return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) {
......@@ -1266,6 +1276,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
} while (len);
return i;
}
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
{
int flags = 0;
if (write)
flags |= GUP_FLAGS_WRITE;
if (force)
flags |= GUP_FLAGS_FORCE;
return __get_user_pages(tsk, mm,
start, len, flags,
pages, vmas);
}
EXPORT_SYMBOL(get_user_pages);
pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
......@@ -1858,6 +1885,15 @@ gotten:
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!new_page)
goto oom;
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if (vma->vm_flags & VM_LOCKED) {
lock_page(old_page); /* for LRU manipulation */
clear_page_mlock(old_page);
unlock_page(old_page);
}
cow_user_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page);
......@@ -2325,7 +2361,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_add_anon_rmap(page, vma, address);
swap_free(entry);
if (vm_swap_full())
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
remove_exclusive_swap_page(page);
unlock_page(page);
......@@ -2465,6 +2501,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = VM_FAULT_OOM;
goto out;
}
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if (vma->vm_flags & VM_LOCKED)
clear_page_mlock(vmf.page);
copy_user_highpage(page, vmf.page, address, vma);
__SetPageUptodate(page);
} else {
......
......@@ -371,6 +371,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
__set_page_dirty_nobuffers(newpage);
}
mlock_migrate_page(newpage, page);
#ifdef CONFIG_SWAP
ClearPageSwapCache(page);
#endif
......
......@@ -8,10 +8,18 @@
#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include "internal.h"
int can_do_mlock(void)
{
......@@ -23,17 +31,360 @@ int can_do_mlock(void)
}
EXPORT_SYMBOL(can_do_mlock);
#ifdef CONFIG_UNEVICTABLE_LRU
/*
* Mlocked pages are marked with PageMlocked() flag for efficient testing
* in vmscan and, possibly, the fault path; and to support semi-accurate
* statistics.
*
* An mlocked page [PageMlocked(page)] is unevictable. As such, it will
* be placed on the LRU "unevictable" list, rather than the [in]active lists.
* The unevictable list is an LRU sibling list to the [in]active lists.
* PageUnevictable is set to indicate the unevictable state.
*
* When lazy mlocking via vmscan, it is important to ensure that the
* vma's VM_LOCKED status is not concurrently being modified, otherwise we
* may have mlocked a page that is being munlocked. So lazy mlock must take
* the mmap_sem for read, and verify that the vma really is locked
* (see mm/rmap.c).
*/
/*
* LRU accounting for clear_page_mlock()
*/
void __clear_page_mlock(struct page *page)
{
VM_BUG_ON(!PageLocked(page));
if (!page->mapping) { /* truncated ? */
return;
}
if (!isolate_lru_page(page)) {
putback_lru_page(page);
} else {
/*
* Page not on the LRU yet. Flush all pagevecs and retry.
*/
lru_add_drain_all();
if (!isolate_lru_page(page))
putback_lru_page(page);
}
}
/*
* Mark page as mlocked if not already.
* If page on LRU, isolate and putback to move to unevictable list.
*/
void mlock_vma_page(struct page *page)
{
BUG_ON(!PageLocked(page));
if (!TestSetPageMlocked(page) && !isolate_lru_page(page))
putback_lru_page(page);
}
/*
* called from munlock()/munmap() path with page supposedly on the LRU.
*
* Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
* [in try_to_munlock()] and then attempt to isolate the page. We must
* isolate the page to keep others from messing with its unevictable
* and mlocked state while trying to munlock. However, we pre-clear the
* mlocked state anyway as we might lose the isolation race and we might
* not get another chance to clear PageMlocked. If we successfully
* isolate the page and try_to_munlock() detects other VM_LOCKED vmas
* mapping the page, it will restore the PageMlocked state, unless the page
* is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
* perhaps redundantly.
* If we lose the isolation race, and the page is mapped by other VM_LOCKED
* vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
* either of which will restore the PageMlocked state by calling
* mlock_vma_page() above, if it can grab the vma's mmap sem.
*/
static void munlock_vma_page(struct page *page)
{
BUG_ON(!PageLocked(page));
if (TestClearPageMlocked(page) && !isolate_lru_page(page)) {
try_to_munlock(page);
putback_lru_page(page);
}
}
/*
* mlock a range of pages in the vma.
*
* This takes care of making the pages present too.
*
* vma->vm_mm->mmap_sem must be held for write.
*/
static int __mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start;
struct page *pages[16]; /* 16 gives a reasonable batch */
int write = !!(vma->vm_flags & VM_WRITE);
int nr_pages = (end - start) / PAGE_SIZE;
int ret;
VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK);
VM_BUG_ON(start < vma->vm_start || end > vma->vm_end);
VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
lru_add_drain_all(); /* push cached pages to LRU */
while (nr_pages > 0) {
int i;
cond_resched();
/*
* get_user_pages makes pages present if we are
* setting mlock, and this extra reference count will
* disable migration of this page. However, page may
* still be truncated out from under us.
*/
ret = get_user_pages(current, mm, addr,
min_t(int, nr_pages, ARRAY_SIZE(pages)),
write, 0, pages, NULL);
/*
* This can happen for, e.g., VM_NONLINEAR regions before
* a page has been allocated and mapped at a given offset,
* or for addresses that map beyond end of a file.
* We'll mlock the pages if/when they get faulted in.
*/
if (ret < 0)
break;
if (ret == 0) {
/*
* We know the vma is there, so the only time
* we cannot get a single page should be an
* error (ret < 0) case.
*/
WARN_ON(1);
break;
}
lru_add_drain(); /* push cached pages to LRU */
for (i = 0; i < ret; i++) {
struct page *page = pages[i];
lock_page(page);
/*
* Because we lock page here and migration is blocked
* by the elevated reference, we need only check for
* page truncation (file-cache only).
*/
if (page->mapping)
mlock_vma_page(page);
unlock_page(page);
put_page(page); /* ref from get_user_pages() */
/*
* here we assume that get_user_pages() has given us
* a list of virtually contiguous pages.
*/
addr += PAGE_SIZE; /* for next get_user_pages() */
nr_pages--;
}
}
lru_add_drain_all(); /* to update stats */
return 0; /* count entire vma as locked_vm */
}
/*
* private structure for munlock page table walk
*/
struct munlock_page_walk {
struct vm_area_struct *vma;
pmd_t *pmd; /* for migration_entry_wait() */
};
/*
* munlock normal pages for present ptes
*/
static int __munlock_pte_handler(pte_t *ptep, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct munlock_page_walk *mpw = walk->private;
swp_entry_t entry;
struct page *page;
pte_t pte;
retry:
pte = *ptep;
/*
* If it's a swap pte, we might be racing with page migration.
*/
if (unlikely(!pte_present(pte))) {
if (!is_swap_pte(pte))
goto out;
entry = pte_to_swp_entry(pte);
if (is_migration_entry(entry)) {
migration_entry_wait(mpw->vma->vm_mm, mpw->pmd, addr);
goto retry;
}
goto out;
}
page = vm_normal_page(mpw->vma, addr, pte);
if (!page)
goto out;
lock_page(page);
if (!page->mapping) {
unlock_page(page);
goto retry;
}
munlock_vma_page(page);
unlock_page(page);
out:
return 0;
}
/*
* Save pmd for pte handler for waiting on migration entries
*/
static int __munlock_pmd_handler(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct munlock_page_walk *mpw = walk->private;
mpw->pmd = pmd;
return 0;
}
/*
* munlock a range of pages in the vma using standard page table walk.
*
* vma->vm_mm->mmap_sem must be held for write.
*/
static void __munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
struct munlock_page_walk mpw = {
.vma = vma,
};
struct mm_walk munlock_page_walk = {
.pmd_entry = __munlock_pmd_handler,
.pte_entry = __munlock_pte_handler,
.private = &mpw,
.mm = mm,
};
VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK);
VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
VM_BUG_ON(start < vma->vm_start);
VM_BUG_ON(end > vma->vm_end);
lru_add_drain_all(); /* push cached pages to LRU */
walk_page_range(start, end, &munlock_page_walk);
lru_add_drain_all(); /* to update stats */
}
#else /* CONFIG_UNEVICTABLE_LRU */
/*
* Just make pages present if VM_LOCKED. No-op if unlocking.
*/
static int __mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
if (vma->vm_flags & VM_LOCKED)
make_pages_present(start, end);
return 0;
}
/*
* munlock a range of pages in the vma -- no-op.
*/
static void __munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
}
#endif /* CONFIG_UNEVICTABLE_LRU */
/*
* mlock all pages in this vma range. For mmap()/mremap()/...
*/
int mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
int nr_pages = (end - start) / PAGE_SIZE;
BUG_ON(!(vma->vm_flags & VM_LOCKED));
/*
* filter unlockable vmas
*/
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
goto no_mlock;
if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current)))
return __mlock_vma_pages_range(vma, start, end);
/*
* User mapped kernel pages or huge pages:
* make these pages present to populate the ptes, but
* fall thru' to reset VM_LOCKED--no need to unlock, and
* return nr_pages so these don't get counted against task's
* locked limit. huge pages are already counted against
* locked vm limit.
*/
make_pages_present(start, end);