Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/*
* linux/mm/memory.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*/
/*
* demand-loading started 01.12.91 - seems it is high on the list of
* things wanted, and it should be easy to implement. - Linus
*/
/*
* Ok, demand-loading was easy, shared pages a little bit tricker. Shared
* pages started 02.12.91, seems to work. - Linus.
*
* Tested sharing by executing about 30 /bin/sh: under the old kernel it
* would have taken more than the 6M I have free, but it worked well as
* far as I could see.
*
* Also corrected some "invalidate()"s - I wasn't doing enough of them.
*/
/*
* Real VM (paging to/from disk) started 18.12.91. Much more work and
* thought has to go into this. Oh, well..
* 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
* Found it. Everything seems to work now.
* 20.12.91 - Ok, making the swap-device changeable like the root.
*/
/*
* 05.04.94 - Multi-page memory management added for v1.1.
* Idea by Alex Bligh (alex@cconcepts.co.uk)
*
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
* (Gerhard.Wichert@pdb.siemens.de)
*
* Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
*/
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/init.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <linux/swapops.h>
#include <linux/elf.h>
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;
EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif
unsigned long num_physpages;
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, then end
* of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
* highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
* and ZONE_HIGHMEM.
*/
void * high_memory;
unsigned long vmalloc_earlyreserve;
EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
EXPORT_SYMBOL(vmalloc_earlyreserve);
/*
* If a p?d_bad entry is found while walking page tables, report
* the error, before resetting entry to p?d_none. Usually (but
* very seldom) called out from the p?d_none_or_clear_bad macros.
*/
void pgd_clear_bad(pgd_t *pgd)
{
pgd_ERROR(*pgd);
pgd_clear(pgd);
}
void pud_clear_bad(pud_t *pud)
{
pud_ERROR(*pud);
pud_clear(pud);
}
void pmd_clear_bad(pmd_t *pmd)
{
pmd_ERROR(*pmd);
pmd_clear(pmd);
}
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
struct page *page = pmd_page(*pmd);
pmd_clear(pmd);
pte_free_tlb(tlb, page);
dec_page_state(nr_page_table_pages);
tlb->mm->nr_ptes--;
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
start &= PUD_MASK;
if (start < floor)
return;
if (ceiling) {
ceiling &= PUD_MASK;
if (!ceiling)
return;
if (end - 1 > ceiling - 1)
return;
pmd = pmd_offset(pud, start);
pud_clear(pud);
pmd_free_tlb(tlb, pmd);
static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
free_pmd_range(tlb, pud, addr, next, floor, ceiling);
start &= PGDIR_MASK;
if (start < floor)
return;
if (ceiling) {
ceiling &= PGDIR_MASK;
if (!ceiling)
return;
if (end - 1 > ceiling - 1)
return;
pud = pud_offset(pgd, start);
pgd_clear(pgd);
pud_free_tlb(tlb, pud);
* This function frees user-level page tables of a process.
*
void free_pgd_range(struct mmu_gather **tlb,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
unsigned long start;
/*
* The next few lines have given us lots of grief...
*
* Why are we testing PMD* at this top level? Because often
* there will be no work to do at all, and we'd prefer not to
* go all the way down to the bottom just to discover that.
*
* Why all these "- 1"s? Because 0 represents both the bottom
* of the address space and the top of it (using -1 for the
* top wouldn't help much: the masks would do the wrong thing).
* The rule is that addr 0 and floor 0 refer to the bottom of
* the address space, but end 0 and ceiling 0 refer to the top
* Comparisons need to use "end - 1" and "ceiling - 1" (though
* that end 0 case should be mythical).
*
* Wherever addr is brought up or ceiling brought down, we must
* be careful to reject "the opposite 0" before it confuses the
* subsequent tests. But what about where end is brought down
* by PMD_SIZE below? no, end can't go down to 0 there.
*
* Whereas we round start (addr) and ceiling down, by different
* masks at different levels, in order to test whether a table
* now has no other vmas using it, so can be freed, we don't
* bother to round floor or end up - the tests don't need that.
*/
addr &= PMD_MASK;
if (addr < floor) {
addr += PMD_SIZE;
if (!addr)
return;
}
if (ceiling) {
ceiling &= PMD_MASK;
if (!ceiling)
return;
}
if (end - 1 > ceiling - 1)
end -= PMD_SIZE;
if (addr > end - 1)
return;
start = addr;
pgd = pgd_offset((*tlb)->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
flush_tlb_pgtables((*tlb)->mm, start, end);
}
void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
unsigned long floor, unsigned long ceiling)
{
while (vma) {
struct vm_area_struct *next = vma->vm_next;
unsigned long addr = vma->vm_start;
if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
} else {
/*
* Optimization: gather nearby vmas into one call down
*/
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_hugepage_only_range(vma->vm_mm, next->vm_start,
HPAGE_SIZE)) {
vma = next;
next = vma->vm_next;
}
free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
}
pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
unsigned long address)
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
{
if (!pmd_present(*pmd)) {
struct page *new;
spin_unlock(&mm->page_table_lock);
new = pte_alloc_one(mm, address);
spin_lock(&mm->page_table_lock);
if (!new)
return NULL;
/*
* Because we dropped the lock, we should re-check the
* entry, as somebody else could have populated it..
*/
if (pmd_present(*pmd)) {
pte_free(new);
goto out;
}
mm->nr_ptes++;
inc_page_state(nr_page_table_pages);
pmd_populate(mm, pmd, new);
}
out:
return pte_offset_map(pmd, address);
}
pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
if (!pmd_present(*pmd)) {
pte_t *new;
spin_unlock(&mm->page_table_lock);
new = pte_alloc_one_kernel(mm, address);
spin_lock(&mm->page_table_lock);
if (!new)
return NULL;
/*
* Because we dropped the lock, we should re-check the
* entry, as somebody else could have populated it..
*/
if (pmd_present(*pmd)) {
pte_free_kernel(new);
goto out;
}
pmd_populate_kernel(mm, pmd, new);
}
out:
return pte_offset_kernel(pmd, address);
}
static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
if (file_rss)
add_mm_counter(mm, file_rss, file_rss);
if (anon_rss)
add_mm_counter(mm, anon_rss, anon_rss);
}
/*
* This function is called to print an error when a pte in a
* !VM_RESERVED region is found pointing to an invalid pfn (which
* is an error.
*
* The calling function must still handle the error.
*/
void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
{
printk(KERN_ERR "Bad pte = %08llx, process = %s, "
"vm_flags = %lx, vaddr = %lx\n",
(long long)pte_val(pte),
(vma->vm_mm == current->mm ? current->comm : "???"),
vma->vm_flags, vaddr);
dump_stack();
}
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
*
* dst->page_table_lock is held on entry and exit,
* but may be dropped within p[mg]d_alloc() and pte_alloc_map().
*/
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
pte_t pte = *src_pte;
struct page *page;
unsigned long pfn;
/* pte contains position in swap or file, so copy. */
if (unlikely(!pte_present(pte))) {
if (!pte_file(pte)) {
swap_duplicate(pte_to_swp_entry(pte));
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
spin_lock(&mmlist_lock);
list_add(&dst_mm->mmlist, &src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
}
/* If the region is VM_RESERVED, the mapping is not
* mapped via rmap - duplicate the pte as is.
*/
if (vm_flags & VM_RESERVED)
goto out_set_pte;
/* If the pte points outside of valid memory but
* the region is not VM_RESERVED, we have a problem.
if (unlikely(!pfn_valid(pfn))) {
print_bad_pte(vma, pte, addr);
goto out_set_pte; /* try to do something sane */
}
/*
* If it's a COW mapping, write protect it both
* in the parent and the child
*/
if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) {
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = *src_pte;
}
/*
* If it's a shared mapping, mark it clean in
* the child
*/
if (vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
get_page(page);
page_dup_rmap(page);
out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
}
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
pte_t *src_pte, *dst_pte;
dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
if (!dst_pte)
return -ENOMEM;
src_pte = pte_offset_map_nested(src_pmd, addr);
spin_lock(&src_mm->page_table_lock);
do {
/*
* We are holding two locks at this point - either of them
* could generate latencies in another task on another CPU.
*/
if (progress >= 32) {
progress = 0;
if (need_resched() ||
need_lockbreak(&src_mm->page_table_lock) ||
need_lockbreak(&dst_mm->page_table_lock))
break;
}
if (pte_none(*src_pte)) {
progress++;
continue;
}
copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
spin_unlock(&src_mm->page_table_lock);
pte_unmap_nested(src_pte - 1);
pte_unmap(dst_pte - 1);
add_mm_rss(dst_mm, rss[0], rss[1]);
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
cond_resched_lock(&dst_mm->page_table_lock);
if (addr != end)
goto again;
return 0;
}
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
pmd_t *src_pmd, *dst_pmd;
unsigned long next;
dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
if (!dst_pmd)
return -ENOMEM;
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
vma, addr, next))
return -ENOMEM;
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return 0;
}
static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
pud_t *src_pud, *dst_pud;
unsigned long next;
dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
if (!dst_pud)
return -ENOMEM;
src_pud = pud_offset(src_pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(src_pud))
continue;
if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
vma, addr, next))
return -ENOMEM;
} while (dst_pud++, src_pud++, addr = next, addr != end);
return 0;
}
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct vm_area_struct *vma)
{
pgd_t *src_pgd, *dst_pgd;
unsigned long next;
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
/*
* Don't copy ptes where a page fault will fill them correctly.
* Fork becomes much lighter when there are big shared or private
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) {
if (!vma->anon_vma)
return 0;
}
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
dst_pgd = pgd_offset(dst_mm, addr);
src_pgd = pgd_offset(src_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(src_pgd))
continue;
if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
vma, addr, next))
return -ENOMEM;
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
return 0;
}
static void zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
int file_rss = 0;
int anon_rss = 0;
pte = pte_offset_map(pmd, addr);
do {
pte_t ptent = *pte;
if (pte_none(ptent))
continue;
if (pte_present(ptent)) {
struct page *page = NULL;
if (!(vma->vm_flags & VM_RESERVED)) {
unsigned long pfn = pte_pfn(ptent);
if (unlikely(!pfn_valid(pfn)))
print_bad_pte(vma, ptent, addr);
else
page = pfn_to_page(pfn);
}
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
* invalidate cache without truncating:
* unmap shared but keep private pages.
*/
if (details->check_mapping &&
details->check_mapping != page->mapping)
continue;
/*
* Each page->index must be checked when
* invalidating or truncating nonlinear.
*/
if (details->nonlinear_vma &&
(page->index < details->first_index ||
page->index > details->last_index))
continue;
}
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
if (unlikely(details) && details->nonlinear_vma
&& linear_page_index(details->nonlinear_vma,
addr) != page->index)
else {
if (pte_dirty(ptent))
set_page_dirty(page);
if (pte_young(ptent))
mark_page_accessed(page);
page_remove_rmap(page);
tlb_remove_page(tlb, page);
continue;
}
/*
* If details->check_mapping, we leave swap entries;
* if details->nonlinear_vma, we leave file entries.
*/
if (unlikely(details))
continue;
if (!pte_file(ptent))
free_swap_and_cache(pte_to_swp_entry(ptent));
pte_clear_full(mm, addr, pte, tlb->fullmm);
static inline void zap_pmd_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
pmd_t *pmd;
unsigned long next;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
zap_pte_range(tlb, vma, pmd, addr, next, details);
static inline void zap_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
pud_t *pud;
unsigned long next;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
zap_pmd_range(tlb, vma, pud, addr, next, details);
} while (pud++, addr = next, addr != end);
}
static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
if (details && !details->check_mapping && !details->nonlinear_vma)
details = NULL;
BUG_ON(addr >= end);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
zap_pud_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
}
#ifdef CONFIG_PREEMPT
# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
#else
/* No preempt: go for improved straight-line efficiency */
# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
#endif
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlbp: address of the caller's struct mmu_gather
* @mm: the controlling mm_struct
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
* @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
* @details: details of nonlinear truncation or shared cache invalidation
*
* Returns the end address of the unmapping (restart addr if interrupted).
*
* Unmap all pages in the vma list. Called under page_table_lock.
*
* We aim to not hold page_table_lock for too long (for scheduling latency
* reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
* return the ending mmu_gather to the caller.
*
* Only addresses between `start' and `end' will be unmapped.
*
* The VMA list must be sorted in ascending virtual address order.
*
* unmap_vmas() assumes that the caller will flush the whole unmapped address
* range after unmap_vmas() returns. So the only responsibility here is to
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
unsigned long zap_bytes = ZAP_BLOCK_SIZE;
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
unsigned long end;
start = max(vma->vm_start, start_addr);
if (start >= vma->vm_end)
continue;
end = min(vma->vm_end, end_addr);
if (end <= vma->vm_start)
continue;
if (vma->vm_flags & VM_ACCOUNT)
*nr_accounted += (end - start) >> PAGE_SHIFT;
while (start != end) {
unsigned long block;
if (!tlb_start_valid) {
tlb_start = start;
tlb_start_valid = 1;
}
if (is_vm_hugetlb_page(vma)) {
block = end - start;
unmap_hugepage_range(vma, start, end);
} else {
block = min(zap_bytes, end - start);
unmap_page_range(*tlbp, vma, start,
start + block, details);
}
start += block;
zap_bytes -= block;
if ((long)zap_bytes > 0)
continue;
tlb_finish_mmu(*tlbp, tlb_start, start);
if (need_resched() ||
need_lockbreak(&mm->page_table_lock) ||
(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
if (i_mmap_lock) {
/* must reset count of rss freed */
*tlbp = tlb_gather_mmu(mm, fullmm);
goto out;
}
spin_unlock(&mm->page_table_lock);
cond_resched();
spin_lock(&mm->page_table_lock);
}
*tlbp = tlb_gather_mmu(mm, fullmm);
tlb_start_valid = 0;
zap_bytes = ZAP_BLOCK_SIZE;
}
}
out:
return start; /* which is now the end (or restart) address */
}
/**
* zap_page_range - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
* @address: starting address of pages to zap
* @size: number of bytes to zap
* @details: details of nonlinear truncation or shared cache invalidation
*/
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather *tlb;
unsigned long end = address + size;
unsigned long nr_accounted = 0;
if (is_vm_hugetlb_page(vma)) {
zap_hugepage_range(vma, address, size);
}
lru_add_drain();
spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
tlb_finish_mmu(tlb, address, end);
spin_unlock(&mm->page_table_lock);
}
/*
* Do a quick page-table lookup for a single page.
* mm->page_table_lock must be held.
*/
static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
int read, int write, int accessed)
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
unsigned long pfn;
struct page *page;
page = follow_huge_addr(mm, address, write);
if (! IS_ERR(page))
return page;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
goto out;
pud = pud_offset(pgd, address);
if (pud_none(*pud) || unlikely(pud_bad(*pud)))
goto out;
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
if (pmd_huge(*pmd))
return follow_huge_pmd(mm, address, pmd, write);
ptep = pte_offset_map(pmd, address);
if (!ptep)
goto out;
pte = *ptep;
pte_unmap(ptep);
if (pte_present(pte)) {
goto out;
if (read && !pte_read(pte))
goto out;
pfn = pte_pfn(pte);
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
if (accessed) {
if (write && !pte_dirty(pte) &&!PageDirty(page))
set_page_dirty(page);
mark_page_accessed(page);
return page;
}
}
out:
return NULL;
}
follow_page(struct mm_struct *mm, unsigned long address, int write)
{
return __follow_page(mm, address, 0, write, 1);
/*
* check_user_page_readable() can be called frm niterrupt context by oprofile,
* so we need to avoid taking any non-irq-safe locks
*/
int check_user_page_readable(struct mm_struct *mm, unsigned long address)
return __follow_page(mm, address, 1, 0, 0) != NULL;
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
}
EXPORT_SYMBOL(check_user_page_readable);
static inline int
untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
unsigned long address)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
/* Check if the vma is for an anonymous mapping. */
if (vma->vm_ops && vma->vm_ops->nopage)
return 0;
/* Check if page directory entry exists. */
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
return 1;
pud = pud_offset(pgd, address);
if (pud_none(*pud) || unlikely(pud_bad(*pud)))
return 1;
/* Check if page middle directory entry exists. */
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
return 1;
/* There is a pte slot for 'address' in 'mm'. */
return 0;
}
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
{
int i;
unsigned int flags;
/*
* Require read or write permissions.
* If 'force' is set, we only require the "MAY" flags.
*/
flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
i = 0;
do {
struct vm_area_struct * vma;
vma = find_extend_vma(mm, start);
if (!vma && in_gate_area(tsk, start)) {
unsigned long pg = start & PAGE_MASK;
struct vm_area_struct *gate_vma = get_gate_vma(tsk);
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
if (write) /* user gate pages are read-only */
return i ? : -EFAULT;
if (pg > TASK_SIZE)
pgd = pgd_offset_k(pg);
else
pgd = pgd_offset_gate(mm, pg);
BUG_ON(pgd_none(*pgd));
pud = pud_offset(pgd, pg);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, pg);
if (pmd_none(*pmd))
return i ? : -EFAULT;
if (pte_none(*pte)) {
pte_unmap(pte);
return i ? : -EFAULT;
}
if (pages) {
pages[i] = pte_page(*pte);
get_page(pages[i]);
}
pte_unmap(pte);
if (vmas)
vmas[i] = gate_vma;
i++;
start += PAGE_SIZE;
len--;
continue;
}
if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))