Newer
Older
/*
* linux/mm/vmscan.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Swap reorganised 29.12.95, Stephen Tweedie.
* kswapd added: 7.1.96 sct
* Removed kswapd_ctl limits, and swap out as many pages as needed
* to bring the system back to freepages.high: 2.4.97, Rik van Riel.
* Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
* Multiqueue VM started 5.8.00, Rik van Riel.
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h> /* for try_to_release_page(),
buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include <linux/swapops.h>
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
/* This context's GFP mask */
/* Can pages be swapped as part of reclaim? */
int may_swap;
/* This context's SWAP_CLUSTER_MAX. If freeing memory for
* suspend, we effectively ignore SWAP_CLUSTER_MAX.
* In this context, it doesn't matter that we scan the
* whole list at once. */
int swap_cluster_max;
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
};
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
do { \
if ((_page)->lru.prev != _base) { \
struct page *prev; \
\
prev = lru_to_page(&(_page->lru)); \
prefetch(&prev->_field); \
} \
} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field) \
do { \
if ((_page)->lru.prev != _base) { \
struct page *prev; \
\
prev = lru_to_page(&(_page->lru)); \
prefetchw(&prev->_field); \
} \
} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
/*
* From 0 .. 100. Higher means more swappy.
*/
int vm_swappiness = 60;
long vm_total_pages; /* The total number of pages which the VM controls */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
/*
* Add a shrinker callback to be called from the vm
*/
void register_shrinker(struct shrinker *shrinker)
shrinker->nr = 0;
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
EXPORT_SYMBOL(register_shrinker);
void unregister_shrinker(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
/*
* Call the shrink functions to age shrinkable caches
*
* Here we assume it costs one seek to replace a lru page and that it also
* takes a seek to recreate a cache object. With this in mind we age equal
* percentages of the lru and ageable caches. This should balance the seeks
* generated by these structures.
*
* If the vm encountered mapped pages on the LRU it increase the pressure on
* slab to avoid swapping.
*
* We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
*
* `lru_pages' represents the number of on-LRU pages in all the zones which
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
*
* Returns the number of slab objects which we shrunk.
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
unsigned long lru_pages)
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem))
return 1; /* Assume we'll be able to shrink next time */
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
delta *= max_pass;
do_div(delta, lru_pages + 1);
shrinker->nr += delta;
if (shrinker->nr < 0) {
printk(KERN_ERR "%s: nr=%ld\n",
__FUNCTION__, shrinker->nr);
shrinker->nr = max_pass;
}
/*
* Avoid risking looping forever due to too large nr value:
* never try to free more than twice the estimate number of
* freeable entries.
*/
if (shrinker->nr > max_pass * 2)
shrinker->nr = max_pass * 2;
total_scan = shrinker->nr;
shrinker->nr = 0;
while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
int shrink_ret;
nr_before = (*shrinker->shrink)(0, gfp_mask);
shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
if (shrink_ret < nr_before)
ret += nr_before - shrink_ret;
count_vm_events(SLABS_SCANNED, this_scan);
total_scan -= this_scan;
cond_resched();
}
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
}
/* Called without lock on whether page is mapped, so answer is unstable */
static inline int page_mapping_inuse(struct page *page)
{
struct address_space *mapping;
/* Page is in somebody's page tables. */
if (page_mapped(page))
return 1;
/* Be more reluctant to reclaim swapcache than pagecache */
if (PageSwapCache(page))
return 1;
mapping = page_mapping(page);
if (!mapping)
return 0;
/* File is mmap'd by somebody? */
return mapping_mapped(mapping);
}
static inline int is_page_cache_freeable(struct page *page)
{
return page_count(page) - !!PagePrivate(page) == 2;
}
static int may_write_to_queue(struct backing_dev_info *bdi)
{
if (current->flags & PF_SWAPWRITE)
return 1;
if (!bdi_write_congested(bdi))
return 1;
if (bdi == current->backing_dev_info)
return 1;
return 0;
}
/*
* We detected a synchronous write error writing a page out. Probably
* -ENOSPC. We need to propagate that into the address_space for a subsequent
* fsync(), msync() or close().
*
* The tricky part is that after writepage we cannot touch the mapping: nothing
* prevents it from being freed up. But we have a ref on the page and once
* that page is locked, the mapping is pinned.
*
* We're allowed to run sleeping lock_page() here because we know the caller has
* __GFP_FS.
*/
static void handle_write_error(struct address_space *mapping,
struct page *page, int error)
{
lock_page(page);
if (page_mapping(page) == mapping)
mapping_set_error(mapping, error);
Andy Whitcroft
committed
/* Request for sync pageout. */
enum pageout_io {
PAGEOUT_IO_ASYNC,
PAGEOUT_IO_SYNC,
};
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
PAGE_KEEP,
/* move page to the active list, page is locked */
PAGE_ACTIVATE,
/* page has been sent to the disk successfully, page is unlocked */
PAGE_SUCCESS,
/* page is clean and locked */
PAGE_CLEAN,
} pageout_t;
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
Andy Whitcroft
committed
static pageout_t pageout(struct page *page, struct address_space *mapping,
enum pageout_io sync_writeback)
{
/*
* If the page is dirty, only perform writeback if that write
* will be non-blocking. To prevent this allocation from being
* stalled by pagecache activity. But note that there may be
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
* If this process is currently in generic_file_write() against
* this page's queue, we can perform writeback even if that
* will block.
*
* If the page is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
* See swapfile.c:page_queue_congested().
*/
if (!is_page_cache_freeable(page))
return PAGE_KEEP;
if (!mapping) {
/*
* Some data journaling orphaned pages can have
* page->mapping == NULL while being dirty with clean buffers.
*/
if (PagePrivate(page)) {
if (try_to_free_buffers(page)) {
ClearPageDirty(page);
printk("%s: orphaned page\n", __FUNCTION__);
return PAGE_CLEAN;
}
}
return PAGE_KEEP;
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
if (!may_write_to_queue(mapping->backing_dev_info))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
int res;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = SWAP_CLUSTER_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
.nonblocking = 1,
.for_reclaim = 1,
};
SetPageReclaim(page);
res = mapping->a_ops->writepage(page, &wbc);
if (res < 0)
handle_write_error(mapping, page, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}
Andy Whitcroft
committed
/*
* Wait on writeback if requested to. This happens when
* direct reclaiming a large contiguous area and the
* first attempt to free a range of pages fails.
*/
if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
wait_on_page_writeback(page);
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
return PAGE_CLEAN;
}
/*
* Attempt to detach a locked page from its ->mapping. If it is dirty or if
* someone else has a ref on the page, abort and return 0. If it was
* successfully detached, return 1. Assumes the caller has a single ref on
* this page.
*/
int remove_mapping(struct address_space *mapping, struct page *page)
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
write_lock_irq(&mapping->tree_lock);
/*
* The non racy check for a busy page.
*
* Must be careful with the order of the tests. When someone has
* a ref to the page, it may be possible that they dirty it then
* drop the reference. So if PageDirty is tested before page_count
* here, then the following race may occur:
*
* get_user_pages(&page);
* [user mapping goes away]
* write_to(page);
* !PageDirty(page) [good]
* SetPageDirty(page);
* put_page(page);
* !page_count(page) [good, discard it]
*
* [oops, our write_to data is lost]
*
* Reversing the order of the tests ensures such a situation cannot
* escape unnoticed. The smp_rmb is needed to ensure the page->flags
* load is not satisfied before that of page->_count.
*
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
*/
if (unlikely(page_count(page) != 2))
goto cannot_free;
smp_rmb();
if (unlikely(PageDirty(page)))
goto cannot_free;
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
write_unlock_irq(&mapping->tree_lock);
swap_free(swap);
__put_page(page); /* The pagecache ref */
return 1;
}
__remove_from_page_cache(page);
write_unlock_irq(&mapping->tree_lock);
__put_page(page);
return 1;
cannot_free:
write_unlock_irq(&mapping->tree_lock);
return 0;
}
* shrink_page_list() returns the number of reclaimed pages
static unsigned long shrink_page_list(struct list_head *page_list,
Andy Whitcroft
committed
struct scan_control *sc,
enum pageout_io sync_writeback)
{
LIST_HEAD(ret_pages);
struct pagevec freed_pvec;
int pgactivate = 0;
cond_resched();
pagevec_init(&freed_pvec, 1);
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
int may_enter_fs;
int referenced;
cond_resched();
page = lru_to_page(page_list);
list_del(&page->lru);
if (TestSetPageLocked(page))
goto keep;
Christoph Lameter
committed
if (!sc->may_swap && page_mapped(page))
goto keep_locked;
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
Andy Whitcroft
committed
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
if (PageWriteback(page)) {
/*
* Synchronous reclaim is performed in two passes,
* first an asynchronous pass over the list to
* start parallel writeback, and a second synchronous
* pass to wait for the IO to complete. Wait here
* for any page for which writeback has already
* started.
*/
if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
wait_on_page_writeback(page);
else
goto keep_locked;
}
referenced = page_referenced(page, 1);
/* In active use or really unfreeable? Activate it. */
if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
referenced && page_mapping_inuse(page))
goto activate_locked;
#ifdef CONFIG_SWAP
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
Christoph Lameter
committed
if (PageAnon(page) && !PageSwapCache(page))
if (!add_to_swap(page, GFP_ATOMIC))
goto activate_locked;
#endif /* CONFIG_SWAP */
mapping = page_mapping(page);
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
switch (try_to_unmap(page, 0)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
case SWAP_SUCCESS:
; /* try to free the page below */
}
}
if (PageDirty(page)) {
if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
if (!sc->may_writepage)
goto keep_locked;
/* Page is dirty, try to write it out here */
Andy Whitcroft
committed
switch (pageout(page, mapping, sync_writeback)) {
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
if (PageWriteback(page) || PageDirty(page))
goto keep;
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
*/
if (TestSetPageLocked(page))
goto keep;
if (PageDirty(page) || PageWriteback(page))
goto keep_locked;
mapping = page_mapping(page);
case PAGE_CLEAN:
; /* try to free the page below */
}
}
/*
* If the page has buffers, try to free the buffer mappings
* associated with this page. If we succeed we try to free
* the page as well.
*
* We do this even if the page is PageDirty().
* try_to_release_page() does not perform I/O, but it is
* possible for a page to have PageDirty set, but it is actually
* clean (all its buffers are clean). This happens if the
* buffers were written out directly, with submit_bh(). ext3
* will do this, as well as the blockdev mapping.
* try_to_release_page() will discover that cleanness and will
* drop the buffers and mark the page clean - it can be freed.
*
* Rarely, pages can have buffers and no ->mapping. These are
* the pages which were not successfully invalidated in
* truncate_complete_page(). We try to drop those buffers here
* and if that worked, and the page is no longer mapped into
* process address space (page_count == 1) it can be freed.
* Otherwise, leave the page on the LRU so it is swappable.
*/
if (PagePrivate(page)) {
if (!try_to_release_page(page, sc->gfp_mask))
goto activate_locked;
if (!mapping && page_count(page) == 1)
goto free_it;
}
if (!mapping || !remove_mapping(mapping, page))
goto keep_locked;
if (!pagevec_add(&freed_pvec, page))
__pagevec_release_nonlru(&freed_pvec);
continue;
activate_locked:
SetPageActive(page);
pgactivate++;
keep_locked:
unlock_page(page);
keep:
list_add(&page->lru, &ret_pages);
}
list_splice(&ret_pages, page_list);
if (pagevec_count(&freed_pvec))
__pagevec_release_nonlru(&freed_pvec);
count_vm_events(PGACTIVATE, pgactivate);
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
/* LRU Isolation modes. */
#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
/*
* Attempt to remove the specified page from its LRU. Only take this page
* if it is of the appropriate PageActive status. Pages which are being
* freed elsewhere are also ignored.
*
* page: page to consider
* mode: one of the LRU isolation modes defined above
*
* returns 0 on success, -ve errno on failure.
*/
static int __isolate_lru_page(struct page *page, int mode)
{
int ret = -EINVAL;
/* Only take pages on the LRU. */
if (!PageLRU(page))
return ret;
/*
* When checking the active state, we need to be sure we are
* dealing with comparible boolean values. Take the logical not
* of each.
*/
if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
return ret;
ret = -EBUSY;
if (likely(get_page_unless_zero(page))) {
/*
* Be careful not to clear PageLRU until after we're
* sure the page is not being freed elsewhere -- the
* page release code relies on it.
*/
ClearPageLRU(page);
ret = 0;
}
return ret;
}
/*
* zone->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
* For pagecache intensive workloads, this function is the hottest
* spot in the kernel (apart from copy_*_user functions).
*
* Appropriate locks must be held before calling this function.
*
* @nr_to_scan: The number of pages to look through on the list.
* @src: The LRU list to pull pages off.
* @dst: The temp list to put pages on to.
* @scanned: The number of pages that were scanned.
* @order: The caller's attempted allocation order
* @mode: One of the LRU isolation modes
*
* returns how many pages were moved onto *@dst.
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct list_head *src, struct list_head *dst,
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
struct page *page;
unsigned long pfn;
unsigned long end_pfn;
unsigned long page_pfn;
int zone_id;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
switch (__isolate_lru_page(page, mode)) {
case 0:
list_move(&page->lru, dst);
break;
case -EBUSY:
/* else it is being freed elsewhere */
list_move(&page->lru, src);
continue;
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
default:
BUG();
}
if (!order)
continue;
/*
* Attempt to take all pages in the order aligned region
* surrounding the tag page. Only take those pages of
* the same active state as that tag page. We may safely
* round the target page pfn down to the requested order
* as the mem_map is guarenteed valid out to MAX_ORDER,
* where that page is in a different zone we will detect
* it from its zone id and abort this block scan.
*/
zone_id = page_zone_id(page);
page_pfn = page_to_pfn(page);
pfn = page_pfn & ~((1 << order) - 1);
end_pfn = pfn + (1 << order);
for (; pfn < end_pfn; pfn++) {
struct page *cursor_page;
/* The target page is in the block, ignore it. */
if (unlikely(pfn == page_pfn))
continue;
/* Avoid holes within the zone. */
if (unlikely(!pfn_valid_within(pfn)))
break;
cursor_page = pfn_to_page(pfn);
/* Check that we have not crossed a zone boundary. */
if (unlikely(page_zone_id(cursor_page) != zone_id))
continue;
switch (__isolate_lru_page(cursor_page, mode)) {
case 0:
list_move(&cursor_page->lru, dst);
nr_taken++;
scan++;
break;
case -EBUSY:
/* else it is being freed elsewhere */
list_move(&cursor_page->lru, src);
default:
break;
}
}
}
*scanned = scan;
return nr_taken;
}
/*
* clear_active_flags() is a helper for shrink_active_list(), clearing
* any active bits from the pages in the list.
*/
static unsigned long clear_active_flags(struct list_head *page_list)
{
int nr_active = 0;
struct page *page;
list_for_each_entry(page, page_list, lru)
if (PageActive(page)) {
ClearPageActive(page);
nr_active++;
}
return nr_active;
}
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
static unsigned long shrink_inactive_list(unsigned long max_scan,
struct zone *zone, struct scan_control *sc)
pagevec_init(&pvec, 1);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
unsigned long nr_taken;
unsigned long nr_scan;
unsigned long nr_freed;
nr_taken = isolate_lru_pages(sc->swap_cluster_max,
&zone->inactive_list,
&page_list, &nr_scan, sc->order,
(sc->order > PAGE_ALLOC_COSTLY_ORDER)?
ISOLATE_BOTH : ISOLATE_INACTIVE);
nr_active = clear_active_flags(&page_list);
Andy Whitcroft
committed
__count_vm_events(PGDEACTIVATE, nr_active);
__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
__mod_zone_page_state(zone, NR_INACTIVE,
-(nr_taken - nr_active));
zone->pages_scanned += nr_scan;
spin_unlock_irq(&zone->lru_lock);
Andy Whitcroft
committed
nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
/*
* If we are direct reclaiming for contiguous pages and we do
* not reclaim everything in the list, try again and wait
* for IO to complete. This will stall high-order allocations
* but that should be acceptable to the caller
*/
if (nr_freed < nr_taken && !current_is_kswapd() &&
sc->order > PAGE_ALLOC_COSTLY_ORDER) {
congestion_wait(WRITE, HZ/10);
/*
* The attempt at page out may have made some
* of the pages active, mark them inactive again.
*/
nr_active = clear_active_flags(&page_list);
count_vm_events(PGDEACTIVATE, nr_active);
nr_freed += shrink_page_list(&page_list, sc,
PAGEOUT_IO_SYNC);
}
local_irq_disable();
if (current_is_kswapd()) {
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
__count_vm_events(KSWAPD_STEAL, nr_freed);
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
__count_zone_vm_events(PGSTEAL, zone, nr_freed);
if (nr_taken == 0)
goto done;
/*
* Put back any unfreeable pages.
*/
while (!list_empty(&page_list)) {
page = lru_to_page(&page_list);
list_del(&page->lru);
if (PageActive(page))
add_page_to_active_list(zone, page);
else
add_page_to_inactive_list(zone, page);
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
spin_unlock(&zone->lru_lock);
local_irq_enable();
/*
* We are about to scan this zone at a certain priority level. If that priority
* level is smaller (ie: more urgent) than the previous priority, then note
* that priority level within the zone. This is done so that when the next
* process comes in to scan this zone, it will immediately start out at this
* priority level rather than having to build up its own scanning priority.
* Here, this priority affects only the reclaim-mapped threshold.
*/
static inline void note_zone_scanning_priority(struct zone *zone, int priority)
{
if (priority < zone->prev_priority)
zone->prev_priority = priority;
}
static inline int zone_is_near_oom(struct zone *zone)
{
return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE)
+ zone_page_state(zone, NR_INACTIVE))*3;
/*
* This moves pages from the active list to the inactive list.
*
* We move them the other way if the page is referenced by one or more
* processes, from rmap.
*
* If the pages are mostly unmapped, the processing is fast and it is
* appropriate to hold zone->lru_lock across the whole operation. But if
* the pages are mapped, the processing is slow (page_referenced()) so we
* should drop zone->lru_lock around each page. It's impossible to balance
* this, so instead we remove the pages from the LRU while processing them.
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
*
* The downside is that we have to touch page->_count against each page.
* But we had to alter page->flags anyway.
*/
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
struct scan_control *sc, int priority)
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
LIST_HEAD(l_active); /* Pages to go onto the active_list */
struct page *page;
struct pagevec pvec;
int reclaim_mapped = 0;
Christoph Lameter
committed
if (sc->may_swap) {
long mapped_ratio;
long distress;
long swap_tendency;
if (zone_is_near_oom(zone))
goto force_reclaim_mapped;
/*
* `distress' is a measure of how much trouble we're having
* reclaiming pages. 0 -> no problems. 100 -> great trouble.
*/
distress = 100 >> min(zone->prev_priority, priority);
/*
* The point of this algorithm is to decide when to start
* reclaiming mapped memory instead of just pagecache. Work out
* how much memory
* is mapped.
*/
mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
global_page_state(NR_ANON_PAGES)) * 100) /
Christoph Lameter
committed
vm_total_pages;
/*
* Now decide how much we really want to unmap some pages. The
* mapped ratio is downgraded - just because there's a lot of
* mapped memory doesn't necessarily mean that page reclaim
* isn't succeeding.
*
* The distress ratio is important - we don't want to start
* going oom.
*
* A 100% value of vm_swappiness overrides this algorithm
* altogether.
*/
swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* If there's huge imbalance between active and inactive
* (think active 100 times larger than inactive) we should
* become more permissive, or the system will take too much
* cpu before it start swapping during memory pressure.
* Distress is about avoiding early-oom, this is about
* making swappiness graceful despite setting it to low
* values.
*
* Avoid div by zero with nr_inactive+1, and max resulting
* value is vm_total_pages.
*/
imbalance = zone_page_state(zone, NR_ACTIVE);
imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
/*
* Reduce the effect of imbalance if swappiness is low,
* this means for a swappiness very low, the imbalance
* must be much higher than 100 for this logic to make
* the difference.
*
* Max temporary value is vm_total_pages*100.
*/
imbalance *= (vm_swappiness + 1);
imbalance /= 100;
/*
* If not much of the ram is mapped, makes the imbalance
* less relevant, it's high priority we refill the inactive
* list with mapped pages only in presence of high ratio of
* mapped pages.