/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	unsigned long hibernation_mode;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	int may_writepage;

	/* Can mapped pages be reclaimed? */
	int may_unmap;

	/* Can pages be swapped as part of reclaim? */
	int may_swap;

	int order;

	/* Scan (total_size >> priority) pages at once */
	int priority;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t	*nodemask;
};
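
/*
 * Example (illustrative, not a definition from this file): callers fill in
 * only the fields they care about and rely on zero-initialization for the
 * rest.  A minimal setup might look like:
 *
 *	struct scan_control sc = {
 *		.gfp_mask	= GFP_KERNEL,
 *		.priority	= DEF_PRIORITY,
 *		.may_writepage	= 1,
 *		.may_unmap	= 1,
 *		.may_swap	= 1,
 *		.nr_to_reclaim	= SWAP_CLUSTER_MAX,
 *	};
 *
 * Compare reclaim_clean_pages_from_list() below for a real in-tree user.
 */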

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;	/* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup;
}
#else
static bool global_reclaim(struct scan_control *sc)
{
	return true;
}
#endif

static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	if (!mem_cgroup_disabled())
		return mem_cgroup_get_lru_size(lruvec, lru);

	return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
}

/*
 * Add a shrinker callback to be called from the vm
 */
void register_shrinker(struct shrinker *shrinker)
{
	atomic_long_set(&shrinker->nr_in_batch, 0);
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(unregister_shrinker);

static inline int do_shrinker_shrink(struct shrinker *shrinker,
				     struct shrink_control *sc,
				     unsigned long nr_to_scan)
{
	sc->nr_to_scan = nr_to_scan;
	return (*shrinker->shrink)(shrinker, sc);
}
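
/*
 * Example (illustrative sketch, not taken from this file): a cache that
 * wants its objects aged by reclaim supplies a ->shrink() callback and a
 * seek cost, then registers itself.  The callback reports the object count
 * when sc->nr_to_scan is 0 and otherwise frees up to that many objects:
 *
 *	static int my_cache_shrink(struct shrinker *s, struct shrink_control *sc)
 *	{
 *		if (sc->nr_to_scan)
 *			my_cache_evict(sc->nr_to_scan);	// hypothetical helper
 *		return my_cache_count();		// hypothetical helper
 *	}
 *
 *	static struct shrinker my_shrinker = {
 *		.shrink	= my_cache_shrink,
 *		.seeks	= DEFAULT_SEEKS,
 *	};
 *
 *	register_shrinker(&my_shrinker);
 */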

#define SHRINK_BATCH 128
/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(struct shrink_control *shrink,
			  unsigned long nr_pages_scanned,
			  unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long ret = 0;

	if (nr_pages_scanned == 0)
		nr_pages_scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem)) {
		/* Assume we'll be able to shrink next time */
		ret = 1;
		goto out;
	}

	list_for_each_entry(shrinker, &shrinker_list, list) {
		unsigned long long delta;
		long total_scan;
		long max_pass;
		int shrink_ret = 0;
		long nr;
		long new_nr;
		long batch_size = shrinker->batch ? shrinker->batch
						  : SHRINK_BATCH;

		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
		if (max_pass <= 0)
			continue;

		/*
		 * copy the current shrinker scan count into a local variable
		 * and zero it so that other concurrent shrinker invocations
		 * don't also do this scanning work.
		 */
		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);

		total_scan = nr;
		delta = (4 * nr_pages_scanned) / shrinker->seeks;
		delta *= max_pass;
		do_div(delta, lru_pages + 1);
		total_scan += delta;
		if (total_scan < 0) {
			printk(KERN_ERR "shrink_slab: %pF negative objects to "
			       "delete nr=%ld\n",
			       shrinker->shrink, total_scan);
			total_scan = max_pass;
		}

		/*
		 * We need to avoid excessive windup on filesystem shrinkers
		 * due to large numbers of GFP_NOFS allocations causing the
		 * shrinkers to return -1 all the time. This results in a large
		 * nr being built up so when a shrink that can do some work
		 * comes along it empties the entire cache due to nr >>>
		 * max_pass.  This is bad for sustaining a working set in
		 * memory.
		 *
		 * Hence only allow the shrinker to scan the entire cache when
		 * a large delta change is calculated directly.
		 */
		if (delta < max_pass / 4)
			total_scan = min(total_scan, max_pass / 2);

		/*
		 * Avoid risking looping forever due to too large nr value:
		 * never try to free more than twice the estimated number of
		 * freeable entries.
		 */
		if (total_scan > max_pass * 2)
			total_scan = max_pass * 2;

		trace_mm_shrink_slab_start(shrinker, shrink, nr,
					nr_pages_scanned, lru_pages,
					max_pass, delta, total_scan);

		while (total_scan >= batch_size) {
			int nr_before;

			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
			shrink_ret = do_shrinker_shrink(shrinker, shrink,
							batch_size);
			if (shrink_ret == -1)
				break;
			if (shrink_ret < nr_before)
				ret += nr_before - shrink_ret;
			count_vm_events(SLABS_SCANNED, batch_size);
			total_scan -= batch_size;

			cond_resched();
		}

		/*
		 * move the unused scan count back into the shrinker in a
		 * manner that handles concurrent updates. If we exhausted the
		 * scan, there is no need to do an update.
		 */
		if (total_scan > 0)
			new_nr = atomic_long_add_return(total_scan,
					&shrinker->nr_in_batch);
		else
			new_nr = atomic_long_read(&shrinker->nr_in_batch);

		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
	}
	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return ret;
}
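
/*
 * Worked example of the scan-pressure arithmetic above (illustrative, with
 * round numbers): nr_pages_scanned = 1000, shrinker->seeks = DEFAULT_SEEKS
 * (2), max_pass = 10000 objects, lru_pages = 100000:
 *
 *	delta = (4 * 1000) / 2         = 2000
 *	delta = 2000 * 10000           = 20000000
 *	delta = 20000000 / (100000 + 1) ~= 200
 *
 * so about 2% of the cache is asked to be scanned, i.e. (4 / seeks) times
 * the ~1% of the LRU that was scanned, keeping slab aging roughly in step
 * with page aging.
 */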

static inline int is_page_cache_freeable(struct page *page)
{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache radix tree and
	 * optional buffer heads at page->private.
	 */
	return page_count(page) - page_has_private(page) == 2;
}
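
/*
 * For example (illustrative): an isolated, clean page cache page has
 * page_count() == 2 (the isolating caller plus the radix tree) and no
 * private data, so 2 - 0 == 2 and it is deemed freeable; with buffer heads
 * attached it is 3 - 1 == 2, still freeable; a third, unexpected reference
 * (e.g. a racing lookup) makes 3 - 0 != 2 and keeps it.
 */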

static int may_write_to_queue(struct backing_dev_info *bdi,
			      struct scan_control *sc)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!bdi_write_congested(bdi))
		return 1;
	if (bdi == current->backing_dev_info)
		return 1;
	return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
			 struct scan_control *sc)
{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking.  To prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in __generic_file_aio_write() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling. This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
		if (page_has_private(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				printk("%s: orphaned page\n", __func__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_queue(mapping->backing_dev_info, sc))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}

		if (!PageWriteback(page)) {
			/* synchronous write or broken a_ops? */
			ClearPageReclaim(page);
		}
		trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
		inc_zone_page_state(page, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * The non racy check for a busy page.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference. So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_count.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
	if (!page_freeze_refs(page, 2))
		goto cannot_free;
	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
	if (unlikely(PageDirty(page))) {
		page_unfreeze_refs(page, 2);
		goto cannot_free;
	}

	if (PageSwapCache(page)) {
		swp_entry_t swap = { .val = page_private(page) };
		__delete_from_swap_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		swapcache_free(swap, page);
	} else {
		void (*freepage)(struct page *);

		freepage = mapping->a_ops->freepage;

		__delete_from_page_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		mem_cgroup_uncharge_cache_page(page);

		if (freepage != NULL)
			freepage(page);
	}

	return 1;

cannot_free:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
	if (__remove_mapping(mapping, page)) {
		/*
		 * Unfreezing the refcount with 1 rather than 2 effectively
		 * drops the pagecache ref for us without requiring another
		 * atomic operation.
		 */
		page_unfreeze_refs(page, 1);
		return 1;
	}
	return 0;
}

/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
void putback_lru_page(struct page *page)
{
	int lru;
	int active = !!TestClearPageActive(page);
	int was_unevictable = PageUnevictable(page);

	VM_BUG_ON(PageLRU(page));

redo:
	ClearPageUnevictable(page);

	if (page_evictable(page)) {
		/*
		 * For evictable pages, we can use the cache.
		 * In event of a race, worst case is we end up with an
		 * unevictable page on [in]active list.
		 * We know how to handle that.
		 */
		lru = active + page_lru_base_type(page);
		lru_cache_add_lru(page, lru);
	} else {
		/*
		 * Put unevictable pages directly on zone's unevictable
		 * list.
		 */
		lru = LRU_UNEVICTABLE;
		add_page_to_unevictable_list(page);
		/*
		 * When racing with an mlock or AS_UNEVICTABLE clearing
		 * (page is unlocked) make sure that if the other thread
		 * does not observe our setting of PG_lru and fails
		 * isolation/check_move_unevictable_pages,
		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
		 * the page back to the evictable list.
		 *
		 * The other side is TestClearPageMlocked() or shmem_lock().
		 */
		smp_mb();
	}

	/*
	 * page's status can change while we move it among lru. If an evictable
	 * page is on unevictable list, it will never be freed. To avoid that,
	 * check after we added it to the list, again.
	 */
	if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
		if (!isolate_lru_page(page)) {
			put_page(page);
			goto redo;
		}
		/* This means someone else dropped this page from LRU
		 * So, it will be freed or putback to LRU again. There is
		 * nothing to do here.
		 */
	}

	if (was_unevictable && lru != LRU_UNEVICTABLE)
		count_vm_event(UNEVICTABLE_PGRESCUED);
	else if (!was_unevictable && lru == LRU_UNEVICTABLE)
		count_vm_event(UNEVICTABLE_PGCULLED);

	put_page(page);		/* drop ref from isolate */
}

enum page_references {
	PAGEREF_RECLAIM,
	PAGEREF_RECLAIM_CLEAN,
	PAGEREF_KEEP,
	PAGEREF_ACTIVATE,
};

static enum page_references page_check_references(struct page *page,
						  struct scan_control *sc)
{
	int referenced_ptes, referenced_page;
	unsigned long vm_flags;

	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
					  &vm_flags);
	referenced_page = TestClearPageReferenced(page);

	/*
	 * Mlock lost the isolation race with us.  Let try_to_unmap()
	 * move the page to the unevictable list.
	 */
	if (vm_flags & VM_LOCKED)
		return PAGEREF_RECLAIM;

	if (referenced_ptes) {
		if (PageSwapBacked(page))
			return PAGEREF_ACTIVATE;
		/*
		 * All mapped pages start out with page table
		 * references from the instantiating fault, so we need
		 * to look twice if a mapped file page is used more
		 * than once.
		 *
		 * Mark it and spare it for another trip around the
		 * inactive list.  Another page table reference will
		 * lead to its activation.
		 *
		 * Note: the mark is set for activated pages as well
		 * so that recently deactivated but used pages are
		 * quickly recovered.
		 */
		SetPageReferenced(page);

		if (referenced_page || referenced_ptes > 1)
			return PAGEREF_ACTIVATE;

		/*
		 * Activate file-backed executable pages after first usage.
		 */
		if (vm_flags & VM_EXEC)
			return PAGEREF_ACTIVATE;

		return PAGEREF_KEEP;
	}

	/* Reclaim if clean, defer dirty pages to writeback */
	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;

	return PAGEREF_RECLAIM;
}
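
/*
 * Decision summary (illustrative recap of the logic above):
 *  - PTE-referenced and swap backed            -> PAGEREF_ACTIVATE
 *  - PTE-referenced file page, first use       -> PAGEREF_KEEP (marked referenced)
 *  - PTE-referenced file page, repeated use
 *    or VM_EXEC mapping                        -> PAGEREF_ACTIVATE
 *  - no PTE references, PG_referenced set on
 *    a file page                               -> PAGEREF_RECLAIM_CLEAN
 *  - otherwise                                 -> PAGEREF_RECLAIM
 */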

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list,
				      struct zone *zone,
				      struct scan_control *sc,
				      enum ttu_flags ttu_flags,
				      unsigned long *ret_nr_dirty,
				      unsigned long *ret_nr_writeback,
				      bool force_reclaim)
{
	LIST_HEAD(ret_pages);
	LIST_HEAD(free_pages);
	int pgactivate = 0;
	unsigned long nr_dirty = 0;
	unsigned long nr_congested = 0;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_writeback = 0;

	cond_resched();

	mem_cgroup_uncharge_start();
	while (!list_empty(page_list)) {
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;
		enum page_references references = PAGEREF_RECLAIM_CLEAN;

		cond_resched();

		page = lru_to_page(page_list);
		list_del(&page->lru);

		if (!trylock_page(page))
			goto keep;

		VM_BUG_ON(PageActive(page));
		VM_BUG_ON(page_zone(page) != zone);

		sc->nr_scanned++;

		if (unlikely(!page_evictable(page)))
			goto cull_mlocked;

		if (!sc->may_unmap && page_mapped(page))
			goto keep_locked;

		/* Double the slab pressure for mapped and swapcache pages */
		if (page_mapped(page) || PageSwapCache(page))
			sc->nr_scanned++;

		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		if (PageWriteback(page)) {
			/*
			 * memcg doesn't have any dirty pages throttling so we
			 * could easily OOM just because too many pages are in
			 * writeback and there is nothing else to reclaim.
			 *
			 * Check __GFP_IO, certainly because a loop driver
			 * thread might enter reclaim, and deadlock if it waits
			 * on a page for which it is needed to do the write
			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
			 * but more thought would probably show more reasons.
			 *
			 * Don't require __GFP_FS, since we're not going into
			 * the FS, just waiting on its writeback completion.
			 * Worryingly, ext4 gfs2 and xfs allocate pages with
			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
			 * testing may_enter_fs here is liable to OOM on them.
			 */
			if (global_reclaim(sc) ||
			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
				/*
				 * This is slightly racy - end_page_writeback()
				 * might have just cleared PageReclaim, then
				 * setting PageReclaim here end up interpreted
				 * as PageReadahead - but that does not matter
				 * enough to care.  What we do want is for this
				 * page to have PageReclaim set next time memcg
				 * reclaim reaches the tests above, so it will
				 * then wait_on_page_writeback() to avoid OOM;
				 * and it's also appropriate in global reclaim.
				 */
				SetPageReclaim(page);
				nr_writeback++;
				goto keep_locked;
			}
			wait_on_page_writeback(page);
		}

		if (!force_reclaim)
			references = page_check_references(page, sc);

		switch (references) {
		case PAGEREF_ACTIVATE:
			goto activate_locked;
		case PAGEREF_KEEP:
			goto keep_locked;
		case PAGEREF_RECLAIM:
		case PAGEREF_RECLAIM_CLEAN:
			; /* try to reclaim the page below */
		}

		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 */
		if (PageAnon(page) && !PageSwapCache(page)) {
			if (!(sc->gfp_mask & __GFP_IO))
				goto keep_locked;
			if (!add_to_swap(page))
				goto activate_locked;
			may_enter_fs = 1;
		}

		mapping = page_mapping(page);

		/*
		 * The page is mapped into the page tables of one or more
		 * processes. Try to unmap it here.
		 */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page, ttu_flags)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_MLOCK:
				goto cull_mlocked;
			case SWAP_SUCCESS:
				; /* try to free the page below */
			}
		}

		if (PageDirty(page)) {
			nr_dirty++;

			/*
			 * Only kswapd can writeback filesystem pages to
			 * avoid risk of stack overflow but do not writeback
			 * unless under significant pressure.
			 */
			if (page_is_file_cache(page) &&
					(!current_is_kswapd() ||
					 sc->priority >= DEF_PRIORITY - 2)) {
				/*
				 * Immediately reclaim when written back.
				 * Similar in principal to deactivate_page()
				 * except we already have the page isolated
				 * and know it's dirty
				 */
				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
				SetPageReclaim(page);

				goto keep_locked;
			}

			if (references == PAGEREF_RECLAIM_CLEAN)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch (pageout(page, mapping, sc)) {
			case PAGE_KEEP:
				nr_congested++;
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page))
					goto keep;
				if (PageDirty(page))
					goto keep;

				/*
				 * A synchronous write - probably a ramdisk.  Go
				 * ahead and try to reclaim the page.
				 */
				if (!trylock_page(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is actually
		 * clean (all its buffers are clean).  This happens if the
		 * buffers were written out directly, with submit_bh(). ext3
		 * will do this, as well as the blockdev mapping.
		 * try_to_release_page() will discover that cleanness and will
		 * drop the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers here
		 * and if that worked, and the page is no longer mapped into
		 * process address space (page_count == 1) it can be freed.
		 * Otherwise, leave the page on the LRU so it is swappable.
		 */
		if (page_has_private(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1) {
				unlock_page(page);
				if (put_page_testzero(page))
					goto free_it;
				else {
					/*
					 * rare race with speculative reference.
					 * the speculative reference will free
					 * this page shortly, so we may
					 * increment nr_reclaimed here (and
					 * leave it off the LRU).
					 */
					nr_reclaimed++;
					continue;
				}
			}
		}

		if (!mapping || !__remove_mapping(mapping, page))
			goto keep_locked;

		/*
		 * At this point, we have no other references and there is
		 * no way to pick any more up (removed from LRU, removed
		 * from pagecache). Can use non-atomic bitops now (and
		 * we obviously don't have to worry about waking up a process
		 * waiting on the page lock, because there are no references.
		 */
		__clear_page_locked(page);
free_it:
		nr_reclaimed++;

		/*
		 * Is there need to periodically free_page_list? It would
		 * appear not as the counts should be low
		 */
		list_add(&page->lru, &free_pages);
		continue;

cull_mlocked:
		if (PageSwapCache(page))
			try_to_free_swap(page);
		unlock_page(page);
		putback_lru_page(page);
		continue;

activate_locked:
		/* Not a candidate for swapping, so reclaim swap space. */
		if (PageSwapCache(page) && vm_swap_full())
			try_to_free_swap(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
	}

	/*
	 * Tag a zone as congested if all the dirty pages encountered were
	 * backed by a congested BDI. In this case, reclaimers should just
	 * back off and wait for congestion to clear because further reclaim
	 * will encounter the same problem
	 */
	if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
		zone_set_flag(zone, ZONE_CONGESTED);

	free_hot_cold_page_list(&free_pages, 1);

	list_splice(&ret_pages, page_list);
	count_vm_events(PGACTIVATE, pgactivate);
	mem_cgroup_uncharge_end();
	*ret_nr_dirty += nr_dirty;
	*ret_nr_writeback += nr_writeback;
	return nr_reclaimed;
}
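
/*
 * Recap (illustrative): pages leave the loop above via one of five paths --
 * free_it (reclaimed and batched for freeing), keep/keep_locked (returned to
 * the caller's list and put back on the inactive LRU), activate_locked
 * (promoted to the active LRU), or cull_mlocked (sent straight back through
 * putback_lru_page() for the unevictable list).
 */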

unsigned long reclaim_clean_pages_from_list(struct zone *zone,
					    struct list_head *page_list)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.priority = DEF_PRIORITY,
		.may_unmap = 1,
	};
	unsigned long ret, dummy1, dummy2;
	struct page *page, *next;
	LIST_HEAD(clean_pages);

	list_for_each_entry_safe(page, next, page_list, lru) {
		if (page_is_file_cache(page) && !PageDirty(page)) {
			ClearPageActive(page);
			list_move(&page->lru, &clean_pages);
		}
	}

	ret = shrink_page_list(&clean_pages, zone, &sc,
				TTU_UNMAP|TTU_IGNORE_ACCESS,
				&dummy1, &dummy2, true);
	list_splice(&clean_pages, page_list);
	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
	return ret;
}

/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes defined above
 *
 * returns 0 on success, -ve errno on failure.
 */
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
	int ret = -EINVAL;

	/* Only take pages on the LRU. */
	if (!PageLRU(page))
		return ret;

	/* Compaction should not handle unevictable pages but CMA can do so */
	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
		return ret;

	ret = -EBUSY;

	/*
	 * To minimise LRU disruption, the caller can indicate that it only
	 * wants to isolate pages it will be able to operate on without
	 * blocking - clean pages for the most part.
	 *
	 * ISOLATE_CLEAN means that only clean pages should be isolated. This
	 * is used by reclaim when it cannot write to backing storage.
	 *
	 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
	 * that can be migrated without blocking.
	 */
	if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
		/* All the caller can do on PageWriteback is block */
		if (PageWriteback(page))
			return ret;

		if (PageDirty(page)) {
			struct address_space *mapping;

			/* ISOLATE_CLEAN means only clean pages */
			if (mode & ISOLATE_CLEAN)
				return ret;

			/*
			 * Only pages without mappings or that have a
			 * ->migratepage callback are possible to migrate
			 * without blocking
			 */
			mapping = page_mapping(page);
			if (mapping && !mapping->a_ops->migratepage)
				return ret;
		}
	}

	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
		return ret;

	if (likely(get_page_unless_zero(page))) {
		/*
		 * Be careful not to clear PageLRU until after we're
		 * sure the page is not being freed elsewhere -- the
		 * page release code relies on it.
		 */
		ClearPageLRU(page);
		ret = 0;
	}

	return ret;
}

/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session
 * @mode:	One of the LRU isolation modes
 * @lru:	LRU list id for isolating
 *
 * returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct lruvec *lruvec, struct list_head *dst,
		unsigned long *nr_scanned, struct scan_control *sc,
		isolate_mode_t mode, enum lru_list lru)
{
	struct list_head *src = &lruvec->lists[lru];
	unsigned long nr_taken = 0;
	unsigned long scan;

	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		struct page *page;
		int nr_pages;

		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON(!PageLRU(page));

		switch (__isolate_lru_page(page, mode)) {
		case 0:
			nr_pages = hpage_nr_pages(page);
			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
			list_move(&page->lru, dst);
			nr_taken += nr_pages;
			break;

		case -EBUSY:
			/* else it is being freed elsewhere */
			list_move(&page->lru, src);
			continue;

		default:
			BUG();
		}
	}

	*nr_scanned = scan;
	trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
				    nr_taken, mode, is_file_lru(lru));
	return nr_taken;
}

/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
int isolate_lru_page(struct page *page)
{
	int ret = -EBUSY;

	VM_BUG_ON(!page_count(page));

	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;

		spin_lock_irq(&zone->lru_lock);
		lruvec = mem_cgroup_page_lruvec(page, zone);
		if (PageLRU(page)) {
			int lru = page_lru(page);
			get_page(page);
			ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, lru);
			ret = 0;
		}
		spin_unlock_irq(&zone->lru_lock);
	}
	return ret;
}

/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When there are massive numbers of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
 * the LRU list will go small and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */
static int too_many_isolated(struct zone *zone, int file,
		struct scan_control *sc)
{
	unsigned long inactive, isolated;

	if (current_is_kswapd())
		return 0;

	if (!global_reclaim(sc))
		return 0;

	if (file) {
		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
	} else {
		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
	}

	/*
	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
	 * won't get blocked by normal direct-reclaimers, forming a circular
	 * deadlock.
	 */
	if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
		inactive >>= 3;

	return isolated > inactive;
}
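
/*
 * Worked example (illustrative): with 80000 inactive file pages and 12000
 * pages already isolated, an ordinary direct reclaimer (gfp_mask includes
 * __GFP_IO|__GFP_FS) compares 12000 against 80000 >> 3 = 10000 and is
 * throttled, while a GFP_NOFS/GFP_NOIO caller compares against the full
 * 80000 and proceeds, so it cannot deadlock waiting behind the others.
 */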

static noinline_for_stack void
putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	struct zone *zone = lruvec_zone(lruvec);
	LIST_HEAD(pages_to_free);

	/*
	 * Put back any unfreeable pages.
	 */
	while (!list_empty(page_list)) {
		struct page *page = lru_to_page(page_list);
		int lru;

		VM_BUG_ON(PageLRU(page));
		list_del(&page->lru);
		if (unlikely(!page_evictable(page))) {
			spin_unlock_irq(&zone->lru_lock);
			putback_lru_page(page);
			spin_lock_irq(&zone->lru_lock);
			continue;
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);

		SetPageLRU(page);
		lru = page_lru(page);
		add_page_to_lru_list(page, lruvec, lru);

		if (is_active_lru(lru)) {
			int file = is_file_lru(lru);
			int numpages = hpage_nr_pages(page);
			reclaim_stat->recent_rotated[file] += numpages;
		}

		if (put_page_testzero(page)) {
			__ClearPageLRU(page);
			__ClearPageActive(page);
			del_page_from_lru_list(page, lruvec, lru);

			if (unlikely(PageCompound(page))) {
				spin_unlock_irq(&zone->lru_lock);
				(*get_compound_page_dtor(page))(page);
				spin_lock_irq(&zone->lru_lock);
			} else
				list_add(&page->lru, &pages_to_free);