/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <linux/ftrace_event.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/migrate.h>
#include <linux/page-debug-flags.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
#ifdef CONFIG_MOVABLE_NODE
	[N_MEMORY] = { { [0] = 1UL } },
#endif
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
/*
 * When calculating the number of globally allowed dirty pages, there
 * is a certain number of per-zone reserves that should not be
 * considered dirtyable memory.  This is the sum of those reserves
 * over all existing zones that contribute dirtyable memory.
 */
unsigned long dirty_balance_reserve __read_mostly;

int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

#ifdef CONFIG_PM_SLEEP
/*
 * The following functions are used by the suspend/hibernate code to temporarily
 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
 * while devices are suspended.  To avoid races with the suspend/hibernate code,
 * they should always be called with pm_mutex held (gfp_allowed_mask also should
 * only be modified with pm_mutex held, unless the suspend/hibernate code is
 * guaranteed not to run in parallel with that modification).
 */
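/*
 * Illustrative call pairing (a sketch, not a caller in this file): a
 * hibernation path would be expected to do roughly
 *
 *	mutex_lock(&pm_mutex);
 *	pm_restrict_gfp_mask();
 *	(suspend devices, write the image)
 *	pm_restore_gfp_mask();
 *	mutex_unlock(&pm_mutex);
 *
 * so that allocations made in between cannot start I/O against devices
 * that are already asleep.
 */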

static gfp_t saved_gfp_mask;

void pm_restore_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

void pm_restrict_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	WARN_ON(saved_gfp_mask);
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~GFP_IOFS;
}

bool pm_suspended_storage(void)
{
	if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
		return false;
	return true;
}
#endif /* CONFIG_PM_SLEEP */

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
	 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	 256,
#endif
#ifdef CONFIG_HIGHMEM
	 32,
#endif
	 32,
};
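/*
 * Worked example (illustrative) for the 1G split quoted above: with a
 * ratio of 256, a NORMAL allocation leaves 784M/256 ~= 3M of ZONE_DMA
 * untouched, and with a ratio of 32 a HIGHMEM allocation leaves
 * 224M/32 = 7M of ZONE_NORMAL untouched.
 */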

EXPORT_SYMBOL(totalram_pages);

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
};

int min_free_kbytes = 1024;

static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;

#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

int page_group_by_mobility_disabled __read_mostly;

/*
 * NOTE:
 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
 * Instead, use {un}set_pageblock_isolate.
 */
void set_pageblock_migratetype(struct page *page, int migratetype)
{

	if (unlikely(page_group_by_mobility_disabled))
		migratetype = MIGRATE_UNMOVABLE;

	set_pageblock_flags_group(page, (unsigned long)migratetype,
					PB_migrate, PB_migrate_end);
}

bool oom_killer_disabled __read_mostly;

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);

	do {
		seq = zone_span_seqbegin(zone);
		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
			ret = 1;
		else if (pfn < zone->zone_start_pfn)
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/* Don't complain about poisoned pages */
	if (PageHWPoison(page)) {
		reset_page_mapcount(page); /* remove PageBuddy */
		return;
	}

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
			      "BUG: Bad page state: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	dump_page(page);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	reset_page_mapcount(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE);
}

/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page".
 *
 * The remaining PAGE_SIZE pages are called "tail pages".
 *
 * All pages have PG_compound set.  All tail pages have their ->first_page
 * pointing at the head page.
 *
 * The first tail page's ->lru.next holds the address of the compound page's
 * put_page() function.  Its ->lru.prev holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */
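/*
 * Illustrative layout for an order-2 compound page (4 pages), as set up by
 * prep_compound_page() below: page[0] is the head page, page[1..3] are tail
 * pages with ->first_page pointing back at page[0], page[1]->lru.next holds
 * the free_compound_page() destructor and page[1]->lru.prev holds the
 * order (2).
 */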

static void free_compound_page(struct page *page)
{
	__free_pages_ok(page, compound_order(page));
}

void prep_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;

	set_compound_page_dtor(page, free_compound_page);
	set_compound_order(page, order);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;
		__SetPageTail(p);
		set_page_count(p, 0);
		p->first_page = page;
	}
}

/* update __split_huge_page_refcount if you change this function */
static int destroy_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;
	int bad = 0;

	if (unlikely(compound_order(page) != order) ||
	    unlikely(!PageHead(page))) {
		bad_page(page);
		bad++;
	}

	__ClearPageHead(page);

	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;

		if (unlikely(!PageTail(p) || (p->first_page != page))) {
			bad_page(page);
			bad++;
		}
		__ClearPageTail(p);
	}

	return bad;
}

static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
{
	int i;

	/*
	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
	 * and __GFP_HIGHMEM from hard or soft interrupt context.
	 */
	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
	for (i = 0; i < (1 << order); i++)
		clear_highpage(page + i);
}

#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;

static int __init debug_guardpage_minorder_setup(char *buf)
{
	unsigned long res;

	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
		printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
		return 0;
	}
	_debug_guardpage_minorder = res;
	printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
	return 0;
}
__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);

static inline void set_page_guard_flag(struct page *page)
{
	__set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
}

static inline void clear_page_guard_flag(struct page *page)
{
	__clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
}
#else
static inline void set_page_guard_flag(struct page *page) { }
static inline void clear_page_guard_flag(struct page *page) { }
#endif

static inline void set_page_order(struct page *page, int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0);
}

/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 */
static inline unsigned long
__find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1 << order);
}
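/*
 * Worked example (illustrative): for page_idx 8 at order 1,
 * __find_buddy_index() returns 8 ^ (1 << 1) = 10, and the order-2 parent
 * of either buddy is 8 & ~(1 << 1) = 8, which is what
 * "combined_idx = buddy_idx & page_idx" computes in __free_one_page().
 */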

/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is not in a hole &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
								int order)
{
	if (!pfn_valid_within(page_to_pfn(buddy)))
		return 0;

	if (page_zone_id(page) != page_zone_id(buddy))
		return 0;

	if (page_is_guard(buddy) && page_order(buddy) == order) {
		VM_BUG_ON(page_count(buddy) != 0);
		return 1;
	}

	if (PageBuddy(buddy) && page_order(buddy) == order) {
		VM_BUG_ON(page_count(buddy) != 0);
		return 1;
	}
	return 0;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
 * order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */
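/*
 * Worked example (illustrative): freeing an order-0 page at index 9 when
 * index 8 is already free merges them into an order-1 block at index 8;
 * if the order-1 block at index 10 is free too, the loop below merges
 * again into an order-2 block at index 8, and so on up to MAX_ORDER-1.
 */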

static inline void __free_one_page(struct page *page,
		struct zone *zone, unsigned int order,
		int migratetype)
{
	unsigned long page_idx;
	unsigned long combined_idx;
	unsigned long uninitialized_var(buddy_idx);
	struct page *buddy;

	if (unlikely(PageCompound(page)))
		if (unlikely(destroy_compound_page(page, order)))
			return;

	VM_BUG_ON(migratetype == -1);

	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

	VM_BUG_ON(page_idx & ((1 << order) - 1));
	VM_BUG_ON(bad_range(zone, page));

	while (order < MAX_ORDER-1) {
		buddy_idx = __find_buddy_index(page_idx, order);
		buddy = page + (buddy_idx - page_idx);
		if (!page_is_buddy(page, buddy, order))
			break;
		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy)) {
			clear_page_guard_flag(buddy);
			set_page_private(page, 0);
			__mod_zone_freepage_state(zone, 1 << order,
						  migratetype);
		} else {
			list_del(&buddy->lru);
			zone->free_area[order].nr_free--;
			rmv_page_order(buddy);
		}
		combined_idx = buddy_idx & page_idx;
		page = page + (combined_idx - page_idx);
		page_idx = combined_idx;
		order++;
	}
	set_page_order(page, order);

	/*
	 * If this is not the largest possible page, check if the buddy
	 * of the next-highest order is free. If it is, it's possible
	 * that pages are being freed that will coalesce soon. In case,
	 * that is happening, add the free page to the tail of the list
	 * so it's less likely to be used soon and more likely to be merged
	 * as a higher order page
	 */
	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
		struct page *higher_page, *higher_buddy;
		combined_idx = buddy_idx & page_idx;
		higher_page = page + (combined_idx - page_idx);
		buddy_idx = __find_buddy_index(combined_idx, order + 1);
		higher_buddy = higher_page + (buddy_idx - combined_idx);
		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
			list_add_tail(&page->lru,
				&zone->free_area[order].free_list[migratetype]);
			goto out;
		}
	}

	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
out:
	zone->free_area[order].nr_free++;
}

static inline int free_pages_check(struct page *page)
{
	if (unlikely(page_mapcount(page) |
		(page->mapping != NULL)  |
		(atomic_read(&page->_count) != 0) |
		(page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
		(mem_cgroup_bad_page_check(page)))) {
		bad_page(page);
		return 1;
	}
	reset_page_last_nid(page);
	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	return 0;
}

/*
 * Frees a number of pages from the PCP lists
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp)
{
	int migratetype = 0;
	int batch_free = 0;
	int to_free = count;

	spin_lock(&zone->lock);
	zone->all_unreclaimable = 0;
	zone->pages_scanned = 0;

	while (to_free) {
		struct page *page;
		struct list_head *list;

		/*
		 * Remove pages from lists in a round-robin fashion. A
		 * batch_free count is maintained that is incremented when an
		 * empty list is encountered.  This is so more pages are freed
		 * off fuller lists instead of spinning excessively around empty
		 * lists
		 */
		do {
			batch_free++;
			if (++migratetype == MIGRATE_PCPTYPES)
				migratetype = 0;
			list = &pcp->lists[migratetype];
		} while (list_empty(list));

		/* This is the only non-empty list. Free them all. */
		if (batch_free == MIGRATE_PCPTYPES)
			batch_free = to_free;

		do {
			int mt;	/* migratetype of the to-be-freed page */

			page = list_entry(list->prev, struct page, lru);
			/* must delete as __free_one_page list manipulates */
			list_del(&page->lru);
			mt = get_freepage_migratetype(page);
			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
			__free_one_page(page, zone, 0, mt);
			trace_mm_page_pcpu_drain(page, 0, mt);
			if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
				__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
				if (is_migrate_cma(mt))
					__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
			}
		} while (--to_free && --batch_free && !list_empty(list));
	}
	spin_unlock(&zone->lock);
}

static void free_one_page(struct zone *zone, struct page *page, int order,
				int migratetype)
{
	spin_lock(&zone->lock);
	zone->all_unreclaimable = 0;
	zone->pages_scanned = 0;

	__free_one_page(page, zone, order, migratetype);
	if (unlikely(migratetype != MIGRATE_ISOLATE))
		__mod_zone_freepage_state(zone, 1 << order, migratetype);
	spin_unlock(&zone->lock);
}

static bool free_pages_prepare(struct page *page, unsigned int order)
{
	int i;
	int bad = 0;

	trace_mm_page_free(page, order);
	kmemcheck_free_shadow(page, order);

	if (PageAnon(page))
		page->mapping = NULL;
	for (i = 0; i < (1 << order); i++)
		bad += free_pages_check(page + i);
	if (bad)
		return false;

	if (!PageHighMem(page)) {
		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
		debug_check_no_obj_freed(page_address(page),
					   PAGE_SIZE << order);
	}
	arch_free_page(page, order);
	kernel_map_pages(page, 1 << order, 0);

	return true;
}

static void __free_pages_ok(struct page *page, unsigned int order)
{
	unsigned long flags;
	int migratetype;

	if (!free_pages_prepare(page, order))
		return;

	local_irq_save(flags);
	__count_vm_events(PGFREE, 1 << order);
	migratetype = get_pageblock_migratetype(page);
	set_freepage_migratetype(page, migratetype);
	free_one_page(page_zone(page), page, order, migratetype);
	local_irq_restore(flags);
}

/*
 * Read access to zone->managed_pages is safe because it's unsigned long,
 * but we still need to serialize writers. Currently all callers of
 * __free_pages_bootmem() except put_page_bootmem() should only be used
 * at boot time. So for shorter boot time, we shift the burden to
 * put_page_bootmem() to serialize writers.
 */
void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
	unsigned int nr_pages = 1 << order;
	unsigned int loop;

	prefetchw(page);
	for (loop = 0; loop < nr_pages; loop++) {
		struct page *p = &page[loop];

		if (loop + 1 < nr_pages)
			prefetchw(p + 1);
		__ClearPageReserved(p);
		set_page_count(p, 0);
	}

	page_zone(page)->managed_pages += 1 << order;
	set_page_refcounted(page);
	__free_pages(page, order);
}

#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
	unsigned i = pageblock_nr_pages;
	struct page *p = page;

	do {
		__ClearPageReserved(p);
		set_page_count(p, 0);
	} while (++p, --i);

	set_page_refcounted(page);
	set_pageblock_migratetype(page, MIGRATE_CMA);
	__free_pages(page, pageblock_order);
	totalram_pages += pageblock_nr_pages;
}
#endif

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
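/*
 * Worked example (illustrative): satisfying an order-0 request (low = 0)
 * from an order-3 block (high = 3) hands the upper halves back as one
 * order-2, one order-1 and one order-0 block on the free lists, and the
 * remaining order-0 page is what the caller ultimately returns.
 */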
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		VM_BUG_ON(bad_range(zone, &page[size]));

#ifdef CONFIG_DEBUG_PAGEALLOC
		if (high < debug_guardpage_minorder()) {
			/*
			 * Mark as guard pages (or page), that will allow to
			 * merge back to allocator when buddy will be freed.
			 * Corresponding page table entries will not be touched,
			 * pages will stay not present in virtual address space
			 */
			INIT_LIST_HEAD(&page[size].lru);
			set_page_guard_flag(&page[size]);
			set_page_private(&page[size], high);
			/* Guard pages are not available for any usage */
			__mod_zone_freepage_state(zone, -(1 << high),
						  migratetype);
			continue;
		}
#endif
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}

/*
 * This page is about to be returned from the page allocator
 */
static inline int check_new_page(struct page *page)
{
	if (unlikely(page_mapcount(page) |
		(page->mapping != NULL)  |
		(atomic_read(&page->_count) != 0)  |
		(page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
		(mem_cgroup_bad_page_check(page)))) {
		bad_page(page);
		return 1;
	}
	return 0;
}

static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
{
	int i;

	for (i = 0; i < (1 << order); i++) {
		struct page *p = page + i;
		if (unlikely(check_new_page(p)))
			return 1;
	}

	set_page_private(page, 0);
	set_page_refcounted(page);

	arch_alloc_page(page, order);
	kernel_map_pages(page, 1 << order, 1);

	if (gfp_flags & __GFP_ZERO)
		prep_zero_page(page, order, gfp_flags);

	if (order && (gfp_flags & __GFP_COMP))
		prep_compound_page(page, order);

	return 0;
}

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area * area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		if (list_empty(&area->free_list[migratetype]))
			continue;

		page = list_entry(area->free_list[migratetype].next,
							struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		expand(zone, page, order, current_order, area, migratetype);
		return page;
	}

	return NULL;
}


/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 */
static int fallbacks[MIGRATE_TYPES][4] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
#ifdef CONFIG_CMA
	[MIGRATE_MOVABLE]     = { MIGRATE_CMA,         MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
	[MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
#else
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
#endif
	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
	[MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
};
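/*
 * Example (illustrative): a MIGRATE_UNMOVABLE request whose own free lists
 * are empty falls back to MIGRATE_RECLAIMABLE and then MIGRATE_MOVABLE in
 * __rmqueue_fallback() below; MIGRATE_RESERVE terminates the walk and is
 * handled later by the caller if necessary.
 */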

/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_page are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 */
int move_freepages(struct zone *zone,
			  struct page *start_page, struct page *end_page,
			  int migratetype)
{
	struct page *page;
	unsigned long order;
	int pages_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
	/*
	 * page_zone is not safe to call in this context when
	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
	 * anyway as we check zone boundaries in move_freepages_block().
	 * Remove at a later date when no bug reports exist related to
	 * grouping pages by mobility
	 */
	BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif

	for (page = start_page; page <= end_page;) {
		/* Make sure we are not inadvertently changing nodes */
		VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));

		if (!pfn_valid_within(page_to_pfn(page))) {
			page++;
			continue;
		}

		if (!PageBuddy(page)) {
			page++;
			continue;
		}

		order = page_order(page);
		list_move(&page->lru,
			  &zone->free_area[order].free_list[migratetype]);
		set_freepage_migratetype(page, migratetype);
		page += 1 << order;
		pages_moved += 1 << order;
	}

	return pages_moved;
}

int move_freepages_block(struct zone *zone, struct page *page,
				int migratetype)
{
	unsigned long start_pfn, end_pfn;
	struct page *start_page, *end_page;

	start_pfn = page_to_pfn(page);
	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
	start_page = pfn_to_page(start_pfn);
	end_page = start_page + pageblock_nr_pages - 1;
	end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* Do not cross zone boundaries */
	if (start_pfn < zone->zone_start_pfn)
		start_page = page;
	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
		return 0;

	return move_freepages(zone, start_page, end_page, migratetype);
}
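/*
 * Worked example (illustrative) of the pageblock rounding above: if
 * pageblock_nr_pages were 512, a page at pfn 1234 gives
 * start_pfn = 1234 & ~511 = 1024 and end_pfn = 1024 + 512 - 1 = 1535,
 * i.e. the whole enclosing pageblock.
 */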

static void change_pageblock_range(struct page *pageblock_page,
					int start_order, int migratetype)
{
	int nr_pageblocks = 1 << (start_order - pageblock_order);

	while (nr_pageblocks--) {
		set_pageblock_migratetype(pageblock_page, migratetype);
		pageblock_page += pageblock_nr_pages;
	}
}

/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
	struct free_area * area;
	int current_order;
	struct page *page;
	int migratetype, i;

	/* Find the largest possible block of pages in the other list */
	for (current_order = MAX_ORDER-1; current_order >= order;
						--current_order) {
		for (i = 0;; i++) {
			migratetype = fallbacks[start_migratetype][i];

			/* MIGRATE_RESERVE handled later if necessary */
			if (migratetype == MIGRATE_RESERVE)
				break;

			area = &(zone->free_area[current_order]);
			if (list_empty(&area->free_list[migratetype]))
				continue;

			page = list_entry(area->free_list[migratetype].next,
					struct page, lru);
			area->nr_free--;

			/*
			 * If breaking a large block of pages, move all free
			 * pages to the preferred allocation list. If falling
			 * back for a reclaimable kernel allocation, be more
			 * aggressive about taking ownership of free pages
			 *
			 * On the other hand, never change migration
			 * type of MIGRATE_CMA pageblocks nor move CMA
			 * pages on different free lists. We don't
			 * want unmovable pages to be allocated from
			 * MIGRATE_CMA areas.
			 */
			if (!is_migrate_cma(migratetype) &&
			    (unlikely(current_order >= pageblock_order / 2) ||
			     start_migratetype == MIGRATE_RECLAIMABLE ||
			     page_group_by_mobility_disabled)) {
				int pages;