/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list; the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
long nr_swap_pages;
int percpu_pagelist_fraction;

static void __free_pages_ok(struct page *page, unsigned int order);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
	 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	 256,
#endif
#ifdef CONFIG_HIGHMEM
	 32,
#endif
	 32,
};
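
/*
 * Worked example, using the illustrative 1G split from the comment above
 * (16M DMA, 784M normal, 224M highmem) with the default ratios 256/32:
 * a NORMAL allocation leaves 784M/256 (about 3M) of ZONE_DMA untouched,
 * while a HIGHMEM allocation leaves 224M/32 (7M) of ZONE_NORMAL and
 * (224M+784M)/256 (about 4M) of ZONE_DMA untouched.  The real figures
 * depend on the machine's actual zone sizes.
 */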

EXPORT_SYMBOL(totalram_pages);

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
};

int min_free_kbytes = 1024;

unsigned long __meminitdata nr_kernel_pages;
unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;

#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
  /*
   * MAX_ACTIVE_REGIONS determines the maximum number of distinct
   * ranges of memory (RAM) that may be registered with add_active_range().
   * Ranges passed to add_active_range() will be merged if possible
   * so the number of times add_active_range() can be called is
   * related to the number of nodes and the number of holes
   */
  #ifdef CONFIG_MAX_ACTIVE_REGIONS
    /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
  #else
    #if MAX_NUMNODES >= 32
      /* If there can be many nodes, allow up to 50 holes per node */
      #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
    #else
      /* By default, allow up to 256 distinct regions */
      #define MAX_ACTIVE_REGIONS 256
    #endif
  #endif

  static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
  static int __meminitdata nr_nodemap_entries;
  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
  static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
  static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
  unsigned long __initdata required_kernelcore;
  unsigned long __initdata required_movablecore;
  unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];

  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
  int movable_zone;
  EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */

#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
EXPORT_SYMBOL(nr_node_ids);
#endif

#ifdef CONFIG_PAGE_GROUP_BY_MOBILITY
static inline int get_pageblock_migratetype(struct page *page)
{
	return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end);
}

static void set_pageblock_migratetype(struct page *page, int migratetype)
{
	set_pageblock_flags_group(page, (unsigned long)migratetype,
					PB_migrate, PB_migrate_end);
}

static inline int allocflags_to_migratetype(gfp_t gfp_flags, int order)
{
	WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);

	/* Cluster high-order atomic allocations together */
	if (unlikely(order > 0) &&
			(!(gfp_flags & __GFP_WAIT) || in_interrupt()))
		return MIGRATE_HIGHATOMIC;

	/* Cluster based on mobility */
	return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
		((gfp_flags & __GFP_RECLAIMABLE) != 0);
}
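
/*
 * Rough mapping produced by the bit arithmetic above, assuming the
 * MIGRATE_* values are ordered UNMOVABLE, RECLAIMABLE, MOVABLE as the
 * shift/or relies on: neither hint set -> MIGRATE_UNMOVABLE (00b),
 * __GFP_RECLAIMABLE -> MIGRATE_RECLAIMABLE (01b) and __GFP_MOVABLE ->
 * MIGRATE_MOVABLE (10b).  High-order allocations that cannot sleep are
 * diverted to MIGRATE_HIGHATOMIC before this mapping is reached.
 */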

#else
static inline int get_pageblock_migratetype(struct page *page)
{
	return MIGRATE_UNMOVABLE;
}

static void set_pageblock_migratetype(struct page *page, int migratetype)
{
}

static inline int allocflags_to_migratetype(gfp_t gfp_flags, int order)
{
	return MIGRATE_UNMOVABLE;
}
#endif /* CONFIG_PAGE_GROUP_BY_MOBILITY */

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);

	do {
		seq = zone_span_seqbegin(zone);
		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
			ret = 1;
		else if (pfn < zone->zone_start_pfn)
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page)
{
	printk(KERN_EMERG "Bad page state in process '%s'\n"
		KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
		KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
		KERN_EMERG "Backtrace:\n",
		current->comm, page, (int)(2*sizeof(unsigned long)),
		(unsigned long)page->flags, page->mapping,
		page_mapcount(page), page_count(page));
	dump_stack();
	page->flags &= ~(1 << PG_lru	|
			1 << PG_private |
			1 << PG_locked	|
			1 << PG_active	|
			1 << PG_dirty	|
			1 << PG_reclaim |
			1 << PG_slab    |
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_buddy );
	set_page_count(page, 0);
	reset_page_mapcount(page);
	page->mapping = NULL;
	add_taint(TAINT_BAD_PAGE);
}

/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page".
 *
 * The remaining PAGE_SIZE pages are called "tail pages".
 *
 * All pages have PG_compound set.  All tail pages have their ->first_page
 * pointing at the head page.
 *
 * The first tail page's ->lru.next holds the address of the compound page's
 * put_page() function.  Its ->lru.prev holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */

static void free_compound_page(struct page *page)
{
	__free_pages_ok(page, compound_order(page));
}

static void prep_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;

	set_compound_page_dtor(page, free_compound_page);
	set_compound_order(page, order);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;

		__SetPageTail(p);
		p->first_page = page;
	}
}
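
/*
 * Sketch of the resulting layout for an order-2 compound page (pages
 * 0..3): page 0 is the head and records the destructor and order via
 * set_compound_page_dtor()/set_compound_order(), while pages 1..3 are
 * marked as tail pages with ->first_page pointing back at page 0.
 */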

static void destroy_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;

	if (unlikely(compound_order(page) != order))
		bad_page(page);

	if (unlikely(!PageHead(page)))
		bad_page(page);
	__ClearPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;

		if (unlikely(!PageTail(p) |
				(p->first_page != page)))
			bad_page(page);
		__ClearPageTail(p);
	}
}

static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
{
	int i;

	VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
	/*
	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
	 * and __GFP_HIGHMEM from hard or soft interrupt context.
	 */
	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
	for (i = 0; i < (1 << order); i++)
		clear_highpage(page + i);
}

/*
 * Functions for dealing with a page's order in the buddy system.
 * zone->lock is already acquired when we use these.
 * So, we don't need atomic page->flags operations here.
 */
static inline unsigned long page_order(struct page *page)
{
	return page_private(page);
}

static inline void set_page_order(struct page *page, int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0);
}

/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (B1) is #8, its order-1
 * buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 */
static inline struct page *
__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
{
	unsigned long buddy_idx = page_idx ^ (1 << order);

	return page + (buddy_idx - page_idx);
}

static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	return (page_idx & ~(1 << order));
}
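
/*
 * Quick check of the arithmetic above with illustrative values: for
 * page_idx 12 and order 2, buddy_idx = 12 ^ (1 << 2) = 8 and the
 * combined index is 12 & ~(1 << 2) = 8; for page_idx 8 and order 1,
 * buddy_idx = 8 ^ 2 = 10 and the combined index is 8.
 */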

/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is not in a hole &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we use PG_buddy.
 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
								int order)
{
	if (!pfn_valid_within(page_to_pfn(buddy)))
		return 0;

	if (page_zone_id(page) != page_zone_id(buddy))
		return 0;

	if (PageBuddy(buddy) && page_order(buddy) == order) {
		BUG_ON(page_count(buddy) != 0);
		return 1;
	}
	return 0;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of contiguous
 * free pages of length (1 << order) and marked with PG_buddy. The page's
 * order is recorded in the page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were   
 * free, the remainder of the region must be split into blocks.   
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.            
 *
 * -- wli
 */

static inline void __free_one_page(struct page *page,
		struct zone *zone, unsigned int order)
{
	unsigned long page_idx;
	int order_size = 1 << order;
	int migratetype = get_pageblock_migratetype(page);

	if (unlikely(PageCompound(page)))
		destroy_compound_page(page, order);

	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

	VM_BUG_ON(page_idx & (order_size - 1));
	VM_BUG_ON(bad_range(zone, page));

	__mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
	while (order < MAX_ORDER-1) {
		unsigned long combined_idx;
		struct page *buddy;

		buddy = __page_find_buddy(page, page_idx, order);
		if (!page_is_buddy(page, buddy, order))
			break;		/* Move the buddy up one level. */

		list_del(&buddy->lru);
		zone->free_area[order].nr_free--;
		rmv_page_order(buddy);
		combined_idx = __find_combined_index(page_idx, order);
		page = page + (combined_idx - page_idx);
		page_idx = combined_idx;
		order++;
	}
	set_page_order(page, order);
	list_add(&page->lru,
		&zone->free_area[order].free_list[migratetype]);
	zone->free_area[order].nr_free++;
}
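
/*
 * Illustration of the coalescing loop above (page indexes are
 * arbitrary): freeing order-0 page #12 while #13 is free merges them
 * into an order-1 block at #12; if the order-1 block at #14 is free as
 * well, the result is an order-2 block at #12, and so on until a buddy
 * is missing or MAX_ORDER-1 is reached.
 */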

static inline int free_pages_check(struct page *page)
{
	if (unlikely(page_mapcount(page) |
		(page->mapping != NULL)  |
		(page_count(page) != 0)  |
		(page->flags & (
			1 << PG_lru	|
			1 << PG_private |
			1 << PG_locked	|
			1 << PG_active	|
			1 << PG_slab	|
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_reserved |
			1 << PG_buddy ))))
		bad_page(page);
	if (PageDirty(page))
		__ClearPageDirty(page);
	/*
	 * For now, we report if PG_reserved was found set, but do not
	 * clear it, and do not free the page.  But we shall soon need
	 * to do more, for when the ZERO_PAGE count wraps negative.
	 */
	return PageReserved(page);
}

/*
 * Frees a list of pages. 
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static void free_pages_bulk(struct zone *zone, int count,
					struct list_head *list, int order)
{
	spin_lock(&zone->lock);
	zone->all_unreclaimable = 0;
	zone->pages_scanned = 0;
	while (count--) {
		struct page *page;

		VM_BUG_ON(list_empty(list));
		page = list_entry(list->prev, struct page, lru);
		/* have to delete it as __free_one_page list manipulates */
		list_del(&page->lru);
		__free_one_page(page, zone, order);
	}
	spin_unlock(&zone->lock);
}

static void free_one_page(struct zone *zone, struct page *page, int order)
{
	spin_lock(&zone->lock);
	zone->all_unreclaimable = 0;
	zone->pages_scanned = 0;
	__free_one_page(page, zone, order);
	spin_unlock(&zone->lock);
}

static void __free_pages_ok(struct page *page, unsigned int order)
{
	unsigned long flags;
	int i;
	int reserved = 0;

	for (i = 0 ; i < (1 << order) ; ++i)
		reserved += free_pages_check(page + i);
	if (reserved)
		return;

	if (!PageHighMem(page))
		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
	arch_free_page(page, order);
	kernel_map_pages(page, 1 << order, 0);

	local_irq_save(flags);
	__count_vm_events(PGFREE, 1 << order);
	free_one_page(page_zone(page), page, order);
	local_irq_restore(flags);
}

/*
 * permit the bootmem allocator to evade page validation on high-order frees
 */
void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
{
	if (order == 0) {
		__ClearPageReserved(page);
		set_page_count(page, 0);
		set_page_refcounted(page);
		__free_page(page);
	} else {
		int loop;

		prefetchw(page);
		for (loop = 0; loop < BITS_PER_LONG; loop++) {
			struct page *p = &page[loop];

			if (loop + 1 < BITS_PER_LONG)
				prefetchw(p + 1);
			__ClearPageReserved(p);
			set_page_count(p, 0);
		}

		set_page_refcounted(page);
		__free_pages(page, order);
	}
}


/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- wli
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		VM_BUG_ON(bad_range(zone, &page[size]));
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}
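
/*
 * Example of the subdivision above: satisfying an order-0 request from
 * an order-3 block returns the upper order-2, order-1 and order-0
 * halves to the free lists of the given migratetype and leaves the
 * caller holding the first page of the original block.
 */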

/*
 * This page is about to be returned from the page allocator
 */
static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
{
	if (unlikely(page_mapcount(page) |
		(page->mapping != NULL)  |
		(page_count(page) != 0)  |
		(page->flags & (
			1 << PG_lru	|
			1 << PG_private	|
			1 << PG_locked	|
			1 << PG_active	|
			1 << PG_dirty	|
			1 << PG_slab    |
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_reserved |
			1 << PG_buddy ))))
		bad_page(page);

	/*
	 * For now, we report if PG_reserved was found set, but do not
	 * clear it, and do not allocate the page: as a safety net.
	 */
	if (PageReserved(page))
		return 1;

	page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
			1 << PG_referenced | 1 << PG_arch_1 |
			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
	set_page_private(page, 0);
	set_page_refcounted(page);

	arch_alloc_page(page, order);
	kernel_map_pages(page, 1 << order, 1);

	if (gfp_flags & __GFP_ZERO)
		prep_zero_page(page, order, gfp_flags);

	if (order && (gfp_flags & __GFP_COMP))
		prep_compound_page(page, order);

	return 0;
}

#ifdef CONFIG_PAGE_GROUP_BY_MOBILITY
/*
 * This array describes the order in which free lists are fallen back to
 * when the free lists for the desired migratetype are depleted
 */
static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,  MIGRATE_HIGHATOMIC },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,  MIGRATE_HIGHATOMIC },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,MIGRATE_HIGHATOMIC },
	[MIGRATE_HIGHATOMIC]  = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,MIGRATE_MOVABLE},
};

/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_page are not aligned on a MAX_ORDER_NR_PAGES
 * boundary. If alignment is required, use move_freepages_block()
 */
int move_freepages(struct zone *zone,
			struct page *start_page, struct page *end_page,
			int migratetype)
{
	struct page *page;
	unsigned long order;
	int blocks_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
	/*
	 * page_zone is not safe to call in this context when
	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
	 * anyway as we check zone boundaries in move_freepages_block().
	 * Remove at a later date when no bug reports exist related to
	 * CONFIG_PAGE_GROUP_BY_MOBILITY
	 */
	BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif

	for (page = start_page; page <= end_page;) {
		if (!pfn_valid_within(page_to_pfn(page))) {
			page++;
			continue;
		}

		if (!PageBuddy(page)) {
			page++;
			continue;
		}

		order = page_order(page);
		list_del(&page->lru);
		list_add(&page->lru,
			&zone->free_area[order].free_list[migratetype]);
		page += 1 << order;
		blocks_moved++;
	}

	return blocks_moved;
}

int move_freepages_block(struct zone *zone, struct page *page, int migratetype)
{
	unsigned long start_pfn, end_pfn;
	struct page *start_page, *end_page;

	start_pfn = page_to_pfn(page);
	start_pfn = start_pfn & ~(MAX_ORDER_NR_PAGES-1);
	start_page = pfn_to_page(start_pfn);
	end_page = start_page + MAX_ORDER_NR_PAGES - 1;
	end_pfn = start_pfn + MAX_ORDER_NR_PAGES - 1;

	/* Do not cross zone boundaries */
	if (start_pfn < zone->zone_start_pfn)
		start_page = page;
	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
		return 0;

	return move_freepages(zone, start_page, end_page, migratetype);
}
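
/*
 * Informal usage note: callers may pass any page inside a MAX_ORDER
 * block.  The pfn is rounded down to the block start, every free chunk
 * in the block is moved onto the requested migratetype's free list and
 * the number of chunks moved is returned.  If the rounded-down start
 * falls before the zone the walk begins at the page itself, and a block
 * running past the end of the zone is left alone (0 is returned).
 */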

/* Remove an element from the buddy allocator from the fallback list */
static struct page *__rmqueue_fallback(struct zone *zone, int order,
						int start_migratetype)
{
	struct free_area * area;
	int current_order;
	struct page *page;
	int migratetype, i;
	int nonatomic_fallback_atomic = 0;

retry:
	/* Find the largest possible block of pages in the other list */
	for (current_order = MAX_ORDER-1; current_order >= order;
						--current_order) {
		for (i = 0; i < MIGRATE_TYPES - 1; i++) {
			migratetype = fallbacks[start_migratetype][i];

			/*
			 * Make it hard to fallback to blocks used for
			 * high-order atomic allocations
			 */
			if (migratetype == MIGRATE_HIGHATOMIC &&
				start_migratetype != MIGRATE_UNMOVABLE &&
				!nonatomic_fallback_atomic)
				continue;

			area = &(zone->free_area[current_order]);
			if (list_empty(&area->free_list[migratetype]))
				continue;

			page = list_entry(area->free_list[migratetype].next,
					struct page, lru);
			area->nr_free--;

			/*
			 * If breaking a large block of pages, move all free
			 * pages to the preferred allocation list
			 */
			if (unlikely(current_order >= MAX_ORDER / 2)) {
				migratetype = start_migratetype;
				move_freepages_block(zone, page, migratetype);
			}

			/* Remove the page from the freelists */
			list_del(&page->lru);
			rmv_page_order(page);
			__mod_zone_page_state(zone, NR_FREE_PAGES,
							-(1UL << order));

			if (current_order == MAX_ORDER - 1)
				set_pageblock_migratetype(page,
							start_migratetype);

			expand(zone, page, order, current_order, area, migratetype);
			return page;
		}
	}

	/* Allow fallback to high-order atomic blocks if memory is that low */
	if (!nonatomic_fallback_atomic) {
		nonatomic_fallback_atomic = 1;
		goto retry;
	}

	return NULL;
}
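
/*
 * Example of the fallback walk above: an order-0 MIGRATE_MOVABLE
 * request with empty movable lists walks the orders from MAX_ORDER-1
 * downwards, trying the reclaimable and then the unmovable free list at
 * each order while skipping the high-order atomic reserve; only if that
 * pass finds nothing is the walk retried with fallback to
 * MIGRATE_HIGHATOMIC allowed.
 */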
#else
static struct page *__rmqueue_fallback(struct zone *zone, int order,
						int start_migratetype)
{
	return NULL;
}
#endif /* CONFIG_PAGE_GROUP_BY_MOBILITY */

/* 
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
						int migratetype)
{
	struct free_area * area;
	unsigned int current_order;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		if (list_empty(&area->free_list[migratetype]))
			continue;

		page = list_entry(area->free_list[migratetype].next,
							struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		__mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
		expand(zone, page, order, current_order, area, migratetype);
		goto got_page;
	}

	page = __rmqueue_fallback(zone, order, migratetype);

got_page:

	return page;
}

/* 
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order, 
			unsigned long count, struct list_head *list,
			int migratetype)
Linus Torvalds's avatar
Linus Torvalds committed
876 877 878
{
	int i;
	
	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) {
		struct page *page = __rmqueue(zone, order, migratetype);
		if (unlikely(page == NULL))
			break;
		list_add(&page->lru, list);
		set_page_private(page, migratetype);
	}
	spin_unlock(&zone->lock);
	return i;
}
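
/*
 * Informal usage sketch (roughly how the per-cpu refill path elsewhere
 * in this file uses it): pull a batch of order-0 pages onto a pcp list
 * under a single zone->lock hold; the return value may be short if the
 * zone runs dry.
 *
 *	pcp->count += rmqueue_bulk(zone, 0, pcp->batch,
 *					&pcp->list, migratetype);
 */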

#ifdef CONFIG_NUMA
/*
 * Called from the vmstat counter updater to drain pagesets of this
 * currently executing processor on remote nodes after they have
 * expired.
 *
 * Note that this function must be called with the thread pinned to
 * a single processor.
 */
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
	unsigned long flags;
	int to_drain;

	local_irq_save(flags);
	if (pcp->count >= pcp->batch)
		to_drain = pcp->batch;
	else
		to_drain = pcp->count;
	free_pages_bulk(zone, to_drain, &pcp->list, 0);
	pcp->count -= to_drain;
	local_irq_restore(flags);
}
#endif
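
/*
 * Example of the batch arithmetic above: with pcp->batch = 16, a
 * pageset holding 23 pages gives 16 of them back to the buddy lists
 * (7 stay cached), while one holding only 9 pages is drained
 * completely.
 */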

static void __drain_pages(unsigned int cpu)
{
	unsigned long flags;
	struct zone *zone;
	int i;

	for_each_zone(zone) {
		struct per_cpu_pageset *pset;

		if (!populated_zone(zone))
			continue;

		pset = zone_pcp(zone, cpu);
		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
			struct per_cpu_pages *pcp;

			pcp = &pset->pcp[i];
			local_irq_save(flags);
			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
			pcp->count = 0;
			local_irq_restore(flags);
		}
	}
}

#ifdef CONFIG_HIBERNATION

void mark_free_pages(struct zone *zone)
{
	unsigned long pfn, max_zone_pfn;
	unsigned long flags;
	int order, t;
	struct list_head *curr;

	if (!zone->spanned_pages)
		return;

	spin_lock_irqsave(&zone->lock, flags);

	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
		if (pfn_valid(pfn)) {
			struct page *page = pfn_to_page(pfn);

			if (!swsusp_page_is_forbidden(page))
				swsusp_unset_page_free(page);
		}

	for_each_migratetype_order(order, t) {
		list_for_each(curr, &zone->free_area[order].free_list[t]) {
			unsigned long i;

			pfn = page_to_pfn(list_entry(curr, struct page, lru));
			for (i = 0; i < (1UL << order); i++)
				swsusp_set_page_free(pfn_to_page(pfn + i));
		}
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}
#endif /* CONFIG_HIBERNATION */

#if defined(CONFIG_HIBERNATION) || defined(CONFIG_PAGE_GROUP_BY_MOBILITY)
/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 */
void drain_local_pages(void)
{
	unsigned long flags;

	local_irq_save(flags);	
	__drain_pages(smp_processor_id());
	local_irq_restore(flags);	
}

void smp_drain_local_pages(void *arg)
{
	drain_local_pages();
}

/*
 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
 */
void drain_all_local_pages(void)
{
	unsigned long flags;

	local_irq_save(flags);
	__drain_pages(smp_processor_id());
	local_irq_restore(flags);

	smp_call_function(smp_drain_local_pages, NULL, 0, 1);
}
#else
void drain_all_local_pages(void) {}
#endif /* CONFIG_HIBERNATION || CONFIG_PAGE_GROUP_BY_MOBILITY */

/*
 * Free a 0-order page
 */
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
	struct zone *zone = page_zone(page);
	struct per_cpu_pages *pcp;
	unsigned long flags;

	if (PageAnon(page))
		page->mapping = NULL;
	if (free_pages_check(page))
		return;

	if (!PageHighMem(page))
		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
	arch_free_page(page, 0);
	kernel_map_pages(page, 1, 0);

	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
	local_irq_save(flags);
	__count_vm_event(PGFREE);
	list_add(&page->lru, &pcp->list);
	set_page_private(page, get_pageblock_migratetype(page));