/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints about which node(s) memory
 * should be allocated from.
 *
 * Four policies are supported, per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *		  in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
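
/*
 * Illustrative sketch (editorial addition, not part of this file, kept under
 * #if 0 so it is never built here): how a userspace program might request
 * the interleave policy described above through the set_mempolicy(2)
 * wrapper in libnuma's <numaif.h>.  The node numbers and allocation size
 * are assumptions for the example only.
 */
#if 0
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* Interleave future allocations across nodes 0 and 1 (assumed to exist). */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);
	size_t len = 16UL << 20;
	char *buf;

	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8))
		perror("set_mempolicy");

	/* Pages are placed at first touch, so they end up interleaved. */
	buf = malloc(len);
	if (buf)
		memset(buf, 0, len);
	free(buf);
	return 0;
}
#endif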

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <linux/random.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

static struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;

	if (!pol) {
		int node = numa_node_id();

		if (node != NUMA_NO_NODE) {
			pol = &preferred_node_policy[node];
			/*
			 * preferred_node_policy is not initialised early in
			 * boot
			 */
			if (!pol->mode)
				pol = NULL;
		}
	}

	return pol;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If the read-side task has no lock to protect task->mempolicy, the
	 * write-side task rebinds task->mempolicy in two steps: first it sets
	 * all the newly allowed nodes, then it clears all the now-disallowed
	 * nodes. That way there is never a moment with no node to allocate
	 * pages from.
	 * If we have a lock to protect task->mempolicy on the read side, we
	 * rebind directly.
	 *
	 * step:
	 * 	MPOL_REBIND_ONCE  - do the rebind work at once
	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
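
/*
 * Worked example (editorial addition, not from the original source):
 * consider an MPOL_INTERLEAVE policy over nodes {0,1} whose cpuset is
 * rebound to {2,3} without a read-side lock.  MPOL_REBIND_STEP1 remaps
 * {0,1} to {2,3} and ORs the result in, leaving pol->v.nodes = {0,1,2,3},
 * so concurrent readers always see at least one allowed node.
 * MPOL_REBIND_STEP2 then installs the cached remapped mask, shrinking
 * pol->v.nodes to {2,3}.
 */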

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}
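
/*
 * mpol_relative_nodemask() maps a user-supplied relative nodemask onto the
 * currently allowed nodes (MPOL_F_RELATIVE_NODES).  Worked example, added
 * for illustration: a user mask of {0,1} relative to allowed nodes {2,3,5}
 * is folded onto positions 0 and 1 of the allowed set and so becomes {2,3}.
 */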

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!is_valid_nodemask(nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.  But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (pol == NULL)
		return 0;
	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */
	else {
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
		else
			nodes_and(nsc->mask2, *nodes, nsc->mask1);

		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
						cpuset_current_mems_allowed;
	}

	if (nodes)
		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	else
		ret = mpol_ops[pol->mode].create(pol, NULL);
	return ret;
}

/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		mode = MPOL_PREFERRED;
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);

	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}

/* Slow path of an mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
				enum mpol_rebind_step step)
{
}

/*
 * step:
 * 	MPOL_REBIND_ONCE  - do the rebind work at once
 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
				 enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		/*
		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
		 * result
		 */
		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
			nodes_remap(tmp, pol->v.nodes,
					pol->w.cpuset_mems_allowed, *nodes);
			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
		} else if (step == MPOL_REBIND_STEP2) {
			tmp = pol->w.cpuset_mems_allowed;
			pol->w.cpuset_mems_allowed = *nodes;
		} else
			BUG();
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	if (step == MPOL_REBIND_STEP1)
		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
		pol->v.nodes = tmp;
	else
		BUG();

	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = first_node(tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes,
				  enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes)) {
			pol->v.preferred_node = node;
			pol->flags &= ~MPOL_F_LOCAL;
		} else
			pol->flags |= MPOL_F_LOCAL;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (!(pol->flags & MPOL_F_LOCAL)) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * If the read-side task has no lock to protect task->mempolicy, the
 * write-side task rebinds task->mempolicy in two steps: first it sets
 * all the newly allowed nodes, then it clears all the now-disallowed
 * nodes. That way there is never a moment with no node to allocate
 * pages from.
 * If we have a lock to protect task->mempolicy on the read side, we
 * rebind directly.
 *
 * step:
 * 	MPOL_REBIND_ONCE  - do the rebind work at once
 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
				enum mpol_rebind_step step)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
		return;

	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
		BUG();

	if (step == MPOL_REBIND_STEP1)
		pol->flags |= MPOL_F_REBINDING;
	else if (step == MPOL_REBIND_STEP2)
		pol->flags &= ~MPOL_F_REBINDING;
	else if (step >= MPOL_REBIND_NSTEP)
		BUG();

	mpol_ops[pol->mode].rebind(pol, newmask, step);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires a task
 * pointer, and updates the task's mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
			enum mpol_rebind_step step)
{
	mpol_rebind_policy(tsk->mempolicy, new, step);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/*
 * Scan through the page table, checking whether pages meet the given
 * conditions, and add them to the pagelist if they do.
 */
static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
				    void *private)
{
#ifdef CONFIG_HUGETLB_PAGE
	int nid;
	struct page *page;
	spinlock_t *ptl;

	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
	page = pte_page(huge_ptep_get((pte_t *)pmd));
	nid = page_to_nid(page);
	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
		goto unlock;
	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
		isolate_huge_page(page, private);
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
}

static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (!pmd_present(*pmd))
			continue;
		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
						flags, private);
			continue;
		}
		split_huge_page_pmd(vma, addr, pmd);
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			continue;
		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated;

	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on the set of nodes given by
 * @nodes and @flags, they are isolated and queued to the pagelist
 * passed via @private.
 */
static struct vm_area_struct *
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		unsigned long endvma = vma->vm_end;

		if (endvma > end)
			endvma = end;
		if (vma->vm_start > start)
			start = vma->vm_start;

		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}

		if (flags & MPOL_MF_LAZY) {
			change_prot_numa(vma, start, endvma);
			goto next;
		}

		if ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
		      vma_migratable(vma))) {

			err = queue_pages_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
next:
		prev = vma;
	}
	return first;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_sem held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
						struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_sem */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EFAULT;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		next = vma->vm_next;
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				  vma->anon_vma, vma->vm_file, pgoff,
				  new_pol);
		if (prev) {
			vma = prev;
			next = vma->vm_next;
			if (mpol_equal(vma_policy(vma), new_pol))
				continue;
			/* vma_merge() joined vma && vma->next, case 8 */
			goto replace;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
 replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}
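
/*
 * Illustrative sketch (editorial addition, not part of this file, kept under
 * #if 0 so it is never built here): the per-VMA policies applied by
 * mbind_range() above are requested from userspace with mbind(2) via
 * libnuma's <numaif.h>.  The mapping size and node number are assumptions
 * for the example only.
 */
#if 0
#include <numaif.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 8UL << 20;
	void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned long nodemask = 1UL << 0;	/* bind the range to node 0 */

	if (addr == MAP_FAILED)
		return 1;
	/* MPOL_MF_MOVE migrates already-faulted pages to obey the policy. */
	if (mbind(addr, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_MOVE))
		perror("mbind");
	munmap(addr, len);
	return 0;
}
#endif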

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	struct mm_struct *mm = current->mm;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	/*
	 * prevent changing our mempolicy while show_numa_maps()
	 * is using it.
	 * Note:  do_set_mempolicy() can be called at init time
	 * with no 'mm'.
	 */
	if (mm)
		down_write(&mm->mmap_sem);
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		if (mm)
			up_write(&mm->mmap_sem);
		mpol_put(new);
		goto out;
	}
	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	task_unlock(current);
	if (mm)
		up_write(&mm->mmap_sem);

	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		if (!(p->flags & MPOL_F_LOCAL))
			node_set(p->v.preferred_node, *nodes);
		/* else return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
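
/*
 * Illustrative sketch (editorial addition, not part of this file, kept under
 * #if 0 so it is never built here): querying the current policy through
 * get_mempolicy(2), the userspace entry point served by do_get_mempolicy()
 * above.  Assumes libnuma's <numaif.h> and that the kernel supports at most
 * 1024 node IDs.
 */
#if 0
#include <numaif.h>
#include <stdio.h>

int main(void)
{
	int mode;
	/* Room for kernels supporting up to 1024 node IDs (an assumption). */
	unsigned long nodemask[1024 / (8 * sizeof(unsigned long))] = { 0 };

	/* Fetch the calling thread's policy mode and nodemask. */
	if (get_mempolicy(&mode, nodemask, 1024, NULL, 0)) {
		perror("get_mempolicy");
		return 1;
	}
	printf("mode=%d first nodemask word=0x%lx\n", mode, nodemask[0]);
	return 0;
}
#endif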

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
		if (!isolate_lru_page(page)) {
			list_add_tail(&page->lru, pagelist);
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		}
	}
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	if (PageHuge(page))
		return alloc_huge_page_node(page_hstate(compound_head(page)),
					node);
	else
		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
	 */
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, new_node_page, dest,
					MIGRATE_SYNC, MR_SYSCALL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err;
	nodemask_t tmp;

	err = migrate_prep();
	if (err)
		return err;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from, to, flags);
	if (err)
		goto out;

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fall back to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory sourced from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s,d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;

}
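
/*
 * Illustrative sketch (editorial addition, not part of this file, kept under
 * #if 0 so it is never built here): the userspace counterpart of
 * do_migrate_pages() above is migrate_pages(2), shown moving the calling
 * process's pages from node 0 to node 1.  Assumes libnuma's <numaif.h>;
 * the node numbers are examples only.
 */
#if 0
#include <numaif.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long old_nodes = 1UL << 0;	/* source: node 0 */
	unsigned long new_nodes = 1UL << 1;	/* destination: node 1 */
	long ret;

	ret = migrate_pages(getpid(), sizeof(old_nodes) * 8,
			    &old_nodes, &new_nodes);
	if (ret < 0)
		perror("migrate_pages");
	else
		printf("%ld pages could not be moved\n", ret);
	return 0;
}
#endif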

/*
 * Allocate a new page for page migration based on vma policy.
 * Start assuming that page is mapped by vma pointed to by @private.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	struct vm_area_struct *vma = (struct vm_area_struct *)private;
	unsigned long uninitialized_var(address);

	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	if (PageHuge(page)) {
		BUG_ON(!vma);
		return alloc_huge_page_noerr(vma, address, 1);
	}
	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void