/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include "internal.h"

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* for remembering boot option */
#else
#define do_swap_account		(0)
#endif

static DEFINE_MUTEX(memcg_tasklist);	/* can be held under cgroup_mutex */

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[0];
};
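
/*
 * Note: cpustat[] is a zero-length array. The per-cpu slots appear to be
 * allocated together with the enclosing struct mem_cgroup, which is why the
 * "stat" member must be placed at the end of struct mem_cgroup (see below).
 */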

/*
 * For accounting under irq disable, no need to increment the preempt count.
 */
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	stat->count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	/*
	 * protect against reclaim related members.
	 */
	spinlock_t reclaim_param_lock;

	int	prev_priority;	/* for recording reclaim priority */

	/*
	 * While reclaiming in a hierarchy, we cache the last child we
	 * reclaimed from. Protected by hierarchy_mutex
	 */
	struct mem_cgroup *last_scanned_child;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long	last_oom_jiffies;
	atomic_t	refcnt;

	unsigned int	swappiness;

	/*
	 * statistics. This must be placed at the end of memcg.
	 */
	struct mem_cgroup_stat stat;
};

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	NR_CHARGE_TYPE,
};

/* only for here (for easy reading.) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_LOCK	(1UL << PCG_LOCK)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
	PCGF_USED | PCGF_LOCK, /* Anon */
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
	0, /* FORCE */
};

/* for encoding cft->private value on file */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
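
/*
 * Illustration (hypothetical values): MEMFILE_PRIVATE() packs the counter
 * type into the upper 16 bits and a res_counter attribute into the lower
 * 16 bits, e.g. MEMFILE_PRIVATE(_MEMSWAP, 3) == 0x10003, so that
 * MEMFILE_TYPE(0x10003) == _MEMSWAP and MEMFILE_ATTR(0x10003) == 3.
 */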

static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);

static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge)? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	struct mem_cgroup_stat_cpu *cpustat;
	int cpu = get_cpu();

	cpustat = &stat->cpustat[cpu];
	if (PageCgroupCache(pc))
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

	if (charge)
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
	else
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
	put_cpu();
}
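
/*
 * Summary of mem_cgroup_charge_statistics() above: charge == true adds one
 * page to either STAT_CACHE or STAT_RSS (depending on PageCgroupCache) and
 * bumps PGPGIN; charge == false subtracts one page and bumps PGPGOUT. The
 * get_cpu()/put_cpu() pair keeps preemption off while the per-cpu slot is
 * updated.
 */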

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	if (!mem)
		return NULL;

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *mem = NULL;
	/*
	 * Because we have no locks, mm->owner's task may be being moved to
	 * another cgroup. We use css_tryget() here even if this looks
	 * pessimistic (rather than adding locks here).
	 */
	rcu_read_lock();
	do {
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem))
			break;
	} while (!css_tryget(&mem->css));
	rcu_read_unlock();
	return mem;
}

static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
{
	if (!mem)
		return true;
	return css_is_removed(&mem->css);
}
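
/*
 * A non-NULL result from try_get_mem_cgroup_from_mm() carries a css
 * reference taken with css_tryget(); the caller is expected to drop it
 * with css_put() once it is done with the mem_cgroup.
 */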

/*
 * The following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by routines of the global LRU independently of memcg.
 * What we have to take care of here is the validity of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happen when
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception is
 * SwapCache, which is added to the LRU before being charged.
 * If the PCG_USED bit is not set, the page_cgroup is not added to this
 * private LRU.
 * When moving account, the page is not on the LRU. It's isolated.
 */

void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* can happen while we handle swapcache. */
	if (list_empty(&pc->lru) || !pc->mem_cgroup)
		return;
	/*
	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
	 * removed from global LRU.
	 */
	mz = page_cgroup_zoneinfo(pc);
	mem = pc->mem_cgroup;
	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	list_del_init(&pc->lru);
	return;
}

void mem_cgroup_del_lru(struct page *page)
{
	mem_cgroup_del_lru_list(page, page_lru(page));
}

void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	/* unused page is not rotated. */
	if (!PageCgroupUsed(pc))
		return;
	mz = page_cgroup_zoneinfo(pc);
	list_move(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return;

	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	list_add(&pc->lru, &mz->lists[lru]);
}

/*
 * When handling SwapCache, pc->mem_cgroup may be changed while it's linked to
 * the LRU because the page may be reused after it's fully uncharged (because
 * of SwapCache behavior). To handle that, unlink the page_cgroup from the LRU
 * when charging it again. This function is only used to charge SwapCache. It's
 * done under lock_page and it is expected that zone->lru_lock is never held.
 */
static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/*
	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
	 * is guarded by lock_page() because the page is SwapCache.
	 */
	if (!PageCgroupUsed(pc))
		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/* link when the page is linked to LRU but page_cgroup isn't */
	if (PageLRU(page) && list_empty(&pc->lru))
		mem_cgroup_add_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}


void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * Calculate mapped_ratio under memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}

/*
 * prev_priority control...this will be used in memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	int prev_priority;

	spin_lock(&mem->reclaim_param_lock);
	prev_priority = mem->prev_priority;
	spin_unlock(&mem->reclaim_param_lock);

	return prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	spin_lock(&mem->reclaim_param_lock);
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
	spin_unlock(&mem->reclaim_param_lock);
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	spin_lock(&mem->reclaim_param_lock);
	mem->prev_priority = priority;
	spin_unlock(&mem->reclaim_param_lock);
}

static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long gb;
	unsigned long inactive_ratio;

	inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	if (present_pages) {
		present_pages[0] = inactive;
		present_pages[1] = active;
	}

	return inactive_ratio;
}

int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long present_pages[2];
	unsigned long inactive_ratio;

	inactive_ratio = calc_inactive_ratio(memcg, present_pages);

	inactive = present_pages[0];
	active = present_pages[1];

	if (inactive * inactive_ratio < active)
		return 1;

	return 0;
}
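
/*
 * Worked example for the ratio above (hypothetical numbers): with 4GB of
 * anonymous pages (active + inactive), gb == 4 and
 * inactive_ratio == int_sqrt(40) == 6, so the inactive anon list is
 * considered "low" once active > 6 * inactive.
 */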

unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
				       struct zone *zone,
				       enum lru_list lru)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return MEM_CGROUP_ZSTAT(mz, lru);
}

struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
						      struct zone *zone)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return &mz->reclaim_stat;
}

struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return NULL;

	mz = page_cgroup_zoneinfo(pc);
	if (!mz)
		return NULL;

	return &mz->reclaim_stat;
}

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * !!file + !!active;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;

		page = pc->page;
		if (unlikely(!PageCgroupUsed(pc)))
			continue;
		if (unlikely(!PageLRU(page)))
			continue;

		scan++;
		if (__isolate_lru_page(page, mode, file) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	*scanned = scan;
	return nr_taken;
}
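
/*
 * The index "lru = LRU_FILE * !!file + !!active" above selects one of the
 * four evictable lists; e.g. with this kernel's lru_list layout,
 * file == 1 && active == 0 picks LRU_INACTIVE_FILE.
 */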

#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

/*
 * This routine finds the DFS walk successor. It should be called with
 * hierarchy_mutex held.
 */
static struct mem_cgroup *
mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
{
	struct cgroup *cgroup, *curr_cgroup, *root_cgroup;

	curr_cgroup = curr->css.cgroup;
	root_cgroup = root_mem->css.cgroup;

	if (!list_empty(&curr_cgroup->children)) {
		/*
		 * Walk down to children
		 */
		mem_cgroup_put(curr);
		cgroup = list_entry(curr_cgroup->children.next,
						struct cgroup, sibling);
		curr = mem_cgroup_from_cont(cgroup);
		mem_cgroup_get(curr);
		goto done;
	}

visit_parent:
	if (curr_cgroup == root_cgroup) {
		mem_cgroup_put(curr);
		curr = root_mem;
		mem_cgroup_get(curr);
		goto done;
	}

	/*
	 * Goto next sibling
	 */
	if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
		mem_cgroup_put(curr);
		cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
						sibling);
		curr = mem_cgroup_from_cont(cgroup);
		mem_cgroup_get(curr);
		goto done;
	}

	/*
	 * Go up to next parent and next parent's sibling if need be
	 */
	curr_cgroup = curr_cgroup->parent;
	goto visit_parent;

done:
	root_mem->last_scanned_child = curr;
	return curr;
}
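
/*
 * Illustration of the walk above (hypothetical hierarchy): for root -> {A, B}
 * with A -> {A1}, successive calls visit A, then A1, then B, and finally
 * return root_mem itself, at which point the caller's loop terminates.
 */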

/*
 * Visit the first child (need not be the first child as per the ordering
 * of the cgroup list, since we track last_scanned_child) of @mem and use
 * that to reclaim free pages from.
 */
static struct mem_cgroup *
mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
{
	struct cgroup *cgroup;
	struct mem_cgroup *ret;
	bool obsolete;

	obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child);

	/*
	 * Scan all children under the mem_cgroup mem
	 */
	mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
	if (list_empty(&root_mem->css.cgroup->children)) {
		ret = root_mem;
		goto done;
	}

	if (!root_mem->last_scanned_child || obsolete) {

		if (obsolete && root_mem->last_scanned_child)
			mem_cgroup_put(root_mem->last_scanned_child);

		cgroup = list_first_entry(&root_mem->css.cgroup->children,
				struct cgroup, sibling);
		ret = mem_cgroup_from_cont(cgroup);
		mem_cgroup_get(ret);
	} else
		ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
						root_mem);

done:
	root_mem->last_scanned_child = ret;
	mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
	return ret;
}

static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
{
	if (do_swap_account) {
		if (res_counter_check_under_limit(&mem->res) &&
			res_counter_check_under_limit(&mem->memsw))
			return true;
	} else
		if (res_counter_check_under_limit(&mem->res))
			return true;
	return false;
}

static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;
	unsigned int swappiness;

	/* root ? */
	if (cgrp->parent == NULL)
		return vm_swappiness;

	spin_lock(&memcg->reclaim_param_lock);
	swappiness = memcg->swappiness;
	spin_unlock(&memcg->reclaim_param_lock);

	return swappiness;
}

/*
 * Dance down the hierarchy if needed to reclaim memory. We remember the
 * last child we reclaimed from, so that we don't end up penalizing
 * one child extensively based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						gfp_t gfp_mask, bool noswap)
{
	struct mem_cgroup *next_mem;
	int ret = 0;

	/*
	 * Reclaim unconditionally and don't check for return value.
	 * We need to reclaim in the current group and down the tree.
	 * One might think about checking for children before reclaiming,
	 * but there might be left over accounting, even after children
	 * have left.
	 */
	ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
					   get_swappiness(root_mem));
	if (mem_cgroup_check_under_limit(root_mem))
		return 0;
	if (!root_mem->use_hierarchy)
		return ret;

	next_mem = mem_cgroup_get_first_node(root_mem);

	while (next_mem != root_mem) {
		if (mem_cgroup_is_obsolete(next_mem)) {
			mem_cgroup_put(next_mem);
			next_mem = mem_cgroup_get_first_node(root_mem);
			continue;
		}
		ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
						   get_swappiness(next_mem));
		if (mem_cgroup_check_under_limit(root_mem))
			return 0;
		mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
		next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
		mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
	}
	return ret;
}
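
/*
 * In short: reclaim from root_mem first; if it is still over its limit and
 * uses hierarchy, walk the children in the DFS order produced by
 * mem_cgroup_get_first_node()/mem_cgroup_get_next_node(), returning 0 as
 * soon as root_mem drops back under its limit.
 */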

bool mem_cgroup_oom_called(struct task_struct *task)
{
	bool ret = false;
	struct mem_cgroup *mem;
	struct mm_struct *mm;

	rcu_read_lock();
	mm = task->mm;
	if (!mm)
		mm = &init_mm;
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
		ret = true;
	rcu_read_unlock();
	return ret;
}

/*
 * Unlike the exported interface, an "oom" parameter is added. If oom == true,
 * the OOM killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
			gfp_t gfp_mask, struct mem_cgroup **memcg,
			bool oom)
{
	struct mem_cgroup *mem, *mem_over_limit;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct res_counter *fail_res;

	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
		/* Don't account this! */
		*memcg = NULL;
		return 0;
	}

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	mem = *memcg;
	if (likely(!mem)) {
		mem = try_get_mem_cgroup_from_mm(mm);
		*memcg = mem;
	} else {
		css_get(&mem->css);
	}
	if (unlikely(!mem))
		return 0;

	VM_BUG_ON(mem_cgroup_is_obsolete(mem));

	while (1) {
		int ret;
		bool noswap = false;

		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
		if (likely(!ret)) {
			if (!do_swap_account)
				break;
			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
							&fail_res);
			if (likely(!ret))
				break;
			/* mem+swap counter fails */
			res_counter_uncharge(&mem->res, PAGE_SIZE);
			noswap = true;
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									memsw);
		} else
			/* mem counter fails */
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									res);

		if (!(gfp_mask & __GFP_WAIT))
			goto nomem;

		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
							noswap);

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up.
		 */
		if (mem_cgroup_check_under_limit(mem_over_limit))
			continue;

		if (!nr_retries--) {
			if (oom) {
				mutex_lock(&memcg_tasklist);
				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
				mutex_unlock(&memcg_tasklist);
				mem_over_limit->last_oom_jiffies = jiffies;
			}
			goto nomem;
		}
	}
	return 0;
nomem:
	css_put(&mem->css);
	return -ENOMEM;
}
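
/*
 * Charge protocol used above: charge "res" first and, with swap accounting
 * enabled, "memsw" as well; on failure reclaim from the over-limit hierarchy
 * and retry up to MEM_CGROUP_RECLAIM_RETRIES (5) times. If it still fails
 * and oom == true, invoke mem_cgroup_out_of_memory() and record
 * last_oom_jiffies before returning -ENOMEM.
 */
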
static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
{
	struct mem_cgroup *mem;
	swp_entry_t ent;

	if (!PageSwapCache(page))
		return NULL;

	ent.val = page_private(page);
	mem = lookup_swap_cgroup(ent);
	if (!mem)
		return NULL;
	if (!css_tryget(&mem->css))
		return NULL;
	return mem;
}

/*
 * Commit a charge obtained by __mem_cgroup_try_charge() and move the
 * page_cgroup to the USED state. If it is already USED, uncharge and return.
 */

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				     struct page_cgroup *pc,
				     enum charge_type ctype)
{
	/* try_charge() can return NULL to *memcg, taking care of it. */
	if (!mem)
		return;

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		if (do_swap_account)
			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
		css_put(&mem->css);
		return;
	}
	pc->mem_cgroup = mem;
	smp_wmb();
	pc->flags = pcg_default_flags[ctype];

	mem_cgroup_charge_statistics(mem, pc, true);

	unlock_page_cgroup(pc);
}
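
/*
 * The smp_wmb() above (store pc->mem_cgroup before pc->flags) pairs with the
 * smp_rmb() in mem_cgroup_rotate_lru_list()/mem_cgroup_add_lru_list() earlier
 * in this file, per the comments there about making pc->mem_cgroup visible.
 */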

/**
 * mem_cgroup_move_account - move account of the page
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm the following.
 * - page is not on LRU (isolate_page() is useful.)
 *
 * returns 0 on success,
 * returns -EBUSY when the lock is busy or "pc" is unstable.
 *
 * This function does "uncharge" from the old cgroup but doesn't do "charge"
 * to the new cgroup. That should be done by the caller.
 */

static int mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to)
{
	struct mem_cgroup_per_zone *from_mz, *to_mz;
	int nid, zid;
	int ret = -EBUSY;

	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(pc->page));

	nid = page_cgroup_nid(pc);
	zid = page_cgroup_zid(pc);
	from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
	to_mz =  mem_cgroup_zoneinfo(to, nid, zid);

	if (!trylock_page_cgroup(pc))
		return ret;

	if (!PageCgroupUsed(pc))
		goto out;

	if (pc->mem_cgroup != from)
		goto out;

	css_put(&from->css);
	res_counter_uncharge(&from->res, PAGE_SIZE);
	mem_cgroup_charge_statistics(from, pc, false);
	if (do_swap_account)
		res_counter_uncharge(&from->memsw, PAGE_SIZE);
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, pc, true);
	css_get(&to->css);
	ret = 0;
out:
	unlock_page_cgroup(pc);
	return ret;
}
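
/*
 * Note on mem_cgroup_move_account(): on success the page's charge and
 * statistics have been removed from @from and the page_cgroup now points at
 * @to; the charge on @to itself must have been taken beforehand by the
 * caller, as mem_cgroup_move_parent() below does with
 * __mem_cgroup_try_charge().
 */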

/*
 * move charges to its parent.
 */

static int mem_cgroup_move_parent(struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct page *page = pc->page;
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	int ret;

	/* Is ROOT ? */
	if (!pcg)
		return -EINVAL;

	parent = mem_cgroup_from_cont(pcg);

	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
	if (ret || !parent)
		return ret;

	if (!get_page_unless_zero(page))
		return -EBUSY;

	ret = isolate_lru_page(page);

	if (ret)
		goto cancel;
1044
1045
1046

	ret = mem_cgroup_move_account(pc, child, parent);

	/*
	 * drop the extra refcnt taken by try_charge()
	 * (move_account increments one)
	 */
	css_put(&parent->css);
	putback_lru_page(page);
	if (!ret) {
		put_page(page);
		return 0;
	}
	/* uncharge if move fails */
cancel:
	res_counter_uncharge(&parent->res, PAGE_SIZE);
	if (do_swap_account)
		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
	put_page(page);
	return ret;
}
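
/*
 * Flow of mem_cgroup_move_parent() above: pre-charge the parent with
 * __mem_cgroup_try_charge(), pin and isolate the page, move the accounting
 * with mem_cgroup_move_account(), then drop the extra css reference and put
 * the page back on the LRU; if the move fails, the parent's pre-charge is
 * rolled back under the "cancel" label.
 */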

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	int ret;

	pc = lookup_page_cgroup(page);
	/* can happen at boot */
	if (unlikely(!pc))
		return 0;
	prefetchw(pc);

	mem = memcg;
	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
	if (ret || !mem)
		return ret;

	__mem_cgroup_commit_charge(mem, pc, ctype);
	return 0;
}

int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has address_space.
	 * But page->mapping may have an out-of-use anon_vma pointer;
	 * detect it by the PageAnon() check. A newly-mapped-anon page's
	 * page->mapping is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	struct mem_cgroup *mem = NULL;
	int ret;

	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * Corner case handling. This is usually called from
	 * add_to_page_cache(). But some filesystems (shmem) precharge the
	 * page before calling it and then call add_to_page_cache() with
	 * GFP_NOWAIT.
	 *
	 * In the GFP_NOWAIT case, the page may be pre-charged before calling
	 * add_to_page_cache() (see shmem.c). Check for that here and avoid
	 * charging twice. (It works but has to pay a bit larger cost.)
	 *
	 * And when the page is SwapCache, swap information should be taken
	 * into account. This is done under lock_page() now.
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;

		pc = lookup_page_cgroup(page);
		if (!pc)
			return 0;
		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc)) {
			unlock_page_cgroup(pc);
			return 0;
		}
		unlock_page_cgroup(pc);
	}

	if (do_swap_account && PageSwapCache(page)) {
		mem = try_get_mem_cgroup_from_swapcache(page);
		if (mem)
			mm = NULL;
		else
			mem = NULL;
		/* SwapCache may be still linked to LRU now. */
		mem_cgroup_lru_del_before_commit_swapcache(page);
	}

	if (unlikely(!mm && !mem))
		mm = &init_mm;

	if (page_is_file_cache(page))
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);

	ret = mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
	if (mem)
		css_put(&mem->css);
	if (PageSwapCache(page))
		mem_cgroup_lru_add_after_commit_swapcache(page);

	if (do_swap_account && !ret && PageSwapCache(page)) {
		swp_entry_t ent = {.val = page_private(page)};
		/* avoid double counting */
		mem = swap_cgroup_record(ent, NULL);
		if (mem) {
			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
			mem_cgroup_put(mem);
		}
	}
	return ret;
}

/*
 * During swap-in (try_charge -> commit or cancel), the page is locked.
 * And when try_charge() successfully returns, one refcnt to memcg without
 * struct page_cgroup is acquired. This refcnt will be consumed by
 * "commit()" or removed by "cancel()".
 */
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
				 struct page *page,
				 gfp_t mask, struct mem_cgroup **ptr)
{
	struct mem_cgroup *mem;
	int ret;

	if (mem_cgroup_disabled())
		return 0;

	if (!do_swap_account)
		goto charge_cur_mm;
	/*
	 * A racing thread's fault, or swapoff, may have already updated
	 * the pte, and even removed page from swap cache: return success
	 * to go on to do_swap_page()'s pte_same() test, which should fail.
	 */
	if (!PageSwapCache(page))
		return 0;
	mem = try_get_mem_cgroup_from_swapcache(page);
	if (!mem)
		goto charge_cur_mm;
	*ptr = mem;
	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
	/* drop extra refcnt from tryget */
	css_put(&mem->css);
	return ret;
charge_cur_mm:
	if (unlikely(!mm))
		mm = &init_mm;
	return __mem_cgroup_try_charge(mm, mask, ptr, true);
}
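
/*
 * Swap-in charge flow (see also the comment above
 * mem_cgroup_try_charge_swapin): try_charge_swapin() picks the memcg
 * recorded for the swap entry when swap accounting is on, otherwise the
 * current mm's memcg, and the caller later completes the operation with
 * either mem_cgroup_commit_charge_swapin() or
 * mem_cgroup_cancel_charge_swapin().
 */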

void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;
	if (!ptr)
		return;
	pc = lookup_page_cgroup(page);
	mem_cgroup_lru_del_before_commit_swapcache(page);
	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
	mem_cgroup_lru_add_after_commit_swapcache(page);
	/*
	 * Now swap is on-memory. This means this page may be
	 * counted both as mem and swap....double count.
	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
	 * may call delete_from_swap_cache() before we reach here.
	 */
	if (do_swap_account && PageSwapCache(page)) {
		swp_entry_t ent = {.val = page_private(page)};
		struct mem_cgroup *memcg;
		memcg = swap_cgroup_record(ent, NULL);
		if (memcg) {
			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
			mem_cgroup_put(memcg);
		}

	}
	/* add this page(page_cgroup) to the LRU we want. */

}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_disabled())
		return;
	if (!mem)
		return;
	res_counter_uncharge(&mem->res, PAGE_SIZE);
	if (do_swap_account)
		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
	css_put(&mem->css);
}


/*
 * uncharge if !page_mapped(page)
 */
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;
