/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include "internal.h"

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* to remember the boot option */
#else
#define do_swap_account		(0)
#endif

static DEFINE_MUTEX(memcg_tasklist);	/* can be held under cgroup_mutex */

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[0];
};

/*
 * For accounting done with irqs disabled; there is no need to bump the preempt count.
 */
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	stat->count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
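	/* Lockless sum across every possible CPU; the result is a best-effort snapshot. */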
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	/*
	 * protects reclaim-related members.
	 */
	spinlock_t reclaim_param_lock;

	int	prev_priority;	/* for recording reclaim priority */

	/*
	 * While reclaiming in a hierarchy, we cache the last child we
	 * reclaimed from. Protected by hierarchy_mutex
	 */
	struct mem_cgroup *last_scanned_child;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long	last_oom_jiffies;
	atomic_t	refcnt;

	unsigned int	swappiness;

	/*
	 * statistics. This must be placed at the end of memcg.
	 */
	struct mem_cgroup_stat stat;
};

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	NR_CHARGE_TYPE,
};

/* only for here (for easy reading.) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_LOCK	(1UL << PCG_LOCK)
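/*
 * Default page_cgroup flags, indexed by charge_type: file cache and shmem
 * pages start out with PCG_CACHE set, anonymous pages do not, and the
 * FORCE entry sets no flags at all.
 */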
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
	PCGF_USED | PCGF_LOCK, /* Anon */
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
	0, /* FORCE */
};

/* for encoding cft->private value on file */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
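/*
 * A quick worked example of the encoding (illustrative attribute value):
 * MEMFILE_PRIVATE(_MEMSWAP, 3) packs to 0x10003, from which MEMFILE_TYPE()
 * recovers _MEMSWAP and MEMFILE_ATTR() recovers 3.
 */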

static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);

static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge)? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	struct mem_cgroup_stat_cpu *cpustat;
	int cpu = get_cpu();

	cpustat = &stat->cpustat[cpu];
	if (PageCgroupCache(pc))
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

	if (charge)
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
	else
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
	put_cpu();
}

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	if (!mem)
		return NULL;

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *mem = NULL;
	/*
	 * Because we have no locks, mm->owner may be being moved to another
	 * cgroup. We use css_tryget() here even if this looks
	 * pessimistic (rather than adding locks here).
	 */
	rcu_read_lock();
	do {
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem))
			break;
	} while (!css_tryget(&mem->css));
	rcu_read_unlock();
	return mem;
}
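/*
 * A memcg is considered "obsolete" when its css has already been removed
 * (or when the pointer is NULL); callers use this to skip groups that are
 * on their way out.
 */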

static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
{
	if (!mem)
		return true;
	return css_is_removed(&mem->css);
}

/*
 * Following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by routine of global LRU independently from memcg.
 * What we have to take care of here is validness of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happens when
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception is SwapCache:
 * It is added to LRU before charge.
 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
 * When moving account, the page is not on LRU. It's isolated.
 */

void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* can happen while we handle swapcache. */
	if (list_empty(&pc->lru) || !pc->mem_cgroup)
		return;
	/*
	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
	 * removed from global LRU.
	 */
	mz = page_cgroup_zoneinfo(pc);
	mem = pc->mem_cgroup;
	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	list_del_init(&pc->lru);
	return;
}

void mem_cgroup_del_lru(struct page *page)
{
	mem_cgroup_del_lru_list(page, page_lru(page));
}

void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	/* unused page is not rotated. */
	if (!PageCgroupUsed(pc))
		return;
	mz = page_cgroup_zoneinfo(pc);
	list_move(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return;

	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	list_add(&pc->lru, &mz->lists[lru]);
}

/*
 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
 * lru because the page may be reused after it's fully uncharged (because of
 * SwapCache behavior). To handle that, unlink page_cgroup from LRU when charge
 * it again. This function is only used to charge SwapCache. It's done under
 * lock_page and expected that zone->lru_lock is never held.
 */
static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/*
	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
	 * is guarded by lock_page() because the page is SwapCache.
	 */
	if (!PageCgroupUsed(pc))
		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/* link when the page is linked to LRU but page_cgroup isn't */
	if (PageLRU(page) && list_empty(&pc->lru))
		mem_cgroup_add_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}


void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * Calculate mapped_ratio under memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}

/*
 * prev_priority control... this will be used in the memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	int prev_priority;

	spin_lock(&mem->reclaim_param_lock);
	prev_priority = mem->prev_priority;
	spin_unlock(&mem->reclaim_param_lock);

	return prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	spin_lock(&mem->reclaim_param_lock);
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
	spin_unlock(&mem->reclaim_param_lock);
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	spin_lock(&mem->reclaim_param_lock);
	mem->prev_priority = priority;
	spin_unlock(&mem->reclaim_param_lock);
}

static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long gb;
	unsigned long inactive_ratio;

	inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	if (present_pages) {
		present_pages[0] = inactive;
		present_pages[1] = active;
	}

	return inactive_ratio;
}
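/*
 * Worked example (illustrative numbers): with 4GB of anon pages on this
 * memcg's LRU lists, gb = 4 and inactive_ratio = int_sqrt(40) = 6, so the
 * inactive anon list is considered low once inactive * 6 < active.
 */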

int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long present_pages[2];
	unsigned long inactive_ratio;

	inactive_ratio = calc_inactive_ratio(memcg, present_pages);

	inactive = present_pages[0];
	active = present_pages[1];

	if (inactive * inactive_ratio < active)
		return 1;

	return 0;
}

unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
				       struct zone *zone,
				       enum lru_list lru)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return MEM_CGROUP_ZSTAT(mz, lru);
}

struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
						      struct zone *zone)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return &mz->reclaim_stat;
}

struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return NULL;

	mz = page_cgroup_zoneinfo(pc);
	if (!mz)
		return NULL;

	return &mz->reclaim_stat;
}

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * !!file + !!active;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;

		page = pc->page;
		if (unlikely(!PageCgroupUsed(pc)))
			continue;
		if (unlikely(!PageLRU(page)))
			continue;

		scan++;
		if (__isolate_lru_page(page, mode, file) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	*scanned = scan;
	return nr_taken;
}

#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

/*
 * This routine finds the DFS walk successor. It should be
 * called with hierarchy_mutex held.
 */
static struct mem_cgroup *
__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
{
	struct cgroup *cgroup, *curr_cgroup, *root_cgroup;

	curr_cgroup = curr->css.cgroup;
	root_cgroup = root_mem->css.cgroup;

	if (!list_empty(&curr_cgroup->children)) {
		/*
		 * Walk down to children
		 */
		cgroup = list_entry(curr_cgroup->children.next,
						struct cgroup, sibling);
		curr = mem_cgroup_from_cont(cgroup);
		goto done;
	}

visit_parent:
	if (curr_cgroup == root_cgroup) {
		/* caller handles NULL case */
		curr = NULL;
		goto done;
	}

	/*
	 * Goto next sibling
	 */
	if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
		cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
						sibling);
		curr = mem_cgroup_from_cont(cgroup);
		goto done;
	}

	/*
	 * Go up to next parent and next parent's sibling if need be
	 */
	curr_cgroup = curr_cgroup->parent;
	goto visit_parent;

done:
	return curr;
}
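/*
 * Illustrative walk of __mem_cgroup_get_next_node() (example hierarchy, not
 * from the source): with root -> {A -> {A1, A2}, B}, successive calls
 * starting from root visit A, A1, A2, B, and finally return NULL at root.
 */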

/*
 * Visit the first child (need not be the first child as per the ordering
 * of the cgroup list, since we track last_scanned_child) of @mem and use
 * that to reclaim free pages from.
 */
static struct mem_cgroup *
mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
{
	struct cgroup *cgroup;
	struct mem_cgroup *orig, *next;
	bool obsolete;

	/*
	 * Scan all children under the mem_cgroup mem
	 */
	mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);

	orig = root_mem->last_scanned_child;
	obsolete = mem_cgroup_is_obsolete(orig);

	if (list_empty(&root_mem->css.cgroup->children)) {
		/*
		 * root_mem might have had children before, and last_scanned_child
		 * may point to one of them. We put that reference later.
		 */
		if (orig)
			VM_BUG_ON(!obsolete);
		next = NULL;
		goto done;
	}

	if (!orig || obsolete) {
		cgroup = list_first_entry(&root_mem->css.cgroup->children,
				struct cgroup, sibling);
		next = mem_cgroup_from_cont(cgroup);
	} else
		next = __mem_cgroup_get_next_node(orig, root_mem);

done:
	if (next)
		mem_cgroup_get(next);
	root_mem->last_scanned_child = next;
	if (orig)
		mem_cgroup_put(orig);
	mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
	return (next) ? next : root_mem;
}
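/*
 * With swap accounting enabled, a group counts as "under limit" only when
 * both the memory and the mem+swap counters are below their limits;
 * otherwise the memory counter alone is checked.
 */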

static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
{
	if (do_swap_account) {
		if (res_counter_check_under_limit(&mem->res) &&
			res_counter_check_under_limit(&mem->memsw))
			return true;
	} else
		if (res_counter_check_under_limit(&mem->res))
			return true;
	return false;
}

static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;
	unsigned int swappiness;

	/* root ? */
	if (cgrp->parent == NULL)
		return vm_swappiness;

	spin_lock(&memcg->reclaim_param_lock);
	swappiness = memcg->swappiness;
	spin_unlock(&memcg->reclaim_param_lock);

	return swappiness;
}

/*
 * Dance down the hierarchy if needed to reclaim memory. We remember the
 * last child we reclaimed from, so that we don't end up penalizing
 * one child extensively based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						gfp_t gfp_mask, bool noswap)
{
	struct mem_cgroup *next_mem;
	int ret = 0;

	/*
	 * Reclaim unconditionally and don't check for return value.
	 * We need to reclaim in the current group and down the tree.
	 * One might think about checking for children before reclaiming,
	 * but there might be left over accounting, even after children
	 * have left.
	 */
	ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
					   get_swappiness(root_mem));
	if (mem_cgroup_check_under_limit(root_mem))
		return 1;	/* indicate reclaim has succeeded */
	if (!root_mem->use_hierarchy)
		return ret;

	next_mem = mem_cgroup_get_next_node(root_mem);

	while (next_mem != root_mem) {
		if (mem_cgroup_is_obsolete(next_mem)) {
			next_mem = mem_cgroup_get_next_node(root_mem);
			continue;
		}
		ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
						   get_swappiness(next_mem));
		if (mem_cgroup_check_under_limit(root_mem))
			return 1;	/* indicate reclaim has succeeded */
		next_mem = mem_cgroup_get_next_node(root_mem);
	}
	return ret;
}

bool mem_cgroup_oom_called(struct task_struct *task)
{
	bool ret = false;
	struct mem_cgroup *mem;
	struct mm_struct *mm;

	rcu_read_lock();
	mm = task->mm;
	if (!mm)
		mm = &init_mm;
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
		ret = true;
	rcu_read_unlock();
	return ret;
}
/*
 * Unlike the exported interface, an "oom" parameter is added. If oom == true,
 * the oom-killer can be invoked.
 */
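/*
 * Charge ordering, as implemented below: the res counter is charged first
 * and, when swap accounting is enabled, memsw second; a memsw failure rolls
 * the res charge back and retries reclaim with noswap set.  Reclaim is
 * attempted up to MEM_CGROUP_RECLAIM_RETRIES times before the OOM killer is
 * (optionally) invoked and -ENOMEM is returned.
 */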
static int __mem_cgroup_try_charge(struct mm_struct *mm,
			gfp_t gfp_mask, struct mem_cgroup **memcg,
			bool oom)
{
	struct mem_cgroup *mem, *mem_over_limit;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct res_counter *fail_res;

	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
		/* Don't account this! */
		*memcg = NULL;
		return 0;
	}

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	mem = *memcg;
	if (likely(!mem)) {
		mem = try_get_mem_cgroup_from_mm(mm);
		*memcg = mem;
	} else {
		css_get(&mem->css);
	}
	if (unlikely(!mem))
		return 0;

	VM_BUG_ON(mem_cgroup_is_obsolete(mem));

	while (1) {
		int ret;
		bool noswap = false;

		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
		if (likely(!ret)) {
			if (!do_swap_account)
				break;
			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
							&fail_res);
			if (likely(!ret))
				break;
			/* mem+swap counter fails */
			res_counter_uncharge(&mem->res, PAGE_SIZE);
			noswap = true;
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									memsw);
		} else
			/* mem counter fails */
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									res);

		if (!(gfp_mask & __GFP_WAIT))
			goto nomem;

		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
							noswap);
		if (ret)
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up
		 *
		 */
		if (mem_cgroup_check_under_limit(mem_over_limit))
			continue;

		if (!nr_retries--) {
			if (oom) {
				mutex_lock(&memcg_tasklist);
				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
				mutex_unlock(&memcg_tasklist);
				mem_over_limit->last_oom_jiffies = jiffies;
			}
			goto nomem;
		}
	}
	return 0;
nomem:
	css_put(&mem->css);
	return -ENOMEM;
}
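/*
 * Look up the memcg recorded for a swap-cache page's swap entry and take a
 * css reference on it; returns NULL if the page is not swap cache or the
 * group is already being torn down.
 */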

static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
{
	struct mem_cgroup *mem;
	swp_entry_t ent;

	if (!PageSwapCache(page))
		return NULL;

	ent.val = page_private(page);
	mem = lookup_swap_cgroup(ent);
	if (!mem)
		return NULL;
	if (!css_tryget(&mem->css))
		return NULL;
	return mem;
}

/*
 * commit a charge obtained by __mem_cgroup_try_charge() and set the page_cgroup
 * to the USED state. If it is already USED, uncharge and return.
 */

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				     struct page_cgroup *pc,
				     enum charge_type ctype)
{
	/* try_charge() can return NULL to *memcg, taking care of it. */
	if (!mem)
		return;

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		if (do_swap_account)
			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
		css_put(&mem->css);
		return;
	}
	pc->mem_cgroup = mem;
	smp_wmb();
	pc->flags = pcg_default_flags[ctype];

	mem_cgroup_charge_statistics(mem, pc, true);

	unlock_page_cgroup(pc);
}

/**
 * mem_cgroup_move_account - move account of the page
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm the following:
 * - page is not on LRU (isolate_page() is useful.)
 *
 * returns 0 on success,
 * returns -EBUSY when lock is busy or "pc" is unstable.
 *
 * This function does "uncharge" from old cgroup but doesn't do "charge" to
 * new cgroup. It should be done by a caller.
 */

static int mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to)
{
	struct mem_cgroup_per_zone *from_mz, *to_mz;
	int nid, zid;
	int ret = -EBUSY;

	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(pc->page));

	nid = page_cgroup_nid(pc);
	zid = page_cgroup_zid(pc);
	from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
	to_mz =  mem_cgroup_zoneinfo(to, nid, zid);

	if (!trylock_page_cgroup(pc))
		return ret;

	if (!PageCgroupUsed(pc))
		goto out;

	if (pc->mem_cgroup != from)
		goto out;

	res_counter_uncharge(&from->res, PAGE_SIZE);
	mem_cgroup_charge_statistics(from, pc, false);
	if (do_swap_account)
		res_counter_uncharge(&from->memsw, PAGE_SIZE);
	css_put(&from->css);

	css_get(&to->css);
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, pc, true);
	ret = 0;
out:
	unlock_page_cgroup(pc);
	return ret;
}

/*
 * move charges to its parent.
 */
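/*
 * The sequence below: charge the parent, pin the page, isolate it from the
 * LRU, move the accounting, then put the page back on the LRU; on failure
 * the parent's charge (and memsw charge, if enabled) is rolled back.
 */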

static int mem_cgroup_move_parent(struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct page *page = pc->page;
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	int ret;

	/* Is ROOT ? */
	if (!pcg)
		return -EINVAL;


	parent = mem_cgroup_from_cont(pcg);


	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
	if (ret || !parent)
		return ret;

	if (!get_page_unless_zero(page)) {
		ret = -EBUSY;
		goto uncharge;
	}

	ret = isolate_lru_page(page);

	if (ret)
		goto cancel;

	ret = mem_cgroup_move_account(pc, child, parent);

	putback_lru_page(page);
	if (!ret) {
		put_page(page);
		/* drop extra refcnt by try_charge() */
		css_put(&parent->css);
		return 0;
	}

cancel:
	put_page(page);
uncharge:
	/* drop extra refcnt by try_charge() */
	css_put(&parent->css);
	/* uncharge if move fails */
KAMEZAWA Hiroyuki's avatar
	if (do_swap_account)
		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1067
1068
1069
	return ret;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	int ret;

	pc = lookup_page_cgroup(page);
	/* can happen at boot */
	if (unlikely(!pc))
		return 0;
	prefetchw(pc);

	mem = memcg;
	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
	if (ret || !mem)
		return ret;

	__mem_cgroup_commit_charge(mem, pc, ctype);
	return 0;
}

int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has address_space.
	 * But page->mapping may have an out-of-use anon_vma pointer;
	 * detect it by a PageAnon() check. A newly-mapped anon page's page->mapping
	 * is NULL.
  	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	struct mem_cgroup *mem = NULL;
	int ret;

	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * Corner case handling. This is usually called from add_to_page_cache().
	 * But some filesystems (shmem) precharge this page before calling it
	 * and call add_to_page_cache() with GFP_NOWAIT.
	 *
	 * For GFP_NOWAIT case, the page may be pre-charged before calling
	 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
	 * charge twice. (It works but has to pay a bit larger cost.)
	 * And when the page is SwapCache, it should take swap information
	 * into account. This is under lock_page() now.
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;


		pc = lookup_page_cgroup(page);
		if (!pc)
			return 0;
		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc)) {
			unlock_page_cgroup(pc);
			return 0;
		}
		unlock_page_cgroup(pc);
	}

	if (do_swap_account && PageSwapCache(page)) {
		mem = try_get_mem_cgroup_from_swapcache(page);
		if (mem)
			mm = NULL;
		else
			mem = NULL;
		/* SwapCache may be still linked to LRU now. */
		mem_cgroup_lru_del_before_commit_swapcache(page);
	}

	if (unlikely(!mm && !mem))
		mm = &init_mm;

	if (page_is_file_cache(page))
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);

	ret = mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
	if (mem)
		css_put(&mem->css);
	if (PageSwapCache(page))
		mem_cgroup_lru_add_after_commit_swapcache(page);

	if (do_swap_account && !ret && PageSwapCache(page)) {
		swp_entry_t ent = {.val = page_private(page)};
		/* avoid double counting */
		mem = swap_cgroup_record(ent, NULL);
		if (mem) {
			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
			mem_cgroup_put(mem);
		}
	}
	return ret;
}

/*
 * While swap-in, try_charge -> commit or cancel, the page is locked.
 * And when try_charge() successfully returns, one refcnt to memcg without
 * struct page_cgroup is acquired. This refcnt will be consumed by
 * "commit()" or removed by "cancel()".
 */
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
				 struct page *page,
				 gfp_t mask, struct mem_cgroup **ptr)
{
	struct mem_cgroup *mem;
	int ret;

	if (mem_cgroup_disabled())
		return 0;

	if (!do_swap_account)
		goto charge_cur_mm;
	/*
	 * A racing thread's fault, or swapoff, may have already updated
	 * the pte, and even removed page from swap cache: return success
	 * to go on to do_swap_page()'s pte_same() test, which should fail.
	 */
	if (!PageSwapCache(page))
		return 0;
	mem = try_get_mem_cgroup_from_swapcache(page);
	if (!mem)
		goto charge_cur_mm;
	*ptr = mem;
	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
	/* drop extra refcnt from tryget */
	css_put(&mem->css);
	return ret;
charge_cur_mm:
	if (unlikely(!mm))
		mm = &init_mm;
	return __mem_cgroup_try_charge(mm, mask, ptr, true);
}

void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;
	if (!ptr)
		return;
	pc = lookup_page_cgroup(page);
	mem_cgroup_lru_del_before_commit_swapcache(page);
	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
	mem_cgroup_lru_add_after_commit_swapcache(page);
	/*
	 * Now swap is on-memory. This means this page may be
	 * counted both as mem and swap... a double count.
	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
	 * may call delete_from_swap_cache() before reach here.
	 */
	if (do_swap_account && PageSwapCache(page)) {