memcontrol.c 188 KB
Newer Older
1
2
3
4
5
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
6
7
8
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
9
10
11
12
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
13
14
15
16
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
17
18
19
20
21
22
23
24
25
26
27
28
29
30
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
31
#include <linux/mm.h>
32
#include <linux/hugetlb.h>
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
33
#include <linux/pagemap.h>
34
#include <linux/smp.h>
35
#include <linux/page-flags.h>
36
#include <linux/backing-dev.h>
37
38
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
39
#include <linux/limits.h>
40
#include <linux/export.h>
41
#include <linux/mutex.h>
42
#include <linux/rbtree.h>
43
#include <linux/slab.h>
44
#include <linux/swap.h>
45
#include <linux/swapops.h>
46
#include <linux/spinlock.h>
47
48
#include <linux/eventfd.h>
#include <linux/sort.h>
49
#include <linux/fs.h>
50
#include <linux/seq_file.h>
51
#include <linux/vmalloc.h>
52
#include <linux/vmpressure.h>
53
#include <linux/mm_inline.h>
54
#include <linux/page_cgroup.h>
55
#include <linux/cpu.h>
56
#include <linux/oom.h>
57
#include <linux/lockdep.h>
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
58
#include "internal.h"
Glauber Costa's avatar
Glauber Costa committed
59
#include <net/sock.h>
Michal Hocko's avatar
Michal Hocko committed
60
#include <net/ip.h>
Glauber Costa's avatar
Glauber Costa committed
61
#include <net/tcp_memcontrol.h>
62
#include "slab.h"
63

64
65
#include <asm/uaccess.h>

66
67
#include <trace/events/vmscan.h>

68
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
69
70
EXPORT_SYMBOL(mem_cgroup_subsys);

71
#define MEM_CGROUP_RECLAIM_RETRIES	5
72
static struct mem_cgroup *root_mem_cgroup __read_mostly;
73

Andrew Morton's avatar
Andrew Morton committed
74
#ifdef CONFIG_MEMCG_SWAP
Li Zefan's avatar
Li Zefan committed
75
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
76
int do_swap_account __read_mostly;
77
78

/* Used to remember the boot option */
Andrew Morton's avatar
Andrew Morton committed
79
#ifdef CONFIG_MEMCG_SWAP_ENABLED
80
81
82
83
84
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

85
#else
86
#define do_swap_account		0
87
88
89
#endif


90
91
92
static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
93
	"rss_huge",
94
	"mapped_file",
95
	"writeback",
96
97
98
	"swap",
};

99
100
101
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
102
103
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
104
105
	MEM_CGROUP_EVENTS_NSTATS,
};
106
107
108
109
110
111
112
113

/*
 * Human-readable names for the memcg event counters. The entries are listed
 * in the same order as enum mem_cgroup_events_index (PGPGIN, PGPGOUT,
 * PGFAULT, PGMAJFAULT).
 * NOTE(review): presumably indexed by that enum -- keep both in sync and
 * confirm against the users of this table.
 */
static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

114
115
116
117
118
119
120
121
/*
 * Human-readable names for the LRU lists.
 * NOTE(review): presumably indexed by enum lru_list, so the order here would
 * have to match that enum -- confirm against the users of this table.
 */
static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

122
123
124
125
126
127
128
129
/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used to
 * trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
130
	MEM_CGROUP_TARGET_SOFTLIMIT,
131
	MEM_CGROUP_TARGET_NUMAINFO,
132
133
	MEM_CGROUP_NTARGETS,
};
134
135
136
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024
137

138
struct mem_cgroup_stat_cpu {
139
	long count[MEM_CGROUP_STAT_NSTATS];
140
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
141
	unsigned long nr_page_events;
142
	unsigned long targets[MEM_CGROUP_NTARGETS];
143
144
};

145
struct mem_cgroup_reclaim_iter {
Michal Hocko's avatar
Michal Hocko committed
146
147
148
149
	/*
	 * last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
150
	struct mem_cgroup *last_visited;
Michal Hocko's avatar
Michal Hocko committed
151
152
	unsigned long last_dead_count;

153
154
155
156
	/* scan generation, increased every round-trip */
	unsigned int generation;
};

157
158
159
160
/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
161
	struct lruvec		lruvec;
162
	unsigned long		lru_size[NR_LRU_LISTS];
KOSAKI Motohiro's avatar
KOSAKI Motohiro committed
163

164
165
	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

166
167
168
169
	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
170
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
171
						/* use container_of	   */
172
173
174
175
176
177
};

/*
 * per-node information in memory controller: one mem_cgroup_per_zone slot
 * for each of the MAX_NR_ZONES possible zones of the node.
 */
struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

/*
 * Soft-limit tree for a single zone: the RB-tree root plus the spinlock that
 * the tree helpers in this file take around insertions and removals.
 */
struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

/* Per-node collection of soft-limit trees, one for each possible zone. */
struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

/*
 * Top level of the soft-limit tree: one pointer to a per-node collection for
 * each of the MAX_NUMNODES possible nodes.
 */
struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

198
199
200
201
202
/*
 * One threshold entry: an eventfd context paired with a threshold value.
 * NOTE(review): presumably the eventfd is signalled when usage crosses
 * @threshold -- confirm against mem_cgroup_threshold().
 */
struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
203
/* For threshold */
204
struct mem_cgroup_threshold_ary {
205
	/* An array index points to threshold just below or equal to usage. */
206
	int current_threshold;
207
208
209
210
211
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};
212
213
214
215
216
217
218
219
220
221
222
223

/*
 * A set of thresholds: the array currently in use plus a preallocated spare
 * that lets unregistration proceed without allocating memory.
 */
struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
224
225
226
227
228
/* for OOM */
/*
 * List node carrying one eventfd context.
 * NOTE(review): presumably linked on mem_cgroup.oom_notify -- confirm
 * against the OOM notifier registration code.
 */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};
229

230
231
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
232

233
234
235
236
237
238
239
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
240
241
242
 * we hit the water mark. May be even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark, this is
 * a feature that will be implemented much later in the future.
243
244
245
246
247
248
249
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
250

251
252
253
	/* vmpressure notifications */
	struct vmpressure vmpressure;

254
255
256
257
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
258

259
260
261
262
	/*
	 * the counter to account for kernel memory usage.
	 */
	struct res_counter kmem;
263
264
265
266
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
267
	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
268
269
270

	bool		oom_lock;
	atomic_t	under_oom;
271
	atomic_t	oom_wakeups;
272

273
	int	swappiness;
274
275
	/* OOM-Killer disable */
	int		oom_kill_disable;
KOSAKI Motohiro's avatar
KOSAKI Motohiro committed
276

277
278
279
	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

280
281
282
283
	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
284
	struct mem_cgroup_thresholds thresholds;
285

286
	/* thresholds for mem+swap usage. RCU-protected */
287
	struct mem_cgroup_thresholds memsw_thresholds;
288

KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
289
290
	/* For oom notifier event fd */
	struct list_head oom_notify;
291

292
293
294
295
	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
Andrew Morton's avatar
Andrew Morton committed
296
	unsigned long move_charge_at_immigrate;
297
298
299
300
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t	moving_account;
301
302
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;
303
	/*
304
	 * percpu counter.
305
	 */
306
	struct mem_cgroup_stat_cpu __percpu *stat;
307
308
309
310
311
312
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;
Glauber Costa's avatar
Glauber Costa committed
313

Michal Hocko's avatar
Michal Hocko committed
314
	atomic_t	dead_count;
Michal Hocko's avatar
Michal Hocko committed
315
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
316
	struct cg_proto tcp_mem;
Glauber Costa's avatar
Glauber Costa committed
317
#endif
318
319
320
321
322
323
324
325
#if defined(CONFIG_MEMCG_KMEM)
	/* analogous to slab_common's slab_caches list. per-memcg */
	struct list_head memcg_slab_caches;
	/* Not a spinlock, we can take a lot of time walking the list */
	struct mutex slab_caches_mutex;
        /* Index in the kmem_cache->memcg_params->memcg_caches array */
	int kmemcg_id;
#endif
326
327
328
329
330
331
332

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif
333

334
335
	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
336
337
};

338
339
340
static size_t memcg_size(void)
{
	return sizeof(struct mem_cgroup) +
341
		nr_node_ids * sizeof(struct mem_cgroup_per_node *);
342
343
}

344
345
346
/* internal only representation about the status of kmem accounting. */
enum {
	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
347
	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
348
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
349
350
};

351
352
353
/* We account when limit is on, but only after call sites are patched */
#define KMEM_ACCOUNTED_MASK \
		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
354
355
356
357
358
359

#ifdef CONFIG_MEMCG_KMEM
/* Set the KMEM_ACCOUNTED_ACTIVE bit in @memcg's kmem_account_flags. */
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
360
361
362
363
364
365

/* Return true if KMEM_ACCOUNTED_ACTIVE is set in @memcg's kmem_account_flags. */
static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

366
367
368
369
370
/* Set the KMEM_ACCOUNTED_ACTIVATED bit in @memcg's kmem_account_flags. */
static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}

371
372
373
374
375
/* Clear the KMEM_ACCOUNTED_ACTIVATED bit in @memcg's kmem_account_flags. */
static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
{
	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}

376
377
static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
378
379
380
381
382
	/*
	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
	 * will call css_put() if it sees the memcg is dead.
	 */
	smp_wmb();
383
384
385
386
387
388
389
390
391
	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
}

/*
 * Atomically clear KMEM_ACCOUNTED_DEAD in @memcg's kmem_account_flags and
 * return whether the bit was set before the call.
 */
static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
{
	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
				  &memcg->kmem_account_flags);
}
392
393
#endif

394
395
/* Stuff for moving charges at task migration. */
/*
396
397
 * Types of charges to be moved. "move_charge_at_immigrate" and
 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
398
399
 */
enum move_type {
400
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
401
	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
402
403
404
	NR_MOVE_TYPE,
};

405
406
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
407
	spinlock_t	  lock; /* for from, to */
408
409
	struct mem_cgroup *from;
	struct mem_cgroup *to;
410
	unsigned long immigrate_flags;
411
	unsigned long precharge;
412
	unsigned long moved_charge;
413
	unsigned long moved_swap;
414
415
416
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
417
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
418
419
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
420

421
422
static bool move_anon(void)
{
423
	return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
424
425
}

426
427
static bool move_file(void)
{
428
	return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
429
430
}

431
432
433
434
/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
435
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
436
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
437

438
439
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
440
	MEM_CGROUP_CHARGE_TYPE_ANON,
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
441
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
442
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
443
444
445
	NR_CHARGE_TYPE,
};

446
/* for encoding cft->private value on file */
447
448
449
450
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
451
	_KMEM,
452
453
};

454
455
#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
456
#define MEMFILE_ATTR(val)	((val) & 0xffff)
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
457
458
/* Used for OOM notifier */
#define OOM_CONTROL		(0)
459

460
461
462
463
464
465
466
467
/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)

468
469
470
471
472
473
474
/*
 * The memcg_create_mutex will be held whenever a new cgroup is created.
 * As a consequence, any change that needs to protect against new child cgroups
 * appearing has to hold it as well.
 */
static DEFINE_MUTEX(memcg_create_mutex);

475
476
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
477
	return s ? container_of(s, struct mem_cgroup, css) : NULL;
478
479
}

480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
/* Some nice accessors for the vmpressure. */
/*
 * Return the vmpressure object embedded in @memcg; a NULL @memcg selects
 * the root memory cgroup's object instead.
 */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	struct mem_cgroup *target = memcg ? memcg : root_mem_cgroup;

	return &target->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
{
	return &mem_cgroup_from_css(css)->vmpressure;
}

498
499
500
501
502
/* Return true if @memcg is the root memory cgroup (root_mem_cgroup). */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return (memcg == root_mem_cgroup);
}

503
504
505
506
507
508
/*
 * We restrict the id in the range of [1, 65535], so it can fit into
 * an unsigned short.
 */
#define MEM_CGROUP_ID_MAX	USHRT_MAX

Li Zefan's avatar
Li Zefan committed
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
/*
 * Return the memcg ID of @memcg, derived from the ID of its cgroup.
 */
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
	int cgrp_id = memcg->css.cgroup->id;

	/*
	 * The root cgroup has ID 0, but memcg uses 0 to mean "invalid",
	 * so memcg IDs are the cgroup ID shifted up by one.
	 */
	return cgrp_id + 1;
}

/*
 * Look up a mem_cgroup by memcg ID. The ID is converted back to a cgroup ID
 * (memcg ID - 1) before the css lookup; the result is whatever
 * mem_cgroup_from_css() makes of the css that css_from_id() returns.
 */
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
	return mem_cgroup_from_css(css_from_id(id - 1, &mem_cgroup_subsys));
}

Glauber Costa's avatar
Glauber Costa committed
526
/* Writing them here to avoid exposing memcg's inner layout */
Michal Hocko's avatar
Michal Hocko committed
527
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
Glauber Costa's avatar
Glauber Costa committed
528
529
530

void sock_update_memcg(struct sock *sk)
{
531
	if (mem_cgroup_sockets_enabled) {
Glauber Costa's avatar
Glauber Costa committed
532
		struct mem_cgroup *memcg;
533
		struct cg_proto *cg_proto;
Glauber Costa's avatar
Glauber Costa committed
534
535
536

		BUG_ON(!sk->sk_prot->proto_cgroup);

537
538
539
540
541
542
543
544
545
546
		/* Socket cloning can throw us here with sk_cgrp already
		 * filled. It won't however, necessarily happen from
		 * process context. So the test for root memcg given
		 * the current task's memcg won't help us in this case.
		 *
		 * Respecting the original socket's memcg is a better
		 * decision in this case.
		 */
		if (sk->sk_cgrp) {
			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
547
			css_get(&sk->sk_cgrp->memcg->css);
548
549
550
			return;
		}

Glauber Costa's avatar
Glauber Costa committed
551
552
		rcu_read_lock();
		memcg = mem_cgroup_from_task(current);
553
		cg_proto = sk->sk_prot->proto_cgroup(memcg);
554
555
		if (!mem_cgroup_is_root(memcg) &&
		    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
556
			sk->sk_cgrp = cg_proto;
Glauber Costa's avatar
Glauber Costa committed
557
558
559
560
561
562
563
564
		}
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(sock_update_memcg);

void sock_release_memcg(struct sock *sk)
{
565
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
Glauber Costa's avatar
Glauber Costa committed
566
567
568
		struct mem_cgroup *memcg;
		WARN_ON(!sk->sk_cgrp->memcg);
		memcg = sk->sk_cgrp->memcg;
569
		css_put(&sk->sk_cgrp->memcg->css);
Glauber Costa's avatar
Glauber Costa committed
570
571
	}
}
Glauber Costa's avatar
Glauber Costa committed
572
573
574
575
576
577

struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
{
	if (!memcg || mem_cgroup_is_root(memcg))
		return NULL;

578
	return &memcg->tcp_mem;
Glauber Costa's avatar
Glauber Costa committed
579
580
}
EXPORT_SYMBOL(tcp_proto_cgroup);
Glauber Costa's avatar
Glauber Costa committed
581

582
583
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
584
	if (!memcg_proto_activated(&memcg->tcp_mem))
585
586
587
588
589
590
591
592
593
		return;
	static_key_slow_dec(&memcg_socket_limit_enabled);
}
#else
/*
 * No-op stub used when the CONFIG_INET && CONFIG_MEMCG_KMEM variant is
 * compiled out, so callers need no #ifdef of their own.
 */
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
}
#endif

594
#ifdef CONFIG_MEMCG_KMEM
595
596
/*
 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
Li Zefan's avatar
Li Zefan committed
597
598
599
600
601
 * The main reason for not using cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few kmem-limited. Or also, if we have, for instance, 200
 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *  200 entry array for that.
602
603
604
605
606
607
 *
 * The current size of the caches array is stored in
 * memcg_limited_groups_array_size.  It will double each time we have to
 * increase it.
 */
static DEFINE_IDA(kmem_limited_groups);
608
609
int memcg_limited_groups_array_size;

610
611
612
613
614
615
/*
 * MIN_SIZE is different than 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
Li Zefan's avatar
Li Zefan committed
616
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
617
618
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
Li Zefan's avatar
Li Zefan committed
619
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
620
621
622
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
Li Zefan's avatar
Li Zefan committed
623
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
624

625
626
627
628
629
630
/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional on this static branch, we have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
631
struct static_key memcg_kmem_enabled_key;
632
EXPORT_SYMBOL(memcg_kmem_enabled_key);
633
634
635

static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
636
	if (memcg_kmem_is_active(memcg)) {
637
		static_key_slow_dec(&memcg_kmem_enabled_key);
638
639
		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
	}
640
641
642
643
644
	/*
	 * This check can't live in kmem destruction function,
	 * since the charges will outlive the cgroup
	 */
	WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
645
646
647
648
649
650
651
652
653
654
655
656
657
}
#else
/*
 * No-op stub used when CONFIG_MEMCG_KMEM is not set, so callers need no
 * #ifdef of their own.
 */
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */

/*
 * Run both static-key teardown helpers for @memcg: the socket one first,
 * then the kmem one. Either may be an empty stub depending on the kernel
 * configuration.
 */
static void disarm_static_keys(struct mem_cgroup *memcg)
{
	disarm_sock_keys(memcg);
	disarm_kmem_keys(memcg);
}

658
static void drain_all_stock_async(struct mem_cgroup *memcg);
659

660
static struct mem_cgroup_per_zone *
661
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
662
{
663
	VM_BUG_ON((unsigned)nid >= nr_node_ids);
664
	return &memcg->nodeinfo[nid]->zoneinfo[zid];
665
666
}

667
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
668
{
669
	return &memcg->css;
670
671
}

672
/*
 * Return @memcg's per-zone information for the node and zone that @page
 * belongs to.
 */
static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	return mem_cgroup_zoneinfo(memcg, page_to_nid(page),
				   page_zonenum(page));
}

681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

/* Return the soft-limit tree for the node and zone that @page belongs to. */
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return soft_limit_tree_node_zone(nid, zid);
}

/*
 * Link @mz into the soft-limit tree @mctz, keyed by @new_usage_in_excess.
 *
 * Nothing happens if @mz is already on the tree. Otherwise the excess is
 * stored in mz->usage_in_excess; a zero excess is recorded but the node is
 * not inserted. No locking is done here. @memcg is not used.
 */
static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **link = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (new_usage_in_excess == 0)
		return;

	while (*link) {
		struct mem_cgroup_per_zone *cur;

		parent = *link;
		cur = rb_entry(parent, struct mem_cgroup_per_zone, tree_node);
		/*
		 * Several groups may exceed their soft limit by the same
		 * amount, so equal keys are unavoidable: they descend to
		 * the right, strictly smaller keys to the left.
		 */
		if (mz->usage_in_excess < cur->usage_in_excess)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	rb_link_node(&mz->tree_node, parent, link);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

/*
 * Unlink @mz from the soft-limit tree @mctz if it is currently on it.
 * No locking is done here. @memcg is not used.
 */
static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (mz->on_tree) {
		rb_erase(&mz->tree_node, &mctz->rb_root);
		mz->on_tree = false;
	}
}

/*
 * Locked variant of __mem_cgroup_remove_exceeded(): takes mctz->lock around
 * the removal of @mz from the soft-limit tree @mctz.
 */
static void
mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
	spin_unlock(&mctz->lock);
}


/*
 * Refresh the position of @memcg and each of its ancestors in the soft-limit
 * tree of the node/zone that @page belongs to.
 *
 * For every group on the way up, the per-zone node is re-inserted with the
 * current soft-limit excess if the group is over its soft limit or is
 * already on the tree; a group whose excess dropped to zero is simply
 * removed.
 */
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
	struct mem_cgroup_tree_per_zone *mctz = soft_limit_tree_from_page(page);
	struct mem_cgroup *iter;

	/*
	 * Ancestors have to be updated as well when hierarchy is used,
	 * because their own event counters are not touched.
	 */
	for (iter = memcg; iter; iter = parent_mem_cgroup(iter)) {
		struct mem_cgroup_per_zone *mz;
		unsigned long long excess;

		mz = mem_cgroup_zoneinfo(iter, nid, zid);
		excess = res_counter_soft_limit_excess(&iter->res);

		/* Neither over the soft limit nor on the tree: nothing to do. */
		if (!excess && !mz->on_tree)
			continue;

		spin_lock(&mctz->lock);
		/* Take it off the tree first if it is there ... */
		if (mz->on_tree)
			__mem_cgroup_remove_exceeded(iter, mz, mctz);
		/*
		 * ... then put it back, which refreshes mz->usage_in_excess.
		 * With a zero excess no tree operation takes place.
		 */
		__mem_cgroup_insert_exceeded(iter, mz, mctz, excess);
		spin_unlock(&mctz->lock);
	}
}

/*
 * Remove @memcg's per-zone nodes from the soft-limit trees of every zone of
 * every node.
 */
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	int nid;

	for_each_node(nid) {
		int zid;

		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			struct mem_cgroup_per_zone *mz;
			struct mem_cgroup_tree_per_zone *mctz;

			mz = mem_cgroup_zoneinfo(memcg, nid, zid);
			mctz = soft_limit_tree_node_zone(nid, zid);
			mem_cgroup_remove_exceeded(memcg, mz, mctz);
		}
	}
}

/*
 * Pick the rightmost (largest-excess) node of the soft-limit tree @mctz.
 *
 * Each candidate is taken off the tree. It is returned only if its memcg
 * still exceeds its soft limit and a css reference could be obtained with
 * css_tryget(); otherwise the next rightmost node is tried. Returns NULL
 * once the tree is empty. No locking is done here.
 */
static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	for (;;) {
		struct rb_node *rightmost = rb_last(&mctz->rb_root);
		struct mem_cgroup_per_zone *mz;

		if (!rightmost)
			return NULL;	/* Nothing to reclaim from */

		mz = rb_entry(rightmost, struct mem_cgroup_per_zone,
			      tree_node);
		/*
		 * Take the node off the tree now; somebody else may add it
		 * back, and it is re-added at its correct position at the
		 * end of reclaim.
		 */
		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
		if (res_counter_soft_limit_excess(&mz->memcg->res) &&
		    css_tryget(&mz->memcg->css))
			return mz;
	}
}

/*
 * Locked variant of __mem_cgroup_largest_soft_limit_node(): takes mctz->lock
 * around the lookup and returns whatever the unlocked helper returned.
 */
static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter have a threshold and do periodic
 * synchronization to implement "quick" read. There is a trade-off between
 * reading cost and precision of value. Then, we may have a chance to implement
 * a periodic synchronization of counter in memcg's counter.
 *
 * But this _read() function is used for user interface now. The user accounts
 * memory usage by memory cgroup and he _always_ requires exact value because
 * he accounts memory. Even if we provide quick-and-fuzzy read, we always
 * have to visit all online cpus and make sum. So, for now, unnecessary
 * synchronization is not implemented. (just implemented for cpu hotplug)
 *
 * If there are kernel internal actions which can make use of some not-exact
 * value, and reading all cpu values can be a performance bottleneck in some
 * common workload, threshold and synchronization as in vmstat[] should be
 * implemented.
 */
858
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
859
				 enum mem_cgroup_stat_index idx)
860
{
861
	long val = 0;
862
863
	int cpu;

864
865
	get_online_cpus();
	for_each_online_cpu(cpu)
866
		val += per_cpu(memcg->stat->count[idx], cpu);
867
#ifdef CONFIG_HOTPLUG_CPU
868
869
870
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
871
872
#endif
	put_online_cpus();
873
874
875
	return val;
}

876
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
877
878
879
					 bool charge)
{
	int val = (charge) ? 1 : -1;
880
	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
881
882
}

883
static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
884
885
886
887
888
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

889
	get_online_cpus();
890
	for_each_online_cpu(cpu)
891
		val += per_cpu(memcg->stat->events[idx], cpu);
892
#ifdef CONFIG_HOTPLUG_CPU
893
894
895
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
896
#endif
897
	put_online_cpus();
898
899
900
	return val;
}

901
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
902
					 struct page *page,
903
					 bool anon, int nr_pages)
904
{
905
906
	preempt_disable();

907
908
909
910
911
912
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (anon)
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
913
				nr_pages);
914
	else
915
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
916
				nr_pages);
917

918
919
920
921
	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);

922
923
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
924
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
925
	else {
926
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
927
928
		nr_pages = -nr_pages; /* for event */
	}
929

930
	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
931

932
	preempt_enable();
933
934
}

935
unsigned long
936
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
937
938
939
940
941
942
943
944
{
	struct mem_cgroup_per_zone *mz;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	return mz->lru_size[lru];
}

static unsigned long
945
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
946
			unsigned int lru_mask)
947
948
{
	struct mem_cgroup_per_zone *mz;
Hugh Dickins's avatar
Hugh Dickins committed
949
	enum lru_list lru;
950
951
	unsigned long ret = 0;

952
	mz = mem_cgroup_zoneinfo(memcg, nid, zid);
953

Hugh Dickins's avatar
Hugh Dickins committed
954
955
956
	for_each_lru(lru) {
		if (BIT(lru) & lru_mask)
			ret += mz->lru_size[lru];
957
958
959
960
961
	}
	return ret;
}

static unsigned long
962
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
963
964
			int nid, unsigned int lru_mask)
{
965
966
967
	u64 total = 0;
	int zid;

968
	for (zid = 0; zid < MAX_NR_ZONES; zid++)
969
970
		total += mem_cgroup_zone_nr_lru_pages(memcg,
						nid, zid, lru_mask);
971

972
973
	return total;
}
974

975
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
976
			unsigned int lru_mask)
977
{
978
	int nid;
979
980
	u64 total = 0;

981
	for_each_node_state(nid, N_MEMORY)
982
		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
983
	return total;
984
985
}

986
987
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
988
989
990
{
	unsigned long val, next;

991
	val = __this_cpu_read(memcg->stat->nr_page_events);
992
	next = __this_cpu_read(memcg->stat->targets[target]);
993
	/* from time_after() in jiffies.h */
994
995
996
997
998
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
999
1000
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;