/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
#include "slab.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"dirty",
	"writeback",
	"swap",
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

/*
 * Cgroups above their limits are maintained in an RB-tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or when the cgroup is removed.  This callback must be
	 * set if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)
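
/*
 * Illustrative sketch (not part of the original source): MEMFILE_PRIVATE()
 * packs a res_type into the high 16 bits of cft->private and an attribute
 * into the low 16 bits, which the other two macros recover, e.g.:
 *
 *	priv = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL);
 *	MEMFILE_TYPE(priv);	returns _OOM_TYPE
 *	MEMFILE_ATTR(priv);	returns OOM_CONTROL
 */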

/*
 * The memcg_create_mutex will be held whenever a new cgroup is created.
 * As a consequence, any change that needs to protect against new child cgroups
 * appearing has to hold it as well.
 */
static DEFINE_MUTEX(memcg_create_mutex);

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return (memcg == root_mem_cgroup);
}

/*
 * We restrict the id in the range of [1, 65535], so it can fit into
 * an unsigned short.
 */
#define MEM_CGROUP_ID_MAX	USHRT_MAX

static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
	return memcg->css.id;
}

/*
 * A helper function to get mem_cgroup from ID. Must be called under
 * rcu_read_lock().  The caller is responsible for calling
 * css_tryget_online() if the mem_cgroup is used for charging. (dropping
 * refcnt from swap can be called against removed memcg.)
 */
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
	struct cgroup_subsys_state *css;

	css = css_from_id(id, &memory_cgrp_subsys);
	return mem_cgroup_from_css(css);
}
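
/*
 * Illustrative usage (a sketch, not from the original source): a typical
 * lookup runs under RCU and upgrades to a real reference only when the
 * memcg is actually used for charging:
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	if (memcg && !css_tryget_online(&memcg->css))
 *		memcg = NULL;	(memcg was removed, treat as a miss)
 *	rcu_read_unlock();
 */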

/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)

void sock_update_memcg(struct sock *sk)
{
	if (mem_cgroup_sockets_enabled) {
		struct mem_cgroup *memcg;
		struct cg_proto *cg_proto;

		BUG_ON(!sk->sk_prot->proto_cgroup);

		/* Socket cloning can throw us here with sk_cgrp already
		 * filled. It won't however, necessarily happen from
		 * process context. So the test for root memcg given
		 * the current task's memcg won't help us in this case.
		 *
		 * Respecting the original socket's memcg is a better
		 * decision in this case.
		 */
		if (sk->sk_cgrp) {
			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
			css_get(&sk->sk_cgrp->memcg->css);
			return;
		}

		rcu_read_lock();
		memcg = mem_cgroup_from_task(current);
		cg_proto = sk->sk_prot->proto_cgroup(memcg);
		if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
		    css_tryget_online(&memcg->css)) {
			sk->sk_cgrp = cg_proto;
		}
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(sock_update_memcg);

void sock_release_memcg(struct sock *sk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
		struct mem_cgroup *memcg;
		WARN_ON(!sk->sk_cgrp->memcg);
		memcg = sk->sk_cgrp->memcg;
		css_put(&sk->sk_cgrp->memcg->css);
	}
}

struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
{
	if (!memcg || mem_cgroup_is_root(memcg))
		return NULL;

	return &memcg->tcp_mem;
}
EXPORT_SYMBOL(tcp_proto_cgroup);

#endif

#ifdef CONFIG_MEMCG_KMEM
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using the cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few are kmem-limited. Also, if we have, for instance, 200
 *  memcgs and only the 200th is kmem-limited, we would need a 200-entry
 *  array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}
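
/*
 * Illustrative usage (a sketch, not from the original source): readers
 * bracket their accesses with the semaphore above so that
 * memcg_nr_cache_ids cannot change underneath them:
 *
 *	memcg_get_cache_ids();
 *	for (i = 0; i < memcg_nr_cache_ids; i++)
 *		...inspect per-memcg cache slot i...
 *	memcg_put_cache_ids();
 */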

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);

#endif /* CONFIG_MEMCG_KMEM */

static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);

	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 *
 * XXX: The above description of behavior on the default hierarchy isn't
 * strictly true yet as replace_page_cache_page() can modify the
 * association before @page is released even on the default hierarchy;
 * however, the current and planned usages don't mix the two functions
 * and replace_page_cache_page() will soon be updated to make the invariant
 * actually true.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();

	memcg = page->mem_cgroup;

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	rcu_read_unlock();
	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = READ_ONCE(page->mem_cgroup);
	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}
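
/*
 * Illustrative usage (a sketch, not from the original source): a procfs
 * style reader can map a page to a cgroup inode without pinning anything,
 * accepting the documented raciness:
 *
 *	ino_t ino = page_cgroup_ino(page);
 *	if (ino)
 *		...report ino to userspace...
 */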

static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
					 struct mem_cgroup_tree_per_zone *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
					 struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
				       struct mem_cgroup_tree_per_zone *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	mctz = soft_limit_tree_from_page(page);
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counters are not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_zoneinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_zone *mctz;
	struct mem_cgroup_per_zone *mz;
	int nid, zid;

	for_each_node(nid) {
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
			mctz = soft_limit_tree_node_zone(nid, zid);
			mem_cgroup_remove_exceeded(mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * Return the page count for a single (non-recursive) @memcg.
 *
 * Implementation note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter use thresholds and do periodic
 * synchronization to implement "quick" reads. There is a trade-off between
 * reading cost and precision of the value, so we may have a chance to
 * implement a similar periodic synchronization of the counters in memcg.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact
 * value, because the numbers are used for accounting. Even if we provided
 * a quick-and-fuzzy read, we would always have to visit all online cpus
 * and make a sum. So, for now, the unnecessary synchronization is not
 * implemented (it is just implemented for cpu hotplug).
 *
 * If there are kernel-internal actions which can make use of a not-exact
 * value, and reading all cpu values can be a performance bottleneck in
 * some common workload, thresholds and synchronization as in vmstat[]
 * should be implemented.
 */
static unsigned long
mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	/* Per-cpu values can be negative, use a signed accumulator */
	for_each_possible_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
	/*
	 * Summing races with updates, so val may be negative.  Avoid exposing
	 * transient negative values.
	 */
	if (val < 0)
		val = 0;
	return val;
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}

static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
						  int nid,
						  unsigned int lru_mask)
{
	unsigned long nr = 0;
	int zid;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct mem_cgroup_per_zone *mz;
		enum lru_list lru;

		for_each_lru(lru) {
			if (!(BIT(lru) & lru_mask))
				continue;
			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
			nr += mz->lru_size[lru];
		}
	}
	return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	unsigned long nr = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return nr;
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}
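
/*
 * Illustrative note (not from the original source): the signed difference
 * above is wrap-around safe in the style of time_after(). For example,
 * with 32-bit longs, if nr_page_events has wrapped to 0x00000010 while
 * the target is still 0xfffffff0, then
 *
 *	(long)next - (long)val == (long)0xffffffe0 == -32 < 0
 *
 * so the target is still correctly treated as reached.
 */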

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget_online(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
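
/*
 * Illustrative usage (a sketch, not from the original source): the css
 * reference returned here must be dropped by the caller once the charge
 * is done:
 *
 *	memcg = get_mem_cgroup_from_mm(mm);
 *	...charge against memcg...
 *	css_put(&memcg->css);
 */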

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_zone *mz;

		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
		iter = &mz->iter[reclaim->priority];

		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		do {
			pos = READ_ONCE(iter->position);
			/*
			 * A racing update may change the position and
			 * put the last reference, hence css_tryget(),
			 * or retry to see the updated position.
			 */
		} while (pos && !css_tryget(&pos->css));
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css)) {
			/*
			 * Make sure the memcg is initialized:
			 * mem_cgroup_css_online() orders the
			 * initialization against setting the flag.
			 */
			if (smp_load_acquire(&memcg->initialized))
				break;

			css_put(css);
		}

		memcg = NULL;
	}

	if (reclaim) {
		if (cmpxchg(&iter->position, pos, memcg) == pos) {
			if (memcg)
				css_get(&memcg->css);
			if (pos)
				css_put(&pos->css);
		}

		/*
		 * pairs with css_tryget when dereferencing iter->position
		 * above.
		 */
		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
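
/*
 * Illustrative usage (a sketch, not from the original source): a full,
 * reference-counted walk over a subtree; on early exit the walk must be
 * cancelled with mem_cgroup_iter_break() (should_stop() is hypothetical):
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, root) {
 *		if (should_stop(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *		...visit iter...
 *	}
 */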

/**
 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
 * @zone: zone of the wanted lruvec
 * @memcg: memcg of the wanted lruvec
 *
 * Returns the lru list vector holding pages for the given @zone and
 * @memcg.  This can be the global zone lruvec, if the memory controller
 * is disabled.
 */
struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
				      struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_zone *mz;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	mz = mem_cgroup_zone_zoneinfo(memcg, zone);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @zone: zone of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	memcg = page->mem_cgroup;
	/*
	 * Swapcache readahead pages are added to the LRU - and
	 * possibly migrated - before they are charged.
	 */
	if (!memcg)
		memcg = root_mem_cgroup;

	mz = mem_cgroup_page_zoneinfo(memcg, page);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}
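
/*
 * Illustrative usage (a sketch, not from the original source): the
 * isolation protocol referred to above, as a reclaim-style caller would
 * follow it:
 *
 *	spin_lock_irq(&zone->lru_lock);
 *	lruvec = mem_cgroup_page_lruvec(page, zone);
 *	...isolate or put back the PageLRU() page via lruvec...
 *	spin_unlock_irq(&zone->lru_lock);
 */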

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called when a page is added to or removed from an
 * lru list.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int nr_pages)
{
	struct mem_cgroup_per_zone *mz;
	unsigned long *lru_size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	lru_size = mz->lru_size + lru;
	*lru_size += nr_pages;
	VM_BUG_ON((long)(*lru_size) < 0);
}
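
/*
 * Illustrative usage (a sketch, not from the original source): callers
 * pass the number of base pages with the sign of the operation, e.g.
 * when adding or removing a (possibly huge) page:
 *
 *	mem_cgroup_update_lru_size(lruvec, lru, hpage_nr_pages(page));
 *	...
 *	mem_cgroup_update_lru_size(lruvec, lru, -hpage_nr_pages(page));
 */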

bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
	struct mem_cgroup *task_memcg;
	struct task_struct *p;
	bool ret;

	p = find_lock_task_mm(task);
	if (p) {
		task_memcg = get_mem_cgroup_from_mm(p->mm);
		task_unlock(p);
	} else {
		/*
		 * All threads may have already detached their mm's, but the oom
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
		 */
		rcu_read_lock();
		task_memcg = mem_cgroup_from_task(task);
		css_get(&task_memcg->css);
		rcu_read_unlock();
	}
	ret = mem_cgroup_is_descendant(task_memcg, memcg);
	css_put(&task_memcg->css);