/*
 *  Generic process-grouping system.
 *
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
 *  Notifications support
 *  Copyright (C) 2009 Nokia Corporation
 *  Author: Kirill A. Shutemov
 *
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  ---------------------------------------------------
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cgroup.h>
#include <linux/cred.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/magic.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hashtable.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <net/sock.h>

/*
 * pidlists linger the following amount before being destroyed.  The goal
 * is avoiding frequent destruction in the middle of consecutive read calls.
 * Expiring in the middle is a performance problem not a correctness one.
 * 1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

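/*
 * Interface file names are "<subsys name>.<file name>"; the extra 2
 * bytes cover the '.' separator and the terminating NUL.
 */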
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)

/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.
 */
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#else
static DEFINE_MUTEX(cgroup_mutex);
static DEFINE_SPINLOCK(css_set_lock);
#endif

/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

/*
 * Protects cgroup_root->release_agent_path.  Modifying it also requires
 * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
 */
static DEFINE_SPINLOCK(release_agent_path_lock);

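/*
 * Taken for read around threadgroup changes (e.g. in the fork path) and
 * for write across process migration so that threadgroups stay stable
 * while tasks are being moved between cgroups.
 */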
struct percpu_rw_semaphore cgroup_threadgroup_rwsem;

#define cgroup_assert_mutex_or_rcu_locked()				\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			   !lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required");

/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as flush domain.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;

/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
static struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
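
/*
 * As an illustration (assuming CONFIG_CGROUP_SCHED is enabled), the
 * SUBSYS(cpu) entry in cgroup_subsys.h expands the array slot above to:
 *
 *	[cpu_cgrp_id] = &cpu_cgrp_subsys,
 */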

/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x)								\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);			\
	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);			\
	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * The default hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
 */
struct cgroup_root cgrp_dfl_root;
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
static bool cgrp_dfl_root_visible;

/* Controllers disabled in v1 by the "cgroup_no_v1=" command-line option */
static unsigned long cgroup_no_v1_mask;

/* some controllers are not supported in the default hierarchy */
static unsigned long cgrp_dfl_root_inhibit_ss_mask;

/* The list of hierarchy roots */

static LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to csses.  It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in the ascending serial
 * number order on the list.  Protected by cgroup_mutex.
 */
static u64 css_serial_nr_next = 1;

/*
 * These bitmask flags indicate whether tasks in the fork and exit paths have
 * fork/exit handlers to call. This avoids us having to do extra work in the
 * fork/exit path to check which subsystems have fork/exit callbacks.
 */
static unsigned long have_fork_callback __read_mostly;
static unsigned long have_exit_callback __read_mostly;
static unsigned long have_free_callback __read_mostly;

/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
	.count		= { .counter = 2, },
	.user_ns	= &init_user_ns,
	.ns.ops		= &cgroupns_operations,
	.ns.inum	= PROC_CGROUP_INIT_INO,
	.root_cset	= &init_css_set,
};

/* Ditto for the can_fork callback. */
static unsigned long have_canfork_callback __read_mostly;

static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];

static int rebind_subsystems(struct cgroup_root *dst_root,
			     unsigned long ss_mask);
static void css_task_iter_advance(struct css_task_iter *it);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
		      bool visible);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
			      struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);

/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * cgroup_subsys_enabled() can only be used with literal subsys names which
 * is fine for individual subsystems but unsuitable for cgroup core.  This
 * is a slower static_key_enabled() based test indexed by @ssid.
 */
static bool cgroup_ssid_enabled(int ssid)
{
	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
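
/*
 * Illustrative use, a sketch: core code that walks subsystems by index
 * can skip the disabled ones with:
 *
 *	for_each_subsys(ss, ssid)
 *		if (!cgroup_ssid_enabled(ssid))
 *			continue;
 */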

static bool cgroup_ssid_no_v1(int ssid)
{
	return cgroup_no_v1_mask & (1 << ssid);
}

/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy is the v2 interface of cgroup and this function
 * can be used to test whether a cgroup is on the default hierarchy for
 * cases where a subsystem should behave differently depending on the
 * interface version.
 *
 * The set of behaviors which change on the default hierarchy are still
 * being determined and the mount option is prefixed with __DEVEL__.
 *
 * List of changed behaviors:
 *
 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 *   and "name" are disallowed.
 *
 * - When mounting an existing superblock, mount options should match.
 *
 * - Remount is disallowed.
 *
 * - rename(2) is disallowed.
 *
 * - "tasks" is removed.  Everything should be at process granularity.  Use
 *   "cgroup.procs" instead.
 *
 * - "cgroup.procs" is not sorted.  pids will be unique unless they got
 *   recycled in between reads.
 *
 * - "release_agent" and "notify_on_release" are removed.  Replacement
 *   notification mechanism will be implemented.
 *
 * - "cgroup.clone_children" is removed.
 *
 * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 *   and its descendants contain no task; otherwise, 1.  The file also
 *   generates kernfs notification which can be monitored through poll and
 *   [di]notify when the value of the file changes.
 *
 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 *   take masks of ancestors with non-empty cpus/mems, instead of being
 *   moved to an ancestor.
 *
 * - cpuset: a task can be moved into an empty cpuset, and again it takes
 *   masks of ancestors.
 *
 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
 *   is not created.
 *
 * - blkcg: blk-throttle becomes properly hierarchical.
 *
 * - debug: disallowed on the default hierarchy.
 */
static bool cgroup_on_dfl(const struct cgroup *cgrp)
{
	return cgrp->root == &cgrp_dfl_root;
}

/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
			    gfp_t gfp_mask)
{
	int ret;

	idr_preload(gfp_mask);
	spin_lock_bh(&cgroup_idr_lock);
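	/*
	 * idr_preload() above has stocked the per-cpu preload cache, so
	 * mask off __GFP_DIRECT_RECLAIM to keep idr_alloc() from
	 * sleeping under the BH-disabled spinlock; the preloaded node
	 * backs the allocation instead.
	 */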
	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
	spin_unlock_bh(&cgroup_idr_lock);
	idr_preload_end();
	return ret;
}

static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
	void *ret;

	spin_lock_bh(&cgroup_idr_lock);
	ret = idr_replace(idr, ptr, id);
	spin_unlock_bh(&cgroup_idr_lock);
	return ret;
}

static void cgroup_idr_remove(struct idr *idr, int id)
{
	spin_lock_bh(&cgroup_idr_lock);
	idr_remove(idr, id);
	spin_unlock_bh(&cgroup_idr_lock);
}

static struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *parent_css = cgrp->self.parent;

	if (parent_css)
		return container_of(parent_css, struct cgroup, self);
	return NULL;
}

/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @ss enabled.
 */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	if (ss)
		return rcu_dereference_check(cgrp->subsys[ss->id],
					lockdep_is_held(&cgroup_mutex));
	else
		return &cgrp->self;
}

/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Similar to cgroup_css() but returns the effective css, which is defined
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 * function is guaranteed to return non-NULL css.
 */
static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
						struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	if (!ss)
		return &cgrp->self;

	if (!(cgrp->root->subsys_mask & (1 << ss->id)))
		return NULL;

	/*
	 * This function is used while updating css associations and thus
	 * can't test the csses directly.  Use ->child_subsys_mask.
	 */
	while (cgroup_parent(cgrp) &&
	       !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
		cgrp = cgroup_parent(cgrp);

	return cgroup_css(cgrp, ss);
}

/**
 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss.  The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 * the root css is returned, so this function always returns a valid css.
 * The returned css must be put using css_put().
 */
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
					     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	do {
		css = cgroup_css(cgrp, ss);

		if (css && css_tryget_online(css))
			goto out_unlock;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);

	css = init_css_set.subsys[ss->id];
	css_get(css);
out_unlock:
	rcu_read_unlock();
	return css;
}
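
/*
 * Illustrative use, a sketch assuming CONFIG_MEMCG: a caller can pin the
 * effective memory css of a cgroup and release it when done:
 *
 *	css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
 *	...
 *	css_put(css);
 */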

/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
	return !(cgrp->self.flags & CSS_ONLINE);
}

static void cgroup_get(struct cgroup *cgrp)
{
	WARN_ON_ONCE(cgroup_is_dead(cgrp));
	css_get(&cgrp->self);
}

static bool cgroup_tryget(struct cgroup *cgrp)
{
	return css_tryget(&cgrp->self);
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of_cft(of);

	/*
	 * This is an open and unprotected implementation of cgroup_css().
	 * seq_css() is only called from a kernfs file operation which has
	 * an active reference on the file.  Because all the subsystem
	 * files are drained before a css is disassociated with a cgroup,
	 * the matching css from the cgroup's subsys table is guaranteed to
	 * be and stay valid until the enclosing operation is complete.
	 */
	if (cft->ss)
		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
	else
		return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);

static int notify_on_release(const struct cgroup *cgrp)
{
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else
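
/*
 * Illustrative use, a sketch: destruction tears down every css on a
 * cgroup with this iterator (cf. cgroup_destroy_locked()):
 *
 *	for_each_css(css, ssid, cgrp)
 *		kill_css(css);
 */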

/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_[tree_]mutex.
 */
#define for_each_e_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
			;						\
		else

/**
 * for_each_subsys - iterate all enabled cgroup subsystems
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 */
#define for_each_subsys(ss, ssid)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)

/**
 * for_each_subsys_which - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_maskp: a pointer to the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * the bitmask pointed to by @ss_maskp is set to 1.
 */
#define for_each_subsys_which(ss, ssid, ss_maskp)			\
	if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */	\
		(ssid) = 0;						\
	else								\
		for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT)	\
			if (((ss) = cgroup_subsys[ssid]) && false)	\
				break;					\
			else
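
/*
 * Illustrative use, a sketch: the fork path can invoke only the
 * subsystems that registered a fork callback:
 *
 *	for_each_subsys_which(ss, ssid, &have_fork_callback)
 *		ss->fork(child);
 */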

/* iterate across the hierarchies */
#define for_each_root(root)						\
	list_for_each_entry((root), &cgroup_roots, root_list)

/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else

static void cgroup_release_agent(struct work_struct *work);
static void check_for_release(struct cgroup *cgrp);

/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 */
struct cgrp_cset_link {
	/* the cgroup and css_set this link associates */
	struct cgroup		*cgrp;
	struct css_set		*cset;

	/* list of cgrp_cset_links anchored at cgrp->cset_links */
	struct list_head	cset_link;

	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
	struct list_head	cgrp_link;
};

/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Its
 * base reference is never dropped, so it is never freed.
 */
struct css_set init_css_set = {
	.refcount		= ATOMIC_INIT(1),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
};

static int css_set_count	= 1;	/* 1 for init_css_set */

/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 */
static bool css_set_populated(struct css_set *cset)
{
	lockdep_assert_held(&css_set_lock);

	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}

/**
 * cgroup_update_populated - update populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
 * count is propagated towards root so that a given cgroup's populated_cnt
 * is zero iff the cgroup and all its descendants don't contain any tasks.
 *
 * @cgrp's interface file "cgroup.populated" is zero if
 * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
 * changes from or to zero, userland is notified that the content of the
 * interface file has changed.  This can be used to detect when @cgrp and
 * its descendants become populated or empty.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	lockdep_assert_held(&css_set_lock);

	do {
		bool trigger;

		if (populated)
			trigger = !cgrp->populated_cnt++;
		else
			trigger = !--cgrp->populated_cnt;

		if (!trigger)
			break;

		check_for_release(cgrp);
		cgroup_file_notify(&cgrp->events_file);

		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}

/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting the first task or losing the last.  Update the
 * ->populated_cnt of all associated cgroups accordingly.
 */
static void css_set_update_populated(struct css_set *cset, bool populated)
{
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
		cgroup_update_populated(link->cgrp, populated);
}

/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 * css_set, @from_cset can be NULL.  If @task is being disassociated
 * instead of moved, @to_cset can be NULL.
 *
 * This function automatically handles populated_cnt updates and
 * css_task_iter adjustments but the caller is responsible for managing
 * @from_cset and @to_cset's reference counts.
 */
static void css_set_move_task(struct task_struct *task,
			      struct css_set *from_cset, struct css_set *to_cset,
			      bool use_mg_tasks)
{
	lockdep_assert_held(&css_set_lock);

	if (from_cset) {
		struct css_task_iter *it, *pos;

		WARN_ON_ONCE(list_empty(&task->cg_list));

		/*
		 * @task is leaving, advance task iterators which are
		 * pointing to it so that they can resume at the next
		 * position.  Advancing an iterator might remove it from
		 * the list, use safe walk.  See css_task_iter_advance*()
		 * for details.
		 */
		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
					 iters_node)
			if (it->task_pos == &task->cg_list)
				css_task_iter_advance(it);

		list_del_init(&task->cg_list);
		if (!css_set_populated(from_cset))
			css_set_update_populated(from_cset, false);
	} else {
		WARN_ON_ONCE(!list_empty(&task->cg_list));
	}

	if (to_cset) {
		/*
		 * We are synchronized through cgroup_threadgroup_rwsem
		 * against PF_EXITING setting such that we can't race
		 * against cgroup_exit() changing the css_set to
		 * init_css_set and dropping the old one.
		 */
		WARN_ON_ONCE(task->flags & PF_EXITING);

		if (!css_set_populated(to_cset))
			css_set_update_populated(to_cset, true);
		rcu_assign_pointer(task->cgroups, to_cset);
		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
							     &to_cset->tasks);
	}
}

/*
 * hash table for css_sets. This speeds up finding an existing
 * css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
	unsigned long key = 0UL;
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		key += (unsigned long)css[i];
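	/*
	 * Summed pointer values are alignment-biased; the fold below
	 * mixes the high bits back into the low-order bits.
	 */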
	key = (key >> 16) ^ key;

	return key;
}

static void put_css_set_locked(struct css_set *cset)
{
	struct cgrp_cset_link *link, *tmp_link;
	struct cgroup_subsys *ss;
	int ssid;

	lockdep_assert_held(&css_set_lock);

	if (!atomic_dec_and_test(&cset->refcount))
		return;

	/* This css_set is dead. unlink it and release cgroup and css refs */
	for_each_subsys(ss, ssid) {
		list_del(&cset->e_cset_node[ssid]);
		css_put(cset->subsys[ssid]);
	}
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		if (cgroup_parent(link->cgrp))
			cgroup_put(link->cgrp);
		kfree(link);
	}

	kfree_rcu(cset, rcu_head);
}

static void put_css_set(struct css_set *cset)
{
	/*
	 * Ensure that the refcount doesn't hit zero while any readers
	 * can see it. Similar to atomic_dec_and_lock(), but for a
	 * spinlock.
	 */
	if (atomic_add_unless(&cset->refcount, -1, 1))
		return;

	spin_lock_bh(&css_set_lock);
	put_css_set_locked(cset);
	spin_unlock_bh(&css_set_lock);
}

/*
 * refcounted get/put for css_set objects
 */
static inline void get_css_set(struct css_set *cset)
{
	atomic_inc(&cset->refcount);
}

/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct list_head *l1, *l2;

	/*
	 * On the default hierarchy, there can be csets which are
	 * associated with the same set of cgroups but different csses.
	 * Let's first ensure that csses match.
	 */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies.  As different cgroups may
	 * share the same effective css, this comparison is always
	 * necessary.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}
		/* Locate the cgroups associated with these links. */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
		/* Hierarchies should be linked in the same order. */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}

/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
 */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
					struct cgroup *cgrp,
					struct cgroup_subsys_state *template[])
{
	struct cgroup_root *root = cgrp->root;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	unsigned long key;
	int i;

	/*
	 * Build the set of subsystem state objects that we want to see in the
	 * new css_set. while subsystems can change globally, the entries here
	 * won't change, so no need for locking.
	 */
	for_each_subsys(ss, i) {
		if (root->subsys_mask & (1UL << i)) {
			/*
			 * @ss is in this hierarchy, so we want the
			 * effective css from @cgrp.
			 */
			template[i] = cgroup_e_css(cgrp, ss);
		} else {
			/*
			 * @ss is not in this hierarchy, so we don't want
			 * to change the css.
			 */
			template[i] = old_cset->subsys[i];
		}
	}

	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
			continue;

		/* This css_set matches what we need */
		return cset;
	}

	/* No existing css_set matched */
	return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
	struct cgrp_cset_link *link, *tmp_link;

	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
		kfree(link);
	}
}

/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  Returns 0 on success or -errno.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;

	/*
	 * Always add links to the tail of the lists so that the lists are
	 * in chronological order.
	 */
	list_move_tail(&link->cset_link, &cgrp->cset_links);
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);

	if (cgroup_parent(cgrp))
		cgroup_get(cgrp);
}

/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
 */
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
{
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
	struct cgroup_subsys *ss;
	unsigned long key;
	int ssid;

	lockdep_assert_held(&cgroup_mutex);

	/* First see if we already have a css_set that matches
	 * the desired set */
	spin_lock_bh(&css_set_lock);
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
	spin_unlock_bh(&css_set_lock);

	if (cset)
		return cset;

	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
	if (!cset)
		return NULL;

	/* Allocate all the cgrp_cset_link objects that we'll need */
	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
		kfree(cset);
		return NULL;
	}

	atomic_set(&cset->refcount, 1);
	INIT_LIST_HEAD(&cset->cgrp_links);
	INIT_LIST_HEAD(&cset->tasks);
	INIT_LIST_HEAD(&cset->mg_tasks);
	INIT_LIST_HEAD(&cset->mg_preload_node);
	INIT_LIST_HEAD(&cset->mg_node);
	INIT_LIST_HEAD(&cset->task_iters);