/*
 *  Generic process-grouping system.
 *
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  ---------------------------------------------------
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cgroup.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/namei.h>

#include <asm/atomic.h>

static DEFINE_MUTEX(cgroup_mutex);

/* Generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) &_x ## _subsys,

static struct cgroup_subsys *subsys[] = {
#include <linux/cgroup_subsys.h>
};

/*
 * A cgroupfs_root represents the root of a cgroup hierarchy,
 * and may be associated with a superblock to form an active
 * hierarchy
 */
struct cgroupfs_root {
	struct super_block *sb;

	/*
	 * The bitmask of subsystems intended to be attached to this
	 * hierarchy
	 */
	unsigned long subsys_bits;

	/* The bitmask of subsystems currently attached to this hierarchy */
	unsigned long actual_subsys_bits;

	/* A list running through the attached subsystems */
	struct list_head subsys_list;

	/* The root cgroup for this hierarchy */
	struct cgroup top_cgroup;

	/* Tracks how many cgroups are currently defined in hierarchy.*/
	int number_of_cgroups;

	/* A list running through the mounted hierarchies */
	struct list_head root_list;

	/* Hierarchy-specific flags */
	unsigned long flags;

	/* The path to use for release notifications. */
	char release_agent_path[PATH_MAX];
};


/*
 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
 * subsystems that are otherwise unattached - it never has more than a
 * single cgroup, and all tasks are part of that cgroup.
 */
static struct cgroupfs_root rootnode;

/* The list of hierarchy roots */

static LIST_HEAD(roots);
static int root_count;

/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)

/* This flag indicates whether tasks in the fork and exit paths should
 * check for fork/exit handlers to call. This avoids us having to do
 * extra work in the fork/exit path if none of the subsystems need to
 * be called.
 */
static int need_forkexit_callback __read_mostly;
static int need_mm_owner_callback __read_mostly;

/* convenient tests for these bits */
inline int cgroup_is_removed(const struct cgroup *cgrp)
{
	return test_bit(CGRP_REMOVED, &cgrp->flags);
}

/* bits in struct cgroupfs_root flags field */
enum {
	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
};

static int cgroup_is_releasable(const struct cgroup *cgrp)
{
	const int bits =
		(1 << CGRP_RELEASABLE) |
		(1 << CGRP_NOTIFY_ON_RELEASE);
	return (cgrp->flags & bits) == bits;
}

static int notify_on_release(const struct cgroup *cgrp)
{
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}

/*
 * for_each_subsys() allows you to iterate on each subsystem attached to
 * an active hierarchy
 */
#define for_each_subsys(_root, _ss) \
list_for_each_entry(_ss, &_root->subsys_list, sibling)

/* for_each_root() allows you to iterate across the active hierarchies */
#define for_each_root(_root) \
list_for_each_entry(_root, &roots, root_list)

/* the list of cgroups eligible for automatic release. Protected by
 * release_list_lock */
static LIST_HEAD(release_list);
static DEFINE_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);

/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
	/*
	 * List running through cg_cgroup_links associated with a
	 * cgroup, anchored on cgroup->css_sets
	 */
	struct list_head cgrp_link_list;
	/*
	 * List running through cg_cgroup_links pointing at a
	 * single css_set object, anchored on css_set->cg_links
	 */
	struct list_head cg_link_list;
	struct css_set *cg;
};

/* The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */

static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;

/* css_set_lock protects the list of css_set objects, and the
 * chain of tasks off each css_set.  Nests outside task->alloc_lock
 * due to cgroup_iter_start() */
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;

/* hash table for cgroup groups. This improves the performance to
 * find an existing css_set */
#define CSS_SET_HASH_BITS	7
#define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];

static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
{
	int i;
	int index;
	unsigned long tmp = 0UL;

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
		tmp += (unsigned long)css[i];
	tmp = (tmp >> 16) ^ tmp;

	index = hash_long(tmp, CSS_SET_HASH_BITS);

	return &css_set_table[index];
}

/* We don't maintain the lists running through each css_set to its
 * task until after the first call to cgroup_iter_start(). This
 * reduces the fork()/exit() overhead for people who have cgroups
 * compiled into their kernel but not actually in use */
static int use_task_css_set_links __read_mostly;

/* When we create or destroy a css_set, the operation simply
 * takes/releases a reference count on all the cgroups referenced
 * by subsystems in this css_set. This can end up multiple-counting
 * some cgroups, but that's OK - the ref-count is just a
 * busy/not-busy indicator; ensuring that we only count each cgroup
 * once would require taking a global lock to ensure that no
 * subsystems moved between hierarchies while we were doing so.
 *
 * Possible TODO: decide at boot time based on the number of
 * registered subsystems and the number of CPUs or NUMA nodes whether
 * it's better for performance to ref-count every subsystem, or to
 * take a global lock and only add one ref count to each hierarchy.
 */

/*
 * unlink a css_set from the list and free it
 */
static void unlink_css_set(struct css_set *cg)
{
	struct cg_cgroup_link *link;
	struct cg_cgroup_link *saved_link;

	hlist_del(&cg->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
				 cg_link_list) {
		list_del(&link->cg_link_list);
		list_del(&link->cgrp_link_list);
		kfree(link);
	}
}

static void __put_css_set(struct css_set *cg, int taskexit)
{
	int i;
	/*
	 * Ensure that the refcount doesn't hit zero while any readers
	 * can see it. Similar to atomic_dec_and_lock(), but for an
	 * rwlock
	 */
	if (atomic_add_unless(&cg->refcount, -1, 1))
		return;
	write_lock(&css_set_lock);
	if (!atomic_dec_and_test(&cg->refcount)) {
		write_unlock(&css_set_lock);
		return;
	}
	unlink_css_set(cg);
	write_unlock(&css_set_lock);

	rcu_read_lock();
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup *cgrp = cg->subsys[i]->cgroup;
		if (atomic_dec_and_test(&cgrp->count) &&
		    notify_on_release(cgrp)) {
			if (taskexit)
				set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}
	}
	rcu_read_unlock();
	kfree(cg);
}

/*
 * refcounted get/put for css_set objects
 */
static inline void get_css_set(struct css_set *cg)
{
	atomic_inc(&cg->refcount);
}

static inline void put_css_set(struct css_set *cg)
{
	__put_css_set(cg, 0);
}

static inline void put_css_set_taskexit(struct css_set *cg)
{
	__put_css_set(cg, 1);
}
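
/*
 * Illustrative sketch (not part of the original cgroup.c): the typical
 * pattern for pinning a task's css_set across a blocking region using
 * the get/put helpers above. The helper name is hypothetical; per the
 * locking rules documented further down, tsk->cgroups is dereferenced
 * under task_lock().
 */
static inline struct css_set *get_task_css_set_sketch(struct task_struct *tsk)
{
	struct css_set *cg;

	task_lock(tsk);
	cg = tsk->cgroups;
	get_css_set(cg);	/* caller pairs this with put_css_set(cg) */
	task_unlock(tsk);
	return cg;
}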

/*
 * find_existing_css_set() is a helper for
 * find_css_set(), and checks to see whether an existing
 * css_set is suitable.
 *
 * oldcg: the cgroup group that we're using before the cgroup
 * transition
 *
 * cgrp: the cgroup that we're moving into
 *
 * template: location in which to build the desired set of subsystem
 * state objects for the new cgroup group
 */
static struct css_set *find_existing_css_set(
	struct css_set *oldcg,
	struct cgroup *cgrp,
	struct cgroup_subsys_state *template[])
{
	int i;
	struct cgroupfs_root *root = cgrp->root;
	struct hlist_head *hhead;
	struct hlist_node *node;
	struct css_set *cg;

	/* Build the set of subsystem state objects that we want to
	 * see in the new css_set */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		if (root->subsys_bits & (1UL << i)) {
			/* Subsystem is in this hierarchy. So we want
			 * the subsystem state from the new
			 * cgroup */
			template[i] = cgrp->subsys[i];
		} else {
			/* Subsystem is not in this hierarchy, so we
			 * don't want to change the subsystem state */
			template[i] = oldcg->subsys[i];
		}
	}

	hhead = css_set_hash(template);
	hlist_for_each_entry(cg, node, hhead, hlist) {
		if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
			/* All subsystems matched */
			return cg;
		}
	}

	/* No existing cgroup group matched */
	return NULL;
}

static void free_cg_links(struct list_head *tmp)
{
	struct cg_cgroup_link *link;
	struct cg_cgroup_link *saved_link;

	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
		list_del(&link->cgrp_link_list);
		kfree(link);
	}
}

/*
 * allocate_cg_links() allocates "count" cg_cgroup_link structures
 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
 * success or a negative error
 */
static int allocate_cg_links(int count, struct list_head *tmp)
{
	struct cg_cgroup_link *link;
	int i;
	INIT_LIST_HEAD(tmp);
	for (i = 0; i < count; i++) {
		link = kmalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cg_links(tmp);
			return -ENOMEM;
		}
		list_add(&link->cgrp_link_list, tmp);
	}
	return 0;
}

/*
 * find_css_set() takes an existing cgroup group and a
 * cgroup object, and returns a css_set object that's
 * equivalent to the old group, but with the given cgroup
 * substituted into the appropriate hierarchy. Must be called with
 * cgroup_mutex held
 */
static struct css_set *find_css_set(
	struct css_set *oldcg, struct cgroup *cgrp)
{
	struct css_set *res;
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
	int i;

	struct list_head tmp_cg_links;
	struct cg_cgroup_link *link;

	struct hlist_head *hhead;

	/* First see if we already have a cgroup group that matches
	 * the desired set */
	read_lock(&css_set_lock);
	res = find_existing_css_set(oldcg, cgrp, template);
	if (res)
		get_css_set(res);
	read_unlock(&css_set_lock);

	if (res)
		return res;

	res = kmalloc(sizeof(*res), GFP_KERNEL);
	if (!res)
		return NULL;

	/* Allocate all the cg_cgroup_link objects that we'll need */
	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
		kfree(res);
		return NULL;
	}

	atomic_set(&res->refcount, 1);
	INIT_LIST_HEAD(&res->cg_links);
	INIT_LIST_HEAD(&res->tasks);
	INIT_HLIST_NODE(&res->hlist);

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
	memcpy(res->subsys, template, sizeof(res->subsys));

	write_lock(&css_set_lock);
	/* Add reference counts and links from the new css_set. */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup *cgrp = res->subsys[i]->cgroup;
		struct cgroup_subsys *ss = subsys[i];
		atomic_inc(&cgrp->count);
		/*
		 * We want to add a link once per cgroup, so we
		 * only do it for the first subsystem in each
		 * hierarchy
		 */
		if (ss->root->subsys_list.next == &ss->sibling) {
			BUG_ON(list_empty(&tmp_cg_links));
			link = list_entry(tmp_cg_links.next,
					  struct cg_cgroup_link,
					  cgrp_link_list);
			list_del(&link->cgrp_link_list);
			list_add(&link->cgrp_link_list, &cgrp->css_sets);
			link->cg = res;
			list_add(&link->cg_link_list, &res->cg_links);
		}
	}
	if (list_empty(&rootnode.subsys_list)) {
		link = list_entry(tmp_cg_links.next,
				  struct cg_cgroup_link,
				  cgrp_link_list);
		list_del(&link->cgrp_link_list);
		list_add(&link->cgrp_link_list, &dummytop->css_sets);
		link->cg = res;
		list_add(&link->cg_link_list, &res->cg_links);
	}

	BUG_ON(!list_empty(&tmp_cg_links));

	css_set_count++;

	/* Add this cgroup group to the hash table */
	hhead = css_set_hash(res->subsys);
	hlist_add_head(&res->hlist, hhead);

	write_unlock(&css_set_lock);

	return res;
}

/*
 * There is one global cgroup mutex. We also require taking
 * task_lock() when dereferencing a task's cgroup subsys pointers.
 * See "The task_lock() exception", at the end of this comment.
 *
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing.  However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again.  Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count).  So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero. Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
 * (usually) take cgroup_mutex.  These are the two most performance
 * critical pieces of code here.  The exception occurs on cgroup_exit(),
 * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
 * is taken, and if the cgroup count is zero, a usermode call made
 * to the release agent with the name of the cgroup (path relative to
 * the root of cgroup file system) as the argument.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty.  Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), therefore, top_cgroup
 * always has either children cgroups and/or using tasks.  So we don't
 * need a special hack to ensure that top_cgroup cannot be deleted.
 *
 *	The task_lock() exception
 *
 * The need for this exception arises from the action of
 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
 * another.  It does so using cgroup_mutex, however there are
 * several performance critical places that need to reference
 * task->cgroup without the expense of grabbing a system global
 * mutex.  Therefore except as noted below, when dereferencing or, as
 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
 * the task_struct routinely used for such matters.
 *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task()
 */

/**
 * cgroup_lock - lock out any changes to cgroup structures
 *
 */
void cgroup_lock(void)
{
	mutex_lock(&cgroup_mutex);
}

/**
 * cgroup_unlock - release lock on cgroup changes
 *
 * Undo the lock taken in a previous cgroup_lock() call.
 */
void cgroup_unlock(void)
{
	mutex_unlock(&cgroup_mutex);
}

/*
 * A couple of forward declarations required, due to cyclic reference loop:
 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
 * -> cgroup_mkdir.
 */

static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp);
static struct inode_operations cgroup_dir_inode_operations;
static struct file_operations proc_cgroupstats_operations;

static struct backing_dev_info cgroup_backing_dev_info = {
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
};

static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	if (inode) {
		inode->i_mode = mode;
		inode->i_uid = current->fsuid;
		inode->i_gid = current->fsgid;
		inode->i_blocks = 0;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
	}
	return inode;
}

/*
 * Call subsys's pre_destroy handler.
 * This is called before css refcnt check.
 */
static void cgroup_call_pre_destroy(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	for_each_subsys(cgrp->root, ss)
		if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
			ss->pre_destroy(ss, cgrp);
	return;
}

static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
	/* is dentry a directory ? if so, kfree() associated cgroup */
	if (S_ISDIR(inode->i_mode)) {
		struct cgroup *cgrp = dentry->d_fsdata;
		struct cgroup_subsys *ss;
		BUG_ON(!(cgroup_is_removed(cgrp)));
		/* It's possible for external users to be holding css
		 * reference counts on a cgroup; css_put() needs to
		 * be able to access the cgroup after decrementing
		 * the reference count in order to know if it needs to
		 * queue the cgroup to be handled by the release
		 * agent */
		synchronize_rcu();

		mutex_lock(&cgroup_mutex);
		/*
		 * Release the subsystem state objects.
		 */
		for_each_subsys(cgrp->root, ss) {
			if (cgrp->subsys[ss->subsys_id])
				ss->destroy(ss, cgrp);
		}

		cgrp->root->number_of_cgroups--;
		mutex_unlock(&cgroup_mutex);

		/* Drop the active superblock reference that we took when we
		 * created the cgroup */
		deactivate_super(cgrp->root->sb);

		kfree(cgrp);
	}
	iput(inode);
}

static void remove_dir(struct dentry *d)
{
	struct dentry *parent = dget(d->d_parent);

	d_delete(d);
	simple_rmdir(parent->d_inode, d);
	dput(parent);
}

static void cgroup_clear_directory(struct dentry *dentry)
{
	struct list_head *node;

	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
	spin_lock(&dcache_lock);
	node = dentry->d_subdirs.next;
	while (node != &dentry->d_subdirs) {
		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
		list_del_init(node);
		if (d->d_inode) {
			/* This should never be called on a cgroup
			 * directory with child cgroups */
			BUG_ON(d->d_inode->i_mode & S_IFDIR);
			d = dget_locked(d);
			spin_unlock(&dcache_lock);
			d_delete(d);
			simple_unlink(dentry->d_inode, d);
			dput(d);
			spin_lock(&dcache_lock);
		}
		node = dentry->d_subdirs.next;
	}
	spin_unlock(&dcache_lock);
}

/*
 * NOTE : the dentry must have been dget()'ed
 */
static void cgroup_d_remove_dir(struct dentry *dentry)
{
	cgroup_clear_directory(dentry);

	spin_lock(&dcache_lock);
	list_del_init(&dentry->d_u.d_child);
	spin_unlock(&dcache_lock);
	remove_dir(dentry);
}

static int rebind_subsystems(struct cgroupfs_root *root,
			      unsigned long final_bits)
{
	unsigned long added_bits, removed_bits;
	struct cgroup *cgrp = &root->top_cgroup;
	int i;

	removed_bits = root->actual_subsys_bits & ~final_bits;
	added_bits = final_bits & ~root->actual_subsys_bits;
	/* Check that any added subsystems are currently free */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		unsigned long bit = 1UL << i;
		struct cgroup_subsys *ss = subsys[i];
		if (!(bit & added_bits))
			continue;
		if (ss->root != &rootnode) {
			/* Subsystem isn't free */
			return -EBUSY;
		}
	}

	/* Currently we don't handle adding/removing subsystems when
	 * any child cgroups exist. This is theoretically supportable
	 * but involves complex error handling, so it's being left until
	 * later */
	if (!list_empty(&cgrp->children))
		return -EBUSY;

	/* Process each subsystem */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		unsigned long bit = 1UL << i;
		if (bit & added_bits) {
			/* We're binding this subsystem to this hierarchy */
			BUG_ON(cgrp->subsys[i]);
			BUG_ON(!dummytop->subsys[i]);
			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
			cgrp->subsys[i] = dummytop->subsys[i];
			cgrp->subsys[i]->cgroup = cgrp;
			list_add(&ss->sibling, &root->subsys_list);
			rcu_assign_pointer(ss->root, root);
			if (ss->bind)
				ss->bind(ss, cgrp);

		} else if (bit & removed_bits) {
			/* We're removing this subsystem */
			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
			if (ss->bind)
				ss->bind(ss, dummytop);
			dummytop->subsys[i]->cgroup = dummytop;
			cgrp->subsys[i] = NULL;
			rcu_assign_pointer(subsys[i]->root, &rootnode);
			list_del(&ss->sibling);
		} else if (bit & final_bits) {
			/* Subsystem state should already exist */
			BUG_ON(!cgrp->subsys[i]);
		} else {
			/* Subsystem state shouldn't exist */
			BUG_ON(cgrp->subsys[i]);
		}
	}
	root->subsys_bits = root->actual_subsys_bits = final_bits;
	synchronize_rcu();

	return 0;
}

static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
{
	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
	struct cgroup_subsys *ss;

	mutex_lock(&cgroup_mutex);
	for_each_subsys(root, ss)
		seq_printf(seq, ",%s", ss->name);
	if (test_bit(ROOT_NOPREFIX, &root->flags))
		seq_puts(seq, ",noprefix");
	if (strlen(root->release_agent_path))
		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
	mutex_unlock(&cgroup_mutex);
	return 0;
}

struct cgroup_sb_opts {
	unsigned long subsys_bits;
	unsigned long flags;
	char *release_agent;
};

/* Convert a hierarchy specifier into a bitmask of subsystems and
 * flags. */
static int parse_cgroupfs_options(char *data,
				     struct cgroup_sb_opts *opts)
{
	char *token, *o = data ?: "all";

	opts->subsys_bits = 0;
	opts->flags = 0;
	opts->release_agent = NULL;

	while ((token = strsep(&o, ",")) != NULL) {
		if (!*token)
			return -EINVAL;
		if (!strcmp(token, "all")) {
			/* Add all non-disabled subsystems */
			int i;
			opts->subsys_bits = 0;
			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				struct cgroup_subsys *ss = subsys[i];
				if (!ss->disabled)
					opts->subsys_bits |= 1ul << i;
			}
		} else if (!strcmp(token, "noprefix")) {
			set_bit(ROOT_NOPREFIX, &opts->flags);
		} else if (!strncmp(token, "release_agent=", 14)) {
			/* Specifying two release agents is forbidden */
			if (opts->release_agent)
				return -EINVAL;
			opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
			if (!opts->release_agent)
				return -ENOMEM;
			strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
			opts->release_agent[PATH_MAX - 1] = 0;
		} else {
			struct cgroup_subsys *ss;
			int i;
			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				ss = subsys[i];
				if (!strcmp(token, ss->name)) {
					if (!ss->disabled)
						set_bit(i, &opts->subsys_bits);
					break;
				}
			}
			if (i == CGROUP_SUBSYS_COUNT)
				return -ENOENT;
		}
	}

	/* We can't have an empty hierarchy */
	if (!opts->subsys_bits)
		return -EINVAL;

	return 0;
}
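
/*
 * Illustrative sketch (not part of the original cgroup.c): how a mount
 * option string is turned into a cgroup_sb_opts, mirroring what
 * cgroup_get_sb() and cgroup_remount() do below. The helper name and
 * the option string are hypothetical; "cpuset" and "memory" only
 * resolve if those subsystems are compiled in and not disabled.
 */
static int __maybe_unused parse_options_sketch(void)
{
	char buf[] = "cpuset,memory,noprefix,release_agent=/sbin/cgroup_release";
	struct cgroup_sb_opts opts;
	int ret;

	ret = parse_cgroupfs_options(buf, &opts);
	if (ret)
		return ret;
	/*
	 * On success opts.subsys_bits has one bit set per named
	 * subsystem, ROOT_NOPREFIX is set in opts.flags, and
	 * opts.release_agent holds a kzalloc()'d copy of the agent
	 * path that the caller must kfree().
	 */
	kfree(opts.release_agent);
	return 0;
}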

static int cgroup_remount(struct super_block *sb, int *flags, char *data)
{
	int ret = 0;
	struct cgroupfs_root *root = sb->s_fs_info;
	struct cgroup *cgrp = &root->top_cgroup;
	struct cgroup_sb_opts opts;

	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
	mutex_lock(&cgroup_mutex);

	/* See what subsystems are wanted */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	/* Don't allow flags to change at remount */
	if (opts.flags != root->flags) {
		ret = -EINVAL;
		goto out_unlock;
	}

	ret = rebind_subsystems(root, opts.subsys_bits);

	/* (re)populate subsystem files */
	if (!ret)
		cgroup_populate_dir(cgrp);

	if (opts.release_agent)
		strcpy(root->release_agent_path, opts.release_agent);
 out_unlock:
	if (opts.release_agent)
		kfree(opts.release_agent);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
	return ret;
}

static struct super_operations cgroup_ops = {
	.statfs = simple_statfs,
	.drop_inode = generic_delete_inode,
	.show_options = cgroup_show_options,
	.remount_fs = cgroup_remount,
};

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
	INIT_LIST_HEAD(&cgrp->sibling);
	INIT_LIST_HEAD(&cgrp->children);
	INIT_LIST_HEAD(&cgrp->css_sets);
	INIT_LIST_HEAD(&cgrp->release_list);
	init_rwsem(&cgrp->pids_mutex);
}

static void init_cgroup_root(struct cgroupfs_root *root)
{
	struct cgroup *cgrp = &root->top_cgroup;
	INIT_LIST_HEAD(&root->subsys_list);
	INIT_LIST_HEAD(&root->root_list);
	root->number_of_cgroups = 1;
	cgrp->root = root;
	cgrp->top_cgroup = cgrp;
	init_cgroup_housekeeping(cgrp);
}

static int cgroup_test_super(struct super_block *sb, void *data)
{
	struct cgroupfs_root *new = data;
	struct cgroupfs_root *root = sb->s_fs_info;

	/* First check subsystems */
	if (new->subsys_bits != root->subsys_bits)
	    return 0;

	/* Next check flags */
	if (new->flags != root->flags)
		return 0;

	return 1;
}

static int cgroup_set_super(struct super_block *sb, void *data)
{
	int ret;
	struct cgroupfs_root *root = data;

	ret = set_anon_super(sb, NULL);
	if (ret)
		return ret;

	sb->s_fs_info = root;
	root->sb = sb;

	sb->s_blocksize = PAGE_CACHE_SIZE;
	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
	sb->s_magic = CGROUP_SUPER_MAGIC;
	sb->s_op = &cgroup_ops;

	return 0;
}

static int cgroup_get_rootdir(struct super_block *sb)
{
	struct inode *inode =
		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
	struct dentry *dentry;

	if (!inode)
		return -ENOMEM;

	inode->i_fop = &simple_dir_operations;
	inode->i_op = &cgroup_dir_inode_operations;
	/* directories start off with i_nlink == 2 (for "." entry) */
	inc_nlink(inode);
	dentry = d_alloc_root(inode);
	if (!dentry) {
		iput(inode);
		return -ENOMEM;
	}
	sb->s_root = dentry;
	return 0;
}

static int cgroup_get_sb(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data, struct vfsmount *mnt)
{
	struct cgroup_sb_opts opts;
	int ret = 0;
	struct super_block *sb;
	struct cgroupfs_root *root;
	struct list_head tmp_cg_links;

	/* First find the desired set of subsystems */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret) {
		if (opts.release_agent)
			kfree(opts.release_agent);
		return ret;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root) {
		if (opts.release_agent)
			kfree(opts.release_agent);
		return -ENOMEM;
	}

	init_cgroup_root(root);
	root->subsys_bits = opts.subsys_bits;
	root->flags = opts.flags;
	if (opts.release_agent) {
		strcpy(root->release_agent_path, opts.release_agent);
		kfree(opts.release_agent);
	}

	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);

	if (IS_ERR(sb)) {
		kfree(root);
		return PTR_ERR(sb);
	}

	if (sb->s_fs_info != root) {
		/* Reusing an existing superblock */
		BUG_ON(sb->s_root == NULL);
		kfree(root);
		root = NULL;
	} else {
		/* New superblock */
		struct cgroup *cgrp = &root->top_cgroup;
		struct inode *inode;
		int i;

		BUG_ON(sb->s_root != NULL);

		ret = cgroup_get_rootdir(sb);
		if (ret)
			goto drop_new_super;
		inode = sb->s_root->d_inode;

		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);

		/*
		 * We're accessing css_set_count without locking
		 * css_set_lock here, but that's OK - it can only be
		 * increased by someone holding cgroup_lock, and
		 * that's us. The worst that can happen is that we
		 * have some link structures left over
		 */
		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
		if (ret) {
			mutex_unlock(&cgroup_mutex);
			mutex_unlock(&inode->i_mutex);
			goto drop_new_super;
		}

		ret = rebind_subsystems(root, root->subsys_bits);
		if (ret == -EBUSY) {
			mutex_unlock(&cgroup_mutex);
			mutex_unlock(&inode->i_mutex);
			goto drop_new_super;
		}

		/* EBUSY should be the only error here */
		BUG_ON(ret);

		list_add(&root->root_list, &roots);
		root_count++;

		sb->s_root->d_fsdata = &root->top_cgroup;
		root->top_cgroup.dentry = sb->s_root;

		/* Link the top cgroup in this hierarchy into all
		 * the css_set objects */
		write_lock(&css_set_lock);
		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
			struct hlist_head *hhead = &css_set_table[i];
			struct hlist_node *node;
			struct css_set *cg;

			hlist_for_each_entry(cg, node, hhead, hlist) {
				struct cg_cgroup_link *link;

				BUG_ON(list_empty(&tmp_cg_links));
				link = list_entry(tmp_cg_links.next,
						  struct cg_cgroup_link,
						  cgrp_link_list);
				list_del(&link->cgrp_link_list);
				link->cg = cg;
				list_add(&link->cgrp_link_list,
					 &root->top_cgroup.css_sets);
				list_add(&link->cg_link_list, &cg->cg_links);
			}
		}
		write_unlock(&css_set_lock);

		free_cg_links(&tmp_cg_links);

		BUG_ON(!list_empty(&cgrp->sibling));
		BUG_ON(!list_empty(&cgrp->children));
		BUG_ON(root->number_of_cgroups != 1);

		cgroup_populate_dir(cgrp);
		mutex_unlock(&inode->i_mutex);
		mutex_unlock(&cgroup_mutex);
	}

	return simple_set_mnt(mnt, sb);

 drop_new_super:
	up_write(&sb->s_umount);
	deactivate_super(sb);
	free_cg_links(&tmp_cg_links);
	return ret;
}

static void cgroup_kill_sb(struct super_block *sb) {
	struct cgroupfs_root *root = sb->s_fs_info;
	struct cgroup *cgrp = &root->top_cgroup;
	int ret;
	struct cg_cgroup_link *link;
	struct cg_cgroup_link *saved_link;

	BUG_ON(!root);

	BUG_ON(root->number_of_cgroups != 1);
	BUG_ON(!list_empty(&cgrp->children));
	BUG_ON(!list_empty(&cgrp->sibling));

	mutex_lock(&cgroup_mutex);

	/* Rebind all subsystems back to the default hierarchy */
	ret = rebind_subsystems(root, 0);
	/* Shouldn't be able to fail ... */
	BUG_ON(ret);

	/*
	 * Release all the links from css_sets to this hierarchy's
	 * root cgroup
	 */
	write_lock(&css_set_lock);

	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
				 cgrp_link_list) {
		list_del(&link->cg_link_list);
		list_del(&link->cgrp_link_list);
		kfree(link);
	}
	write_unlock(&css_set_lock);

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		root_count--;
	}
	mutex_unlock(&cgroup_mutex);

	kfree(root);
	kill_litter_super(sb);
}

static struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
	.get_sb = cgroup_get_sb,
	.kill_sb = cgroup_kill_sb,
};

static inline struct cgroup *__d_cgrp(struct dentry *dentry)
{
	return dentry->d_fsdata;
}

static inline struct cftype *__d_cft(struct dentry *dentry)
{
	return dentry->d_fsdata;
}

/**
 * cgroup_path - generate the path of a cgroup
 * @cgrp: the cgroup in question
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Called with cgroup_mutex held. Writes path of cgroup into buf.
 * Returns 0 on success, -errno on error.
 */
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
	char *start;

	if (cgrp == dummytop) {
		/*
		 * Inactive subsystems have no dentry for their root
		 * cgroup
		 */
		strcpy(buf, "/");
		return 0;
	}

	start = buf + buflen;

	*--start = '\0';
	for (;;) {
		int len = cgrp->dentry->d_name.len;
		if ((start -= len) < buf)
			return -ENAMETOOLONG;
		memcpy(start, cgrp->dentry->d_name.name, len);
		cgrp = cgrp->parent;
		if (!cgrp)
			break;
		if (!cgrp->parent)
			continue;
		if (--start < buf)
			return -ENAMETOOLONG;
		*start = '/';
	}
	memmove(buf, start, buf + buflen - start);
	return 0;
}
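
/*
 * Illustrative sketch (not part of the original cgroup.c): a minimal
 * cgroup_path() caller. The function name and the printk are
 * hypothetical; the pattern (allocate a buffer, take cgroup_mutex via
 * cgroup_lock(), build the path, drop the mutex) follows the rule
 * documented above that cgroup_path() is called with cgroup_mutex held.
 */
static void __maybe_unused cgroup_print_path_sketch(struct cgroup *cgrp)
{
	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

	if (!buf)
		return;
	cgroup_lock();
	if (!cgroup_path(cgrp, buf, PAGE_SIZE))
		printk(KERN_DEBUG "cgroup path: %s\n", buf);
	cgroup_unlock();
	kfree(buf);
}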

/*
 * Return the first subsystem attached to a cgroup's hierarchy, and
 * its subsystem id.
 */

static void get_first_subsys(const struct cgroup *cgrp,
			struct cgroup_subsys_state **css, int *subsys_id)
{
	const struct cgroupfs_root *root = cgrp->root;
	const struct cgroup_subsys *test_ss;
	BUG_ON(list_empty(&root->subsys_list));
	test_ss = list_entry(root->subsys_list.next,
			     struct cgroup_subsys, sibling);
	if (css) {
		*css = cgrp->subsys[test_ss->subsys_id];
		BUG_ON(!*css);
	}
	if (subsys_id)
		*subsys_id = test_ss->subsys_id;
}

/**
 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
 * @cgrp: the cgroup the task is attaching to
 * @tsk: the task to be attached
 *
 * Call holding cgroup_mutex. May take task_lock of
 * the task 'tsk' during call.
 */
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	int retval = 0;
	struct cgroup_subsys *ss;
	struct cgroup *oldcgrp;
	struct css_set *cg = tsk->cgroups;
	struct css_set *newcg;
	struct cgroupfs_root *root = cgrp->root;
	int subsys_id;

	get_first_subsys(cgrp, NULL, &subsys_id);

	/* Nothing to do if the task is already in that cgroup */
	oldcgrp = task_cgroup(tsk, subsys_id);
	if (cgrp == oldcgrp)
		return 0;

	for_each_subsys(root, ss) {
		if (ss->can_attach) {
			retval = ss->can_attach(ss, cgrp, tsk);
			if (retval)
				return retval;
		}
	}

	/*
	 * Locate or allocate a new css_set for this task,
	 * based on its final set of cgroups
	 */
	newcg = find_css_set(cg, cgrp);
	if (!newcg)
		return -ENOMEM;

	task_lock(tsk);
	if (tsk->flags & PF_EXITING) {
		task_unlock(tsk);
		put_css_set(newcg);
		return -ESRCH;
	}
	rcu_assign_pointer(tsk->cgroups, newcg);
	task_unlock(tsk);

	/* Update the css_set linked lists if we're using them */
	write_lock(&css_set_lock);
	if (!list_empty(&tsk->cg_list)) {
		list_del(&tsk->cg_list);
		list_add(&tsk->cg_list, &newcg->tasks);
	}
	write_unlock(&css_set_lock);

	for_each_subsys(root, ss) {
		if (ss->attach)
			ss->attach(ss, cgrp, oldcgrp, tsk);
	}
	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
	synchronize_rcu();
	put_css_set(cg);
	return 0;
}

/*
 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
 * held. May take task_lock of task
 */
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
{
	struct task_struct *tsk;
	int ret;

	if (pid) {
		rcu_read_lock();
		tsk = find_task_by_vpid(pid);
		if (!tsk || tsk->flags & PF_EXITING) {
			rcu_read_unlock();
			return -ESRCH;
		}
		get_task_struct(tsk);
		rcu_read_unlock();

		if ((current->euid) && (current->euid != tsk->uid)
		    && (current->euid != tsk->suid)) {
			put_task_struct(tsk);
			return -EACCES;
		}
	} else {
		tsk = current;
		get_task_struct(tsk);
	}

	ret = cgroup_attach_task(cgrp, tsk);
	put_task_struct(tsk);
	return ret;
}

static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
	int ret;
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	ret = attach_task_by_pid(cgrp, pid);
	cgroup_unlock();
	return ret;
}

/* The various types of files and directories in a cgroup file system */
enum cgroup_filetype {
	FILE_ROOT,
	FILE_DIR,
	FILE_TASKLIST,
	FILE_NOTIFY_ON_RELEASE,
	FILE_RELEASE_AGENT,
};

/**
 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
 * @cgrp: the cgroup to be checked for liveness
 *
 * On success, returns true; the lock should be later released with
 * cgroup_unlock(). On failure returns false with no lock held.
 */
bool cgroup_lock_live_group(struct cgroup *cgrp)
{
	mutex_lock(&cgroup_mutex);
	if (cgroup_is_removed(cgrp)) {
		mutex_unlock(&cgroup_mutex);
		return false;
	}
	return true;
}

static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
				      const char *buffer)
{
	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	strcpy(cgrp->root->release_agent_path, buffer);
	cgroup_unlock();
	return 0;
}

static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
				     struct seq_file *seq)
{
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	seq_puts(seq, cgrp->root->release_agent_path);
	seq_putc(seq, '\n');
	cgroup_unlock();
	return 0;
}

/* A buffer size big enough for numbers or short strings */
#define CGROUP_LOCAL_BUFFER_SIZE 64

static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
				struct file *file,
				const char __user *userbuf,
				size_t nbytes, loff_t *unused_ppos)
{
	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
	int retval = 0;
	char *end;

	if (!nbytes)
		return -EINVAL;
	if (nbytes >= sizeof(buffer))
		return -E2BIG;
	if (copy_from_user(buffer, userbuf, nbytes))
		return -EFAULT;

	buffer[nbytes] = 0;     /* nul-terminate */
	strstrip(buffer);
	if (cft->write_u64) {
		u64 val = simple_strtoull(buffer, &end, 0);
		if (*end)
			return -EINVAL;
		retval = cft->write_u64(cgrp, cft, val);
	} else {
		s64 val = simple_strtoll(buffer, &end, 0);
		if (*end)
			return -EINVAL;
		retval = cft->write_s64(cgrp, cft, val);
	}
	if (!retval)
		retval = nbytes;
	return retval;
}

static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
				   struct file *file,
				   const char __user *userbuf,
				   size_t nbytes, loff_t *unused_ppos)
{
	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
	int retval = 0;
	size_t max_bytes = cft->max_write_len;
	char *buffer = local_buffer;

	if (!max_bytes)
		max_bytes = sizeof(local_buffer) - 1;
	if (nbytes >= max_bytes)
		return -E2BIG;
	/* Allocate a dynamic buffer if we need one */
	if (nbytes >= sizeof(local_buffer)) {
		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
		if (buffer == NULL)
			return -ENOMEM;
	}
	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
		retval = -EFAULT;
		goto out;
	}

	buffer[nbytes] = 0;     /* nul-terminate */
	strstrip(buffer);
	retval = cft->write_string(cgrp, cft, buffer);
	if (!retval)
		retval = nbytes;
out:
	if (buffer != local_buffer)
		kfree(buffer);
	return retval;
}