/*
 * device_cgroup.c - device cgroup subsystem
 *
 * Copyright 2007 IBM Corp
 */

#include <linux/device_cgroup.h>
#include <linux/cgroup.h>
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>

#define ACC_MKNOD 1
#define ACC_READ  2
#define ACC_WRITE 4
#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE)

#define DEV_BLOCK 1
#define DEV_CHAR  2
#define DEV_ALL   4  /* this represents all devices */

static DEFINE_MUTEX(devcgroup_mutex);

enum devcg_behavior {
	DEVCG_DEFAULT_NONE,	/* not initialized yet (cgroup not online) */
	DEVCG_DEFAULT_ALLOW,	/* allow by default, exceptions deny */
	DEVCG_DEFAULT_DENY,	/* deny by default, exceptions allow */
};

/*
 * exception list locking rules:
 * hold devcgroup_mutex for update/read.
 * hold rcu_read_lock() for read.
 */

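/*
 * A single exception entry: the device type (block/char), major/minor
 * numbers (~0 acts as a wildcard) and the access bits (read/write/mknod)
 * the exception applies to.
 */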
struct dev_exception_item {
	u32 major, minor;
	short type;
	short access;
	struct list_head list;
	struct rcu_head rcu;
};

struct dev_cgroup {
	struct cgroup_subsys_state css;
	struct list_head exceptions;
	enum devcg_behavior behavior;
};

static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
{
	return container_of(s, struct dev_cgroup, css);
}

static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup)
{
	return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id));
}

static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
{
	return css_to_devcgroup(task_subsys_state(task, devices_subsys_id));
}

struct cgroup_subsys devices_subsys;

static int devcgroup_can_attach(struct cgroup *new_cgrp,
				struct cgroup_taskset *set)
{
	struct task_struct *task = cgroup_taskset_first(set);

	if (current != task && !capable(CAP_SYS_ADMIN))
		return -EPERM;
	return 0;
}

/*
 * called under devcgroup_mutex
 */
static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig)
{
	struct dev_exception_item *ex, *tmp, *new;

	lockdep_assert_held(&devcgroup_mutex);

	list_for_each_entry(ex, orig, list) {
		new = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
		if (!new)
			goto free_and_exit;
		list_add_tail(&new->list, dest);
	}

	return 0;

free_and_exit:
	list_for_each_entry_safe(ex, tmp, dest, list) {
		list_del(&ex->list);
		kfree(ex);
	}
	return -ENOMEM;
}

/*
 * called under devcgroup_mutex
 */
static int dev_exception_add(struct dev_cgroup *dev_cgroup,
			     struct dev_exception_item *ex)
{
	struct dev_exception_item *excopy, *walk;

	lockdep_assert_held(&devcgroup_mutex);

	excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
	if (!excopy)
		return -ENOMEM;

	list_for_each_entry(walk, &dev_cgroup->exceptions, list) {
		if (walk->type != ex->type)
			continue;
		if (walk->major != ex->major)
			continue;
		if (walk->minor != ex->minor)
			continue;

		walk->access |= ex->access;
		kfree(excopy);
		excopy = NULL;
	}

	if (excopy != NULL)
		list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions);
	return 0;
}

/*
 * called under devcgroup_mutex
 */
static void dev_exception_rm(struct dev_cgroup *dev_cgroup,
			     struct dev_exception_item *ex)
{
	struct dev_exception_item *walk, *tmp;

	lockdep_assert_held(&devcgroup_mutex);

	list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) {
		if (walk->type != ex->type)
			continue;
		if (walk->major != ex->major)
			continue;
		if (walk->minor != ex->minor)
			continue;

		walk->access &= ~ex->access;
		if (!walk->access) {
			list_del_rcu(&walk->list);
			kfree_rcu(walk, rcu);
		}
	}
}

static void __dev_exception_clean(struct dev_cgroup *dev_cgroup)
{
	struct dev_exception_item *ex, *tmp;

	list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) {
		list_del_rcu(&ex->list);
		kfree_rcu(ex, rcu);
	}
}

/**
 * dev_exception_clean - frees all entries of the exception list
 * @dev_cgroup: dev_cgroup with the exception list to be cleaned
 *
 * called under devcgroup_mutex
 */
static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
{
	lockdep_assert_held(&devcgroup_mutex);

	__dev_exception_clean(dev_cgroup);
}

/**
 * devcgroup_online - initializes devcgroup's behavior and exceptions based on
 * 		      parent's
 * @cgroup: cgroup getting online
 * returns 0 in case of success, error code otherwise
 */
static int devcgroup_online(struct cgroup *cgroup)
{
	struct dev_cgroup *dev_cgroup, *parent_dev_cgroup = NULL;
	int ret = 0;

	mutex_lock(&devcgroup_mutex);
	dev_cgroup = cgroup_to_devcgroup(cgroup);
	if (cgroup->parent)
		parent_dev_cgroup = cgroup_to_devcgroup(cgroup->parent);

	if (parent_dev_cgroup == NULL)
		dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW;
	else {
		ret = dev_exceptions_copy(&dev_cgroup->exceptions,
					  &parent_dev_cgroup->exceptions);
		if (!ret)
			dev_cgroup->behavior = parent_dev_cgroup->behavior;
	}
	mutex_unlock(&devcgroup_mutex);

	return ret;
}

static void devcgroup_offline(struct cgroup *cgroup)
{
	struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup);

	mutex_lock(&devcgroup_mutex);
	dev_cgroup->behavior = DEVCG_DEFAULT_NONE;
	mutex_unlock(&devcgroup_mutex);
}

/*
 * called from kernel/cgroup.c with cgroup_lock() held.
 */
static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup)
{
	struct dev_cgroup *dev_cgroup;

	dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
	if (!dev_cgroup)
		return ERR_PTR(-ENOMEM);
	INIT_LIST_HEAD(&dev_cgroup->exceptions);
	dev_cgroup->behavior = DEVCG_DEFAULT_NONE;

	return &dev_cgroup->css;
}

static void devcgroup_css_free(struct cgroup *cgroup)
{
	struct dev_cgroup *dev_cgroup;

	dev_cgroup = cgroup_to_devcgroup(cgroup);
	__dev_exception_clean(dev_cgroup);
	kfree(dev_cgroup);
}

#define DEVCG_ALLOW 1
#define DEVCG_DENY 2
#define DEVCG_LIST 3

#define MAJMINLEN 13		/* room for a decimal u32 plus NUL */
#define ACCLEN 4		/* room for "rwm" plus NUL */

static void set_access(char *acc, short access)
{
	int idx = 0;
	memset(acc, 0, ACCLEN);
	if (access & ACC_READ)
		acc[idx++] = 'r';
	if (access & ACC_WRITE)
		acc[idx++] = 'w';
	if (access & ACC_MKNOD)
		acc[idx++] = 'm';
}

static char type_to_char(short type)
{
	if (type == DEV_ALL)
		return 'a';
	if (type == DEV_CHAR)
		return 'c';
	if (type == DEV_BLOCK)
		return 'b';
	return 'X';
}

static void set_majmin(char *str, unsigned m)
{
	if (m == ~0)
		strcpy(str, "*");
	else
		sprintf(str, "%u", m);
}

static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
				struct seq_file *m)
{
	struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
	struct dev_exception_item *ex;
	char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];

	rcu_read_lock();
	/*
	 * To preserve compatibility:
	 * - Only show the "all devices" entry when the default policy is to allow
	 * - List the exceptions in case the default policy is to deny
	 * This way, the file remains as a "whitelist of devices"
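	 * E.g. "a *:* rwm" is printed when the behavior is DEVCG_DEFAULT_ALLOW,
	 * while under DEVCG_DEFAULT_DENY each exception shows up as a line
	 * such as "c 1:3 rwm".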
	 */
	if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
		set_access(acc, ACC_MASK);
		set_majmin(maj, ~0);
		set_majmin(min, ~0);
		seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL),
			   maj, min, acc);
	} else {
		list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) {
			set_access(acc, ex->access);
			set_majmin(maj, ex->major);
			set_majmin(min, ex->minor);
			seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type),
				   maj, min, acc);
		}
	}
	rcu_read_unlock();

	return 0;
}

/**
 * may_access - verifies if a new exception is part of what is allowed
 *		by a dev cgroup based on the default policy +
 *		exceptions. This is used to make sure a child cgroup
 *		won't have more privileges than its parent or to
 *		verify if a certain access is allowed.
 * @dev_cgroup: dev cgroup to be tested against
 * @refex: new exception
 * @behavior: behavior of the exception
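 *
 * Example (plain access check): with @dev_cgroup using DEVCG_DEFAULT_DENY
 * and holding the exception "c 1:* rw", a request for "c 1:3 r" matches the
 * exception and is granted, while a mknod request "c 1:3 m" does not match
 * and is refused.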
 */
static bool may_access(struct dev_cgroup *dev_cgroup,
		       struct dev_exception_item *refex,
		       enum devcg_behavior behavior)
{
	struct dev_exception_item *ex;
	bool match = false;

	rcu_lockdep_assert(rcu_read_lock_held() ||
			   lockdep_is_held(&devcgroup_mutex),
			   "device_cgroup::may_access() called without proper synchronization");

	list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
		if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
			continue;
		if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR))
			continue;
		if (ex->major != ~0 && ex->major != refex->major)
			continue;
		if (ex->minor != ~0 && ex->minor != refex->minor)
			continue;
		if (refex->access & (~ex->access))
			continue;
		match = true;
		break;
	}

	if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
		if (behavior == DEVCG_DEFAULT_ALLOW) {
			/* the exception will deny access to certain devices */
			return true;
		} else {
			/* the exception will allow access to certain devices */
			if (match)
				/*
				 * a new exception allowing access shouldn't
				 * match a parent's exception
				 */
				return false;
			return true;
		}
	} else {
		/* only behavior == DEVCG_DEFAULT_DENY allowed here */
		if (match)
			/* parent has an exception that matches the proposed */
			return true;
		else
			return false;
	}
	return false;
}

/*
 * parent_has_perm:
 * when adding a new allow rule to a device exception list, the rule
 * must be allowed in the parent device cgroup
 */
static int parent_has_perm(struct dev_cgroup *childcg,
				  struct dev_exception_item *ex)
{
	struct cgroup *pcg = childcg->css.cgroup->parent;
	struct dev_cgroup *parent;

	if (!pcg)
		return 1;
	parent = cgroup_to_devcgroup(pcg);
	return may_access(parent, ex, childcg->behavior);
}

/**
 * may_allow_all - checks if it's possible to change the behavior to
 *		   allow based on parent's rules.
 * @parent: device cgroup's parent
 * returns: != 0 in case it's allowed, 0 otherwise
 */
static inline int may_allow_all(struct dev_cgroup *parent)
{
	if (!parent)
		return 1;
	return parent->behavior == DEVCG_DEFAULT_ALLOW;
}

/*
 * Modify the exception list using allow/deny rules.
 * CAP_SYS_ADMIN is needed for this.  It's at least separate from CAP_MKNOD
 * so we can give a container CAP_MKNOD to let it create devices but not
 * modify the exception list.
 * It seems likely we'll want to add a CAP_CONTAINER capability to allow
 * us to also grant CAP_SYS_ADMIN to containers without giving away the
 * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN
 *
 * Taking rules away is always allowed (given CAP_SYS_ADMIN).  Granting
 * new access is only allowed if you're in the top-level cgroup, or your
 * parent cgroup has the access you're asking for.
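 *
 * Example of the write syntax (the cgroup v1 mount point below is just the
 * usual convention, nothing this file depends on):
 *
 *   echo a > /sys/fs/cgroup/devices/foo/devices.deny
 *   echo 'c 1:3 rw' > /sys/fs/cgroup/devices/foo/devices.allow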
 */
static int devcgroup_update_access(struct dev_cgroup *devcgroup,
				   int filetype, const char *buffer)
{
	const char *b;
	char temp[12];		/* 10 digits + NUL is enough for a decimal u32 */
	int count, rc = 0;
	struct dev_exception_item ex;
	struct cgroup *p = devcgroup->css.cgroup;
	struct dev_cgroup *parent = NULL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (p->parent)
		parent = cgroup_to_devcgroup(p->parent);

	memset(&ex, 0, sizeof(ex));
	b = buffer;

	switch (*b) {
	case 'a':
		switch (filetype) {
		case DEVCG_ALLOW:
			if (!may_allow_all(parent))
				return -EPERM;
			dev_exception_clean(devcgroup);
			devcgroup->behavior = DEVCG_DEFAULT_ALLOW;
			if (!parent)
				break;

			rc = dev_exceptions_copy(&devcgroup->exceptions,
						 &parent->exceptions);
			if (rc)
				return rc;
			break;
		case DEVCG_DENY:
			dev_exception_clean(devcgroup);
			devcgroup->behavior = DEVCG_DEFAULT_DENY;
			break;
		default:
			return -EINVAL;
		}
		return 0;
	case 'b':
		ex.type = DEV_BLOCK;
		break;
	case 'c':
		ex.type = DEV_CHAR;
		break;
	default:
		return -EINVAL;
	}
	b++;
	if (!isspace(*b))
		return -EINVAL;
	b++;
	if (*b == '*') {
		ex.major = ~0;
		b++;
	} else if (isdigit(*b)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *b;
			b++;
			if (!isdigit(*b))
				break;
		}
		rc = kstrtou32(temp, 10, &ex.major);
		if (rc)
			return -EINVAL;
	} else {
		return -EINVAL;
	}
	if (*b != ':')
		return -EINVAL;
	b++;

	/* read minor */
	if (*b == '*') {
		ex.minor = ~0;
		b++;
	} else if (isdigit(*b)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *b;
			b++;
			if (!isdigit(*b))
				break;
		}
		rc = kstrtou32(temp, 10, &ex.minor);
		if (rc)
			return -EINVAL;
	} else {
		return -EINVAL;
	}
	if (!isspace(*b))
		return -EINVAL;
	for (b++, count = 0; count < 3; count++, b++) {
		switch (*b) {
		case 'r':
			ex.access |= ACC_READ;
			break;
		case 'w':
			ex.access |= ACC_WRITE;
			break;
		case 'm':
			ex.access |= ACC_MKNOD;
			break;
		case '\n':
		case '\0':
			count = 3;
			break;
		default:
			return -EINVAL;
		}
	}

	switch (filetype) {
	case DEVCG_ALLOW:
		if (!parent_has_perm(devcgroup, &ex))
			return -EPERM;
		/*
		 * If the default policy is to allow, try to remove a matching
		 * exception instead, and be silent about it: we don't want
		 * to break compatibility
		 */
		if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
			dev_exception_rm(devcgroup, &ex);
			return 0;
		}
		return dev_exception_add(devcgroup, &ex);
	case DEVCG_DENY:
		/*
		 * If the default policy is to deny, try to remove a matching
		 * exception instead, and be silent about it: we don't want
		 * to break compatibility
		 */
		if (devcgroup->behavior == DEVCG_DEFAULT_DENY) {
			dev_exception_rm(devcgroup, &ex);
			return 0;
		}
		return dev_exception_add(devcgroup, &ex);
	default:
		return -EINVAL;
	}
	return 0;
}

static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft,
				  const char *buffer)
{
	int retval;

	mutex_lock(&devcgroup_mutex);
	retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp),
					 cft->private, buffer);
	mutex_unlock(&devcgroup_mutex);
	return retval;
}

static struct cftype dev_cgroup_files[] = {
	{
		.name = "allow",
		.write_string  = devcgroup_access_write,
		.private = DEVCG_ALLOW,
	},
	{
		.name = "deny",
		.write_string = devcgroup_access_write,
		.private = DEVCG_DENY,
	},
	{
		.name = "list",
		.read_seq_string = devcgroup_seq_read,
		.private = DEVCG_LIST,
	},
	{ }	/* terminate */
};

struct cgroup_subsys devices_subsys = {
	.name = "devices",
	.can_attach = devcgroup_can_attach,
	.css_alloc = devcgroup_css_alloc,
	.css_free = devcgroup_css_free,
	.css_online = devcgroup_online,
	.css_offline = devcgroup_offline,
	.subsys_id = devices_subsys_id,
	.base_cftypes = dev_cgroup_files,

	/*
	 * While the devices cgroup has rudimentary hierarchy support that
	 * checks the parent's restrictions, it doesn't properly propagate
	 * config changes from ancestors to their descendants.  A child
	 * should only be allowed to add more restrictions to the parent's
	 * configuration.  Fix it and remove the following.
	 */
	.broken_hierarchy = true,
};

/**
 * __devcgroup_check_permission - checks if an inode operation is permitted
 * @type: device type
 * @major: device major number
 * @minor: device minor number
 * @access: combination of ACC_WRITE, ACC_READ and ACC_MKNOD
 *
 * The check is made against the device cgroup of the current task.
 *
 * returns 0 on success, -EPERM in case the operation is not permitted
 */
static int __devcgroup_check_permission(short type, u32 major, u32 minor,
				        short access)
{
	struct dev_cgroup *dev_cgroup;
	struct dev_exception_item ex;
	int rc;

	memset(&ex, 0, sizeof(ex));
	ex.type = type;
	ex.major = major;
	ex.minor = minor;
	ex.access = access;

	rcu_read_lock();
	dev_cgroup = task_devcgroup(current);
	rc = may_access(dev_cgroup, &ex, dev_cgroup->behavior);
	rcu_read_unlock();

	if (!rc)
		return -EPERM;

	return 0;
}

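/*
 * Entry points called from the VFS paths (see <linux/device_cgroup.h>):
 * translate an access or mknod request on a device node into an exception
 * and check it against the current task's device cgroup.
 */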
int __devcgroup_inode_permission(struct inode *inode, int mask)
{
	short type, access = 0;

	if (S_ISBLK(inode->i_mode))
		type = DEV_BLOCK;
	if (S_ISCHR(inode->i_mode))
		type = DEV_CHAR;
	if (mask & MAY_WRITE)
		access |= ACC_WRITE;
	if (mask & MAY_READ)
		access |= ACC_READ;

	return __devcgroup_check_permission(type, imajor(inode), iminor(inode),
			access);
}

int devcgroup_inode_mknod(int mode, dev_t dev)
{
	short type;

	if (!S_ISBLK(mode) && !S_ISCHR(mode))
		return 0;

	if (S_ISBLK(mode))
		type = DEV_BLOCK;
	else
		type = DEV_CHAR;

	return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev),
			ACC_MKNOD);

}