namei.c 107 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
18
#include <linux/export.h>
19
#include <linux/kernel.h>
Linus Torvalds's avatar
Linus Torvalds committed
20
21
22
23
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
Robert Love's avatar
Robert Love committed
24
#include <linux/fsnotify.h>
Linus Torvalds's avatar
Linus Torvalds committed
25
26
#include <linux/personality.h>
#include <linux/security.h>
Mimi Zohar's avatar
Mimi Zohar committed
27
#include <linux/ima.h>
Linus Torvalds's avatar
Linus Torvalds committed
28
29
30
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
31
#include <linux/capability.h>
32
#include <linux/file.h>
33
#include <linux/fcntl.h>
34
#include <linux/device_cgroup.h>
35
#include <linux/fs_struct.h>
36
#include <linux/posix_acl.h>
Linus Torvalds's avatar
Linus Torvalds committed
37
38
#include <asm/uaccess.h>

39
#include "internal.h"
40
#include "mount.h"
41

Linus Torvalds's avatar
Linus Torvalds committed
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
Lucas De Marchi's avatar
Lucas De Marchi committed
76
 * the name is a symlink pointing to a non-existent name.
Linus Torvalds's avatar
Linus Torvalds committed
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
109
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
Linus Torvalds's avatar
Linus Torvalds committed
110
111
112
113
114
115
116
117
118
119
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
120
void final_putname(struct filename *name)
Linus Torvalds's avatar
Linus Torvalds committed
121
{
122
123
124
125
126
127
	if (name->separate) {
		__putname(name->name);
		kfree(name);
	} else {
		__putname(name);
	}
128
129
}

130
131
/*
 * Room left for the pathname when struct filename is embedded at the front
 * of the name buffer (getname_flags() first copies with this limit and only
 * falls back to the full PATH_MAX when the name fills it completely).
 */
#define EMBEDDED_NAME_MAX	(PATH_MAX - sizeof(struct filename))

132
133
134
135
/*
 * getname_flags - copy a pathname from userspace into a struct filename.
 * @filename: userspace pointer to the NUL-terminated pathname
 * @flags:    only LOOKUP_EMPTY is examined: when set, an empty string is
 *            accepted instead of failing with -ENOENT
 * @empty:    optional; set to 1 when the copied string is empty
 *
 * Returns the filename (possibly one handed back by audit_reusename()), or
 * an ERR_PTR(): -ENOMEM on allocation failure, the error reported by
 * strncpy_from_user(), -ENOENT for a rejected empty path, or -ENAMETOOLONG
 * when the string does not fit in PATH_MAX including its terminator.
 */
static struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
	struct filename *result, *err;
	char *kname;
	long max;
	int len;

	result = audit_reusename(filename);
	if (result)
		return result;

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

	/*
	 * Optimistic layout: the struct filename sits at the start of the
	 * names_cache allocation and the string follows right behind it.
	 */
	kname = (char *)result + sizeof(*result);
	result->name = kname;
	result->separate = false;
	max = EMBEDDED_NAME_MAX;

	for (;;) {
		len = strncpy_from_user(kname, filename, max);
		if (unlikely(len < 0)) {
			err = ERR_PTR(len);
			goto fail;
		}

		/* Done unless the embedded copy filled its space completely. */
		if (len != EMBEDDED_NAME_MAX || max != EMBEDDED_NAME_MAX)
			break;

		/*
		 * The name is approaching PATH_MAX.  Give the whole
		 * names_cache allocation to the string, move the struct
		 * filename into its own allocation and copy again.
		 */
		kname = (char *)result;

		result = kzalloc(sizeof(*result), GFP_KERNEL);
		if (!result) {
			err = ERR_PTR(-ENOMEM);
			/* point back at the original buffer so it gets freed */
			result = (struct filename *)kname;
			goto fail;
		}
		result->name = kname;
		result->separate = true;
		max = PATH_MAX;
	}

	/* The empty path is special. */
	if (unlikely(len == 0)) {
		if (empty)
			*empty = 1;
		if (!(flags & LOOKUP_EMPTY)) {
			err = ERR_PTR(-ENOENT);
			goto fail;
		}
	}

	if (unlikely(len >= PATH_MAX)) {
		err = ERR_PTR(-ENAMETOOLONG);
		goto fail;
	}

	result->uptr = filename;
	audit_getname(result);
	return result;

fail:
	final_putname(result);
	return err;
}

207
208
/*
 * getname - getname_flags() with no flags and no empty-path reporting.
 * @filename: userspace pointer to the pathname
 *
 * Returns whatever getname_flags() returns for (@filename, 0, NULL).
 */
struct filename *
getname(const char __user * filename)
{
	struct filename *name = getname_flags(filename, 0, NULL);

	return name;
}
EXPORT_SYMBOL(getname);
Al Viro's avatar
Al Viro committed
213

Linus Torvalds's avatar
Linus Torvalds committed
214
#ifdef CONFIG_AUDITSYSCALL
215
/*
 * putname - release a struct filename.
 * @name: filename to release
 *
 * When there is a live audit context (audit_dummy_context() is false) the
 * name is handed to audit_putname(); otherwise it is freed immediately via
 * final_putname().
 */
void putname(struct filename *name)
{
	if (unlikely(!audit_dummy_context())) {
		/*
		 * putname() returns void, so call the audit hook as a plain
		 * statement instead of using "return <expression>;".
		 */
		audit_putname(name);
		return;
	}
	final_putname(name);
}
#endif

223
224
static int check_acl(struct inode *inode, int mask)
{
225
#ifdef CONFIG_FS_POSIX_ACL
226
227
228
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
229
230
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
231
	                return -EAGAIN;
232
233
234
		/* no ->get_acl() calls in RCU mode... */
		if (acl == ACL_NOT_CACHED)
			return -ECHILD;
235
	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
236
237
238
239
240
	}

	acl = get_cached_acl(inode, ACL_TYPE_ACCESS);

	/*
241
242
243
	 * A filesystem can force a ACL callback by just never filling the
	 * ACL cache. But normally you'd fill the cache either at inode
	 * instantiation time, or on the first ->get_acl call.
244
	 *
245
246
	 * If the filesystem doesn't have a get_acl() function at all, we'll
	 * just create the negative cache entry.
247
248
	 */
	if (acl == ACL_NOT_CACHED) {
249
250
251
252
253
254
255
256
	        if (inode->i_op->get_acl) {
			acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
			if (IS_ERR(acl))
				return PTR_ERR(acl);
		} else {
		        set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
		        return -EAGAIN;
		}
257
258
259
260
261
262
263
	}

	if (acl) {
	        int error = posix_acl_permission(inode, acl, mask);
	        posix_acl_release(acl);
	        return error;
	}
264
#endif
265
266
267
268

	return -EAGAIN;
}

269
/*
270
 * This does the basic permission checking
Linus Torvalds's avatar
Linus Torvalds committed
271
 */
272
static int acl_permission_check(struct inode *inode, int mask)
Linus Torvalds's avatar
Linus Torvalds committed
273
{
274
	unsigned int mode = inode->i_mode;
Linus Torvalds's avatar
Linus Torvalds committed
275

276
	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
Linus Torvalds's avatar
Linus Torvalds committed
277
278
		mode >>= 6;
	else {
279
		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
280
			int error = check_acl(inode, mask);
281
282
			if (error != -EAGAIN)
				return error;
Linus Torvalds's avatar
Linus Torvalds committed
283
284
285
286
287
288
289
290
291
		}

		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/*
	 * If the DACs are ok we don't need any capability check.
	 */
292
	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
Linus Torvalds's avatar
Linus Torvalds committed
293
		return 0;
294
295
296
297
	return -EACCES;
}

/**
298
 * generic_permission -  check for access rights on a Posix-like filesystem
299
 * @inode:	inode to check access rights for
300
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
301
302
303
304
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
305
306
307
308
309
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
310
 */
311
int generic_permission(struct inode *inode, int mask)
312
313
314
315
{
	int ret;

	/*
316
	 * Do the basic permission checks.
317
	 */
318
	ret = acl_permission_check(inode, mask);
319
320
	if (ret != -EACCES)
		return ret;
Linus Torvalds's avatar
Linus Torvalds committed
321

322
323
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
324
		if (inode_capable(inode, CAP_DAC_OVERRIDE))
325
326
			return 0;
		if (!(mask & MAY_WRITE))
327
			if (inode_capable(inode, CAP_DAC_READ_SEARCH))
328
329
330
				return 0;
		return -EACCES;
	}
Linus Torvalds's avatar
Linus Torvalds committed
331
332
	/*
	 * Read/write DACs are always overridable.
333
334
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
Linus Torvalds's avatar
Linus Torvalds committed
335
	 */
336
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
337
		if (inode_capable(inode, CAP_DAC_OVERRIDE))
Linus Torvalds's avatar
Linus Torvalds committed
338
339
340
341
342
			return 0;

	/*
	 * Searching includes executable on directories, else just read.
	 */
343
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
344
	if (mask == MAY_READ)
345
		if (inode_capable(inode, CAP_DAC_READ_SEARCH))
Linus Torvalds's avatar
Linus Torvalds committed
346
347
348
349
350
			return 0;

	return -EACCES;
}

351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
/*
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values.  So inode->i_opflags caches
 * an IOP_FASTPERM bit meaning "this inode has no special ->permission
 * method, take the fast path".
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	if (likely(inode->i_opflags & IOP_FASTPERM))
		return generic_permission(inode, mask);

	if (likely(inode->i_op->permission))
		return inode->i_op->permission(inode, mask);

	/* This gets set once for the inode lifetime */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_FASTPERM;
	spin_unlock(&inode->i_lock);

	return generic_permission(inode, mask);
}

Christoph Hellwig's avatar
Christoph Hellwig committed
371
/**
David Howells's avatar
David Howells committed
372
373
374
 * __inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
Christoph Hellwig's avatar
Christoph Hellwig committed
375
 *
David Howells's avatar
David Howells committed
376
 * Check for read/write/execute permissions on an inode.
377
378
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
David Howells's avatar
David Howells committed
379
380
381
 *
 * This does not check for a read-only file system.  You probably want
 * inode_permission().
Christoph Hellwig's avatar
Christoph Hellwig committed
382
 */
David Howells's avatar
David Howells committed
383
int __inode_permission(struct inode *inode, int mask)
Linus Torvalds's avatar
Linus Torvalds committed
384
{
385
	int retval;
Linus Torvalds's avatar
Linus Torvalds committed
386

387
	if (unlikely(mask & MAY_WRITE)) {
Linus Torvalds's avatar
Linus Torvalds committed
388
389
390
391
392
393
394
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EACCES;
	}

395
	retval = do_inode_permission(inode, mask);
Linus Torvalds's avatar
Linus Torvalds committed
396
397
398
	if (retval)
		return retval;

399
400
401
402
	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

403
	return security_inode_permission(inode, mask);
Linus Torvalds's avatar
Linus Torvalds committed
404
405
}

David Howells's avatar
David Howells committed
406
407
408
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 * Returns -EROFS for a write request on a regular file, directory or symlink
 * of a read-only superblock, 0 otherwise.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	umode_t mode;

	if (likely(!(mask & MAY_WRITE)))
		return 0;

	mode = inode->i_mode;

	/* Nobody gets write access to a read-only fs. */
	if (!(sb->s_flags & MS_RDONLY))
		return 0;
	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
		return -EROFS;

	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * The superblock-level check (sb_permission()) runs first; only when it
 * passes is __inode_permission() consulted.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
	int retval = sb_permission(inode->i_sb, inode, mask);

	return retval ? retval : __inode_permission(inode, mask);
}

Jan Blunck's avatar
Jan Blunck committed
448
449
450
451
452
453
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
454
void path_get(const struct path *path)
Jan Blunck's avatar
Jan Blunck committed
455
456
457
458
459
460
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

Jan Blunck's avatar
Jan Blunck committed
461
462
463
464
465
466
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
467
void path_put(const struct path *path)
Linus Torvalds's avatar
Linus Torvalds committed
468
{
Jan Blunck's avatar
Jan Blunck committed
469
470
	dput(path->dentry);
	mntput(path->mnt);
Linus Torvalds's avatar
Linus Torvalds committed
471
}
Jan Blunck's avatar
Jan Blunck committed
472
EXPORT_SYMBOL(path_put);
Linus Torvalds's avatar
Linus Torvalds committed
473

Al Viro's avatar
Al Viro committed
474
/*
Nick Piggin's avatar
Nick Piggin committed
475
 * Path walking has 2 modes, rcu-walk and ref-walk (see
Al Viro's avatar
Al Viro committed
476
477
478
479
480
481
482
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
Nick Piggin's avatar
Nick Piggin committed
483
484
 */

Al Viro's avatar
Al Viro committed
485
486
487
488
489
490
491
492
493
494
495
496
/*
 * Enter the locking used while in rcu-walk mode: take vfsmount_lock for
 * reading with br_read_lock(), then rcu_read_lock().  unlock_rcu_walk()
 * releases the two in the opposite order.
 */
static inline void lock_rcu_walk(void)
{
	br_read_lock(&vfsmount_lock);
	rcu_read_lock();
}

/*
 * Leave the locking used while in rcu-walk mode: rcu_read_unlock() first,
 * then drop the read side of vfsmount_lock with br_read_unlock().
 */
static inline void unlock_rcu_walk(void)
{
	rcu_read_unlock();
	br_read_unlock(&vfsmount_lock);
}

Nick Piggin's avatar
Nick Piggin committed
497
/**
Al Viro's avatar
Al Viro committed
498
499
500
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry or NULL
501
 * Returns: 0 on success, -ECHILD on failure
Nick Piggin's avatar
Nick Piggin committed
502
 *
Al Viro's avatar
Al Viro committed
503
504
505
 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd or NULL.  Must be called from rcu-walk context.
Nick Piggin's avatar
Nick Piggin committed
506
 */
Al Viro's avatar
Al Viro committed
507
static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
Nick Piggin's avatar
Nick Piggin committed
508
509
510
511
512
{
	struct fs_struct *fs = current->fs;
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533

	/*
	 * Get a reference to the parent first: we're
	 * going to make "path_put(nd->path)" valid in
	 * non-RCU context for "terminate_walk()".
	 *
	 * If this doesn't work, return immediately with
	 * RCU walking still active (and then we will do
	 * the RCU walk cleanup in terminate_walk()).
	 */
	if (!lockref_get_not_dead(&parent->d_lockref))
		return -ECHILD;

	/*
	 * After the mntget(), we terminate_walk() will do
	 * the right thing for non-RCU mode, and all our
	 * subsequent exit cases should unlock_rcu_walk()
	 * before returning.
	 */
	mntget(nd->path.mnt);
	nd->flags &= ~LOOKUP_RCU;
534
535
536
537
538
539
540
541
542
543
544
545

	/*
	 * For a negative lookup, the lookup sequence point is the parents
	 * sequence point, and it only needs to revalidate the parent dentry.
	 *
	 * For a positive lookup, we need to move both the parent and the
	 * dentry from the RCU domain to be properly refcounted. And the
	 * sequence number in the dentry validates *both* dentry counters,
	 * since we checked the sequence number of the parent after we got
	 * the child sequence number. So we know the parent must still
	 * be valid if the child sequence number is still valid.
	 */
Al Viro's avatar
Al Viro committed
546
	if (!dentry) {
547
548
		if (read_seqcount_retry(&parent->d_seq, nd->seq))
			goto out;
Al Viro's avatar
Al Viro committed
549
550
		BUG_ON(nd->inode != parent->d_inode);
	} else {
551
552
553
554
		if (!lockref_get_not_dead(&dentry->d_lockref))
			goto out;
		if (read_seqcount_retry(&dentry->d_seq, nd->seq))
			goto drop_dentry;
Al Viro's avatar
Al Viro committed
555
	}
556
557
558
559
560
561
562
563
564

	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		spin_lock(&fs->lock);
		if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
			goto unlock_and_drop_dentry;
Nick Piggin's avatar
Nick Piggin committed
565
566
567
568
		path_get(&nd->root);
		spin_unlock(&fs->lock);
	}

Al Viro's avatar
Al Viro committed
569
	unlock_rcu_walk();
Nick Piggin's avatar
Nick Piggin committed
570
	return 0;
Al Viro's avatar
Al Viro committed
571

572
573
574
575
unlock_and_drop_dentry:
	spin_unlock(&fs->lock);
drop_dentry:
	unlock_rcu_walk();
576
	dput(dentry);
577
	goto drop_root_mnt;
578
579
out:
	unlock_rcu_walk();
580
581
582
drop_root_mnt:
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
Nick Piggin's avatar
Nick Piggin committed
583
584
585
	return -ECHILD;
}

586
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
587
{
588
	return dentry->d_op->d_revalidate(dentry, flags);
589
590
}

591
592
593
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
594
 *
595
596
597
598
599
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
600
 */
601
static int complete_walk(struct nameidata *nd)
602
{
Al Viro's avatar
Al Viro committed
603
	struct dentry *dentry = nd->path.dentry;
604
605
	int status;

606
607
608
609
	if (nd->flags & LOOKUP_RCU) {
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
610

611
612
613
614
615
		if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
			unlock_rcu_walk();
			return -ECHILD;
		}
		if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
Al Viro's avatar
Al Viro committed
616
			unlock_rcu_walk();
617
			dput(dentry);
618
619
620
			return -ECHILD;
		}
		mntget(nd->path.mnt);
Al Viro's avatar
Al Viro committed
621
		unlock_rcu_walk();
622
623
	}

Al Viro's avatar
Al Viro committed
624
625
626
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

627
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
628
629
		return 0;

630
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
631
632
633
	if (status > 0)
		return 0;

Al Viro's avatar
Al Viro committed
634
	if (!status)
635
		status = -ESTALE;
Al Viro's avatar
Al Viro committed
636

637
	path_put(&nd->path);
638
639
640
	return status;
}

Al Viro's avatar
Al Viro committed
641
642
static __always_inline void set_root(struct nameidata *nd)
{
643
644
	if (!nd->root.mnt)
		get_fs_root(current->fs, &nd->root);
Al Viro's avatar
Al Viro committed
645
646
}

647
648
/* Forward declaration: follow_link() below calls this before its definition. */
static int link_path_walk(const char *, struct nameidata *);

Nick Piggin's avatar
Nick Piggin committed
649
650
651
652
/*
 * Variant of set_root() that takes no references: when nd->root.mnt is
 * unset, copy current->fs->root into nd->root and sample the root dentry's
 * d_seq into nd->seq, retrying for as long as fs->seq reports a change.
 */
static __always_inline void set_root_rcu(struct nameidata *nd)
{
	struct fs_struct *fs;
	unsigned seq;

	if (nd->root.mnt)
		return;

	fs = current->fs;
	do {
		seq = read_seqcount_begin(&fs->seq);
		nd->root = fs->root;
		nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
	} while (read_seqcount_retry(&fs->seq, seq));
}

Jan Blunck's avatar
Jan Blunck committed
663
static void path_put_conditional(struct path *path, struct nameidata *nd)
664
665
{
	dput(path->dentry);
666
	if (path->mnt != nd->path.mnt)
667
668
669
		mntput(path->mnt);
}

670
671
static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
672
{
Nick Piggin's avatar
Nick Piggin committed
673
674
675
676
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
677
	}
Nick Piggin's avatar
Nick Piggin committed
678
	nd->path.mnt = path->mnt;
679
	nd->path.dentry = path->dentry;
680
681
}

Christoph Hellwig's avatar
Christoph Hellwig committed
682
683
684
685
686
687
688
689
690
691
692
693
694
/*
 * Helper to directly jump to a known parsed path from ->follow_link,
 * caller must have taken a reference to path beforehand.
 *
 * The old nd->path is put, *path is installed in its place, nd->inode is
 * refreshed from the new dentry and LOOKUP_JUMPED is set.
 */
void nd_jump_link(struct nameidata *nd, struct path *path)
{
	path_put(&nd->path);

	nd->path = *path;
	nd->inode = path->dentry->d_inode;
	nd->flags |= LOOKUP_JUMPED;
}

695
696
697
static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
{
	struct inode *inode = link->dentry->d_inode;
698
	if (inode->i_op->put_link)
699
700
701
702
		inode->i_op->put_link(link->dentry, nd, cookie);
	path_put(link);
}

703
704
/*
 * Link-protection switches, both initialised to 0 (off).
 * may_follow_link() allows everything while sysctl_protected_symlinks is
 * zero, and may_linkat() allows everything while
 * sysctl_protected_hardlinks is zero.
 */
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
Kees Cook's avatar
Kees Cook committed
705
706
707
708

/**
 * may_follow_link - Check symlink following for unsafe situations
 * @link: The path of the symlink
709
 * @nd: nameidata pathwalk data
Kees Cook's avatar
Kees Cook committed
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
static inline int may_follow_link(struct path *link, struct nameidata *nd)
{
	const struct inode *inode;
	const struct inode *parent;

	if (!sysctl_protected_symlinks)
		return 0;

	/* Allowed if owner and follower match. */
	inode = link->dentry->d_inode;
732
	if (uid_eq(current_cred()->fsuid, inode->i_uid))
Kees Cook's avatar
Kees Cook committed
733
734
735
736
737
738
739
740
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
	parent = nd->path.dentry->d_inode;
	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
		return 0;

	/* Allowed if parent directory and link owner match. */
741
	if (uid_eq(parent->i_uid, inode->i_uid))
Kees Cook's avatar
Kees Cook committed
742
743
		return 0;

744
	audit_log_link_denied("follow_link", link);
Kees Cook's avatar
Kees Cook committed
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
	path_put_conditional(link, nd);
	path_put(&nd->path);
	return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions holds:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct inode *inode)
{
	umode_t mode = inode->i_mode;
	const umode_t sgid_exec = S_ISGID | S_IXGRP;

	/*
	 * Special files, setuid files and executable setgid files should
	 * not get pinned to the filesystem.
	 */
	if (!S_ISREG(mode) ||
	    (mode & S_ISUID) ||
	    (mode & sgid_exec) == sgid_exec)
		return false;

	/* Hardlinking to unreadable or unwritable sources is dangerous. */
	return inode_permission(inode, MAY_READ | MAY_WRITE) == 0;
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 *  - not CAP_FOWNER
 *
 * Returns 0 if successful, -ve on error.
 */
static int may_linkat(struct path *link)
{
	const struct cred *cred;
	struct inode *inode;

	if (!sysctl_protected_hardlinks)
		return 0;

	cred = current_cred();
	inode = link->dentry->d_inode;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
811
	if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
Kees Cook's avatar
Kees Cook committed
812
813
814
	    capable(CAP_FOWNER))
		return 0;

815
	audit_log_link_denied("linkat", link);
Kees Cook's avatar
Kees Cook committed
816
817
818
	return -EPERM;
}

Al Viro's avatar
Al Viro committed
819
static __always_inline int
820
follow_link(struct path *link, struct nameidata *nd, void **p)
Linus Torvalds's avatar
Linus Torvalds committed
821
{
822
	struct dentry *dentry = link->dentry;
823
824
	int error;
	char *s;
Linus Torvalds's avatar
Linus Torvalds committed
825

826
827
	BUG_ON(nd->flags & LOOKUP_RCU);

Al Viro's avatar
Al Viro committed
828
829
830
	if (link->mnt == nd->path.mnt)
		mntget(link->mnt);

831
832
833
834
	error = -ELOOP;
	if (unlikely(current->total_link_count >= 40))
		goto out_put_nd_path;

835
836
837
	cond_resched();
	current->total_link_count++;

Al Viro's avatar
Al Viro committed
838
	touch_atime(link);
Linus Torvalds's avatar
Linus Torvalds committed
839
	nd_set_link(nd, NULL);
Al Viro's avatar
Al Viro committed
840

841
	error = security_inode_follow_link(link->dentry, nd);
842
843
	if (error)
		goto out_put_nd_path;
844

845
	nd->last_type = LAST_BIND;
Al Viro's avatar
Al Viro committed
846
847
	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
	error = PTR_ERR(*p);
848
	if (IS_ERR(*p))
849
		goto out_put_nd_path;
850
851
852
853

	error = 0;
	s = nd_get_link(nd);
	if (s) {
854
855
856
857
858
859
860
861
862
863
864
865
866
867
		if (unlikely(IS_ERR(s))) {
			path_put(&nd->path);
			put_link(nd, link, *p);
			return PTR_ERR(s);
		}
		if (*s == '/') {
			set_root(nd);
			path_put(&nd->path);
			nd->path = nd->root;
			path_get(&nd->root);
			nd->flags |= LOOKUP_JUMPED;
		}
		nd->inode = nd->path.dentry->d_inode;
		error = link_path_walk(s, nd);
Christoph Hellwig's avatar
Christoph Hellwig committed
868
869
		if (unlikely(error))
			put_link(nd, link, *p);
Linus Torvalds's avatar
Linus Torvalds committed
870
	}
871
872
873
874

	return error;

out_put_nd_path:
875
	*p = NULL;
876
877
	path_put(&nd->path);
	path_put(link);
Linus Torvalds's avatar
Linus Torvalds committed
878
879
880
	return error;
}

Nick Piggin's avatar
Nick Piggin committed
881
882
/*
 * Step @path from the root of its mount up to the mountpoint in the parent
 * mount.  Unlike follow_up(), no references are taken or dropped and no
 * lock is acquired here.
 *
 * Returns 1 if @path was moved, 0 if the mount is its own parent (in which
 * case @path is left untouched).
 */
static int follow_up_rcu(struct path *path)
{
	struct mount *child = real_mount(path->mnt);
	struct mount *up = child->mnt_parent;

	if (&up->mnt == path->mnt)
		return 0;

	path->dentry = child->mnt_mountpoint;
	path->mnt = &up->mnt;
	return 1;
}

896
897
898
899
900
901
902
903
904
905
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
Al Viro's avatar
Al Viro committed
906
int follow_up(struct path *path)
Linus Torvalds's avatar
Linus Torvalds committed
907
{
908
909
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
Linus Torvalds's avatar
Linus Torvalds committed
910
	struct dentry *mountpoint;
Nick Piggin's avatar
Nick Piggin committed
911

Andi Kleen's avatar
Andi Kleen committed
912
	br_read_lock(&vfsmount_lock);
913
	parent = mnt->mnt_parent;
Al Viro's avatar
Al Viro committed
914
	if (parent == mnt) {
Andi Kleen's avatar
Andi Kleen committed
915
		br_read_unlock(&vfsmount_lock);
Linus Torvalds's avatar
Linus Torvalds committed
916
917
		return 0;
	}
918
	mntget(&parent->mnt);
919
	mountpoint = dget(mnt->mnt_mountpoint);
Andi Kleen's avatar
Andi Kleen committed
920
	br_read_unlock(&vfsmount_lock);
Al Viro's avatar
Al Viro committed
921
922
923
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
924
	path->mnt = &parent->mnt;
Linus Torvalds's avatar
Linus Torvalds committed
925
926
927
	return 1;
}

Nick Piggin's avatar
Nick Piggin committed
928
/*
929
930
931
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
Linus Torvalds's avatar
Linus Torvalds committed
932
 */
933
934
static int follow_automount(struct path *path, unsigned flags,
			    bool *need_mntput)
Nick Piggin's avatar
Nick Piggin committed
935
{
936
	struct vfsmount *mnt;
937
	int err;
938
939
940
941

	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
		return -EREMOTE;

942
943
944
945
946
947
948
949
950
951
	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
952
	 */
953
	if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
954
		     LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
955
956
957
	    path->dentry->d_inode)
		return -EISDIR;

958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
	current->total_link_count++;
	if (current->total_link_count >= 40)
		return -ELOOP;

	mnt = path->dentry->d_op->d_automount(path);
	if (IS_ERR(mnt)) {
		/*
		 * The filesystem is allowed to return -EISDIR here to indicate
		 * it doesn't want to automount.  For instance, autofs would do
		 * this so that its userspace daemon can mount on this dentry.
		 *
		 * However, we can only permit this if it's a terminal point in
		 * the path being looked up; if it wasn't then the remainder of
		 * the path is inaccessible and we should say so.
		 */
Al Viro's avatar
Al Viro committed
973
		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
974
975
			return -EREMOTE;
		return PTR_ERR(mnt);
Nick Piggin's avatar
Nick Piggin committed
976
	}
977

978
979
	if (!mnt) /* mount collision */
		return 0;
Nick Piggin's avatar
Nick Piggin committed
980

981
982
983
984
985
	if (!*need_mntput) {
		/* lock_mount() may release path->mnt on error */
		mntget(path->mnt);
		*need_mntput = true;
	}
986
	err = finish_automount(mnt, path);
987

988
989
990
	switch (err) {
	case -EBUSY:
		/* Someone else made a mount here whilst we were busy */
991
		return 0;
992
	case 0:
993
		path_put(path);
994
995
996
		path->mnt = mnt;
		path->dentry = dget(mnt->mnt_root);
		return 0;
997
998
	default:
		return err;
999
	}
1000

Al Viro's avatar
Al Viro committed
1001
1002
}

1003
1004
/*
 * Handle a dentry that is managed in some way.
1005
 * - Flagged for transit management (autofs)
1006
1007
1008
1009
1010
1011
1012
1013
 * - Flagged as mountpoint
 * - Flagged as automount point
 *
 * This may only be called in refwalk mode.
 *
 * Serialization is taken care of in namespace.c
 */
static int follow_managed(struct path *path, unsigned flags)
Linus Torvalds's avatar
Linus Torvalds committed
1014
{
1015
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1016
1017
	unsigned managed;
	bool need_mntput = false;
1018
	int ret = 0;
1019
1020
1021
1022
1023
1024
1025

	/* Given that we're not holding a lock here, we retain the value in a
	 * local variable for each dentry as we look at it so that we don't see
	 * the components of that value change under us */
	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       managed &= DCACHE_MANAGED_DENTRY,
	       unlikely(managed != 0)) {
1026
1027
1028
1029
1030
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
1031
			ret = path->dentry->d_op->d_manage(path->dentry, false);
1032
			if (ret < 0)
1033
				break;
1034
1035
		}

1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				need_mntput = true;
				continue;
			}

			/* Something is mounted on this dentry in another
			 * namespace and/or whatever was mounted there in this
			 * namespace got unmounted before we managed to get the
			 * vfsmount_lock */
		}

		/* Handle an automount point */
		if (managed & DCACHE_NEED_AUTOMOUNT) {
			ret = follow_automount(path, flags, &need_mntput);
			if (ret < 0)
1059
				break;
1060
1061
1062
1063
1064
			continue;
		}

		/* We didn't change the current path point */
		break;
Linus Torvalds's avatar
Linus Torvalds committed
1065
	}
1066
1067
1068
1069
1070

	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (ret == -EISDIR)
		ret = 0;
1071
	return ret < 0 ? ret : need_mntput;
Linus Torvalds's avatar
Linus Torvalds committed
1072
1073
}

1074
int follow_down_one(struct path *path)
Linus Torvalds's avatar
Linus Torvalds committed
1075
1076
1077
{
	struct vfsmount *mounted;

Al Viro's avatar
Al Viro committed
1078
	mounted = lookup_mnt(path);
Linus Torvalds's avatar
Linus Torvalds committed
1079
	if (mounted) {
Al Viro's avatar
Al Viro committed
1080
1081
1082
1083
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
Linus Torvalds's avatar
Linus Torvalds committed
1084
1085
1086
1087
1088
		return 1;
	}
	return 0;
}

1089
1090
1091
1092
1093
1094
static inline bool managed_dentry_might_block(struct dentry *dentry)
{
	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
		dentry->d_op->d_manage(dentry, true) < 0);
}

1095
/*
1096
1097
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
1098
1099
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1100
			       struct inode **inode)
1101
{
1102
	for (;;) {
1103
		struct mount *mounted;
1104
1105
1106
1107
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
1108
		if (unlikely(managed_dentry_might_block(path->dentry)))
1109
			return false;
1110
1111
1112
1113

		if (!d_mountpoint(path->dentry))
			break;

1114
1115
1116
		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
		if (!mounted)
			break;
1117
1118
		path->mnt = &mounted->mnt;
		path->dentry = mounted->mnt.mnt_root;
1119
		nd->flags |= LOOKUP_JUMPED;
1120
		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1121
1122
1123
1124
1125
1126
		/*
		 * Update the inode too. We don't need to re-check the
		 * dentry sequence number here after this d_inode read,
		 * because a mount-point is always pinned.
		 */
		*inode = path->dentry->d_inode;
1127
1128
1129
1130
	}
	return true;
}

1131
static void follow_mount_rcu(struct nameidata *nd)
1132
{
1133
	while (d_mountpoint(nd->path.dentry)) {
1134
		struct mount *mounted;
1135
		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
1136
1137
		if (!mounted)
			break;
1138
1139
		nd->path.mnt = &mounted->mnt;
		nd->path.dentry = mounted->mnt.mnt_root;
1140
		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1141
1142
1143
	}
}

Nick Piggin's avatar
Nick Piggin committed
1144
1145
1146
1147
static int follow_dotdot_rcu(struct nameidata *nd)
{
	set_root_rcu(nd);

1148
	while (1) {
Nick Piggin's avatar
Nick Piggin committed
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
		if (nd->path.dentry == nd->root.dentry &&
		    nd->path.mnt == nd->root.mnt) {
			break;
		}
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			struct dentry *old = nd->path.de