file_table.c 10.1 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
10
/*
 *  linux/fs/file_table.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
Al Viro's avatar
Al Viro committed
11
#include <linux/fdtable.h>
Linus Torvalds's avatar
Linus Torvalds committed
12
13
14
15
16
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/eventpoll.h>
17
#include <linux/rcupdate.h>
Linus Torvalds's avatar
Linus Torvalds committed
18
#include <linux/mount.h>
19
#include <linux/capability.h>
Linus Torvalds's avatar
Linus Torvalds committed
20
#include <linux/cdev.h>
Robert Love's avatar
Robert Love committed
21
#include <linux/fsnotify.h>
Dipankar Sarma's avatar
Dipankar Sarma committed
22
23
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
24
#include <linux/ima.h>
Dipankar Sarma's avatar
Dipankar Sarma committed
25
26

#include <asm/atomic.h>
Linus Torvalds's avatar
Linus Torvalds committed
27

28
29
#include "internal.h"

Linus Torvalds's avatar
Linus Torvalds committed
30
31
32
33
34
35
/*
 * sysctl tunables...
 *
 * Within this file, .max_files is the limit reported by get_max_files()
 * and checked by get_empty_filp(); it starts at NR_FILE and is set again
 * by files_init().  .nr_files is refreshed from the nr_files per-cpu
 * counter each time proc_nr_files() runs.
 */
struct files_stat_struct files_stat = {
	.max_files = NR_FILE
};

/* public. Not pretty! */
Dipankar Sarma's avatar
Dipankar Sarma committed
36
/*
 * Global spinlock, defined here but never named directly in this file.
 * NOTE(review): presumably this is what file_list_lock() and
 * file_list_unlock() take and release - confirm against their definition.
 */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
Linus Torvalds's avatar
Linus Torvalds committed
37

38
39
40
/*
 * SLAB cache for file structures.  Created in files_init(); objects are
 * allocated by get_empty_filp() and handed back by file_free_rcu().
 */
static struct kmem_cache *filp_cachep __read_mostly;

Dipankar Sarma's avatar
Dipankar Sarma committed
41
/*
 * Per-cpu count of allocated file structures: incremented in
 * get_empty_filp(), decremented in file_free(), read by get_nr_files()
 * and initialised to zero in files_init().
 */
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
Linus Torvalds's avatar
Linus Torvalds committed
42

Dipankar Sarma's avatar
Dipankar Sarma committed
43
static inline void file_free_rcu(struct rcu_head *head)
Linus Torvalds's avatar
Linus Torvalds committed
44
{
45
46
47
	struct file *f = container_of(head, struct file, f_u.fu_rcuhead);

	put_cred(f->f_cred);
Dipankar Sarma's avatar
Dipankar Sarma committed
48
	kmem_cache_free(filp_cachep, f);
Linus Torvalds's avatar
Linus Torvalds committed
49
50
}

Dipankar Sarma's avatar
Dipankar Sarma committed
51
static inline void file_free(struct file *f)
Linus Torvalds's avatar
Linus Torvalds committed
52
{
Dipankar Sarma's avatar
Dipankar Sarma committed
53
	percpu_counter_dec(&nr_files);
54
	file_check_state(f);
Dipankar Sarma's avatar
Dipankar Sarma committed
55
	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
Linus Torvalds's avatar
Linus Torvalds committed
56
57
}

Dipankar Sarma's avatar
Dipankar Sarma committed
58
59
60
61
/*
 * Return the total number of open files in the system
 */
static int get_nr_files(void)
Linus Torvalds's avatar
Linus Torvalds committed
62
{
Dipankar Sarma's avatar
Dipankar Sarma committed
63
	return percpu_counter_read_positive(&nr_files);
Linus Torvalds's avatar
Linus Torvalds committed
64
65
}

Dipankar Sarma's avatar
Dipankar Sarma committed
66
67
68
69
/*
 * Return the maximum number of open files in the system
 */
int get_max_files(void)
70
{
Dipankar Sarma's avatar
Dipankar Sarma committed
71
	return files_stat.max_files;
72
}
Dipankar Sarma's avatar
Dipankar Sarma committed
73
74
75
76
77
78
EXPORT_SYMBOL_GPL(get_max_files);

/*
 * Handle nr_files sysctl
 */
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
79
int proc_nr_files(ctl_table *table, int write,
Dipankar Sarma's avatar
Dipankar Sarma committed
80
81
82
                     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	files_stat.nr_files = get_nr_files();
83
	return proc_dointvec(table, write, buffer, lenp, ppos);
Dipankar Sarma's avatar
Dipankar Sarma committed
84
85
}
#else
86
int proc_nr_files(ctl_table *table, int write,
Dipankar Sarma's avatar
Dipankar Sarma committed
87
88
89
90
91
                     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	return -ENOSYS;
}
#endif
92

Linus Torvalds's avatar
Linus Torvalds committed
93
94
95
/* Find an unused file structure and return a pointer to it.
 * Returns NULL, if there are no more free file structures or
 * we run out of memory.
96
97
98
99
100
101
 *
 * Be very careful using this.  You are responsible for
 * getting write access to any mount that you might assign
 * to this filp, if it is opened for write.  If this is not
 * done, you will imbalance int the mount's writer count
 * and a warning at __fput() time.
Linus Torvalds's avatar
Linus Torvalds committed
102
103
104
 */
struct file *get_empty_filp(void)
{
105
	const struct cred *cred = current_cred();
106
	static int old_max;
Linus Torvalds's avatar
Linus Torvalds committed
107
108
109
110
111
	struct file * f;

	/*
	 * Privileged users can go above max_files
	 */
Dipankar Sarma's avatar
Dipankar Sarma committed
112
113
114
115
116
	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
		/*
		 * percpu_counters are inaccurate.  Do an expensive check before
		 * we go and fail.
		 */
117
		if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
Dipankar Sarma's avatar
Dipankar Sarma committed
118
119
			goto over;
	}
120

121
	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
122
123
124
	if (f == NULL)
		goto fail;

Dipankar Sarma's avatar
Dipankar Sarma committed
125
	percpu_counter_inc(&nr_files);
126
127
	if (security_file_alloc(f))
		goto fail_sec;
Linus Torvalds's avatar
Linus Torvalds committed
128

129
	INIT_LIST_HEAD(&f->f_u.fu_list);
Al Viro's avatar
Al Viro committed
130
	atomic_long_set(&f->f_count, 1);
131
	rwlock_init(&f->f_owner.lock);
132
	f->f_cred = get_cred(cred);
133
	spin_lock_init(&f->f_lock);
134
	eventpoll_init_file(f);
135
136
137
138
	/* f->f_version: 0 */
	return f;

over:
Linus Torvalds's avatar
Linus Torvalds committed
139
	/* Ran out of filps - report that */
Dipankar Sarma's avatar
Dipankar Sarma committed
140
	if (get_nr_files() > old_max) {
Linus Torvalds's avatar
Linus Torvalds committed
141
		printk(KERN_INFO "VFS: file-max limit %d reached\n",
Dipankar Sarma's avatar
Dipankar Sarma committed
142
143
					get_max_files());
		old_max = get_nr_files();
Linus Torvalds's avatar
Linus Torvalds committed
144
	}
145
146
147
148
	goto fail;

fail_sec:
	file_free(f);
Linus Torvalds's avatar
Linus Torvalds committed
149
150
151
152
fail:
	return NULL;
}

153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/**
 * alloc_file - allocate and initialize a 'struct file'
 * @mnt: the vfsmount on which the file will reside
 * @dentry: the dentry representing the new file
 * @mode: the mode with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 *
 * Use this instead of get_empty_filp() to get a new
 * 'struct file'.  Do so because of the same initialization
 * pitfalls reasons listed for init_file().  This is a
 * preferred interface to using init_file().
 *
 * If all the callers of init_file() are eliminated, its
 * code should be moved into this function.
 */
168
169
struct file *alloc_file(struct path *path, fmode_t mode,
		const struct file_operations *fop)
170
171
172
173
174
175
176
{
	struct file *file;

	file = get_empty_filp();
	if (!file)
		return NULL;

177
178
	file->f_path = *path;
	file->f_mapping = path->dentry->d_inode->i_mapping;
179
180
	file->f_mode = mode;
	file->f_op = fop;
181
182
183
184
185
186
187

	/*
	 * These mounts don't really matter in practice
	 * for r/o bind mounts.  They aren't userspace-
	 * visible.  We do this for consistency, and so
	 * that we can do debugging checks at __fput()
	 */
188
	if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) {
189
		file_take_write(file);
190
		WARN_ON(mnt_clone_write(path->mnt));
191
	}
192
	ima_counts_get(file);
Al Viro's avatar
Al Viro committed
193
	return file;
194
}
Roland Dreier's avatar
Roland Dreier committed
195
EXPORT_SYMBOL(alloc_file);
196

197
void fput(struct file *file)
Linus Torvalds's avatar
Linus Torvalds committed
198
{
Al Viro's avatar
Al Viro committed
199
	if (atomic_long_dec_and_test(&file->f_count))
Linus Torvalds's avatar
Linus Torvalds committed
200
201
202
203
204
		__fput(file);
}

EXPORT_SYMBOL(fput);

205
206
207
208
209
210
211
212
213
214
/**
 * drop_file_write_access - give up ability to write to a file
 * @file: the file to which we will stop writing
 *
 * This is a central place which will give up the ability
 * to write to @file, along with access to write through
 * its vfsmount.
 */
void drop_file_write_access(struct file *file)
{
215
	struct vfsmount *mnt = file->f_path.mnt;
216
217
218
219
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;

	put_write_access(inode);
220
221
222
223
224
225
226

	if (special_file(inode->i_mode))
		return;
	if (file_check_writeable(file) != 0)
		return;
	mnt_drop_write(mnt);
	file_release_write(file);
227
228
229
}
EXPORT_SYMBOL_GPL(drop_file_write_access);

Linus Torvalds's avatar
Linus Torvalds committed
230
231
232
/* __fput is called from task context when aio completion releases the last
 * last use of a struct file *.  Do not use otherwise.
 */
233
void __fput(struct file *file)
Linus Torvalds's avatar
Linus Torvalds committed
234
{
235
236
	struct dentry *dentry = file->f_path.dentry;
	struct vfsmount *mnt = file->f_path.mnt;
Linus Torvalds's avatar
Linus Torvalds committed
237
238
239
	struct inode *inode = dentry->d_inode;

	might_sleep();
Robert Love's avatar
Robert Love committed
240
241

	fsnotify_close(file);
Linus Torvalds's avatar
Linus Torvalds committed
242
243
244
245
246
247
248
	/*
	 * The function eventpoll_release() should be the first called
	 * in the file cleanup chain.
	 */
	eventpoll_release(file);
	locks_remove_flock(file);

Al Viro's avatar
Al Viro committed
249
250
251
252
	if (unlikely(file->f_flags & FASYNC)) {
		if (file->f_op && file->f_op->fasync)
			file->f_op->fasync(-1, file, 0);
	}
Linus Torvalds's avatar
Linus Torvalds committed
253
254
255
	if (file->f_op && file->f_op->release)
		file->f_op->release(inode, file);
	security_file_free(file);
256
	ima_file_free(file);
257
	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
Linus Torvalds's avatar
Linus Torvalds committed
258
259
		cdev_put(inode->i_cdev);
	fops_put(file->f_op);
260
	put_pid(file->f_owner.pid);
Linus Torvalds's avatar
Linus Torvalds committed
261
	file_kill(file);
262
263
	if (file->f_mode & FMODE_WRITE)
		drop_file_write_access(file);
264
265
	file->f_path.dentry = NULL;
	file->f_path.mnt = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
266
267
268
269
270
	file_free(file);
	dput(dentry);
	mntput(mnt);
}

271
struct file *fget(unsigned int fd)
Linus Torvalds's avatar
Linus Torvalds committed
272
273
274
275
{
	struct file *file;
	struct files_struct *files = current->files;

276
	rcu_read_lock();
Linus Torvalds's avatar
Linus Torvalds committed
277
	file = fcheck_files(files, fd);
278
	if (file) {
Al Viro's avatar
Al Viro committed
279
		if (!atomic_long_inc_not_zero(&file->f_count)) {
280
281
282
283
284
285
286
			/* File object ref couldn't be taken */
			rcu_read_unlock();
			return NULL;
		}
	}
	rcu_read_unlock();

Linus Torvalds's avatar
Linus Torvalds committed
287
288
289
290
291
292
293
294
295
296
297
298
	return file;
}

EXPORT_SYMBOL(fget);

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 
 * You can use this only if it is guranteed that the current task already 
 * holds a refcnt to that file. That check has to be done at fget() only
 * and a flag is returned to be passed to the corresponding fput_light().
 * There must not be a cloning between an fget_light/fput_light pair.
 */
299
struct file *fget_light(unsigned int fd, int *fput_needed)
Linus Torvalds's avatar
Linus Torvalds committed
300
301
302
303
304
305
306
307
{
	struct file *file;
	struct files_struct *files = current->files;

	*fput_needed = 0;
	if (likely((atomic_read(&files->count) == 1))) {
		file = fcheck_files(files, fd);
	} else {
308
		rcu_read_lock();
Linus Torvalds's avatar
Linus Torvalds committed
309
310
		file = fcheck_files(files, fd);
		if (file) {
Al Viro's avatar
Al Viro committed
311
			if (atomic_long_inc_not_zero(&file->f_count))
312
313
314
315
				*fput_needed = 1;
			else
				/* Didn't get the reference, someone's freed */
				file = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
316
		}
317
		rcu_read_unlock();
Linus Torvalds's avatar
Linus Torvalds committed
318
	}
319

Linus Torvalds's avatar
Linus Torvalds committed
320
321
322
323
324
325
	return file;
}


void put_filp(struct file *file)
{
Al Viro's avatar
Al Viro committed
326
	if (atomic_long_dec_and_test(&file->f_count)) {
Linus Torvalds's avatar
Linus Torvalds committed
327
328
329
330
331
332
333
334
335
336
337
		security_file_free(file);
		file_kill(file);
		file_free(file);
	}
}

void file_move(struct file *file, struct list_head *list)
{
	if (!list)
		return;
	file_list_lock();
338
	list_move(&file->f_u.fu_list, list);
Linus Torvalds's avatar
Linus Torvalds committed
339
340
341
342
343
	file_list_unlock();
}

void file_kill(struct file *file)
{
344
	if (!list_empty(&file->f_u.fu_list)) {
Linus Torvalds's avatar
Linus Torvalds committed
345
		file_list_lock();
346
		list_del_init(&file->f_u.fu_list);
Linus Torvalds's avatar
Linus Torvalds committed
347
348
349
350
351
352
		file_list_unlock();
	}
}

int fs_may_remount_ro(struct super_block *sb)
{
353
	struct file *file;
Linus Torvalds's avatar
Linus Torvalds committed
354
355
356

	/* Check that no files are currently opened for writing. */
	file_list_lock();
357
	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
358
		struct inode *inode = file->f_path.dentry->d_inode;
Linus Torvalds's avatar
Linus Torvalds committed
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374

		/* File with pending delete? */
		if (inode->i_nlink == 0)
			goto too_bad;

		/* Writeable file? */
		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
			goto too_bad;
	}
	file_list_unlock();
	return 1; /* Tis' cool bro. */
too_bad:
	file_list_unlock();
	return 0;
}

375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
/**
 *	mark_files_ro - mark all files read-only
 *	@sb: superblock in question
 *
 *	All files are marked read-only.  We don't care about pending
 *	delete files so this should be used in 'force' mode only.
 *
 *	Walks @sb->s_files under the file list lock and clears FMODE_WRITE
 *	on every regular file that is still referenced and open for write.
 *	Where the file also holds write access on its mount, that access is
 *	dropped with the list lock released, and the walk then starts over
 *	from the head of the list.
 */
void mark_files_ro(struct super_block *sb)
{
	struct file *f;

retry:
	file_list_lock();
	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
		struct vfsmount *mnt;
		/* Only regular files are touched. */
		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
		       continue;
		/* Skip files whose reference count is already zero. */
		if (!file_count(f))
			continue;
		/*
		 * Not open for write.  This also skips files already handled
		 * on an earlier pass, since their FMODE_WRITE was cleared.
		 */
		if (!(f->f_mode & FMODE_WRITE))
			continue;
		f->f_mode &= ~FMODE_WRITE;
		/* Nonzero here means there is no mount write access to drop. */
		if (file_check_writeable(f) != 0)
			continue;
		file_release_write(f);
		/* Pin the mount so it can be used after the lock is dropped. */
		mnt = mntget(f->f_path.mnt);
		file_list_unlock();
		/*
		 * This can sleep, so we can't hold
		 * the file_list_lock() spinlock.
		 */
		mnt_drop_write(mnt);
		mntput(mnt);
		/* The lock was released mid-walk, so rescan from the start. */
		goto retry;
	}
	file_list_unlock();
}

Linus Torvalds's avatar
Linus Torvalds committed
413
414
415
/*
 * Boot-time setup for the file table: create the "filp" slab cache, size
 * files_stat.max_files from @mempages (never below NR_FILE), call
 * files_defer_init() and zero the nr_files per-cpu counter.
 */
void __init files_init(unsigned long mempages)
{
	int limit;

	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	/*
	 * One file with associated inode and dcache is very roughly 1K.
	 * Per default don't use more than 10% of our memory for files.
	 */
	limit = (mempages * (PAGE_SIZE / 1024)) / 10;

	files_stat.max_files = limit;
	if (files_stat.max_files < NR_FILE)
		files_stat.max_files = NR_FILE;

	files_defer_init();
	percpu_counter_init(&nr_files, 0);
}