Commit 2d9048e2 authored by Amy Griffis, committed by Al Viro

[PATCH] inotify (1/5): split kernel API from userspace support



The following series of patches introduces a kernel API for inotify,
making it possible for kernel modules to benefit from inotify's
mechanism for watching inodes.  With these patches, inotify will
maintain for each caller a list of watches (via an embedded struct
inotify_watch), where each inotify_watch is associated with a
corresponding struct inode.  For each inotify_watch, the caller registers an
event handler and specifies which filesystem events should trigger it.
Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Acked-by: Robert Love <rml@novell.com>
Acked-by: John McCutchan <john@johnmccutchan.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
parent 90204e0b
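
To make the split concrete, here is a minimal sketch of an in-kernel caller of
the new API. The my_* names are hypothetical, and the callback and entry-point
signatures are assumed from the later patches in this series rather than from
the hunks below alone:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/inotify.h>

/* caller-private state embedding the struct inotify_watch */
struct my_watch {
	struct inotify_watch watch;
};

static void my_handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
			    u32 cookie, const char *name, struct inode *inode)
{
	/* runs with inode->inotify_mutex and ih->mutex held */
	if (mask & IN_IGNORED)
		put_inotify_watch(watch);	/* the final put is the caller's */
}

static void my_destroy_watch(struct inotify_watch *watch)
{
	/* called from put_inotify_watch() when the refcount hits zero */
	kfree(container_of(watch, struct my_watch, watch));
}

static const struct inotify_operations my_ops = {
	.handle_event	= my_handle_event,
	.destroy_watch	= my_destroy_watch,
};

/*
 * Registration would then look roughly like this, again assuming the
 * entry points added later in the series:
 *
 *	struct inotify_handle *ih = inotify_init(&my_ops);
 *	struct my_watch *w = kmalloc(sizeof(*w), GFP_KERNEL);
 *
 *	inotify_init_watch(&w->watch);
 *	inotify_add_watch(ih, &w->watch, inode, IN_MODIFY);
 */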
diff --git a/fs/Kconfig b/fs/Kconfig
@@ -393,18 +393,30 @@ config INOTIFY
 	bool "Inotify file change notification support"
 	default y
 	---help---
-	  Say Y here to enable inotify support and the associated system
-	  calls.  Inotify is a file change notification system and a
-	  replacement for dnotify.  Inotify fixes numerous shortcomings in
-	  dnotify and introduces several new features.  It allows monitoring
-	  of both files and directories via a single open fd.  Other features
-	  include multiple file events, one-shot support, and unmount
+	  Say Y here to enable inotify support.  Inotify is a file change
+	  notification system and a replacement for dnotify.  Inotify fixes
+	  numerous shortcomings in dnotify and introduces several new features
+	  including multiple file events, one-shot support, and unmount
 	  notification.
 
 	  For more information, see Documentation/filesystems/inotify.txt
 
 	  If unsure, say Y.
 
+config INOTIFY_USER
+	bool "Inotify support for userspace"
+	depends on INOTIFY
+	default y
+	---help---
+	  Say Y here to enable inotify support for userspace, including the
+	  associated system calls.  Inotify allows monitoring of both files and
+	  directories via a single open fd.  Events are read from the file
+	  descriptor, which is also select()- and poll()-able.
+
+	  For more information, see Documentation/filesystems/inotify.txt
+
+	  If unsure, say Y.
+
 config QUOTA
 	bool "Quota support"
 	help
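
With the split in place, a kernel that only needs the in-kernel API can omit
the syscall interface entirely; a hypothetical .config fragment:

CONFIG_INOTIFY=y
# CONFIG_INOTIFY_USER is not set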
diff --git a/fs/Makefile b/fs/Makefile
@@ -13,6 +13,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o bio.o super.o \
 		ioprio.o pnode.o drop_caches.o splice.o sync.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
+obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
diff --git a/fs/inotify.c b/fs/inotify.c
@@ -5,7 +5,10 @@
  * John McCutchan <ttb@tentacle.dhs.org>
  * Robert Love <rml@novell.com>
  *
+ * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
+ *
  * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -20,35 +23,17 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/idr.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/writeback.h>
 #include <linux/inotify.h>
-#include <linux/syscalls.h>
-
-#include <asm/ioctls.h>
 
 static atomic_t inotify_cookie;
 
-static kmem_cache_t *watch_cachep __read_mostly;
-static kmem_cache_t *event_cachep __read_mostly;
-
-static struct vfsmount *inotify_mnt __read_mostly;
-
-/* these are configurable via /proc/sys/fs/inotify/ */
-int inotify_max_user_instances __read_mostly;
-int inotify_max_user_watches __read_mostly;
-int inotify_max_queued_events __read_mostly;
-
 /*
  * Lock ordering:
  *
@@ -56,327 +41,108 @@ int inotify_max_queued_events __read_mostly;
  * iprune_mutex (synchronize shrink_icache_memory())
  * inode_lock (protects the super_block->s_inodes list)
  * inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
- * inotify_dev->mutex (protects inotify_device and watches->d_list)
+ * inotify_handle->mutex (protects inotify_handle and watches->h_list)
+ *
+ * The inode->inotify_mutex and inotify_handle->mutex are held during execution
+ * of a caller's event handler.  Thus, the caller must not hold any locks
+ * taken in their event handler while calling any of the published inotify
+ * interfaces.
  */
 /*
- * Lifetimes of the three main data structures--inotify_device, inode, and
+ * Lifetimes of the three main data structures--inotify_handle, inode, and
  * inotify_watch--are managed by reference count.
  *
- * inotify_device: Lifetime is from inotify_init() until release.  Additional
- * references can bump the count via get_inotify_dev() and drop the count via
- * put_inotify_dev().
+ * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
+ * Additional references can bump the count via get_inotify_handle() and drop
+ * the count via put_inotify_handle().
  *
- * inotify_watch: Lifetime is from create_watch() to destory_watch().
- * Additional references can bump the count via get_inotify_watch() and drop
- * the count via put_inotify_watch().
+ * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
+ * to remove_watch_no_event().  Additional references can bump the count via
+ * get_inotify_watch() and drop the count via put_inotify_watch().  The caller
+ * is responsible for the final put after receiving IN_IGNORED, or when using
+ * IN_ONESHOT after receiving the first event.  Inotify does the final put if
+ * inotify_destroy() is called.
  *
  * inode: Pinned so long as the inode is associated with a watch, from
- * create_watch() to put_inotify_watch().
+ * inotify_add_watch() to the final put_inotify_watch().
  */
 /*
- * struct inotify_device - represents an inotify instance
+ * struct inotify_handle - represents an inotify instance
  *
  * This structure is protected by the mutex 'mutex'.
  */
-struct inotify_device {
-	wait_queue_head_t	wq;		/* wait queue for i/o */
+struct inotify_handle {
 	struct idr		idr;		/* idr mapping wd -> watch */
 	struct mutex		mutex;		/* protects this bad boy */
-	struct list_head	events;		/* list of queued events */
 	struct list_head	watches;	/* list of watches */
 	atomic_t		count;		/* reference count */
-	struct user_struct	*user;		/* user who opened this dev */
-	unsigned int		queue_size;	/* size of the queue (bytes) */
-	unsigned int		event_count;	/* number of pending events */
-	unsigned int		max_events;	/* maximum number of events */
 	u32			last_wd;	/* the last wd allocated */
+	const struct inotify_operations *in_ops; /* inotify caller operations */
 };
-/*
- * struct inotify_kernel_event - An inotify event, originating from a watch and
- * queued for user-space.  A list of these is attached to each instance of the
- * device.  In read(), this list is walked and all events that can fit in the
- * buffer are returned.
- *
- * Protected by dev->mutex of the device in which we are queued.
- */
-struct inotify_kernel_event {
-	struct inotify_event	event;	/* the user-space event */
-	struct list_head	list;	/* entry in inotify_device's list */
-	char			*name;	/* filename, if any */
-};
-
-/*
- * struct inotify_watch - represents a watch request on a specific inode
- *
- * d_list is protected by dev->mutex of the associated watch->dev.
- * i_list and mask are protected by inode->inotify_mutex of the associated inode.
- * dev, inode, and wd are never written to once the watch is created.
- */
-struct inotify_watch {
-	struct list_head	d_list;	/* entry in inotify_device's list */
-	struct list_head	i_list;	/* entry in inode's list */
-	atomic_t		count;	/* reference count */
-	struct inotify_device	*dev;	/* associated device */
-	struct inode		*inode;	/* associated inode */
-	s32			wd;	/* watch descriptor */
-	u32			mask;	/* event mask for this watch */
-};
-
-#ifdef CONFIG_SYSCTL
-
-#include <linux/sysctl.h>
-
-static int zero;
-
-ctl_table inotify_table[] = {
-	{
-		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
-		.procname	= "max_user_instances",
-		.data		= &inotify_max_user_instances,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-	},
-	{
-		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
-		.procname	= "max_user_watches",
-		.data		= &inotify_max_user_watches,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-	},
-	{
-		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
-		.procname	= "max_queued_events",
-		.data		= &inotify_max_queued_events,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero
-	},
-	{ .ctl_name = 0 }
-};
-#endif /* CONFIG_SYSCTL */
-
-static inline void get_inotify_dev(struct inotify_device *dev)
+static inline void get_inotify_handle(struct inotify_handle *ih)
 {
-	atomic_inc(&dev->count);
+	atomic_inc(&ih->count);
 }
 
-static inline void put_inotify_dev(struct inotify_device *dev)
+static inline void put_inotify_handle(struct inotify_handle *ih)
 {
-	if (atomic_dec_and_test(&dev->count)) {
-		atomic_dec(&dev->user->inotify_devs);
-		free_uid(dev->user);
-		idr_destroy(&dev->idr);
-		kfree(dev);
+	if (atomic_dec_and_test(&ih->count)) {
+		idr_destroy(&ih->idr);
+		kfree(ih);
 	}
 }
-static inline void get_inotify_watch(struct inotify_watch *watch)
+/**
+ * get_inotify_watch - grab a reference to an inotify_watch
+ * @watch: watch to grab
+ */
+void get_inotify_watch(struct inotify_watch *watch)
 {
 	atomic_inc(&watch->count);
 }
+EXPORT_SYMBOL_GPL(get_inotify_watch);
-/*
+/**
  * put_inotify_watch - decrements the ref count on a given watch.  cleans up
- * the watch and its references if the count reaches zero.
+ * watch references if the count reaches zero.  inotify_watch is freed by
+ * inotify callers via the destroy_watch() op.
+ * @watch: watch to release
  */
-static inline void put_inotify_watch(struct inotify_watch *watch)
+void put_inotify_watch(struct inotify_watch *watch)
 {
 	if (atomic_dec_and_test(&watch->count)) {
-		put_inotify_dev(watch->dev);
+		struct inotify_handle *ih = watch->ih;
+
 		iput(watch->inode);
-		kmem_cache_free(watch_cachep, watch);
+		ih->in_ops->destroy_watch(watch);
+		put_inotify_handle(ih);
 	}
 }
+EXPORT_SYMBOL_GPL(put_inotify_watch);
-
-/*
- * kernel_event - create a new kernel event with the given parameters
- *
- * This function can sleep.
- */
-static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
-						  const char *name)
-{
-	struct inotify_kernel_event *kevent;
-
-	kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL);
-	if (unlikely(!kevent))
-		return NULL;
-
-	/* we hand this out to user-space, so zero it just in case */
-	memset(&kevent->event, 0, sizeof(struct inotify_event));
-
-	kevent->event.wd = wd;
-	kevent->event.mask = mask;
-	kevent->event.cookie = cookie;
-
-	INIT_LIST_HEAD(&kevent->list);
-
-	if (name) {
-		size_t len, rem, event_size = sizeof(struct inotify_event);
-
-		/*
-		 * We need to pad the filename so as to properly align an
-		 * array of inotify_event structures.  Because the structure is
-		 * small and the common case is a small filename, we just round
-		 * up to the next multiple of the structure's sizeof.  This is
-		 * simple and safe for all architectures.
-		 */
-		len = strlen(name) + 1;
-		rem = event_size - len;
-		if (len > event_size) {
-			rem = event_size - (len % event_size);
-			if (len % event_size == 0)
-				rem = 0;
-		}
-
-		kevent->name = kmalloc(len + rem, GFP_KERNEL);
-		if (unlikely(!kevent->name)) {
-			kmem_cache_free(event_cachep, kevent);
-			return NULL;
-		}
-		memcpy(kevent->name, name, len);
-		if (rem)
-			memset(kevent->name + len, 0, rem);
-		kevent->event.len = len + rem;
-	} else {
-		kevent->event.len = 0;
-		kevent->name = NULL;
-	}
-
-	return kevent;
-}
-
-/*
- * inotify_dev_get_event - return the next event in the given dev's queue
- *
- * Caller must hold dev->mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_event(struct inotify_device *dev)
-{
-	return list_entry(dev->events.next, struct inotify_kernel_event, list);
-}
-
-/*
- * inotify_dev_queue_event - add a new event to the given device
- *
- * Caller must hold dev->mutex.  Can sleep (calls kernel_event()).
- */
-static void inotify_dev_queue_event(struct inotify_device *dev,
-				    struct inotify_watch *watch, u32 mask,
-				    u32 cookie, const char *name)
-{
-	struct inotify_kernel_event *kevent, *last;
-
-	/* coalescing: drop this event if it is a dupe of the previous */
-	last = inotify_dev_get_event(dev);
-	if (last && last->event.mask == mask && last->event.wd == watch->wd &&
-			last->event.cookie == cookie) {
-		const char *lastname = last->name;
-
-		if (!name && !lastname)
-			return;
-		if (name && lastname && !strcmp(lastname, name))
-			return;
-	}
-
-	/* the queue overflowed and we already sent the Q_OVERFLOW event */
-	if (unlikely(dev->event_count > dev->max_events))
-		return;
-
-	/* if the queue overflows, we need to notify user space */
-	if (unlikely(dev->event_count == dev->max_events))
-		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
-	else
-		kevent = kernel_event(watch->wd, mask, cookie, name);
-
-	if (unlikely(!kevent))
-		return;
-
-	/* queue the event and wake up anyone waiting */
-	dev->event_count++;
-	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
-	list_add_tail(&kevent->list, &dev->events);
-	wake_up_interruptible(&dev->wq);
-}
-
-/*
- * remove_kevent - cleans up and ultimately frees the given kevent
- *
- * Caller must hold dev->mutex.
- */
-static void remove_kevent(struct inotify_device *dev,
-			  struct inotify_kernel_event *kevent)
-{
-	list_del(&kevent->list);
-
-	dev->event_count--;
-	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
-
-	kfree(kevent->name);
-	kmem_cache_free(event_cachep, kevent);
-}
-
-/*
- * inotify_dev_event_dequeue - destroy an event on the given device
- *
- * Caller must hold dev->mutex.
- */
-static void inotify_dev_event_dequeue(struct inotify_device *dev)
-{
-	if (!list_empty(&dev->events)) {
-		struct inotify_kernel_event *kevent;
-
-		kevent = inotify_dev_get_event(dev);
-		remove_kevent(dev, kevent);
-	}
-}
 /*
- * inotify_dev_get_wd - returns the next WD for use by the given dev
+ * inotify_handle_get_wd - returns the next WD for use by the given handle
  *
- * Callers must hold dev->mutex.  This function can sleep.
+ * Callers must hold ih->mutex.  This function can sleep.
  */
-static int inotify_dev_get_wd(struct inotify_device *dev,
-			      struct inotify_watch *watch)
+static int inotify_handle_get_wd(struct inotify_handle *ih,
+				 struct inotify_watch *watch)
 {
 	int ret;
 
 	do {
-		if (unlikely(!idr_pre_get(&dev->idr, GFP_KERNEL)))
+		if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
 			return -ENOSPC;
-		ret = idr_get_new_above(&dev->idr, watch, dev->last_wd+1, &watch->wd);
+		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
 	} while (ret == -EAGAIN);
 
+	if (likely(!ret))
+		ih->last_wd = watch->wd;
+
 	return ret;
 }
-
-/*
- * find_inode - resolve a user-given path to a specific inode and return a nd
- */
-static int find_inode(const char __user *dirname, struct nameidata *nd,
-		      unsigned flags)
-{
-	int error;
-
-	error = __user_walk(dirname, flags, nd);
-	if (error)
-		return error;
-	/* you can only watch an inode if you have read permissions on it */
-	error = vfs_permission(nd, MAY_READ);
-	if (error)
-		path_release(nd);
-	return error;
-}
 
 /*
@@ -422,67 +188,18 @@ static void set_dentry_child_flags(struct inode *inode, int watched)
 }
 
 /*
- * create_watch - creates a watch on the given device.
- *
- * Callers must hold dev->mutex.  Calls inotify_dev_get_wd() so may sleep.
- * Both 'dev' and 'inode' (by way of nameidata) need to be pinned.
- */
-static struct inotify_watch *create_watch(struct inotify_device *dev,
-					  u32 mask, struct inode *inode)
-{
-	struct inotify_watch *watch;
-	int ret;
-
-	if (atomic_read(&dev->user->inotify_watches) >=
-			inotify_max_user_watches)
-		return ERR_PTR(-ENOSPC);
-
-	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
-	if (unlikely(!watch))
-		return ERR_PTR(-ENOMEM);
-
-	ret = inotify_dev_get_wd(dev, watch);
-	if (unlikely(ret)) {
-		kmem_cache_free(watch_cachep, watch);
-		return ERR_PTR(ret);
-	}
-
-	dev->last_wd = watch->wd;
-	watch->mask = mask;
-	atomic_set(&watch->count, 0);
-	INIT_LIST_HEAD(&watch->d_list);
-	INIT_LIST_HEAD(&watch->i_list);
-
-	/* save a reference to device and bump the count to make it official */
-	get_inotify_dev(dev);
-	watch->dev = dev;
-
-	/*
-	 * Save a reference to the inode and bump the ref count to make it
-	 * official.  We hold a reference to nameidata, which makes this safe.
-	 */
-	watch->inode = igrab(inode);
-
-	/* bump our own count, corresponding to our entry in dev->watches */
-	get_inotify_watch(watch);
-
-	atomic_inc(&dev->user->inotify_watches);
-
-	return watch;
-}
-
-/*
- * inotify_find_dev - find the watch associated with the given inode and dev
+ * inotify_find_handle - find the watch associated with the given inode and
+ * handle
  *
  * Callers must hold inode->inotify_mutex.
  */
-static struct inotify_watch *inode_find_dev(struct inode *inode,
-					    struct inotify_device *dev)
+static struct inotify_watch *inode_find_handle(struct inode *inode,
					       struct inotify_handle *ih)
 {
 	struct inotify_watch *watch;
 
 	list_for_each_entry(watch, &inode->inotify_watches, i_list) {
-		if (watch->dev == dev)
+		if (watch->ih == ih)
 			return watch;
 	}
@@ -491,39 +208,34 @@ static struct inotify_watch *inode_find_dev(struct inode *inode,
 
 /*
  * remove_watch_no_event - remove_watch() without the IN_IGNORED event.
+ *
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
  */
 static void remove_watch_no_event(struct inotify_watch *watch,
-				  struct inotify_device *dev)
+				  struct inotify_handle *ih)
 {
 	list_del(&watch->i_list);