Commit b0d40c92 authored by Dave Chinner's avatar Dave Chinner Committed by Al Viro

superblock: introduce per-sb cache shrinker infrastructure

With context based shrinkers, we can implement a per-superblock
shrinker that shrinks the caches attached to the superblock. We
currently have global shrinkers for the inode and dentry caches that
split up into per-superblock operations via a coarse proportioning
method that does not batch very well.  The global shrinkers also
have a dependency - dentries pin inodes - so we have to be very
careful about how we register the global shrinkers so that the
implicit call order is always correct.

With a per-sb shrinker callout, we can encode this dependency
directly into the per-sb shrinker, hence avoiding the need for
strictly ordering shrinker registrations. We also have no need for
any proportioning code for the shrinker subsystem already provides
this functionality across all shrinkers. Allowing the shrinker to
operate on a single superblock at a time means that we do less
superblock list traversals and locking and reclaim should batch more
effectively. This should result in less CPU overhead for reclaim and
potentially faster reclaim of items from each filesystem.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
parent 12ad3ab6
...@@ -743,13 +743,11 @@ static void shrink_dentry_list(struct list_head *list) ...@@ -743,13 +743,11 @@ static void shrink_dentry_list(struct list_head *list)
* *
* If flags contains DCACHE_REFERENCED reference dentries will not be pruned. * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
*/ */
static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) static void __shrink_dcache_sb(struct super_block *sb, int count, int flags)
{ {
/* called from prune_dcache() and shrink_dcache_parent() */
struct dentry *dentry; struct dentry *dentry;
LIST_HEAD(referenced); LIST_HEAD(referenced);
LIST_HEAD(tmp); LIST_HEAD(tmp);
int cnt = *count;
relock: relock:
spin_lock(&dcache_lru_lock); spin_lock(&dcache_lru_lock);
...@@ -777,7 +775,7 @@ relock: ...@@ -777,7 +775,7 @@ relock:
} else { } else {
list_move_tail(&dentry->d_lru, &tmp); list_move_tail(&dentry->d_lru, &tmp);
spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_lock);
if (!--cnt) if (!--count)
break; break;
} }
cond_resched_lock(&dcache_lru_lock); cond_resched_lock(&dcache_lru_lock);
...@@ -787,83 +785,22 @@ relock: ...@@ -787,83 +785,22 @@ relock:
spin_unlock(&dcache_lru_lock); spin_unlock(&dcache_lru_lock);
shrink_dentry_list(&tmp); shrink_dentry_list(&tmp);
*count = cnt;
} }
/** /**
* prune_dcache - shrink the dcache * prune_dcache_sb - shrink the dcache
* @count: number of entries to try to free * @nr_to_scan: number of entries to try to free
* *
* Shrink the dcache. This is done when we need more memory, or simply when we * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
* need to unmount something (at which point we need to unuse all dentries). * done when we need more memory and is called from the superblock shrinker
* function.
* *
* This function may fail to free any resources if all the dentries are in use. * This function may fail to free any resources if all the dentries are in
* use.
*/ */
static void prune_dcache(int count) void prune_dcache_sb(struct super_block *sb, int nr_to_scan)
{ {
struct super_block *sb, *p = NULL; __shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED);
int w_count;
int unused = dentry_stat.nr_unused;
int prune_ratio;
int pruned;
if (unused == 0 || count == 0)
return;
if (count >= unused)
prune_ratio = 1;
else
prune_ratio = unused / count;
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (list_empty(&sb->s_instances))
continue;
if (sb->s_nr_dentry_unused == 0)
continue;
sb->s_count++;
/* Now, we reclaim unused dentrins with fairness.
* We reclaim them same percentage from each superblock.
* We calculate number of dentries to scan on this sb
* as follows, but the implementation is arranged to avoid
* overflows:
* number of dentries to scan on this sb =
* count * (number of dentries on this sb /
* number of dentries in the machine)
*/
spin_unlock(&sb_lock);
if (prune_ratio != 1)
w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
else
w_count = sb->s_nr_dentry_unused;
pruned = w_count;
/*
* We need to be sure this filesystem isn't being unmounted,
* otherwise we could race with generic_shutdown_super(), and
* end up holding a reference to an inode while the filesystem
* is unmounted. So we try to get s_umount, and make sure
* s_root isn't NULL.
*/
if (down_read_trylock(&sb->s_umount)) {
if ((sb->s_root != NULL) &&
(!list_empty(&sb->s_dentry_lru))) {
__shrink_dcache_sb(sb, &w_count,
DCACHE_REFERENCED);
pruned -= w_count;
}
up_read(&sb->s_umount);
}
spin_lock(&sb_lock);
if (p)
__put_super(p);
count -= pruned;
p = sb;
/* more work left to do? */
if (count <= 0)
break;
}
if (p)
__put_super(p);
spin_unlock(&sb_lock);
} }
/** /**
...@@ -1238,42 +1175,10 @@ void shrink_dcache_parent(struct dentry * parent) ...@@ -1238,42 +1175,10 @@ void shrink_dcache_parent(struct dentry * parent)
int found; int found;
while ((found = select_parent(parent)) != 0) while ((found = select_parent(parent)) != 0)
__shrink_dcache_sb(sb, &found, 0); __shrink_dcache_sb(sb, found, 0);
} }
EXPORT_SYMBOL(shrink_dcache_parent); EXPORT_SYMBOL(shrink_dcache_parent);
/*
 * Shrinker callback for the dentry cache.
 *
 * When sc->nr_to_scan is non-zero, ask prune_dcache() to scan that many
 * dentries.  In every case the return value is the unused-dentry count
 * scaled by sysctl_vfs_cache_pressure.
 *
 * We must not reenter the filesystem if the caller is performing a
 * GFP_NOFS allocation attempt.  One example deadlock is:
 *
 * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache->
 * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode->
 * ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK.
 *
 * In that case nothing is scanned and -1 is returned to tell the caller
 * that we bailed out.
 */
static int shrink_dcache_memory(struct shrinker *shrink,
				struct shrink_control *sc)
{
	int to_scan = sc->nr_to_scan;

	if (to_scan != 0) {
		/* No __GFP_FS: pruning could recurse into the filesystem. */
		if ((sc->gfp_mask & __GFP_FS) == 0)
			return -1;

		prune_dcache(to_scan);
	}

	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}
/* Shrinker descriptor for the dentry cache; handed to register_shrinker(). */
static struct shrinker dcache_shrinker = {
	.seeks	= DEFAULT_SEEKS,
	.shrink	= shrink_dcache_memory,
};
/** /**
* __d_alloc - allocate a dcache entry * __d_alloc - allocate a dcache entry
* @sb: filesystem it will belong to * @sb: filesystem it will belong to
...@@ -3083,8 +2988,6 @@ static void __init dcache_init(void) ...@@ -3083,8 +2988,6 @@ static void __init dcache_init(void)
*/ */
dentry_cache = KMEM_CACHE(dentry, dentry_cache = KMEM_CACHE(dentry,
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
register_shrinker(&dcache_shrinker);
/* Hash may have been set up in dcache_init_early */ /* Hash may have been set up in dcache_init_early */
if (!hashdist) if (!hashdist)
......
...@@ -73,7 +73,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); ...@@ -73,7 +73,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
* *
* We don't actually need it to protect anything in the umount path, * We don't actually need it to protect anything in the umount path,
* but only need to cycle through it to make sure any inode that * but only need to cycle through it to make sure any inode that
* prune_icache took off the LRU list has been fully torn down by the * prune_icache_sb took off the LRU list has been fully torn down by the
* time we are past evict_inodes. * time we are past evict_inodes.
*/ */
static DECLARE_RWSEM(iprune_sem); static DECLARE_RWSEM(iprune_sem);
...@@ -544,7 +544,7 @@ void evict_inodes(struct super_block *sb) ...@@ -544,7 +544,7 @@ void evict_inodes(struct super_block *sb)
dispose_list(&dispose); dispose_list(&dispose);
/* /*
* Cycle through iprune_sem to make sure any inode that prune_icache * Cycle through iprune_sem to make sure any inode that prune_icache_sb
* moved off the list before we took the lock has been fully torn * moved off the list before we took the lock has been fully torn
* down. * down.
*/ */
...@@ -612,9 +612,10 @@ static int can_unuse(struct inode *inode) ...@@ -612,9 +612,10 @@ static int can_unuse(struct inode *inode)
} }
/* /*
* Scan `goal' inodes on the unused list for freeable ones. They are moved to a * Walk the superblock inode LRU for freeable inodes and attempt to free them.
* temporary list and then are freed outside sb->s_inode_lru_lock by * This is called from the superblock shrinker function with a number of inodes
* dispose_list(). * to trim from the LRU. Inodes to be freed are moved to a temporary list and
* then are freed outside sb->s_inode_lru_lock by dispose_list().
* *
* Any inodes which are pinned purely because of attached pagecache have their * Any inodes which are pinned purely because of attached pagecache have their
* pagecache removed. If the inode has metadata buffers attached to * pagecache removed. If the inode has metadata buffers attached to
...@@ -628,14 +629,15 @@ static int can_unuse(struct inode *inode) ...@@ -628,14 +629,15 @@ static int can_unuse(struct inode *inode)
* LRU does not have strict ordering. Hence we don't want to reclaim inodes * LRU does not have strict ordering. Hence we don't want to reclaim inodes
* with this flag set because they are the inodes that are out of order. * with this flag set because they are the inodes that are out of order.
*/ */
static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan) void prune_icache_sb(struct super_block *sb, int nr_to_scan)
{ {
LIST_HEAD(freeable); LIST_HEAD(freeable);
int nr_scanned; int nr_scanned;
unsigned long reap = 0; unsigned long reap = 0;
down_read(&iprune_sem);
spin_lock(&sb->s_inode_lru_lock); spin_lock(&sb->s_inode_lru_lock);
for (nr_scanned = *nr_to_scan; nr_scanned >= 0; nr_scanned--) { for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
struct inode *inode; struct inode *inode;
if (list_empty(&sb->s_inode_lru)) if (list_empty(&sb->s_inode_lru))
...@@ -707,111 +709,11 @@ static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan) ...@@ -707,111 +709,11 @@ static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan)
else else
__count_vm_events(PGINODESTEAL, reap); __count_vm_events(PGINODESTEAL, reap);
spin_unlock(&sb->s_inode_lru_lock); spin_unlock(&sb->s_inode_lru_lock);
*nr_to_scan = nr_scanned;
dispose_list(&freeable); dispose_list(&freeable);
}
static void prune_icache(int count)
{
struct super_block *sb, *p = NULL;
int w_count;
int unused = inodes_stat.nr_unused;
int prune_ratio;
int pruned;
if (unused == 0 || count == 0)
return;
down_read(&iprune_sem);
if (count >= unused)
prune_ratio = 1;
else
prune_ratio = unused / count;
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (list_empty(&sb->s_instances))
continue;
if (sb->s_nr_inodes_unused == 0)
continue;
sb->s_count++;
/* Now, we reclaim unused dentrins with fairness.
* We reclaim them same percentage from each superblock.
* We calculate number of dentries to scan on this sb
* as follows, but the implementation is arranged to avoid
* overflows:
* number of dentries to scan on this sb =
* count * (number of dentries on this sb /
* number of dentries in the machine)
*/
spin_unlock(&sb_lock);
if (prune_ratio != 1)
w_count = (sb->s_nr_inodes_unused / prune_ratio) + 1;
else
w_count = sb->s_nr_inodes_unused;
pruned = w_count;
/*
* We need to be sure this filesystem isn't being unmounted,
* otherwise we could race with generic_shutdown_super(), and
* end up holding a reference to an inode while the filesystem
* is unmounted. So we try to get s_umount, and make sure
* s_root isn't NULL.
*/
if (down_read_trylock(&sb->s_umount)) {
if ((sb->s_root != NULL) &&
(!list_empty(&sb->s_dentry_lru))) {
shrink_icache_sb(sb, &w_count);
pruned -= w_count;
}
up_read(&sb->s_umount);
}
spin_lock(&sb_lock);
if (p)
__put_super(p);
count -= pruned;
p = sb;
/* more work left to do? */
if (count <= 0)
break;
}
if (p)
__put_super(p);
spin_unlock(&sb_lock);
up_read(&iprune_sem); up_read(&iprune_sem);
} }
/*
 * Shrinker callback for the inode cache.
 *
 * Attempts to reclaim some unused inodes.  Here, "unused" means that no
 * dentries are referring to the inodes: the files are not open and the
 * dcache references to those inodes have already been reclaimed.
 *
 * sc->nr_to_scan is the number of inodes to scan (zero means "just report").
 * Returns the number of remaining possibly-reclaimable inodes scaled by
 * sysctl_vfs_cache_pressure, or -1 when scanning had to be refused.
 */
static int shrink_icache_memory(struct shrinker *shrink,
				struct shrink_control *sc)
{
	int to_scan = sc->nr_to_scan;

	if (to_scan != 0) {
		/*
		 * Nasty deadlock avoidance.  We may hold various FS locks,
		 * and we don't want to recurse into the FS that called us
		 * in clear_inode() and friends..
		 */
		if ((sc->gfp_mask & __GFP_FS) == 0)
			return -1;

		prune_icache(to_scan);
	}

	return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
}
/* Shrinker descriptor for the inode cache; handed to register_shrinker(). */
static struct shrinker icache_shrinker = {
	.seeks	= DEFAULT_SEEKS,
	.shrink	= shrink_icache_memory,
};
static void __wait_on_freeing_inode(struct inode *inode); static void __wait_on_freeing_inode(struct inode *inode);
/* /*
* Called with the inode lock held. * Called with the inode lock held.
...@@ -1691,7 +1593,6 @@ void __init inode_init(void) ...@@ -1691,7 +1593,6 @@ void __init inode_init(void)
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
SLAB_MEM_SPREAD), SLAB_MEM_SPREAD),
init_once); init_once);
register_shrinker(&icache_shrinker);
/* Hash may have been set up in inode_init_early */ /* Hash may have been set up in inode_init_early */
if (!hashdist) if (!hashdist)
......
...@@ -38,6 +38,48 @@ ...@@ -38,6 +38,48 @@
LIST_HEAD(super_blocks); LIST_HEAD(super_blocks);
DEFINE_SPINLOCK(sb_lock); DEFINE_SPINLOCK(sb_lock);
/*
 * Per-superblock shrinker callback: trims the dentry and inode LRUs that
 * belong to the superblock embedding @shrink.
 *
 * @shrink: the s_shrink member of the superblock to work on
 * @sc:     reclaim request; nr_to_scan == 0 is a pure size query
 *
 * Returns the number of unused dentries plus inodes on this superblock,
 * scaled by sysctl_vfs_cache_pressure percent, or -1 when a scan was
 * requested but cannot be performed right now.
 *
 * One thing we have to be careful of with a per-sb shrinker is that we don't
 * drop the last active reference to the superblock from within the shrinker.
 * If that happens we could trigger unregistering the shrinker from within the
 * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
 * take a passive reference to the superblock to avoid this from occurring.
 */
static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
{
	struct super_block *sb;
	int total;
	int count;

	sb = container_of(shrink, struct super_block, s_shrink);

	/*
	 * Deadlock avoidance.  We may hold various FS locks, and we don't want
	 * to recurse into the FS that called us in clear_inode() and friends..
	 */
	if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
		return -1;

	/*
	 * -1 means "cannot scan now" and is only a valid answer to a scan
	 * request.  A size query (nr_to_scan == 0) must yield a count, so
	 * report an empty cache when the superblock cannot be pinned.
	 */
	if (!grab_super_passive(sb))
		return sc->nr_to_scan ? -1 : 0;

	if (sc->nr_to_scan) {
		/*
		 * Proportion the scan between the two caches; the +1 keeps
		 * the divisor non-zero when both LRUs are empty.
		 */
		total = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused + 1;
		count = (sc->nr_to_scan * sb->s_nr_dentry_unused) / total;

		/* prune dcache first as icache is pinned by it */
		prune_dcache_sb(sb, count);
		prune_icache_sb(sb, sc->nr_to_scan - count);
	}

	/*
	 * Scale by the cache pressure percentage.  Handle the quotient and
	 * the remainder separately: a plain (total / 100) * pressure would
	 * truncate to zero for superblocks with fewer than 100 unused
	 * objects, and total * pressure could overflow.
	 */
	total = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused;
	count = (total / 100) * sysctl_vfs_cache_pressure +
		((total % 100) * sysctl_vfs_cache_pressure) / 100;

	drop_super(sb);
	return count;
}
/** /**
* alloc_super - create new superblock * alloc_super - create new superblock
* @type: filesystem type superblock should belong to * @type: filesystem type superblock should belong to
...@@ -116,6 +158,9 @@ static struct super_block *alloc_super(struct file_system_type *type) ...@@ -116,6 +158,9 @@ static struct super_block *alloc_super(struct file_system_type *type)
s->s_op = &default_op; s->s_op = &default_op;
s->s_time_gran = 1000000000; s->s_time_gran = 1000000000;
s->cleancache_poolid = -1; s->cleancache_poolid = -1;
s->s_shrink.seeks = DEFAULT_SEEKS;
s->s_shrink.shrink = prune_super;
} }
out: out:
return s; return s;
...@@ -183,6 +228,10 @@ void deactivate_locked_super(struct super_block *s) ...@@ -183,6 +228,10 @@ void deactivate_locked_super(struct super_block *s)
if (atomic_dec_and_test(&s->s_active)) { if (atomic_dec_and_test(&s->s_active)) {
cleancache_flush_fs(s); cleancache_flush_fs(s);
fs->kill_sb(s); fs->kill_sb(s);
/* caches are now gone, we can safely kill the shrinker now */
unregister_shrinker(&s->s_shrink);
/* /*
* We need to call rcu_barrier so all the delayed rcu free * We need to call rcu_barrier so all the delayed rcu free
* inodes are flushed before we release the fs module. * inodes are flushed before we release the fs module.
...@@ -311,7 +360,6 @@ void generic_shutdown_super(struct super_block *sb) ...@@ -311,7 +360,6 @@ void generic_shutdown_super(struct super_block *sb)
{ {
const struct super_operations *sop = sb->s_op; const struct super_operations *sop = sb->s_op;
if (sb->s_root) { if (sb->s_root) {
shrink_dcache_for_umount(sb); shrink_dcache_for_umount(sb);
sync_filesystem(sb); sync_filesystem(sb);
...@@ -399,6 +447,7 @@ retry: ...@@ -399,6 +447,7 @@ retry:
list_add(&s->s_instances, &type->fs_supers); list_add(&s->s_instances, &type->fs_supers);
spin_unlock(&sb_lock); spin_unlock(&sb_lock);
get_filesystem(type); get_filesystem(type);
register_shrinker(&s->s_shrink);
return s; return s;
} }
......
...@@ -393,6 +393,7 @@ struct inodes_stat_t { ...@@ -393,6 +393,7 @@ struct inodes_stat_t {
#include <linux/semaphore.h> #include <linux/semaphore.h>
#include <linux/fiemap.h> #include <linux/fiemap.h>
#include <linux/rculist_bl.h> #include <linux/rculist_bl.h>
#include <linux/shrinker.h>
#include <linux/atomic.h> #include <linux/atomic.h>
#include <asm/byteorder.h> #include <asm/byteorder.h>
...@@ -1444,8 +1445,14 @@ struct super_block { ...@@ -1444,8 +1445,14 @@ struct super_block {
* Saved pool identifier for cleancache (-1 means none) * Saved pool identifier for cleancache (-1 means none)
*/ */
int cleancache_poolid; int cleancache_poolid;
struct shrinker s_shrink; /* per-sb shrinker handle */
}; };
/* superblock cache pruning functions */
extern void prune_icache_sb(struct super_block *sb, int nr_to_scan);
extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);
extern struct timespec current_fs_time(struct super_block *sb); extern struct timespec current_fs_time(struct super_block *sb);
/* /*
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <linux/range.h> #include <linux/range.h>
#include <linux/pfn.h> #include <linux/pfn.h>
#include <linux/bit_spinlock.h> #include <linux/bit_spinlock.h>
#include <linux/shrinker.h>
struct mempolicy; struct mempolicy;
struct anon_vma; struct anon_vma;
...@@ -1121,45 +1122,6 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) ...@@ -1121,45 +1122,6 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
} }
#endif #endif
/*
 * This struct is used to pass information from page reclaim to the shrinkers.
 * We consolidate the values for easier extension later.
 */
struct shrink_control {
/* GFP flags of the allocation we are currently trying to fulfil */
gfp_t gfp_mask;
/* How many slab objects shrinker() should scan and try to reclaim */
unsigned long nr_to_scan;
};
/*
 * A callback you can register to apply pressure to ageable caches.
 *
 * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
 * and a 'gfp_mask'. The callback should look through the
 * least-recently-used 'nr_to_scan' entries and attempt to free them up.
 * It should return the number of objects which remain in the cache. If it
 * returns -1, it means it cannot do any scanning at this time (eg. there is
 * a risk of deadlock).
 *
 * The 'gfp_mask' refers to the allocation we are currently trying to
 * fulfil.
 *
 * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
 * querying the cache size, so a fastpath for that case is appropriate.
 */
struct shrinker {
int (*shrink)(struct shrinker *, struct shrink_control *sc);
int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
/* These are for internal use */
struct list_head list;
long nr; /* objs pending delete */
};
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
/* Add/remove a shrinker; both take the caller-provided struct shrinker. */
extern void register_shrinker(struct shrinker *);
extern void unregister_shrinker(struct shrinker *);
int vma_wants_writenotify(struct vm_area_struct *vma); int vma_wants_writenotify(struct vm_area_struct *vma);
extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
......
#ifndef _LINUX_SHRINKER_H
#define _LINUX_SHRINKER_H
/*
 * This struct is used to pass information from page reclaim to the shrinkers.
 * We consolidate the values for easier extension later.
 */
struct shrink_control {
/* GFP flags of the allocation we are currently trying to fulfil */
gfp_t gfp_mask;
/* How many slab objects shrinker() should scan and try to reclaim */
unsigned long nr_to_scan;
};
/*
 * A callback you can register to apply pressure to ageable caches.
 *
 * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
 * and a 'gfp_mask'. The callback should look through the
 * least-recently-used 'nr_to_scan' entries and attempt to free them up.
 * It should return the number of objects which remain in the cache. If it
 * returns -1, it means it cannot do any scanning at this time (eg. there is
 * a risk of deadlock).
 *
 * The 'gfp_mask' refers to the allocation we are currently trying to
 * fulfil.
 *
 * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
 * querying the cache size, so a fastpath for that case is appropriate.
 */
struct shrinker {
int (*shrink)(struct shrinker *, struct shrink_control *sc);
int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
/* These are for internal use */
struct list_head list;
long nr; /* objs pending delete */
};
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
/* Add/remove a shrinker; both take the caller-provided struct shrinker. */
extern void register_shrinker(struct shrinker *);
extern void unregister_shrinker(struct shrinker *);
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment