Commit 97defe1e authored by Thomas Graf's avatar Thomas Graf Committed by David S. Miller
Browse files

rhashtable: Per bucket locks & deferred expansion/shrinking

Introduces an array of spinlocks to protect bucket mutations. The number
of spinlocks per CPU is configurable and selected based on the hash of
the bucket. This allows for parallel insertions and removals of entries
which do not share a lock.

The patch also defers expansion and shrinking to a worker queue which
allows insertion and removal from atomic context. Insertions and
deletions may occur in parallel to it and are only held up briefly
while the particular bucket is linked or unzipped.

Mutations of the bucket table pointer is protected by a new mutex, read
access is RCU protected.

In the event of an expansion or shrinking, the new bucket table allocated
is exposed as a so called future table as soon as the resize process
starts.  Lookups, deletions, and insertions will briefly use both tables.
The future table becomes the main table after an RCU grace period and
initial linking of the old to the new table was performed. Optimization
of the chains to make use of the new number of buckets follows only the
new table is in use.

The side effect of this is that during that RCU grace period, a bucket
traversal using any rht_for_each() variant on the main table will not see
any insertions performed during the RCU grace period which would at that
point land in the future table. The lookup will see them as it searches
both tables if needed.

Having multiple insertions and removals occur in parallel requires nelems
to become an atomic counter.
Signed-off-by: default avatarThomas Graf <>
Signed-off-by: default avatarDavid S. Miller <>
parent 113948d8
......@@ -19,6 +19,7 @@
#include <linux/rculist.h>
#include <linux/workqueue.h>
struct rhash_head {
struct rhash_head __rcu *next;
......@@ -26,8 +27,17 @@ struct rhash_head {
#define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL)
* struct bucket_table - Table of hash buckets
* @size: Number of hash buckets
* @locks_mask: Mask to apply before accessing locks[]
* @locks: Array of spinlocks protecting individual buckets
* @buckets: size * hash buckets
struct bucket_table {
size_t size;
unsigned int locks_mask;
spinlock_t *locks;
struct rhash_head __rcu *buckets[];
......@@ -45,11 +55,11 @@ struct rhashtable;
* @hash_rnd: Seed to use while hashing
* @max_shift: Maximum number of shifts while expanding
* @min_shift: Minimum number of shifts while shrinking
* @locks_mul: Number of bucket locks to allocate per cpu (default: 128)
* @hashfn: Function to hash key
* @obj_hashfn: Function to hash object
* @grow_decision: If defined, may return true if table should expand
* @shrink_decision: If defined, may return true if table should shrink
* @mutex_is_held: Must return true if protecting mutex is held
struct rhashtable_params {
size_t nelem_hint;
......@@ -59,37 +69,42 @@ struct rhashtable_params {
u32 hash_rnd;
size_t max_shift;
size_t min_shift;
size_t locks_mul;
rht_hashfn_t hashfn;
rht_obj_hashfn_t obj_hashfn;
bool (*grow_decision)(const struct rhashtable *ht,
size_t new_size);
bool (*shrink_decision)(const struct rhashtable *ht,
size_t new_size);
int (*mutex_is_held)(void *parent);
void *parent;
* struct rhashtable - Hash table handle
* @tbl: Bucket table
* @future_tbl: Table under construction during expansion/shrinking
* @nelems: Number of elements in table
* @shift: Current size (1 << shift)
* @p: Configuration parameters
* @run_work: Deferred worker to expand/shrink asynchronously
* @mutex: Mutex to protect current/future table swapping
* @being_destroyed: True if table is set up for destruction
struct rhashtable {
struct bucket_table __rcu *tbl;
size_t nelems;
struct bucket_table __rcu *future_tbl;
atomic_t nelems;
size_t shift;
struct rhashtable_params p;
struct delayed_work run_work;
struct mutex mutex;
bool being_destroyed;
int lockdep_rht_mutex_is_held(const struct rhashtable *ht);
int lockdep_rht_mutex_is_held(struct rhashtable *ht);
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht)
static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
return 1;
......@@ -112,11 +127,11 @@ bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size);
int rhashtable_expand(struct rhashtable *ht);
int rhashtable_shrink(struct rhashtable *ht);
void *rhashtable_lookup(const struct rhashtable *ht, const void *key);
void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key,
void *rhashtable_lookup(struct rhashtable *ht, const void *key);
void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key,
bool (*compare)(void *, void *), void *arg);
void rhashtable_destroy(const struct rhashtable *ht);
void rhashtable_destroy(struct rhashtable *ht);
#define rht_dereference(p, ht) \
rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))
This diff is collapsed.
......@@ -33,7 +33,7 @@ static bool nft_hash_lookup(const struct nft_set *set,
const struct nft_data *key,
struct nft_data *data)
const struct rhashtable *priv = nft_set_priv(set);
struct rhashtable *priv = nft_set_priv(set);
const struct nft_hash_elem *he;
he = rhashtable_lookup(priv, key);
......@@ -113,7 +113,7 @@ static bool nft_hash_compare(void *ptr, void *arg)
static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
const struct rhashtable *priv = nft_set_priv(set);
struct rhashtable *priv = nft_set_priv(set);
struct nft_compare_arg arg = {
.set = set,
.elem = elem,
......@@ -129,7 +129,7 @@ static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
struct nft_set_iter *iter)
const struct rhashtable *priv = nft_set_priv(set);
struct rhashtable *priv = nft_set_priv(set);
const struct bucket_table *tbl;
const struct nft_hash_elem *he;
struct nft_set_elem elem;
......@@ -162,13 +162,6 @@ static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
return sizeof(struct rhashtable);
static int lockdep_nfnl_lock_is_held(void *parent)
return lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES);
static int nft_hash_init(const struct nft_set *set,
const struct nft_set_desc *desc,
const struct nlattr * const tb[])
......@@ -182,9 +175,6 @@ static int nft_hash_init(const struct nft_set *set,
.hashfn = jhash,
.grow_decision = rht_grow_above_75,
.shrink_decision = rht_shrink_below_30,
.mutex_is_held = lockdep_nfnl_lock_is_held,
return rhashtable_init(priv, &params);
......@@ -192,16 +182,23 @@ static int nft_hash_init(const struct nft_set *set,
static void nft_hash_destroy(const struct nft_set *set)
const struct rhashtable *priv = nft_set_priv(set);
const struct bucket_table *tbl = priv->tbl;
struct rhashtable *priv = nft_set_priv(set);
const struct bucket_table *tbl;
struct nft_hash_elem *he;
struct rhash_head *pos, *next;
unsigned int i;
/* Stop an eventual async resizing */
priv->being_destroyed = true;
tbl = rht_dereference(priv->tbl, priv);
for (i = 0; i < tbl->size; i++) {
rht_for_each_entry_safe(he, pos, next, tbl, i, node)
nft_hash_elem_destroy(set, he);
......@@ -114,15 +114,6 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
static int lockdep_nl_sk_hash_is_held(void *parent)
if (debug_locks)
return lockdep_is_held(&nl_sk_hash_lock) || lockdep_is_held(&nl_table_lock);
return 1;
static ATOMIC_NOTIFIER_HEAD(netlink_chain);
static DEFINE_SPINLOCK(netlink_tap_lock);
......@@ -1063,7 +1054,8 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid)
goto err;
err = -ENOMEM;
if (BITS_PER_LONG > 32 && unlikely(table->hash.nelems >= UINT_MAX))
if (BITS_PER_LONG > 32 &&
unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX))
goto err;
nlk_sk(sk)->portid = portid;
......@@ -3122,9 +3114,6 @@ static int __init netlink_proto_init(void)
.max_shift = 16, /* 64K */
.grow_decision = rht_grow_above_75,
.shrink_decision = rht_shrink_below_30,
.mutex_is_held = lockdep_nl_sk_hash_is_held,
if (err != 0)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment