Commit 72a3effa, authored by Eric Dumazet, committed by David S. Miller

[NET]: Size listen hash tables using backlog hint

We currently allocate a fixed-size hash table (TCP_SYNQ_HSIZE=512 slots) for
each LISTEN socket, regardless of various parameters (the listen backlog, for
example).

On x86_64, this means order-1 allocations (which might fail), even for 'small'
sockets that expect few connections. Conversely, a huge server wanting a
backlog of 50000 is slowed down a bit because of this fixed limit.

This patch makes the sizing of the listen hash table a dynamic parameter,
depending on:
- net.core.somaxconn tunable (default is 128)
- net.ipv4.tcp_max_syn_backlog tunable (default : 256, 1024 or 128)
- backlog value given by user application  (2nd parameter of listen())

For large allocations (bigger than PAGE_SIZE), we use vmalloc() instead of
kmalloc().

We still limit memory allocation with the two existing tunables (somaxconn &
tcp_max_syn_backlog). So for standard setups, this patch actually reduces RAM
usage.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 3c62f75a
...@@ -28,8 +28,8 @@ struct proto; ...@@ -28,8 +28,8 @@ struct proto;
struct request_sock_ops { struct request_sock_ops {
int family; int family;
kmem_cache_t *slab;
int obj_size; int obj_size;
kmem_cache_t *slab;
int (*rtx_syn_ack)(struct sock *sk, int (*rtx_syn_ack)(struct sock *sk,
struct request_sock *req, struct request_sock *req,
struct dst_entry *dst); struct dst_entry *dst);
...@@ -51,13 +51,13 @@ struct request_sock { ...@@ -51,13 +51,13 @@ struct request_sock {
u32 rcv_wnd; /* rcv_wnd offered first time */ u32 rcv_wnd; /* rcv_wnd offered first time */
u32 ts_recent; u32 ts_recent;
unsigned long expires; unsigned long expires;
struct request_sock_ops *rsk_ops; const struct request_sock_ops *rsk_ops;
struct sock *sk; struct sock *sk;
u32 secid; u32 secid;
u32 peer_secid; u32 peer_secid;
}; };
static inline struct request_sock *reqsk_alloc(struct request_sock_ops *ops) static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
{ {
struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC); struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC);
...@@ -121,7 +121,7 @@ struct request_sock_queue { ...@@ -121,7 +121,7 @@ struct request_sock_queue {
}; };
extern int reqsk_queue_alloc(struct request_sock_queue *queue, extern int reqsk_queue_alloc(struct request_sock_queue *queue,
const int nr_table_entries); unsigned int nr_table_entries);
static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue) static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue)
{ {
......
...@@ -138,7 +138,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); ...@@ -138,7 +138,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
#define MAX_TCP_SYNCNT 127 #define MAX_TCP_SYNCNT 127
#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
#define TCP_SYNQ_HSIZE 512 /* Size of SYNACK hash table */
#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) #define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <linux/random.h> #include <linux/random.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/vmalloc.h>
#include <net/request_sock.h> #include <net/request_sock.h>
...@@ -29,22 +30,31 @@ ...@@ -29,22 +30,31 @@
* it is absolutely not enough even at 100conn/sec. 256 cures most * it is absolutely not enough even at 100conn/sec. 256 cures most
* of problems. This value is adjusted to 128 for very small machines * of problems. This value is adjusted to 128 for very small machines
* (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
* Further increasing requires to change hash table size. * Note : Dont forget somaxconn that may limit backlog too.
*/ */
int sysctl_max_syn_backlog = 256; int sysctl_max_syn_backlog = 256;
int reqsk_queue_alloc(struct request_sock_queue *queue, int reqsk_queue_alloc(struct request_sock_queue *queue,
const int nr_table_entries) unsigned int nr_table_entries)
{ {
const int lopt_size = sizeof(struct listen_sock) + size_t lopt_size = sizeof(struct listen_sock);
nr_table_entries * sizeof(struct request_sock *); struct listen_sock *lopt;
struct listen_sock *lopt = kzalloc(lopt_size, GFP_KERNEL);
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
nr_table_entries = max_t(u32, nr_table_entries, 8);
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
lopt_size += nr_table_entries * sizeof(struct request_sock *);
if (lopt_size > PAGE_SIZE)
lopt = __vmalloc(lopt_size,
GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
PAGE_KERNEL);
else
lopt = kzalloc(lopt_size, GFP_KERNEL);
if (lopt == NULL) if (lopt == NULL)
return -ENOMEM; return -ENOMEM;
for (lopt->max_qlen_log = 6; for (lopt->max_qlen_log = 3;
(1 << lopt->max_qlen_log) < sysctl_max_syn_backlog; (1 << lopt->max_qlen_log) < nr_table_entries;
lopt->max_qlen_log++); lopt->max_qlen_log++);
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
...@@ -65,9 +75,11 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) ...@@ -65,9 +75,11 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
{ {
/* make all the listen_opt local to us */ /* make all the listen_opt local to us */
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
size_t lopt_size = sizeof(struct listen_sock) +
lopt->nr_table_entries * sizeof(struct request_sock *);
if (lopt->qlen != 0) { if (lopt->qlen != 0) {
int i; unsigned int i;
for (i = 0; i < lopt->nr_table_entries; i++) { for (i = 0; i < lopt->nr_table_entries; i++) {
struct request_sock *req; struct request_sock *req;
...@@ -81,7 +93,10 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) ...@@ -81,7 +93,10 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
} }
BUG_TRAP(lopt->qlen == 0); BUG_TRAP(lopt->qlen == 0);
kfree(lopt); if (lopt_size > PAGE_SIZE)
vfree(lopt);
else
kfree(lopt);
} }
EXPORT_SYMBOL(reqsk_queue_destroy); EXPORT_SYMBOL(reqsk_queue_destroy);
...@@ -1022,7 +1022,7 @@ static void dccp_v4_reqsk_destructor(struct request_sock *req) ...@@ -1022,7 +1022,7 @@ static void dccp_v4_reqsk_destructor(struct request_sock *req)
kfree(inet_rsk(req)->opt); kfree(inet_rsk(req)->opt);
} }
static struct request_sock_ops dccp_request_sock_ops = { static struct request_sock_ops dccp_request_sock_ops _read_mostly = {
.family = PF_INET, .family = PF_INET,
.obj_size = sizeof(struct dccp_request_sock), .obj_size = sizeof(struct dccp_request_sock),
.rtx_syn_ack = dccp_v4_send_response, .rtx_syn_ack = dccp_v4_send_response,
......
...@@ -262,12 +262,12 @@ int dccp_destroy_sock(struct sock *sk) ...@@ -262,12 +262,12 @@ int dccp_destroy_sock(struct sock *sk)
EXPORT_SYMBOL_GPL(dccp_destroy_sock); EXPORT_SYMBOL_GPL(dccp_destroy_sock);
static inline int dccp_listen_start(struct sock *sk) static inline int dccp_listen_start(struct sock *sk, int backlog)
{ {
struct dccp_sock *dp = dccp_sk(sk); struct dccp_sock *dp = dccp_sk(sk);
dp->dccps_role = DCCP_ROLE_LISTEN; dp->dccps_role = DCCP_ROLE_LISTEN;
return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); return inet_csk_listen_start(sk, backlog);
} }
int dccp_disconnect(struct sock *sk, int flags) int dccp_disconnect(struct sock *sk, int flags)
...@@ -788,7 +788,7 @@ int inet_dccp_listen(struct socket *sock, int backlog) ...@@ -788,7 +788,7 @@ int inet_dccp_listen(struct socket *sock, int backlog)
* FIXME: here it probably should be sk->sk_prot->listen_start * FIXME: here it probably should be sk->sk_prot->listen_start
* see tcp_listen_start * see tcp_listen_start
*/ */
err = dccp_listen_start(sk); err = dccp_listen_start(sk, backlog);
if (err) if (err)
goto out; goto out;
} }
......
...@@ -204,7 +204,7 @@ int inet_listen(struct socket *sock, int backlog) ...@@ -204,7 +204,7 @@ int inet_listen(struct socket *sock, int backlog)
* we can only allow the backlog to be adjusted. * we can only allow the backlog to be adjusted.
*/ */
if (old_state != TCP_LISTEN) { if (old_state != TCP_LISTEN) {
err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); err = inet_csk_listen_start(sk, backlog);
if (err) if (err)
goto out; goto out;
} }
......
...@@ -343,7 +343,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk, ...@@ -343,7 +343,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
EXPORT_SYMBOL_GPL(inet_csk_route_req); EXPORT_SYMBOL_GPL(inet_csk_route_req);
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
const u32 rnd, const u16 synq_hsize) const u32 rnd, const u32 synq_hsize)
{ {
return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
} }
......
...@@ -715,7 +715,7 @@ static struct ip_options *tcp_v4_save_options(struct sock *sk, ...@@ -715,7 +715,7 @@ static struct ip_options *tcp_v4_save_options(struct sock *sk,
return dopt; return dopt;
} }
struct request_sock_ops tcp_request_sock_ops = { struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET, .family = PF_INET,
.obj_size = sizeof(struct tcp_request_sock), .obj_size = sizeof(struct tcp_request_sock),
.rtx_syn_ack = tcp_v4_send_synack, .rtx_syn_ack = tcp_v4_send_synack,
...@@ -1385,7 +1385,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ...@@ -1385,7 +1385,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
if (st->state == TCP_SEQ_STATE_OPENREQ) { if (st->state == TCP_SEQ_STATE_OPENREQ) {
struct request_sock *req = cur; struct request_sock *req = cur;
icsk = inet_csk(st->syn_wait_sk); icsk = inet_csk(st->syn_wait_sk);
req = req->dl_next; req = req->dl_next;
while (1) { while (1) {
while (req) { while (req) {
...@@ -1395,7 +1395,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ...@@ -1395,7 +1395,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
} }
req = req->dl_next; req = req->dl_next;
} }
if (++st->sbucket >= TCP_SYNQ_HSIZE) if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
break; break;
get_req: get_req:
req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
......
...@@ -526,7 +526,7 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req) ...@@ -526,7 +526,7 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req)
kfree_skb(inet6_rsk(req)->pktopts); kfree_skb(inet6_rsk(req)->pktopts);
} }
static struct request_sock_ops tcp6_request_sock_ops = { static struct request_sock_ops tcp6_request_sock_ops _read_mostly = {
.family = AF_INET6, .family = AF_INET6,
.obj_size = sizeof(struct tcp6_request_sock), .obj_size = sizeof(struct tcp6_request_sock),
.rtx_syn_ack = tcp_v6_send_synack, .rtx_syn_ack = tcp_v6_send_synack,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment