Commit 76973dd7 authored by David S. Miller's avatar David S. Miller

Merge branch 'setsockopt_incoming_cpu'

Eric Dumazet says:

====================
tcp: better smp listener behavior

As promised in last patch series, we implement a better SO_REUSEPORT
strategy, based on cpu hints if given by the application.

We also moved sk_refcnt out of the cache line containing the lookup
keys, as it was considerably slowing down smp operations because
of false sharing. This was simpler than converting listen sockets
to conventional RCU (to avoid sk_refcnt dirtying)

Could process 6.0 Mpps SYN instead of 4.2 Mpps on my test server.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents c7d39e32 d475f090
......@@ -356,8 +356,8 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
struct tcp_timewait_sock {
struct inet_timewait_sock tw_sk;
u32 tw_rcv_nxt;
u32 tw_snd_nxt;
#define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt
#define tw_snd_nxt tw_sk.__tw_common.skc_tw_snd_nxt
u32 tw_rcv_wnd;
u32 tw_ts_offset;
u32 tw_ts_recent;
......
......@@ -70,6 +70,7 @@ struct inet_timewait_sock {
#define tw_dport __tw_common.skc_dport
#define tw_num __tw_common.skc_num
#define tw_cookie __tw_common.skc_cookie
#define tw_dr __tw_common.skc_tw_dr
int tw_timeout;
volatile unsigned char tw_substate;
......@@ -88,7 +89,6 @@ struct inet_timewait_sock {
kmemcheck_bitfield_end(flags);
struct timer_list tw_timer;
struct inet_bind_bucket *tw_tb;
struct inet_timewait_death_row *tw_dr;
};
#define tw_tclass tw_tos
......
......@@ -50,16 +50,15 @@ struct request_sock {
struct sock_common __req_common;
#define rsk_refcnt __req_common.skc_refcnt
#define rsk_hash __req_common.skc_hash
#define rsk_listener __req_common.skc_listener
#define rsk_window_clamp __req_common.skc_window_clamp
#define rsk_rcv_wnd __req_common.skc_rcv_wnd
struct request_sock *dl_next;
struct sock *rsk_listener;
u16 mss;
u8 num_retrans; /* number of retransmits */
u8 cookie_ts:1; /* syncookie: encode tcpopts in timestamp */
u8 num_timeout:7; /* number of timeouts */
/* The following two fields can be easily recomputed I think -AK */
u32 window_clamp; /* window clamp at creation time */
u32 rcv_wnd; /* rcv_wnd offered first time */
u32 ts_recent;
struct timer_list rsk_timer;
const struct request_sock_ops *rsk_ops;
......
......@@ -150,6 +150,10 @@ typedef __u64 __bitwise __addrpair;
* @skc_node: main hash linkage for various protocol lookup tables
* @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
* @skc_tx_queue_mapping: tx queue number for this connection
* @skc_flags: place holder for sk_flags
* %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
* %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
* @skc_incoming_cpu: record/match cpu processing incoming packets
* @skc_refcnt: reference count
*
* This is the minimal network layer representation of sockets, the header
......@@ -200,6 +204,16 @@ struct sock_common {
atomic64_t skc_cookie;
/* following fields are padding to force
* offset(struct sock, sk_refcnt) == 128 on 64bit arches
* assuming IPV6 is enabled. We use this padding differently
* for different kind of 'sockets'
*/
union {
unsigned long skc_flags;
struct sock *skc_listener; /* request_sock */
struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
};
/*
* fields between dontcopy_begin/dontcopy_end
* are not copied in sock_copy()
......@@ -212,9 +226,20 @@ struct sock_common {
struct hlist_nulls_node skc_nulls_node;
};
int skc_tx_queue_mapping;
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
u32 skc_tw_rcv_nxt; /* struct tcp_timewait_sock */
};
atomic_t skc_refcnt;
/* private: */
int skc_dontcopy_end[0];
union {
u32 skc_rxhash;
u32 skc_window_clamp;
u32 skc_tw_snd_nxt; /* struct tcp_timewait_sock */
};
/* public: */
};
......@@ -243,8 +268,6 @@ struct cg_proto;
* @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
* @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
* @sk_sndbuf: size of send buffer in bytes
* @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
* %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
* @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
* @sk_no_check_rx: allow zero checksum in RX packets
* @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
......@@ -273,8 +296,6 @@ struct cg_proto;
* @sk_rcvlowat: %SO_RCVLOWAT setting
* @sk_rcvtimeo: %SO_RCVTIMEO setting
* @sk_sndtimeo: %SO_SNDTIMEO setting
* @sk_rxhash: flow hash received from netif layer
* @sk_incoming_cpu: record cpu processing incoming packets
* @sk_txhash: computed flow hash for use on transmit
* @sk_filter: socket filtering instructions
* @sk_timer: sock cleanup timer
......@@ -331,6 +352,9 @@ struct sock {
#define sk_v6_daddr __sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr
#define sk_cookie __sk_common.skc_cookie
#define sk_incoming_cpu __sk_common.skc_incoming_cpu
#define sk_flags __sk_common.skc_flags
#define sk_rxhash __sk_common.skc_rxhash
socket_lock_t sk_lock;
struct sk_buff_head sk_receive_queue;
......@@ -350,14 +374,6 @@ struct sock {
} sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc
int sk_forward_alloc;
#ifdef CONFIG_RPS
__u32 sk_rxhash;
#endif
u16 sk_incoming_cpu;
/* 16bit hole
* Warned : sk_incoming_cpu can be set from softirq,
* Do not use this hole without fully understanding possible issues.
*/
__u32 sk_txhash;
#ifdef CONFIG_NET_RX_BUSY_POLL
......@@ -373,7 +389,6 @@ struct sock {
#ifdef CONFIG_XFRM
struct xfrm_policy *sk_policy[2];
#endif
unsigned long sk_flags;
struct dst_entry *sk_rx_dst;
struct dst_entry __rcu *sk_dst_cache;
spinlock_t sk_dst_lock;
......
......@@ -988,6 +988,10 @@ set_rcvbuf:
sk->sk_max_pacing_rate);
break;
case SO_INCOMING_CPU:
sk->sk_incoming_cpu = val;
break;
default:
ret = -ENOPROTOOPT;
break;
......@@ -2379,6 +2383,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_max_pacing_rate = ~0U;
sk->sk_pacing_rate = ~0U;
sk->sk_incoming_cpu = -1;
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
......
......@@ -185,6 +185,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
return score;
}
......
......@@ -382,10 +382,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
}
/* Try to redo what tcp_v4_send_synack did. */
req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
tcp_select_initial_window(tcp_full_space(sk), req->mss,
&req->rcv_wnd, &req->window_clamp,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));
......
......@@ -6022,7 +6022,7 @@ static void tcp_openreq_init(struct request_sock *req,
{
struct inet_request_sock *ireq = inet_rsk(req);
req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
......
......@@ -803,7 +803,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
*/
tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
tcp_time_stamp,
req->ts_recent,
0,
......
......@@ -381,18 +381,18 @@ void tcp_openreq_init_rwin(struct request_sock *req,
window_clamp = READ_ONCE(tp->window_clamp);
/* Set this up on the first call only */
req->window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
/* limit the window selection if the user enforce a smaller rx buffer */
if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
(req->window_clamp > full_space || req->window_clamp == 0))
req->window_clamp = full_space;
(req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
req->rsk_window_clamp = full_space;
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
&req->rcv_wnd,
&req->window_clamp,
&req->rsk_rcv_wnd,
&req->rsk_window_clamp,
ireq->wscale_ok,
&rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
......@@ -512,9 +512,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
if (sysctl_tcp_fack)
tcp_enable_fack(newtp);
}
newtp->window_clamp = req->window_clamp;
newtp->rcv_ssthresh = req->rcv_wnd;
newtp->rcv_wnd = req->rcv_wnd;
newtp->window_clamp = req->rsk_window_clamp;
newtp->rcv_ssthresh = req->rsk_rcv_wnd;
newtp->rcv_wnd = req->rsk_rcv_wnd;
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
if (newtp->rx_opt.wscale_ok) {
newtp->rx_opt.snd_wscale = ireq->snd_wscale;
......@@ -707,7 +707,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
/* RFC793: "first check sequence number". */
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_ack(sk, skb, req);
......
......@@ -3023,7 +3023,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(min(req->rcv_wnd, 65535U));
th->window = htons(min(req->rsk_rcv_wnd, 65535U));
tcp_options_write((__be32 *)(th + 1), NULL, &opts);
th->doff = (tcp_header_size >> 2);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
......
......@@ -375,7 +375,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
return score;
}
......@@ -419,6 +420,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
score += 4;
}
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
return score;
}
......
......@@ -114,6 +114,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score++;
}
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
return score;
}
......
......@@ -235,9 +235,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
goto out_free;
}
req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
tcp_select_initial_window(tcp_full_space(sk), req->mss,
&req->rcv_wnd, &req->window_clamp,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
......
......@@ -931,7 +931,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
*/
tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
0, 0);
......
......@@ -182,10 +182,12 @@ static inline int compute_score(struct sock *sk, struct net *net,
score++;
}
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
return score;
}
#define SCORE2_MAX (1 + 1 + 1)
static inline int compute_score2(struct sock *sk, struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr,
......@@ -223,6 +225,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
score++;
}
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
return score;
}
......@@ -251,8 +256,7 @@ begin:
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
matches = 1;
} else if (score == SCORE2_MAX)
goto exact_match;
}
} else if (score == badness && reuseport) {
matches++;
if (reciprocal_scale(hash, matches) == 0)
......@@ -269,7 +273,6 @@ begin:
goto begin;
if (result) {
exact_match:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score2(result, net, saddr, sport,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment