Commit 1bed966c authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'tcp_fastopen_server'

Jerry Chu says:

This patch series provides the server (passive open) side code
for TCP Fast Open. Together with the earlier client side patches
it completes the TCP Fast Open implementation.

The server side Fast Open code accepts data carried in the SYN
packet with a valid Fast Open cookie, and passes it to the
application right away, allowing application to send back response
data, all before TCP's 3-way handshake finishes.

A simple cookie scheme together with capping the number of
outstanding TFO requests (still in TCP_SYN_RECV state) to a limit
per listener forms the main line of defense against spoofed SYN

For more details about TCP Fast Open see our IETF internet draft
and a research paper at

A prototype implementation was first developed by Sivasankar
Radhakrishnan (

A patch based on an older version of Linux kernel has been
undergoing internal tests at Google for the past few months.

Jerry Chu (3):
  tcp: TCP Fast Open Server - header & support functions
  tcp: TCP Fast Open Server - support TFO listeners
  tcp: TCP Fast Open Server - main code path
Signed-off-by: default avatarDavid S. Miller <>
parents 2a35cfa5 168a8f58
......@@ -467,16 +467,31 @@ tcp_syncookies - BOOLEAN
tcp_fastopen - INTEGER
Enable TCP Fast Open feature (draft-ietf-tcpm-fastopen) to send data
in the opening SYN packet. To use this feature, the client application
must not use connect(). Instead, it should use sendmsg() or sendto()
with MSG_FASTOPEN flag which performs a TCP handshake automatically.
The values (bitmap) are:
1: Enables sending data in the opening SYN on the client
5: Enables sending data in the opening SYN on the client regardless
of cookie availability.
must use sendmsg() or sendto() with MSG_FASTOPEN flag rather than
connect() to perform a TCP handshake automatically.
The values (bitmap) are
1: Enables sending data in the opening SYN on the client.
2: Enables TCP Fast Open on the server side, i.e., allowing data in
a SYN packet to be accepted and passed to the application before
3-way hand shake finishes.
4: Send data in the opening SYN regardless of cookie availability and
without a cookie option.
0x100: Accept SYN data w/o validating the cookie.
0x200: Accept data-in-SYN w/o any cookie option present.
0x400/0x800: Enable Fast Open on all listeners regardless of the
TCP_FASTOPEN socket option. The two different flags designate two
different ways of setting max_qlen without the TCP_FASTOPEN socket
Default: 0
Note that the client & server side Fast Open flags (1 and 2
respectively) must be also enabled before the rest of flags can take
See include/net/tcp.h and the code for more details.
tcp_syn_retries - INTEGER
Number of times initial SYNs for an active TCP connection attempt
will be retransmitted. Should not be higher than 255. Default value
......@@ -241,6 +241,10 @@ enum
......@@ -110,6 +110,7 @@ enum {
#define TCP_QUEUE_SEQ 21
#define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
struct tcp_repair_opt {
__u32 opt_code;
......@@ -246,6 +247,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
/* TCP Fast Open */
#define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */
/* TCP Fast Open Cookie as stored in memory */
struct tcp_fastopen_cookie {
......@@ -312,9 +314,14 @@ struct tcp_request_sock {
/* Only used by TCP MD5 Signature so far. */
const struct tcp_request_sock_ops *af_specific;
struct sock *listener; /* needed for TFO */
u32 rcv_isn;
u32 snt_isn;
u32 snt_synack; /* synack sent time */
u32 rcv_nxt; /* the ack # by SYNACK. For
* FastOpen it's the seq#
* after data-in-SYN.
static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
......@@ -505,14 +512,18 @@ struct tcp_sock {
struct tcp_md5sig_info __rcu *md5sig_info;
/* TCP fastopen related information */
struct tcp_fastopen_request *fastopen_req;
/* When the cookie options are generated and exchanged, then this
* object holds a reference to them (cookie_values->kref). Also
* contains related tcp_cookie_transactions fields.
struct tcp_cookie_values *cookie_values;
/* TCP fastopen related information */
struct tcp_fastopen_request *fastopen_req;
/* fastopen_rsk points to request_sock that resulted in this big
* socket. Used to retransmit SYNACKs etc.
struct request_sock *fastopen_rsk;
enum tsq_flags {
......@@ -552,6 +563,34 @@ static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
return (struct tcp_timewait_sock *)sk;
static inline bool tcp_passive_fastopen(const struct sock *sk)
return (sk->sk_state == TCP_SYN_RECV &&
tcp_sk(sk)->fastopen_rsk != NULL);
static inline bool fastopen_cookie_present(struct tcp_fastopen_cookie *foc)
return foc->len != -1;
static inline int fastopen_init_queue(struct sock *sk, int backlog)
struct request_sock_queue *queue =
if (queue->fastopenq == NULL) {
queue->fastopenq = kzalloc(
sizeof(struct fastopen_queue),
if (queue->fastopenq == NULL)
return -ENOMEM;
queue->fastopenq->max_qlen = backlog;
return 0;
#endif /* __KERNEL__ */
#endif /* _LINUX_TCP_H */
......@@ -106,6 +106,34 @@ struct listen_sock {
struct request_sock *syn_table[0];
* For a TCP Fast Open listener -
* lock - protects the access to all the reqsk, which is co-owned by
* the listener and the child socket.
* qlen - pending TFO requests (still in TCP_SYN_RECV).
* max_qlen - max TFO reqs allowed before TFO is disabled.
* XXX (TFO) - ideally these fields can be made as part of "listen_sock"
* structure above. But there is some implementation difficulty due to
* listen_sock being part of request_sock_queue hence will be freed when
* a listener is stopped. But TFO related fields may continue to be
* accessed even after a listener is closed, until its sk_refcnt drops
* to 0 implying no more outstanding TFO reqs. One solution is to keep
* listen_opt around until sk_refcnt drops to 0. But there is some other
* complexity that needs to be resolved. E.g., a listener can be disabled
* temporarily through shutdown()->tcp_disconnect(), and re-enabled later.
struct fastopen_queue {
struct request_sock *rskq_rst_head; /* Keep track of past TFO */
struct request_sock *rskq_rst_tail; /* requests that caused RST.
* This is part of the defense
* against spoofing attack.
spinlock_t lock;
int qlen; /* # of pending (TCP_SYN_RECV) reqs */
int max_qlen; /* != 0 iff TFO is currently enabled */
/** struct request_sock_queue - queue of request_socks
* @rskq_accept_head - FIFO head of established children
......@@ -129,6 +157,12 @@ struct request_sock_queue {
u8 rskq_defer_accept;
/* 3 bytes hole, try to pack */
struct listen_sock *listen_opt;
struct fastopen_queue *fastopenq; /* This is non-NULL iff TFO has been
* enabled on this listener. Check
* max_qlen != 0 in fastopen_queue
* to determine if TFO is enabled
* right at this moment.
extern int reqsk_queue_alloc(struct request_sock_queue *queue,
......@@ -136,6 +170,8 @@ extern int reqsk_queue_alloc(struct request_sock_queue *queue,
extern void __reqsk_queue_destroy(struct request_sock_queue *queue);
extern void reqsk_queue_destroy(struct request_sock_queue *queue);
extern void reqsk_fastopen_remove(struct sock *sk,
struct request_sock *req, bool reset);
static inline struct request_sock *
reqsk_queue_yank_acceptq(struct request_sock_queue *queue)
......@@ -190,19 +226,6 @@ static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue
return req;
static inline struct sock *reqsk_queue_get_child(struct request_sock_queue *queue,
struct sock *parent)
struct request_sock *req = reqsk_queue_remove(queue);
struct sock *child = req->sk;
WARN_ON(child == NULL);
return child;
static inline int reqsk_queue_removed(struct request_sock_queue *queue,
struct request_sock *req)
......@@ -224,8 +224,24 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Bit Flags for sysctl_tcp_fastopen */
#define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */
/* Process SYN data but skip cookie validation */
/* Accept SYN data w/o any cookie option */
/* Force enable TFO on all listeners, i.e., not requiring the
* TCP_FASTOPEN socket option. SOCKOPT1/2 determine how to set max_qlen.
#define TFO_SERVER_WO_SOCKOPT1 0x400
#define TFO_SERVER_WO_SOCKOPT2 0x800
/* Always create TFO child sockets on a TFO listener even when
* cookie/data not present. (For testing purpose!)
#define TFO_SERVER_ALWAYS 0x1000
extern struct inet_timewait_death_row tcp_death_row;
/* sysctl variables for tcp */
......@@ -408,7 +424,8 @@ extern enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *
const struct tcphdr *th);
extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb,
struct request_sock *req,
struct request_sock **prev);
struct request_sock **prev,
bool fastopen);
extern int tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb);
extern bool tcp_use_frto(struct sock *sk);
......@@ -421,12 +438,6 @@ extern void tcp_metrics_init(void);
extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check);
extern bool tcp_remember_stamp(struct sock *sk);
extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie,
int *syn_loss, unsigned long *last_syn_loss);
extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
struct tcp_fastopen_cookie *cookie,
bool syn_lost);
extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
extern void tcp_disable_fack(struct tcp_sock *tp);
extern void tcp_close(struct sock *sk, long timeout);
......@@ -468,7 +479,8 @@ extern int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
extern int tcp_connect(struct sock *sk);
extern struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct request_values *rvp);
struct request_values *rvp,
struct tcp_fastopen_cookie *foc);
extern int tcp_disconnect(struct sock *sk, int flags);
void tcp_connect_init(struct sock *sk);
......@@ -537,6 +549,7 @@ extern void tcp_send_delayed_ack(struct sock *sk);
extern void tcp_cwnd_application_limited(struct sock *sk);
extern void tcp_resume_early_retransmit(struct sock *sk);
extern void tcp_rearm_rto(struct sock *sk);
extern void tcp_reset(struct sock *sk);
/* tcp_timer.c */
extern void tcp_init_xmit_timers(struct sock *);
......@@ -586,6 +599,7 @@ extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
extern int tcp_mss_to_mtu(struct sock *sk, int mss);
extern void tcp_mtup_init(struct sock *sk);
extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
extern void tcp_init_buffer_space(struct sock *sk);
static inline void tcp_bound_rto(const struct sock *sk)
......@@ -1104,6 +1118,7 @@ static inline void tcp_openreq_init(struct request_sock *req,
req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
......@@ -1308,15 +1323,34 @@ extern int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff
extern int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
const struct tcp_md5sig_key *key);
/* From tcp_fastopen.c */
extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie,
int *syn_loss, unsigned long *last_syn_loss);
extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
struct tcp_fastopen_cookie *cookie,
bool syn_lost);
struct tcp_fastopen_request {
/* Fast Open cookie. Size 0 means a cookie request */
struct tcp_fastopen_cookie cookie;
struct msghdr *data; /* data in MSG_FASTOPEN */
u16 copied; /* queued in tcp_connect() */
void tcp_free_fastopen_req(struct tcp_sock *tp);
extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
int tcp_fastopen_reset_cipher(void *key, unsigned int len);
void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc);
/* Fastopen key context */
struct tcp_fastopen_context {
struct crypto_cipher __rcu *tfm;
struct rcu_head rcu;
/* write queue abstraction */
static inline void tcp_write_queue_purge(struct sock *sk)
......@@ -15,6 +15,7 @@
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/tcp.h>
#include <linux/vmalloc.h>
#include <net/request_sock.h>
......@@ -130,3 +131,97 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
* This function is called to set a Fast Open socket's "fastopen_rsk" field
* to NULL when a TFO socket no longer needs to access the request_sock.
* This happens only after 3WHS has been either completed or aborted (e.g.,
* RST is received).
* Before TFO, a child socket is created only after 3WHS is completed,
* hence it never needs to access the request_sock. things get a lot more
* complex with TFO. A child socket, accepted or not, has to access its
* request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
* until 3WHS is either completed or aborted. Afterwards the req will stay
* until either the child socket is accepted, or in the rare case when the
* listener is closed before the child is accepted.
* In short, a request socket is only freed after BOTH 3WHS has completed
* (or aborted) and the child socket has been accepted (or listener closed).
* When a child socket is accepted, its corresponding req->sk is set to
* NULL since it's no longer needed. More importantly, "req->sk == NULL"
* will be used by the code below to determine if a child socket has been
* accepted or not, and the check is protected by the fastopenq->lock
* described below.
* Note that fastopen_rsk is only accessed from the child socket's context
* with its socket lock held. But a request_sock (req) can be accessed by
* both its child socket through fastopen_rsk, and a listener socket through
* icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
* lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
* only in the rare case when both the listener and the child locks are held,
* e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
* The lock also protects other fields such as fastopenq->qlen, which is
* decremented by this function when fastopen_rsk is no longer needed.
* Note that another solution was to simply use the existing socket lock
* from the listener. But first socket lock is difficult to use. It is not
* a simple spin lock - one must consider sock_owned_by_user() and arrange
* to use sk_add_backlog() stuff. But what really makes it infeasible is the
* locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
* acquire a child's lock while holding listener's socket lock. A corner
* case might also exist in tcp_v4_hnd_req() that will trigger this locking
* order.
* When a TFO req is created, it needs to sock_hold its listener to prevent
* the latter data structure from going away.
* This function also sets "treq->listener" to NULL and unreference listener
* socket. treq->listener is used by the listener so it is protected by the
* fastopenq->lock in this function.
void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
bool reset)
struct sock *lsk = tcp_rsk(req)->listener;
struct fastopen_queue *fastopenq =
BUG_ON(!spin_is_locked(&sk->sk_lock.slock) && !sock_owned_by_user(sk));
tcp_sk(sk)->fastopen_rsk = NULL;
tcp_rsk(req)->listener = NULL;
if (req->sk) /* the child socket hasn't been accepted yet */
goto out;
if (!reset || lsk->sk_state != TCP_LISTEN) {
/* If the listener has been closed don't bother with the
* special RST handling below.
/* Wait for 60secs before removing a req that has triggered RST.
* This is a simple defense against TFO spoofing attack - by
* counting the req against fastopen.max_qlen, and disabling
* TFO when the qlen exceeds max_qlen.
* For more details see CoNext'11 "TCP Fast Open" paper.
req->expires = jiffies + 60*HZ;
if (fastopenq->rskq_rst_head == NULL)
fastopenq->rskq_rst_head = req;
fastopenq->rskq_rst_tail->dl_next = req;
req->dl_next = NULL;
fastopenq->rskq_rst_tail = req;
......@@ -149,6 +149,11 @@ void inet_sock_destruct(struct sock *sk)
pr_err("Attempt to release alive inet socket %p\n", sk);
if (sk->sk_type == SOCK_STREAM) {
struct fastopen_queue *fastopenq =
......@@ -212,6 +217,26 @@ int inet_listen(struct socket *sock, int backlog)
* we can only allow the backlog to be adjusted.
if (old_state != TCP_LISTEN) {
/* Check special setups for testing purpose to enable TFO w/o
* requiring TCP_FASTOPEN sockopt.
* Note that only TCP sockets (SOCK_STREAM) will reach here.
* Also fastopenq may already been allocated because this
* socket was in TCP_LISTEN state previously but was
* shutdown() (rather than close()).
if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
err = fastopen_init_queue(sk, backlog);
else if ((sysctl_tcp_fastopen &
err = fastopen_init_queue(sk,
((uint)sysctl_tcp_fastopen) >> 16);
err = 0;
if (err)
goto out;
err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
......@@ -701,7 +726,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
WARN_ON(!((1 << sk2->sk_state) &
sock_graft(sk2, newsock);
......@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct sock *newsk;
struct request_sock *req;
int error;
......@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
goto out_err;
/* Find already established connection */
if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
if (reqsk_queue_empty(queue)) {
long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
/* If this is a non blocking socket don't sleep */
......@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
if (error)
goto out_err;
newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
WARN_ON(newsk->sk_state == TCP_SYN_RECV);
req = reqsk_queue_remove(queue);
newsk = req->sk;
if (sk->sk_type == SOCK_STREAM && queue->fastopenq != NULL) {
if (tcp_rsk(req)->listener) {
/* We are still waiting for the final ACK from 3WHS
* so can't free req now. Instead, we set req->sk to
* NULL to signify that the child socket is taken
* so reqsk_fastopen_remove() will free the req
* when 3WHS finishes (or is aborted).
req->sk = NULL;
req = NULL;
if (req)
return newsk;
newsk = NULL;
req = NULL;
*err = error;
goto out;
......@@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
void inet_csk_listen_stop(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct request_sock *acc_req;
struct request_sock *req;
/* make all the listen_opt local to us */
acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
acc_req = reqsk_queue_yank_acceptq(queue);
/* Following specs, it would be better either to send FIN
* (and enter FIN-WAIT-1, it is normal close)
......@@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk)
* To be honest, we are not able to make either
* of the variants now. --ANK
while ((req = acc_req) != NULL) {
struct sock *child = req->sk;
......@@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk)
if (sk->sk_type == SOCK_STREAM && tcp_rsk(req)->listener) {
BUG_ON(tcp_sk(child)->fastopen_rsk != req);
BUG_ON(sk != tcp_rsk(req)->listener);
/* Paranoid, to prevent race condition if
* an inbound pkt destined for child is
* blocked by sock lock in tcp_v4_rcv().
* Also to satisfy an assertion in
* tcp_v4_destroy_sock().
tcp_sk(child)->fastopen_rsk = NULL;
......@@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk)
if (queue->fastopenq != NULL) {
/* Free all the reqs queued in rskq_rst_head. */
acc_req = queue->fastopenq->rskq_rst_head;
queue->fastopenq->rskq_rst_head = NULL;
while ((req = acc_req) != NULL) {
acc_req = req->dl_next;
......@@ -263,6 +263,10 @@ static const struct snmp_mib snmp4_net_list[] = {
......@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
ireq->tstamp_ok = tcp_opt.saw_tstamp;
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
treq->listener = NULL;