Commit 7f8a436e authored by Joe Stringer's avatar Joe Stringer Committed by David S. Miller

openvswitch: Add conntrack action

Expose the kernel connection tracker via OVS. Userspace components can
make use of the CT action to populate the connection state (ct_state)
field for a flow. This state can be subsequently matched.

Exposed connection states are OVS_CS_F_*:
- NEW (0x01) - Beginning of a new connection.
- ESTABLISHED (0x02) - Part of an existing connection.
- RELATED (0x04) - Related to an established connection.
- INVALID (0x20) - Could not track the connection for this packet.
- REPLY_DIR (0x40) - This packet is in the reply direction for the flow.
- TRACKED (0x80) - This packet has been sent through conntrack.

When the CT action is executed by itself, it will send the packet
through the connection tracker and populate the ct_state field with one
or more of the connection state flags above. The CT action will always
set the TRACKED bit.

When the COMMIT flag is passed to the conntrack action, this specifies
that information about the connection should be stored. This allows
subsequent packets for the same (or related) connections to be
correlated with this connection. Sending subsequent packets for the
connection through conntrack allows the connection tracker to consider
the packets as ESTABLISHED, RELATED, and/or REPLY_DIR.

The CT action may optionally take a zone to track the flow within. This
allows connections with the same 5-tuple to be kept logically separate
from connections in other zones. If the zone is specified, then the
"ct_zone" match field will be subsequently populated with the zone id.

IP fragments are handled by transparently assembling them as part of the
CT action. The maximum received unit (MRU) size is tracked so that
refragmentation can occur during output.

IP frag handling contributed by Andy Zhou.

Based on original design by Justin Pettit.
Signed-off-by: default avatarJoe Stringer <joestringer@nicira.com>
Signed-off-by: default avatarJustin Pettit <jpettit@nicira.com>
Signed-off-by: default avatarAndy Zhou <azhou@nicira.com>
Acked-by: default avatarThomas Graf <tgraf@suug.ch>
Acked-by: default avatarPravin B Shelar <pshelar@nicira.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent e79e2595
......@@ -164,6 +164,9 @@ enum ovs_packet_cmd {
* %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the
* output port is actually a tunnel port. Contains the output tunnel key
* extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
* @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
* %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
* size.
*
* These attributes follow the &struct ovs_header within the Generic Netlink
* payload for %OVS_PACKET_* commands.
......@@ -180,6 +183,7 @@ enum ovs_packet_attr {
OVS_PACKET_ATTR_UNUSED2,
OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe,
error logging should be suppressed. */
OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */
__OVS_PACKET_ATTR_MAX
};
......@@ -319,6 +323,8 @@ enum ovs_key_attr {
OVS_KEY_ATTR_MPLS, /* array of struct ovs_key_mpls.
* The implementation may restrict
* the accepted length of the array. */
OVS_KEY_ATTR_CT_STATE, /* u8 bitmask of OVS_CS_F_* */
OVS_KEY_ATTR_CT_ZONE, /* u16 connection tracking zone. */
#ifdef __KERNEL__
OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */
......@@ -431,6 +437,15 @@ struct ovs_key_nd {
__u8 nd_tll[ETH_ALEN];
};
/* OVS_KEY_ATTR_CT_STATE flags */
#define OVS_CS_F_NEW 0x01 /* Beginning of a new connection. */
#define OVS_CS_F_ESTABLISHED 0x02 /* Part of an existing connection. */
#define OVS_CS_F_RELATED 0x04 /* Related to an established
* connection. */
#define OVS_CS_F_INVALID 0x20 /* Could not track connection. */
#define OVS_CS_F_REPLY_DIR 0x40 /* Flow is in the reply direction. */
#define OVS_CS_F_TRACKED 0x80 /* Conntrack has occurred. */
/**
* enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
* @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
......@@ -594,6 +609,28 @@ struct ovs_action_hash {
uint32_t hash_basis;
};
/**
* enum ovs_ct_attr - Attributes for %OVS_ACTION_ATTR_CT action.
* @OVS_CT_ATTR_FLAGS: u32 connection tracking flags.
* @OVS_CT_ATTR_ZONE: u16 connection tracking zone.
*/
enum ovs_ct_attr {
OVS_CT_ATTR_UNSPEC,
OVS_CT_ATTR_FLAGS, /* u8 bitmask of OVS_CT_F_*. */
OVS_CT_ATTR_ZONE, /* u16 zone id. */
__OVS_CT_ATTR_MAX
};
#define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
/*
* OVS_CT_ATTR_FLAGS flags - bitmask of %OVS_CT_F_*
* @OVS_CT_F_COMMIT: Commits the flow to the conntrack table. This allows
* future packets for the same connection to be identified as 'established'
* or 'related'.
*/
#define OVS_CT_F_COMMIT 0x01
/**
* enum ovs_action_attr - Action types.
*
......@@ -623,6 +660,8 @@ struct ovs_action_hash {
* indicate the new packet contents. This could potentially still be
* %ETH_P_MPLS if the resulting MPLS label stack is not empty. If there
* is no MPLS label stack, as determined by ethertype, no action is taken.
* @OVS_ACTION_ATTR_CT: Track the connection. Populate the conntrack-related
* entries in the flow key.
*
* Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all
* fields within a header are modifiable, e.g. the IPv4 protocol and fragment
......@@ -648,6 +687,7 @@ enum ovs_action_attr {
* data immediately followed by a mask.
* The data must be zero for the unmasked
* bits. */
OVS_ACTION_ATTR_CT, /* One nested OVS_CT_ATTR_* . */
__OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted
* from userspace. */
......
......@@ -31,6 +31,17 @@ config OPENVSWITCH
If unsure, say N.
config OPENVSWITCH_CONNTRACK
bool "Open vSwitch conntrack action support"
depends on OPENVSWITCH
depends on NF_CONNTRACK
default OPENVSWITCH
---help---
If you say Y here, then Open vSwitch module will be able to pass
packets through conntrack.
Say N to exclude this support and reduce the binary size.
config OPENVSWITCH_GRE
tristate "Open vSwitch GRE tunneling support"
depends on OPENVSWITCH
......
......@@ -15,6 +15,8 @@ openvswitch-y := \
vport-internal_dev.o \
vport-netdev.o
openvswitch-$(CONFIG_OPENVSWITCH_CONNTRACK) += conntrack.o
obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o
obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o
......@@ -22,6 +22,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/openvswitch.h>
#include <linux/netfilter_ipv6.h>
#include <linux/sctp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
......@@ -29,6 +30,7 @@
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <net/dst.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/checksum.h>
......@@ -38,6 +40,7 @@
#include "datapath.h"
#include "flow.h"
#include "conntrack.h"
#include "vport.h"
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
......@@ -52,6 +55,20 @@ struct deferred_action {
struct sw_flow_key pkt_key;
};
#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN)
struct ovs_frag_data {
unsigned long dst;
struct vport *vport;
struct ovs_skb_cb cb;
__be16 inner_protocol;
__u16 vlan_tci;
__be16 vlan_proto;
unsigned int l2_len;
u8 l2_data[MAX_L2_LEN];
};
static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
#define DEFERRED_ACTION_FIFO_SIZE 10
struct action_fifo {
int head;
......@@ -602,14 +619,145 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
return 0;
}
static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
static int ovs_vport_output(struct sock *sock, struct sk_buff *skb)
{
struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage);
struct vport *vport = data->vport;
if (skb_cow_head(skb, data->l2_len) < 0) {
kfree_skb(skb);
return -ENOMEM;
}
__skb_dst_copy(skb, data->dst);
*OVS_CB(skb) = data->cb;
skb->inner_protocol = data->inner_protocol;
skb->vlan_tci = data->vlan_tci;
skb->vlan_proto = data->vlan_proto;
/* Reconstruct the MAC header. */
skb_push(skb, data->l2_len);
memcpy(skb->data, &data->l2_data, data->l2_len);
ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len);
skb_reset_mac_header(skb);
ovs_vport_send(vport, skb);
return 0;
}
static unsigned int
ovs_dst_get_mtu(const struct dst_entry *dst)
{
return dst->dev->mtu;
}
static struct dst_ops ovs_dst_ops = {
.family = AF_UNSPEC,
.mtu = ovs_dst_get_mtu,
};
/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is
* ovs_vport_output(), which is called once per fragmented packet.
*/
static void prepare_frag(struct vport *vport, struct sk_buff *skb)
{
unsigned int hlen = skb_network_offset(skb);
struct ovs_frag_data *data;
data = this_cpu_ptr(&ovs_frag_data_storage);
data->dst = skb->_skb_refdst;
data->vport = vport;
data->cb = *OVS_CB(skb);
data->inner_protocol = skb->inner_protocol;
data->vlan_tci = skb->vlan_tci;
data->vlan_proto = skb->vlan_proto;
data->l2_len = hlen;
memcpy(&data->l2_data, skb->data, hlen);
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
skb_pull(skb, hlen);
}
static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru,
__be16 ethertype)
{
if (skb_network_offset(skb) > MAX_L2_LEN) {
OVS_NLERR(1, "L2 header too long to fragment");
return;
}
if (ethertype == htons(ETH_P_IP)) {
struct dst_entry ovs_dst;
unsigned long orig_dst;
prepare_frag(vport, skb);
dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
DST_OBSOLETE_NONE, DST_NOCOUNT);
ovs_dst.dev = vport->dev;
orig_dst = skb->_skb_refdst;
skb_dst_set_noref(skb, &ovs_dst);
IPCB(skb)->frag_max_size = mru;
ip_do_fragment(skb->sk, skb, ovs_vport_output);
refdst_drop(orig_dst);
} else if (ethertype == htons(ETH_P_IPV6)) {
const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
unsigned long orig_dst;
struct rt6_info ovs_rt;
if (!v6ops) {
kfree_skb(skb);
return;
}
prepare_frag(vport, skb);
memset(&ovs_rt, 0, sizeof(ovs_rt));
dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
DST_OBSOLETE_NONE, DST_NOCOUNT);
ovs_rt.dst.dev = vport->dev;
orig_dst = skb->_skb_refdst;
skb_dst_set_noref(skb, &ovs_rt.dst);
IP6CB(skb)->frag_max_size = mru;
v6ops->fragment(skb->sk, skb, ovs_vport_output);
refdst_drop(orig_dst);
} else {
WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
ovs_vport_name(vport), ntohs(ethertype), mru,
vport->dev->mtu);
kfree_skb(skb);
}
}
static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
struct sw_flow_key *key)
{
struct vport *vport = ovs_vport_rcu(dp, out_port);
if (likely(vport))
ovs_vport_send(vport, skb);
else
if (likely(vport)) {
u16 mru = OVS_CB(skb)->mru;
if (likely(!mru || (skb->len <= mru + ETH_HLEN))) {
ovs_vport_send(vport, skb);
} else if (mru <= vport->dev->mtu) {
__be16 ethertype = key->eth.type;
if (!is_flow_key_valid(key)) {
if (eth_p_mpls(skb->protocol))
ethertype = skb->inner_protocol;
else
ethertype = vlan_get_protocol(skb);
}
ovs_fragment(vport, skb, mru, ethertype);
} else {
kfree_skb(skb);
}
} else {
kfree_skb(skb);
}
}
static int output_userspace(struct datapath *dp, struct sk_buff *skb,
......@@ -623,6 +771,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
memset(&upcall, 0, sizeof(upcall));
upcall.cmd = OVS_PACKET_CMD_ACTION;
upcall.mru = OVS_CB(skb)->mru;
for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
a = nla_next(a, &rem)) {
......@@ -816,6 +965,11 @@ static int execute_masked_set_action(struct sk_buff *skb,
err = set_mpls(skb, flow_key, nla_data(a), get_mask(a,
__be32 *));
break;
case OVS_KEY_ATTR_CT_STATE:
case OVS_KEY_ATTR_CT_ZONE:
err = -EINVAL;
break;
}
return err;
......@@ -885,7 +1039,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
if (out_skb)
do_output(dp, out_skb, prev_port);
do_output(dp, out_skb, prev_port, key);
prev_port = -1;
}
......@@ -942,6 +1096,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
case OVS_ACTION_ATTR_SAMPLE:
err = sample(dp, skb, key, a, attr, len);
break;
case OVS_ACTION_ATTR_CT:
err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
nla_data(a));
/* Hide stolen IP fragments from user space. */
if (err == -EINPROGRESS)
return 0;
break;
}
if (unlikely(err)) {
......@@ -951,7 +1114,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
}
if (prev_port != -1)
do_output(dp, skb, prev_port);
do_output(dp, skb, prev_port, key);
else
consume_skb(skb);
......
/*
* Copyright (c) 2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/module.h>
#include <linux/openvswitch.h>
#include <net/ip.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include "datapath.h"
#include "conntrack.h"
#include "flow.h"
#include "flow_netlink.h"
struct ovs_ct_len_tbl {
size_t maxlen;
size_t minlen;
};
/* Conntrack action context for execution. */
struct ovs_conntrack_info {
struct nf_conntrack_zone zone;
struct nf_conn *ct;
u32 flags;
u16 family;
};
static u16 key_to_nfproto(const struct sw_flow_key *key)
{
switch (ntohs(key->eth.type)) {
case ETH_P_IP:
return NFPROTO_IPV4;
case ETH_P_IPV6:
return NFPROTO_IPV6;
default:
return NFPROTO_UNSPEC;
}
}
/* Map SKB connection state into the values used by flow definition. */
static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
{
u8 ct_state = OVS_CS_F_TRACKED;
switch (ctinfo) {
case IP_CT_ESTABLISHED_REPLY:
case IP_CT_RELATED_REPLY:
case IP_CT_NEW_REPLY:
ct_state |= OVS_CS_F_REPLY_DIR;
break;
default:
break;
}
switch (ctinfo) {
case IP_CT_ESTABLISHED:
case IP_CT_ESTABLISHED_REPLY:
ct_state |= OVS_CS_F_ESTABLISHED;
break;
case IP_CT_RELATED:
case IP_CT_RELATED_REPLY:
ct_state |= OVS_CS_F_RELATED;
break;
case IP_CT_NEW:
case IP_CT_NEW_REPLY:
ct_state |= OVS_CS_F_NEW;
break;
default:
break;
}
return ct_state;
}
static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
const struct nf_conntrack_zone *zone)
{
key->ct.state = state;
key->ct.zone = zone->id;
}
/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
* previously sent the packet to conntrack via the ct action.
*/
static void ovs_ct_update_key(const struct sk_buff *skb,
struct sw_flow_key *key, bool post_ct)
{
const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
u8 state = 0;
ct = nf_ct_get(skb, &ctinfo);
if (ct) {
state = ovs_ct_get_state(ctinfo);
if (ct->master)
state |= OVS_CS_F_RELATED;
zone = nf_ct_zone(ct);
} else if (post_ct) {
state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
}
__ovs_ct_update_key(key, state, zone);
}
void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
{
ovs_ct_update_key(skb, key, false);
}
int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
{
if (nla_put_u8(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone))
return -EMSGSIZE;
return 0;
}
static int handle_fragments(struct net *net, struct sw_flow_key *key,
u16 zone, struct sk_buff *skb)
{
struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
if (key->eth.type == htons(ETH_P_IP)) {
enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
int err;
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
err = ip_defrag(skb, user);
if (err)
return err;
ovs_cb.mru = IPCB(skb)->frag_max_size;
} else if (key->eth.type == htons(ETH_P_IPV6)) {
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
struct sk_buff *reasm;
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
reasm = nf_ct_frag6_gather(skb, user);
if (!reasm)
return -EINPROGRESS;
if (skb == reasm)
return -EINVAL;
key->ip.proto = ipv6_hdr(reasm)->nexthdr;
skb_morph(skb, reasm);
consume_skb(reasm);
ovs_cb.mru = IP6CB(skb)->frag_max_size;
#else
return -EPFNOSUPPORT;
#endif
} else {
return -EPFNOSUPPORT;
}
key->ip.frag = OVS_FRAG_TYPE_NONE;
skb_clear_hash(skb);
skb->ignore_df = 1;
*OVS_CB(skb) = ovs_cb;
return 0;
}
static struct nf_conntrack_expect *
ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
u16 proto, const struct sk_buff *skb)
{
struct nf_conntrack_tuple tuple;
if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, &tuple))
return NULL;
return __nf_ct_expect_find(net, zone, &tuple);
}
/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
const struct ovs_conntrack_info *info)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
return false;
if (!net_eq(net, read_pnet(&ct->ct_net)))
return false;
if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct)))
return false;
return true;
}
static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
struct sk_buff *skb)
{
/* If we are recirculating packets to match on conntrack fields and
* committing with a separate conntrack action, then we don't need to
* actually run the packet through conntrack twice unless it's for a
* different zone.
*/
if (!skb_nfct_cached(net, skb, info)) {
struct nf_conn *tmpl = info->ct;
/* Associate skb with specified zone. */
if (tmpl) {
if (skb->nfct)
nf_conntrack_put(skb->nfct);
nf_conntrack_get(&tmpl->ct_general);
skb->nfct = &tmpl->ct_general;
skb->nfctinfo = IP_CT_NEW;
}
if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING,
skb) != NF_ACCEPT)
return -ENOENT;
}
return 0;
}
/* Lookup connection and read fields into key. */
static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
struct sk_buff *skb)
{
struct nf_conntrack_expect *exp;
exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
if (exp) {
u8 state;
state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
__ovs_ct_update_key(key, state, &info->zone);
} else {
int err;
err = __ovs_ct_lookup(net, key, info, skb);
if (err)
return err;
ovs_ct_update_key(skb, key, true);
}
return 0;
}