Commit f5796684 authored by Jesse Gross's avatar Jesse Gross Committed by David S. Miller

openvswitch: Add support for Geneve tunneling.

The Openvswitch implementation is completely agnostic to the options
that are in use and can handle newly defined options without
further work. It does this by simply matching on a byte array
of options and allowing userspace to setup flows on this array.
Signed-off-by: default avatarJesse Gross <jesse@nicira.com>
Singed-off-by: default avatarAnsis Atteka <aatteka@nicira.com>
Signed-off-by: default avatarAndy Zhou <azhou@nicira.com>
Acked-by: default avatarThomas Graf <tgraf@noironetworks.com>
Acked-by: default avatarPravin B Shelar <pshelar@nicira.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 6b205b2c
......@@ -86,17 +86,18 @@ struct ip_tunnel {
struct gro_cells gro_cells;
};
#define TUNNEL_CSUM __cpu_to_be16(0x01)
#define TUNNEL_ROUTING __cpu_to_be16(0x02)
#define TUNNEL_KEY __cpu_to_be16(0x04)
#define TUNNEL_SEQ __cpu_to_be16(0x08)
#define TUNNEL_STRICT __cpu_to_be16(0x10)
#define TUNNEL_REC __cpu_to_be16(0x20)
#define TUNNEL_VERSION __cpu_to_be16(0x40)
#define TUNNEL_NO_KEY __cpu_to_be16(0x80)
#define TUNNEL_CSUM __cpu_to_be16(0x01)
#define TUNNEL_ROUTING __cpu_to_be16(0x02)
#define TUNNEL_KEY __cpu_to_be16(0x04)
#define TUNNEL_SEQ __cpu_to_be16(0x08)
#define TUNNEL_STRICT __cpu_to_be16(0x10)
#define TUNNEL_REC __cpu_to_be16(0x20)
#define TUNNEL_VERSION __cpu_to_be16(0x40)
#define TUNNEL_NO_KEY __cpu_to_be16(0x80)
#define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100)
#define TUNNEL_OAM __cpu_to_be16(0x0200)
#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400)
#define TUNNEL_OAM __cpu_to_be16(0x0200)
#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400)
#define TUNNEL_OPTIONS_PRESENT __cpu_to_be16(0x0800)
struct tnl_ptk_info {
__be16 flags;
......
......@@ -192,6 +192,7 @@ enum ovs_vport_type {
OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */
OVS_VPORT_TYPE_GRE, /* GRE tunnel. */
OVS_VPORT_TYPE_VXLAN, /* VXLAN tunnel. */
OVS_VPORT_TYPE_GENEVE, /* Geneve tunnel. */
__OVS_VPORT_TYPE_MAX
};
......@@ -310,6 +311,7 @@ enum ovs_tunnel_key_attr {
OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */
OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */
OVS_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */
OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, /* Array of Geneve options. */
__OVS_TUNNEL_KEY_ATTR_MAX
};
......
......@@ -54,3 +54,14 @@ config OPENVSWITCH_VXLAN
Say N to exclude this support and reduce the binary size.
If unsure, say Y.
config OPENVSWITCH_GENEVE
bool "Open vSwitch Geneve tunneling support"
depends on INET
depends on OPENVSWITCH
depends on GENEVE && !(OPENVSWITCH=y && GENEVE=m)
default y
---help---
If you say Y here, then the Open vSwitch will be able create geneve vport.
Say N to exclude this support and reduce the binary size.
......@@ -15,6 +15,10 @@ openvswitch-y := \
vport-internal_dev.o \
vport-netdev.o
ifneq ($(CONFIG_OPENVSWITCH_GENEVE),)
openvswitch-y += vport-geneve.o
endif
ifneq ($(CONFIG_OPENVSWITCH_VXLAN),)
openvswitch-y += vport-vxlan.o
endif
......
......@@ -370,6 +370,7 @@ static size_t key_attr_size(void)
+ nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
+ nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */
+ nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */
+ nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */
+ nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */
+ nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */
+ nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */
......@@ -556,10 +557,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
&flow->key, 0, &acts);
rcu_assign_pointer(flow->sf_acts, acts);
if (err)
goto err_flow_free;
rcu_assign_pointer(flow->sf_acts, acts);
OVS_CB(packet)->egress_tun_info = NULL;
OVS_CB(packet)->flow = flow;
packet->priority = flow->key.phy.priority;
packet->mark = flow->key.phy.skb_mark;
......
......@@ -448,6 +448,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
int error;
struct ethhdr *eth;
/* Flags are always used as part of stats */
key->tp.flags = 0;
skb_reset_mac_header(skb);
/* Link layer. We are guaranteed to have at least the 14 byte Ethernet
......@@ -646,10 +649,23 @@ int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info,
struct sk_buff *skb, struct sw_flow_key *key)
{
/* Extract metadata from packet. */
if (tun_info)
if (tun_info) {
memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key));
else
if (tun_info->options) {
BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) *
8)) - 1
> sizeof(key->tun_opts));
memcpy(GENEVE_OPTS(key, tun_info->options_len),
tun_info->options, tun_info->options_len);
key->tun_opts_len = tun_info->options_len;
} else {
key->tun_opts_len = 0;
}
} else {
key->tun_opts_len = 0;
memset(&key->tun_key, 0, sizeof(key->tun_key));
}
key->phy.priority = skb->priority;
key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
......
......@@ -51,11 +51,24 @@ struct ovs_key_ipv4_tunnel {
struct ovs_tunnel_info {
struct ovs_key_ipv4_tunnel tunnel;
struct geneve_opt *options;
u8 options_len;
};
/* Store options at the end of the array if they are less than the
* maximum size. This allows us to get the benefits of variable length
* matching for small options.
*/
#define GENEVE_OPTS(flow_key, opt_len) \
((struct geneve_opt *)((flow_key)->tun_opts + \
FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \
opt_len))
static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
const struct iphdr *iph,
__be64 tun_id, __be16 tun_flags)
__be64 tun_id, __be16 tun_flags,
struct geneve_opt *opts,
u8 opts_len)
{
tun_info->tunnel.tun_id = tun_id;
tun_info->tunnel.ipv4_src = iph->saddr;
......@@ -67,9 +80,14 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
/* clear struct padding. */
memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0,
sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE);
tun_info->options = opts;
tun_info->options_len = opts_len;
}
struct sw_flow_key {
u8 tun_opts[255];
u8 tun_opts_len;
struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */
struct {
u32 priority; /* Packet QoS priority. */
......
......@@ -42,6 +42,7 @@
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/rculist.h>
#include <net/geneve.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
......@@ -88,18 +89,20 @@ static void update_range__(struct sw_flow_match *match,
} \
} while (0)
#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
do { \
update_range__(match, offsetof(struct sw_flow_key, field), \
len, is_mask); \
if (is_mask) { \
if ((match)->mask) \
memcpy(&(match)->mask->key.field, value_p, len);\
} else { \
memcpy(&(match)->key->field, value_p, len); \
} \
#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \
do { \
update_range__(match, offset, len, is_mask); \
if (is_mask) \
memcpy((u8 *)&(match)->mask->key + offset, value_p, \
len); \
else \
memcpy((u8 *)(match)->key + offset, value_p, len); \
} while (0)
#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \
value_p, len, is_mask)
static u16 range_n_bytes(const struct sw_flow_key_range *range)
{
return range->end - range->start;
......@@ -335,6 +338,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
int rem;
bool ttl = false;
__be16 tun_flags = 0;
unsigned long opt_key_offset;
nla_for_each_nested(a, attr, rem) {
int type = nla_type(a);
......@@ -347,6 +351,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
[OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
[OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
[OVS_TUNNEL_KEY_ATTR_OAM] = 0,
[OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1,
};
if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
......@@ -355,7 +360,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
return -EINVAL;
}
if (ovs_tunnel_key_lens[type] != nla_len(a)) {
if (ovs_tunnel_key_lens[type] != nla_len(a) &&
ovs_tunnel_key_lens[type] != -1) {
OVS_NLERR("IPv4 tunnel attribute type has unexpected "
" length (type=%d, length=%d, expected=%d).\n",
type, nla_len(a), ovs_tunnel_key_lens[type]);
......@@ -394,7 +400,60 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
case OVS_TUNNEL_KEY_ATTR_OAM:
tun_flags |= TUNNEL_OAM;
break;
case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
tun_flags |= TUNNEL_OPTIONS_PRESENT;
if (nla_len(a) > sizeof(match->key->tun_opts)) {
OVS_NLERR("Geneve option length exceeds maximum size (len %d, max %zu).\n",
nla_len(a),
sizeof(match->key->tun_opts));
return -EINVAL;
}
if (nla_len(a) % 4 != 0) {
OVS_NLERR("Geneve option length is not a multiple of 4 (len %d).\n",
nla_len(a));
return -EINVAL;
}
/* We need to record the length of the options passed
* down, otherwise packets with the same format but
* additional options will be silently matched.
*/
if (!is_mask) {
SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a),
false);
} else {
/* This is somewhat unusual because it looks at
* both the key and mask while parsing the
* attributes (and by extension assumes the key
* is parsed first). Normally, we would verify
* that each is the correct length and that the
* attributes line up in the validate function.
* However, that is difficult because this is
* variable length and we won't have the
* information later.
*/
if (match->key->tun_opts_len != nla_len(a)) {
OVS_NLERR("Geneve option key length (%d) is different from mask length (%d).",
match->key->tun_opts_len,
nla_len(a));
return -EINVAL;
}
SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff,
true);
}
opt_key_offset = (unsigned long)GENEVE_OPTS(
(struct sw_flow_key *)0,
nla_len(a));
SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset,
nla_data(a), nla_len(a),
is_mask);
break;
default:
OVS_NLERR("Unknown IPv4 tunnel attribute (%d).\n",
type);
return -EINVAL;
}
}
......@@ -421,16 +480,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
return 0;
}
static int ipv4_tun_to_nlattr(struct sk_buff *skb,
const struct ovs_key_ipv4_tunnel *tun_key,
const struct ovs_key_ipv4_tunnel *output)
static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
const struct ovs_key_ipv4_tunnel *output,
const struct geneve_opt *tun_opts,
int swkey_tun_opts_len)
{
struct nlattr *nla;
nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
if (!nla)
return -EMSGSIZE;
if (output->tun_flags & TUNNEL_KEY &&
nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
return -EMSGSIZE;
......@@ -454,12 +508,35 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
if ((output->tun_flags & TUNNEL_OAM) &&
nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
return -EMSGSIZE;
if (tun_opts &&
nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
swkey_tun_opts_len, tun_opts))
return -EMSGSIZE;
nla_nest_end(skb, nla);
return 0;
}
static int ipv4_tun_to_nlattr(struct sk_buff *skb,
const struct ovs_key_ipv4_tunnel *output,
const struct geneve_opt *tun_opts,
int swkey_tun_opts_len)
{
struct nlattr *nla;
int err;
nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
if (!nla)
return -EMSGSIZE;
err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len);
if (err)
return err;
nla_nest_end(skb, nla);
return 0;
}
static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs,
const struct nlattr **a, bool is_mask)
{
......@@ -905,9 +982,16 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey,
if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
goto nla_put_failure;
if ((swkey->tun_key.ipv4_dst || is_mask) &&
ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
goto nla_put_failure;
if ((swkey->tun_key.ipv4_dst || is_mask)) {
const struct geneve_opt *opts = NULL;
if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT)
opts = GENEVE_OPTS(output, swkey->tun_opts_len);
if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts,
swkey->tun_opts_len))
goto nla_put_failure;
}
if (swkey->phy.in_port == DP_MAX_PORTS) {
if (is_mask && (output->phy.in_port == 0xffff))
......@@ -1290,17 +1374,55 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
if (err)
return err;
if (key.tun_opts_len) {
struct geneve_opt *option = GENEVE_OPTS(&key,
key.tun_opts_len);
int opts_len = key.tun_opts_len;
bool crit_opt = false;
while (opts_len > 0) {
int len;
if (opts_len < sizeof(*option))
return -EINVAL;
len = sizeof(*option) + option->length * 4;
if (len > opts_len)
return -EINVAL;
crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE);
option = (struct geneve_opt *)((u8 *)option + len);
opts_len -= len;
};
key.tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0;
};
start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
if (start < 0)
return start;
a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
sizeof(*tun_info));
sizeof(*tun_info) + key.tun_opts_len);
if (IS_ERR(a))
return PTR_ERR(a);
tun_info = nla_data(a);
tun_info->tunnel = key.tun_key;
tun_info->options_len = key.tun_opts_len;
if (tun_info->options_len) {
/* We need to store the options in the action itself since
* everything else will go away after flow setup. We can append
* it to tun_info and then point there.
*/
memcpy((tun_info + 1), GENEVE_OPTS(&key, key.tun_opts_len),
key.tun_opts_len);
tun_info->options = (struct geneve_opt *)(tun_info + 1);
} else {
tun_info->options = NULL;
}
add_nested_action_end(*sfa, start);
......@@ -1592,7 +1714,9 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
return -EMSGSIZE;
err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel,
nla_data(ovs_key));
tun_info->options_len ?
tun_info->options : NULL,
tun_info->options_len);
if (err)
return err;
nla_nest_end(skb, start);
......
/*
* Copyright (c) 2014 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/version.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/net.h>
#include <linux/rculist.h>
#include <linux/udp.h>
#include <linux/if_vlan.h>
#include <net/geneve.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/udp.h>
#include <net/xfrm.h>
#include "datapath.h"
#include "vport.h"
/**
* struct geneve_port - Keeps track of open UDP ports
* @sock: The socket created for this port number.
* @name: vport name.
*/
struct geneve_port {
struct geneve_sock *gs;
char name[IFNAMSIZ];
};
static LIST_HEAD(geneve_ports);
static inline struct geneve_port *geneve_vport(const struct vport *vport)
{
return vport_priv(vport);
}
static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
{
return (struct genevehdr *)(udp_hdr(skb) + 1);
}
/* Convert 64 bit tunnel ID to 24 bit VNI. */
static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
{
#ifdef __BIG_ENDIAN
vni[0] = (__force __u8)(tun_id >> 16);
vni[1] = (__force __u8)(tun_id >> 8);
vni[2] = (__force __u8)tun_id;
#else
vni[0] = (__force __u8)((__force u64)tun_id >> 40);
vni[1] = (__force __u8)((__force u64)tun_id >> 48);
vni[2] = (__force __u8)((__force u64)tun_id >> 56);
#endif
}
/* Convert 24 bit VNI to 64 bit tunnel ID. */
static __be64 vni_to_tunnel_id(__u8 *vni)
{
#ifdef __BIG_ENDIAN
return (vni[0] << 16) | (vni[1] << 8) | vni[2];
#else
return (__force __be64)(((__force u64)vni[0] << 40) |
((__force u64)vni[1] << 48) |
((__force u64)vni[2] << 56));
#endif
}
static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
{
struct vport *vport = gs->rcv_data;
struct genevehdr *geneveh = geneve_hdr(skb);
int opts_len;
struct ovs_tunnel_info tun_info;
__be64 key;
__be16 flags;
opts_len = geneveh->opt_len * 4;
flags = TUNNEL_KEY | TUNNEL_OPTIONS_PRESENT |
(udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) |
(geneveh->oam ? TUNNEL_OAM : 0) |
(geneveh->critical ? TUNNEL_CRIT_OPT : 0);
key = vni_to_tunnel_id(geneveh->vni);
ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, flags,
geneveh->options, opts_len);
ovs_vport_receive(vport, skb, &tun_info);
}
static int geneve_get_options(const struct vport *vport,
struct sk_buff *skb)
{
struct geneve_port *geneve_port = geneve_vport(vport);
__be16 sport;
sport = ntohs(inet_sk(geneve_port->gs->sock->sk)->inet_sport);
if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, sport))
return -EMSGSIZE;
return 0;
}
static void geneve_tnl_destroy(struct vport *vport)
{
struct geneve_port *geneve_port = geneve_vport(vport);
geneve_sock_release(geneve_port->gs);
ovs_vport_deferred_free(vport);
}
static struct vport *geneve_tnl_create(const struct vport_parms *parms)
{
struct net *net = ovs_dp_get_net(parms->dp);
struct nlattr *options = parms->options;
struct geneve_port *geneve_port;
struct geneve_sock *gs;
struct vport *vport;
struct nlattr *a;
int err;
u16 dst_port;
if (!options) {
err = -EINVAL;
goto error;
}
a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
if (a && nla_len(a) == sizeof(u16)) {
dst_port = nla_get_u16(a);
} else {
/* Require destination port from userspace. */
err = -EINVAL;
goto error;
}
vport = ovs_vport_alloc(sizeof(struct geneve_port),
&ovs_geneve_vport_ops, parms);
if (IS_ERR(vport))
return vport;
geneve_port = geneve_vport(vport);
strncpy(geneve_port->name, parms->name, IFNAMSIZ);
gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0);
if (IS_ERR(gs)) {
ovs_vport_free(vport);
return (void *)gs;
}
geneve_port->gs = gs;
return vport;
error:
return ERR_PTR(err);
}
static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
{
struct ovs_key_ipv4_tunnel *tun_key;
struct ovs_tunnel_info *tun_info;
struct net *net = ovs_dp_get_net(vport->dp);
struct geneve_port *geneve_port = geneve_vport(vport);
__be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport;
__be16 sport;
struct rtable *rt;