Commit dc3d807d authored by David S. Miller
Browse files

openvswitch: gre tunneling support.



Pravin B Shelar says:

====================
The following patch series adds support for GRE tunneling.
The first six patches extend the kernel gre and ip_tunnel module
APIs so that more code is shared between the gre modules
and ovs. The rest of the patches add the ovs tunneling infrastructure
and the GRE protocol vport.

V2 fixes two patches according to comments from Jesse.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents ac8025a6 aa310701
......@@ -1021,7 +1021,6 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
struct vxlan_dev *vxlan = netdev_priv(dev);
struct rtable *rt;
const struct iphdr *old_iph;
struct iphdr *iph;
struct vxlanhdr *vxh;
struct udphdr *uh;
struct flowi4 fl4;
......@@ -1030,6 +1029,7 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
u32 vni;
__be16 df = 0;
__u8 tos, ttl;
int err;
dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
vni = rdst->remote_vni;
......@@ -1097,13 +1097,6 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
vxlan_encap_bypass(skb, vxlan, dst_vxlan);
return NETDEV_TX_OK;
}
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
IPSKB_REROUTED);
skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
vxh->vx_flags = htonl(VXLAN_FLAGS);
vxh->vx_vni = htonl(vni << 8);
......@@ -1118,27 +1111,18 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
uh->len = htons(skb->len);
uh->check = 0;
__skb_push(skb, sizeof(*iph));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = sizeof(struct iphdr) >> 2;
iph->frag_off = df;
iph->protocol = IPPROTO_UDP;
iph->tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
iph->daddr = dst;
iph->saddr = fl4.saddr;
iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
tunnel_ip_select_ident(skb, old_iph, &rt->dst);
nf_reset(skb);
vxlan_set_owner(dev, skb);
if (handle_offloads(skb))
goto drop;
iptunnel_xmit(skb, dev);
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, dst,
IPPROTO_UDP, tos, ttl, df);
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
return NETDEV_TX_OK;
drop:
......
......@@ -7,6 +7,7 @@
#define GREPROTO_CISCO 0
#define GREPROTO_PPTP 1
#define GREPROTO_MAX 2
#define GRE_IP_PROTO_MAX 2
struct gre_protocol {
int (*handler)(struct sk_buff *skb);
......@@ -22,6 +23,32 @@ struct gre_base_hdr {
int gre_add_protocol(const struct gre_protocol *proto, u8 version);
int gre_del_protocol(const struct gre_protocol *proto, u8 version);
/*
 * Receiver description passed to gre_cisco_register() and
 * gre_cisco_unregister().
 *
 * @handler:     receive callback; takes the packet and its parsed tunnel
 *               info.
 * @err_handler: error callback; additionally takes a u32 info word.
 * @priority:    small integer identifying this receiver.
 */
struct gre_cisco_protocol {
int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi);
int (*err_handler)(struct sk_buff *skb, u32 info,
const struct tnl_ptk_info *tpi);
u8 priority;
};
int gre_cisco_register(struct gre_cisco_protocol *proto);
int gre_cisco_unregister(struct gre_cisco_protocol *proto);
void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
int hdr_len);
struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum);
/*
 * Compute the GRE header length in bytes for the given tunnel flags:
 * 4 bytes are always counted, plus 4 more for each of TUNNEL_CSUM,
 * TUNNEL_KEY and TUNNEL_SEQ that is set in @o_flags.
 */
static inline int ip_gre_calc_hlen(__be16 o_flags)
{
	int hlen = 4;

	hlen += (o_flags & TUNNEL_CSUM) ? 4 : 0;
	hlen += (o_flags & TUNNEL_KEY) ? 4 : 0;
	hlen += (o_flags & TUNNEL_SEQ) ? 4 : 0;

	return hlen;
}
static inline __be16 gre_flags_to_tnl_flags(__be16 flags)
{
__be16 tflags = 0;
......
......@@ -73,6 +73,7 @@ struct ip_tunnel {
#define TUNNEL_REC __cpu_to_be16(0x20)
#define TUNNEL_VERSION __cpu_to_be16(0x40)
#define TUNNEL_NO_KEY __cpu_to_be16(0x80)
#define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100)
struct tnl_ptk_info {
__be16 flags;
......@@ -155,23 +156,28 @@ static inline void tunnel_ip_select_ident(struct sk_buff *skb,
(skb_shinfo(skb)->gso_segs ?: 1) - 1);
}
static inline void iptunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
int err;
int pkt_len = skb->len - skb_transport_offset(skb);
struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats);
int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto);
int iptunnel_xmit(struct net *net, struct rtable *rt,
struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
__u8 tos, __u8 ttl, __be16 df);
nf_reset(skb);
static inline void iptunnel_xmit_stats(int err,
struct net_device_stats *err_stats,
struct pcpu_tstats __percpu *stats)
{
if (err > 0) {
struct pcpu_tstats *tstats = this_cpu_ptr(stats);
err = ip_local_out(skb);
if (likely(net_xmit_eval(err) == 0)) {
u64_stats_update_begin(&tstats->syncp);
tstats->tx_bytes += pkt_len;
tstats->tx_bytes += err;
tstats->tx_packets++;
u64_stats_update_end(&tstats->syncp);
} else if (err < 0) {
err_stats->tx_errors++;
err_stats->tx_aborted_errors++;
} else {
dev->stats.tx_errors++;
dev->stats.tx_aborted_errors++;
err_stats->tx_dropped++;
}
}
#endif /* __NET_IP_TUNNELS_H */
......@@ -164,6 +164,7 @@ enum ovs_vport_type {
OVS_VPORT_TYPE_UNSPEC,
OVS_VPORT_TYPE_NETDEV, /* network device */
OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */
OVS_VPORT_TYPE_GRE, /* GRE tunnel. */
__OVS_VPORT_TYPE_MAX
};
......@@ -246,11 +247,29 @@ enum ovs_key_attr {
OVS_KEY_ATTR_ARP, /* struct ovs_key_arp */
OVS_KEY_ATTR_ND, /* struct ovs_key_nd */
OVS_KEY_ATTR_SKB_MARK, /* u32 skb mark */
OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */
#ifdef __KERNEL__
OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */
#endif
__OVS_KEY_ATTR_MAX
};
#define OVS_KEY_ATTR_MAX (__OVS_KEY_ATTR_MAX - 1)
/*
 * Attribute types describing a tunnel key.  Presumably these are the
 * attributes nested inside OVS_KEY_ATTR_TUNNEL (its comment reads "Nested
 * set of ovs_tunnel attributes") -- confirm against the netlink parser.
 * __OVS_TUNNEL_KEY_ATTR_MAX is a sentinel; OVS_TUNNEL_KEY_ATTR_MAX is the
 * highest real attribute value.
 */
enum ovs_tunnel_key_attr {
OVS_TUNNEL_KEY_ATTR_ID, /* be64 Tunnel ID */
OVS_TUNNEL_KEY_ATTR_IPV4_SRC, /* be32 src IP address. */
OVS_TUNNEL_KEY_ATTR_IPV4_DST, /* be32 dst IP address. */
OVS_TUNNEL_KEY_ATTR_TOS, /* u8 Tunnel IP ToS. */
OVS_TUNNEL_KEY_ATTR_TTL, /* u8 Tunnel IP TTL. */
OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */
OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */
__OVS_TUNNEL_KEY_ATTR_MAX
};
#define OVS_TUNNEL_KEY_ATTR_MAX (__OVS_TUNNEL_KEY_ATTR_MAX - 1)
/**
* enum ovs_frag_type - IPv4 and IPv6 fragment type
* @OVS_FRAG_TYPE_NONE: Packet is not a fragment.
......
......@@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
inet_fragment.o ping.o
inet_fragment.o ping.o ip_tunnel_core.o
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
......
......@@ -13,6 +13,8 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/if.h>
#include <linux/icmp.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/skbuff.h>
......@@ -24,51 +26,270 @@
#include <net/protocol.h>
#include <net/gre.h>
#include <net/icmp.h>
#include <net/route.h>
#include <net/xfrm.h>
static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
static DEFINE_SPINLOCK(gre_proto_lock);
static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
/*
 * Install @proto as the handler stored in gre_proto[@version].
 *
 * NOTE(review): this text appears to come from a diff view whose +/-
 * markers were lost, so a spinlock-based body and a cmpxchg()-based body
 * are shown interleaved.  Read literally, the statements placed directly
 * after a return ("return -EINVAL;" and the final cmpxchg() return) are
 * unreachable.  Confirm which lines belong to which side of the patch
 * against the upstream commit before editing.
 */
int gre_add_protocol(const struct gre_protocol *proto, u8 version)
{
/* Reject versions that do not index into gre_proto[]. */
if (version >= GREPROTO_MAX)
goto err_out;
spin_lock(&gre_proto_lock);
/* Slot already occupied: refuse to overwrite it. */
if (gre_proto[version])
goto err_out_unlock;
RCU_INIT_POINTER(gre_proto[version], proto);
spin_unlock(&gre_proto_lock);
return 0;
return -EINVAL;
err_out_unlock:
spin_unlock(&gre_proto_lock);
err_out:
return -1;
return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ?
0 : -EBUSY;
}
EXPORT_SYMBOL_GPL(gre_add_protocol);
/*
 * Remove @proto from gre_proto[@version] if it is the handler currently
 * stored there.
 *
 * NOTE(review): this text appears to come from a diff view whose +/-
 * markers were lost, so a spinlock-based body and a cmpxchg()-based body
 * are shown interleaved.  Read literally, the "return -EINVAL;" that
 * follows spin_unlock() would be taken on the success path and everything
 * up to the err_out_unlock label would be unreachable.  Confirm which lines
 * belong to which side of the patch against the upstream commit.
 */
int gre_del_protocol(const struct gre_protocol *proto, u8 version)
{
int ret;
/* Reject versions that do not index into gre_proto[]. */
if (version >= GREPROTO_MAX)
goto err_out;
spin_lock(&gre_proto_lock);
/* Only the currently registered handler may be removed. */
if (rcu_dereference_protected(gre_proto[version],
lockdep_is_held(&gre_proto_lock)) != proto)
goto err_out_unlock;
RCU_INIT_POINTER(gre_proto[version], NULL);
spin_unlock(&gre_proto_lock);
return -EINVAL;
ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ?
0 : -EBUSY;
if (ret)
return ret;
synchronize_rcu();
return 0;
err_out_unlock:
spin_unlock(&gre_proto_lock);
err_out:
return -1;
}
EXPORT_SYMBOL_GPL(gre_del_protocol);
void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
int hdr_len)
{
struct gre_base_hdr *greh;
skb_push(skb, hdr_len);
greh = (struct gre_base_hdr *)skb->data;
greh->flags = tnl_flags_to_gre_flags(tpi->flags);
greh->protocol = tpi->proto;
if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
if (tpi->flags&TUNNEL_SEQ) {
*ptr = tpi->seq;
ptr--;
}
if (tpi->flags&TUNNEL_KEY) {
*ptr = tpi->key;
ptr--;
}
if (tpi->flags&TUNNEL_CSUM &&
!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) {
*ptr = 0;
*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
skb->len, 0));
}
}
}
EXPORT_SYMBOL_GPL(gre_build_header);
/*
 * Prepare @skb's offload state before GRE encapsulation.
 *
 * Marks the skb as encapsulated (resetting the inner headers the first
 * time), tags GSO packets with SKB_GSO_GRE, and resolves a pending partial
 * checksum when @gre_csum is set.
 *
 * Returns @skb on success.  On failure the skb is freed and an ERR_PTR()
 * carrying the error is returned.
 */
struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum)
{
	int rc;

	if (likely(!skb->encapsulation)) {
		skb_reset_inner_headers(skb);
		skb->encapsulation = 1;
	}

	if (skb_is_gso(skb)) {
		rc = skb_unclone(skb, GFP_ATOMIC);
		if (unlikely(rc))
			goto free_skb;
		skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
		return skb;
	}

	/* Anything other than a pending partial checksum is reset to NONE. */
	if (skb->ip_summed != CHECKSUM_PARTIAL) {
		skb->ip_summed = CHECKSUM_NONE;
		return skb;
	}

	/* CHECKSUM_PARTIAL: only resolved here when a GRE checksum is used. */
	if (gre_csum) {
		rc = skb_checksum_help(skb);
		if (unlikely(rc))
			goto free_skb;
	}
	return skb;

free_skb:
	kfree_skb(skb);
	return ERR_PTR(rc);
}
EXPORT_SYMBOL_GPL(gre_handle_offloads);
/*
 * Validate the checksum of @skb according to its ip_summed state.
 *
 * For CHECKSUM_COMPLETE the stored sum is folded first; if that is non-zero,
 * or the state is CHECKSUM_NONE, the checksum is recomputed with
 * __skb_checksum_complete() and the skb is marked CHECKSUM_COMPLETE.
 * Any other state returns 0 without touching the skb.
 *
 * Returns 0 when the checksum is accepted, the non-zero folded sum otherwise.
 */
static __sum16 check_checksum(struct sk_buff *skb)
{
	__sum16 folded = 0;

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		folded = csum_fold(skb->csum);
		if (!folded)
			return folded;
		/* Stored sum did not fold to zero: recompute below. */
	} else if (skb->ip_summed != CHECKSUM_NONE) {
		return folded;
	}

	skb->csum = 0;
	folded = __skb_checksum_complete(skb);
	skb->ip_summed = CHECKSUM_COMPLETE;

	return folded;
}
/*
 * Parse the GRE header located ip_hdrlen() bytes after @skb's network
 * header and record its contents in @tpi.
 *
 * @skb:      packet to parse.
 * @tpi:      receives flags and proto, then key and seq (0 when the
 *            corresponding GRE flag is absent).  key and seq are only
 *            written after the checksum check has passed.
 * @csum_err: set to true when the GRE checksum check fails; never cleared
 *            here, so the caller must initialise it.
 *
 * Returns -EINVAL when the header cannot be pulled, has version or routing
 * bits set, or fails the checksum; otherwise returns the result of
 * iptunnel_pull_header().
 */
static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
bool *csum_err)
{
unsigned int ip_hlen = ip_hdrlen(skb);
const struct gre_base_hdr *greh;
__be32 *options;
int hdr_len;
if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
return -EINVAL;
greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
/* Headers with version bits or the routing flag set are rejected. */
if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
tpi->flags = gre_flags_to_tnl_flags(greh->flags);
hdr_len = ip_gre_calc_hlen(tpi->flags);
if (!pskb_may_pull(skb, hdr_len))
return -EINVAL;
/*
 * The header pointer is derived again after the second pull --
 * presumably because pskb_may_pull() may relocate the header data;
 * TODO confirm.
 */
greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
tpi->proto = greh->protocol;
/* Optional words start right after the base header. */
options = (__be32 *)(greh + 1);
if (greh->flags & GRE_CSUM) {
if (check_checksum(skb)) {
*csum_err = true;
return -EINVAL;
}
/* Step over the checksum word. */
options++;
}
if (greh->flags & GRE_KEY) {
tpi->key = *options;
options++;
} else
tpi->key = 0;
if (unlikely(greh->flags & GRE_SEQ)) {
tpi->seq = *options;
options++;
} else
tpi->seq = 0;
/* WCCP version 1 and 2 protocol decoding.
* - Change protocol to IP
* - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
*/
if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
tpi->proto = htons(ETH_P_IP);
/*
 * High nibble != 4: presumably the payload does not start with an
 * IPv4 header, so 4 extra bytes are skipped -- verify against the
 * WCCP specification.
 */
if ((*(u8 *)options & 0xF0) != 0x40) {
hdr_len += 4;
if (!pskb_may_pull(skb, hdr_len))
return -EINVAL;
}
}
return iptunnel_pull_header(skb, hdr_len, tpi->proto);
}
/*
 * Receive path for the Cisco GRE handler slot.
 *
 * Parses the GRE header, then offers the packet to every registered
 * gre_cisco_protocol in ascending slot order until one returns PACKET_RCVD.
 * If nobody takes it, an ICMP port-unreachable is sent and the skb is freed.
 * A packet that fails parsing is freed without sending ICMP.
 *
 * Always returns 0.
 */
static int gre_cisco_rcv(struct sk_buff *skb)
{
	struct tnl_ptk_info tpi;
	bool csum_err = false;
	int slot;

	if (parse_gre_header(skb, &tpi, &csum_err) < 0)
		goto drop;

	rcu_read_lock();
	for (slot = 0; slot < GRE_IP_PROTO_MAX; slot++) {
		struct gre_cisco_protocol *rx =
			rcu_dereference(gre_cisco_proto_list[slot]);

		if (rx && rx->handler(skb, &tpi) == PACKET_RCVD) {
			rcu_read_unlock();
			return 0;
		}
	}
	rcu_read_unlock();

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
	kfree_skb(skb);
	return 0;
}
/*
 * ICMP error path for the Cisco GRE handler slot.
 *
 * @skb:  packet carrying the ICMP error.
 * @info: ICMP info word, forwarded to ipv4_update_pmtu() and to the
 *        registered error handlers.
 *
 * FRAG_NEEDED and REDIRECT messages are handled here directly; any other
 * error is offered to each registered gre_cisco_protocol that provides an
 * err_handler, in ascending slot order, until one returns PACKET_RCVD.
 */
static void gre_cisco_err(struct sk_buff *skb, u32 info)
{
	/* All the routers (except for Linux) return only
	 * 8 bytes of packet payload. It means, that precise relaying of
	 * ICMP in the real Internet is absolutely infeasible.
	 *
	 * Moreover, Cisco "wise men" put GRE key to the third word
	 * in GRE header. It makes impossible maintaining even soft
	 * state for keyed
	 * GRE tunnels with enabled checksum. Tell them "thank you".
	 *
	 * Well, I wonder, rfc1812 was written by Cisco employee,
	 * what the hell these idiots break standards established
	 * by themselves???
	 */

	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct tnl_ptk_info tpi;
	bool csum_err = false;
	int i;

	/*
	 * NOTE(review): when parsing stops on a checksum error, tpi.key and
	 * tpi.seq have not been written by parse_gre_header(), yet tpi is
	 * still handed to the error handlers below.
	 */
	if (parse_gre_header(skb, &tpi, &csum_err)) {
		if (!csum_err)		/* ignore csum errors. */
			return;
	}

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
			      IPPROTO_GRE, 0);
		return;
	}

	rcu_read_lock();
	for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
		struct gre_cisco_protocol *proto;

		proto = rcu_dereference(gre_cisco_proto_list[i]);
		/*
		 * gre_cisco_register() does not require err_handler to be
		 * set, so skip receivers that leave it NULL instead of
		 * calling through a NULL function pointer.
		 */
		if (!proto || !proto->err_handler)
			continue;

		if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
			break;
	}
	rcu_read_unlock();
}
static int gre_rcv(struct sk_buff *skb)
{
const struct gre_protocol *proto;
......@@ -220,27 +441,68 @@ static const struct net_offload gre_offload = {
},
};
/*
 * Handler pair that gre_init() installs for GREPROTO_CISCO and gre_exit()
 * removes again.
 */
static const struct gre_protocol ipgre_protocol = {
.handler = gre_cisco_rcv,
.err_handler = gre_cisco_err,
};
int gre_cisco_register(struct gre_cisco_protocol *newp)
{
struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
&gre_cisco_proto_list[newp->priority];
return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
}
EXPORT_SYMBOL_GPL(gre_cisco_register);
int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
{
struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
&gre_cisco_proto_list[del_proto->priority];
int ret;
ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
if (ret)
return ret;
synchronize_net();
return 0;
}
EXPORT_SYMBOL_GPL(gre_cisco_unregister);
/*
 * Module init: registers net_gre_protocol for IPPROTO_GRE, then
 * ipgre_protocol for GREPROTO_CISCO, then gre_offload for IPPROTO_GRE.
 * Returns 0 on success and -EAGAIN on any failure.
 *
 * NOTE(review): this text appears to come from a diff view whose +/-
 * markers were lost, so old direct-return error handling and new goto-based
 * unwinding are shown interleaved.  Read literally, "goto err;" and
 * "goto err_gso;" each follow a return and are unreachable, which would
 * leave ipgre_protocol registered when inet_add_offload() fails.  Confirm
 * which lines belong to which side of the patch against the upstream
 * commit.
 */
static int __init gre_init(void)
{
pr_info("GRE over IPv4 demultiplexor driver\n");
if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
pr_err("can't add protocol\n");
return -EAGAIN;
goto err;
}
if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
pr_info("%s: can't add ipgre handler\n", __func__);
goto err_gre;
}
if (inet_add_offload(&gre_offload, IPPROTO_GRE)) {
pr_err("can't add protocol offload\n");
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
return -EAGAIN;
goto err_gso;
}
return 0;
/* Unwind labels: undo the registrations in reverse order. */
err_gso:
gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
err_gre:
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
err:
return -EAGAIN;
}
/*
 * Module exit: removes the offload, the GREPROTO_CISCO handler and the
 * IPPROTO_GRE protocol, i.e. the reverse of the order gre_init() uses to
 * register them.
 */
static void __exit gre_exit(void)
{
inet_del_offload(&gre_offload, IPPROTO_GRE);
gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
}
......@@ -250,4 +512,3 @@ module_exit(gre_exit);
MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
MODULE_LICENSE("GPL");
......@@ -121,103 +121,8 @@ static int ipgre_tunnel_init(struct net_device *dev);
static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly;
/*
 * Validate the checksum of @skb according to its ip_summed state.
 *
 * For CHECKSUM_COMPLETE the stored sum is folded first; if that is non-zero,
 * or the state is CHECKSUM_NONE, the checksum is recomputed with
 * __skb_checksum_complete() and the skb is marked CHECKSUM_COMPLETE.
 * Any other state returns 0 without touching the skb.
 *
 * Returns 0 when the checksum is accepted, the non-zero folded sum otherwise.
 */
static __sum16 check_checksum(struct sk_buff *skb)
{
	__sum16 folded = 0;

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		folded = csum_fold(skb->csum);
		if (!folded)
			return folded;
		/* Stored sum did not fold to zero: recompute below. */
	} else if (skb->ip_summed != CHECKSUM_NONE) {
		return folded;
	}

	skb->csum = 0;
	folded = __skb_checksum_complete(skb);
	skb->ip_summed = CHECKSUM_COMPLETE;

	return folded;
}
static int ip_gre_calc_hlen(__be16 o_flags)
{
int addend = 4;
if (o_flags&TUNNEL_CSUM)
addend += 4;