/*
 * Copyright (c) 2007-2014 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/time.h>
#include <linux/etherdevice.h>
#include <linux/genetlink.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/inetdevice.h>
#include <linux/list.h>
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

#include "datapath.h"
#include "flow.h"
#include "flow_table.h"
#include "flow_netlink.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"

int ovs_net_id __read_mostly;
EXPORT_SYMBOL_GPL(ovs_net_id);

static struct genl_family dp_packet_genl_family;
static struct genl_family dp_flow_genl_family;
static struct genl_family dp_datapath_genl_family;

static const struct genl_multicast_group ovs_dp_flow_multicast_group = {
	.name = OVS_FLOW_MCGROUP,
};

static const struct genl_multicast_group ovs_dp_datapath_multicast_group = {
	.name = OVS_DATAPATH_MCGROUP,
};

static const struct genl_multicast_group ovs_dp_vport_multicast_group = {
	.name = OVS_VPORT_MCGROUP,
};

/* Check if we need to build a reply message.
 * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */
static bool ovs_must_notify(struct genl_family *family, struct genl_info *info,
			    unsigned int group)
{
	return info->nlhdr->nlmsg_flags & NLM_F_ECHO ||
	       genl_has_listeners(family, genl_info_net(info)->genl_sock,
				  group);
}

static void ovs_notify(struct genl_family *family,
		       struct sk_buff *skb, struct genl_info *info)
{
	genl_notify(family, skb, genl_info_net(info), info->snd_portid,
		    0, info->nlhdr, GFP_KERNEL);
}

/**
 * DOC: Locking:
 *
 * Writes to device state (add/remove datapath or port, set operations on
 * vports, etc.) and writes to other state (flow table modifications,
 * setting miscellaneous datapath parameters, etc.) are protected by
 * ovs_lock.
 *
 * Reads are protected by RCU.
 *
 * There are a few special cases (mostly stats) that have their own
 * synchronization but they nest under all of the above and don't interact
 * with each other.
 *
 * The RTNL lock nests inside ovs_mutex.
 */

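/*
 * Usage sketch (illustrative only): writers take ovs_lock() around
 * modifications, readers rely on RCU, as in:
 *
 *	ovs_lock();
 *	dp = get_dp(net, dp_ifindex);
 *	if (dp)
 *		...modify datapath state under ovs_mutex...
 *	ovs_unlock();
 *
 *	rcu_read_lock();
 *	dp = get_dp_rcu(net, dp_ifindex);
 *	if (dp)
 *		...read-only use of dp...
 *	rcu_read_unlock();
 */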
static DEFINE_MUTEX(ovs_mutex);

void ovs_lock(void)
{
	mutex_lock(&ovs_mutex);
}

void ovs_unlock(void)
{
	mutex_unlock(&ovs_mutex);
}

#ifdef CONFIG_LOCKDEP
int lockdep_ovsl_is_held(void)
{
	if (debug_locks)
		return lockdep_is_held(&ovs_mutex);
	else
		return 1;
}
EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
#endif

static struct vport *new_vport(const struct vport_parms *);
static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
			     const struct sw_flow_key *,
			     const struct dp_upcall_info *);
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
				  const struct sw_flow_key *,
				  const struct dp_upcall_info *);

/* Must be called with rcu_read_lock. */
static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
{
	struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);

	if (dev) {
		struct vport *vport = ovs_internal_dev_get_vport(dev);
		if (vport)
			return vport->dp;
	}

	return NULL;
}

/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
 * returned dp pointer valid.
 */
static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
{
	struct datapath *dp;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
	rcu_read_lock();
	dp = get_dp_rcu(net, dp_ifindex);
	rcu_read_unlock();

	return dp;
}

/* Must be called with rcu_read_lock or ovs_mutex. */
const char *ovs_dp_name(const struct datapath *dp)
{
	struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
	return vport->ops->get_name(vport);
}

/* Return the ifindex of the local port's netdev, or 0 if there is none. */
static int get_dpifindex(const struct datapath *dp)
{
	struct vport *local;
	int ifindex;

	rcu_read_lock();

	local = ovs_vport_rcu(dp, OVSP_LOCAL);
	if (local)
		ifindex = netdev_vport_priv(local)->dev->ifindex;
	else
		ifindex = 0;

	rcu_read_unlock();

	return ifindex;
}

static void destroy_dp_rcu(struct rcu_head *rcu)
{
	struct datapath *dp = container_of(rcu, struct datapath, rcu);

	ovs_flow_tbl_destroy(&dp->table);
	free_percpu(dp->stats_percpu);
	release_net(ovs_dp_get_net(dp));
	kfree(dp->ports);
	kfree(dp);
}

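/* Bucket selection assumes DP_VPORT_HASH_BUCKETS is a power of two, so
 * masking with (DP_VPORT_HASH_BUCKETS - 1) picks a bucket without a modulo.
 */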
static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
					    u16 port_no)
{
	return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
}

/* Called with ovs_mutex or RCU read lock. */
struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
{
	struct vport *vport;
	struct hlist_head *head;

	head = vport_hash_bucket(dp, port_no);
	hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
		if (vport->port_no == port_no)
			return vport;
	}
	return NULL;
}

/* Called with ovs_mutex. */
static struct vport *new_vport(const struct vport_parms *parms)
{
	struct vport *vport;

	vport = ovs_vport_add(parms);
	if (!IS_ERR(vport)) {
		struct datapath *dp = parms->dp;
		struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);

		hlist_add_head_rcu(&vport->dp_hash_node, head);
	}
	return vport;
}

void ovs_dp_detach_port(struct vport *p)
{
	ASSERT_OVSL();

	/* First drop references to device. */
	hlist_del_rcu(&p->dp_hash_node);

	/* Then destroy it. */
	ovs_vport_del(p);
}

/* Must be called with rcu_read_lock. */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
	const struct vport *p = OVS_CB(skb)->input_vport;
	struct datapath *dp = p->dp;
	struct sw_flow *flow;
	struct sw_flow_actions *sf_acts;
	struct dp_stats_percpu *stats;
	u64 *stats_counter;
	u32 n_mask_hit;

	stats = this_cpu_ptr(dp->stats_percpu);

	/* Look up flow. */
	flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit);
	if (unlikely(!flow)) {
		struct dp_upcall_info upcall;
		int error;

		upcall.cmd = OVS_PACKET_CMD_MISS;
		upcall.userdata = NULL;
		upcall.portid = ovs_vport_find_upcall_portid(p, skb);
		upcall.egress_tun_info = NULL;
		error = ovs_dp_upcall(dp, skb, key, &upcall);
		if (unlikely(error))
			kfree_skb(skb);
		else
			consume_skb(skb);
		stats_counter = &stats->n_missed;
		goto out;
	}

	ovs_flow_stats_update(flow, key->tp.flags, skb);
	sf_acts = rcu_dereference(flow->sf_acts);
	ovs_execute_actions(dp, skb, sf_acts, key);

	stats_counter = &stats->n_hit;

out:
	/* Update datapath statistics. */
	u64_stats_update_begin(&stats->syncp);
	(*stats_counter)++;
	stats->n_mask_hit += n_mask_hit;
	u64_stats_update_end(&stats->syncp);
}

int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
		  const struct sw_flow_key *key,
		  const struct dp_upcall_info *upcall_info)
{
	struct dp_stats_percpu *stats;
	int err;

	if (upcall_info->portid == 0) {
		err = -ENOTCONN;
		goto err;
	}

	if (!skb_is_gso(skb))
		err = queue_userspace_packet(dp, skb, key, upcall_info);
	else
		err = queue_gso_packets(dp, skb, key, upcall_info);
	if (err)
		goto err;

	return 0;

err:
	stats = this_cpu_ptr(dp->stats_percpu);

	u64_stats_update_begin(&stats->syncp);
	stats->n_lost++;
	u64_stats_update_end(&stats->syncp);

	return err;
}
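
/* queue_gso_packets() below segments a GSO skb with __skb_gso_segment() and
 * queues each resulting segment to userspace individually; for UDP GSO the
 * flow key of every segment after the first is rewritten as a later fragment.
 */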

static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
			     const struct sw_flow_key *key,
			     const struct dp_upcall_info *upcall_info)
{
	unsigned short gso_type = skb_shinfo(skb)->gso_type;
	struct sw_flow_key later_key;
	struct sk_buff *segs, *nskb;
	struct ovs_skb_cb ovs_cb;
	int err;

	ovs_cb = *OVS_CB(skb);
	segs = __skb_gso_segment(skb, NETIF_F_SG, false);
	*OVS_CB(skb) = ovs_cb;
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (segs == NULL)
		return -EINVAL;

	if (gso_type & SKB_GSO_UDP) {
		/* The initial flow key extracted by ovs_flow_key_extract()
		 * in this case is for a first fragment, so we need to
		 * properly mark later fragments.
		 */
		later_key = *key;
		later_key.ip.frag = OVS_FRAG_TYPE_LATER;
	}

	/* Queue all of the segments. */
	skb = segs;
	do {
		*OVS_CB(skb) = ovs_cb;
		if (gso_type & SKB_GSO_UDP && skb != segs)
			key = &later_key;

		err = queue_userspace_packet(dp, skb, key, upcall_info);
		if (err)
			break;

	} while ((skb = skb->next));

	/* Free all of the segments. */
	skb = segs;
	do {
		nskb = skb->next;
		if (err)
			kfree_skb(skb);
		else
			consume_skb(skb);
	} while ((skb = nskb));
	return err;
}

static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
			      unsigned int hdrlen)
{
	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
		+ nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */

	/* OVS_PACKET_ATTR_USERDATA */
	if (upcall_info->userdata)
		size += NLA_ALIGN(upcall_info->userdata->nla_len);

	/* OVS_PACKET_ATTR_EGRESS_TUN_KEY */
	if (upcall_info->egress_tun_info)
		size += nla_total_size(ovs_tun_key_attr_size());

	return size;
}
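
/*
 * Rough layout of the upcall message that queue_userspace_packet() builds
 * (attribute order as emitted below; USERDATA and EGRESS_TUN_KEY are only
 * present when supplied in the upcall info):
 *
 *	struct ovs_header               dp_ifindex
 *	OVS_PACKET_ATTR_KEY             nested flow key
 *	OVS_PACKET_ATTR_USERDATA        opaque userdata, if any
 *	OVS_PACKET_ATTR_EGRESS_TUN_KEY  nested egress tunnel key, if any
 *	OVS_PACKET_ATTR_PACKET          packet data (zerocopied when allowed)
 */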

static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
				  const struct sw_flow_key *key,
				  const struct dp_upcall_info *upcall_info)
{
	struct ovs_header *upcall;
	struct sk_buff *nskb = NULL;
	struct sk_buff *user_skb = NULL; /* to be queued to userspace */
	struct nlattr *nla;
	struct genl_info info = {
		.dst_sk = ovs_dp_get_net(dp)->genl_sock,
		.snd_portid = upcall_info->portid,
	};
	size_t len;
	unsigned int hlen;
	int err, dp_ifindex;

	dp_ifindex = get_dpifindex(dp);
	if (!dp_ifindex)
		return -ENODEV;

	if (vlan_tx_tag_present(skb)) {
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return -ENOMEM;

		nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto,
						 vlan_tx_tag_get(nskb));
		if (!nskb)
			return -ENOMEM;

		nskb->vlan_tci = 0;
		skb = nskb;
	}

	if (nla_attr_size(skb->len) > USHRT_MAX) {
		err = -EFBIG;
		goto out;
	}

	/* Complete checksum if needed */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto out;

	/* Older versions of OVS user space enforce alignment of the last
	 * Netlink attribute to NLA_ALIGNTO which would require extensive
	 * padding logic. Only perform zerocopy if padding is not required.
	 */
	if (dp->user_features & OVS_DP_F_UNALIGNED)
		hlen = skb_zerocopy_headlen(skb);
	else
		hlen = skb->len;

	len = upcall_msg_size(upcall_info, hlen);
	user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
	if (!user_skb) {
		err = -ENOMEM;
		goto out;
	}

	upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
			     0, upcall_info->cmd);
	upcall->dp_ifindex = dp_ifindex;

	nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
	err = ovs_nla_put_flow(key, key, user_skb);
	BUG_ON(err);
	nla_nest_end(user_skb, nla);

	if (upcall_info->userdata)
		__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
			  nla_len(upcall_info->userdata),
			  nla_data(upcall_info->userdata));

	if (upcall_info->egress_tun_info) {
		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
		err = ovs_nla_put_egress_tunnel_key(user_skb,
						    upcall_info->egress_tun_info);
		BUG_ON(err);
		nla_nest_end(user_skb, nla);
	}

	/* Only reserve room for attribute header, packet data is added
	 * in skb_zerocopy() */
	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
		err = -ENOBUFS;
		goto out;
	}
	nla->nla_len = nla_attr_size(skb->len);

	err = skb_zerocopy(user_skb, skb, skb->len, hlen);
	if (err)
		goto out;

	/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
	if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
		size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len;

		if (plen > 0)
			memset(skb_put(user_skb, plen), 0, plen);
	}

	((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;

	err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
	user_skb = NULL;
out:
	if (err)
		skb_tx_error(skb);
	kfree_skb(user_skb);
	kfree_skb(nskb);
	return err;
}

static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
{
	struct ovs_header *ovs_header = info->userhdr;
	struct nlattr **a = info->attrs;
	struct sw_flow_actions *acts;
	struct sk_buff *packet;
	struct sw_flow *flow;
	struct sw_flow_actions *sf_acts;
	struct datapath *dp;
	struct ethhdr *eth;
	struct vport *input_vport;
	int len;
	int err;
	bool log = !a[OVS_FLOW_ATTR_PROBE];

	err = -EINVAL;
	if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
	    !a[OVS_PACKET_ATTR_ACTIONS])
		goto err;

	len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
	packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
	err = -ENOMEM;
	if (!packet)
		goto err;
	skb_reserve(packet, NET_IP_ALIGN);

	nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);

	skb_reset_mac_header(packet);
	eth = eth_hdr(packet);

	/* Normally, setting the skb 'protocol' field would be handled by a
	 * call to eth_type_trans(), but it assumes there's a sending
	 * device, which we may not have. */
	if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN)
		packet->protocol = eth->h_proto;
	else
		packet->protocol = htons(ETH_P_802_2);

	/* Build an sw_flow for sending this packet. */
	flow = ovs_flow_alloc();
	err = PTR_ERR(flow);
	if (IS_ERR(flow))
		goto err_kfree_skb;

	err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet,
					     &flow->key, log);
	if (err)
		goto err_flow_free;

	err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
				   &flow->key, &acts, log);
	if (err)
		goto err_flow_free;

	rcu_assign_pointer(flow->sf_acts, acts);
	OVS_CB(packet)->egress_tun_info = NULL;
	packet->priority = flow->key.phy.priority;
	packet->mark = flow->key.phy.skb_mark;

	rcu_read_lock();
	dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
	err = -ENODEV;
	if (!dp)
		goto err_unlock;

	input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port);
	if (!input_vport)
		input_vport = ovs_vport_rcu(dp, OVSP_LOCAL);

	if (!input_vport)
		goto err_unlock;

	OVS_CB(packet)->input_vport = input_vport;
	sf_acts = rcu_dereference(flow->sf_acts);

	local_bh_disable();
	err = ovs_execute_actions(dp, packet, sf_acts, &flow->key);
	local_bh_enable();
	rcu_read_unlock();

	ovs_flow_free(flow, false);
	return err;

err_unlock:
	rcu_read_unlock();
err_flow_free:
	ovs_flow_free(flow, false);
err_kfree_skb:
	kfree_skb(packet);
err:
	return err;
}

static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
	[OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN },
	[OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
	[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
};

static const struct genl_ops dp_packet_genl_ops[] = {
	{ .cmd = OVS_PACKET_CMD_EXECUTE,
	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
	  .policy = packet_policy,
	  .doit = ovs_packet_cmd_execute
	}
};

static struct genl_family dp_packet_genl_family = {
	.id = GENL_ID_GENERATE,
	.hdrsize = sizeof(struct ovs_header),
	.name = OVS_PACKET_FAMILY,
	.version = OVS_PACKET_VERSION,
	.maxattr = OVS_PACKET_ATTR_MAX,
	.netnsok = true,
	.parallel_ops = true,
	.ops = dp_packet_genl_ops,
	.n_ops = ARRAY_SIZE(dp_packet_genl_ops),
};
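
/* get_dp_stats() below sums the per-CPU counters into one ovs_dp_stats /
 * ovs_dp_megaflow_stats snapshot, using the u64_stats sequence counter to
 * read each CPU's block consistently.
 */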

static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
			 struct ovs_dp_megaflow_stats *mega_stats)
{
	int i;

	memset(mega_stats, 0, sizeof(*mega_stats));

	stats->n_flows = ovs_flow_tbl_count(&dp->table);
	mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);

	stats->n_hit = stats->n_missed = stats->n_lost = 0;

	for_each_possible_cpu(i) {
		const struct dp_stats_percpu *percpu_stats;
		struct dp_stats_percpu local_stats;
		unsigned int start;

		percpu_stats = per_cpu_ptr(dp->stats_percpu, i);

		do {
			start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
			local_stats = *percpu_stats;
		} while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));

		stats->n_hit += local_stats.n_hit;
		stats->n_missed += local_stats.n_missed;
		stats->n_lost += local_stats.n_lost;
		mega_stats->n_mask_hit += local_stats.n_mask_hit;
	}
}

static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
{
	return NLMSG_ALIGN(sizeof(struct ovs_header))
		+ nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_KEY */
		+ nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_MASK */
		+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
		+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
		+ nla_total_size(8) /* OVS_FLOW_ATTR_USED */
		+ nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_match(const struct sw_flow *flow,
				   struct sk_buff *skb)
{
	struct nlattr *nla;
	int err;

	/* Fill flow key. */
	nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
	if (!nla)
		return -EMSGSIZE;

	err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb);
	if (err)
		return err;

	nla_nest_end(skb, nla);

	/* Fill flow mask. */
	nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK);
	if (!nla)
		return -EMSGSIZE;

	err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb);
	if (err)
		return err;

	nla_nest_end(skb, nla);
	return 0;
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
				   struct sk_buff *skb)
{
	struct ovs_flow_stats stats;
	__be16 tcp_flags;
	unsigned long used;

	ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);

	if (used &&
	    nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)))
		return -EMSGSIZE;

	if (stats.n_packets &&
	    nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats))
		return -EMSGSIZE;

	if ((u8)ntohs(tcp_flags) &&
	     nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags)))
		return -EMSGSIZE;

	return 0;
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
				     struct sk_buff *skb, int skb_orig_len)
{
	struct nlattr *start;
	int err;

	/* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
	 * this is the first flow to be dumped into 'skb'.  This is unusual for
	 * Netlink but individual action lists can be longer than
	 * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
	 * The userspace caller can always fetch the actions separately if it
	 * really wants them.  (Most userspace callers in fact don't care.)
	 *
	 * This can only fail for dump operations because the skb is always
	 * properly sized for single flows.
	 */
	start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS);
	if (start) {
		const struct sw_flow_actions *sf_acts;

		sf_acts = rcu_dereference_ovsl(flow->sf_acts);
		err = ovs_nla_put_actions(sf_acts->actions,
					  sf_acts->actions_len, skb);

		if (!err)
			nla_nest_end(skb, start);
		else {
			if (skb_orig_len)
				return err;

			nla_nest_cancel(skb, start);
		}
	} else if (skb_orig_len) {
		return -EMSGSIZE;
	}

	return 0;
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
				  struct sk_buff *skb, u32 portid,
				  u32 seq, u32 flags, u8 cmd)
{
	const int skb_orig_len = skb->len;
	struct ovs_header *ovs_header;
	int err;

	ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family,
				 flags, cmd);
	if (!ovs_header)
		return -EMSGSIZE;

	ovs_header->dp_ifindex = dp_ifindex;

	err = ovs_flow_cmd_fill_match(flow, skb);
	if (err)
		goto error;

	err = ovs_flow_cmd_fill_stats(flow, skb);
	if (err)
		goto error;

	err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len);
	if (err)
		goto error;

	return genlmsg_end(skb, ovs_header);

error:
	genlmsg_cancel(skb, ovs_header);
	return err;
}

/* May not be called with RCU read lock. */
static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts,
					       struct genl_info *info,
					       bool always)
{
	struct sk_buff *skb;

	if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0))
		return NULL;

	skb = genlmsg_new_unicast(ovs_flow_cmd_msg_size(acts), info, GFP_KERNEL);
	if (!skb)
		return ERR_PTR(-ENOMEM);

	return skb;
}

/* Called with ovs_mutex. */
static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
					       int dp_ifindex,
					       struct genl_info *info, u8 cmd,
					       bool always)
{
	struct sk_buff *skb;
	int retval;

	skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), info,
				      always);
	if (IS_ERR_OR_NULL(skb))
		return skb;

	retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
					info->snd_portid, info->snd_seq, 0,
					cmd);
	BUG_ON(retval < 0);
	return skb;
}

static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
	struct nlattr **a = info->attrs;
	struct ovs_header *ovs_header = info->userhdr;
	struct sw_flow *flow, *new_flow;
	struct sw_flow_mask mask;
	struct sk_buff *reply;
	struct datapath *dp;
	struct sw_flow_actions *acts;
	struct sw_flow_match match;
	int error;
	bool log = !a[OVS_FLOW_ATTR_PROBE];

	/* Must have key and actions. */
	error = -EINVAL;
	if (!a[OVS_FLOW_ATTR_KEY]) {
		OVS_NLERR(log, "Flow key attr not present in new flow.");
		goto error;
	}
	if (!a[OVS_FLOW_ATTR_ACTIONS]) {
		OVS_NLERR(log, "Flow actions attr not present in new flow.");
		goto error;
	}

	/* Most of the time we need to allocate a new flow, do it before
	 * locking.
	 */
	new_flow = ovs_flow_alloc();
	if (IS_ERR(new_flow)) {
		error = PTR_ERR(new_flow);
		goto error;
	}

	/* Extract key. */
	ovs_match_init(&match, &new_flow->unmasked_key, &mask);
	error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY],
				  a[OVS_FLOW_ATTR_MASK], log);
	if (error)
		goto err_kfree_flow;

	ovs_flow_mask_key(&new_flow->key, &new_flow->unmasked_key, &mask);

	/* Validate actions. */
	error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key,
				     &acts, log);
	if (error) {
		OVS_NLERR(log, "Flow actions may not be safe on all matching packets.");
		goto err_kfree_flow;
	}

	reply = ovs_flow_cmd_alloc_info(acts, info, false);
	if (IS_ERR(reply)) {
		error = PTR_ERR(reply);
		goto err_kfree_acts;
	}

	ovs_lock();
	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
	if (unlikely(!dp)) {
		error = -ENODEV;
		goto err_unlock_ovs;
	}
	/* Check if this is a duplicate flow */
	flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->unmasked_key);
	if (likely(!flow)) {
		rcu_assign_pointer(new_flow->sf_acts, acts);

		/* Put flow in bucket. */
		error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
		if (unlikely(error)) {
			acts = NULL;
			goto err_unlock_ovs;
		}

		if (unlikely(reply)) {
			error = ovs_flow_cmd_fill_info(new_flow,
						       ovs_header->dp_ifindex,
						       reply, info->snd_portid,
						       info->snd_seq, 0,
						       OVS_FLOW_CMD_NEW);
			BUG_ON(error < 0);
		}
		ovs_unlock();
	} else {
		struct sw_flow_actions *old_acts;

		/* Bail out if we're not allowed to modify an existing flow.
		 * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
		 * because Generic Netlink treats the latter as a dump
		 * request.  We also accept NLM_F_EXCL in case that bug ever
		 * gets fixed.
		 */
		if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE
							 | NLM_F_EXCL))) {
			error = -EEXIST;
			goto err_unlock_ovs;
		}
		/* The unmasked key has to be the same for flow updates. */
		if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) {
			/* Look for any overlapping flow. */
			flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
			if (!flow) {
				error = -ENOENT;
				goto err_unlock_ovs;
			}
		}
		/* Update actions. */
		old_acts = ovsl_dereference(flow->sf_acts);
		rcu_assign_pointer(flow->sf_acts, acts);

		if (unlikely(reply)) {
			error = ovs_flow_cmd_fill_info(flow,
						       ovs_header->dp_ifindex,
						       reply, info->snd_portid,
						       info->snd_seq, 0,
						       OVS_FLOW_CMD_NEW);
			BUG_ON(error < 0);
		}
		ovs_unlock();

		ovs_nla_free_flow_actions(old_acts);
		ovs_flow_free(new_flow, false);
	}

	if (reply)
		ovs_notify(&dp_flow_genl_family, reply, info);
	return 0;

err_unlock_ovs:
	ovs_unlock();
	kfree_skb(reply);
err_kfree_acts:
	kfree(acts);
err_kfree_flow:
	ovs_flow_free(new_flow, false);