/*
 * Copyright (c) 2007-2014 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/time.h>
#include <linux/etherdevice.h>
#include <linux/genetlink.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/inetdevice.h>
#include <linux/list.h>
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

#include "datapath.h"
#include "flow.h"
#include "flow_table.h"
#include "flow_netlink.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"

int ovs_net_id __read_mostly;
EXPORT_SYMBOL_GPL(ovs_net_id);

static struct genl_family dp_packet_genl_family;
static struct genl_family dp_flow_genl_family;
static struct genl_family dp_datapath_genl_family;

static const struct genl_multicast_group ovs_dp_flow_multicast_group = {
	.name = OVS_FLOW_MCGROUP,
};

static const struct genl_multicast_group ovs_dp_datapath_multicast_group = {
	.name = OVS_DATAPATH_MCGROUP,
};

static const struct genl_multicast_group ovs_dp_vport_multicast_group = {
	.name = OVS_VPORT_MCGROUP,
};

/* Check if we need to build a reply message.
 * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */
static bool ovs_must_notify(struct genl_family *family, struct genl_info *info,
			    unsigned int group)
{
	return info->nlhdr->nlmsg_flags & NLM_F_ECHO ||
	       genl_has_listeners(family, genl_info_net(info)->genl_sock,
				  group);
}

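/* Deliver 'skb' as a Netlink notification for the request described by
 * 'info'.
 */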
static void ovs_notify(struct genl_family *family,
		       struct sk_buff *skb, struct genl_info *info)
{
	genl_notify(family, skb, genl_info_net(info), info->snd_portid,
		    0, info->nlhdr, GFP_KERNEL);
}

/**
 * DOC: Locking:
 *
 * Writes to device state (add/remove datapath, port, set operations on
 * vports, etc.) and writes to other state (flow table modifications,
 * setting miscellaneous datapath parameters, etc.) are all protected by
 * ovs_lock.
 *
 * Reads are protected by RCU.
 *
 * There are a few special cases (mostly stats) that have their own
 * synchronization but they nest under all of above and don't interact with
 * each other.
 *
 * The RTNL lock nests inside ovs_mutex.
 */

static DEFINE_MUTEX(ovs_mutex);

void ovs_lock(void)
{
	mutex_lock(&ovs_mutex);
}

void ovs_unlock(void)
{
	mutex_unlock(&ovs_mutex);
}

#ifdef CONFIG_LOCKDEP
int lockdep_ovsl_is_held(void)
{
	if (debug_locks)
		return lockdep_is_held(&ovs_mutex);
	else
		return 1;
}
EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
#endif

static struct vport *new_vport(const struct vport_parms *);
static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
			     const struct sw_flow_key *,
			     const struct dp_upcall_info *);
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
				  const struct sw_flow_key *,
				  const struct dp_upcall_info *);

/* Must be called with rcu_read_lock. */
static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
{
	struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);

	if (dev) {
		struct vport *vport = ovs_internal_dev_get_vport(dev);
		if (vport)
			return vport->dp;
	}

	return NULL;
}

/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
 * returned dp pointer valid.
 */
static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
{
	struct datapath *dp;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
	rcu_read_lock();
	dp = get_dp_rcu(net, dp_ifindex);
	rcu_read_unlock();

	return dp;
}

/* Must be called with rcu_read_lock or ovs_mutex. */
const char *ovs_dp_name(const struct datapath *dp)
{
	struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
	return vport->ops->get_name(vport);
}

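/* Return the ifindex of the local port's network device, or 0 if the
 * datapath has no local port.
 */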
static int get_dpifindex(const struct datapath *dp)
{
	struct vport *local;
	int ifindex;

	rcu_read_lock();

	local = ovs_vport_rcu(dp, OVSP_LOCAL);
	if (local)
		ifindex = netdev_vport_priv(local)->dev->ifindex;
	else
		ifindex = 0;

	rcu_read_unlock();

	return ifindex;
}

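/* RCU callback: free the datapath's memory once all RCU readers are done. */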
static void destroy_dp_rcu(struct rcu_head *rcu)
{
	struct datapath *dp = container_of(rcu, struct datapath, rcu);

	ovs_flow_tbl_destroy(&dp->table);
	free_percpu(dp->stats_percpu);
	release_net(ovs_dp_get_net(dp));
	kfree(dp->ports);
	kfree(dp);
}

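/* Map a port number to its bucket in the datapath's vport hash table. */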
static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
					    u16 port_no)
{
	return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
}

/* Called with ovs_mutex or RCU read lock. */
struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
{
	struct vport *vport;
	struct hlist_head *head;

	head = vport_hash_bucket(dp, port_no);
	hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
		if (vport->port_no == port_no)
			return vport;
	}
	return NULL;
}

/* Called with ovs_mutex. */
static struct vport *new_vport(const struct vport_parms *parms)
{
	struct vport *vport;

	vport = ovs_vport_add(parms);
	if (!IS_ERR(vport)) {
		struct datapath *dp = parms->dp;
		struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);

		hlist_add_head_rcu(&vport->dp_hash_node, head);
	}
	return vport;
}

void ovs_dp_detach_port(struct vport *p)
{
	ASSERT_OVSL();

	/* First drop references to device. */
	hlist_del_rcu(&p->dp_hash_node);

	/* Then destroy it. */
	ovs_vport_del(p);
}

/* Must be called with rcu_read_lock. */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
	const struct vport *p = OVS_CB(skb)->input_vport;
	struct datapath *dp = p->dp;
	struct sw_flow *flow;
	struct sw_flow_actions *sf_acts;
	struct dp_stats_percpu *stats;
	u64 *stats_counter;
	u32 n_mask_hit;

	stats = this_cpu_ptr(dp->stats_percpu);

	/* Look up flow. */
	flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit);
	if (unlikely(!flow)) {
		struct dp_upcall_info upcall;
		int error;

		upcall.cmd = OVS_PACKET_CMD_MISS;
		upcall.userdata = NULL;
		upcall.portid = ovs_vport_find_upcall_portid(p, skb);
		upcall.egress_tun_info = NULL;
		error = ovs_dp_upcall(dp, skb, key, &upcall);
		if (unlikely(error))
			kfree_skb(skb);
		else
			consume_skb(skb);
		stats_counter = &stats->n_missed;
		goto out;
	}

	ovs_flow_stats_update(flow, key->tp.flags, skb);
	sf_acts = rcu_dereference(flow->sf_acts);
	ovs_execute_actions(dp, skb, sf_acts, key);

	stats_counter = &stats->n_hit;

out:
	/* Update datapath statistics. */
	u64_stats_update_begin(&stats->syncp);
	(*stats_counter)++;
	stats->n_mask_hit += n_mask_hit;
	u64_stats_update_end(&stats->syncp);
}

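/* Queue a packet to the userspace handler identified by
 * upcall_info->portid, segmenting GSO packets first.  On failure the
 * per-CPU n_lost counter is bumped.
 */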
int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
		  const struct sw_flow_key *key,
		  const struct dp_upcall_info *upcall_info)
{
	struct dp_stats_percpu *stats;
	int err;

	if (upcall_info->portid == 0) {
		err = -ENOTCONN;
		goto err;
	}

	if (!skb_is_gso(skb))
		err = queue_userspace_packet(dp, skb, key, upcall_info);
	else
		err = queue_gso_packets(dp, skb, key, upcall_info);
	if (err)
		goto err;

	return 0;

err:
	stats = this_cpu_ptr(dp->stats_percpu);

	u64_stats_update_begin(&stats->syncp);
	stats->n_lost++;
	u64_stats_update_end(&stats->syncp);

	return err;
}

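/* Segment a GSO packet and queue each resulting segment to userspace. */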
static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
			     const struct sw_flow_key *key,
			     const struct dp_upcall_info *upcall_info)
{
	unsigned short gso_type = skb_shinfo(skb)->gso_type;
	struct sw_flow_key later_key;
	struct sk_buff *segs, *nskb;
	struct ovs_skb_cb ovs_cb;
	int err;

	ovs_cb = *OVS_CB(skb);
	segs = __skb_gso_segment(skb, NETIF_F_SG, false);
	*OVS_CB(skb) = ovs_cb;
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (segs == NULL)
		return -EINVAL;

	if (gso_type & SKB_GSO_UDP) {
		/* The initial flow key extracted by ovs_flow_key_extract()
		 * in this case is for a first fragment, so we need to
		 * properly mark later fragments.
		 */
		later_key = *key;
		later_key.ip.frag = OVS_FRAG_TYPE_LATER;
	}

	/* Queue all of the segments. */
	skb = segs;
	do {
		*OVS_CB(skb) = ovs_cb;
		if (gso_type & SKB_GSO_UDP && skb != segs)
			key = &later_key;

		err = queue_userspace_packet(dp, skb, key, upcall_info);
		if (err)
			break;

	} while ((skb = skb->next));

	/* Free all of the segments. */
	skb = segs;
	do {
		nskb = skb->next;
		if (err)
			kfree_skb(skb);
		else
			consume_skb(skb);
	} while ((skb = nskb));
	return err;
}

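/* Estimate the Netlink message size needed for an upcall carrying
 * 'hdrlen' bytes of packet data.
 */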
static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
			      unsigned int hdrlen)
{
	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
		+ nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */

	/* OVS_PACKET_ATTR_USERDATA */
	if (upcall_info->userdata)
		size += NLA_ALIGN(upcall_info->userdata->nla_len);

	/* OVS_PACKET_ATTR_EGRESS_TUN_KEY */
	if (upcall_info->egress_tun_info)
		size += nla_total_size(ovs_tun_key_attr_size());

	return size;
}

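/* Build a Netlink packet message for 'skb' and unicast it to the
 * userspace socket identified by upcall_info->portid.
 */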
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
				  const struct sw_flow_key *key,
				  const struct dp_upcall_info *upcall_info)
{
	struct ovs_header *upcall;
	struct sk_buff *nskb = NULL;
	struct sk_buff *user_skb = NULL; /* to be queued to userspace */
	struct nlattr *nla;
	struct genl_info info = {
		.dst_sk = ovs_dp_get_net(dp)->genl_sock,
		.snd_portid = upcall_info->portid,
	};
	size_t len;
	unsigned int hlen;
	int err, dp_ifindex;

	dp_ifindex = get_dpifindex(dp);
	if (!dp_ifindex)
		return -ENODEV;

	if (vlan_tx_tag_present(skb)) {
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return -ENOMEM;

		nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb));
		if (!nskb)
			return -ENOMEM;

		nskb->vlan_tci = 0;
		skb = nskb;
	}

	if (nla_attr_size(skb->len) > USHRT_MAX) {
		err = -EFBIG;
		goto out;
	}

	/* Complete checksum if needed */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto out;

	/* Older versions of OVS user space enforce alignment of the last
	 * Netlink attribute to NLA_ALIGNTO which would require extensive
	 * padding logic. Only perform zerocopy if padding is not required.
	 */
	if (dp->user_features & OVS_DP_F_UNALIGNED)
		hlen = skb_zerocopy_headlen(skb);
	else
		hlen = skb->len;

	len = upcall_msg_size(upcall_info, hlen);
	user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
	if (!user_skb) {
		err = -ENOMEM;
		goto out;
	}

	upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
			     0, upcall_info->cmd);
	upcall->dp_ifindex = dp_ifindex;

	nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
	err = ovs_nla_put_flow(key, key, user_skb);
	BUG_ON(err);
	nla_nest_end(user_skb, nla);

	if (upcall_info->userdata)
		__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
			  nla_len(upcall_info->userdata),
			  nla_data(upcall_info->userdata));

	if (upcall_info->egress_tun_info) {
		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
		err = ovs_nla_put_egress_tunnel_key(user_skb,
						    upcall_info->egress_tun_info);
		BUG_ON(err);
		nla_nest_end(user_skb, nla);
	}

	/* Only reserve room for attribute header, packet data is added
	 * in skb_zerocopy() */
	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
		err = -ENOBUFS;
		goto out;
	}
	nla->nla_len = nla_attr_size(skb->len);

	err = skb_zerocopy(user_skb, skb, skb->len, hlen);
	if (err)
		goto out;

	/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
	if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
		size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len;

		if (plen > 0)
			memset(skb_put(user_skb, plen), 0, plen);
	}

	((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;

	err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
	user_skb = NULL;
out:
	if (err)
		skb_tx_error(skb);
	kfree_skb(user_skb);
	kfree_skb(nskb);
	return err;
}

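/* Handler for OVS_PACKET_CMD_EXECUTE: reconstruct the packet and flow key
 * supplied by userspace and execute the given actions on it.
 */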
static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
{
	struct ovs_header *ovs_header = info->userhdr;
	struct nlattr **a = info->attrs;
	struct sw_flow_actions *acts;
	struct sk_buff *packet;
	struct sw_flow *flow;
	struct sw_flow_actions *sf_acts;
	struct datapath *dp;
	struct ethhdr *eth;
	struct vport *input_vport;
	int len;
	int err;

	err = -EINVAL;
	if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
	    !a[OVS_PACKET_ATTR_ACTIONS])
		goto err;

	len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
	packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
	err = -ENOMEM;
	if (!packet)
		goto err;
	skb_reserve(packet, NET_IP_ALIGN);

	nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);

	skb_reset_mac_header(packet);
	eth = eth_hdr(packet);

	/* Normally, setting the skb 'protocol' field would be handled by a
	 * call to eth_type_trans(), but it assumes there's a sending
	 * device, which we may not have. */
	if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN)
		packet->protocol = eth->h_proto;
	else
		packet->protocol = htons(ETH_P_802_2);

	/* Build an sw_flow for sending this packet. */
	flow = ovs_flow_alloc();
	err = PTR_ERR(flow);
	if (IS_ERR(flow))
		goto err_kfree_skb;

	err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet,
					     &flow->key);
	if (err)
		goto err_flow_free;

	err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
				   &flow->key, &acts);
	if (err)
		goto err_flow_free;

	rcu_assign_pointer(flow->sf_acts, acts);
	OVS_CB(packet)->egress_tun_info = NULL;
	packet->priority = flow->key.phy.priority;
	packet->mark = flow->key.phy.skb_mark;

	rcu_read_lock();
	dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
	err = -ENODEV;
	if (!dp)
		goto err_unlock;

	input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port);
	if (!input_vport)
		input_vport = ovs_vport_rcu(dp, OVSP_LOCAL);

	if (!input_vport)
		goto err_unlock;

	OVS_CB(packet)->input_vport = input_vport;
	sf_acts = rcu_dereference(flow->sf_acts);

	local_bh_disable();
	err = ovs_execute_actions(dp, packet, sf_acts, &flow->key);
	local_bh_enable();
	rcu_read_unlock();

	ovs_flow_free(flow, false);
	return err;

err_unlock:
	rcu_read_unlock();
err_flow_free:
	ovs_flow_free(flow, false);
err_kfree_skb:
	kfree_skb(packet);
err:
	return err;
}

static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
	[OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN },
	[OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
	[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
};

static const struct genl_ops dp_packet_genl_ops[] = {
	{ .cmd = OVS_PACKET_CMD_EXECUTE,
	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
	  .policy = packet_policy,
	  .doit = ovs_packet_cmd_execute
	}
};

static struct genl_family dp_packet_genl_family = {
	.id = GENL_ID_GENERATE,
	.hdrsize = sizeof(struct ovs_header),
	.name = OVS_PACKET_FAMILY,
	.version = OVS_PACKET_VERSION,
	.maxattr = OVS_PACKET_ATTR_MAX,
	.netnsok = true,
	.parallel_ops = true,
	.ops = dp_packet_genl_ops,
	.n_ops = ARRAY_SIZE(dp_packet_genl_ops),
};

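/* Aggregate the per-CPU datapath statistics into 'stats' and 'mega_stats'. */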
static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
			 struct ovs_dp_megaflow_stats *mega_stats)
{
	int i;

	memset(mega_stats, 0, sizeof(*mega_stats));

	stats->n_flows = ovs_flow_tbl_count(&dp->table);
	mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);

	stats->n_hit = stats->n_missed = stats->n_lost = 0;

	for_each_possible_cpu(i) {
		const struct dp_stats_percpu *percpu_stats;
		struct dp_stats_percpu local_stats;
		unsigned int start;

		percpu_stats = per_cpu_ptr(dp->stats_percpu, i);

		do {
			start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
			local_stats = *percpu_stats;
		} while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));

		stats->n_hit += local_stats.n_hit;
		stats->n_missed += local_stats.n_missed;
		stats->n_lost += local_stats.n_lost;
		mega_stats->n_mask_hit += local_stats.n_mask_hit;
	}
}

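/* Estimate the Netlink message size needed to dump a single flow. */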
static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
{
	return NLMSG_ALIGN(sizeof(struct ovs_header))
		+ nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_KEY */
		+ nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_MASK */
		+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
		+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
		+ nla_total_size(8) /* OVS_FLOW_ATTR_USED */
		+ nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_match(const struct sw_flow *flow,
				   struct sk_buff *skb)
{
	struct nlattr *nla;
	int err;

	/* Fill flow key. */
	nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
	if (!nla)
		return -EMSGSIZE;

	err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb);
	if (err)
		return err;

	nla_nest_end(skb, nla);

	/* Fill flow mask. */
	nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK);
	if (!nla)
		return -EMSGSIZE;

	err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb);
	if (err)
		return err;

	nla_nest_end(skb, nla);
	return 0;
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
				   struct sk_buff *skb)
{
	struct ovs_flow_stats stats;
	__be16 tcp_flags;
	unsigned long used;

	ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);

	if (used &&
	    nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)))
		return -EMSGSIZE;

	if (stats.n_packets &&
	    nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats))
		return -EMSGSIZE;

	if ((u8)ntohs(tcp_flags) &&
	     nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags)))
		return -EMSGSIZE;

	return 0;
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
				     struct sk_buff *skb, int skb_orig_len)
{
	struct nlattr *start;
	int err;

	/* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
	 * this is the first flow to be dumped into 'skb'.  This is unusual for
	 * Netlink but individual action lists can be longer than
	 * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
	 * The userspace caller can always fetch the actions separately if it
	 * really wants them.  (Most userspace callers in fact don't care.)
	 *
	 * This can only fail for dump operations because the skb is always
	 * properly sized for single flows.
	 */
	start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS);
	if (start) {
		const struct sw_flow_actions *sf_acts;

		sf_acts = rcu_dereference_ovsl(flow->sf_acts);
		err = ovs_nla_put_actions(sf_acts->actions,
					  sf_acts->actions_len, skb);

		if (!err)
			nla_nest_end(skb, start);
		else {
			if (skb_orig_len)
				return err;

			nla_nest_cancel(skb, start);
		}
	} else if (skb_orig_len) {
		return -EMSGSIZE;
	}

	return 0;
}

/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
				  struct sk_buff *skb, u32 portid,
				  u32 seq, u32 flags, u8 cmd)
{
	const int skb_orig_len = skb->len;
	struct ovs_header *ovs_header;
	int err;

	ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family,
				 flags, cmd);
	if (!ovs_header)
		return -EMSGSIZE;

	ovs_header->dp_ifindex = dp_ifindex;

	err = ovs_flow_cmd_fill_match(flow, skb);
	if (err)
		goto error;

	err = ovs_flow_cmd_fill_stats(flow, skb);
	if (err)
		goto error;

	err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len);
	if (err)
		goto error;

	return genlmsg_end(skb, ovs_header);

error:
	genlmsg_cancel(skb, ovs_header);
	return err;
}

/* May not be called with RCU read lock. */
static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts,
					       struct genl_info *info,
					       bool always)
{
	struct sk_buff *skb;

	if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0))
		return NULL;

	skb = genlmsg_new_unicast(ovs_flow_cmd_msg_size(acts), info, GFP_KERNEL);
	if (!skb)
		return ERR_PTR(-ENOMEM);

	return skb;
}

/* Called with ovs_mutex. */
static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
					       int dp_ifindex,
					       struct genl_info *info, u8 cmd,
					       bool always)
{
	struct sk_buff *skb;
	int retval;

	skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), info,
				      always);
	if (IS_ERR_OR_NULL(skb))
		return skb;

	retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
					info->snd_portid, info->snd_seq, 0,
					cmd);
	BUG_ON(retval < 0);
	return skb;
}

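/* Handler for OVS_FLOW_CMD_NEW: insert a new flow, or update the actions
 * of an existing flow with a matching unmasked key.
 */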
static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
	struct nlattr **a = info->attrs;
	struct ovs_header *ovs_header = info->userhdr;
	struct sw_flow *flow, *new_flow;
	struct sw_flow_mask mask;
	struct sk_buff *reply;
	struct datapath *dp;
	struct sw_flow_actions *acts;
	struct sw_flow_match match;
	int error;

	/* Must have key and actions. */
	error = -EINVAL;
	if (!a[OVS_FLOW_ATTR_KEY]) {
		OVS_NLERR("Flow key attribute not present in new flow.\n");
		goto error;
	}
	if (!a[OVS_FLOW_ATTR_ACTIONS]) {
		OVS_NLERR("Flow actions attribute not present in new flow.\n");
		goto error;
	}

	/* Most of the time we need to allocate a new flow, do it before
	 * locking.
	 */
	new_flow = ovs_flow_alloc();
	if (IS_ERR(new_flow)) {
		error = PTR_ERR(new_flow);
		goto error;
	}

	/* Extract key. */
	ovs_match_init(&match, &new_flow->unmasked_key, &mask);
	error = ovs_nla_get_match(&match,
				  a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
	if (error)
		goto err_kfree_flow;

	ovs_flow_mask_key(&new_flow->key, &new_flow->unmasked_key, &mask);

	/* Validate actions. */
	error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key,
				     &acts);
	if (error) {
		OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
		goto err_kfree_flow;
	}

	reply = ovs_flow_cmd_alloc_info(acts, info, false);
	if (IS_ERR(reply)) {
		error = PTR_ERR(reply);
		goto err_kfree_acts;
	}

	ovs_lock();
	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
	if (unlikely(!dp)) {
		error = -ENODEV;
		goto err_unlock_ovs;
	}
	/* Check if this is a duplicate flow */
	flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->unmasked_key);
	if (likely(!flow)) {
		rcu_assign_pointer(new_flow->sf_acts, acts);

		/* Put flow in bucket. */
		error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
		if (unlikely(error)) {
			acts = NULL;
			goto err_unlock_ovs;
		}

		if (unlikely(reply)) {
			error = ovs_flow_cmd_fill_info(new_flow,
						       ovs_header->dp_ifindex,
						       reply, info->snd_portid,
						       info->snd_seq, 0,
						       OVS_FLOW_CMD_NEW);
			BUG_ON(error < 0);
		}
		ovs_unlock();
	} else {
		struct sw_flow_actions *old_acts;

		/* Bail out if we're not allowed to modify an existing flow.
		 * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
		 * because Generic Netlink treats the latter as a dump
		 * request.  We also accept NLM_F_EXCL in case that bug ever
		 * gets fixed.
		 */
		if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE
							 | NLM_F_EXCL))) {
			error = -EEXIST;
			goto err_unlock_ovs;
		}
		/* The unmasked key has to be the same for flow updates. */
		if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) {
			flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
			if (!flow) {
				error = -ENOENT;
				goto err_unlock_ovs;
			}
		}
		/* Update actions. */
		old_acts = ovsl_dereference(flow->sf_acts);
		rcu_assign_pointer(flow->sf_acts, acts);

		if (unlikely(reply)) {
			error = ovs_flow_cmd_fill_info(flow,
						       ovs_header->dp_ifindex,
						       reply, info->snd_portid,
						       info->snd_seq, 0,
						       OVS_FLOW_CMD_NEW);
			BUG_ON(error < 0);
		}
		ovs_unlock();

		ovs_nla_free_flow_actions(old_acts);
		ovs_flow_free(new_flow, false);
	}

	if (reply)
		ovs_notify(&dp_flow_genl_family, reply, info);
	return 0;

err_unlock_ovs:
	ovs_unlock();
	kfree_skb(reply);
err_kfree_acts:
	kfree(acts);
err_kfree_flow:
	ovs_flow_free(new_flow, false);
error:
	return error;
}

/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */
static struct sw_flow_actions *get_flow_actions(const struct nlattr *a,
						const struct sw_flow_key *key,
						const struct sw_flow_mask *mask)
{
	struct sw_flow_actions *acts;
	struct sw_flow_key masked_key;
	int error;

	ovs_flow_mask_key(&masked_key, key, mask);
	error = ovs_nla_copy_actions(a, &masked_key, &acts);