/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>
#include <net/busy_poll.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

/* Weight used for the RX packet size EWMA. The average packet size is used to
 * determine the packet buffer size when refilling RX rings. As the entire RX
 * ring may be refilled at once, the weight is chosen so that the EWMA will be
 * insensitive to short-term, transient changes in packet size.
 */
#define RECEIVE_AVG_WEIGHT 64
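/* Illustration (not normative): with a weight of 64, each newly received
 * packet nudges the EWMA by only roughly 1/64th of the difference between
 * its length and the current estimate, so a short burst of unusually small
 * or large packets barely disturbs the buffer size used for refilling.
 */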

/* Minimum alignment for mergeable packet buffers. */
#define MERGEABLE_BUFFER_ALIGN max(L1_CACHE_BYTES, 256)

#define VIRTNET_DRIVER_VERSION "1.0.0"

struct virtnet_stats {
	struct u64_stats_sync tx_syncp;
	struct u64_stats_sync rx_syncp;
	u64 tx_bytes;
	u64 tx_packets;

	u64 rx_bytes;
	u64 rx_packets;
};

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of this receive queue: input.$index */
	char name[40];
};

struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* enable config space updates */
	bool config_enable;

	/* Active statistics */
	struct virtnet_stats __percpu *stats;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Lock for config space updates */
	struct mutex config_lock;

	/* Is the affinity hint set for the virtqueues? */
	bool affinity_hint_set;

	/* CPU hot plug notifier */
	struct notifier_block nb;
};

struct skb_vnet_hdr {
	union {
		struct virtio_net_hdr hdr;
		struct virtio_net_hdr_mrg_rxbuf mhdr;
	};
};

struct padded_vnet_hdr {
	struct virtio_net_hdr hdr;
	/*
	 * virtio_net_hdr should be in a separate sg buffer because of a QEMU
	 * bug; the data sg buffer shares the same page with this header sg.
	 * This padding makes the next sg 16-byte aligned after virtio_net_hdr.
	 */
	char padding[6];
};
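
/* Illustration (not normative): sizeof(struct virtio_net_hdr) is 10 bytes, so
 * the 6 bytes of padding above round the header area up to 16 bytes.
 * add_recvbuf_big() and page_to_skb() use sizeof(struct padded_vnet_hdr) as
 * the offset of the packet data within the page.
 */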

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}
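
/* Illustration (not normative): with the numbering above, queue pair 1 owns
 * virtqueues 2 (rx) and 3 (tx), so rxq2vq(1) == 2 and txq2vq(1) == 3, while
 * vq2rxq() and vq2txq() map those virtqueues back to queue index 1.
 */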

static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct skb_vnet_hdr *)skb->cb;
}

/*
 * page->private is used to chain pages for big packets; put the whole
 * most recently used list at the front for reuse.
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	/* We were probably waiting for more output buffers. */
	netif_wake_subqueue(vi->dev, vq2txq(vq));
}

static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
{
	unsigned int truesize = mrg_ctx & (MERGEABLE_BUFFER_ALIGN - 1);
	return (truesize + 1) * MERGEABLE_BUFFER_ALIGN;
}

static void *mergeable_ctx_to_buf_address(unsigned long mrg_ctx)
{
	return (void *)(mrg_ctx & -MERGEABLE_BUFFER_ALIGN);
}

static unsigned long mergeable_buf_to_ctx(void *buf, unsigned int truesize)
{
	unsigned int size = truesize / MERGEABLE_BUFFER_ALIGN;
	return (unsigned long)buf | (size - 1);
}
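
/* Worked example (illustrative, assuming MERGEABLE_BUFFER_ALIGN evaluates to
 * 256): for a 256-byte-aligned buf and truesize == 1536,
 * mergeable_buf_to_ctx() stores (1536 / 256) - 1 == 5 in the low bits of the
 * pointer value; mergeable_ctx_to_buf_truesize() then recovers
 * (5 + 1) * 256 == 1536, and mergeable_ctx_to_buf_address() masks the low
 * bits off again to get back the original buffer address.
 */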

/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct receive_queue *rq,
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct sk_buff *skb;
	struct skb_vnet_hdr *hdr;
	unsigned int copy, hdr_len, hdr_padded_len;
	char *p;

	p = page_address(page) + offset;

	/* copy small packet so we can reuse these pages for small data */
	skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	hdr = skb_vnet_hdr(skb);

	if (vi->mergeable_rx_bufs) {
		hdr_len = sizeof hdr->mhdr;
		hdr_padded_len = sizeof hdr->mhdr;
	} else {
		hdr_len = sizeof hdr->hdr;
		hdr_padded_len = sizeof(struct padded_vnet_hdr);
	}

	memcpy(hdr, p, hdr_len);

	len -= hdr_len;
	offset += hdr_padded_len;
	p += hdr_padded_len;

	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
	memcpy(skb_put(skb, copy), p, copy);

	len -= copy;
	offset += copy;

	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}
	BUG_ON(offset >= PAGE_SIZE);
	while (len) {
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

	return skb;
}

static struct sk_buff *receive_small(void *buf, unsigned int len)
{
	struct sk_buff * skb = buf;

	len -= sizeof(struct virtio_net_hdr);
	skb_trim(skb, len);

	return skb;
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len)
{
	struct page *page = buf;
	struct sk_buff *skb = page_to_skb(rq, page, 0, len, PAGE_SIZE);

	if (unlikely(!skb))
		goto err;

	return skb;

err:
	dev->stats.rx_dropped++;
	give_pages(rq, page);
	return NULL;
}

static struct sk_buff *receive_mergeable(struct net_device *dev,
					 struct receive_queue *rq,
					 unsigned long ctx,
					 unsigned int len)
{
	void *buf = mergeable_ctx_to_buf_address(ctx);
	struct skb_vnet_hdr *hdr = buf;
	int num_buf = hdr->mhdr.num_buffers;
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));

	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
	struct sk_buff *curr_skb = head_skb;

	if (unlikely(!curr_skb))
		goto err_skb;
	while (--num_buf) {
		int num_skb_frags;

		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
				 dev->name, num_buf, hdr->mhdr.num_buffers);
			dev->stats.rx_length_errors++;
			goto err_buf;
		}

		buf = mergeable_ctx_to_buf_address(ctx);
		page = virt_to_head_page(buf);

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);

			if (unlikely(!nskb))
				goto err_skb;
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
			head_skb->truesize += truesize;
		}
		offset = buf - page_address(page);
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
					     len, truesize);
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
					offset, len, truesize);
		}
	}

	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
	return head_skb;

err_skb:
	put_page(page);
	while (--num_buf) {
		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!ctx)) {
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
		page = virt_to_head_page(mergeable_ctx_to_buf_address(ctx));
		put_page(page);
	}
err_buf:
	dev->stats.rx_dropped++;
	dev_kfree_skb(head_skb);
	return NULL;
}

static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct net_device *dev = vi->dev;
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
	struct sk_buff *skb;
	struct skb_vnet_hdr *hdr;

	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
		if (vi->mergeable_rx_bufs) {
			unsigned long ctx = (unsigned long)buf;
			void *base = mergeable_ctx_to_buf_address(ctx);
			put_page(virt_to_head_page(base));
		} else if (vi->big_packets) {
			give_pages(rq, buf);
		} else {
			dev_kfree_skb(buf);
		}
		return;
	}

	if (vi->mergeable_rx_bufs)
		skb = receive_mergeable(dev, rq, (unsigned long)buf, len);
	else if (vi->big_packets)
		skb = receive_big(dev, rq, buf, len);
	else
		skb = receive_small(buf, len);

	if (unlikely(!skb))
		return;

	hdr = skb_vnet_hdr(skb);

	u64_stats_update_begin(&stats->rx_syncp);
	stats->rx_bytes += skb->len;
	stats->rx_packets++;
	u64_stats_update_end(&stats->rx_syncp);

	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		pr_debug("Needs csum!\n");
		if (!skb_partial_csum_set(skb,
					  hdr->hdr.csum_start,
					  hdr->hdr.csum_offset))
			goto frame_err;
	} else if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}

	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

	if (hdr->hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		pr_debug("GSO!\n");
		switch (hdr->hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
			break;
		case VIRTIO_NET_HDR_GSO_UDP:
			skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
			break;
		default:
			net_warn_ratelimited("%s: bad gso type %u.\n",
					     dev->name, hdr->hdr.gso_type);
			goto frame_err;
		}

		if (hdr->hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;

		skb_shinfo(skb)->gso_size = hdr->hdr.gso_size;
		if (skb_shinfo(skb)->gso_size == 0) {
			net_warn_ratelimited("%s: zero gso size.\n", dev->name);
			goto frame_err;
		}

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}

	skb_mark_napi_id(skb, &rq->napi);

	netif_receive_skb(skb);
	return;

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
}

static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct sk_buff *skb;
	struct skb_vnet_hdr *hdr;
	int err;

	skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
	if (unlikely(!skb))
		return -ENOMEM;

	skb_put(skb, GOOD_PACKET_LEN);

	hdr = skb_vnet_hdr(skb);
	sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);

	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);

	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
	if (err < 0)
		dev_kfree_skb(skb);

	return err;
}

static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
		first = get_a_page(rq, gfp);
		if (!first) {
			if (list)
				give_pages(rq, list);
			return -ENOMEM;
		}
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);

		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}

	first = get_a_page(rq, gfp);
	if (!first) {
		give_pages(rq, list);
		return -ENOMEM;
	}
	p = page_address(first);

	/* rq->sg[0], rq->sg[1] share the same page */
	/* a separate rq->sg[0] for virtio_net_hdr only, due to a QEMU bug */
	sg_set_buf(&rq->sg[0], p, sizeof(struct virtio_net_hdr));

	/* rq->sg[1] for data packet, from offset */
	offset = sizeof(struct padded_vnet_hdr);
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);

	/* chain first in list head */
	first->private = (unsigned long)list;
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
	if (err < 0)
		give_pages(rq, first);

	return err;
}

static unsigned int get_mergeable_buf_len(struct ewma *avg_pkt_len)
{
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int len;

	len = hdr_len + clamp_t(unsigned int, ewma_read(avg_pkt_len),
			GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
}
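
/* Worked example (illustrative, assuming 4 KiB pages and a 256-byte
 * MERGEABLE_BUFFER_ALIGN): hdr_len is 12, so with an EWMA average of about
 * 1500 bytes the clamp raises the value to GOOD_PACKET_LEN (1518),
 * len becomes 12 + 1518 = 1530, and ALIGN() rounds it up to 1536.
 */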

static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
	unsigned long ctx;
	int err;
	unsigned int len, hole;

	len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	ctx = mergeable_buf_to_ctx(buf, len);
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < len) {
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer. This extra space is not included in
		 * the truesize stored in ctx.
		 */
		len += hole;
		alloc_frag->offset += hole;
	}

	sg_init_one(rq->sg, buf, len);
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, (void *)ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));

	return err;
}
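
/* Worked example for the hole handling above (illustrative, assuming
 * skb_page_frag_refill() hands out a 32 KiB fragment and 1536-byte buffers):
 * after the 21st buffer only 512 bytes of the fragment remain, which is less
 * than one more buffer, so that buffer is posted with len == 2048 while the
 * truesize recorded in its ctx stays 1536.
 */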

/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	int err;
	bool oom;

	gfp |= __GFP_COLD;
	do {
		if (vi->mergeable_rx_bufs)
			err = add_recvbuf_mergeable(rq, gfp);
		else if (vi->big_packets)
			err = add_recvbuf_big(rq, gfp);
		else
			err = add_recvbuf_small(rq, gfp);

		oom = err == -ENOMEM;
		if (err)
			break;
	} while (rq->vq->num_free);
	virtqueue_kick(rq->vq);
	return !oom;
}

static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];

	/* Schedule NAPI; suppress further interrupts if successful. */
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rvq);
		__napi_schedule(&rq->napi);
	}
}

static void virtnet_napi_enable(struct receive_queue *rq)
{
	napi_enable(&rq->napi);

	/* If all buffers were filled by other side before we napi_enabled, we
	 * won't get another interrupt, so process any outstanding packets
	 * now.  virtnet_poll wants to re-enable the queue, so we disable here.
	 * We synchronize against interrupts via NAPI_STATE_SCHED */
	if (napi_schedule_prep(&rq->napi)) {
		virtqueue_disable_cb(rq->vq);
		local_bh_disable();
		__napi_schedule(&rq->napi);
		local_bh_enable();
	}
}

static void refill_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
	bool still_empty;
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		napi_disable(&rq->napi);
		still_empty = !try_fill_recv(rq, GFP_KERNEL);
		virtnet_napi_enable(rq);

		/* In theory, this can happen: if we don't get any buffers in,
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
}

static int virtnet_receive(struct receive_queue *rq, int budget)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int len, received = 0;
	void *buf;

	while (received < budget &&
	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
		receive_buf(rq, buf, len);
		received++;
	}

	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
		if (!try_fill_recv(rq, GFP_ATOMIC))
			schedule_delayed_work(&vi->refill, 0);
	}

	return received;
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	unsigned int r, received = 0;

again:
	received += virtnet_receive(rq, budget - received);

	/* Out of packets? */
	if (received < budget) {
		r = virtqueue_enable_cb_prepare(rq->vq);
		napi_complete(napi);
		if (unlikely(virtqueue_poll(rq->vq, r)) &&
		    napi_schedule_prep(napi)) {
			virtqueue_disable_cb(rq->vq);
			__napi_schedule(napi);
			goto again;
		}
	}

	return received;
}

#ifdef CONFIG_NET_RX_BUSY_POLL
/* must be called with local_bh_disable()d */
static int virtnet_busy_poll(struct napi_struct *napi)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	struct virtnet_info *vi = rq->vq->vdev->priv;
	int r, received = 0, budget = 4;

	if (!(vi->status & VIRTIO_NET_S_LINK_UP))
		return LL_FLUSH_FAILED;

	if (!napi_schedule_prep(napi))
		return LL_FLUSH_BUSY;

	virtqueue_disable_cb(rq->vq);

again:
	received += virtnet_receive(rq, budget);

	r = virtqueue_enable_cb_prepare(rq->vq);
	clear_bit(NAPI_STATE_SCHED, &napi->state);
	if (unlikely(virtqueue_poll(rq->vq, r)) &&
	    napi_schedule_prep(napi)) {
		virtqueue_disable_cb(rq->vq);
		if (received < budget) {
			budget -= received;
			goto again;
		} else {
			__napi_schedule(napi);
		}
	}

	return received;
}
#endif	/* CONFIG_NET_RX_BUSY_POLL */

static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
			if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);
		virtnet_napi_enable(&vi->rq[i]);
	}

	return 0;
}

static void free_old_xmit_skbs(struct send_queue *sq)
{
	struct sk_buff *skb;
	unsigned int len;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);

	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		pr_debug("Sent skb %p\n", skb);

		u64_stats_update_begin(&stats->tx_syncp);
		stats->tx_bytes += skb->len;
		stats->tx_packets++;
		u64_stats_update_end(&stats->tx_syncp);

		dev_kfree_skb_any(skb);
	}
}

static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
{
	struct skb_vnet_hdr *hdr;
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	unsigned num_sg;
	unsigned hdr_len;
	bool can_push;

	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
	if (vi->mergeable_rx_bufs)
		hdr_len = sizeof hdr->mhdr;
	else
		hdr_len = sizeof hdr->hdr;

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
		hdr = (struct skb_vnet_hdr *)(skb->data - hdr_len);
	else
		hdr = skb_vnet_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		hdr->hdr.csum_start = skb_checksum_start_offset(skb);
		hdr->hdr.csum_offset = skb->csum_offset;
	} else {
		hdr->hdr.flags = 0;
		hdr->hdr.csum_offset = hdr->hdr.csum_start = 0;
	}

	if (skb_is_gso(skb)) {
		hdr->hdr.hdr_len = skb_headlen(skb);
		hdr->hdr.gso_size = skb_shinfo(skb)->gso_size;