// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2019 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/vmalloc.h>
#include <linux/skbuff.h>
static inline void gve_tx_put_doorbell(struct gve_priv *priv,
                                       struct gve_queue_resources *q_resources,
                                       u32 val)
{
        iowrite32be(val, &priv->db_bar2[be32_to_cpu(q_resources->db_index)]);
}
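/* Each Tx queue's doorbell is a 32-bit big-endian register in the doorbell
 * BAR (db_bar2), selected by the db_index the device reported in the queue's
 * resources. Writing the producer index (tx->req) tells the NIC how far the
 * descriptor ring has been filled.
 */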
/* gvnic can only transmit from a Registered Segment.
 * We copy skb payloads into the registered segment before writing Tx
 * descriptors and ringing the Tx doorbell.
 *
 * gve_tx_fifo_* manages the Registered Segment as a FIFO - clients must
 * free allocations in the order they were allocated.
 */
static int gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_fifo *fifo)
{
        fifo->base = vmap(fifo->qpl->pages, fifo->qpl->num_entries, VM_MAP,
                          PAGE_KERNEL);
        if (unlikely(!fifo->base)) {
                netif_err(priv, drv, priv->dev, "Failed to vmap fifo, qpl_id = %d\n",
                          fifo->qpl->id);
                return -ENOMEM;
        }

        fifo->size = fifo->qpl->num_entries * PAGE_SIZE;
        atomic_set(&fifo->available, fifo->size);
        fifo->head = 0;
        return 0;
}
static void gve_tx_fifo_release(struct gve_priv *priv, struct gve_tx_fifo *fifo)
{
        WARN(atomic_read(&fifo->available) != fifo->size,
             "Releasing non-empty fifo");

        vunmap(fifo->base);
}
static int gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, int bytes)
{
        return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}
static bool gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
        return (atomic_read(&fifo->available) <= bytes) ? false : true;
}
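/* Strict comparison: allocation is only considered possible when more than
 * 'bytes' bytes are available. This is the check gve_can_tx() performs before
 * committing to copy an skb into the FIFO.
 */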
/* gve_tx_alloc_fifo - Allocate fragment(s) from Tx FIFO
 * @fifo: FIFO to allocate from
 * @bytes: Allocation size
 * @iov: Scatter-gather elements to fill with allocation fragment base/len
 *
 * Returns number of valid elements in iov[] or negative on error.
 *
 * Allocations from a given FIFO must be externally synchronized but concurrent
 * allocation and frees are allowed.
 */
static int gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
                             struct gve_tx_iovec iov[2])
{
        size_t overflow, padding;
        u32 aligned_head;
        int nfrags = 0;

        if (!bytes)
                return 0;

        /* This check happens before we know how much padding is needed to
         * align to a cacheline boundary for the payload, but that is fine,
         * because the FIFO head always starts aligned, and the FIFO's boundaries
         * are aligned, so if there is space for the data, there is space for
         * the padding to the next alignment.
         */
        WARN(!gve_tx_fifo_can_alloc(fifo, bytes),
             "Reached %s when there's not enough space in the fifo", __func__);

        nfrags++;

        iov[0].iov_offset = fifo->head;
        iov[0].iov_len = bytes;
        fifo->head += bytes;

        if (fifo->head > fifo->size) {
                /* If the allocation did not fit in the tail fragment of the
                 * FIFO, also use the head fragment.
                 */
                nfrags++;
                overflow = fifo->head - fifo->size;
                iov[0].iov_len -= overflow;
                iov[1].iov_offset = 0;  /* Start of fifo */
                iov[1].iov_len = overflow;

                fifo->head = overflow;
        }

        /* Re-align to a cacheline boundary */
        aligned_head = L1_CACHE_ALIGN(fifo->head);
        padding = aligned_head - fifo->head;
        iov[nfrags - 1].iov_padding = padding;
        atomic_sub(bytes + padding, &fifo->available);
        fifo->head = aligned_head;

        if (fifo->head == fifo->size)
                fifo->head = 0;

        return nfrags;
}
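/* Worked example (illustrative numbers, assuming 4K pages and 64-byte
 * cachelines): with fifo->size = 32768 and fifo->head = 32700, an allocation
 * of 200 bytes overflows the end by 132 bytes, so iov[0] = {32700, 68} and
 * iov[1] = {0, 132}. The new head (132) is then padded up to 192, so
 * iov[1].iov_padding = 60 and 260 bytes are subtracted from fifo->available.
 */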
/* gve_tx_free_fifo - Return space to Tx FIFO
 * @fifo: FIFO to return fragments to
 * @bytes: Bytes to free
 */
static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
        atomic_add(bytes, &fifo->available);
}
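/* Frees only adjust the available byte count; the FIFO keeps no per-fragment
 * bookkeeping, which is why callers must free in allocation order (see the
 * comment at the top of this file). gve_clean_tx_done() accumulates the freed
 * lengths and padding and returns them here in one call per cleanup pass.
 */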
static void gve_tx_remove_from_block(struct gve_priv *priv, int queue_idx)
{
        struct gve_notify_block *block =
                        &priv->ntfy_blocks[gve_tx_idx_to_ntfy(priv, queue_idx)];

        block->tx = NULL;
}
static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
                             u32 to_do, bool try_to_wake);
static void gve_tx_free_ring(struct gve_priv *priv, int idx)
{
        struct gve_tx_ring *tx = &priv->tx[idx];
        struct device *hdev = &priv->pdev->dev;
        size_t bytes;
        u32 slots;

        gve_tx_remove_from_block(priv, idx);
        slots = tx->mask + 1;
        gve_clean_tx_done(priv, tx, tx->req, false);
        netdev_tx_reset_queue(tx->netdev_txq);

        dma_free_coherent(hdev, sizeof(*tx->q_resources),
                          tx->q_resources, tx->q_resources_bus);
        tx->q_resources = NULL;

        gve_tx_fifo_release(priv, &tx->tx_fifo);
        gve_unassign_qpl(priv, tx->tx_fifo.qpl->id);
        tx->tx_fifo.qpl = NULL;

        bytes = sizeof(*tx->desc) * slots;
        dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
        tx->desc = NULL;

        vfree(tx->info);
        tx->info = NULL;

        netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
}
static void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx)
{
        int ntfy_idx = gve_tx_idx_to_ntfy(priv, queue_idx);
        struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
        struct gve_tx_ring *tx = &priv->tx[queue_idx];

        block->tx = tx;
        tx->ntfy_id = ntfy_idx;
}
static int gve_tx_alloc_ring(struct gve_priv *priv, int idx)
{
        struct gve_tx_ring *tx = &priv->tx[idx];
        struct device *hdev = &priv->pdev->dev;
        u32 slots = priv->tx_desc_cnt;
        size_t bytes;

        /* Make sure everything is zeroed to start */
        memset(tx, 0, sizeof(*tx));
        tx->q_num = idx;

        tx->mask = slots - 1;

        /* alloc metadata */
        tx->info = vzalloc(sizeof(*tx->info) * slots);
        if (!tx->info)
                return -ENOMEM;

        /* alloc tx queue */
        bytes = sizeof(*tx->desc) * slots;
        tx->desc = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
        if (!tx->desc)
                goto abort_with_info;

        tx->tx_fifo.qpl = gve_assign_tx_qpl(priv);

        /* map Tx FIFO */
        if (gve_tx_fifo_init(priv, &tx->tx_fifo))
                goto abort_with_desc;

        tx->q_resources =
                dma_alloc_coherent(hdev,
                                   sizeof(*tx->q_resources),
                                   &tx->q_resources_bus,
                                   GFP_KERNEL);
        if (!tx->q_resources)
                goto abort_with_fifo;

        netif_dbg(priv, drv, priv->dev, "tx[%d]->bus=%lx\n", idx,
                  (unsigned long)tx->bus);
        tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
        gve_tx_add_to_block(priv, idx);

        return 0;

abort_with_fifo:
        gve_tx_fifo_release(priv, &tx->tx_fifo);
abort_with_desc:
        dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
        tx->desc = NULL;
abort_with_info:
        vfree(tx->info);
        tx->info = NULL;
        return -ENOMEM;
}
int gve_tx_alloc_rings(struct gve_priv *priv)
{
        int err = 0;
        int i;

        for (i = 0; i < priv->tx_cfg.num_queues; i++) {
                err = gve_tx_alloc_ring(priv, i);
                if (err) {
                        netif_err(priv, drv, priv->dev,
                                  "Failed to alloc tx ring=%d: err=%d\n",
                                  i, err);
                        break;
                }
        }
        /* Unallocate if there was an error */
        if (err) {
                int j;

                for (j = 0; j < i; j++)
                        gve_tx_free_ring(priv, j);
        }
        return err;
}
void gve_tx_free_rings(struct gve_priv *priv)
{
        int i;

        for (i = 0; i < priv->tx_cfg.num_queues; i++)
                gve_tx_free_ring(priv, i);
}
/* gve_tx_avail - Calculates the number of slots available in the ring
 * @tx: tx ring to check
 *
 * Returns the number of slots available
 *
 * The capacity of the queue is mask + 1. We don't need to reserve an entry.
 */
static inline u32 gve_tx_avail(struct gve_tx_ring *tx)
{
        return tx->mask + 1 - (tx->req - tx->done);
}
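/* tx->req and tx->done are free-running u32 counters (producer and consumer),
 * so tx->req - tx->done is the number of in-flight descriptors even across
 * wrap-around. For example, with 1024 slots, req = 70000 and done = 69500
 * leave 1024 - 500 = 524 slots available.
 */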
static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx,
                                              struct sk_buff *skb)
{
        int pad_bytes, align_hdr_pad;
        int bytes;
        int hlen;

        hlen = skb_is_gso(skb) ? skb_checksum_start_offset(skb) +
                                 tcp_hdrlen(skb) : skb_headlen(skb);

        pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo,
                                                   hlen);
        /* We need to take into account the header alignment padding. */
        align_hdr_pad = L1_CACHE_ALIGN(hlen) - hlen;
        bytes = align_hdr_pad + pad_bytes + skb->len;

        return bytes;
}
/* The most descriptors we could need are 3 - 1 for the headers, 1 for
 * the beginning of the payload at the end of the FIFO, and 1 if the
 * payload wraps to the beginning of the FIFO.
 */
#define MAX_TX_DESC_NEEDED      3
/* Check if sufficient resources (descriptor ring space, FIFO space) are
 * available to transmit the given number of bytes.
 */
static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
        return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED &&
                gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required));
}
/* Stops the queue if the skb cannot be transmitted. */
static int gve_maybe_stop_tx(struct gve_tx_ring *tx, struct sk_buff *skb)
{
        int bytes_required;

        bytes_required = gve_skb_fifo_bytes_required(tx, skb);
        if (likely(gve_can_tx(tx, bytes_required)))
                return 0;

        /* No space, so stop the queue */
        tx->stop_queue++;
        netif_tx_stop_queue(tx->netdev_txq);
        smp_mb();       /* sync with restarting queue in gve_clean_tx_done() */

        /* Now check for resources again, in case gve_clean_tx_done() freed
         * resources after we checked and we stopped the queue after
         * gve_clean_tx_done() checked.
         *
         *  gve_maybe_stop_tx()            gve_clean_tx_done()
         *   nsegs/can_alloc test failed
         *                                  gve_clean_tx_done
         *                                  if (tx queue stopped)
         *                                    netif_tx_queue_wake()
         *   netif_tx_stop_queue()
         *   Need to check again for space here!
         */
        if (likely(!gve_can_tx(tx, bytes_required)))
                return -EBUSY;

        netif_tx_start_queue(tx->netdev_txq);
        return 0;
}
static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc,
                                 struct sk_buff *skb, bool is_gso,
                                 int l4_hdr_offset, u32 desc_cnt,
                                 u16 hlen, u64 addr)
{
        /* l4_hdr_offset and csum_offset are in units of 16-bit words */
        if (is_gso) {
                pkt_desc->pkt.type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
                pkt_desc->pkt.l4_csum_offset = skb->csum_offset >> 1;
                pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
        } else if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
                pkt_desc->pkt.type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
                pkt_desc->pkt.l4_csum_offset = skb->csum_offset >> 1;
                pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
        } else {
                pkt_desc->pkt.type_flags = GVE_TXD_STD;
                pkt_desc->pkt.l4_csum_offset = 0;
                pkt_desc->pkt.l4_hdr_offset = 0;
        }
        pkt_desc->pkt.desc_cnt = desc_cnt;
        pkt_desc->pkt.len = cpu_to_be16(skb->len);
        pkt_desc->pkt.seg_len = cpu_to_be16(hlen);
        pkt_desc->pkt.seg_addr = cpu_to_be64(addr);
}
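/* The hardware fields take offsets in 16-bit words, hence the >> 1 above.
 * For a standard TCP checksum, for example, skb->csum_offset is 16 bytes
 * into the TCP header, which is encoded as 8 words.
 */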
static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc,
                                 struct sk_buff *skb, bool is_gso,
                                 u16 len, u64 addr)
{
        seg_desc->seg.type_flags = GVE_TXD_SEG;
        if (is_gso) {
                if (skb_is_gso_v6(skb))
                        seg_desc->seg.type_flags |= GVE_TXSF_IPV6;
                seg_desc->seg.l3_offset = skb_network_offset(skb) >> 1;
                seg_desc->seg.mss = cpu_to_be16(skb_shinfo(skb)->gso_size);
        }
        seg_desc->seg.seg_len = cpu_to_be16(len);
        seg_desc->seg.seg_addr = cpu_to_be64(addr);
}
static void gve_dma_sync_for_device(struct device *dev, dma_addr_t *page_buses,
                                    u64 iov_offset, u64 iov_len)
{
        u64 last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
        u64 first_page = iov_offset / PAGE_SIZE;
        dma_addr_t dma;
        u64 page;

        for (page = first_page; page <= last_page; page++) {
                dma = page_buses[page];
                dma_sync_single_for_device(dev, dma, PAGE_SIZE, DMA_TO_DEVICE);
        }
}
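/* The FIFO is a vmap() of the queue page list's pages, so a single FIFO
 * fragment may span several independently mapped DMA pages; every page the
 * fragment touches must be synced before the descriptor is handed to the NIC.
 */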
static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb,
                          struct device *dev)
{
        int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset;
        union gve_tx_desc *pkt_desc, *seg_desc;
        struct gve_tx_buffer_state *info;
        bool is_gso = skb_is_gso(skb);
        u32 idx = tx->req & tx->mask;
        int payload_iov = 2;
        int copy_offset;
        u32 next_idx;
        int i;

        info = &tx->info[idx];
        pkt_desc = &tx->desc[idx];

        l4_hdr_offset = skb_checksum_start_offset(skb);
        /* If the skb is gso, then we want the tcp header in the first segment
         * otherwise we want the linear portion of the skb (which will contain
         * the checksum because skb->csum_start and skb->csum_offset are given
         * relative to skb->head) in the first segment.
         */
        hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) :
                        skb_headlen(skb);

        info->skb = skb;
        /* We don't want to split the header, so if necessary, pad to the end
         * of the fifo and then put the header at the beginning of the fifo.
         */
        pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, hlen);
        hdr_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, hlen + pad_bytes,
                                       &info->iov[0]);
        WARN(!hdr_nfrags, "hdr_nfrags should never be 0!");
        payload_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, skb->len - hlen,
                                           &info->iov[payload_iov]);

        gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
                             1 + payload_nfrags, hlen,
                             info->iov[hdr_nfrags - 1].iov_offset);

        skb_copy_bits(skb, 0,
                      tx->tx_fifo.base + info->iov[hdr_nfrags - 1].iov_offset,
                      hlen);
        gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses,
                                info->iov[hdr_nfrags - 1].iov_offset,
                                info->iov[hdr_nfrags - 1].iov_len);
        copy_offset = hlen;

        for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
                next_idx = (tx->req + 1 + i - payload_iov) & tx->mask;
                seg_desc = &tx->desc[next_idx];

                gve_tx_fill_seg_desc(seg_desc, skb, is_gso,
                                     info->iov[i].iov_len,
                                     info->iov[i].iov_offset);

                skb_copy_bits(skb, copy_offset,
                              tx->tx_fifo.base + info->iov[i].iov_offset,
                              info->iov[i].iov_len);
                gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses,
                                        info->iov[i].iov_offset,
                                        info->iov[i].iov_len);
                copy_offset += info->iov[i].iov_len;
        }

        return 1 + payload_nfrags;
}
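/* One packet descriptor at tx->req describes the copied header, and each
 * payload fragment in the FIFO gets its own segment descriptor in the
 * following ring slots, which is why 1 + payload_nfrags descriptors are
 * consumed per skb (at most MAX_TX_DESC_NEEDED).
 */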
netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev)
{
        struct gve_priv *priv = netdev_priv(dev);
        struct gve_tx_ring *tx;
        int nsegs;

        WARN(skb_get_queue_mapping(skb) > priv->tx_cfg.num_queues,
             "skb queue index out of range");
        tx = &priv->tx[skb_get_queue_mapping(skb)];
        if (unlikely(gve_maybe_stop_tx(tx, skb))) {
                /* We need to ring the txq doorbell -- we have stopped the Tx
                 * queue for want of resources, but prior calls to gve_tx()
                 * may have added descriptors without ringing the doorbell.
                 */
                gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
                return NETDEV_TX_BUSY;
        }
        nsegs = gve_tx_add_skb(tx, skb, &priv->pdev->dev);

        netdev_tx_sent_queue(tx->netdev_txq, skb->len);
        skb_tx_timestamp(skb);

        /* give packets to NIC */
        tx->req += nsegs;

        if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
                return NETDEV_TX_OK;

        gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
        return NETDEV_TX_OK;
}
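/* Doorbell batching: when the stack signals that more packets are coming
 * (netdev_xmit_more()) and the queue is still running, the doorbell write is
 * deferred to a later call, so a burst of skbs costs a single MMIO write.
 * If the queue has just been stopped, the doorbell is rung anyway so the NIC
 * drains everything already posted.
 */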
#define GVE_TX_START_THRESH     PAGE_SIZE
static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
                             u32 to_do, bool try_to_wake)
{
        struct gve_tx_buffer_state *info;
        u64 pkts = 0, bytes = 0;
        size_t space_freed = 0;
        struct sk_buff *skb;
        int i, j;
        u32 idx;

        for (j = 0; j < to_do; j++) {
                idx = tx->done & tx->mask;
                netif_info(priv, tx_done, priv->dev,
                           "[%d] %s: idx=%d (req=%u done=%u)\n",
                           tx->q_num, __func__, idx, tx->req, tx->done);
                info = &tx->info[idx];
                skb = info->skb;

                /* Mark as free */
                if (skb) {
                        info->skb = NULL;
                        bytes += skb->len;
                        pkts++;
                        dev_consume_skb_any(skb);
                        /* FIFO free */
                        for (i = 0; i < ARRAY_SIZE(info->iov); i++) {
                                space_freed += info->iov[i].iov_len +
                                               info->iov[i].iov_padding;
                                info->iov[i].iov_len = 0;
                                info->iov[i].iov_padding = 0;
                        }
                }
                tx->done++;
        }

        gve_tx_free_fifo(&tx->tx_fifo, space_freed);
        u64_stats_update_begin(&tx->statss);
        tx->bytes_done += bytes;
        tx->pkt_done += pkts;
        u64_stats_update_end(&tx->statss);
        netdev_tx_completed_queue(tx->netdev_txq, pkts, bytes);

        /* start the queue if we've stopped it */
#ifndef CONFIG_BQL
        /* Make sure that the doorbells are synced */
        smp_mb();
#endif
        if (try_to_wake && netif_tx_queue_stopped(tx->netdev_txq) &&
            likely(gve_can_tx(tx, GVE_TX_START_THRESH))) {
                tx->wake_queue++;
                netif_tx_wake_queue(tx->netdev_txq);
        }

        return pkts;
}
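/* A stopped queue is only woken once gve_can_tx() reports room for
 * GVE_TX_START_THRESH (one page) of FIFO space, and only when the caller
 * asked for wakeups: gve_tx_poll() passes try_to_wake = true, while the
 * teardown path in gve_tx_free_ring() passes false.
 */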
__be32 gve_tx_load_event_counter(struct gve_priv *priv,
                                 struct gve_tx_ring *tx)
{
        u32 counter_index = be32_to_cpu((tx->q_resources->counter_index));

        return READ_ONCE(priv->counter_array[counter_index]);
}
bool gve_tx_poll(struct gve_notify_block *block, int budget)
{
        struct gve_priv *priv = block->priv;
        struct gve_tx_ring *tx = block->tx;
        bool repoll = false;
        u32 nic_done;
        u32 to_do;

        /* If budget is 0, do all the work */
        if (budget == 0)
                budget = INT_MAX;

        /* Find out how much work there is to be done */
        tx->last_nic_done = gve_tx_load_event_counter(priv, tx);
        nic_done = be32_to_cpu(tx->last_nic_done);
        if (budget > 0) {
                /* Do as much work as we have that the budget will
                 * allow
                 */
                to_do = min_t(u32, (nic_done - tx->done), budget);
                gve_clean_tx_done(priv, tx, to_do, true);
        }
        /* If we still have work we want to repoll */
        repoll |= (nic_done != tx->done);
        return repoll;
}