/*
 * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <asm/page.h>
#include <linux/mlx4/cq.h>
#include <linux/slab.h>
#include <linux/mlx4/qp.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <linux/tcp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/moduleparam.h>
#include <linux/indirect_call_wrapper.h>

#include "mlx4_en.h"
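
/* TX datapath: ring setup/teardown, send descriptor construction
 * (inline and LSO WQEs), TX completion processing and XDP_TX transmit.
 */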
int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
			   struct mlx4_en_tx_ring **pring, u32 size,
			   u16 stride, int node, int queue_index)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring;
	int tmp;
	int err;

	ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);
	if (!ring) {
		en_err(priv, "Failed allocating TX ring\n");
		return -ENOMEM;
	}

	ring->size = size;
	ring->size_mask = size - 1;
	ring->sp_stride = stride;
	ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS;

	tmp = size * sizeof(struct mlx4_en_tx_info);
	ring->tx_info = kvmalloc_node(tmp, GFP_KERNEL, node);
	if (!ring->tx_info) {
		err = -ENOMEM;
		goto err_ring;
	}

	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
	       ring->tx_info, tmp);

	ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node);
	if (!ring->bounce_buf) {
		ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
		if (!ring->bounce_buf) {
			err = -ENOMEM;
			goto err_info;
		}
	}
	ring->buf_size = ALIGN(size * ring->sp_stride, MLX4_EN_PAGE_SIZE);

	/* Allocate HW buffers on provided NUMA node */
	set_dev_node(&mdev->dev->persist->pdev->dev, node);
	err = mlx4_alloc_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
	if (err) {
		en_err(priv, "Failed allocating hwq resources\n");
		goto err_bounce;
	}

	ring->buf = ring->sp_wqres.buf.direct.buf;

	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n",
	       ring, ring->buf, ring->size, ring->buf_size,
	       (unsigned long long) ring->sp_wqres.buf.direct.map);

	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn,
				    MLX4_RESERVE_ETH_BF_QP,
				    MLX4_RES_USAGE_DRIVER);
	if (err) {
		en_err(priv, "failed reserving qp for TX ring\n");
		goto err_hwq_res;
	}

	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->sp_qp);
	if (err) {
		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
		goto err_reserve;
	}
	ring->sp_qp.event = mlx4_en_sqp_event;

	err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
	if (err) {
		en_dbg(DRV, priv, "working without blueflame (%d)\n", err);
		ring->bf.uar = &mdev->priv_uar;
		ring->bf.uar->map = mdev->uar_map;
		ring->bf_enabled = false;
		ring->bf_alloced = false;
		priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME;
	} else {
		ring->bf_alloced = true;
		ring->bf_enabled = !!(priv->pflags &
				      MLX4_EN_PRIV_FLAGS_BLUEFLAME);
	}

	ring->hwtstamp_tx_type = priv->hwtstamp_config.tx_type;
	ring->queue_index = queue_index;

	if (queue_index < priv->num_tx_rings_p_up)
		cpumask_set_cpu(cpumask_local_spread(queue_index,
						     priv->mdev->dev->numa_node),
				&ring->sp_affinity_mask);

	*pring = ring;
	return 0;

err_reserve:
	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
err_hwq_res:
	mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
err_bounce:
	kfree(ring->bounce_buf);
	ring->bounce_buf = NULL;
err_info:
	kvfree(ring->tx_info);
	ring->tx_info = NULL;
err_ring:
	kfree(ring);
	return err;
}
void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring **pring)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring = *pring;
	en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);

	if (ring->bf_alloced)
		mlx4_bf_free(mdev->dev, &ring->bf);
	mlx4_qp_remove(mdev->dev, &ring->sp_qp);
	mlx4_qp_free(mdev->dev, &ring->sp_qp);
	mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1);
	mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
	kfree(ring->bounce_buf);
	ring->bounce_buf = NULL;
	kvfree(ring->tx_info);
	ring->tx_info = NULL;
	kfree(ring);
	*pring = NULL;
}
int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring *ring,
			     int cq, int user_prio)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	int err;

	ring->sp_cqn = cq;
	ring->prod = 0;
	ring->cons = 0xffffffff;
	ring->last_nr_txbb = 1;
	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
	memset(ring->buf, 0, ring->buf_size);
	ring->free_tx_desc = mlx4_en_free_tx_desc;

	ring->sp_qp_state = MLX4_QP_STATE_RST;
	ring->doorbell_qpn = cpu_to_be32(ring->sp_qp.qpn << 8);
	ring->mr_key = cpu_to_be32(mdev->mr.key);

	mlx4_en_fill_qp_context(priv, ring->size, ring->sp_stride, 1, 0, ring->qpn,
				ring->sp_cqn, user_prio, &ring->sp_context);
	if (ring->bf_alloced)
		ring->sp_context.usr_page =
			cpu_to_be32(mlx4_to_hw_uar_index(mdev->dev,
							 ring->bf.uar->index));

	err = mlx4_qp_to_ready(mdev->dev, &ring->sp_wqres.mtt, &ring->sp_context,
			       &ring->sp_qp, &ring->sp_qp_state);
	if (!cpumask_empty(&ring->sp_affinity_mask))
		netif_set_xps_queue(priv->dev, &ring->sp_affinity_mask,
				    ring->queue_index);

	return err;
}
void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
				struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_dev *mdev = priv->mdev;

	mlx4_qp_modify(mdev->dev, NULL, ring->sp_qp_state,
		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->sp_qp);
}
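
/* ring->prod and ring->cons are free-running u32 counters, so the unsigned
 * subtraction below counts outstanding TXBBs correctly across wraparound;
 * full_size leaves headroom below the ring size (see mlx4_en_create_tx_ring).
 */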
static inline bool mlx4_en_is_tx_ring_full(struct mlx4_en_tx_ring *ring)
{
	return ring->prod - ring->cons > ring->full_size;
}
static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
			      struct mlx4_en_tx_ring *ring, int index,
			      u8 owner)
{
	__be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
	struct mlx4_en_tx_desc *tx_desc = ring->buf + (index << LOG_TXBB_SIZE);
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	void *end = ring->buf + ring->buf_size;
	__be32 *ptr = (__be32 *)tx_desc;
	int i;

	/* Optimize the common case when there are no wraparounds */
	if (likely((void *)tx_desc +
		   (tx_info->nr_txbb << LOG_TXBB_SIZE) <= end)) {
		/* Stamp the freed descriptor */
		for (i = 0; i < tx_info->nr_txbb << LOG_TXBB_SIZE;
		     i += STAMP_STRIDE) {
			*ptr = stamp;
			ptr += STAMP_DWORDS;
		}
	} else {
		/* Stamp the freed descriptor */
		for (i = 0; i < tx_info->nr_txbb << LOG_TXBB_SIZE;
		     i += STAMP_STRIDE) {
			*ptr = stamp;
			ptr += STAMP_DWORDS;
			if ((void *)ptr >= end) {
				ptr = ring->buf;
				stamp ^= cpu_to_be32(0x80000000);
			}
		}
	}
}
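
/* mlx4_en_free_tx_desc() and mlx4_en_recycle_tx_desc() below are the two
 * possible ring->free_tx_desc() implementations; the completion path selects
 * between them with INDIRECT_CALL_2() in mlx4_en_process_tx_cq().
 */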
INDIRECT_CALLABLE_DECLARE(u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
						   struct mlx4_en_tx_ring *ring,
						   int index, u64 timestamp,
						   int napi_mode));

u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
			 struct mlx4_en_tx_ring *ring,
			 int index, u64 timestamp,
			 int napi_mode)
{
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	struct mlx4_en_tx_desc *tx_desc = ring->buf + (index << LOG_TXBB_SIZE);
	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
	void *end = ring->buf + ring->buf_size;
	struct sk_buff *skb = tx_info->skb;
	int nr_maps = tx_info->nr_maps;
	int i;

	/* We do not touch skb here, so prefetch skb->users location
	 * to speed up consume_skb()
	 */
	prefetchw(&skb->users);

	if (unlikely(timestamp)) {
		struct skb_shared_hwtstamps hwts;

		mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp);
		skb_tstamp_tx(skb, &hwts);
	}

	if (!tx_info->inl) {
		if (tx_info->linear)
			dma_unmap_single(priv->ddev,
					 tx_info->map0_dma,
					 tx_info->map0_byte_count,
					 PCI_DMA_TODEVICE);
		else
			dma_unmap_page(priv->ddev,
				       tx_info->map0_dma,
				       tx_info->map0_byte_count,
				       PCI_DMA_TODEVICE);
		/* Optimize the common case when there are no wraparounds */
		if (likely((void *)tx_desc +
			   (tx_info->nr_txbb << LOG_TXBB_SIZE) <= end)) {
			for (i = 1; i < nr_maps; i++) {
				data++;
				dma_unmap_page(priv->ddev,
					(dma_addr_t)be64_to_cpu(data->addr),
					be32_to_cpu(data->byte_count),
					PCI_DMA_TODEVICE);
			}
		} else {
			if ((void *)data >= end)
				data = ring->buf + ((void *)data - end);

			for (i = 1; i < nr_maps; i++) {
				data++;
				/* Check for wraparound before unmapping */
				if ((void *) data >= end)
					data = ring->buf;
				dma_unmap_page(priv->ddev,
					(dma_addr_t)be64_to_cpu(data->addr),
					be32_to_cpu(data->byte_count),
					PCI_DMA_TODEVICE);
			}
		}
	}
	napi_consume_skb(skb, napi_mode);

	return tx_info->nr_txbb;
}
INDIRECT_CALLABLE_DECLARE(u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
						      struct mlx4_en_tx_ring *ring,
						      int index, u64 timestamp,
						      int napi_mode));

u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
			    struct mlx4_en_tx_ring *ring,
			    int index, u64 timestamp,
			    int napi_mode)
{
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	struct mlx4_en_rx_alloc frame = {
		.page = tx_info->page,
		.dma = tx_info->map0_dma,
	};

	if (!napi_mode || !mlx4_en_rx_recycle(ring->recycle_ring, &frame)) {
		dma_unmap_page(priv->ddev, tx_info->map0_dma,
			       PAGE_SIZE, priv->dma_dir);
		put_page(tx_info->page);
	}

	return tx_info->nr_txbb;
}
int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int cnt = 0;

	/* Skip last polled descriptor */
	ring->cons += ring->last_nr_txbb;
	en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
	       ring->cons, ring->prod);

	if ((u32) (ring->prod - ring->cons) > ring->size) {
		if (netif_msg_tx_err(priv))
			en_warn(priv, "Tx consumer passed producer!\n");
		return 0;
	}

	while (ring->cons != ring->prod) {
		ring->last_nr_txbb = ring->free_tx_desc(priv, ring,
						ring->cons & ring->size_mask,
						0, 0 /* Non-NAPI caller */);
		ring->cons += ring->last_nr_txbb;
		cnt++;
	}

	if (ring->tx_queue)
		netdev_tx_reset_queue(ring->tx_queue);

	if (cnt)
		en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);

	return cnt;
}
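
/* On a completion error, dump the offending CQE and the related WQE for
 * debugging and schedule a one-shot port restart, guarded by
 * MLX4_EN_STATE_FLAG_RESTARTING.
 */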
static void mlx4_en_handle_err_cqe(struct mlx4_en_priv *priv, struct mlx4_err_cqe *err_cqe,
				   u16 cqe_index, struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_info *tx_info;
	struct mlx4_en_tx_desc *tx_desc;
	u16 wqe_index;
	int desc_size;

	en_err(priv, "CQE error - cqn 0x%x, ci 0x%x, vendor syndrome: 0x%x syndrome: 0x%x\n",
	       ring->sp_cqn, cqe_index, err_cqe->vendor_err_syndrome, err_cqe->syndrome);
	print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, err_cqe, sizeof(*err_cqe),
		       false);

	wqe_index = be16_to_cpu(err_cqe->wqe_index) & ring->size_mask;
	tx_info = &ring->tx_info[wqe_index];
	desc_size = tx_info->nr_txbb << LOG_TXBB_SIZE;
	en_err(priv, "Related WQE - qpn 0x%x, wqe index 0x%x, wqe size 0x%x\n", ring->qpn,
	       wqe_index, desc_size);
	tx_desc = ring->buf + (wqe_index << LOG_TXBB_SIZE);
	print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, tx_desc, desc_size, false);

	if (test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state))
		return;

	en_err(priv, "Scheduling port restart\n");
	queue_work(mdev->workqueue, &priv->restart_task);
}
int mlx4_en_process_tx_cq(struct net_device *dev,
			  struct mlx4_en_cq *cq, int napi_budget)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_cq *mcq = &cq->mcq;
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->type][cq->ring];
	struct mlx4_cqe *cqe;
	u16 index, ring_index, stamp_index;
	u32 txbbs_skipped = 0;
	u32 txbbs_stamp = 0;
	u32 cons_index = mcq->cons_index;
	int size = cq->size;
	u32 size_mask = ring->size_mask;
	struct mlx4_cqe *buf = cq->buf;
	u32 packets = 0;
	u32 bytes = 0;
	int factor = priv->cqe_factor;
	int done = 0;
	int budget = priv->tx_work_limit;
	u32 last_nr_txbb;
	u32 ring_cons;

	if (unlikely(!priv->port_up))
		return 0;

	netdev_txq_bql_complete_prefetchw(ring->tx_queue);

	index = cons_index & size_mask;
	cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
	last_nr_txbb = READ_ONCE(ring->last_nr_txbb);
	ring_cons = READ_ONCE(ring->cons);
	ring_index = ring_cons & size_mask;
	stamp_index = ring_index;

	/* Process all completed CQEs */
	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
			cons_index & size) && (done < budget)) {
		u16 new_index;

		/*
		 * make sure we read the CQE after we read the
		 * ownership bit
		 */
		dma_rmb();

		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
			     MLX4_CQE_OPCODE_ERROR))
			if (!test_and_set_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &ring->state))
				mlx4_en_handle_err_cqe(priv, (struct mlx4_err_cqe *)cqe, index,
						       ring);

		/* Skip over last polled CQE */
		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;

		do {
			u64 timestamp = 0;

			txbbs_skipped += last_nr_txbb;
			ring_index = (ring_index + last_nr_txbb) & size_mask;

			if (unlikely(ring->tx_info[ring_index].ts_requested))
				timestamp = mlx4_en_get_cqe_ts(cqe);

			/* free next descriptor */
			last_nr_txbb = INDIRECT_CALL_2(ring->free_tx_desc,
						       mlx4_en_free_tx_desc,
						       mlx4_en_recycle_tx_desc,
						       priv, ring, ring_index,
						       timestamp, napi_budget);

			mlx4_en_stamp_wqe(priv, ring, stamp_index,
					  !!((ring_cons + txbbs_stamp) &
							ring->size));
			stamp_index = ring_index;
			txbbs_stamp = txbbs_skipped;
			packets++;
			bytes += ring->tx_info[ring_index].nr_bytes;
		} while ((++done < budget) && (ring_index != new_index));

		++cons_index;
		index = cons_index & size_mask;
		cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
	}

	/*
	 * To prevent CQ overflow we first update CQ consumer and only then
	 * the ring consumer.
	 */
	mcq->cons_index = cons_index;
	mlx4_cq_set_ci(mcq);
	wmb();

	/* we want to dirty this cache line once */
	WRITE_ONCE(ring->last_nr_txbb, last_nr_txbb);
	WRITE_ONCE(ring->cons, ring_cons + txbbs_skipped);

	if (cq->type == TX_XDP)
		return done;

	netdev_tx_completed_queue(ring->tx_queue, packets, bytes);

	/* Wake up the Tx queue if it was stopped and the ring is no longer
	 * full.
	 */
	if (netif_tx_queue_stopped(ring->tx_queue) &&
	    !mlx4_en_is_tx_ring_full(ring)) {
		netif_tx_wake_queue(ring->tx_queue);
		ring->wake_queue++;
	}

	return done;
}
void mlx4_en_tx_irq(struct mlx4_cq *mcq)
{
	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
	struct mlx4_en_priv *priv = netdev_priv(cq->dev);

	if (likely(priv->port_up))
		napi_schedule_irqoff(&cq->napi);
	else
		mlx4_en_arm_cq(priv, cq);
}
/* TX CQ polling - called by NAPI */
int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget)
{
	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
	struct net_device *dev = cq->dev;
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int work_done;

	work_done = mlx4_en_process_tx_cq(dev, cq, budget);
	if (work_done >= budget)
		return budget;

	if (napi_complete_done(napi, work_done))
		mlx4_en_arm_cq(priv, cq);

	return 0;
}
static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
						      struct mlx4_en_tx_ring *ring,
						      u32 index,
						      unsigned int desc_size)
{
	u32 copy = (ring->size - index) << LOG_TXBB_SIZE;
	int i;

	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
		if ((i & (TXBB_SIZE - 1)) == 0)
			wmb();

		*((u32 *) (ring->buf + i)) =
			*((u32 *) (ring->bounce_buf + copy + i));
	}

	for (i = copy - 4; i >= 4 ; i -= 4) {
		if ((i & (TXBB_SIZE - 1)) == 0)
			wmb();

		*((u32 *)(ring->buf + (index << LOG_TXBB_SIZE) + i)) =
			*((u32 *) (ring->bounce_buf + i));
	}

	/* Return real descriptor location */
	return ring->buf + (index << LOG_TXBB_SIZE);
}
/* Decide if skb can be inlined in tx descriptor to avoid dma mapping
 *
 * It seems strange we do not simply use skb_copy_bits().
 * This would allow inlining all skbs iff skb->len <= inline_thold
 *
 * Note that the caller already checked that skb is not a gso packet
 */
static bool is_inline(int inline_thold, const struct sk_buff *skb,
		      const struct skb_shared_info *shinfo,
		      void **pfrag)
{
	void *ptr;

	if (skb->len > inline_thold || !inline_thold)
		return false;

	if (shinfo->nr_frags == 1) {
		ptr = skb_frag_address_safe(&shinfo->frags[0]);
		if (unlikely(!ptr))
			return false;
		*pfrag = ptr;
		return true;
	}
	if (shinfo->nr_frags)
		return false;
	return true;
}

static int inline_size(const struct sk_buff *skb)
{
	if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
	    <= MLX4_INLINE_ALIGN)
		return ALIGN(skb->len + CTRL_SIZE +
			     sizeof(struct mlx4_wqe_inline_seg), 16);
	else
		return ALIGN(skb->len + CTRL_SIZE + 2 *
			     sizeof(struct mlx4_wqe_inline_seg), 16);
}

static int get_real_size(const struct sk_buff *skb,
			 const struct skb_shared_info *shinfo,
			 struct net_device *dev,
			 int *lso_header_size,
			 bool *inline_ok,
			 void **pfrag)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int real_size;

	if (shinfo->gso_size) {
		*inline_ok = false;
		if (skb->encapsulation)
			*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
		else
			*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
		real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE +
			ALIGN(*lso_header_size + 4, DS_SIZE);
		if (unlikely(*lso_header_size != skb_headlen(skb))) {
			/* We add a segment for the skb linear buffer only if
			 * it contains data */
			if (*lso_header_size < skb_headlen(skb))
				real_size += DS_SIZE;
			else {
				if (netif_msg_tx_err(priv))
					en_warn(priv, "Non-linear headers\n");
				return 0;
			}
		}
	} else {
		*lso_header_size = 0;
		*inline_ok = is_inline(priv->prof->inline_thold, skb,
				       shinfo, pfrag);

		if (*inline_ok)
			real_size = inline_size(skb);
		else
			real_size = CTRL_SIZE +
				    (shinfo->nr_frags + 1) * DS_SIZE;
	}

	return real_size;
}
static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
			     const struct sk_buff *skb,
			     const struct skb_shared_info *shinfo,
			     void *fragptr)
{
	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof(*inl);
	unsigned int hlen = skb_headlen(skb);

	if (skb->len <= spc) {
		if (likely(skb->len >= MIN_PKT_LEN)) {
			inl->byte_count = cpu_to_be32(1 << 31 | skb->len);
		} else {
			inl->byte_count = cpu_to_be32(1 << 31 | MIN_PKT_LEN);
			memset(((void *)(inl + 1)) + skb->len, 0,
			       MIN_PKT_LEN - skb->len);
		}
		skb_copy_from_linear_data(skb, inl + 1, hlen);
		if (shinfo->nr_frags)
			memcpy(((void *)(inl + 1)) + hlen, fragptr,
			       skb_frag_size(&shinfo->frags[0]));

	} else {
		inl->byte_count = cpu_to_be32(1 << 31 | spc);
		if (hlen <= spc) {
			skb_copy_from_linear_data(skb, inl + 1, hlen);
			if (hlen < spc) {
				memcpy(((void *)(inl + 1)) + hlen,
				       fragptr, spc - hlen);
				fragptr += spc - hlen;
			}
			inl = (void *) (inl + 1) + spc;
			memcpy(((void *)(inl + 1)), fragptr, skb->len - spc);
		} else {
			skb_copy_from_linear_data(skb, inl + 1, spc);
			inl = (void *) (inl + 1) + spc;
			skb_copy_from_linear_data_offset(skb, spc, inl + 1,
							 hlen - spc);
			if (shinfo->nr_frags)
				memcpy(((void *)(inl + 1)) + hlen - spc,
				       fragptr,
				       skb_frag_size(&shinfo->frags[0]));
		}

		dma_wmb();
		inl->byte_count = cpu_to_be32(1 << 31 | (skb->len - spc));
	}
}
u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
			 struct net_device *sb_dev)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	u16 rings_p_up = priv->num_tx_rings_p_up;

	if (netdev_get_num_tc(dev))
		return netdev_pick_tx(dev, skb, NULL);

	return netdev_pick_tx(dev, skb, NULL) % rings_p_up;
}
static void mlx4_bf_copy(void __iomem *dst, const void *src,
			 unsigned int bytecnt)
{
	__iowrite64_copy(dst, src, bytecnt / 8);
}

void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring)
{
	wmb();
	/* Since there is no iowrite*_native() that writes the
	 * value as is, without byteswapping - using the one
	 * that doesn't do byteswapping in the relevant arch
	 * endianness.
	 */
#if defined(__LITTLE_ENDIAN)
	iowrite32(
#else
	iowrite32be(
#endif
		  (__force u32)ring->doorbell_qpn,
		  ring->bf.uar->map + MLX4_SEND_DOORBELL);
}
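
/* Publish the descriptor's ownership/opcode and ring the doorbell: either by
 * copying the descriptor into the BlueFlame register (bf_ok) or by a regular
 * doorbell write when one was requested.
 */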
static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring,
				  struct mlx4_en_tx_desc *tx_desc,
				  union mlx4_wqe_qpn_vlan qpn_vlan,
				  int desc_size, int bf_index,
				  __be32 op_own, bool bf_ok,
				  bool send_doorbell)
{
	tx_desc->ctrl.qpn_vlan = qpn_vlan;

	if (bf_ok) {
		op_own |= htonl((bf_index & 0xffff) << 8);
		/* Ensure new descriptor hits memory
		 * before setting ownership of this descriptor to HW
		 */
		dma_wmb();
		tx_desc->ctrl.owner_opcode = op_own;

		wmb();

		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
			     desc_size);

		wmb();

		ring->bf.offset ^= ring->bf.buf_size;
	} else {
		/* Ensure new descriptor hits memory
		 * before setting ownership of this descriptor to HW
		 */
		dma_wmb();
		tx_desc->ctrl.owner_opcode = op_own;
		if (send_doorbell)
			mlx4_en_xmit_doorbell(ring);
		else
			ring->xmit_more++;
	}
}
static bool mlx4_en_build_dma_wqe(struct mlx4_en_priv *priv,
				  struct skb_shared_info *shinfo,
				  struct mlx4_wqe_data_seg *data,
				  struct sk_buff *skb,
				  int lso_header_size,
				  __be32 mr_key,
				  struct mlx4_en_tx_info *tx_info)
{
	struct device *ddev = priv->ddev;
	dma_addr_t dma = 0;
	u32 byte_count = 0;
	int i_frag;

	/* Map fragments if any */
	for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) {
		const skb_frag_t *frag = &shinfo->frags[i_frag];
		byte_count = skb_frag_size(frag);
		dma = skb_frag_dma_map(ddev, frag,
				       0, byte_count,
				       DMA_TO_DEVICE);
		if (dma_mapping_error(ddev, dma))
			goto tx_drop_unmap;

		data->addr = cpu_to_be64(dma);
		data->lkey = mr_key;
		dma_wmb();
		data->byte_count = cpu_to_be32(byte_count);
		--data;
	}

	/* Map linear part if needed */
	if (tx_info->linear) {
		byte_count = skb_headlen(skb) - lso_header_size;

		dma = dma_map_single(ddev, skb->data +
				     lso_header_size, byte_count,
				     PCI_DMA_TODEVICE);
		if (dma_mapping_error(ddev, dma))
			goto tx_drop_unmap;

		data->addr = cpu_to_be64(dma);
		data->lkey = mr_key;
		dma_wmb();
		data->byte_count = cpu_to_be32(byte_count);
	}
	/* tx completion can avoid cache line miss for common cases */
	tx_info->map0_dma = dma;
	tx_info->map0_byte_count = byte_count;

	return true;

tx_drop_unmap:
	en_err(priv, "DMA mapping error\n");

	while (++i_frag < shinfo->nr_frags) {
		++data;
		dma_unmap_page(ddev, (dma_addr_t)be64_to_cpu(data->addr),
			       be32_to_cpu(data->byte_count),
			       PCI_DMA_TODEVICE);
	}

	return false;
}
netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct mlx4_en_priv *priv = netdev_priv(dev);
	union mlx4_wqe_qpn_vlan	qpn_vlan = {};
	struct mlx4_en_tx_ring *ring;
	struct mlx4_en_tx_desc *tx_desc;
	struct mlx4_wqe_data_seg *data;
	struct mlx4_en_tx_info *tx_info;
	u32 __maybe_unused ring_cons;
	int tx_ind;
	int nr_txbb;
	int desc_size;
	int real_size;
	u32 index, bf_index;
	__be32 op_own;
	int lso_header_size;
	void *fragptr = NULL;
	bool bounce = false;
	bool send_doorbell;
	bool stop_queue;
	bool inline_ok;
	u8 data_offset;
	bool bf_ok;

	tx_ind = skb_get_queue_mapping(skb);
	ring = priv->tx_ring[TX][tx_ind];

	if (unlikely(!priv->port_up))
		goto tx_drop;

	real_size = get_real_size(skb, shinfo, dev, &lso_header_size,
				  &inline_ok, &fragptr);
	if (unlikely(!real_size))
		goto tx_drop_count;

	/* Align descriptor to TXBB size */
	desc_size = ALIGN(real_size, TXBB_SIZE);
	nr_txbb = desc_size >> LOG_TXBB_SIZE;
	if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
		if (netif_msg_tx_err(priv))
			en_warn(priv, "Oversized header or SG list\n");
		goto tx_drop_count;
	}

	bf_ok = ring->bf_enabled;
	if (skb_vlan_tag_present(skb)) {
		u16 vlan_proto;

		qpn_vlan.vlan_tag = cpu_to_be16(skb_vlan_tag_get(skb));
		vlan_proto = be16_to_cpu(skb->vlan_proto);
		if (vlan_proto == ETH_P_8021AD)
			qpn_vlan.ins_vlan = MLX4_WQE_CTRL_INS_SVLAN;
		else if (vlan_proto == ETH_P_8021Q)
			qpn_vlan.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN;
		else
			qpn_vlan.ins_vlan = 0;
		bf_ok = false;
	}

	netdev_txq_bql_enqueue_prefetchw(ring->tx_queue);

	/* Packet is good - grab an index and transmit it */
	index = ring->prod & ring->size_mask;
	bf_index = ring->prod;

	/* See if we have enough space for whole descriptor TXBB for setting
	 * SW ownership on next descriptor; if not, use a bounce buffer. */
	if (likely(index + nr_txbb <= ring->size))
		tx_desc = ring->buf + (index << LOG_TXBB_SIZE);
	else {
		tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
		bounce = true;
		bf_ok = false;
	}

	/* Save skb in tx_info ring */
	tx_info = &ring->tx_info[index];
	tx_info->skb = skb;
	tx_info->nr_txbb = nr_txbb;

	if (!lso_header_size) {
		data = &tx_desc->data;
		data_offset = offsetof(struct mlx4_en_tx_desc, data);
	} else {
		int lso_align = ALIGN(lso_header_size + 4, DS_SIZE);

		data = (void *)&tx_desc->lso + lso_align;
		data_offset = offsetof(struct mlx4_en_tx_desc, lso) + lso_align;
	}

	/* valid only for non-inline segments */
	tx_info->data_offset = data_offset;

	tx_info->inl = inline_ok;

	tx_info->linear = lso_header_size < skb_headlen(skb) && !inline_ok;

	tx_info->nr_maps = shinfo->nr_frags + tx_info->linear;
	data += tx_info->nr_maps - 1;

	if (!tx_info->inl)
		if (!mlx4_en_build_dma_wqe(priv, shinfo, data, skb,
					   lso_header_size, ring->mr_key,
					   tx_info))
			goto tx_drop_count;

	/*
	 * For timestamping add flag to skb_shinfo and
	 * set flag for further reference
	 */
	tx_info->ts_requested = 0;
	if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON &&
		     shinfo->tx_flags & SKBTX_HW_TSTAMP)) {
		shinfo->tx_flags |= SKBTX_IN_PROGRESS;
		tx_info->ts_requested = 1;
	}

	/* Prepare ctrl segment apart from opcode+ownership, which depends on
	 * whether LSO is used */
	tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
	if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
		if (!skb->encapsulation)
			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
								 MLX4_WQE_CTRL_TCP_UDP_CSUM);
		else
			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
		ring->tx_csum++;
	}

	if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) {
		struct ethhdr *ethh;

		/* Copy dst mac address to wqe. This allows loopback in eSwitch,
		 * so that VFs and PF can communicate with each other
		 */
		ethh = (struct ethhdr *)skb->data;
		tx_desc->ctrl.srcrb_flags16[0] = get_unaligned((__be16 *)ethh->h_dest);
		tx_desc->ctrl.imm = get_unaligned((__be32 *)(ethh->h_dest + 2));
	}

	/* Handle LSO (TSO) packets */
	if (lso_header_size) {
		int i;

		/* Mark opcode as LSO */
		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
			((ring->prod & ring->size) ?
				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);

		/* Fill in the LSO prefix */
		tx_desc->lso.mss_hdr_size = cpu_to_be32(
			shinfo->gso_size << 16 | lso_header_size);

		/* Copy headers;
		 * note that we already verified that it is linear */
		memcpy(tx_desc->lso.header, skb->data, lso_header_size);

		ring->tso_packets++;

		i = shinfo->gso_segs;
		tx_info->nr_bytes = skb->len + (i - 1) * lso_header_size;
		ring->packets += i;
	} else {
		/* Normal (Non LSO) packet */
		op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
			((ring->prod & ring->size) ?
			 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
		tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
		ring->packets++;
	}
	ring->bytes += tx_info->nr_bytes;

	if (tx_info->inl)
		build_inline_wqe(tx_desc, skb, shinfo, fragptr);

	if (skb->encapsulation) {
		union {
			struct iphdr *v4;
			struct ipv6hdr *v6;
			unsigned char *hdr;
		} ip;
		u8 proto;

		ip.hdr = skb_inner_network_header(skb);
		proto = (ip.v4->version == 4) ? ip.v4->protocol :
						ip.v6->nexthdr;

		if (proto == IPPROTO_TCP || proto == IPPROTO_UDP)
			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_ILP);
		else
			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP);
	}

	ring->prod += nr_txbb;

	/* If we used a bounce buffer then copy descriptor back into place */
	if (unlikely(bounce))
		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);

	skb_tx_timestamp(skb);

	/* Check available TXBBs and 2K spare for prefetch */
	stop_queue = mlx4_en_is_tx_ring_full(ring);
	if (unlikely(stop_queue)) {
		netif_tx_stop_queue(ring->tx_queue);
		ring->queue_stopped++;
	}

	send_doorbell = __netdev_tx_sent_queue(ring->tx_queue,
					       tx_info->nr_bytes,
					       netdev_xmit_more());

	real_size = (real_size / 16) & 0x3f;

	bf_ok &= desc_size <= MAX_BF && send_doorbell;

	if (bf_ok)
		qpn_vlan.bf_qpn = ring->doorbell_qpn | cpu_to_be32(real_size);
	else
		qpn_vlan.fence_size = real_size;

	mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, desc_size, bf_index,
			      op_own, bf_ok, send_doorbell);

	if (unlikely(stop_queue)) {
		/* If the queue was emptied between the stop_queue check and
		 * netif_tx_stop_queue(), the queue must be woken here, or
		 * else it will remain stopped forever.
		 * Need a memory barrier to make sure ring->cons was not
		 * updated before the queue was stopped.
		 */
		smp_rmb();

		if (unlikely(!mlx4_en_is_tx_ring_full(ring))) {
			netif_tx_wake_queue(ring->tx_queue);
			ring->wake_queue++;
		}
	}
	return NETDEV_TX_OK;

tx_drop_count:
	ring->tx_dropped++;
tx_drop:
	dev_kfree_skb_any(skb);
	return NETDEV_TX_OK;
}
#define MLX4_EN_XDP_TX_NRTXBB  1
#define MLX4_EN_XDP_TX_REAL_SZ (((CTRL_SIZE + MLX4_EN_XDP_TX_NRTXBB * DS_SIZE) \
				 / 16) & 0x3f)

void mlx4_en_init_tx_xdp_ring_descs(struct mlx4_en_priv *priv,
				    struct mlx4_en_tx_ring *ring)
{
	int i;

	for (i = 0; i < ring->size; i++) {
		struct mlx4_en_tx_info *tx_info = &ring->tx_info[i];
		struct mlx4_en_tx_desc *tx_desc = ring->buf +
			(i << LOG_TXBB_SIZE);

		tx_info->map0_byte_count = PAGE_SIZE;
		tx_info->nr_txbb = MLX4_EN_XDP_TX_NRTXBB;
		tx_info->data_offset = offsetof(struct mlx4_en_tx_desc, data);
		tx_info->ts_requested = 0;
		tx_info->nr_maps = 1;
		tx_info->linear = 1;

		tx_desc->data.lkey = ring->mr_key;
		tx_desc->ctrl.qpn_vlan.fence_size = MLX4_EN_XDP_TX_REAL_SZ;
		tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
	}
}
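
/* XDP_TX: transmit an RX page directly. Each frame uses a single TXBB whose
 * static fields were pre-initialized by mlx4_en_init_tx_xdp_ring_descs().
 */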
netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
			       struct mlx4_en_rx_alloc *frame,
			       struct mlx4_en_priv *priv, unsigned int length,
			       int tx_ind, bool *doorbell_pending)
{
	struct mlx4_en_tx_desc *tx_desc;
	struct mlx4_en_tx_info *tx_info;
	struct mlx4_wqe_data_seg *data;
	struct mlx4_en_tx_ring *ring;
	dma_addr_t dma;
	__be32 op_own;
	int index;

	if (unlikely(!priv->port_up))
		goto tx_drop;

	ring = priv->tx_ring[TX_XDP][tx_ind];

	if (unlikely(mlx4_en_is_tx_ring_full(ring)))
		goto tx_drop_count;

	index = ring->prod & ring->size_mask;
	tx_info = &ring->tx_info[index];

	tx_desc = ring->buf + (index << LOG_TXBB_SIZE);
	data = &tx_desc->data;

	dma = frame->dma;

	tx_info->page = frame->page;
	frame->page = NULL;
	tx_info->map0_dma = dma;
	tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN);

	dma_sync_single_range_for_device(priv->ddev, dma, frame->page_offset,
					 length, PCI_DMA_TODEVICE);

	data->addr = cpu_to_be64(dma + frame->page_offset);
	dma_wmb();
	data->byte_count = cpu_to_be32(length);

	/* tx completion can avoid cache line miss for common cases */

	op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
		((ring->prod & ring->size) ?
		 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);

	rx_ring->xdp_tx++;

	ring->prod += MLX4_EN_XDP_TX_NRTXBB;

	/* Ensure new descriptor hits memory
	 * before setting ownership of this descriptor to HW
	 */
	dma_wmb();
	tx_desc->ctrl.owner_opcode = op_own;
	ring->xmit_more++;

	*doorbell_pending = true;

	return NETDEV_TX_OK;

tx_drop_count:
	rx_ring->xdp_tx_full++;
	*doorbell_pending = true;
tx_drop:
	return NETDEV_TX_BUSY;
}