// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright(c) 2020 - 2023 Cornelis Networks, Inc.
 * Copyright(c) 2015 - 2018 Intel Corporation.
 */

#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);

	write_seqlock(&sde->waitlock);
	trace_hfi1_usdma_defer(pq, sde, &pq->busy);
	if (sdma_progress(sde, seq, txreq))
		goto eagain;
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	if (list_empty(&pq->busy.list)) {
		pq->busy.lock = &sde->waitlock;
		iowait_get_priority(&pq->busy);
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
	}
	write_sequnlock(&sde->waitlock);
	return -EBUSY;
eagain:
	write_sequnlock(&sde->waitlock);
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);

	trace_hfi1_usdma_activate(pq, wait, reason);
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
}
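
/*
 * Taken together, defer_packet_queue() and activate_packet_queue() are the
 * sleep/wakeup pair registered with iowait_init() below: the engine invokes
 * the former when it cannot accept more descriptors (the queue is parked on
 * sde->dmawait and flagged SDMA_PKT_Q_DEFERRED), and the latter once space
 * frees up again (the queue is flagged SDMA_PKT_Q_ACTIVE and any submitter
 * sleeping in hfi1_user_sdma_process_request() is woken).  This is a summary
 * of the two callbacks above; the wait/wake plumbing itself lives in the
 * iowait and sdma layers.
 */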

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
{
	int ret = -ENOMEM;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

	if (!uctxt || !fd)
		return -EBADF;

	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		return -ENOMEM;
	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);

	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
		    activate_packet_queue, NULL, NULL);

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    NULL);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;

	ret = hfi1_init_system_pinning(pq);
	if (ret)
		goto pq_mmu_fail;

	rcu_assign_pointer(fd->pq, pq);
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	bitmap_free(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);

	return ret;
}

static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
{
	unsigned long flags;
	seqlock_t *lock = pq->busy.lock;

	if (!lock)
		return;
	write_seqlock_irqsave(lock, flags);
	if (!list_empty(&pq->busy.list)) {
		list_del_init(&pq->busy.list);
		pq->busy.lock = NULL;
	}
	write_sequnlock_irqrestore(lock, flags);
}
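
/*
 * Note on flush_pq_iowait(): pq->busy.lock is only set (to the engine's
 * waitlock) while the queue sits on that engine's dmawait list, in
 * defer_packet_queue() above, and it is cleared here under the same seqlock
 * when the entry is removed.  Reading the pointer into a local first is what
 * lets the "not queued anywhere" case bail out without taking any lock; this
 * is a reading of the code above, not additional behavior.
 */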

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
{
	struct hfi1_user_sdma_pkt_q *pq;

	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

	spin_lock(&fd->pq_rcu_lock);
	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
				    lockdep_is_held(&fd->pq_rcu_lock));
	if (pq) {
		rcu_assign_pointer(fd->pq, NULL);
		spin_unlock(&fd->pq_rcu_lock);
		synchronize_srcu(&fd->pq_srcu);
		/* at this point there can be no more new requests */
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			!atomic_read(&pq->n_reqs));
		kfree(pq->reqs);
		hfi1_free_system_pinning(pq);
		bitmap_free(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		flush_pq_iowait(pq);
		kfree(pq);
	} else {
		spin_unlock(&fd->pq_rcu_lock);
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}
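
/*
 * Worked example for dlid_to_selector(), with made-up values: DLID 0x1234
 * hashes to ((0x12 ^ 0x34) & 0xFF) = 0x26.  The first DLID landing in bucket
 * 0x26 claims the next selector (0, 1, 2, ... wrapping at 128 due to the
 * "& 0x7F"); later DLIDs with the same hash reuse it.  The selector is then
 * combined with the context numbers and handed to sdma_select_user_engine()
 * to spread distinct destinations across SDMA engines.
 */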

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq =
		srcu_dereference(fd->pq, &fd->pq_srcu);
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	u16 pkey;
	u32 slid;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
		   SDMA,
		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
		   dd->unit, uctxt->ctxt, fd->subctxt,
		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count.  Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 */
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
	req = pq->reqs + info.comp_idx;
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->data_len = 0;
	req->pq = pq;
	req->cq = cq;
	req->ahg_idx = -1;
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
	req->has_error = 0;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	/* The request is initialized, count it */
	atomic_inc(&pq->n_reqs);

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected must have a TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}

	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	     USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * Also should check the BTH.lnh. If it says the next header is GRH then
	 * the RXE parsing will be off and will land in the middle of the KDETH
	 * or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			   KDETH_OM_LARGE : KDETH_OM_SMALL);
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		req->iovs[i].offset = 0;
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		if (req->iovs[i].iov.iov_len == 0) {
			ret = -EINVAL;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;

	/*
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * setup. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_array_user(iovec[idx].iov_base,
					ntids, sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		req->tididx = 0;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	pq->state = SDMA_PKT_Q_ACTIVE;

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (req->seqsubmitted != req->info.npkts) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			int we_ret;

			if (ret != -EBUSY)
				goto free_req;
			we_ret = wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				pq->state == SDMA_PKT_Q_ACTIVE,
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
			trace_hfi1_usdma_we(pq, we_ret);
			if (we_ret <= 0)
				flush_pq_iowait(pq);
		}
	}
	*count += idx;
	return 0;
free_req:
	/*
	 * If the submitted seqsubmitted == npkts, the completion routine
	 * controls the final state. If seqsubmitted < npkts, wait for any
	 * outstanding packets to finish before cleaning up.
	 */
	if (req->seqsubmitted < req->info.npkts) {
		if (req->seqsubmitted)
			wait_event(pq->busy.wait_dma,
				   (req->seqcomp == req->seqsubmitted - 1));
		user_sdma_free_request(req);
		pq_update(pq);
		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
	}
	return ret;
}
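
/*
 * Life cycle of a request submitted above, restating this function and
 * user_sdma_txreq_cb() (the user-space side is assumed, not shown here):
 * completion slot comp_idx goes QUEUED as soon as the request is claimed,
 * the call returns once every packet has been handed to the SDMA engine,
 * and the slot flips to COMPLETE or ERROR from the interrupt-level callback
 * when the last descriptor finishes.  User space is expected to watch
 * cq->comps[comp_idx].status for the final state.
 */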

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4 bytes,
	 * therefore, when the data length request is less than 4 bytes, there's
	 * only one packet, and the packet data length is equal to that of the
	 * request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
	return len;
}

static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}
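
/*
 * pad_len() rounds a payload length up to the next 4-byte boundary, e.g.
 * 7 -> 8, 8 -> 8, 9 -> 12 (example values only).
 */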

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}
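
/*
 * In other words, get_lrh_len() yields the on-the-wire length the LRH must
 * describe: every header byte after the PBC, plus the 4-byte ICRC, plus the
 * (padded) payload supplied by the caller.  A hypothetical 4096-byte
 * fragment would give
 *
 *	lrhlen = get_lrh_len(req->hdr, pad_len(4096));
 *
 * which equals (sizeof(hdr) - sizeof(hdr.pbc)) + 4 + 4096.
 */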

static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	int ret;
	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	/*
	 * Copy the request header into the tx header
	 * because the HW needs a cacheline-aligned
	 * address.
	 * This copy can be optimized out if the hdr
	 * member of user_sdma_request were also
	 * cacheline aligned.
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
	}
	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
	if (ret)
		return ret;
	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			      sizeof(tx->hdr) + datalen, req->ahg_idx,
			      0, NULL, 0, user_sdma_txreq_cb);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (ret)
		sdma_txclean(pq->dd, &tx->txreq);
	return ret;
}

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
	int ret = 0;
	u16 count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_tx;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for the payload <= 8DWS.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less
			 * than or equal to 8DWS then the
			 * RxDmaDataFifoRdUncErr is not reported; instead,
			 * RHF.EccErr is set if the header is not suppressed.
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0) {
					ret = changes;
					goto free_tx;
				}
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += datalen;
		while (datalen) {
			ret = hfi1_add_pages_to_sdma_packet(req, tx, iovec,
							    &datalen);
			if (ret)
				goto free_txreq;
			iovec = &req->iovs[req->iov_idx];
		}
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde,
			       iowait_get_ib_work(&pq->busy),
			       &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}
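
/*
 * Bookkeeping used by user_sdma_send_pkts() and its completion path,
 * restated from the code above: req->seqnum counts packets built and queued
 * on req->txps, req->seqsubmitted counts packets accepted by
 * sdma_send_txlist(), and req->seqcomp (updated in user_sdma_txreq_cb())
 * records the last packet the DMA engine has finished.  The request is torn
 * down only when seqcomp reaches npkts - 1, or, on the error path, once it
 * catches up with seqsubmitted - 1.
 */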

static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is multiple of 64 bytes
	 * - packet length is multiple of 4 bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			   KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}
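
/*
 * Example of the two PSN styles handled above, with made-up numbers: for an
 * eager transfer the PSN simply advances, so frags = 3 turns 0x000010 into
 * 0x000013.  For an expected (TID) transfer only the low
 * HFI1_KDETH_BTH_SEQ_MASK bits advance while the generation bits above them
 * are preserved, which is why a plain "psn + frags" would corrupt the header
 * once the sequence field wraps.
 */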

static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs, all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
			KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int idx = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
	size_t array_size = ARRAY_SIZE(ahg);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
		if (idx < 0)
			return idx;
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				     (__force u16)cpu_to_be16(lrhlen >> 2));
		if (idx < 0)
			return idx;
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
			     (__force u16)cpu_to_be16(val32 >> 16));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
			     (__force u16)cpu_to_be16(val32 & 0xffff));
	if (idx < 0)
		return idx;
	/* KDETH.Offset */
	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
			     (__force u16)cpu_to_le16(req->koffset >> 16));
	if (idx < 0)
		return idx;
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs, all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
				  PAGE_SIZE) >=
				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
				 KDETH_OM_SMALL_SHIFT;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		idx = ahg_header_set(
				ahg, idx, array_size, 7, 0, 16,
				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
				((req->tidoffset >> omfactor)
				 & 0x7fff)));
		if (idx < 0)
			return idx;
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				  (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		}

		idx = ahg_header_set(ahg, idx, array_size,
				     7, 16, 14, (__force u16)val);
		if (idx < 0)
			return idx;
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, idx, tidval);
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, idx,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return idx;
}

/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	enum hfi1_sdma_comp_state state = COMPLETE;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		WRITE_ONCE(req->has_error, 1);
		state = ERROR;
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);

	/* sequence isn't complete?  We are done */
	if (req->seqcomp != req->info.npkts - 1)
		return;

	user_sdma_free_request(req);
	set_comp_state(pq, cq, req->info.comp_idx, state, status);
	pq_update(pq);
}
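
/*
 * Ordering note for user_sdma_txreq_cb(), restating the code above: each
 * callback records its sequence number and frees its tx, but only the
 * callback that observes seqcomp == npkts - 1 releases the request and
 * publishes the final completion state.  Everything here runs in interrupt
 * context (see the kernel-doc above), so the path deliberately avoids any
 * sleeping calls.
 */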

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs))
		wake_up(&pq->wait);
}

static void user_sdma_free_request(struct user_sdma_request *req)
{
	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}

	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}
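
/*
 * The smp_wmb() above orders errcode before status.  A consumer that maps
 * the completion ring is therefore expected to read in the opposite order
 * with a matching read barrier; roughly, as a sketch of the assumed
 * user-space side (not code from this driver):
 *
 *	state = READ_ONCE(comps[idx].status);
 *	smp_rmb();
 *	if (state == ERROR)
 *		err = comps[idx].errcode;
 */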