2 * Copyright(c) 2016 Intel Corporation.
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
20 * Redistribution and use in source and binary forms, with or without
21 * modification, are permitted provided that the following conditions
24 * - Redistributions of source code must retain the above copyright
25 * notice, this list of conditions and the following disclaimer.
26 * - Redistributions in binary form must reproduce the above copyright
27 * notice, this list of conditions and the following disclaimer in
28 * the documentation and/or other materials provided with the
30 * - Neither the name of Intel Corporation nor the names of its
31 * contributors may be used to endorse or promote products derived
32 * from this software without specific prior written permission.
34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 #include <linux/hash.h>
49 #include <linux/bitops.h>
50 #include <linux/lockdep.h>
51 #include <linux/vmalloc.h>
52 #include <linux/slab.h>
53 #include <rdma/ib_verbs.h>
54 #include <rdma/ib_hdrs.h>
59 static void rvt_rc_timeout(unsigned long arg);
62 * Convert the AETH RNR timeout code into the number of microseconds.
64 static const u32 ib_rvt_rnr_table[32] = {
65 655360, /* 00: 655.36 */
85 10240, /* 14: 10.24 */
86 15360, /* 15: 15.36 */
87 20480, /* 16: 20.48 */
88 30720, /* 17: 30.72 */
89 40960, /* 18: 40.96 */
90 61440, /* 19: 61.44 */
91 81920, /* 1A: 81.92 */
92 122880, /* 1B: 122.88 */
93 163840, /* 1C: 163.84 */
94 245760, /* 1D: 245.76 */
95 327680, /* 1E: 327.68 */
96 491520 /* 1F: 491.52 */
100 * Note that it is OK to post send work requests in the SQE and ERR
101 * states; rvt_do_send() will process them and generate error
102 * completions as per IB 1.2 C10-96.
104 const int ib_rvt_state_ops[IB_QPS_ERR + 1] = {
106 [IB_QPS_INIT] = RVT_POST_RECV_OK,
107 [IB_QPS_RTR] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK,
108 [IB_QPS_RTS] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
109 RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK |
110 RVT_PROCESS_NEXT_SEND_OK,
111 [IB_QPS_SQD] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
112 RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK,
113 [IB_QPS_SQE] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
114 RVT_POST_SEND_OK | RVT_FLUSH_SEND,
115 [IB_QPS_ERR] = RVT_POST_RECV_OK | RVT_FLUSH_RECV |
116 RVT_POST_SEND_OK | RVT_FLUSH_SEND,
118 EXPORT_SYMBOL(ib_rvt_state_ops);
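/*
 * Editor's illustrative sketch (not part of the original file): callers
 * gate work on this table by masking the flags for the current state, e.g.
 *
 *	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
 *		return;		(QP is not in a state that may receive)
 *
 * rvt_post_send() and rvt_post_recv() below use the same pattern with
 * RVT_POST_SEND_OK and RVT_POST_RECV_OK.
 */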
121 * Translate ib_wr_opcode into ib_wc_opcode.
123 const enum ib_wc_opcode ib_rvt_wc_opcode[] = {
124 [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
125 [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
126 [IB_WR_SEND] = IB_WC_SEND,
127 [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
128 [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
129 [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
130 [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
131 [IB_WR_SEND_WITH_INV] = IB_WC_SEND,
132 [IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV,
133 [IB_WR_REG_MR] = IB_WC_REG_MR
135 EXPORT_SYMBOL(ib_rvt_wc_opcode);
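/*
 * Editor's note: a completion's opcode can be looked up directly from the
 * opcode that was posted, e.g.
 *
 *	wc.opcode = ib_rvt_wc_opcode[wqe->wr.opcode];
 *
 * so both IB_WR_RDMA_WRITE and IB_WR_RDMA_WRITE_WITH_IMM report
 * IB_WC_RDMA_WRITE, and all of the SEND variants report IB_WC_SEND.
 */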
137 static void get_map_page(struct rvt_qpn_table *qpt,
138 struct rvt_qpn_map *map,
141 unsigned long page = get_zeroed_page(gfp);
144 * Free the page if someone raced with us installing it.
147 spin_lock(&qpt->lock);
151 map->page = (void *)page;
152 spin_unlock(&qpt->lock);
156 * init_qpn_table - initialize the QP number table for a device
157 * @qpt: the QPN table
159 static int init_qpn_table(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt)
162 struct rvt_qpn_map *map;
165 if (!(rdi->dparms.qpn_res_end >= rdi->dparms.qpn_res_start))
168 spin_lock_init(&qpt->lock);
170 qpt->last = rdi->dparms.qpn_start;
171 qpt->incr = rdi->dparms.qpn_inc << rdi->dparms.qos_shift;
174 * Drivers may want some QPs beyond what is needed for verbs; let them use
175 * our qpn table rather than keeping a second one. Go ahead and mark the
176 * bitmaps for those. The reserved range must be *after* the range which verbs
180 /* Figure out number of bit maps needed before reserved range */
181 qpt->nmaps = rdi->dparms.qpn_res_start / RVT_BITS_PER_PAGE;
183 /* This should always be zero */
184 offset = rdi->dparms.qpn_res_start & RVT_BITS_PER_PAGE_MASK;
186 /* Starting with the first reserved bit map */
187 map = &qpt->map[qpt->nmaps];
189 rvt_pr_info(rdi, "Reserving QPNs from 0x%x to 0x%x for non-verbs use\n",
190 rdi->dparms.qpn_res_start, rdi->dparms.qpn_res_end);
191 for (i = rdi->dparms.qpn_res_start; i <= rdi->dparms.qpn_res_end; i++) {
193 get_map_page(qpt, map, GFP_KERNEL);
199 set_bit(offset, map->page);
201 if (offset == RVT_BITS_PER_PAGE) {
212 * free_qpn_table - free the QP number table for a device
213 * @qpt: the QPN table
215 static void free_qpn_table(struct rvt_qpn_table *qpt)
219 for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
220 free_page((unsigned long)qpt->map[i].page);
224 * rvt_driver_qp_init - Init driver qp resources
225 * @rdi: rvt dev structure
227 * Return: 0 on success
229 int rvt_driver_qp_init(struct rvt_dev_info *rdi)
234 if (!rdi->dparms.qp_table_size)
238 * If driver is not doing any QP allocation then make sure it is
239 * providing the necessary QP functions.
241 if (!rdi->driver_f.free_all_qps ||
242 !rdi->driver_f.qp_priv_alloc ||
243 !rdi->driver_f.qp_priv_free ||
244 !rdi->driver_f.notify_qp_reset ||
245 !rdi->driver_f.notify_restart_rc)
248 /* allocate parent object */
249 rdi->qp_dev = kzalloc_node(sizeof(*rdi->qp_dev), GFP_KERNEL,
254 /* allocate hash table */
255 rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size;
256 rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size);
257 rdi->qp_dev->qp_table =
258 kmalloc_node(rdi->qp_dev->qp_table_size *
259 sizeof(*rdi->qp_dev->qp_table),
260 GFP_KERNEL, rdi->dparms.node);
261 if (!rdi->qp_dev->qp_table)
264 for (i = 0; i < rdi->qp_dev->qp_table_size; i++)
265 RCU_INIT_POINTER(rdi->qp_dev->qp_table[i], NULL);
267 spin_lock_init(&rdi->qp_dev->qpt_lock);
269 /* initialize qpn map */
270 if (init_qpn_table(rdi, &rdi->qp_dev->qpn_table))
273 spin_lock_init(&rdi->n_qps_lock);
278 kfree(rdi->qp_dev->qp_table);
279 free_qpn_table(&rdi->qp_dev->qpn_table);
288 * rvt_free_all_qps - check for QPs still in use
289 * @rdi: rvt device info structure
291 * There should not be any QPs still in use.
292 * Free memory for table.
294 static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi)
298 unsigned n, qp_inuse = 0;
299 spinlock_t *ql; /* work around too long line below */
301 if (rdi->driver_f.free_all_qps)
302 qp_inuse = rdi->driver_f.free_all_qps(rdi);
304 qp_inuse += rvt_mcast_tree_empty(rdi);
309 ql = &rdi->qp_dev->qpt_lock;
310 spin_lock_irqsave(ql, flags);
311 for (n = 0; n < rdi->qp_dev->qp_table_size; n++) {
312 qp = rcu_dereference_protected(rdi->qp_dev->qp_table[n],
313 lockdep_is_held(ql));
314 RCU_INIT_POINTER(rdi->qp_dev->qp_table[n], NULL);
316 for (; qp; qp = rcu_dereference_protected(qp->next,
317 lockdep_is_held(ql)))
320 spin_unlock_irqrestore(ql, flags);
326 * rvt_qp_exit - clean up qps on device exit
327 * @rdi: rvt dev structure
329 * Check for qp leaks and free resources.
331 void rvt_qp_exit(struct rvt_dev_info *rdi)
333 u32 qps_inuse = rvt_free_all_qps(rdi);
336 rvt_pr_err(rdi, "QP memory leak! %u still in use\n",
341 kfree(rdi->qp_dev->qp_table);
342 free_qpn_table(&rdi->qp_dev->qpn_table);
346 static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
347 struct rvt_qpn_map *map, unsigned off)
349 return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
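/*
 * Editor's note: mk_qpn() is the inverse of the decomposition used by
 * alloc_qpn() and free_qpn() in this file:
 *
 *	map    = &qpt->map[qpn / RVT_BITS_PER_PAGE];
 *	offset = qpn & RVT_BITS_PER_PAGE_MASK;
 *
 * so a (map, offset) pair maps back to
 * (map - qpt->map) * RVT_BITS_PER_PAGE + offset.
 */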
353 * alloc_qpn - Allocate the next available qpn or zero/one for QP type
354 * IB_QPT_SMI/IB_QPT_GSI
355 * @rdi: rvt device info structure
356 * @qpt: queue pair number table pointer
357 * @port_num: IB port number, 1 based, comes from core
359 * Return: The queue pair number
361 static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
362 enum ib_qp_type type, u8 port_num, gfp_t gfp)
364 u32 i, offset, max_scan, qpn;
365 struct rvt_qpn_map *map;
368 if (rdi->driver_f.alloc_qpn)
369 return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num, gfp);
371 if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
374 ret = type == IB_QPT_GSI;
375 n = 1 << (ret + 2 * (port_num - 1));
376 spin_lock(&qpt->lock);
381 spin_unlock(&qpt->lock);
385 qpn = qpt->last + qpt->incr;
386 if (qpn >= RVT_QPN_MAX)
387 qpn = qpt->incr | ((qpt->last & 1) ^ 1);
388 /* offset carries bit 0 */
389 offset = qpn & RVT_BITS_PER_PAGE_MASK;
390 map = &qpt->map[qpn / RVT_BITS_PER_PAGE];
391 max_scan = qpt->nmaps - !offset;
393 if (unlikely(!map->page)) {
394 get_map_page(qpt, map, gfp);
395 if (unlikely(!map->page))
399 if (!test_and_set_bit(offset, map->page)) {
406 * This qpn might be bogus if offset >= BITS_PER_PAGE.
407 * That is OK. It gets re-assigned below
409 qpn = mk_qpn(qpt, map, offset);
410 } while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX);
412 * In order to keep the number of pages allocated to a
413 * minimum, we scan all the existing pages before increasing
414 * the size of the bitmap table.
416 if (++i > max_scan) {
417 if (qpt->nmaps == RVT_QPNMAP_ENTRIES)
419 map = &qpt->map[qpt->nmaps++];
420 /* start at incr with current bit 0 */
421 offset = qpt->incr | (offset & 1);
422 } else if (map < &qpt->map[qpt->nmaps]) {
424 /* start at incr with current bit 0 */
425 offset = qpt->incr | (offset & 1);
428 /* wrap to first map page, invert bit 0 */
429 offset = qpt->incr | ((offset & 1) ^ 1);
431 /* there can be no set bits in low-order QoS bits */
432 WARN_ON(offset & (BIT(rdi->dparms.qos_shift) - 1));
433 qpn = mk_qpn(qpt, map, offset);
442 static void free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
444 struct rvt_qpn_map *map;
446 map = qpt->map + qpn / RVT_BITS_PER_PAGE;
448 clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
452 * rvt_clear_mr_refs - Drop held MR references
453 * @qp: rvt qp data structure
454 * @clr_sends: Whether the send side should be cleared or not
456 static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
459 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
461 if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags))
462 rvt_put_ss(&qp->s_rdma_read_sge);
464 rvt_put_ss(&qp->r_sge);
467 while (qp->s_last != qp->s_head) {
468 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);
471 for (i = 0; i < wqe->wr.num_sge; i++) {
472 struct rvt_sge *sge = &wqe->sg_list[i];
476 if (qp->ibqp.qp_type == IB_QPT_UD ||
477 qp->ibqp.qp_type == IB_QPT_SMI ||
478 qp->ibqp.qp_type == IB_QPT_GSI)
479 atomic_dec(&ibah_to_rvtah(
480 wqe->ud_wr.ah)->refcount);
481 if (++qp->s_last >= qp->s_size)
483 smp_wmb(); /* see qp_set_savail */
486 rvt_put_mr(qp->s_rdma_mr);
487 qp->s_rdma_mr = NULL;
491 if (qp->ibqp.qp_type != IB_QPT_RC)
494 for (n = 0; n < rvt_max_atomic(rdi); n++) {
495 struct rvt_ack_entry *e = &qp->s_ack_queue[n];
497 if (e->rdma_sge.mr) {
498 rvt_put_mr(e->rdma_sge.mr);
499 e->rdma_sge.mr = NULL;
505 * rvt_remove_qp - remove qp from the table
506 * @rdi: rvt dev struct
509 * Remove the QP from the table so it can't be found asynchronously by
510 * the receive routine.
512 static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
514 struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
515 u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
519 spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
521 if (rcu_dereference_protected(rvp->qp[0],
522 lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
523 RCU_INIT_POINTER(rvp->qp[0], NULL);
524 } else if (rcu_dereference_protected(rvp->qp[1],
525 lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
526 RCU_INIT_POINTER(rvp->qp[1], NULL);
529 struct rvt_qp __rcu **qpp;
532 qpp = &rdi->qp_dev->qp_table[n];
533 for (; (q = rcu_dereference_protected(*qpp,
534 lockdep_is_held(&rdi->qp_dev->qpt_lock))) != NULL;
537 RCU_INIT_POINTER(*qpp,
538 rcu_dereference_protected(qp->next,
539 lockdep_is_held(&rdi->qp_dev->qpt_lock)));
541 trace_rvt_qpremove(qp, n);
547 spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
555 * rvt_init_qp - initialize the QP state to the reset state
556 * @qp: the QP to init or reinit
559 * This function is called from both rvt_create_qp() and
560 * rvt_reset_qp(). The difference is that the reset path holds
561 * the necessary locks to protect against concurrent access.
564 static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
565 enum ib_qp_type type)
569 qp->qp_access_flags = 0;
570 qp->s_flags &= RVT_S_SIGNAL_REQ_WR;
576 qp->s_sending_psn = 0;
577 qp->s_sending_hpsn = 0;
581 if (type == IB_QPT_RC) {
582 qp->s_state = IB_OPCODE_RC_SEND_LAST;
583 qp->r_state = IB_OPCODE_RC_SEND_LAST;
585 qp->s_state = IB_OPCODE_UC_SEND_LAST;
586 qp->r_state = IB_OPCODE_UC_SEND_LAST;
588 qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
599 qp->s_mig_state = IB_MIG_MIGRATED;
600 qp->r_head_ack_queue = 0;
601 qp->s_tail_ack_queue = 0;
602 qp->s_num_rd_atomic = 0;
604 qp->r_rq.wq->head = 0;
605 qp->r_rq.wq->tail = 0;
607 qp->r_sge.num_sge = 0;
608 atomic_set(&qp->s_reserved_used, 0);
612 * rvt_reset_qp - initialize the QP state to the reset state
613 * @qp: the QP to reset
616 * r_lock, s_hlock, and s_lock are required to be held by the caller
618 static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
619 enum ib_qp_type type)
620 __must_hold(&qp->s_lock)
621 __must_hold(&qp->s_hlock)
622 __must_hold(&qp->r_lock)
624 lockdep_assert_held(&qp->r_lock);
625 lockdep_assert_held(&qp->s_hlock);
626 lockdep_assert_held(&qp->s_lock);
627 if (qp->state != IB_QPS_RESET) {
628 qp->state = IB_QPS_RESET;
630 /* Let drivers flush their waitlist */
631 rdi->driver_f.flush_qp_waiters(qp);
632 rvt_stop_rc_timers(qp);
633 qp->s_flags &= ~(RVT_S_TIMER | RVT_S_ANY_WAIT);
634 spin_unlock(&qp->s_lock);
635 spin_unlock(&qp->s_hlock);
636 spin_unlock_irq(&qp->r_lock);
638 /* Stop the send queue and the retry timer */
639 rdi->driver_f.stop_send_queue(qp);
640 rvt_del_timers_sync(qp);
641 /* Wait for things to stop */
642 rdi->driver_f.quiesce_qp(qp);
644 /* take the qp out of the hash and wait for it to be unused */
645 rvt_remove_qp(rdi, qp);
646 wait_event(qp->wait, !atomic_read(&qp->refcount));
648 /* grab the lock b/c it was locked at call time */
649 spin_lock_irq(&qp->r_lock);
650 spin_lock(&qp->s_hlock);
651 spin_lock(&qp->s_lock);
653 rvt_clear_mr_refs(qp, 1);
655 * Let the driver do any tear down or re-init it needs to for
656 * a qp that has been reset
658 rdi->driver_f.notify_qp_reset(qp);
660 rvt_init_qp(rdi, qp, type);
661 lockdep_assert_held(&qp->r_lock);
662 lockdep_assert_held(&qp->s_hlock);
663 lockdep_assert_held(&qp->s_lock);
667 * rvt_create_qp - create a queue pair for a device
668 * @ibpd: the protection domain whose device we create the queue pair for
669 * @init_attr: the attributes of the queue pair
670 * @udata: user data for libibverbs.so
672 * Queue pair creation is mostly an rvt issue. However, drivers have their own
673 * unique idea of what queue pair numbers mean. For instance there is a reserved
676 * Return: the queue pair on success, otherwise returns an errno.
678 * Called by the ib_create_qp() core verbs function.
680 struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
681 struct ib_qp_init_attr *init_attr,
682 struct ib_udata *udata)
686 struct rvt_swqe *swq = NULL;
689 struct ib_qp *ret = ERR_PTR(-ENOMEM);
690 struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
696 return ERR_PTR(-EINVAL);
698 if (init_attr->cap.max_send_sge > rdi->dparms.props.max_sge ||
699 init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr ||
700 init_attr->create_flags & ~(IB_QP_CREATE_USE_GFP_NOIO))
701 return ERR_PTR(-EINVAL);
703 /* GFP_NOIO is applicable to RC QP's only */
705 if (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO &&
706 init_attr->qp_type != IB_QPT_RC)
707 return ERR_PTR(-EINVAL);
709 gfp = init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO ?
710 GFP_NOIO : GFP_KERNEL;
712 /* Check receive queue parameters if no SRQ is specified. */
713 if (!init_attr->srq) {
714 if (init_attr->cap.max_recv_sge > rdi->dparms.props.max_sge ||
715 init_attr->cap.max_recv_wr > rdi->dparms.props.max_qp_wr)
716 return ERR_PTR(-EINVAL);
718 if (init_attr->cap.max_send_sge +
719 init_attr->cap.max_send_wr +
720 init_attr->cap.max_recv_sge +
721 init_attr->cap.max_recv_wr == 0)
722 return ERR_PTR(-EINVAL);
725 init_attr->cap.max_send_wr + 1 +
726 rdi->dparms.reserved_operations;
727 switch (init_attr->qp_type) {
730 if (init_attr->port_num == 0 ||
731 init_attr->port_num > ibpd->device->phys_port_cnt)
732 return ERR_PTR(-EINVAL);
736 sz = sizeof(struct rvt_sge) *
737 init_attr->cap.max_send_sge +
738 sizeof(struct rvt_swqe);
742 gfp | __GFP_ZERO, PAGE_KERNEL);
748 return ERR_PTR(-ENOMEM);
752 if (init_attr->srq) {
753 struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq);
755 if (srq->rq.max_sge > 1)
756 sg_list_sz = sizeof(*qp->r_sg_list) *
757 (srq->rq.max_sge - 1);
758 } else if (init_attr->cap.max_recv_sge > 1)
759 sg_list_sz = sizeof(*qp->r_sg_list) *
760 (init_attr->cap.max_recv_sge - 1);
761 qp = kzalloc_node(sz + sg_list_sz, gfp, rdi->dparms.node);
765 RCU_INIT_POINTER(qp->next, NULL);
766 if (init_attr->qp_type == IB_QPT_RC) {
769 sizeof(*qp->s_ack_queue) *
773 if (!qp->s_ack_queue)
776 /* initialize timers needed for rc qp */
777 setup_timer(&qp->s_timer, rvt_rc_timeout, (unsigned long)qp);
778 hrtimer_init(&qp->s_rnr_timer, CLOCK_MONOTONIC,
780 qp->s_rnr_timer.function = rvt_rc_rnr_retry;
783 * Driver needs to set up its private QP structure and do any
784 * initialization that is needed.
786 priv = rdi->driver_f.qp_priv_alloc(rdi, qp, gfp);
792 qp->timeout_jiffies =
793 usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
795 if (init_attr->srq) {
798 qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
799 qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
800 sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
801 sizeof(struct rvt_rwqe);
803 qp->r_rq.wq = vmalloc_user(
804 sizeof(struct rvt_rwq) +
806 else if (gfp == GFP_NOIO)
807 qp->r_rq.wq = __vmalloc(
808 sizeof(struct rvt_rwq) +
810 gfp | __GFP_ZERO, PAGE_KERNEL);
812 qp->r_rq.wq = vzalloc_node(
813 sizeof(struct rvt_rwq) +
817 goto bail_driver_priv;
821 * ib_create_qp() will initialize qp->ibqp
822 * except for qp->ibqp.qp_num.
824 spin_lock_init(&qp->r_lock);
825 spin_lock_init(&qp->s_hlock);
826 spin_lock_init(&qp->s_lock);
827 spin_lock_init(&qp->r_rq.lock);
828 atomic_set(&qp->refcount, 0);
829 atomic_set(&qp->local_ops_pending, 0);
830 init_waitqueue_head(&qp->wait);
831 init_timer(&qp->s_timer);
832 qp->s_timer.data = (unsigned long)qp;
833 INIT_LIST_HEAD(&qp->rspwait);
834 qp->state = IB_QPS_RESET;
837 qp->s_avail = init_attr->cap.max_send_wr;
838 qp->s_max_sge = init_attr->cap.max_send_sge;
839 if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
840 qp->s_flags = RVT_S_SIGNAL_REQ_WR;
842 err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
844 init_attr->port_num, gfp);
849 qp->ibqp.qp_num = err;
850 qp->port_num = init_attr->port_num;
851 rvt_init_qp(rdi, qp, init_attr->qp_type);
855 /* Don't support raw QPs */
856 return ERR_PTR(-EINVAL);
859 init_attr->cap.max_inline_data = 0;
862 * Return the address of the RWQ as the offset to mmap.
863 * See rvt_mmap() for details.
865 if (udata && udata->outlen >= sizeof(__u64)) {
869 err = ib_copy_to_udata(udata, &offset,
876 u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;
878 qp->ip = rvt_create_mmap_info(rdi, s,
879 ibpd->uobject->context,
882 ret = ERR_PTR(-ENOMEM);
886 err = ib_copy_to_udata(udata, &qp->ip->offset,
887 sizeof(qp->ip->offset));
893 qp->pid = current->pid;
896 spin_lock(&rdi->n_qps_lock);
897 if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) {
898 spin_unlock(&rdi->n_qps_lock);
899 ret = ERR_PTR(-ENOMEM);
903 rdi->n_qps_allocated++;
905 * Maintain a busy_jiffies variable that will be added to the timeout
906 * period in mod_retry_timer and add_retry_timer. This busy jiffies
907 * is scaled by the number of rc qps created for the device to reduce
908 * the number of timeouts occurring when there is a large number of
909 * qps. busy_jiffies is incremented every rc qp scaling interval.
910 * The scaling interval is selected based on extensive performance
911 * evaluation of targeted workloads.
913 if (init_attr->qp_type == IB_QPT_RC) {
915 rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
917 spin_unlock(&rdi->n_qps_lock);
920 spin_lock_irq(&rdi->pending_lock);
921 list_add(&qp->ip->pending_mmaps, &rdi->pending_mmaps);
922 spin_unlock_irq(&rdi->pending_lock);
928 * We have our QP and it's good; now keep track of what types of opcodes
929 * can be processed on this QP. We do this by keeping track of what the
930 * 3 high order bits of the opcode are.
932 switch (init_attr->qp_type) {
936 qp->allowed_ops = IB_OPCODE_UD;
939 qp->allowed_ops = IB_OPCODE_RC;
942 qp->allowed_ops = IB_OPCODE_UC;
945 ret = ERR_PTR(-EINVAL);
953 kref_put(&qp->ip->ref, rvt_release_mmap_info);
956 free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
963 rdi->driver_f.qp_priv_free(rdi, qp);
966 kfree(qp->s_ack_queue);
976 * rvt_error_qp - put a QP into the error state
977 * @qp: the QP to put into the error state
978 * @err: the receive completion error to signal if a RWQE is active
980 * Flushes both send and receive work queues.
982 * Return: true if last WQE event should be generated.
983 * The QP r_lock and s_lock should be held and interrupts disabled.
984 * If we are already in error state, just return.
986 int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
990 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
992 lockdep_assert_held(&qp->r_lock);
993 lockdep_assert_held(&qp->s_lock);
994 if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
997 qp->state = IB_QPS_ERR;
999 if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
1000 qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
1001 del_timer(&qp->s_timer);
1004 if (qp->s_flags & RVT_S_ANY_WAIT_SEND)
1005 qp->s_flags &= ~RVT_S_ANY_WAIT_SEND;
1007 rdi->driver_f.notify_error_qp(qp);
1009 /* Schedule the sending tasklet to drain the send work queue. */
1010 if (ACCESS_ONCE(qp->s_last) != qp->s_head)
1011 rdi->driver_f.schedule_send(qp);
1013 rvt_clear_mr_refs(qp, 0);
1015 memset(&wc, 0, sizeof(wc));
1017 wc.opcode = IB_WC_RECV;
1019 if (test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) {
1020 wc.wr_id = qp->r_wr_id;
1022 rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
1024 wc.status = IB_WC_WR_FLUSH_ERR;
1031 spin_lock(&qp->r_rq.lock);
1033 /* sanity check pointers before trusting them */
1036 if (head >= qp->r_rq.size)
1039 if (tail >= qp->r_rq.size)
1041 while (tail != head) {
1042 wc.wr_id = rvt_get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
1043 if (++tail >= qp->r_rq.size)
1045 rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
1049 spin_unlock(&qp->r_rq.lock);
1050 } else if (qp->ibqp.event_handler) {
1057 EXPORT_SYMBOL(rvt_error_qp);
1060 * Put the QP into the hash table.
1061 * The hash table holds a reference to the QP.
1063 static void rvt_insert_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
1065 struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
1066 unsigned long flags;
1069 spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
1071 if (qp->ibqp.qp_num <= 1) {
1072 rcu_assign_pointer(rvp->qp[qp->ibqp.qp_num], qp);
1074 u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
1076 qp->next = rdi->qp_dev->qp_table[n];
1077 rcu_assign_pointer(rdi->qp_dev->qp_table[n], qp);
1078 trace_rvt_qpinsert(qp, n);
1081 spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
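/*
 * Editor's sketch of the matching lookup (the real helper lives elsewhere
 * in rdmavt; names here follow the insert/remove code above):
 *
 *	n = hash_32(qp_num, rdi->qp_dev->qp_table_bits);
 *	for (qp = rcu_dereference(rdi->qp_dev->qp_table[n]); qp;
 *	     qp = rcu_dereference(qp->next))
 *		if (qp->ibqp.qp_num == qp_num)
 *			break;
 *
 * QPs 0 and 1 are instead kept in the per-port rvp->qp[] slots.
 */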
1085 * rvt_modify_qp - modify the attributes of a queue pair
1086 * @ibqp: the queue pair whose attributes we're modifying
1087 * @attr: the new attributes
1088 * @attr_mask: the mask of attributes to modify
1089 * @udata: user data for libibverbs.so
1091 * Return: 0 on success, otherwise returns an errno.
1093 int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
1094 int attr_mask, struct ib_udata *udata)
1096 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1097 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1098 enum ib_qp_state cur_state, new_state;
1102 int pmtu = 0; /* for gcc warning only */
1103 enum rdma_link_layer link;
1105 link = rdma_port_get_link_layer(ibqp->device, qp->port_num);
1107 spin_lock_irq(&qp->r_lock);
1108 spin_lock(&qp->s_hlock);
1109 spin_lock(&qp->s_lock);
1111 cur_state = attr_mask & IB_QP_CUR_STATE ?
1112 attr->cur_qp_state : qp->state;
1113 new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
1115 if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
1119 if (rdi->driver_f.check_modify_qp &&
1120 rdi->driver_f.check_modify_qp(qp, attr, attr_mask, udata))
1123 if (attr_mask & IB_QP_AV) {
1124 if (attr->ah_attr.dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
1126 if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr))
1130 if (attr_mask & IB_QP_ALT_PATH) {
1131 if (attr->alt_ah_attr.dlid >=
1132 be16_to_cpu(IB_MULTICAST_LID_BASE))
1134 if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
1136 if (attr->alt_pkey_index >= rvt_get_npkeys(rdi))
1140 if (attr_mask & IB_QP_PKEY_INDEX)
1141 if (attr->pkey_index >= rvt_get_npkeys(rdi))
1144 if (attr_mask & IB_QP_MIN_RNR_TIMER)
1145 if (attr->min_rnr_timer > 31)
1148 if (attr_mask & IB_QP_PORT)
1149 if (qp->ibqp.qp_type == IB_QPT_SMI ||
1150 qp->ibqp.qp_type == IB_QPT_GSI ||
1151 attr->port_num == 0 ||
1152 attr->port_num > ibqp->device->phys_port_cnt)
1155 if (attr_mask & IB_QP_DEST_QPN)
1156 if (attr->dest_qp_num > RVT_QPN_MASK)
1159 if (attr_mask & IB_QP_RETRY_CNT)
1160 if (attr->retry_cnt > 7)
1163 if (attr_mask & IB_QP_RNR_RETRY)
1164 if (attr->rnr_retry > 7)
1168 * Don't allow invalid path_mtu values. It is OK to set it greater
1169 * than the active mtu (or even the max_cap, if we have tuned
1170 * that to a small mtu). We'll set qp->path_mtu
1171 * to the lesser of the requested attribute mtu and the active mtu,
1172 * for packetizing messages.
1173 * Note that the QP port has to be set in INIT and MTU in RTR.
1175 if (attr_mask & IB_QP_PATH_MTU) {
1176 pmtu = rdi->driver_f.get_pmtu_from_attr(rdi, qp, attr);
1181 if (attr_mask & IB_QP_PATH_MIG_STATE) {
1182 if (attr->path_mig_state == IB_MIG_REARM) {
1183 if (qp->s_mig_state == IB_MIG_ARMED)
1185 if (new_state != IB_QPS_RTS)
1187 } else if (attr->path_mig_state == IB_MIG_MIGRATED) {
1188 if (qp->s_mig_state == IB_MIG_REARM)
1190 if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
1192 if (qp->s_mig_state == IB_MIG_ARMED)
1199 if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1200 if (attr->max_dest_rd_atomic > rdi->dparms.max_rdma_atomic)
1203 switch (new_state) {
1205 if (qp->state != IB_QPS_RESET)
1206 rvt_reset_qp(rdi, qp, ibqp->qp_type);
1210 /* Allow event to re-trigger if QP set to RTR more than once */
1211 qp->r_flags &= ~RVT_R_COMM_EST;
1212 qp->state = new_state;
1216 qp->s_draining = qp->s_last != qp->s_cur;
1217 qp->state = new_state;
1221 if (qp->ibqp.qp_type == IB_QPT_RC)
1223 qp->state = new_state;
1227 lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1231 qp->state = new_state;
1235 if (attr_mask & IB_QP_PKEY_INDEX)
1236 qp->s_pkey_index = attr->pkey_index;
1238 if (attr_mask & IB_QP_PORT)
1239 qp->port_num = attr->port_num;
1241 if (attr_mask & IB_QP_DEST_QPN)
1242 qp->remote_qpn = attr->dest_qp_num;
1244 if (attr_mask & IB_QP_SQ_PSN) {
1245 qp->s_next_psn = attr->sq_psn & rdi->dparms.psn_modify_mask;
1246 qp->s_psn = qp->s_next_psn;
1247 qp->s_sending_psn = qp->s_next_psn;
1248 qp->s_last_psn = qp->s_next_psn - 1;
1249 qp->s_sending_hpsn = qp->s_last_psn;
1252 if (attr_mask & IB_QP_RQ_PSN)
1253 qp->r_psn = attr->rq_psn & rdi->dparms.psn_modify_mask;
1255 if (attr_mask & IB_QP_ACCESS_FLAGS)
1256 qp->qp_access_flags = attr->qp_access_flags;
1258 if (attr_mask & IB_QP_AV) {
1259 qp->remote_ah_attr = attr->ah_attr;
1260 qp->s_srate = attr->ah_attr.static_rate;
1261 qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
1264 if (attr_mask & IB_QP_ALT_PATH) {
1265 qp->alt_ah_attr = attr->alt_ah_attr;
1266 qp->s_alt_pkey_index = attr->alt_pkey_index;
1269 if (attr_mask & IB_QP_PATH_MIG_STATE) {
1270 qp->s_mig_state = attr->path_mig_state;
1272 qp->remote_ah_attr = qp->alt_ah_attr;
1273 qp->port_num = qp->alt_ah_attr.port_num;
1274 qp->s_pkey_index = qp->s_alt_pkey_index;
1278 if (attr_mask & IB_QP_PATH_MTU) {
1279 qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu);
1280 qp->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu);
1281 qp->log_pmtu = ilog2(qp->pmtu);
1284 if (attr_mask & IB_QP_RETRY_CNT) {
1285 qp->s_retry_cnt = attr->retry_cnt;
1286 qp->s_retry = attr->retry_cnt;
1289 if (attr_mask & IB_QP_RNR_RETRY) {
1290 qp->s_rnr_retry_cnt = attr->rnr_retry;
1291 qp->s_rnr_retry = attr->rnr_retry;
1294 if (attr_mask & IB_QP_MIN_RNR_TIMER)
1295 qp->r_min_rnr_timer = attr->min_rnr_timer;
1297 if (attr_mask & IB_QP_TIMEOUT) {
1298 qp->timeout = attr->timeout;
1299 qp->timeout_jiffies =
1300 usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
1304 if (attr_mask & IB_QP_QKEY)
1305 qp->qkey = attr->qkey;
1307 if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1308 qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
1310 if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
1311 qp->s_max_rd_atomic = attr->max_rd_atomic;
1313 if (rdi->driver_f.modify_qp)
1314 rdi->driver_f.modify_qp(qp, attr, attr_mask, udata);
1316 spin_unlock(&qp->s_lock);
1317 spin_unlock(&qp->s_hlock);
1318 spin_unlock_irq(&qp->r_lock);
1320 if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
1321 rvt_insert_qp(rdi, qp);
1324 ev.device = qp->ibqp.device;
1325 ev.element.qp = &qp->ibqp;
1326 ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
1327 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1330 ev.device = qp->ibqp.device;
1331 ev.element.qp = &qp->ibqp;
1332 ev.event = IB_EVENT_PATH_MIG;
1333 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1338 spin_unlock(&qp->s_lock);
1339 spin_unlock(&qp->s_hlock);
1340 spin_unlock_irq(&qp->r_lock);
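/*
 * Editor's illustrative sketch (hypothetical consumer code): the usual
 * RESET -> INIT -> RTR -> RTS sequence is driven through the core verbs,
 * for example
 *
 *	attr.qp_state = IB_QPS_INIT;
 *	ret = ib_modify_qp(ibqp, &attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
 *				IB_QP_PORT | IB_QP_ACCESS_FLAGS);
 *
 * and rvt_modify_qp() above inserts the QP into the hash table on the
 * RESET -> INIT transition.
 */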
1344 /** rvt_free_qpn - Free a qpn from the bit map
1346 * @qpn: queue pair number to free
1348 static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
1350 struct rvt_qpn_map *map;
1352 map = qpt->map + qpn / RVT_BITS_PER_PAGE;
1354 clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
1358 * rvt_destroy_qp - destroy a queue pair
1359 * @ibqp: the queue pair to destroy
1361 * Note that this can be called while the QP is actively sending or
1364 * Return: 0 on success.
1366 int rvt_destroy_qp(struct ib_qp *ibqp)
1368 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1369 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1371 spin_lock_irq(&qp->r_lock);
1372 spin_lock(&qp->s_hlock);
1373 spin_lock(&qp->s_lock);
1374 rvt_reset_qp(rdi, qp, ibqp->qp_type);
1375 spin_unlock(&qp->s_lock);
1376 spin_unlock(&qp->s_hlock);
1377 spin_unlock_irq(&qp->r_lock);
1379 /* qpn is now available for use again */
1380 rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
1382 spin_lock(&rdi->n_qps_lock);
1383 rdi->n_qps_allocated--;
1384 if (qp->ibqp.qp_type == IB_QPT_RC) {
1386 rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
1388 spin_unlock(&rdi->n_qps_lock);
1391 kref_put(&qp->ip->ref, rvt_release_mmap_info);
1395 rdi->driver_f.qp_priv_free(rdi, qp);
1396 kfree(qp->s_ack_queue);
1402 * rvt_query_qp - query an ibqp
1403 * @ibqp: IB qp to query
1404 * @attr: attr struct to fill in
1405 * @attr_mask: attr mask ignored
1406 * @init_attr: struct to fill in
1410 int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
1411 int attr_mask, struct ib_qp_init_attr *init_attr)
1413 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1414 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1416 attr->qp_state = qp->state;
1417 attr->cur_qp_state = attr->qp_state;
1418 attr->path_mtu = qp->path_mtu;
1419 attr->path_mig_state = qp->s_mig_state;
1420 attr->qkey = qp->qkey;
1421 attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask;
1422 attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask;
1423 attr->dest_qp_num = qp->remote_qpn;
1424 attr->qp_access_flags = qp->qp_access_flags;
1425 attr->cap.max_send_wr = qp->s_size - 1 -
1426 rdi->dparms.reserved_operations;
1427 attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
1428 attr->cap.max_send_sge = qp->s_max_sge;
1429 attr->cap.max_recv_sge = qp->r_rq.max_sge;
1430 attr->cap.max_inline_data = 0;
1431 attr->ah_attr = qp->remote_ah_attr;
1432 attr->alt_ah_attr = qp->alt_ah_attr;
1433 attr->pkey_index = qp->s_pkey_index;
1434 attr->alt_pkey_index = qp->s_alt_pkey_index;
1435 attr->en_sqd_async_notify = 0;
1436 attr->sq_draining = qp->s_draining;
1437 attr->max_rd_atomic = qp->s_max_rd_atomic;
1438 attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
1439 attr->min_rnr_timer = qp->r_min_rnr_timer;
1440 attr->port_num = qp->port_num;
1441 attr->timeout = qp->timeout;
1442 attr->retry_cnt = qp->s_retry_cnt;
1443 attr->rnr_retry = qp->s_rnr_retry_cnt;
1444 attr->alt_port_num = qp->alt_ah_attr.port_num;
1445 attr->alt_timeout = qp->alt_timeout;
1447 init_attr->event_handler = qp->ibqp.event_handler;
1448 init_attr->qp_context = qp->ibqp.qp_context;
1449 init_attr->send_cq = qp->ibqp.send_cq;
1450 init_attr->recv_cq = qp->ibqp.recv_cq;
1451 init_attr->srq = qp->ibqp.srq;
1452 init_attr->cap = attr->cap;
1453 if (qp->s_flags & RVT_S_SIGNAL_REQ_WR)
1454 init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
1456 init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
1457 init_attr->qp_type = qp->ibqp.qp_type;
1458 init_attr->port_num = qp->port_num;
1463 * rvt_post_recv - post a receive on a QP
1464 * @ibqp: the QP to post the receive on
1465 * @wr: the WR to post
1466 * @bad_wr: the first bad WR is put here
1468 * This may be called from interrupt context.
1470 * Return: 0 on success otherwise errno
1472 int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
1473 struct ib_recv_wr **bad_wr)
1475 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1476 struct rvt_rwq *wq = qp->r_rq.wq;
1477 unsigned long flags;
1478 int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
1481 /* Check that state is OK to post receive. */
1482 if (!(ib_rvt_state_ops[qp->state] & RVT_POST_RECV_OK) || !wq) {
1487 for (; wr; wr = wr->next) {
1488 struct rvt_rwqe *wqe;
1492 if ((unsigned)wr->num_sge > qp->r_rq.max_sge) {
1497 spin_lock_irqsave(&qp->r_rq.lock, flags);
1498 next = wq->head + 1;
1499 if (next >= qp->r_rq.size)
1501 if (next == wq->tail) {
1502 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1506 if (unlikely(qp_err_flush)) {
1509 memset(&wc, 0, sizeof(wc));
1511 wc.opcode = IB_WC_RECV;
1512 wc.wr_id = wr->wr_id;
1513 wc.status = IB_WC_WR_FLUSH_ERR;
1514 rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
1516 wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
1517 wqe->wr_id = wr->wr_id;
1518 wqe->num_sge = wr->num_sge;
1519 for (i = 0; i < wr->num_sge; i++)
1520 wqe->sg_list[i] = wr->sg_list[i];
1522 * Make sure queue entry is written
1523 * before the head index.
1528 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1534 * rvt_qp_valid_operation - validate post send wr request
1536 * @post_parms - the post send table for the driver
1537 * @wr - the work request
1539 * The routine validates the operation based on the
1540 * validation table and returns the length of the operation
1541 * which can extend beyond the ib_send_wr. Operation
1542 * dependent flags key atomic operation validation.
1544 * There is an exception for UD QPs that validates the pd and
1545 * overrides the length to include the additional UD specific
1548 * Returns a negative error or the length of the work request
1549 * for building the swqe.
1551 static inline int rvt_qp_valid_operation(
1553 const struct rvt_operation_params *post_parms,
1554 struct ib_send_wr *wr)
1558 if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length)
1560 if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type)))
1562 if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) &&
1563 ibpd_to_rvtpd(qp->ibqp.pd)->user)
1565 if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE &&
1566 (wr->num_sge == 0 ||
1567 wr->sg_list[0].length < sizeof(u64) ||
1568 wr->sg_list[0].addr & (sizeof(u64) - 1)))
1570 if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC &&
1571 !qp->s_max_rd_atomic)
1573 len = post_parms[wr->opcode].length;
1575 if (qp->ibqp.qp_type != IB_QPT_UC &&
1576 qp->ibqp.qp_type != IB_QPT_RC) {
1577 if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
1579 len = sizeof(struct ib_ud_wr);
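/*
 * Editor's illustrative sketch (assumed driver-side table entry, not from
 * this file): the checks above are keyed by the driver's post_parms table,
 * whose fields would be filled in roughly as
 *
 *	[IB_WR_ATOMIC_CMP_AND_SWP] = {
 *		.length      = sizeof(struct ib_atomic_wr),
 *		.qpt_support = BIT(IB_QPT_RC),
 *		.flags       = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
 *	},
 *
 * Field names are taken from the checks in rvt_qp_valid_operation().
 */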
1585 * rvt_qp_is_avail - determine queue capacity
1587 * @rdi - the rdmavt device
1588 * @reserved_op - is reserved operation
1590 * This assumes the s_hlock is held but the s_last
1591 * qp variable is uncontrolled.
1593 * For non-reserved operations, the qp->s_avail
1596 * The return value is zero or -ENOMEM.
1598 static inline int rvt_qp_is_avail(
1600 struct rvt_dev_info *rdi,
1607 /* see rvt_qp_wqe_unreserve() */
1608 smp_mb__before_atomic();
1609 reserved_used = atomic_read(&qp->s_reserved_used);
1610 if (unlikely(reserved_op)) {
1611 /* see rvt_qp_wqe_unreserve() */
1612 smp_mb__before_atomic();
1613 if (reserved_used >= rdi->dparms.reserved_operations)
1617 /* non-reserved operations */
1618 if (likely(qp->s_avail))
1620 smp_read_barrier_depends(); /* see rc.c */
1621 slast = ACCESS_ONCE(qp->s_last);
1622 if (qp->s_head >= slast)
1623 avail = qp->s_size - (qp->s_head - slast);
1625 avail = slast - qp->s_head;
1627 /* see rvt_qp_wqe_unreserve() */
1628 smp_mb__before_atomic();
1629 reserved_used = atomic_read(&qp->s_reserved_used);
1631 (rdi->dparms.reserved_operations - reserved_used);
1632 /* ensure we don't assign a negative s_avail */
1633 if ((s32)avail <= 0)
1635 qp->s_avail = avail;
1636 if (WARN_ON(qp->s_avail >
1637 (qp->s_size - 1 - rdi->dparms.reserved_operations)))
1639 "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
1640 qp->ibqp.qp_num, qp->s_size, qp->s_avail,
1641 qp->s_head, qp->s_tail, qp->s_cur,
1642 qp->s_acked, qp->s_last);
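/*
 * Editor's worked example of the availability computation above: with
 * s_size = 16, s_head = 3 and s_last = 10, head < last so
 * avail = 10 - 3 = 7; with s_head = 12 and s_last = 10,
 * avail = 16 - (12 - 10) = 14. The unused part of the
 * reserved_operations budget is then subtracted before s_avail is
 * updated.
 */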
1647 * rvt_post_one_wr - post one RC, UC, or UD send work request
1648 * @qp: the QP to post on
1649 * @wr: the work request to send
1651 static int rvt_post_one_wr(struct rvt_qp *qp,
1652 struct ib_send_wr *wr,
1655 struct rvt_swqe *wqe;
1660 struct rvt_lkey_table *rkt;
1662 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
1667 int local_ops_delayed = 0;
1669 BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE));
1671 /* IB spec says that num_sge == 0 is OK. */
1672 if (unlikely(wr->num_sge > qp->s_max_sge))
1675 ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr);
1681 * Local operations include fast register and local invalidate.
1682 * Fast register needs to be processed immediately because the
1683 * registered lkey may be used by following work requests and the
1684 * lkey needs to be valid at the time those requests are posted.
1685 * Local invalidate can be processed immediately if fencing is
1686 * not required and no previous local invalidate ops are pending.
1687 * Signaled local operations that have been processed immediately
1688 * need to have requests with "completion only" flags set posted
1689 * to the send queue in order to generate completions.
1691 if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) {
1692 switch (wr->opcode) {
1694 ret = rvt_fast_reg_mr(qp,
1697 reg_wr(wr)->access);
1698 if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
1701 case IB_WR_LOCAL_INV:
1702 if ((wr->send_flags & IB_SEND_FENCE) ||
1703 atomic_read(&qp->local_ops_pending)) {
1704 local_ops_delayed = 1;
1706 ret = rvt_invalidate_rkey(
1707 qp, wr->ex.invalidate_rkey);
1708 if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
1717 reserved_op = rdi->post_parms[wr->opcode].flags &
1718 RVT_OPERATION_USE_RESERVE;
1719 /* check for avail */
1720 ret = rvt_qp_is_avail(qp, rdi, reserved_op);
1723 next = qp->s_head + 1;
1724 if (next >= qp->s_size)
1727 rkt = &rdi->lkey_table;
1728 pd = ibpd_to_rvtpd(qp->ibqp.pd);
1729 wqe = rvt_get_swqe_ptr(qp, qp->s_head);
1731 /* cplen has length from above */
1732 memcpy(&wqe->wr, wr, cplen);
1737 acc = wr->opcode >= IB_WR_RDMA_READ ?
1738 IB_ACCESS_LOCAL_WRITE : 0;
1739 for (i = 0; i < wr->num_sge; i++) {
1740 u32 length = wr->sg_list[i].length;
1745 ok = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j],
1746 &wr->sg_list[i], acc);
1749 goto bail_inval_free;
1751 wqe->length += length;
1754 wqe->wr.num_sge = j;
1757 /* general part of wqe valid - allow for driver checks */
1758 if (rdi->driver_f.check_send_wqe) {
1759 ret = rdi->driver_f.check_send_wqe(qp, wqe);
1761 goto bail_inval_free;
1766 log_pmtu = qp->log_pmtu;
1767 if (qp->ibqp.qp_type != IB_QPT_UC &&
1768 qp->ibqp.qp_type != IB_QPT_RC) {
1769 struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);
1771 log_pmtu = ah->log_pmtu;
1772 atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
1775 if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
1776 if (local_ops_delayed)
1777 atomic_inc(&qp->local_ops_pending);
1779 wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY;
1784 wqe->ssn = qp->s_ssn++;
1785 wqe->psn = qp->s_next_psn;
1786 wqe->lpsn = wqe->psn +
1788 ((wqe->length - 1) >> log_pmtu) :
1790 qp->s_next_psn = wqe->lpsn + 1;
1792 trace_rvt_post_one_wr(qp, wqe);
1793 if (unlikely(reserved_op))
1794 rvt_qp_wqe_reserve(qp, wqe);
1797 smp_wmb(); /* see request builders */
1803 /* release mr holds */
1805 struct rvt_sge *sge = &wqe->sg_list[--j];
1807 rvt_put_mr(sge->mr);
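/*
 * Editor's note on the PSN assignment in rvt_post_one_wr() above: a
 * segmented request consumes one PSN per MTU-sized packet. For example,
 * a 9000 byte send with a 4096 byte pmtu (log_pmtu = 12) gives
 * (9000 - 1) >> 12 = 2, so lpsn = psn + 2 (three packets) and the next
 * request starts at s_next_psn = lpsn + 1.
 */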
1813 * rvt_post_send - post a send on a QP
1814 * @ibqp: the QP to post the send on
1815 * @wr: the list of work requests to post
1816 * @bad_wr: the first bad WR is put here
1818 * This may be called from interrupt context.
1820 * Return: 0 on success else errno
1822 int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
1823 struct ib_send_wr **bad_wr)
1825 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1826 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1827 unsigned long flags = 0;
1832 spin_lock_irqsave(&qp->s_hlock, flags);
1835 * Ensure QP state is such that we can send. If not bail out early,
1836 * there is no need to do this every time we post a send.
1838 if (unlikely(!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))) {
1839 spin_unlock_irqrestore(&qp->s_hlock, flags);
1844 * If the send queue is empty and we only have a single WR, then just go
1845 * ahead and kick the send engine into gear. Otherwise we will always
1846 * just schedule the send to happen later.
1848 call_send = qp->s_head == ACCESS_ONCE(qp->s_last) && !wr->next;
1850 for (; wr; wr = wr->next) {
1851 err = rvt_post_one_wr(qp, wr, &call_send);
1852 if (unlikely(err)) {
1859 spin_unlock_irqrestore(&qp->s_hlock, flags);
1862 rdi->driver_f.do_send(qp);
1864 rdi->driver_f.schedule_send_no_lock(qp);
1870 * rvt_post_srq_recv - post a receive on a shared receive queue
1871 * @ibsrq: the SRQ to post the receive on
1872 * @wr: the list of work requests to post
1873 * @bad_wr: A pointer to the first WR to cause a problem is put here
1875 * This may be called from interrupt context.
1877 * Return: 0 on success else errno
1879 int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
1880 struct ib_recv_wr **bad_wr)
1882 struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
1884 unsigned long flags;
1886 for (; wr; wr = wr->next) {
1887 struct rvt_rwqe *wqe;
1891 if ((unsigned)wr->num_sge > srq->rq.max_sge) {
1896 spin_lock_irqsave(&srq->rq.lock, flags);
1898 next = wq->head + 1;
1899 if (next >= srq->rq.size)
1901 if (next == wq->tail) {
1902 spin_unlock_irqrestore(&srq->rq.lock, flags);
1907 wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
1908 wqe->wr_id = wr->wr_id;
1909 wqe->num_sge = wr->num_sge;
1910 for (i = 0; i < wr->num_sge; i++)
1911 wqe->sg_list[i] = wr->sg_list[i];
1912 /* Make sure queue entry is written before the head index. */
1915 spin_unlock_irqrestore(&srq->rq.lock, flags);
1921 * rvt_comm_est - handle trap with QP established
1924 void rvt_comm_est(struct rvt_qp *qp)
1926 qp->r_flags |= RVT_R_COMM_EST;
1927 if (qp->ibqp.event_handler) {
1930 ev.device = qp->ibqp.device;
1931 ev.element.qp = &qp->ibqp;
1932 ev.event = IB_EVENT_COMM_EST;
1933 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1936 EXPORT_SYMBOL(rvt_comm_est);
1938 void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
1940 unsigned long flags;
1943 spin_lock_irqsave(&qp->s_lock, flags);
1944 lastwqe = rvt_error_qp(qp, err);
1945 spin_unlock_irqrestore(&qp->s_lock, flags);
1950 ev.device = qp->ibqp.device;
1951 ev.element.qp = &qp->ibqp;
1952 ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
1953 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1956 EXPORT_SYMBOL(rvt_rc_error);
1959 * rvt_rnr_tbl_to_usec - convert an index into ib_rvt_rnr_table to usec
1960 * @index - the index
1961 * return usec from an index into ib_rvt_rnr_table
1963 unsigned long rvt_rnr_tbl_to_usec(u32 index)
1965 return ib_rvt_rnr_table[(index & IB_AETH_CREDIT_MASK)];
1967 EXPORT_SYMBOL(rvt_rnr_tbl_to_usec);
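/*
 * Editor's note: the table holds microseconds while its comments give
 * milliseconds, and only the AETH credit bits select the entry, e.g.
 *
 *	rvt_rnr_tbl_to_usec(0x16) == 20480	(20.48 ms)
 *
 * Any bits outside IB_AETH_CREDIT_MASK in @index are ignored.
 */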
1969 static inline unsigned long rvt_aeth_to_usec(u32 aeth)
1971 return ib_rvt_rnr_table[(aeth >> IB_AETH_CREDIT_SHIFT) &
1972 IB_AETH_CREDIT_MASK];
1976 * rvt_add_retry_timer - add/start a retry timer
1978 * add a retry timer on the QP
1980 void rvt_add_retry_timer(struct rvt_qp *qp)
1982 struct ib_qp *ibqp = &qp->ibqp;
1983 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1985 lockdep_assert_held(&qp->s_lock);
1986 qp->s_flags |= RVT_S_TIMER;
1987 /* 4.096 usec. * (1 << qp->timeout) */
1988 qp->s_timer.expires = jiffies + qp->timeout_jiffies +
1990 add_timer(&qp->s_timer);
1992 EXPORT_SYMBOL(rvt_add_retry_timer);
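/*
 * Editor's note: the retry expiry follows the IB local ACK timeout
 * formula of 4.096 usec * 2^timeout (see qp->timeout_jiffies), so e.g.
 * timeout = 14 gives about 4.096 us * 16384 ~= 67 ms; per the comment in
 * rvt_create_qp(), rdi->busy_jiffies is added on top when many RC QPs
 * are active.
 */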
1995 * rvt_add_rnr_timer - add/start an rnr timer
1997 * @aeth - aeth of RNR timeout, simulated aeth for loopback
1998 * add an rnr timer on the QP
2000 void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth)
2004 lockdep_assert_held(&qp->s_lock);
2005 qp->s_flags |= RVT_S_WAIT_RNR;
2006 to = rvt_aeth_to_usec(aeth);
2007 hrtimer_start(&qp->s_rnr_timer,
2008 ns_to_ktime(1000 * to), HRTIMER_MODE_REL);
2010 EXPORT_SYMBOL(rvt_add_rnr_timer);
2013 * rvt_stop_rc_timers - stop all timers
2015 * stop any pending timers
2017 void rvt_stop_rc_timers(struct rvt_qp *qp)
2019 lockdep_assert_held(&qp->s_lock);
2020 /* Remove QP from all timers */
2021 if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
2022 qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
2023 del_timer(&qp->s_timer);
2024 hrtimer_try_to_cancel(&qp->s_rnr_timer);
2027 EXPORT_SYMBOL(rvt_stop_rc_timers);
2030 * rvt_stop_rnr_timer - stop an rnr timer
2033 * stop an rnr timer and return if the timer
2036 static int rvt_stop_rnr_timer(struct rvt_qp *qp)
2040 lockdep_assert_held(&qp->s_lock);
2041 /* Remove QP from rnr timer */
2042 if (qp->s_flags & RVT_S_WAIT_RNR) {
2043 qp->s_flags &= ~RVT_S_WAIT_RNR;
2044 rval = hrtimer_try_to_cancel(&qp->s_rnr_timer);
2050 * rvt_del_timers_sync - wait for any timeout routines to exit
2053 void rvt_del_timers_sync(struct rvt_qp *qp)
2055 del_timer_sync(&qp->s_timer);
2056 hrtimer_cancel(&qp->s_rnr_timer);
2058 EXPORT_SYMBOL(rvt_del_timers_sync);
2061 * This is called from s_timer for missing responses.
2063 static void rvt_rc_timeout(unsigned long arg)
2065 struct rvt_qp *qp = (struct rvt_qp *)arg;
2066 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
2067 unsigned long flags;
2069 spin_lock_irqsave(&qp->r_lock, flags);
2070 spin_lock(&qp->s_lock);
2071 if (qp->s_flags & RVT_S_TIMER) {
2072 qp->s_flags &= ~RVT_S_TIMER;
2073 del_timer(&qp->s_timer);
2074 if (rdi->driver_f.notify_restart_rc)
2075 rdi->driver_f.notify_restart_rc(qp,
2078 rdi->driver_f.schedule_send(qp);
2080 spin_unlock(&qp->s_lock);
2081 spin_unlock_irqrestore(&qp->r_lock, flags);
2085 * This is called from s_timer for RNR timeouts.
2087 enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t)
2089 struct rvt_qp *qp = container_of(t, struct rvt_qp, s_rnr_timer);
2090 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
2091 unsigned long flags;
2093 spin_lock_irqsave(&qp->s_lock, flags);
2094 rvt_stop_rnr_timer(qp);
2095 rdi->driver_f.schedule_send(qp);
2096 spin_unlock_irqrestore(&qp->s_lock, flags);
2097 return HRTIMER_NORESTART;
2099 EXPORT_SYMBOL(rvt_rc_rnr_retry);