/*
 * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
 * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
 * Copyright (c) 2004 Intel Corporation.  All rights reserved.
 * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
 * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/export.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <net/addrconf.h>
#include <linux/security.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_addr.h>
#include <rdma/rw.h>
#include <rdma/lag.h>

#include "core_priv.h"
#include <trace/events/rdma_core.h>
static int ib_resolve_eth_dmac(struct ib_device *device,
			       struct rdma_ah_attr *ah_attr);
static const char * const ib_events[] = {
	[IB_EVENT_CQ_ERR]		= "CQ error",
	[IB_EVENT_QP_FATAL]		= "QP fatal error",
	[IB_EVENT_QP_REQ_ERR]		= "QP request error",
	[IB_EVENT_QP_ACCESS_ERR]	= "QP access error",
	[IB_EVENT_COMM_EST]		= "communication established",
	[IB_EVENT_SQ_DRAINED]		= "send queue drained",
	[IB_EVENT_PATH_MIG]		= "path migration successful",
	[IB_EVENT_PATH_MIG_ERR]		= "path migration error",
	[IB_EVENT_DEVICE_FATAL]		= "device fatal error",
	[IB_EVENT_PORT_ACTIVE]		= "port active",
	[IB_EVENT_PORT_ERR]		= "port error",
	[IB_EVENT_LID_CHANGE]		= "LID change",
	[IB_EVENT_PKEY_CHANGE]		= "P_key change",
	[IB_EVENT_SM_CHANGE]		= "SM change",
	[IB_EVENT_SRQ_ERR]		= "SRQ error",
	[IB_EVENT_SRQ_LIMIT_REACHED]	= "SRQ limit reached",
	[IB_EVENT_QP_LAST_WQE_REACHED]	= "last WQE reached",
	[IB_EVENT_CLIENT_REREGISTER]	= "client reregister",
	[IB_EVENT_GID_CHANGE]		= "GID changed",
};
const char *__attribute_const__ ib_event_msg(enum ib_event_type event)
{
	size_t index = event;

	return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ?
			ib_events[index] : "unrecognized event";
}
EXPORT_SYMBOL(ib_event_msg);
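/*
 * Illustrative sketch (not part of this file): an asynchronous event handler
 * registered with ib_register_event_handler() can use ib_event_msg() to turn
 * the numeric event code into a readable string. The handler name below is
 * hypothetical.
 *
 *	static void my_event_handler(struct ib_event_handler *handler,
 *				     struct ib_event *event)
 *	{
 *		pr_info("%s: async event: %s\n",
 *			dev_name(&event->device->dev),
 *			ib_event_msg(event->event));
 *	}
 */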
static const char * const wc_statuses[] = {
	[IB_WC_SUCCESS]			= "success",
	[IB_WC_LOC_LEN_ERR]		= "local length error",
	[IB_WC_LOC_QP_OP_ERR]		= "local QP operation error",
	[IB_WC_LOC_EEC_OP_ERR]		= "local EE context operation error",
	[IB_WC_LOC_PROT_ERR]		= "local protection error",
	[IB_WC_WR_FLUSH_ERR]		= "WR flushed",
	[IB_WC_MW_BIND_ERR]		= "memory bind operation error",
	[IB_WC_BAD_RESP_ERR]		= "bad response error",
	[IB_WC_LOC_ACCESS_ERR]		= "local access error",
	[IB_WC_REM_INV_REQ_ERR]		= "remote invalid request error",
	[IB_WC_REM_ACCESS_ERR]		= "remote access error",
	[IB_WC_REM_OP_ERR]		= "remote operation error",
	[IB_WC_RETRY_EXC_ERR]		= "transport retry counter exceeded",
	[IB_WC_RNR_RETRY_EXC_ERR]	= "RNR retry counter exceeded",
	[IB_WC_LOC_RDD_VIOL_ERR]	= "local RDD violation error",
	[IB_WC_REM_INV_RD_REQ_ERR]	= "remote invalid RD request",
	[IB_WC_REM_ABORT_ERR]		= "operation aborted",
	[IB_WC_INV_EECN_ERR]		= "invalid EE context number",
	[IB_WC_INV_EEC_STATE_ERR]	= "invalid EE context state",
	[IB_WC_FATAL_ERR]		= "fatal error",
	[IB_WC_RESP_TIMEOUT_ERR]	= "response timeout error",
	[IB_WC_GENERAL_ERR]		= "general error",
};

const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status)
{
	size_t index = status;

	return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ?
			wc_statuses[index] : "unrecognized status";
}
EXPORT_SYMBOL(ib_wc_status_msg);
__attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
{
	switch (rate) {
	case IB_RATE_2_5_GBPS: return   1;
	case IB_RATE_5_GBPS:   return   2;
	case IB_RATE_10_GBPS:  return   4;
	case IB_RATE_20_GBPS:  return   8;
	case IB_RATE_30_GBPS:  return  12;
	case IB_RATE_40_GBPS:  return  16;
	case IB_RATE_60_GBPS:  return  24;
	case IB_RATE_80_GBPS:  return  32;
	case IB_RATE_120_GBPS: return  48;
	case IB_RATE_14_GBPS:  return   6;
	case IB_RATE_56_GBPS:  return  22;
	case IB_RATE_112_GBPS: return  45;
	case IB_RATE_168_GBPS: return  67;
	case IB_RATE_25_GBPS:  return  10;
	case IB_RATE_100_GBPS: return  40;
	case IB_RATE_200_GBPS: return  80;
	case IB_RATE_300_GBPS: return 120;
	case IB_RATE_28_GBPS:  return  11;
	case IB_RATE_50_GBPS:  return  20;
	case IB_RATE_400_GBPS: return 160;
	case IB_RATE_600_GBPS: return 240;
	case IB_RATE_800_GBPS: return 320;
	default:	       return  -1;
	}
}
EXPORT_SYMBOL(ib_rate_to_mult);
__attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
{
	switch (mult) {
	case 1:   return IB_RATE_2_5_GBPS;
	case 2:   return IB_RATE_5_GBPS;
	case 4:   return IB_RATE_10_GBPS;
	case 8:   return IB_RATE_20_GBPS;
	case 12:  return IB_RATE_30_GBPS;
	case 16:  return IB_RATE_40_GBPS;
	case 24:  return IB_RATE_60_GBPS;
	case 32:  return IB_RATE_80_GBPS;
	case 48:  return IB_RATE_120_GBPS;
	case 6:   return IB_RATE_14_GBPS;
	case 22:  return IB_RATE_56_GBPS;
	case 45:  return IB_RATE_112_GBPS;
	case 67:  return IB_RATE_168_GBPS;
	case 10:  return IB_RATE_25_GBPS;
	case 40:  return IB_RATE_100_GBPS;
	case 80:  return IB_RATE_200_GBPS;
	case 120: return IB_RATE_300_GBPS;
	case 11:  return IB_RATE_28_GBPS;
	case 20:  return IB_RATE_50_GBPS;
	case 160: return IB_RATE_400_GBPS;
	case 240: return IB_RATE_600_GBPS;
	case 320: return IB_RATE_800_GBPS;
	default:  return IB_RATE_PORT_CURRENT;
	}
}
EXPORT_SYMBOL(mult_to_ib_rate);
__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
{
	switch (rate) {
	case IB_RATE_2_5_GBPS: return 2500;
	case IB_RATE_5_GBPS:   return 5000;
	case IB_RATE_10_GBPS:  return 10000;
	case IB_RATE_20_GBPS:  return 20000;
	case IB_RATE_30_GBPS:  return 30000;
	case IB_RATE_40_GBPS:  return 40000;
	case IB_RATE_60_GBPS:  return 60000;
	case IB_RATE_80_GBPS:  return 80000;
	case IB_RATE_120_GBPS: return 120000;
	case IB_RATE_14_GBPS:  return 14062;
	case IB_RATE_56_GBPS:  return 56250;
	case IB_RATE_112_GBPS: return 112500;
	case IB_RATE_168_GBPS: return 168750;
	case IB_RATE_25_GBPS:  return 25781;
	case IB_RATE_100_GBPS: return 103125;
	case IB_RATE_200_GBPS: return 206250;
	case IB_RATE_300_GBPS: return 309375;
	case IB_RATE_28_GBPS:  return 28125;
	case IB_RATE_50_GBPS:  return 53125;
	case IB_RATE_400_GBPS: return 425000;
	case IB_RATE_600_GBPS: return 637500;
	case IB_RATE_800_GBPS: return 850000;
	default:	       return -1;
	}
}
EXPORT_SYMBOL(ib_rate_to_mbps);
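/*
 * Illustrative sketch (not part of this file): the rate helpers above are
 * plain table lookups, e.g. converting a port's active rate for logging.
 * The variable names below are hypothetical; the values follow directly
 * from the tables above.
 *
 *	enum ib_rate rate = IB_RATE_100_GBPS;
 *	int mult = ib_rate_to_mult(rate);	-- 40 (multiples of 2.5 Gb/s)
 *	int mbps = ib_rate_to_mbps(rate);	-- 103125
 *
 *	pr_debug("rate: %d x 2.5 Gb/s (%d Mb/s)\n", mult, mbps);
 */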
__attribute_const__ enum rdma_transport_type
rdma_node_get_transport(unsigned int node_type)
{
	if (node_type == RDMA_NODE_USNIC)
		return RDMA_TRANSPORT_USNIC;
	if (node_type == RDMA_NODE_USNIC_UDP)
		return RDMA_TRANSPORT_USNIC_UDP;
	if (node_type == RDMA_NODE_RNIC)
		return RDMA_TRANSPORT_IWARP;
	if (node_type == RDMA_NODE_UNSPECIFIED)
		return RDMA_TRANSPORT_UNSPECIFIED;

	return RDMA_TRANSPORT_IB;
}
EXPORT_SYMBOL(rdma_node_get_transport);
enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device,
					      u32 port_num)
{
	enum rdma_transport_type lt;

	if (device->ops.get_link_layer)
		return device->ops.get_link_layer(device, port_num);

	lt = rdma_node_get_transport(device->node_type);
	if (lt == RDMA_TRANSPORT_IB)
		return IB_LINK_LAYER_INFINIBAND;

	return IB_LINK_LAYER_ETHERNET;
}
EXPORT_SYMBOL(rdma_port_get_link_layer);
/* Protection domains */

/**
 * __ib_alloc_pd - Allocates an unused protection domain.
 * @device: The device on which to allocate the protection domain.
 * @flags: protection domain flags
 * @caller: caller's build-time module name
 *
 * A protection domain object provides an association between QPs, shared
 * receive queues, address handles, memory regions, and memory windows.
 *
 * Every PD has a local_dma_lkey which can be used as the lkey value for local
 * memory operations.
 */
struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
			    const char *caller)
{
	struct ib_pd *pd;
	int mr_access_flags = 0;
	int ret;

	pd = rdma_zalloc_drv_obj(device, ib_pd);
	if (!pd)
		return ERR_PTR(-ENOMEM);

	pd->device = device;
	pd->flags = flags;

	rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD);
	rdma_restrack_set_name(&pd->res, caller);

	ret = device->ops.alloc_pd(pd, NULL);
	if (ret) {
		rdma_restrack_put(&pd->res);
		kfree(pd);
		return ERR_PTR(ret);
	}
	rdma_restrack_add(&pd->res);

	if (device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY)
		pd->local_dma_lkey = device->local_dma_lkey;
	else
		mr_access_flags |= IB_ACCESS_LOCAL_WRITE;

	if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
		pr_warn("%s: enabling unsafe global rkey\n", caller);
		mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE;
	}

	if (mr_access_flags) {
		struct ib_mr *mr;

		mr = pd->device->ops.get_dma_mr(pd, mr_access_flags);
		if (IS_ERR(mr)) {
			ib_dealloc_pd(pd);
			return ERR_CAST(mr);
		}

		mr->device = pd->device;
		mr->pd = pd;
		mr->type = IB_MR_TYPE_DMA;
		mr->uobject = NULL;
		mr->need_inval = false;

		pd->__internal_mr = mr;

		if (!(device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY))
			pd->local_dma_lkey = pd->__internal_mr->lkey;

		if (flags & IB_PD_UNSAFE_GLOBAL_RKEY)
			pd->unsafe_global_rkey = pd->__internal_mr->rkey;
	}

	return pd;
}
EXPORT_SYMBOL(__ib_alloc_pd);
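/*
 * Illustrative sketch (not part of this file): kernel ULPs normally go
 * through the ib_alloc_pd() wrapper rather than calling __ib_alloc_pd()
 * directly. Error handling is abbreviated and the variable names are
 * hypothetical.
 *
 *	struct ib_pd *pd;
 *
 *	pd = ib_alloc_pd(device, 0);
 *	if (IS_ERR(pd))
 *		return PTR_ERR(pd);
 *
 *	... create QPs/CQs/MRs against this pd ...
 *
 *	ib_dealloc_pd(pd);
 */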
/**
 * ib_dealloc_pd_user - Deallocates a protection domain.
 * @pd: The protection domain to deallocate.
 * @udata: Valid user data or NULL for kernel object
 *
 * It is an error to call this function while any resources in the pd still
 * exist.  The caller is responsible to synchronously destroy them and
 * guarantee no new allocations will happen.
 */
int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)
{
	int ret;

	if (pd->__internal_mr) {
		ret = pd->device->ops.dereg_mr(pd->__internal_mr, NULL);
		WARN_ON(ret);
		pd->__internal_mr = NULL;
	}

	ret = pd->device->ops.dealloc_pd(pd, udata);
	if (ret)
		return ret;

	rdma_restrack_del(&pd->res);
	kfree(pd);
	return ret;
}
EXPORT_SYMBOL(ib_dealloc_pd_user);
/* Address handles */

/**
 * rdma_copy_ah_attr - Copy rdma ah attribute from source to destination.
 * @dest: Pointer to destination ah_attr. Contents of the destination
 *        pointer is assumed to be invalid and attribute are overwritten.
 * @src:  Pointer to source ah_attr.
 */
void rdma_copy_ah_attr(struct rdma_ah_attr *dest,
		       const struct rdma_ah_attr *src)
{
	*dest = *src;
	if (dest->grh.sgid_attr)
		rdma_hold_gid_attr(dest->grh.sgid_attr);
}
EXPORT_SYMBOL(rdma_copy_ah_attr);

/**
 * rdma_replace_ah_attr - Replace valid ah_attr with new one.
 * @old: Pointer to existing ah_attr which needs to be replaced.
 *       old is assumed to be valid or zero'd
 * @new: Pointer to the new ah_attr.
 *
 * rdma_replace_ah_attr() first releases any reference in the old ah_attr if
 * the old ah_attr is valid; after that it copies the new attribute and holds
 * the reference to the replaced ah_attr.
 */
void rdma_replace_ah_attr(struct rdma_ah_attr *old,
			  const struct rdma_ah_attr *new)
{
	rdma_destroy_ah_attr(old);
	*old = *new;
	if (old->grh.sgid_attr)
		rdma_hold_gid_attr(old->grh.sgid_attr);
}
EXPORT_SYMBOL(rdma_replace_ah_attr);

/**
 * rdma_move_ah_attr - Move ah_attr pointed by source to destination.
 * @dest: Pointer to destination ah_attr to copy to.
 *        dest is assumed to be valid or zero'd
 * @src:  Pointer to the new ah_attr.
 *
 * rdma_move_ah_attr() first releases any reference in the destination ah_attr
 * if it is valid. This also transfers ownership of internal references from
 * src to dest, making src invalid in the process. No new reference of the src
 * ah_attr is taken.
 */
void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src)
{
	rdma_destroy_ah_attr(dest);
	*dest = *src;
	src->grh.sgid_attr = NULL;
}
EXPORT_SYMBOL(rdma_move_ah_attr);
/*
 * Validate that the rdma_ah_attr is valid for the device before passing it
 * off to the driver.
 */
static int rdma_check_ah_attr(struct ib_device *device,
			      struct rdma_ah_attr *ah_attr)
{
	if (!rdma_is_port_valid(device, ah_attr->port_num))
		return -EINVAL;

	if ((rdma_is_grh_required(device, ah_attr->port_num) ||
	     ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) &&
	    !(ah_attr->ah_flags & IB_AH_GRH))
		return -EINVAL;

	if (ah_attr->grh.sgid_attr) {
		/*
		 * Make sure the passed sgid_attr is consistent with the
		 * parameters
		 */
		if (ah_attr->grh.sgid_attr->index != ah_attr->grh.sgid_index ||
		    ah_attr->grh.sgid_attr->port_num != ah_attr->port_num)
			return -EINVAL;
	}
	return 0;
}

/*
 * If the ah requires a GRH then ensure that sgid_attr pointer is filled in.
 * On success the caller is responsible to call rdma_unfill_sgid_attr().
 */
static int rdma_fill_sgid_attr(struct ib_device *device,
			       struct rdma_ah_attr *ah_attr,
			       const struct ib_gid_attr **old_sgid_attr)
{
	const struct ib_gid_attr *sgid_attr;
	struct ib_global_route *grh;
	int ret;

	*old_sgid_attr = ah_attr->grh.sgid_attr;

	ret = rdma_check_ah_attr(device, ah_attr);
	if (ret)
		return ret;

	if (!(ah_attr->ah_flags & IB_AH_GRH))
		return 0;

	grh = rdma_ah_retrieve_grh(ah_attr);
	if (grh->sgid_attr)
		return 0;

	sgid_attr =
		rdma_get_gid_attr(device, ah_attr->port_num, grh->sgid_index);
	if (IS_ERR(sgid_attr))
		return PTR_ERR(sgid_attr);

	/* Move ownership of the kref into the ah_attr */
	grh->sgid_attr = sgid_attr;
	return 0;
}

static void rdma_unfill_sgid_attr(struct rdma_ah_attr *ah_attr,
				  const struct ib_gid_attr *old_sgid_attr)
{
	/*
	 * Fill didn't change anything, the caller retains ownership of
	 * whatever it passed
	 */
	if (ah_attr->grh.sgid_attr == old_sgid_attr)
		return;

	/*
	 * Otherwise, we need to undo what rdma_fill_sgid_attr so the caller
	 * doesn't see any change in the rdma_ah_attr. If we get here
	 * old_sgid_attr is NULL.
	 */
	rdma_destroy_ah_attr(ah_attr);
}

static const struct ib_gid_attr *
rdma_update_sgid_attr(struct rdma_ah_attr *ah_attr,
		      const struct ib_gid_attr *old_attr)
{
	if (old_attr)
		rdma_put_gid_attr(old_attr);
	if (ah_attr->ah_flags & IB_AH_GRH) {
		rdma_hold_gid_attr(ah_attr->grh.sgid_attr);
		return ah_attr->grh.sgid_attr;
	}
	return NULL;
}
static struct ib_ah *_rdma_create_ah(struct ib_pd *pd,
				     struct rdma_ah_attr *ah_attr,
				     u32 flags,
				     struct ib_udata *udata,
				     struct net_device *xmit_slave)
{
	struct rdma_ah_init_attr init_attr = {};
	struct ib_device *device = pd->device;
	struct ib_ah *ah;
	int ret;

	might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE);

	if (!udata && !device->ops.create_ah)
		return ERR_PTR(-EOPNOTSUPP);

	ah = rdma_zalloc_drv_obj_gfp(
		device, ib_ah,
		(flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC);
	if (!ah)
		return ERR_PTR(-ENOMEM);

	ah->device = device;
	ah->pd = pd;
	ah->type = ah_attr->type;
	ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL);
	init_attr.ah_attr = ah_attr;
	init_attr.flags = flags;
	init_attr.xmit_slave = xmit_slave;

	if (udata)
		ret = device->ops.create_user_ah(ah, &init_attr, udata);
	else
		ret = device->ops.create_ah(ah, &init_attr, NULL);
	if (ret) {
		if (ah->sgid_attr)
			rdma_put_gid_attr(ah->sgid_attr);
		kfree(ah);
		return ERR_PTR(ret);
	}

	atomic_inc(&pd->usecnt);
	return ah;
}
/**
 * rdma_create_ah - Creates an address handle for the
 * given address vector.
 * @pd: The protection domain associated with the address handle.
 * @ah_attr: The attributes of the address vector.
 * @flags: Create address handle flags (see enum rdma_create_ah_flags).
 *
 * It returns a newly allocated address handle on success and an ERR_PTR
 * on error.  The address handle is used to reference a local or global
 * destination in all UD QP post sends.
 */
struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
			     u32 flags)
{
	const struct ib_gid_attr *old_sgid_attr;
	struct net_device *slave;
	struct ib_ah *ah;
	int ret;

	ret = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr);
	if (ret)
		return ERR_PTR(ret);
	slave = rdma_lag_get_ah_roce_slave(pd->device, ah_attr,
					   (flags & RDMA_CREATE_AH_SLEEPABLE) ?
					   GFP_KERNEL : GFP_ATOMIC);
	if (IS_ERR(slave)) {
		rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
		return (void *)slave;
	}
	ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave);
	rdma_lag_put_ah_roce_slave(slave);
	rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
	return ah;
}
EXPORT_SYMBOL(rdma_create_ah);
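/*
 * Illustrative sketch (not part of this file): a UD consumer builds an
 * rdma_ah_attr, creates an AH against its PD and later destroys it.
 * Error handling is abbreviated and the names (remote_lid, port_num) are
 * hypothetical.
 *
 *	struct rdma_ah_attr ah_attr = {};
 *	struct ib_ah *ah;
 *
 *	ah_attr.type = rdma_ah_find_type(pd->device, port_num);
 *	rdma_ah_set_dlid(&ah_attr, remote_lid);
 *	rdma_ah_set_sl(&ah_attr, 0);
 *	rdma_ah_set_port_num(&ah_attr, port_num);
 *
 *	ah = rdma_create_ah(pd, &ah_attr, RDMA_CREATE_AH_SLEEPABLE);
 *	if (IS_ERR(ah))
 *		return PTR_ERR(ah);
 *	...
 *	rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
 */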
/**
 * rdma_create_user_ah - Creates an address handle for the
 * given address vector.
 * It resolves destination mac address for ah attribute of RoCE type.
 * @pd: The protection domain associated with the address handle.
 * @ah_attr: The attributes of the address vector.
 * @udata: pointer to user's input output buffer information needed by
 *         provider driver.
 *
 * It returns a newly allocated address handle on success and an ERR_PTR
 * on error.  The address handle is used to reference a local or global
 * destination in all UD QP post sends.
 */
struct ib_ah *rdma_create_user_ah(struct ib_pd *pd,
				  struct rdma_ah_attr *ah_attr,
				  struct ib_udata *udata)
{
	const struct ib_gid_attr *old_sgid_attr;
	struct ib_ah *ah;
	int err;

	err = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr);
	if (err)
		return ERR_PTR(err);

	if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
		err = ib_resolve_eth_dmac(pd->device, ah_attr);
		if (err) {
			ah = ERR_PTR(err);
			goto out;
		}
	}

	ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE,
			     udata, NULL);

out:
	rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
	return ah;
}
EXPORT_SYMBOL(rdma_create_user_ah);
int ib_get_rdma_header_version(const union rdma_network_hdr *hdr)
{
	const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh;
	struct iphdr ip4h_checked;
	const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh;

	/* If it's IPv6, the version must be 6, otherwise, the first
	 * 20 bytes (before the IPv4 header) are garbled.
	 */
	if (ip6h->version != 6)
		return (ip4h->version == 4) ? 4 : 0;
	/* version may be 6 or 4 because the first 20 bytes could be garbled */

	/* RoCE v2 requires no options, thus header length
	 * must be 5 words
	 */
	if (ip4h->ihl != 5)
		return 6;

	/* Verify checksum.
	 * We can't write on scattered buffers so we need to copy to
	 * temp buffer.
	 */
	memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
	ip4h_checked.check = 0;
	ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5);
	/* if IPv4 header checksum is OK, believe it */
	if (ip4h->check == ip4h_checked.check)
		return 4;
	return 6;
}
EXPORT_SYMBOL(ib_get_rdma_header_version);
static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
						     u32 port_num,
						     const struct ib_grh *grh)
{
	int grh_version;

	if (rdma_protocol_ib(device, port_num))
		return RDMA_NETWORK_IB;

	grh_version = ib_get_rdma_header_version((union rdma_network_hdr *)grh);

	if (grh_version == 4)
		return RDMA_NETWORK_IPV4;

	if (grh->next_hdr == IPPROTO_UDP)
		return RDMA_NETWORK_IPV6;

	return RDMA_NETWORK_ROCE_V1;
}

struct find_gid_index_context {
	u16 vlan_id;
	enum ib_gid_type gid_type;
};

static bool find_gid_index(const union ib_gid *gid,
			   const struct ib_gid_attr *gid_attr,
			   void *context)
{
	struct find_gid_index_context *ctx = context;
	u16 vlan_id = 0xffff;
	int ret;

	if (ctx->gid_type != gid_attr->gid_type)
		return false;

	ret = rdma_read_gid_l2_fields(gid_attr, &vlan_id, NULL);
	if (ret)
		return false;

	return ctx->vlan_id == vlan_id;
}

static const struct ib_gid_attr *
get_sgid_attr_from_eth(struct ib_device *device, u32 port_num,
		       u16 vlan_id, const union ib_gid *sgid,
		       enum ib_gid_type gid_type)
{
	struct find_gid_index_context context = {.vlan_id = vlan_id,
						 .gid_type = gid_type};

	return rdma_find_gid_by_filter(device, sgid, port_num, find_gid_index,
				       &context);
}
int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr,
			      enum rdma_network_type net_type,
			      union ib_gid *sgid, union ib_gid *dgid)
{
	struct sockaddr_in  src_in;
	struct sockaddr_in  dst_in;
	__be32 src_saddr, dst_saddr;

	if (!sgid || !dgid)
		return -EINVAL;

	if (net_type == RDMA_NETWORK_IPV4) {
		memcpy(&src_in.sin_addr.s_addr,
		       &hdr->roce4grh.saddr, 4);
		memcpy(&dst_in.sin_addr.s_addr,
		       &hdr->roce4grh.daddr, 4);
		src_saddr = src_in.sin_addr.s_addr;
		dst_saddr = dst_in.sin_addr.s_addr;
		ipv6_addr_set_v4mapped(src_saddr,
				       (struct in6_addr *)sgid);
		ipv6_addr_set_v4mapped(dst_saddr,
				       (struct in6_addr *)dgid);
		return 0;
	} else if (net_type == RDMA_NETWORK_IPV6 ||
		   net_type == RDMA_NETWORK_IB || RDMA_NETWORK_ROCE_V1) {
		*dgid = hdr->ibgrh.dgid;
		*sgid = hdr->ibgrh.sgid;
		return 0;
	} else {
		return -EINVAL;
	}
}
EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr);
/* Resolve destination mac address and hop limit for unicast destination
 * GID entry, considering the source GID entry as well.
 * ah_attribute must have valid port_num, sgid_index.
 */
static int ib_resolve_unicast_gid_dmac(struct ib_device *device,
				       struct rdma_ah_attr *ah_attr)
{
	struct ib_global_route *grh = rdma_ah_retrieve_grh(ah_attr);
	const struct ib_gid_attr *sgid_attr = grh->sgid_attr;
	int hop_limit = 0xff;
	int ret = 0;

	/* If destination is link local and source GID is RoCEv1,
	 * IP stack is not used.
	 */
	if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) &&
	    sgid_attr->gid_type == IB_GID_TYPE_ROCE) {
		rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw,
				ah_attr->roce.dmac);
		return ret;
	}

	ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid,
					   ah_attr->roce.dmac,
					   sgid_attr, &hop_limit);

	grh->hop_limit = hop_limit;
	return ret;
}

/*
 * This function initializes address handle attributes from the incoming packet.
 * Incoming packet has dgid of the receiver node on which this code is
 * getting executed and sgid contains the GID of the sender.
 *
 * When resolving mac address of destination, the arrived dgid is used
 * as sgid and sgid is used as dgid because sgid contains destination's
 * GID whom to respond to.
 *
 * On success the caller is responsible to call rdma_destroy_ah_attr on the
 * attr.
 */
int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num,
			    const struct ib_wc *wc, const struct ib_grh *grh,
			    struct rdma_ah_attr *ah_attr)
{
	u32 flow_class;
	int ret;
	enum rdma_network_type net_type = RDMA_NETWORK_IB;
	enum ib_gid_type gid_type = IB_GID_TYPE_IB;
	const struct ib_gid_attr *sgid_attr;
	int hoplimit = 0xff;
	union ib_gid dgid;
	union ib_gid sgid;

	might_sleep();

	memset(ah_attr, 0, sizeof *ah_attr);
	ah_attr->type = rdma_ah_find_type(device, port_num);
	if (rdma_cap_eth_ah(device, port_num)) {
		if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE)
			net_type = wc->network_hdr_type;
		else
			net_type = ib_get_net_type_by_grh(device, port_num, grh);
		gid_type = ib_network_to_gid_type(net_type);
	}
	ret = ib_get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type,
					&sgid, &dgid);
	if (ret)
		return ret;

	rdma_ah_set_sl(ah_attr, wc->sl);
	rdma_ah_set_port_num(ah_attr, port_num);

	if (rdma_protocol_roce(device, port_num)) {
		u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
				wc->vlan_id : 0xffff;

		if (!(wc->wc_flags & IB_WC_GRH))
			return -EPROTOTYPE;

		sgid_attr = get_sgid_attr_from_eth(device, port_num,
						   vlan_id, &dgid,
						   gid_type);
		if (IS_ERR(sgid_attr))
			return PTR_ERR(sgid_attr);

		flow_class = be32_to_cpu(grh->version_tclass_flow);
		rdma_move_grh_sgid_attr(ah_attr,
					&sgid,
					flow_class & 0xFFFFF,
					hoplimit,
					(flow_class >> 20) & 0xFF,
					sgid_attr);

		ret = ib_resolve_unicast_gid_dmac(device, ah_attr);
		if (ret)
			rdma_destroy_ah_attr(ah_attr);

		return ret;
	} else {
		rdma_ah_set_dlid(ah_attr, wc->slid);
		rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits);

		if ((wc->wc_flags & IB_WC_GRH) == 0)
			return 0;

		if (dgid.global.interface_id !=
					cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) {
			sgid_attr = rdma_find_gid_by_port(
				device, &dgid, IB_GID_TYPE_IB, port_num, NULL);
		} else {
			sgid_attr = rdma_get_gid_attr(device, port_num, 0);
		}

		if (IS_ERR(sgid_attr))
			return PTR_ERR(sgid_attr);
		flow_class = be32_to_cpu(grh->version_tclass_flow);
		rdma_move_grh_sgid_attr(ah_attr,
					&sgid,
					flow_class & 0xFFFFF,
					hoplimit,
					(flow_class >> 20) & 0xFF,
					sgid_attr);

		return 0;
	}
}
EXPORT_SYMBOL(ib_init_ah_attr_from_wc);
/**
 * rdma_move_grh_sgid_attr - Sets the sgid attribute of GRH, taking ownership
 * of the reference
 *
 * @attr:	Pointer to AH attribute structure
 * @dgid:	Destination GID
 * @flow_label:	Flow label
 * @hop_limit:	Hop limit
 * @traffic_class: traffic class
 * @sgid_attr:	Pointer to SGID attribute
 *
 * This takes ownership of the sgid_attr reference. The caller must ensure
 * rdma_destroy_ah_attr() is called before destroying the rdma_ah_attr after
 * calling this function.
 */
void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid,
			     u32 flow_label, u8 hop_limit, u8 traffic_class,
			     const struct ib_gid_attr *sgid_attr)
{
	rdma_ah_set_grh(attr, dgid, flow_label, sgid_attr->index, hop_limit,
			traffic_class);
	attr->grh.sgid_attr = sgid_attr;
}
EXPORT_SYMBOL(rdma_move_grh_sgid_attr);

/**
 * rdma_destroy_ah_attr - Release reference to SGID attribute of
 * ah attribute.
 * @ah_attr: Pointer to ah attribute
 *
 * Release reference to the SGID attribute of the ah attribute if it is
 * non NULL. It is safe to call this multiple times, and safe to call it on
 * a zero initialized ah_attr.
 */
void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr)
{
	if (ah_attr->grh.sgid_attr) {
		rdma_put_gid_attr(ah_attr->grh.sgid_attr);
		ah_attr->grh.sgid_attr = NULL;
	}
}
EXPORT_SYMBOL(rdma_destroy_ah_attr);
struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
				   const struct ib_grh *grh, u32 port_num)
{
	struct rdma_ah_attr ah_attr;
	struct ib_ah *ah;
	int ret;

	ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr);
	if (ret)
		return ERR_PTR(ret);

	ah = rdma_create_ah(pd, &ah_attr, RDMA_CREATE_AH_SLEEPABLE);

	rdma_destroy_ah_attr(&ah_attr);
	return ah;
}
EXPORT_SYMBOL(ib_create_ah_from_wc);
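/*
 * Illustrative sketch (not part of this file): a UD service that wants to
 * reply to the sender of a received datagram can build an AH directly from
 * the work completion and the received GRH. Names are hypothetical and
 * error handling is abbreviated.
 *
 *	struct ib_ah *reply_ah;
 *
 *	reply_ah = ib_create_ah_from_wc(pd, &wc, grh, port_num);
 *	if (IS_ERR(reply_ah))
 *		return PTR_ERR(reply_ah);
 *
 *	... post a send on the UD QP using reply_ah ...
 *
 *	rdma_destroy_ah(reply_ah, RDMA_DESTROY_AH_SLEEPABLE);
 */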
int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)
{
	const struct ib_gid_attr *old_sgid_attr;
	int ret;

	if (ah->type != ah_attr->type)
		return -EINVAL;

	ret = rdma_fill_sgid_attr(ah->device, ah_attr, &old_sgid_attr);
	if (ret)
		return ret;

	ret = ah->device->ops.modify_ah ?
		ah->device->ops.modify_ah(ah, ah_attr) :
		-EOPNOTSUPP;

	ah->sgid_attr = rdma_update_sgid_attr(ah_attr, ah->sgid_attr);
	rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
	return ret;
}
EXPORT_SYMBOL(rdma_modify_ah);

int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)
{
	ah_attr->grh.sgid_attr = NULL;

	return ah->device->ops.query_ah ?
		ah->device->ops.query_ah(ah, ah_attr) :
		-EOPNOTSUPP;
}
EXPORT_SYMBOL(rdma_query_ah);

int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata)
{
	const struct ib_gid_attr *sgid_attr = ah->sgid_attr;
	struct ib_pd *pd;
	int ret;

	might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE);

	pd = ah->pd;

	ret = ah->device->ops.destroy_ah(ah, flags);
	if (ret)
		return ret;

	atomic_dec(&pd->usecnt);
	if (sgid_attr)
		rdma_put_gid_attr(sgid_attr);

	kfree(ah);
	return ret;
}
EXPORT_SYMBOL(rdma_destroy_ah_user);
/* Shared receive queues */

/**
 * ib_create_srq_user - Creates a SRQ associated with the specified protection
 *   domain.
 * @pd: The protection domain associated with the SRQ.
 * @srq_init_attr: A list of initial attributes required to create the
 *   SRQ.  If SRQ creation succeeds, then the attributes are updated to
 *   the actual capabilities of the created SRQ.
 * @uobject: uobject pointer if this is not a kernel SRQ
 * @udata: udata pointer if this is not a kernel SRQ
 *
 * srq_attr->max_wr and srq_attr->max_sge are read to determine the
 * requested size of the SRQ, and set to the actual values allocated
 * on return.  If ib_create_srq() succeeds, then max_wr and max_sge
 * will always be at least as large as the requested values.
 */
struct ib_srq *ib_create_srq_user(struct ib_pd *pd,
				  struct ib_srq_init_attr *srq_init_attr,
				  struct ib_usrq_object *uobject,
				  struct ib_udata *udata)
{
	struct ib_srq *srq;
	int ret;

	srq = rdma_zalloc_drv_obj(pd->device, ib_srq);
	if (!srq)
		return ERR_PTR(-ENOMEM);

	srq->device = pd->device;
	srq->pd = pd;
	srq->event_handler = srq_init_attr->event_handler;
	srq->srq_context = srq_init_attr->srq_context;
	srq->srq_type = srq_init_attr->srq_type;
	srq->uobject = uobject;

	if (ib_srq_has_cq(srq->srq_type)) {
		srq->ext.cq = srq_init_attr->ext.cq;
		atomic_inc(&srq->ext.cq->usecnt);
	}
	if (srq->srq_type == IB_SRQT_XRC) {
		srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
		if (srq->ext.xrc.xrcd)
			atomic_inc(&srq->ext.xrc.xrcd->usecnt);
	}
	atomic_inc(&pd->usecnt);

	rdma_restrack_new(&srq->res, RDMA_RESTRACK_SRQ);
	rdma_restrack_parent_name(&srq->res, &pd->res);

	ret = pd->device->ops.create_srq(srq, srq_init_attr, udata);
	if (ret) {
		rdma_restrack_put(&srq->res);
		atomic_dec(&pd->usecnt);
		if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd)
			atomic_dec(&srq->ext.xrc.xrcd->usecnt);
		if (ib_srq_has_cq(srq->srq_type))
			atomic_dec(&srq->ext.cq->usecnt);
		kfree(srq);
		return ERR_PTR(ret);
	}

	rdma_restrack_add(&srq->res);

	return srq;
}
EXPORT_SYMBOL(ib_create_srq_user);
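/*
 * Illustrative sketch (not part of this file): a kernel consumer typically
 * creates an SRQ through the ib_create_srq() wrapper. The queue sizes below
 * are hypothetical.
 *
 *	struct ib_srq_init_attr srq_attr = {
 *		.attr = { .max_wr = 256, .max_sge = 1 },
 *	};
 *	struct ib_srq *srq;
 *
 *	srq = ib_create_srq(pd, &srq_attr);
 *	if (IS_ERR(srq))
 *		return PTR_ERR(srq);
 *	...
 *	ib_destroy_srq(srq);
 */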
int ib_modify_srq(struct ib_srq *srq,
		  struct ib_srq_attr *srq_attr,
		  enum ib_srq_attr_mask srq_attr_mask)
{
	return srq->device->ops.modify_srq ?
		srq->device->ops.modify_srq(srq, srq_attr, srq_attr_mask,
					    NULL) : -EOPNOTSUPP;
}
EXPORT_SYMBOL(ib_modify_srq);

int ib_query_srq(struct ib_srq *srq,
		 struct ib_srq_attr *srq_attr)
{
	return srq->device->ops.query_srq ?
		srq->device->ops.query_srq(srq, srq_attr) : -EOPNOTSUPP;
}
EXPORT_SYMBOL(ib_query_srq);

int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata)
{
	int ret;

	if (atomic_read(&srq->usecnt))
		return -EBUSY;

	ret = srq->device->ops.destroy_srq(srq, udata);
	if (ret)
		return ret;

	atomic_dec(&srq->pd->usecnt);
	if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd)
		atomic_dec(&srq->ext.xrc.xrcd->usecnt);
	if (ib_srq_has_cq(srq->srq_type))
		atomic_dec(&srq->ext.cq->usecnt);
	rdma_restrack_del(&srq->res);
	kfree(srq);

	return ret;
}
EXPORT_SYMBOL(ib_destroy_srq_user);
/* Queue pairs */

static void __ib_qp_event_handler(struct ib_event *event, void *context)
{
	struct ib_qp *qp = event->element.qp;

	if (event->event == IB_EVENT_QP_LAST_WQE_REACHED)
		complete(&qp->srq_completion);
	if (qp->registered_event_handler)
		qp->registered_event_handler(event, qp->qp_context);
}

static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
{
	struct ib_qp *qp = context;
	unsigned long flags;

	spin_lock_irqsave(&qp->device->qp_open_list_lock, flags);
	list_for_each_entry(event->element.qp, &qp->open_list, open_list)
		if (event->element.qp->event_handler)
			event->element.qp->event_handler(event, event->element.qp->qp_context);
	spin_unlock_irqrestore(&qp->device->qp_open_list_lock, flags);
}

static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
				  void (*event_handler)(struct ib_event *, void *),
				  void *qp_context)
{
	struct ib_qp *qp;
	unsigned long flags;
	int err;

	qp = kzalloc(sizeof *qp, GFP_KERNEL);
	if (!qp)
		return ERR_PTR(-ENOMEM);

	qp->real_qp = real_qp;
	err = ib_open_shared_qp_security(qp, real_qp->device);
	if (err) {
		kfree(qp);
		return ERR_PTR(err);
	}

	qp->real_qp = real_qp;
	atomic_inc(&real_qp->usecnt);
	qp->device = real_qp->device;
	qp->event_handler = event_handler;
	qp->qp_context = qp_context;
	qp->qp_num = real_qp->qp_num;
	qp->qp_type = real_qp->qp_type;

	spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags);
	list_add(&qp->open_list, &real_qp->open_list);
	spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags);

	return qp;
}

struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
			 struct ib_qp_open_attr *qp_open_attr)
{
	struct ib_qp *qp, *real_qp;

	if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
		return ERR_PTR(-EINVAL);

	down_read(&xrcd->tgt_qps_rwsem);
	real_qp = xa_load(&xrcd->tgt_qps, qp_open_attr->qp_num);
	if (!real_qp) {
		up_read(&xrcd->tgt_qps_rwsem);
		return ERR_PTR(-EINVAL);
	}
	qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
			  qp_open_attr->qp_context);
	up_read(&xrcd->tgt_qps_rwsem);
	return qp;
}
EXPORT_SYMBOL(ib_open_qp);
static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp,
					struct ib_qp_init_attr *qp_init_attr)
{
	struct ib_qp *real_qp = qp;
	int err;

	qp->event_handler = __ib_shared_qp_event_handler;
	qp->qp_context = qp;
	qp->pd = NULL;
	qp->send_cq = qp->recv_cq = NULL;
	qp->srq = NULL;
	qp->xrcd = qp_init_attr->xrcd;
	atomic_inc(&qp_init_attr->xrcd->usecnt);
	INIT_LIST_HEAD(&qp->open_list);

	qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
			  qp_init_attr->qp_context);
	if (IS_ERR(qp))
		return qp;

	err = xa_err(xa_store(&qp_init_attr->xrcd->tgt_qps, real_qp->qp_num,
			      real_qp, GFP_KERNEL));
	if (err) {
		ib_close_qp(qp);
		return ERR_PTR(err);
	}
	return qp;
}

static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd,
			       struct ib_qp_init_attr *attr,
			       struct ib_udata *udata,
			       struct ib_uqp_object *uobj, const char *caller)
{
	struct ib_udata dummy = {};
	struct ib_qp *qp;
	int ret;

	if (!dev->ops.create_qp)
		return ERR_PTR(-EOPNOTSUPP);

	qp = rdma_zalloc_drv_obj_numa(dev, ib_qp);
	if (!qp)
		return ERR_PTR(-ENOMEM);

	qp->device = dev;
	qp->pd = pd;
	qp->uobject = uobj;
	qp->real_qp = qp;

	qp->qp_type = attr->qp_type;
	qp->rwq_ind_tbl = attr->rwq_ind_tbl;
	qp->srq = attr->srq;
	qp->event_handler = __ib_qp_event_handler;
	qp->registered_event_handler = attr->event_handler;
	qp->port = attr->port_num;
	qp->qp_context = attr->qp_context;

	spin_lock_init(&qp->mr_lock);
	INIT_LIST_HEAD(&qp->rdma_mrs);
	INIT_LIST_HEAD(&qp->sig_mrs);
	init_completion(&qp->srq_completion);

	qp->send_cq = attr->send_cq;
	qp->recv_cq = attr->recv_cq;

	rdma_restrack_new(&qp->res, RDMA_RESTRACK_QP);
	WARN_ONCE(!udata && !caller, "Missing kernel QP owner");
	rdma_restrack_set_name(&qp->res, udata ? NULL : caller);
	ret = dev->ops.create_qp(qp, attr, udata);
	if (ret)
		goto err_create;

	/*
	 * TODO: The mlx4 internally overwrites send_cq and recv_cq.
	 * Unfortunately, it is not an easy task to fix that driver.
	 */
	qp->send_cq = attr->send_cq;
	qp->recv_cq = attr->recv_cq;

	ret = ib_create_qp_security(qp, dev);
	if (ret)
		goto err_security;

	rdma_restrack_add(&qp->res);
	return qp;

err_security:
	qp->device->ops.destroy_qp(qp, udata ? &dummy : NULL);
err_create:
	rdma_restrack_put(&qp->res);
	kfree(qp);
	return ERR_PTR(ret);
}

/**
 * ib_create_qp_user - Creates a QP associated with the specified protection
 *   domain.
 * @dev: IB device
 * @pd: The protection domain associated with the QP.
 * @attr: A list of initial attributes required to create the
 *   QP.  If QP creation succeeds, then the attributes are updated to
 *   the actual capabilities of the created QP.
 * @udata: User data
 * @uobj: uverbs object
 * @caller: caller's build-time module name
 */
struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd,
				struct ib_qp_init_attr *attr,
				struct ib_udata *udata,
				struct ib_uqp_object *uobj, const char *caller)
{
	struct ib_qp *qp, *xrc_qp;

	if (attr->qp_type == IB_QPT_XRC_TGT)
		qp = create_qp(dev, pd, attr, NULL, NULL, caller);
	else
		qp = create_qp(dev, pd, attr, udata, uobj, NULL);
	if (attr->qp_type != IB_QPT_XRC_TGT || IS_ERR(qp))
		return qp;

	xrc_qp = create_xrc_qp_user(qp, attr);
	if (IS_ERR(xrc_qp)) {
		ib_destroy_qp(qp);
		return xrc_qp;
	}

	xrc_qp->uobject = uobj;
	return xrc_qp;
}
EXPORT_SYMBOL(ib_create_qp_user);
void ib_qp_usecnt_inc(struct ib_qp *qp)
{
	if (qp->pd)
		atomic_inc(&qp->pd->usecnt);
	if (qp->send_cq)
		atomic_inc(&qp->send_cq->usecnt);
	if (qp->recv_cq)
		atomic_inc(&qp->recv_cq->usecnt);
	if (qp->srq)
		atomic_inc(&qp->srq->usecnt);
	if (qp->rwq_ind_tbl)
		atomic_inc(&qp->rwq_ind_tbl->usecnt);
}
EXPORT_SYMBOL(ib_qp_usecnt_inc);

void ib_qp_usecnt_dec(struct ib_qp *qp)
{
	if (qp->rwq_ind_tbl)
		atomic_dec(&qp->rwq_ind_tbl->usecnt);
	if (qp->srq)
		atomic_dec(&qp->srq->usecnt);
	if (qp->recv_cq)
		atomic_dec(&qp->recv_cq->usecnt);
	if (qp->send_cq)
		atomic_dec(&qp->send_cq->usecnt);
	if (qp->pd)
		atomic_dec(&qp->pd->usecnt);
}
EXPORT_SYMBOL(ib_qp_usecnt_dec);
struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd,
				  struct ib_qp_init_attr *qp_init_attr,
				  const char *caller)
{
	struct ib_device *device = pd->device;
	struct ib_qp *qp;
	int ret;

	/*
	 * If the caller is using the RDMA API calculate the resources
	 * needed for the RDMA READ/WRITE operations.
	 *
	 * Note that these callers need to pass in a port number.
	 */
	if (qp_init_attr->cap.max_rdma_ctxs)
		rdma_rw_init_qp(device, qp_init_attr);

	qp = create_qp(device, pd, qp_init_attr, NULL, NULL, caller);
	if (IS_ERR(qp))
		return qp;

	ib_qp_usecnt_inc(qp);

	if (qp_init_attr->cap.max_rdma_ctxs) {
		ret = rdma_rw_init_mrs(qp, qp_init_attr);
		if (ret)
			goto err;
	}

	/*
	 * Note: all hw drivers guarantee that max_send_sge is lower than
	 * the device RDMA WRITE SGE limit but not all hw drivers ensure that
	 * max_send_sge <= max_sge_rd.
	 */
	qp->max_write_sge = qp_init_attr->cap.max_send_sge;
	qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
				 device->attrs.max_sge_rd);
	if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN)
		qp->integrity_en = true;

	return qp;

err:
	ib_destroy_qp(qp);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_create_qp_kernel);
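/*
 * Illustrative sketch (not part of this file): kernel ULPs normally use the
 * ib_create_qp() wrapper, which funnels into ib_create_qp_kernel(). The
 * capacities below are hypothetical.
 *
 *	struct ib_qp_init_attr init_attr = {
 *		.send_cq = send_cq,
 *		.recv_cq = recv_cq,
 *		.cap = {
 *			.max_send_wr  = 64,
 *			.max_recv_wr  = 64,
 *			.max_send_sge = 1,
 *			.max_recv_sge = 1,
 *		},
 *		.sq_sig_type = IB_SIGNAL_REQ_WR,
 *		.qp_type     = IB_QPT_RC,
 *	};
 *	struct ib_qp *qp;
 *
 *	qp = ib_create_qp(pd, &init_attr);
 *	if (IS_ERR(qp))
 *		return PTR_ERR(qp);
 *	...
 *	ib_destroy_qp(qp);
 */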
static const struct {
	int			valid;
	enum ib_qp_attr_mask	req_param[IB_QPT_MAX];
	enum ib_qp_attr_mask	opt_param[IB_QPT_MAX];
} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
	[IB_QPS_RESET] = {
		[IB_QPS_RESET] = { .valid = 1 },
		[IB_QPS_INIT]  = {
			.valid = 1,
			.req_param = {
				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX	|
						IB_QP_PORT		|
						IB_QP_QKEY),
				[IB_QPT_RAW_PACKET] = IB_QP_PORT,
				[IB_QPT_UC]  = (IB_QP_PKEY_INDEX	|
						IB_QP_PORT		|
						IB_QP_ACCESS_FLAGS),
				[IB_QPT_RC]  = (IB_QP_PKEY_INDEX	|
						IB_QP_PORT		|
						IB_QP_ACCESS_FLAGS),
				[IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX	|
						IB_QP_PORT		|
						IB_QP_ACCESS_FLAGS),
				[IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX	|
						IB_QP_PORT		|
						IB_QP_ACCESS_FLAGS),
				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX	|
						IB_QP_QKEY),
				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX	|
						IB_QP_QKEY),
			}
		},
	},
	[IB_QPS_INIT]  = {
		[IB_QPS_RESET] = { .valid = 1 },
		[IB_QPS_ERR] =   { .valid = 1 },
		[IB_QPS_INIT]  = {
			.valid = 1,
			.opt_param = {
				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX	|
						IB_QP_PORT		|
						IB_QP_QKEY),
				[IB_QPT_UC]  = (IB_QP_PKEY_INDEX	|
						IB_QP_PORT		|
						IB_QP_ACCESS_FLAGS),
				[IB_QPT_RC]  = (IB_QP_PKEY_INDEX	|
						IB_QP_PORT		|
						IB_QP_ACCESS_FLAGS),
				[IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX	|
						IB_QP_PORT		|
						IB_QP_ACCESS_FLAGS),
				[IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX	|
						IB_QP_PORT		|
						IB_QP_ACCESS_FLAGS),
				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX	|
						IB_QP_QKEY),
				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX	|
						IB_QP_QKEY),
			}
		},
		[IB_QPS_RTR]   = {
			.valid = 1,
			.req_param = {
				[IB_QPT_UC]  = (IB_QP_AV		|
						IB_QP_PATH_MTU		|
						IB_QP_DEST_QPN		|
						IB_QP_RQ_PSN),
				[IB_QPT_RC]  = (IB_QP_AV		|
						IB_QP_PATH_MTU		|
						IB_QP_DEST_QPN		|
						IB_QP_RQ_PSN		|
						IB_QP_MAX_DEST_RD_ATOMIC |
						IB_QP_MIN_RNR_TIMER),
				[IB_QPT_XRC_INI] = (IB_QP_AV		|
						IB_QP_PATH_MTU		|
						IB_QP_DEST_QPN		|
						IB_QP_RQ_PSN),
				[IB_QPT_XRC_TGT] = (IB_QP_AV		|
						IB_QP_PATH_MTU		|
						IB_QP_DEST_QPN		|
						IB_QP_RQ_PSN		|
						IB_QP_MAX_DEST_RD_ATOMIC |
						IB_QP_MIN_RNR_TIMER),
			},
			.opt_param = {
				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX	|
						IB_QP_QKEY),
				[IB_QPT_UC]  = (IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_PKEY_INDEX),
				[IB_QPT_RC]  = (IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_PKEY_INDEX),
				[IB_QPT_XRC_INI] = (IB_QP_ALT_PATH	|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_PKEY_INDEX),
				[IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH	|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_PKEY_INDEX),
				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX	|
						IB_QP_QKEY),
				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX	|
						IB_QP_QKEY),
			},
		},
	},
	[IB_QPS_RTR]   = {
		[IB_QPS_RESET] = { .valid = 1 },
		[IB_QPS_ERR] =   { .valid = 1 },
		[IB_QPS_RTS]   = {
			.valid = 1,
			.req_param = {
				[IB_QPT_UD]  = IB_QP_SQ_PSN,
				[IB_QPT_UC]  = IB_QP_SQ_PSN,
				[IB_QPT_RC]  = (IB_QP_TIMEOUT		|
						IB_QP_RETRY_CNT		|
						IB_QP_RNR_RETRY		|
						IB_QP_SQ_PSN		|
						IB_QP_MAX_QP_RD_ATOMIC),
				[IB_QPT_XRC_INI] = (IB_QP_TIMEOUT	|
						IB_QP_RETRY_CNT		|
						IB_QP_RNR_RETRY		|
						IB_QP_SQ_PSN		|
						IB_QP_MAX_QP_RD_ATOMIC),
				[IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT	|
						IB_QP_SQ_PSN),
				[IB_QPT_SMI] = IB_QP_SQ_PSN,
				[IB_QPT_GSI] = IB_QP_SQ_PSN,
			},
			.opt_param = {
				[IB_QPT_UD]  = (IB_QP_CUR_STATE		|
						IB_QP_QKEY),
				[IB_QPT_UC]  = (IB_QP_CUR_STATE		|
						IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_RC]  = (IB_QP_CUR_STATE		|
						IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_MIN_RNR_TIMER	|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_XRC_INI] = (IB_QP_CUR_STATE	|
						IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE	|
						IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_MIN_RNR_TIMER	|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_SMI] = (IB_QP_CUR_STATE		|
						IB_QP_QKEY),
				[IB_QPT_GSI] = (IB_QP_CUR_STATE		|
						IB_QP_QKEY),
				[IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT,
			}
		}
	},
	[IB_QPS_RTS]   = {
		[IB_QPS_RESET] = { .valid = 1 },
		[IB_QPS_ERR] =   { .valid = 1 },
		[IB_QPS_RTS]   = {
			.valid = 1,
			.opt_param = {
				[IB_QPT_UD]  = (IB_QP_CUR_STATE		|
						IB_QP_QKEY),
				[IB_QPT_UC]  = (IB_QP_CUR_STATE		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_ALT_PATH		|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_RC]  = (IB_QP_CUR_STATE		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_ALT_PATH		|
						IB_QP_PATH_MIG_STATE	|
						IB_QP_MIN_RNR_TIMER),
				[IB_QPT_XRC_INI] = (IB_QP_CUR_STATE	|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_ALT_PATH		|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE	|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_ALT_PATH		|
						IB_QP_PATH_MIG_STATE	|
						IB_QP_MIN_RNR_TIMER),
				[IB_QPT_SMI] = (IB_QP_CUR_STATE		|
						IB_QP_QKEY),
				[IB_QPT_GSI] = (IB_QP_CUR_STATE		|
						IB_QP_QKEY),
				[IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT,
			}
		},
		[IB_QPS_SQD]   = {
			.valid = 1,
			.opt_param = {
				[IB_QPT_UD]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
				[IB_QPT_UC]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
				[IB_QPT_RC]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
				[IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
				[IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */
				[IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
				[IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
			}
		},
	},
	[IB_QPS_SQD]   = {
		[IB_QPS_RESET] = { .valid = 1 },
		[IB_QPS_ERR] =   { .valid = 1 },
		[IB_QPS_RTS]   = {
			.valid = 1,
			.opt_param = {
				[IB_QPT_UD]  = (IB_QP_CUR_STATE		|
						IB_QP_QKEY),
				[IB_QPT_UC]  = (IB_QP_CUR_STATE		|
						IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_RC]  = (IB_QP_CUR_STATE		|
						IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_MIN_RNR_TIMER	|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_XRC_INI] = (IB_QP_CUR_STATE	|
						IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE	|
						IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_MIN_RNR_TIMER	|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_SMI] = (IB_QP_CUR_STATE		|
						IB_QP_QKEY),
				[IB_QPT_GSI] = (IB_QP_CUR_STATE		|
						IB_QP_QKEY),
			}
		},
		[IB_QPS_SQD]   = {
			.valid = 1,
			.opt_param = {
				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX	|
						IB_QP_QKEY),
				[IB_QPT_UC]  = (IB_QP_AV		|
						IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_PKEY_INDEX	|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_RC]  = (IB_QP_PORT		|
						IB_QP_AV		|
						IB_QP_TIMEOUT		|
						IB_QP_RETRY_CNT		|
						IB_QP_RNR_RETRY		|
						IB_QP_MAX_QP_RD_ATOMIC	|
						IB_QP_MAX_DEST_RD_ATOMIC |
						IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_PKEY_INDEX	|
						IB_QP_MIN_RNR_TIMER	|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_XRC_INI] = (IB_QP_PORT		|
						IB_QP_AV		|
						IB_QP_TIMEOUT		|
						IB_QP_RETRY_CNT		|
						IB_QP_RNR_RETRY		|
						IB_QP_MAX_QP_RD_ATOMIC	|
						IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_PKEY_INDEX	|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_XRC_TGT] = (IB_QP_PORT		|
						IB_QP_AV		|
						IB_QP_TIMEOUT		|
						IB_QP_MAX_DEST_RD_ATOMIC |
						IB_QP_ALT_PATH		|
						IB_QP_ACCESS_FLAGS	|
						IB_QP_PKEY_INDEX	|
						IB_QP_MIN_RNR_TIMER	|
						IB_QP_PATH_MIG_STATE),
				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX	|
						IB_QP_QKEY),
				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX	|
						IB_QP_QKEY),
			}
		}
	},
	[IB_QPS_SQE]   = {
		[IB_QPS_RESET] = { .valid = 1 },
		[IB_QPS_ERR] =   { .valid = 1 },
		[IB_QPS_RTS]   = {
			.valid = 1,
			.opt_param = {
				[IB_QPT_UD]  = (IB_QP_CUR_STATE		|
						IB_QP_QKEY),
				[IB_QPT_UC]  = (IB_QP_CUR_STATE		|
						IB_QP_ACCESS_FLAGS),
				[IB_QPT_SMI] = (IB_QP_CUR_STATE		|
						IB_QP_QKEY),
				[IB_QPT_GSI] = (IB_QP_CUR_STATE		|
						IB_QP_QKEY),
			}
		}
	},
	[IB_QPS_ERR] = {
		[IB_QPS_RESET] = { .valid = 1 },
		[IB_QPS_ERR] =   { .valid = 1 }
	}
};
bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
			enum ib_qp_type type, enum ib_qp_attr_mask mask)
{
	enum ib_qp_attr_mask req_param, opt_param;

	if (mask & IB_QP_CUR_STATE  &&
	    cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
	    cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
		return false;

	if (!qp_state_table[cur_state][next_state].valid)
		return false;

	req_param = qp_state_table[cur_state][next_state].req_param[type];
	opt_param = qp_state_table[cur_state][next_state].opt_param[type];

	if ((mask & req_param) != req_param)
		return false;

	if (mask & ~(req_param | opt_param | IB_QP_STATE))
		return false;

	return true;
}
EXPORT_SYMBOL(ib_modify_qp_is_ok);
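/*
 * Illustrative sketch (not part of this file): a provider validating a
 * user-supplied INIT->RTR transition for an RC QP might call the helper
 * roughly like this before programming hardware. The mask shown is only
 * an example.
 *
 *	if (!ib_modify_qp_is_ok(IB_QPS_INIT, IB_QPS_RTR, IB_QPT_RC,
 *				IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
 *				IB_QP_DEST_QPN | IB_QP_RQ_PSN |
 *				IB_QP_MAX_DEST_RD_ATOMIC |
 *				IB_QP_MIN_RNR_TIMER))
 *		return -EINVAL;
 */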
/**
 * ib_resolve_eth_dmac - Resolve destination mac address
 * @device:		Device to consider
 * @ah_attr:		address handle attribute which describes the
 *			source and destination parameters
 * ib_resolve_eth_dmac() resolves destination mac address and L3 hop limit.
 * It returns 0 on success or appropriate error code. It initializes the
 * necessary ah_attr fields when call is successful.
 */
static int ib_resolve_eth_dmac(struct ib_device *device,
			       struct rdma_ah_attr *ah_attr)
{
	int ret = 0;

	if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) {
		if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) {
			__be32 addr = 0;

			memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4);
			ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac);
		} else {
			ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw,
					(char *)ah_attr->roce.dmac);
		}
	} else {
		ret = ib_resolve_unicast_gid_dmac(device, ah_attr);
	}
	return ret;
}

static bool is_qp_type_connected(const struct ib_qp *qp)
{
	return (qp->qp_type == IB_QPT_UC ||
		qp->qp_type == IB_QPT_RC ||
		qp->qp_type == IB_QPT_XRC_INI ||
		qp->qp_type == IB_QPT_XRC_TGT);
}

/*
 * IB core internal function to perform QP attributes modification.
 */
static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
			 int attr_mask, struct ib_udata *udata)
{
	u32 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
	const struct ib_gid_attr *old_sgid_attr_av;
	const struct ib_gid_attr *old_sgid_attr_alt_av;
	int ret;

	attr->xmit_slave = NULL;
	if (attr_mask & IB_QP_AV) {
		ret = rdma_fill_sgid_attr(qp->device, &attr->ah_attr,
					  &old_sgid_attr_av);
		if (ret)
			return ret;

		if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE &&
		    is_qp_type_connected(qp)) {
			struct net_device *slave;

			/*
			 * If the user provided the qp_attr then we have to
			 * resolve it. Kernel users have to provide already
			 * resolved rdma_ah_attr's.
			 */
			if (udata) {
				ret = ib_resolve_eth_dmac(qp->device,
							  &attr->ah_attr);
				if (ret)
					goto out_av;
			}
			slave = rdma_lag_get_ah_roce_slave(qp->device,
							   &attr->ah_attr,
							   GFP_KERNEL);
			if (IS_ERR(slave)) {
				ret = PTR_ERR(slave);
				goto out_av;
			}
			attr->xmit_slave = slave;
		}
	}
	if (attr_mask & IB_QP_ALT_PATH) {
		/*
		 * FIXME: This does not track the migration state, so if the
		 * user loads a new alternate path after the HW has migrated
		 * from primary->alternate we will keep the wrong
		 * references. This is OK for IB because the reference
		 * counting does not serve any functional purpose.
		 */
		ret = rdma_fill_sgid_attr(qp->device, &attr->alt_ah_attr,
					  &old_sgid_attr_alt_av);
		if (ret)
			goto out_av;

		/*
		 * Today the core code can only handle alternate paths and APM
		 * for IB. Ban them in roce mode.
		 */
		if (!(rdma_protocol_ib(qp->device,
				       attr->alt_ah_attr.port_num) &&
		      rdma_protocol_ib(qp->device, port))) {
			ret = -EINVAL;
			goto out;
		}
	}

	if (rdma_ib_or_roce(qp->device, port)) {
		if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) {
			dev_warn(&qp->device->dev,
				 "%s rq_psn overflow, masking to 24 bits\n",
				 __func__);
			attr->rq_psn &= 0xffffff;
		}

		if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) {
			dev_warn(&qp->device->dev,
				 " %s sq_psn overflow, masking to 24 bits\n",
				 __func__);
			attr->sq_psn &= 0xffffff;
		}
	}

	/*
	 * Bind this qp to a counter automatically based on the rdma counter
	 * rules. This only set in RST2INIT with port specified
	 */
	if (!qp->counter && (attr_mask & IB_QP_PORT) &&
	    ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT))
		rdma_counter_bind_qp_auto(qp, attr->port_num);

	ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
	if (ret)
		goto out;

	if (attr_mask & IB_QP_PORT)
		qp->port = attr->port_num;
	if (attr_mask & IB_QP_AV)
		qp->av_sgid_attr =
			rdma_update_sgid_attr(&attr->ah_attr, qp->av_sgid_attr);
	if (attr_mask & IB_QP_ALT_PATH)
		qp->alt_path_sgid_attr = rdma_update_sgid_attr(
			&attr->alt_ah_attr, qp->alt_path_sgid_attr);

out:
	if (attr_mask & IB_QP_ALT_PATH)
		rdma_unfill_sgid_attr(&attr->alt_ah_attr, old_sgid_attr_alt_av);
out_av:
	if (attr_mask & IB_QP_AV) {
		rdma_lag_put_ah_roce_slave(attr->xmit_slave);
		rdma_unfill_sgid_attr(&attr->ah_attr, old_sgid_attr_av);
	}
	return ret;
}

/**
 * ib_modify_qp_with_udata - Modifies the attributes for the specified QP.
 * @ib_qp: The QP to modify.
 * @attr: On input, specifies the QP attributes to modify.  On output,
 *   the current values of selected QP attributes are returned.
 * @attr_mask: A bit-mask used to specify which attributes of the QP
 *   are being modified.
 * @udata: pointer to user's input output buffer information
 *
 * It returns 0 on success and returns appropriate error code on error.
 */
int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr,
			    int attr_mask, struct ib_udata *udata)
{
	return _ib_modify_qp(ib_qp->real_qp, attr, attr_mask, udata);
}
EXPORT_SYMBOL(ib_modify_qp_with_udata);
static void ib_get_width_and_speed(u32 netdev_speed, u32 lanes,
				   u16 *speed, u8 *width)
{
	if (!lanes) {
		if (netdev_speed <= SPEED_1000) {
			*width = IB_WIDTH_1X;
			*speed = IB_SPEED_SDR;
		} else if (netdev_speed <= SPEED_10000) {
			*width = IB_WIDTH_1X;
			*speed = IB_SPEED_FDR10;
		} else if (netdev_speed <= SPEED_20000) {
			*width = IB_WIDTH_4X;
			*speed = IB_SPEED_DDR;
		} else if (netdev_speed <= SPEED_25000) {
			*width = IB_WIDTH_1X;
			*speed = IB_SPEED_EDR;
		} else if (netdev_speed <= SPEED_40000) {
			*width = IB_WIDTH_4X;
			*speed = IB_SPEED_FDR10;
		} else if (netdev_speed <= SPEED_50000) {
			*width = IB_WIDTH_2X;
			*speed = IB_SPEED_EDR;
		} else if (netdev_speed <= SPEED_100000) {
			*width = IB_WIDTH_4X;
			*speed = IB_SPEED_EDR;
		} else if (netdev_speed <= SPEED_200000) {
			*width = IB_WIDTH_4X;
			*speed = IB_SPEED_HDR;
		} else {
			*width = IB_WIDTH_4X;
			*speed = IB_SPEED_NDR;
		}

		return;
	}

	switch (lanes) {
	case 1:
		*width = IB_WIDTH_1X;
		break;
	case 2:
		*width = IB_WIDTH_2X;
		break;
	case 4:
		*width = IB_WIDTH_4X;
		break;
	case 8:
		*width = IB_WIDTH_8X;
		break;
	case 12:
		*width = IB_WIDTH_12X;
		break;
	default:
		*width = IB_WIDTH_1X;
	}

	switch (netdev_speed / lanes) {
	case SPEED_2500:
		*speed = IB_SPEED_SDR;
		break;
	case SPEED_5000:
		*speed = IB_SPEED_DDR;
		break;
	case SPEED_10000:
		*speed = IB_SPEED_FDR10;
		break;
	case SPEED_14000:
		*speed = IB_SPEED_FDR;
		break;
	case SPEED_25000:
		*speed = IB_SPEED_EDR;
		break;
	case SPEED_50000:
		*speed = IB_SPEED_HDR;
		break;
	case SPEED_100000:
		*speed = IB_SPEED_NDR;
		break;
	default:
		*speed = IB_SPEED_SDR;
	}
}

int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width)
{
	int rc;
	u32 netdev_speed;
	struct net_device *netdev;
	struct ethtool_link_ksettings lksettings = {};

	if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET)
		return -EINVAL;

	netdev = ib_device_get_netdev(dev, port_num);
	if (!netdev)
		return -ENODEV;

	rtnl_lock();
	rc = __ethtool_get_link_ksettings(netdev, &lksettings);
	rtnl_unlock();

	dev_put(netdev);

	if (!rc && lksettings.base.speed != (u32)SPEED_UNKNOWN) {
		netdev_speed = lksettings.base.speed;
	} else {
		netdev_speed = SPEED_1000;
		pr_warn("%s speed is unknown, defaulting to %u\n",
			netdev->name, netdev_speed);
	}

	ib_get_width_and_speed(netdev_speed, lksettings.lanes,
			       speed, width);

	return 0;
}
EXPORT_SYMBOL(ib_get_eth_speed);
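/*
 * Illustrative sketch (not part of this file): a RoCE provider would
 * typically call ib_get_eth_speed() from its query_port() method to report
 * IB-style speed/width derived from the underlying Ethernet link. The
 * function name below is hypothetical.
 *
 *	static int my_query_port(struct ib_device *ibdev, u32 port,
 *				 struct ib_port_attr *attr)
 *	{
 *		int err;
 *
 *		err = ib_get_eth_speed(ibdev, port, &attr->active_speed,
 *				       &attr->active_width);
 *		if (err)
 *			return err;
 *		...
 *		return 0;
 *	}
 */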
int ib_modify_qp(struct ib_qp *qp,
		 struct ib_qp_attr *qp_attr,
		 int qp_attr_mask)
{
	return _ib_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL);
}
EXPORT_SYMBOL(ib_modify_qp);

int ib_query_qp(struct ib_qp *qp,
		struct ib_qp_attr *qp_attr,
		int qp_attr_mask,
		struct ib_qp_init_attr *qp_init_attr)
{
	qp_attr->ah_attr.grh.sgid_attr = NULL;
	qp_attr->alt_ah_attr.grh.sgid_attr = NULL;

	return qp->device->ops.query_qp ?
		qp->device->ops.query_qp(qp->real_qp, qp_attr, qp_attr_mask,
					 qp_init_attr) : -EOPNOTSUPP;
}
EXPORT_SYMBOL(ib_query_qp);
int ib_close_qp(struct ib_qp *qp)
{
	struct ib_qp *real_qp;
	unsigned long flags;

	real_qp = qp->real_qp;
	if (real_qp == qp)
		return -EINVAL;

	spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags);
	list_del(&qp->open_list);
	spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags);

	atomic_dec(&real_qp->usecnt);
	if (qp->qp_sec)
		ib_close_shared_qp_security(qp->qp_sec);
	kfree(qp);

	return 0;
}
EXPORT_SYMBOL(ib_close_qp);

static int __ib_destroy_shared_qp(struct ib_qp *qp)
{
	struct ib_xrcd *xrcd;
	struct ib_qp *real_qp;
	int ret;

	real_qp = qp->real_qp;
	xrcd = real_qp->xrcd;
	down_write(&xrcd->tgt_qps_rwsem);
	ib_close_qp(qp);
	if (atomic_read(&real_qp->usecnt) == 0)
		xa_erase(&xrcd->tgt_qps, real_qp->qp_num);
	else
		real_qp = NULL;
	up_write(&xrcd->tgt_qps_rwsem);

	if (real_qp) {
		ret = ib_destroy_qp(real_qp);
		if (!ret)
			atomic_dec(&xrcd->usecnt);
	}

	return 0;
}

int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
{
	const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr;
	const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr;
	struct ib_qp_security *sec;
	int ret;

	WARN_ON_ONCE(qp->mrs_used > 0);

	if (atomic_read(&qp->usecnt))
		return -EBUSY;

	if (qp->real_qp != qp)
		return __ib_destroy_shared_qp(qp);

	sec = qp->qp_sec;
	if (sec)
		ib_destroy_qp_security_begin(sec);

	if (!qp->uobject)
		rdma_rw_cleanup_mrs(qp);

	rdma_counter_unbind_qp(qp, true);
	ret = qp->device->ops.destroy_qp(qp, udata);
	if (ret) {
		if (sec)
			ib_destroy_qp_security_abort(sec);
		return ret;
	}

	if (alt_path_sgid_attr)
		rdma_put_gid_attr(alt_path_sgid_attr);
	if (av_sgid_attr)
		rdma_put_gid_attr(av_sgid_attr);

	ib_qp_usecnt_dec(qp);
	if (sec)
		ib_destroy_qp_security_end(sec);

	rdma_restrack_del(&qp->res);
	kfree(qp);
	return ret;
}
EXPORT_SYMBOL(ib_destroy_qp_user);
/* Completion queues */

struct ib_cq *__ib_create_cq(struct ib_device *device,
			     ib_comp_handler comp_handler,
			     void (*event_handler)(struct ib_event *, void *),
			     void *cq_context,
			     const struct ib_cq_init_attr *cq_attr,
			     const char *caller)
{
	struct ib_cq *cq;
	int ret;

	cq = rdma_zalloc_drv_obj(device, ib_cq);
	if (!cq)
		return ERR_PTR(-ENOMEM);

	cq->device = device;
	cq->uobject = NULL;
	cq->comp_handler = comp_handler;
	cq->event_handler = event_handler;
	cq->cq_context = cq_context;
	atomic_set(&cq->usecnt, 0);

	rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
	rdma_restrack_set_name(&cq->res, caller);

	ret = device->ops.create_cq(cq, cq_attr, NULL);
	if (ret) {
		rdma_restrack_put(&cq->res);
		kfree(cq);
		return ERR_PTR(ret);
	}

	rdma_restrack_add(&cq->res);
	return cq;
}
EXPORT_SYMBOL(__ib_create_cq);

int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period)
{
	if (cq->shared)
		return -EOPNOTSUPP;

	return cq->device->ops.modify_cq ?
		cq->device->ops.modify_cq(cq, cq_count,
					  cq_period) : -EOPNOTSUPP;
}
EXPORT_SYMBOL(rdma_set_cq_moderation);

int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata)
{
	int ret;

	if (WARN_ON_ONCE(cq->shared))
		return -EOPNOTSUPP;

	if (atomic_read(&cq->usecnt))
		return -EBUSY;

	ret = cq->device->ops.destroy_cq(cq, udata);
	if (ret)
		return ret;

	rdma_restrack_del(&cq->res);
	kfree(cq);
	return ret;
}
EXPORT_SYMBOL(ib_destroy_cq_user);

int ib_resize_cq(struct ib_cq *cq, int cqe)
{
	if (cq->shared)
		return -EOPNOTSUPP;

	return cq->device->ops.resize_cq ?
		cq->device->ops.resize_cq(cq, cqe, NULL) : -EOPNOTSUPP;
}
EXPORT_SYMBOL(ib_resize_cq);
/* Memory regions */

struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
			     u64 virt_addr, int access_flags)
{
	struct ib_mr *mr;

	if (access_flags & IB_ACCESS_ON_DEMAND) {
		if (!(pd->device->attrs.kernel_cap_flags &
		      IBK_ON_DEMAND_PAGING)) {
			pr_debug("ODP support not available\n");
			return ERR_PTR(-EINVAL);
		}
	}

	mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr,
					 access_flags, NULL);

	if (IS_ERR(mr))
		return mr;

	mr->device = pd->device;
	mr->type = IB_MR_TYPE_USER;
	mr->pd = pd;
	mr->dm = NULL;
	atomic_inc(&pd->usecnt);
	mr->iova = virt_addr;
	mr->length = length;

	rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
	rdma_restrack_parent_name(&mr->res, &pd->res);
	rdma_restrack_add(&mr->res);

	return mr;
}
EXPORT_SYMBOL(ib_reg_user_mr);

int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
		 u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
	if (!pd->device->ops.advise_mr)
		return -EOPNOTSUPP;

	if (!num_sge)
		return 0;

	return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge,
					 NULL);
}
EXPORT_SYMBOL(ib_advise_mr);

int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
{
	struct ib_pd *pd = mr->pd;
	struct ib_dm *dm = mr->dm;
	struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
	int ret;

	trace_mr_dereg(mr);
	rdma_restrack_del(&mr->res);
	ret = mr->device->ops.dereg_mr(mr, udata);
	if (!ret) {
		atomic_dec(&pd->usecnt);
		if (dm)
			atomic_dec(&dm->usecnt);
		kfree(sig_attrs);
	}

	return ret;
}
EXPORT_SYMBOL(ib_dereg_mr_user);

/**
 * ib_alloc_mr() - Allocates a memory region
 * @pd: protection domain associated with the region
 * @mr_type: memory region type
 * @max_num_sg: maximum sg entries available for registration.
 *
 * Notes:
 * Memory registration page/sg lists must not exceed max_num_sg.
 * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
 * max_num_sg * used_page_size.
 *
 */
struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			  u32 max_num_sg)
{
	struct ib_mr *mr;

	if (!pd->device->ops.alloc_mr) {
		mr = ERR_PTR(-EOPNOTSUPP);
		goto out;
	}

	if (mr_type == IB_MR_TYPE_INTEGRITY) {
		WARN_ON_ONCE(1);
		mr = ERR_PTR(-EINVAL);
		goto out;
	}

	mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg);
	if (IS_ERR(mr))
		goto out;

	mr->device = pd->device;
	mr->pd = pd;
	mr->dm = NULL;
	mr->uobject = NULL;
	atomic_inc(&pd->usecnt);
	mr->need_inval = false;
	mr->type = mr_type;
	mr->sig_attrs = NULL;

	rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
	rdma_restrack_parent_name(&mr->res, &pd->res);
	rdma_restrack_add(&mr->res);
out:
	trace_mr_alloc(pd, mr_type, max_num_sg, mr);
	return mr;
}
EXPORT_SYMBOL(ib_alloc_mr);
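
/*
 * Illustrative sketch (not part of this file): allocating a fast-registration
 * MR with room for up to 16 SG entries. With IB_MR_TYPE_MEM_REG the total
 * registered length is bounded by max_num_sg * page_size, as noted above.
 * The "example_" name is hypothetical.
 */
#if 0
static struct ib_mr *example_alloc_frmr(struct ib_pd *pd)
{
	/* 16 pages -> at most 64 KiB with 4 KiB pages. */
	return ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 16);
}
#endif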

/**
 * ib_alloc_mr_integrity() - Allocates an integrity memory region
 * @pd: protection domain associated with the region
 * @max_num_data_sg: maximum data sg entries available for registration
 * @max_num_meta_sg: maximum metadata sg entries available for
 *                   registration
 *
 * Notes:
 * Memory registration page/sg lists must not exceed max_num_sg,
 * also the integrity page/sg lists must not exceed max_num_meta_sg.
 *
 */
struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
				    u32 max_num_data_sg,
				    u32 max_num_meta_sg)
{
	struct ib_mr *mr;
	struct ib_sig_attrs *sig_attrs;

	if (!pd->device->ops.alloc_mr_integrity ||
	    !pd->device->ops.map_mr_sg_pi) {
		mr = ERR_PTR(-EOPNOTSUPP);
		goto out;
	}

	if (!max_num_meta_sg) {
		mr = ERR_PTR(-EINVAL);
		goto out;
	}

	sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL);
	if (!sig_attrs) {
		mr = ERR_PTR(-ENOMEM);
		goto out;
	}

	mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg,
						max_num_meta_sg);
	if (IS_ERR(mr)) {
		kfree(sig_attrs);
		goto out;
	}

	mr->device = pd->device;
	mr->pd = pd;
	mr->dm = NULL;
	mr->uobject = NULL;
	atomic_inc(&pd->usecnt);
	mr->need_inval = false;
	mr->type = IB_MR_TYPE_INTEGRITY;
	mr->sig_attrs = sig_attrs;

	rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
	rdma_restrack_parent_name(&mr->res, &pd->res);
	rdma_restrack_add(&mr->res);
out:
	trace_mr_integ_alloc(pd, max_num_data_sg, max_num_meta_sg, mr);
	return mr;
}
EXPORT_SYMBOL(ib_alloc_mr_integrity);

/* Multicast groups */

static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid)
{
	struct ib_qp_init_attr init_attr = {};
	struct ib_qp_attr attr = {};
	int num_eth_ports = 0;
	unsigned int port;

	/* If QP state >= init, it is assigned to a port and we can check this
	 * port only.
	 */
	if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) {
		if (attr.qp_state >= IB_QPS_INIT) {
			if (rdma_port_get_link_layer(qp->device, attr.port_num) !=
			    IB_LINK_LAYER_INFINIBAND)
				return true;
			goto lid_check;
		}
	}

	/* Can't get a quick answer, iterate over all ports */
	rdma_for_each_port(qp->device, port)
		if (rdma_port_get_link_layer(qp->device, port) !=
		    IB_LINK_LAYER_INFINIBAND)
			num_eth_ports++;

	/* If we have at least one Ethernet port, RoCE annex declares that
	 * multicast LID should be ignored. We can't tell at this step if the
	 * QP belongs to an IB or Ethernet port.
	 */
	if (num_eth_ports)
		return true;

	/* If all the ports are IB, we can check according to IB spec. */
lid_check:
	return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
		 lid == be16_to_cpu(IB_LID_PERMISSIVE));
}

int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
{
	int ret;

	if (!qp->device->ops.attach_mcast)
		return -EOPNOTSUPP;

	if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
	    qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
		return -EINVAL;

	ret = qp->device->ops.attach_mcast(qp, gid, lid);
	if (!ret)
		atomic_inc(&qp->usecnt);
	return ret;
}
EXPORT_SYMBOL(ib_attach_mcast);
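
/*
 * Illustrative sketch (not part of this file): attaching a UD QP to a
 * multicast group. The MGID and MLID would normally come from a multicast
 * join performed through the SA; the wrapper name below is hypothetical.
 */
#if 0
static int example_join_group(struct ib_qp *ud_qp, union ib_gid *mgid,
			      u16 mlid)
{
	/* Fails with -EINVAL unless the GID is multicast and the QP is UD. */
	return ib_attach_mcast(ud_qp, mgid, mlid);
}
#endif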

int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
{
	int ret;

	if (!qp->device->ops.detach_mcast)
		return -EOPNOTSUPP;

	if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
	    qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
		return -EINVAL;

	ret = qp->device->ops.detach_mcast(qp, gid, lid);
	if (!ret)
		atomic_dec(&qp->usecnt);
	return ret;
}
EXPORT_SYMBOL(ib_detach_mcast);

/**
 * ib_alloc_xrcd_user - Allocates an XRC domain.
 * @device: The device on which to allocate the XRC domain.
 * @inode: inode to connect XRCD
 * @udata: Valid user data or NULL for kernel object
 */
struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device,
				   struct inode *inode, struct ib_udata *udata)
{
	struct ib_xrcd *xrcd;
	int ret;

	if (!device->ops.alloc_xrcd)
		return ERR_PTR(-EOPNOTSUPP);

	xrcd = rdma_zalloc_drv_obj(device, ib_xrcd);
	if (!xrcd)
		return ERR_PTR(-ENOMEM);

	xrcd->device = device;
	xrcd->inode = inode;
	atomic_set(&xrcd->usecnt, 0);
	init_rwsem(&xrcd->tgt_qps_rwsem);
	xa_init(&xrcd->tgt_qps);

	ret = device->ops.alloc_xrcd(xrcd, udata);
	if (ret)
		goto err;
	return xrcd;
err:
	kfree(xrcd);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_alloc_xrcd_user);

/**
 * ib_dealloc_xrcd_user - Deallocates an XRC domain.
 * @xrcd: The XRC domain to deallocate.
 * @udata: Valid user data or NULL for kernel object
 */
int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata)
{
	int ret;

	if (atomic_read(&xrcd->usecnt))
		return -EBUSY;

	WARN_ON(!xa_empty(&xrcd->tgt_qps));
	ret = xrcd->device->ops.dealloc_xrcd(xrcd, udata);
	if (ret)
		return ret;
	kfree(xrcd);
	return ret;
}
EXPORT_SYMBOL(ib_dealloc_xrcd_user);

/**
 * ib_create_wq - Creates a WQ associated with the specified protection
 * domain.
 * @pd: The protection domain associated with the WQ.
 * @wq_attr: A list of initial attributes required to create the
 * WQ. If WQ creation succeeds, then the attributes are updated to
 * the actual capabilities of the created WQ.
 *
 * wq_attr->max_wr and wq_attr->max_sge determine
 * the requested size of the WQ, and set to the actual values allocated
 * on return.
 * If ib_create_wq() succeeds, then max_wr and max_sge will always be
 * at least as large as the requested values.
 */
struct ib_wq *ib_create_wq(struct ib_pd *pd,
			   struct ib_wq_init_attr *wq_attr)
{
	struct ib_wq *wq;

	if (!pd->device->ops.create_wq)
		return ERR_PTR(-EOPNOTSUPP);

	wq = pd->device->ops.create_wq(pd, wq_attr, NULL);
	if (!IS_ERR(wq)) {
		wq->event_handler = wq_attr->event_handler;
		wq->wq_context = wq_attr->wq_context;
		wq->wq_type = wq_attr->wq_type;
		wq->cq = wq_attr->cq;
		wq->device = pd->device;
		wq->pd = pd;
		wq->uobject = NULL;
		atomic_inc(&pd->usecnt);
		atomic_inc(&wq_attr->cq->usecnt);
		atomic_set(&wq->usecnt, 0);
	}
	return wq;
}
EXPORT_SYMBOL(ib_create_wq);

/**
 * ib_destroy_wq_user - Destroys the specified user WQ.
 * @wq: The WQ to destroy.
 * @udata: Valid user data
 */
int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata)
{
	struct ib_cq *cq = wq->cq;
	struct ib_pd *pd = wq->pd;
	int ret;

	if (atomic_read(&wq->usecnt))
		return -EBUSY;

	ret = wq->device->ops.destroy_wq(wq, udata);
	if (ret)
		return ret;

	atomic_dec(&pd->usecnt);
	atomic_dec(&cq->usecnt);
	return ret;
}
EXPORT_SYMBOL(ib_destroy_wq_user);

int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
		       struct ib_mr_status *mr_status)
{
	if (!mr->device->ops.check_mr_status)
		return -EOPNOTSUPP;

	return mr->device->ops.check_mr_status(mr, check_mask, mr_status);
}
EXPORT_SYMBOL(ib_check_mr_status);
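
/*
 * Illustrative sketch (not part of this file): after an I/O that used an
 * integrity MR completes, a consumer can ask whether a protection
 * information error was detected. The "example_" name is hypothetical and
 * the exact handling of the reported error is up to the caller.
 */
#if 0
static int example_check_sig(struct ib_mr *mr)
{
	struct ib_mr_status mr_status;
	int ret;

	ret = ib_check_mr_status(mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
	if (ret)
		return ret;

	if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS)
		pr_warn("PI error at offset %llu\n",
			mr_status.sig_err.sig_err_offset);
	return 0;
}
#endif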

int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port,
			 int state)
{
	if (!device->ops.set_vf_link_state)
		return -EOPNOTSUPP;

	return device->ops.set_vf_link_state(device, vf, port, state);
}
EXPORT_SYMBOL(ib_set_vf_link_state);

int ib_get_vf_config(struct ib_device *device, int vf, u32 port,
		     struct ifla_vf_info *info)
{
	if (!device->ops.get_vf_config)
		return -EOPNOTSUPP;

	return device->ops.get_vf_config(device, vf, port, info);
}
EXPORT_SYMBOL(ib_get_vf_config);

int ib_get_vf_stats(struct ib_device *device, int vf, u32 port,
		    struct ifla_vf_stats *stats)
{
	if (!device->ops.get_vf_stats)
		return -EOPNOTSUPP;

	return device->ops.get_vf_stats(device, vf, port, stats);
}
EXPORT_SYMBOL(ib_get_vf_stats);

int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid,
		   int type)
{
	if (!device->ops.set_vf_guid)
		return -EOPNOTSUPP;

	return device->ops.set_vf_guid(device, vf, port, guid, type);
}
EXPORT_SYMBOL(ib_set_vf_guid);

int ib_get_vf_guid(struct ib_device *device, int vf, u32 port,
		   struct ifla_vf_guid *node_guid,
		   struct ifla_vf_guid *port_guid)
{
	if (!device->ops.get_vf_guid)
		return -EOPNOTSUPP;

	return device->ops.get_vf_guid(device, vf, port, node_guid, port_guid);
}
EXPORT_SYMBOL(ib_get_vf_guid);

/**
 * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection
 *     information) and set an appropriate memory region for registration.
 * @mr: memory region
 * @data_sg: dma mapped scatterlist for data
 * @data_sg_nents: number of entries in data_sg
 * @data_sg_offset: offset in bytes into data_sg
 * @meta_sg: dma mapped scatterlist for metadata
 * @meta_sg_nents: number of entries in meta_sg
 * @meta_sg_offset: offset in bytes into meta_sg
 * @page_size: page vector desired page size
 *
 * Constraints:
 * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY.
 *
 * Return: 0 on success.
 *
 * After this completes successfully, the memory region
 * is ready for registration.
 */
int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg,
		    int data_sg_nents, unsigned int *data_sg_offset,
		    struct scatterlist *meta_sg, int meta_sg_nents,
		    unsigned int *meta_sg_offset, unsigned int page_size)
{
	if (unlikely(!mr->device->ops.map_mr_sg_pi ||
		     WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY)))
		return -EOPNOTSUPP;

	mr->page_size = page_size;

	return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents,
					    data_sg_offset, meta_sg,
					    meta_sg_nents, meta_sg_offset);
}
EXPORT_SYMBOL(ib_map_mr_sg_pi);
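
/*
 * Illustrative sketch (not part of this file): mapping separate data and
 * metadata scatterlists onto an integrity MR before posting an
 * IB_WR_REG_MR_INTEGRITY work request. Both lists are assumed to already be
 * DMA mapped; the "example_" name is hypothetical.
 */
#if 0
static int example_map_pi(struct ib_mr *mr,
			  struct scatterlist *data_sg, int data_nents,
			  struct scatterlist *meta_sg, int meta_nents)
{
	int ret;

	ret = ib_map_mr_sg_pi(mr, data_sg, data_nents, NULL,
			      meta_sg, meta_nents, NULL, PAGE_SIZE);
	/* A negative value indicates the mapping failed. */
	return ret < 0 ? ret : 0;
}
#endif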

/**
 * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list
 *     and set it as the memory region.
 * @mr: memory region
 * @sg: dma mapped scatterlist
 * @sg_nents: number of entries in sg
 * @sg_offset: offset in bytes into sg
 * @page_size: page vector desired page size
 *
 * Constraints:
 *
 * - The first sg element is allowed to have an offset.
 * - Each sg element must either be aligned to page_size or virtually
 *   contiguous to the previous element. In case an sg element has a
 *   non-contiguous offset, the mapping prefix will not include it.
 * - The last sg element is allowed to have length less than page_size.
 * - If sg_nents total byte length exceeds the mr max_num_sge * page_size
 *   then only max_num_sg entries will be mapped.
 * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these
 *   constraints holds and the page_size argument is ignored.
 *
 * Returns the number of sg elements that were mapped to the memory region.
 *
 * After this completes successfully, the memory region
 * is ready for registration.
 */
int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
		 unsigned int *sg_offset, unsigned int page_size)
{
	if (unlikely(!mr->device->ops.map_mr_sg))
		return -EOPNOTSUPP;

	mr->page_size = page_size;

	return mr->device->ops.map_mr_sg(mr, sg, sg_nents, sg_offset);
}
EXPORT_SYMBOL(ib_map_mr_sg);
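
/*
 * Illustrative sketch (not part of this file): the usual fast-registration
 * flow built on ib_map_mr_sg(). The scatterlist is assumed to be DMA mapped
 * already; the "example_" names and the caller-supplied CQE are hypothetical.
 */
#if 0
static int example_fast_reg(struct ib_qp *qp, struct ib_mr *mr,
			    struct scatterlist *sg, int sg_nents,
			    struct ib_cqe *reg_cqe)
{
	struct ib_reg_wr reg_wr;
	int n;

	n = ib_map_mr_sg(mr, sg, sg_nents, NULL, PAGE_SIZE);
	if (n < sg_nents)
		return n < 0 ? n : -EINVAL;

	memset(&reg_wr, 0, sizeof(reg_wr));
	reg_wr.wr.opcode = IB_WR_REG_MR;
	reg_wr.wr.wr_cqe = reg_cqe;
	reg_wr.wr.send_flags = IB_SEND_SIGNALED;
	reg_wr.mr = mr;
	reg_wr.key = mr->rkey;
	reg_wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;

	/* The MR is usable once the IB_WR_REG_MR completion is reaped. */
	return ib_post_send(qp, &reg_wr.wr, NULL);
}
#endif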

/**
 * ib_sg_to_pages() - Convert the largest prefix of a sg list
 *     to a page vector
 * @mr: memory region
 * @sgl: dma mapped scatterlist
 * @sg_nents: number of entries in sg
 * @sg_offset_p: ==== =======================================================
 *               IN   start offset in bytes into sg
 *               OUT  offset in bytes for element n of the sg of the first
 *                    byte that has not been processed where n is the return
 *                    value of this function.
 *               ==== =======================================================
 * @set_page: driver page assignment function pointer
 *
 * Core service helper for drivers to convert the largest
 * prefix of given sg list to a page vector. The sg list
 * prefix converted is the prefix that meets the requirements
 * of ib_map_mr_sg.
 *
 * Returns the number of sg elements that were assigned to
 * a page vector.
 */
int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents,
		unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64))
{
	struct scatterlist *sg;
	u64 last_end_dma_addr = 0;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	unsigned int last_page_off = 0;
	u64 page_mask = ~((u64)mr->page_size - 1);
	int i, ret;

	if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0])))
		return -EINVAL;

	mr->iova = sg_dma_address(&sgl[0]) + sg_offset;
	mr->length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		u64 dma_addr = sg_dma_address(sg) + sg_offset;
		u64 prev_addr = dma_addr;
		unsigned int dma_len = sg_dma_len(sg) - sg_offset;
		u64 end_dma_addr = dma_addr + dma_len;
		u64 page_addr = dma_addr & page_mask;

		/*
		 * For the second and later elements, check whether either the
		 * end of element i-1 or the start of element i is not aligned
		 * on a page boundary.
		 */
		if (i && (last_page_off != 0 || page_addr != dma_addr)) {
			/* Stop mapping if there is a gap. */
			if (last_end_dma_addr != dma_addr)
				break;

			/*
			 * Coalesce this element with the last. If it is small
			 * enough just update mr->length. Otherwise start
			 * mapping from the next page.
			 */
			goto next_page;
		}

		do {
			ret = set_page(mr, page_addr);
			if (unlikely(ret < 0)) {
				sg_offset = prev_addr - sg_dma_address(sg);
				mr->length += prev_addr - dma_addr;
				if (sg_offset_p)
					*sg_offset_p = sg_offset;
				return i || sg_offset ? i : ret;
			}
			prev_addr = page_addr;
next_page:
			page_addr += mr->page_size;
		} while (page_addr < end_dma_addr);

		mr->length += dma_len;
		last_end_dma_addr = end_dma_addr;
		last_page_off = end_dma_addr & ~page_mask;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = 0;
	return i;
}
EXPORT_SYMBOL(ib_sg_to_pages);
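
/*
 * Illustrative sketch (not part of this file): how a driver's map_mr_sg
 * implementation typically uses ib_sg_to_pages(). "example_mr" and its
 * fields are hypothetical driver-private state, not a real driver.
 */
#if 0
struct example_mr {
	struct ib_mr ibmr;
	u64 *pages;		/* page list handed to the HCA */
	u32 npages;
	u32 max_pages;
};

static int example_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct example_mr *mr = container_of(ibmr, struct example_mr, ibmr);

	if (unlikely(mr->npages == mr->max_pages))
		return -ENOMEM;

	mr->pages[mr->npages++] = addr;
	return 0;
}

static int example_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
			     int sg_nents, unsigned int *sg_offset)
{
	struct example_mr *mr = container_of(ibmr, struct example_mr, ibmr);

	mr->npages = 0;
	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, example_set_page);
}
#endif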

struct ib_drain_cqe {
	struct ib_cqe cqe;
	struct completion done;
};

static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
						cqe);

	complete(&cqe->done);
}

/*
 * Post a WR and block until its completion is reaped for the SQ.
 */
static void __ib_drain_sq(struct ib_qp *qp)
{
	struct ib_cq *cq = qp->send_cq;
	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
	struct ib_drain_cqe sdrain;
	struct ib_rdma_wr swr = {
		.wr = {
			.next = NULL,
			{ .wr_cqe = &sdrain.cqe, },
			.opcode = IB_WR_RDMA_WRITE,
		},
	};
	int ret;

	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
	if (ret) {
		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
		return;
	}

	sdrain.cqe.done = ib_drain_qp_done;
	init_completion(&sdrain.done);

	ret = ib_post_send(qp, &swr.wr, NULL);
	if (ret) {
		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
		return;
	}

	if (cq->poll_ctx == IB_POLL_DIRECT)
		while (wait_for_completion_timeout(&sdrain.done, HZ / 10) <= 0)
			ib_process_cq_direct(cq, -1);
	else
		wait_for_completion(&sdrain.done);
}

/*
 * Post a WR and block until its completion is reaped for the RQ.
 */
static void __ib_drain_rq(struct ib_qp *qp)
{
	struct ib_cq *cq = qp->recv_cq;
	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
	struct ib_drain_cqe rdrain;
	struct ib_recv_wr rwr = {};
	int ret;

	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
	if (ret) {
		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
		return;
	}

	rwr.wr_cqe = &rdrain.cqe;
	rdrain.cqe.done = ib_drain_qp_done;
	init_completion(&rdrain.done);

	ret = ib_post_recv(qp, &rwr, NULL);
	if (ret) {
		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
		return;
	}

	if (cq->poll_ctx == IB_POLL_DIRECT)
		while (wait_for_completion_timeout(&rdrain.done, HZ / 10) <= 0)
			ib_process_cq_direct(cq, -1);
	else
		wait_for_completion(&rdrain.done);
}

/*
 * __ib_drain_srq() - Block until Last WQE Reached event arrives, or timeout
 *                    expires.
 * @qp: queue pair associated with SRQ to drain
 *
 * Quoting 10.3.1 Queue Pair and EE Context States:
 *
 * Note, for QPs that are associated with an SRQ, the Consumer should take the
 * QP through the Error State before invoking a Destroy QP or a Modify QP to the
 * Reset State. The Consumer may invoke the Destroy QP without first performing
 * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
 * Last WQE Reached Event. However, if the Consumer does not wait for the
 * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
 * leakage may occur. Therefore, it is good programming practice to tear down a
 * QP that is associated with an SRQ by using the following process:
 *
 * - Put the QP in the Error State
 * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
 * - either:
 *       drain the CQ by invoking the Poll CQ verb and either wait for CQ
 *       to be empty or the number of Poll CQ operations has exceeded
 *       CQ capacity size;
 * - or
 *       post another WR that completes on the same CQ and wait for this
 *       WR to return as a WC;
 * - and then invoke a Destroy QP or Reset QP.
 *
 * We use the first option.
 */
static void __ib_drain_srq(struct ib_qp *qp)
{
	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
	struct ib_cq *cq;
	int n, polled = 0;
	int ret;

	if (!qp->srq) {
		WARN_ONCE(1, "QP 0x%p is not associated with SRQ\n", qp);
		return;
	}

	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
	if (ret) {
		WARN_ONCE(ret, "failed to drain shared recv queue: %d\n", ret);
		return;
	}

	if (ib_srq_has_cq(qp->srq->srq_type)) {
		cq = qp->srq->ext.cq;
	} else if (qp->recv_cq) {
		cq = qp->recv_cq;
	} else {
		WARN_ONCE(1, "QP 0x%p has no CQ associated with SRQ\n", qp);
		return;
	}

	if (wait_for_completion_timeout(&qp->srq_completion, 60 * HZ) > 0) {
		while (polled != cq->cqe) {
			n = ib_process_cq_direct(cq, cq->cqe - polled);
			if (!n)
				return;
			polled += n;
		}
	}
}

/**
 * ib_drain_sq() - Block until all SQ CQEs have been consumed by the
 *		   application.
 * @qp: queue pair to drain
 *
 * If the device has a provider-specific drain function, then
 * call that. Otherwise call the generic drain function
 * __ib_drain_sq().
 *
 * The caller must:
 *
 * ensure there is room in the CQ and SQ for the drain work request and
 * completion.
 *
 * allocate the CQ using ib_alloc_cq().
 *
 * ensure that there are no other contexts that are posting WRs concurrently.
 * Otherwise the drain is not guaranteed.
 */
void ib_drain_sq(struct ib_qp *qp)
{
	if (qp->device->ops.drain_sq)
		qp->device->ops.drain_sq(qp);
	else
		__ib_drain_sq(qp);
	trace_cq_drain_complete(qp->send_cq);
}
EXPORT_SYMBOL(ib_drain_sq);

/**
 * ib_drain_rq() - Block until all RQ CQEs have been consumed by the
 *		   application.
 * @qp: queue pair to drain
 *
 * If the device has a provider-specific drain function, then
 * call that. Otherwise call the generic drain function
 * __ib_drain_rq().
 *
 * The caller must:
 *
 * ensure there is room in the CQ and RQ for the drain work request and
 * completion.
 *
 * allocate the CQ using ib_alloc_cq().
 *
 * ensure that there are no other contexts that are posting WRs concurrently.
 * Otherwise the drain is not guaranteed.
 */
void ib_drain_rq(struct ib_qp *qp)
{
	if (qp->device->ops.drain_rq)
		qp->device->ops.drain_rq(qp);
	else
		__ib_drain_rq(qp);
	trace_cq_drain_complete(qp->recv_cq);
}
EXPORT_SYMBOL(ib_drain_rq);

/**
 * ib_drain_qp() - Block until all CQEs have been consumed by the
 *		   application on both the RQ and SQ.
 * @qp: queue pair to drain
 *
 * The caller must:
 *
 * ensure there is room in the CQ(s), SQ, and RQ for drain work requests
 * and completions.
 *
 * allocate the CQs using ib_alloc_cq().
 *
 * ensure that there are no other contexts that are posting WRs concurrently.
 * Otherwise the drain is not guaranteed.
 */
void ib_drain_qp(struct ib_qp *qp)
{
	ib_drain_sq(qp);
	if (!qp->srq)
		ib_drain_rq(qp);
	else
		__ib_drain_srq(qp);
}
EXPORT_SYMBOL(ib_drain_qp);
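
/*
 * Illustrative sketch (not part of this file): the usual teardown order for a
 * connected QP. Draining before destroying ensures all posted WRs have been
 * flushed and their CQEs consumed, so no completion handler touches freed
 * memory. The "example_" name is hypothetical.
 */
#if 0
static void example_teardown(struct ib_qp *qp, struct ib_cq *cq,
			     struct ib_pd *pd)
{
	ib_drain_qp(qp);	/* moves the QP to error and reaps flush CQEs */
	ib_destroy_qp(qp);
	ib_free_cq(cq);		/* CQ must have come from ib_alloc_cq() */
	ib_dealloc_pd(pd);
}
#endif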

struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num,
				     enum rdma_netdev_t type, const char *name,
				     unsigned char name_assign_type,
				     void (*setup)(struct net_device *))
{
	struct rdma_netdev_alloc_params params;
	struct net_device *netdev;
	int rc;

	if (!device->ops.rdma_netdev_get_params)
		return ERR_PTR(-EOPNOTSUPP);

	rc = device->ops.rdma_netdev_get_params(device, port_num, type,
						&params);
	if (rc)
		return ERR_PTR(rc);

	netdev = alloc_netdev_mqs(params.sizeof_priv, name, name_assign_type,
				  setup, params.txqs, params.rxqs);
	if (!netdev)
		return ERR_PTR(-ENOMEM);

	return netdev;
}
EXPORT_SYMBOL(rdma_alloc_netdev);

int rdma_init_netdev(struct ib_device *device, u32 port_num,
		     enum rdma_netdev_t type, const char *name,
		     unsigned char name_assign_type,
		     void (*setup)(struct net_device *),
		     struct net_device *netdev)
{
	struct rdma_netdev_alloc_params params;
	int rc;

	if (!device->ops.rdma_netdev_get_params)
		return -EOPNOTSUPP;

	rc = device->ops.rdma_netdev_get_params(device, port_num, type,
						&params);
	if (rc)
		return rc;

	return params.initialize_rdma_netdev(device, port_num,
					     netdev, params.param);
}
EXPORT_SYMBOL(rdma_init_netdev);

void __rdma_block_iter_start(struct ib_block_iter *biter,
			     struct scatterlist *sglist, unsigned int nents,
			     unsigned long pgsz)
{
	memset(biter, 0, sizeof(struct ib_block_iter));
	biter->__sg = sglist;
	biter->__sg_nents = nents;

	/* Driver provides best block size to use */
	biter->__pg_bit = __fls(pgsz);
}
EXPORT_SYMBOL(__rdma_block_iter_start);

bool __rdma_block_iter_next(struct ib_block_iter *biter)
{
	unsigned int block_offset;
	unsigned int sg_delta;

	if (!biter->__sg_nents || !biter->__sg)
		return false;

	biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
	block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
	sg_delta = BIT_ULL(biter->__pg_bit) - block_offset;

	if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) {
		biter->__sg_advance += sg_delta;
	} else {
		biter->__sg_advance = 0;
		biter->__sg = sg_next(biter->__sg);
		biter->__sg_nents--;
	}

	return true;
}
EXPORT_SYMBOL(__rdma_block_iter_next);
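
/*
 * Illustrative sketch (not part of this file): drivers normally consume these
 * helpers through the rdma_for_each_block() macro to walk a DMA-mapped
 * scatterlist in device-supported block sizes. The "example_" name is
 * hypothetical.
 */
#if 0
static void example_walk_blocks(struct scatterlist *sglist, unsigned int nents,
				unsigned long pgsz)
{
	struct ib_block_iter biter;
	dma_addr_t addr;

	rdma_for_each_block(sglist, &biter, nents, pgsz) {
		/* Aligned DMA address of the current block. */
		addr = rdma_block_iter_dma_address(&biter);
		pr_debug("block at %pad\n", &addr);
	}
}
#endif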

/**
 * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct
 *   for drivers.
 * @descs: array of static descriptors
 * @num_counters: number of elements in array
 * @lifespan: milliseconds between updates
 */
struct rdma_hw_stats *rdma_alloc_hw_stats_struct(
		const struct rdma_stat_desc *descs, int num_counters,
		unsigned long lifespan)
{
	struct rdma_hw_stats *stats;

	stats = kzalloc(struct_size(stats, value, num_counters), GFP_KERNEL);
	if (!stats)
		return NULL;

	stats->is_disabled = kcalloc(BITS_TO_LONGS(num_counters),
				     sizeof(*stats->is_disabled), GFP_KERNEL);
	if (!stats->is_disabled)
		goto err;

	stats->descs = descs;
	stats->num_counters = num_counters;
	stats->lifespan = msecs_to_jiffies(lifespan);
	mutex_init(&stats->lock);

	return stats;

err:
	kfree(stats);
	return NULL;
}
EXPORT_SYMBOL(rdma_alloc_hw_stats_struct);
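
/*
 * Illustrative sketch (not part of this file): a driver's per-port counter
 * allocation built on this helper. The descriptor table and function names
 * are hypothetical; the lifespan bounds how often cached values may be
 * refreshed.
 */
#if 0
static const struct rdma_stat_desc example_port_descs[] = {
	[0] = { .name = "rx_packets" },
	[1] = { .name = "tx_packets" },
};

static struct rdma_hw_stats *example_alloc_port_stats(struct ib_device *ibdev,
						      u32 port_num)
{
	return rdma_alloc_hw_stats_struct(example_port_descs,
					  ARRAY_SIZE(example_port_descs),
					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
}
#endif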

/**
 * rdma_free_hw_stats_struct - Helper function to release rdma_hw_stats
 * @stats: statistics to release
 */
void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats)
{
	if (!stats)
		return;

	kfree(stats->is_disabled);
	kfree(stats);
}
EXPORT_SYMBOL(rdma_free_hw_stats_struct);