/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file ib_recv.c
 * Oracle elects to have and use the contents of ib_recv.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */

/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <sys/types.h>
#include <sys/cpuvar.h>
#include <sys/containerof.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
static struct kmem_cache *rdsv3_ib_incoming_slab;
static atomic_t rdsv3_ib_allocation = ATOMIC_INIT(0);
void
rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic)
{
	struct rdsv3_ib_recv_work *recv;
	struct rdsv3_header *hdrp;
	uint32_t i;

	RDSV3_DPRINTF4("rdsv3_ib_recv_init_ring", "ic: %p", ic);

	hdrp = ic->i_recv_hdrs;
	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr;
	    i++, recv++) {
		/* initialize the hdr sgl permanently */
		recv->r_sge[0].ds_va = (ib_vaddr_t)(uintptr_t)hdrp++;
		recv->r_sge[0].ds_len = sizeof (struct rdsv3_header);
		recv->r_sge[0].ds_key = ic->i_mr->lkey;
	}
}
static void
rdsv3_ib_recv_clear_one(struct rdsv3_ib_connection *ic,
    struct rdsv3_ib_recv_work *recv)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "ic: %p, recv: %p",
	    ic, recv);

	if (recv->r_ibinc) {
		rdsv3_inc_put(&recv->r_ibinc->ii_inc);
		recv->r_ibinc = NULL;
	}

	if (recv->r_frag) {
		kmem_cache_free(ic->rds_ibdev->ib_frag_slab, recv->r_frag);
		recv->r_frag = NULL;
	}

	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "Return: ic: %p, recv: %p",
	    ic, recv);
}
void
rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic)
{
	uint32_t i;

	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_ring", "ic: %p", ic);

	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
		rdsv3_ib_recv_clear_one(ic, &ic->i_recvs[i]);
}
extern int atomic_add_unless(atomic_t *, uint_t, ulong_t);
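
/*
 * Assumed semantics (the bare prototype above does not spell them out):
 * like the Linux helper of the same name, atomic_add_unless(v, a, u) adds
 * 'a' to '*v' unless the counter has already reached 'u', and returns
 * nonzero only if the add was actually performed.  rdsv3_ib_recv_refill_one()
 * below relies on that reading:
 *
 *	if (!atomic_add_unless(&rdsv3_ib_allocation, 1, ic->i_max_recv_alloc))
 *		-> allocation limit reached, skip building this recv WR
 */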
static int
rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	ibt_iov_attr_t iov_attr;
	ibt_iov_t iov_arr[1];

	RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "conn: %p, recv: %p",
	    conn, recv);

	if (!recv->r_ibinc) {
		if (!atomic_add_unless(&rdsv3_ib_allocation, 1,
		    ic->i_max_recv_alloc)) {
			rdsv3_ib_stats_inc(s_ib_rx_alloc_limit);
			goto out;
		}
		recv->r_ibinc = kmem_cache_alloc(rdsv3_ib_incoming_slab,
		    KM_NOSLEEP);
		if (recv->r_ibinc == NULL) {
			atomic_dec_32(&rdsv3_ib_allocation);
			goto out;
		}
		rdsv3_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
		recv->r_ibinc->ii_ibdev = ic->rds_ibdev;
		recv->r_ibinc->ii_pool = ic->rds_ibdev->inc_pool;
	}

	if (!recv->r_frag) {
		recv->r_frag = kmem_cache_alloc(ic->rds_ibdev->ib_frag_slab,
		    KM_NOSLEEP);
		if (recv->r_frag == NULL)
			goto out;
	}

	/* Data sge, structure copy */
	recv->r_sge[1] = recv->r_frag->f_sge;

	RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "Return: conn: %p, recv: %p",
	    conn, recv);

	return (0);
out:
	if (recv->r_ibinc) {
		kmem_cache_free(rdsv3_ib_incoming_slab, recv->r_ibinc);
		atomic_dec_32(&rdsv3_ib_allocation);
		recv->r_ibinc = NULL;
	}
	return (-1);
}
/*
 * This tries to allocate and post unused work requests after making sure that
 * they have all the allocations they need to queue received fragments into
 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
 * pairs don't go unmatched.
 *
 * -1 is returned if posting fails due to temporary resource exhaustion.
 */
int
rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_recv_work *recv;
	unsigned int posted = 0;
	int avail, ret = 0;
	uint_t i, pos;

	RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "conn: %p, prefill: %d",
	    conn, prefill);

	if (prefill || rdsv3_conn_up(conn)) {
		uint_t w_nr = ic->i_recv_ring.w_nr;

		avail = rdsv3_ib_ring_alloc(&ic->i_recv_ring, w_nr, &pos);
		if ((avail <= 0) || (pos >= w_nr)) {
			RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
			    "Argh - ring alloc returned pos=%u, avail: %d",
			    pos, avail);
			return (-1);
		}

		/* populate the WRs */
		for (i = 0; i < avail; i++) {
			recv = &ic->i_recvs[pos];
			ret = rdsv3_ib_recv_refill_one(conn, recv);
			if (ret) {
				rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
				    avail - i);
				break;
			}

			ic->i_recv_wrs[i].wr_id = (ibt_wrid_t)pos;
			ic->i_recv_wrs[i].wr_nds = RDSV3_IB_RECV_SGE;
			ic->i_recv_wrs[i].wr_sgl = &recv->r_sge[0];

			pos = (pos + 1) % w_nr;
		}

		if (i) {
			/* post the WRs at one shot */
			ret = ibt_post_recv(ib_get_ibt_channel_hdl(ic->i_cm_id),
			    &ic->i_recv_wrs[0], i, &posted);
			RDSV3_DPRINTF3("rdsv3_ib_recv_refill",
			    "attempted: %d posted: %d WRs ret %d",
			    i, posted, ret);
			if (ret) {
				RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
				    "disconnecting and reconnecting\n",
				    NIPQUAD(conn->c_faddr), ret);
				rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
				    i - posted);
				rdsv3_conn_drop(conn);
			}
		}
	}

	/* We're doing flow control - update the window. */
	if (ic->i_flowctl && posted)
		rdsv3_ib_advertise_credits(conn, posted);

	RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "Return: conn: %p, posted: %d",
	    conn, posted);

	return (ret);
}
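
/*
 * Call-path note (inferred from this file rather than the original OFED
 * comments): rdsv3_ib_recv_refill() runs with prefill set while the
 * connection is being brought up, and again without prefill whenever the
 * receive ring runs low - rdsv3_ib_recv_cqe_handler() below fires
 * ic->i_refill_rq in that case, and the worker behind it presumably ends
 * up calling rdsv3_ib_recv_refill(conn, 0).
 */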
/*
 * delayed freed incomings
 */
struct rdsv3_inc_pool {
	list_t		f_list;		/* list of freed incoming */
	kmutex_t	f_lock;		/* lock of fmr pool */
	int32_t		f_listcnt;
};
void
rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *rds_ibdev)
{
	struct rdsv3_inc_pool *pool = rds_ibdev->inc_pool;

	if (pool) {
		list_destroy(&pool->f_list);
		kmem_free((void *) pool, sizeof (*pool));
	}
}
int
rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *rds_ibdev)
{
	struct rdsv3_inc_pool *pool;

	pool = kmem_zalloc(sizeof (*pool), KM_NOSLEEP);
	if (pool == NULL)
		return (-ENOMEM);

	list_create(&pool->f_list, sizeof (struct rdsv3_ib_incoming),
	    offsetof(struct rdsv3_ib_incoming, ii_obj));
	mutex_init(&pool->f_lock, NULL, MUTEX_DRIVER, NULL);
	rds_ibdev->inc_pool = pool;
	return (0);
}
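
/*
 * Lifecycle sketch for the inc pool (a reading of the code below, not an
 * original comment): rdsv3_ib_inc_free() parks a finished rdsv3_ib_incoming
 * on pool->f_list and fires the device's inc_soft_cq thread; that thread
 * runs rdsv3_ib_drain_inclist(), which pulls entries off f_list and releases
 * their fragments and slab allocations via rdsv3_ib_inc_drop().
 */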
static void
rdsv3_ib_inc_drop(struct rdsv3_ib_incoming *ibinc)
{
	struct rdsv3_page_frag *frag;
	struct rdsv3_page_frag *pos;

	RDSV3_FOR_EACH_LIST_NODE_SAFE(frag, pos, &ibinc->ii_frags, f_item) {
		list_remove_node(&frag->f_item);
		kmem_cache_free(ibinc->ii_ibdev->ib_frag_slab, frag);
	}

	ASSERT(list_is_empty(&ibinc->ii_frags));
	kmem_cache_free(rdsv3_ib_incoming_slab, ibinc);
	atomic_dec_uint(&rdsv3_ib_allocation);
}
void
rdsv3_ib_drain_inclist(void *data)
{
	struct rdsv3_inc_pool *pool = (struct rdsv3_inc_pool *)data;
	struct rdsv3_ib_incoming *ibinc;
	list_t *listp = &pool->f_list;
	kmutex_t *lockp = &pool->f_lock;
	int i = 0;

	for (;;) {
		mutex_enter(lockp);
		ibinc = (struct rdsv3_ib_incoming *)list_remove_head(listp);
		if (ibinc)
			pool->f_listcnt--;
		mutex_exit(lockp);
		if (!ibinc)
			break;
		i++;
		rdsv3_ib_inc_drop(ibinc);
	}
}
void
rdsv3_ib_inc_free(struct rdsv3_incoming *inc)
{
	struct rdsv3_ib_incoming *ibinc;
	rdsv3_af_thr_t *af_thr;

	RDSV3_DPRINTF4("rdsv3_ib_inc_free", "inc: %p", inc);

	ibinc = __containerof(inc, struct rdsv3_ib_incoming, ii_inc);
	/* save af_thr in a local as ib_inc might be freed at mutex_exit */
	af_thr = ibinc->ii_ibdev->inc_soft_cq;

	mutex_enter(&ibinc->ii_pool->f_lock);
	list_insert_tail(&ibinc->ii_pool->f_list, ibinc);
	ibinc->ii_pool->f_listcnt++;
	mutex_exit(&ibinc->ii_pool->f_lock);

	rdsv3_af_thr_fire(af_thr);
}
int
rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
    size_t size)
{
	struct rdsv3_ib_incoming *ibinc;
	struct rdsv3_page_frag *frag;
	unsigned long to_copy;
	unsigned long frag_off = 0;
	int copied = 0;
	int ret;
	uint32_t len;

	ibinc = __containerof(inc, struct rdsv3_ib_incoming, ii_inc);
	frag = list_head(&ibinc->ii_frags);
	len = ntohl(inc->i_hdr.h_len);

	RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user", "inc: %p, size: %d len: %d",
	    inc, size, len);

	while (copied < size && copied < len) {
		if (frag_off == RDSV3_FRAG_SIZE) {
			frag = list_next(&ibinc->ii_frags, frag);
			frag_off = 0;
		}

		to_copy = min(len - copied, RDSV3_FRAG_SIZE - frag_off);
		to_copy = min(size - copied, to_copy);

		RDSV3_DPRINTF5("rdsv3_ib_inc_copy_to_user",
		    "%lu bytes to user %p from frag [%p, %u] + %lu",
		    to_copy, uiop,
		    frag->f_page, frag->f_offset, frag_off);

		ret = uiomove((caddr_t)(frag->f_page +
		    frag->f_offset + frag_off),
		    to_copy, UIO_READ, uiop);
		if (ret) {
			RDSV3_DPRINTF2("rdsv3_ib_inc_copy_to_user",
			    "uiomove (%d) returned: %d", to_copy, ret);
			break;
		}

		frag_off += to_copy;
		copied += to_copy;
	}

	RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user",
	    "Return: inc: %p, copied: %d", inc, copied);

	return (copied);
}
/* ic starts out kmem_zalloc()ed */
void
rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic)
{
	ibt_send_wr_t *wr = &ic->i_ack_wr;
	ibt_wr_ds_t *sge = &ic->i_ack_sge;

	RDSV3_DPRINTF4("rdsv3_ib_recv_init_ack", "ic: %p", ic);

	sge->ds_va = ic->i_ack_dma;
	sge->ds_len = sizeof (struct rdsv3_header);
	sge->ds_key = ic->i_mr->lkey;

	wr->wr_sgl = sge;
	wr->wr_nds = 1;
	wr->wr_opcode = IBT_WRC_SEND;
	wr->wr_id = RDSV3_IB_ACK_WR_ID;
	wr->wr_flags = IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
}
/*
 * You'd think that with reliable IB connections you wouldn't need to ack
 * messages that have been received. The problem is that IB hardware generates
 * an ack message before it has DMAed the message into memory. This creates a
 * potential message loss if the HCA is disabled for any reason between when it
 * sends the ack and before the message is DMAed and processed. This is only a
 * potential issue if another HCA is available for fail-over.
 *
 * When the remote host receives our ack they'll free the sent message from
 * their send queue. To decrease the latency of this we always send an ack
 * immediately after we've received messages.
 *
 * For simplicity, we only have one ack in flight at a time. This puts
 * pressure on senders to have deep enough send queues to absorb the latency of
 * a single ack frame being in flight. This might not be good enough.
 *
 * This is implemented by having a long-lived send_wr and sge which point to a
 * statically allocated ack frame. This ack wr does not fall under the ring
 * accounting that the tx and rx wrs do. The QP attribute specifically makes
 * room for it beyond the ring size. Send completion notices its special
 * wr_id and avoids working with the ring in that case.
 */
static void
rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
    int ack_required)
{
	RDSV3_DPRINTF4("rdsv3_ib_set_ack", "ic: %p, seq: %lld ack: %d",
	    ic, seq, ack_required);

	mutex_enter(&ic->i_ack_lock);
	ic->i_ack_next = seq;
	if (ack_required)
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	mutex_exit(&ic->i_ack_lock);
}
static uint64_t
rdsv3_ib_get_ack(struct rdsv3_ib_connection *ic)
{
	uint64_t seq;

	RDSV3_DPRINTF4("rdsv3_ib_get_ack", "ic: %p", ic);

	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);

	mutex_enter(&ic->i_ack_lock);
	seq = ic->i_ack_next;
	mutex_exit(&ic->i_ack_lock);

	return (seq);
}
static void
rdsv3_ib_send_ack(struct rdsv3_ib_connection *ic, unsigned int adv_credits)
{
	struct rdsv3_header *hdr = ic->i_ack;
	uint64_t seq;
	int ret;

	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "ic: %p adv_credits: %d",
	    ic, adv_credits);

	seq = rdsv3_ib_get_ack(ic);

	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "send_ack: ic %p ack %llu",
	    ic, (unsigned long long) seq);
	rdsv3_message_populate_header(hdr, 0, 0, 0);
	hdr->h_ack = htonll(seq);
	hdr->h_credit = adv_credits;
	rdsv3_message_make_checksum(hdr);
	ic->i_ack_queued = jiffies;

	ret = ibt_post_send(RDSV3_QP2CHANHDL(ic->i_cm_id->qp), &ic->i_ack_wr, 1,
	    NULL);
	if (ret) {
		/*
		 * Failed to send. Release the WR, and
		 * force another ACK.
		 */
		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
		rdsv3_ib_stats_inc(s_ib_ack_send_failure);
		RDSV3_DPRINTF2("rdsv3_ib_send_ack", "sending ack failed\n");
		rdsv3_conn_drop(ic->conn);
	} else {
		rdsv3_ib_stats_inc(s_ib_ack_sent);
	}

	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "Return: ic: %p adv_credits: %d",
	    ic, adv_credits);
}
/*
 * There are 3 ways of getting acknowledgements to the peer:
 *  1.  We call rdsv3_ib_attempt_ack from the recv completion handler
 *      to send an ACK-only frame.
 *      However, there can be only one such frame in the send queue
 *      at any time, so we may have to postpone it.
 *  2.  When another (data) packet is transmitted while there's
 *      an ACK in the queue, we piggyback the ACK sequence number
 *      on the data packet.
 *  3.  If the ACK WR is done sending, we get called from the
 *      send queue completion handler, and check whether there's
 *      another ACK pending (postponed because the WR was on the
 *      queue). If so, we transmit it.
 *
 * We maintain 2 variables:
 *  -   i_ack_flags, which keeps track of whether the ACK WR
 *      is currently in the send queue or not (IB_ACK_IN_FLIGHT)
 *  -   i_ack_next, which is the last sequence number we received
 *
 * Potentially, send queue and receive queue handlers can run concurrently.
 * It would be nice to not have to use a spinlock to synchronize things,
 * but the one problem that rules this out is that 64bit updates are
 * not atomic on all platforms. Things would be a lot simpler if
 * we had atomic64 or maybe cmpxchg64 everywhere.
 *
 * Reconnecting complicates this picture just slightly. When we
 * reconnect, we may be seeing duplicate packets. The peer
 * is retransmitting them, because it hasn't seen an ACK for
 * them. It is important that we ACK these.
 *
 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
 * this flag set *MUST* be acknowledged immediately.
 */
/*
 * When we get here, we're called from the recv queue handler.
 * Check whether we ought to transmit an ACK.
 */
void
rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic)
{
	unsigned int adv_credits;

	RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "ic: %p", ic);

	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
		return;

	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
		rdsv3_ib_stats_inc(s_ib_ack_send_delayed);
		return;
	}

	/* Can we get a send credit? */
	if (!rdsv3_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
		rdsv3_ib_stats_inc(s_ib_tx_throttle);
		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
		return;
	}

	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	rdsv3_ib_send_ack(ic, adv_credits);

	RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "Return: ic: %p", ic);
}
/*
 * We get here from the send completion handler, when the
 * adapter tells us the ACK frame was sent.
 */
void
rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic)
{
	RDSV3_DPRINTF4("rdsv3_ib_ack_send_complete", "ic: %p", ic);
	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
	rdsv3_ib_attempt_ack(ic);
}
/*
 * This is called by the regular xmit code when it wants to piggyback
 * an ACK on an outgoing frame.
 */
uint64_t
rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic)
{
	RDSV3_DPRINTF4("rdsv3_ib_piggyb_ack", "ic: %p", ic);
	if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) {
		rdsv3_ib_stats_inc(s_ib_ack_send_piggybacked);
	}
	return (rdsv3_ib_get_ack(ic));
}
/*
 * It's kind of lame that we're copying from the posted receive pages into
 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
 * them. But receiving new congestion bitmaps should be a *rare* event, so
 * hopefully we won't need to invest that complexity in making it more
 * efficient. By copying we can share a simpler core with TCP which has to
 * copy.
 */
static void
rdsv3_ib_cong_recv(struct rdsv3_connection *conn,
    struct rdsv3_ib_incoming *ibinc)
{
	struct rdsv3_cong_map *map;
	unsigned int map_off;
	unsigned int map_page;
	struct rdsv3_page_frag *frag;
	unsigned long frag_off;
	unsigned long to_copy;
	unsigned long copied;
	uint64_t uncongested = 0;
	caddr_t addr;
	uint64_t *src, *dst;
	unsigned int k;

	RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "conn: %p, ibinc: %p",
	    conn, ibinc);

	/* catch completely corrupt packets */
	if (ntohl(ibinc->ii_inc.i_hdr.h_len) != RDSV3_CONG_MAP_BYTES)
		return;

	map = conn->c_fcong;
	map_page = 0;
	map_off = 0;

	frag = list_head(&ibinc->ii_frags);
	frag_off = 0;

	copied = 0;

	while (copied < RDSV3_CONG_MAP_BYTES) {
		to_copy = min(RDSV3_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
		ASSERT(!(to_copy & 7)); /* Must be 64bit aligned. */

		addr = frag->f_page + frag->f_offset;

		src = (uint64_t *)(addr + frag_off);
		dst = (uint64_t *)(map->m_page_addrs[map_page] + map_off);
		RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
		    "src: %p dst: %p copied: %d", src, dst, copied);
		for (k = 0; k < to_copy; k += 8) {
			/*
			 * Record ports that became uncongested, ie
			 * bits that changed from 0 to 1.
			 */
			uncongested |= ~(*src) & *dst;
			*dst++ = *src++;
		}

		RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
		    "src: %p dst: %p copied: %d", src, dst, copied);

		copied += to_copy;

		map_off += to_copy;
		if (map_off == PAGE_SIZE) {
			map_off = 0;
			map_page++;
		}

		frag_off += to_copy;
		if (frag_off == RDSV3_FRAG_SIZE) {
			frag = list_next(&ibinc->ii_frags, frag);
			frag_off = 0;
		}
	}

	/* the congestion map is in little endian order */
	uncongested = le64_to_cpu(uncongested);

	rdsv3_cong_map_updated(map, uncongested);

	RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "Return: conn: %p, ibinc: %p",
	    conn, ibinc);
}
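
/*
 * Note on the copy loop above: the receive side is chunked into
 * RDSV3_FRAG_SIZE fragments while the congestion map is stored in PAGE_SIZE
 * pages, so two sets of offsets (frag_off vs. map_off/map_page) are tracked
 * and each pass copies min(RDSV3_FRAG_SIZE - frag_off, PAGE_SIZE - map_off)
 * bytes so that neither boundary is crossed mid-copy.
 */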
static void
rdsv3_ib_process_recv(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv, uint32_t data_len,
    struct rdsv3_ib_ack_state *state)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_incoming *ibinc = ic->i_ibinc;
	struct rdsv3_header *ihdr, *hdr;

	/* XXX shut down the connection if port 0,0 are seen? */

	RDSV3_DPRINTF5("rdsv3_ib_process_recv",
	    "ic %p ibinc %p recv %p byte len %u", ic, ibinc, recv, data_len);

	if (data_len < sizeof (struct rdsv3_header)) {
		RDSV3_DPRINTF2("rdsv3_ib_process_recv",
		    "incoming message from %u.%u.%u.%u didn't include a "
		    "header, disconnecting and reconnecting",
		    NIPQUAD(conn->c_faddr));
		rdsv3_conn_drop(conn);
		return;
	}
	data_len -= sizeof (struct rdsv3_header);

	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];

	/* Validate the checksum. */
	if (!rdsv3_message_verify_checksum(ihdr)) {
		RDSV3_DPRINTF2("rdsv3_ib_process_recv", "incoming message "
		    "from %u.%u.%u.%u has corrupted header - "
		    "forcing a reconnect",
		    NIPQUAD(conn->c_faddr));
		rdsv3_conn_drop(conn);
		rdsv3_stats_inc(s_recv_drop_bad_checksum);
		return;
	}

	/* Process the ACK sequence which comes with every packet */
	state->ack_recv = ntohll(ihdr->h_ack);
	state->ack_recv_valid = 1;

	/* Process the credits update if there was one */
	if (ihdr->h_credit)
		rdsv3_ib_send_add_credits(conn, ihdr->h_credit);

	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
		/*
		 * This is an ACK-only packet. The fact that it gets
		 * special treatment here is that historically, ACKs
		 * were rather special beasts.
		 */
		rdsv3_ib_stats_inc(s_ib_ack_received);
		return;
	}

	/*
	 * If we don't already have an inc on the connection then this
	 * fragment has a header and starts a message.. copy its header
	 * into the inc and save the inc so we can hang upcoming fragments
	 * off its list.
	 */
	if (ibinc == NULL) {
		ibinc = recv->r_ibinc;
		recv->r_ibinc = NULL;
		ic->i_ibinc = ibinc;

		hdr = &ibinc->ii_inc.i_hdr;
		(void) memcpy(hdr, ihdr, sizeof (*hdr));
		ic->i_recv_data_rem = ntohl(hdr->h_len);

		RDSV3_DPRINTF5("rdsv3_ib_process_recv",
		    "ic %p ibinc %p rem %u flag 0x%x", ic, ibinc,
		    ic->i_recv_data_rem, hdr->h_flags);
	} else {
		hdr = &ibinc->ii_inc.i_hdr;
		/*
		 * We can't just use memcmp here; fragments of a
		 * single message may carry different ACKs
		 */
		if (hdr->h_sequence != ihdr->h_sequence ||
		    hdr->h_len != ihdr->h_len ||
		    hdr->h_sport != ihdr->h_sport ||
		    hdr->h_dport != ihdr->h_dport) {
			RDSV3_DPRINTF2("rdsv3_ib_process_recv",
			    "fragment header mismatch; forcing reconnect");
			rdsv3_conn_drop(conn);
			return;
		}
	}

	list_insert_tail(&ibinc->ii_frags, recv->r_frag);
	recv->r_frag = NULL;

	if (ic->i_recv_data_rem > RDSV3_FRAG_SIZE) {
		ic->i_recv_data_rem -= RDSV3_FRAG_SIZE;
	} else {
		ic->i_recv_data_rem = 0;
		ic->i_ibinc = NULL;

		if (ibinc->ii_inc.i_hdr.h_flags == RDSV3_FLAG_CONG_BITMAP) {
			rdsv3_ib_cong_recv(conn, ibinc);
		} else {
			rdsv3_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
			    &ibinc->ii_inc, KM_NOSLEEP);
			state->ack_next = ntohll(hdr->h_sequence);
			state->ack_next_valid = 1;
		}

		/*
		 * Evaluate the ACK_REQUIRED flag *after* we received
		 * the complete frame, and after bumping the next_rx
		 * sequence.
		 */
		if (hdr->h_flags & RDSV3_FLAG_ACK_REQUIRED) {
			rdsv3_stats_inc(s_recv_ack_required);
			state->ack_required = 1;
		}

		rdsv3_inc_put(&ibinc->ii_inc);
	}

	RDSV3_DPRINTF4("rdsv3_ib_process_recv",
	    "Return: conn: %p recv: %p len: %d state: %p",
	    conn, recv, data_len, state);
}
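
/*
 * rdsv3_ib_process_recv() never posts an ACK itself; it only records what it
 * learned in the rdsv3_ib_ack_state it was handed (ack_recv/ack_recv_valid
 * from the peer's header, ack_next/ack_next_valid and ack_required for our
 * side). The caller - rdsv3_ib_recv_cqe_handler() and, presumably, the CQ
 * drain loop above it - turns that accumulated state into a single ACK
 * attempt via rdsv3_ib_attempt_ack().
 */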
void
rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc,
    struct rdsv3_ib_ack_state *state)
{
	struct rdsv3_connection *conn = ic->conn;
	struct rdsv3_ib_recv_work *recv;
	struct rdsv3_ib_work_ring *recv_ringp = &ic->i_recv_ring;

	RDSV3_DPRINTF4("rdsv3_ib_recv_cqe_handler",
	    "rwc wc_id 0x%llx status %u byte_len %u imm_data %u\n",
	    (unsigned long long)wc->wc_id, wc->wc_status,
	    wc->wc_bytes_xfer, ntohl(wc->wc_immed_data));

	rdsv3_ib_stats_inc(s_ib_rx_cq_event);

	recv = &ic->i_recvs[rdsv3_ib_ring_oldest(recv_ringp)];

	/*
	 * Also process recvs in connecting state because it is possible
	 * to get a recv completion _before_ the rdmacm ESTABLISHED
	 * event is processed.
	 */
	if (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn)) {
		/* We expect errors as the qp is drained during shutdown */
		if (wc->wc_status == IBT_WC_SUCCESS) {
			rdsv3_ib_process_recv(conn, recv,
			    wc->wc_bytes_xfer, state);
		} else {
			RDSV3_DPRINTF2("rdsv3_ib_recv_cqe_handler",
			    "recv completion on "
			    "%u.%u.%u.%u had status %u, "
			    "disconnecting and reconnecting\n",
			    NIPQUAD(conn->c_faddr),
			    wc->wc_status);
			rdsv3_conn_drop(conn);
		}
	}

	rdsv3_ib_ring_free(recv_ringp, 1);

	/*
	 * If we ever end up with a really empty receive ring, we're
	 * in deep trouble, as the sender will definitely see RNR
	 * timeouts.
	 */
	if (rdsv3_ib_ring_empty(recv_ringp))
		rdsv3_ib_stats_inc(s_ib_rx_ring_empty);

	if (rdsv3_ib_ring_low(recv_ringp)) {
		rdsv3_af_thr_fire(ic->i_refill_rq);
	}
}
int
rdsv3_ib_recv(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	int ret = 0;

	RDSV3_DPRINTF4("rdsv3_ib_recv", "conn %p\n", conn);

	if (rdsv3_conn_up(conn))
		rdsv3_ib_attempt_ack(ic);

	RDSV3_DPRINTF4("rdsv3_ib_recv", "Return: conn: %p", conn);

	return (ret);
}
extern int rdsv3_ib_inc_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_inc_destructor(void *buf, void *arg);
int
rdsv3_ib_recv_init(void)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Enter");

	rdsv3_ib_incoming_slab = kmem_cache_create("rdsv3_ib_incoming",
	    sizeof (struct rdsv3_ib_incoming), 0, rdsv3_ib_inc_constructor,
	    rdsv3_ib_inc_destructor, NULL, NULL, NULL, 0);
	if (!rdsv3_ib_incoming_slab) {
		RDSV3_DPRINTF2("rdsv3_ib_recv_init", "kmem_cache_create "
		    "failed");
		return (-ENOMEM);
	}

	RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Return");

	return (0);
}
void
rdsv3_ib_recv_exit(void)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Enter");
	kmem_cache_destroy(rdsv3_ib_incoming_slab);
	RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Return");
}