/*
 * RDSv3 InfiniBand receive path
 * (uts/common/io/ib/clients/rdsv3/ib_recv.c)
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file ib_recv.c
 * Oracle elects to have and use the contents of ib_recv.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 *
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
44 #include <sys/types.h>
45 #include <sys/kmem.h>
46 #include <sys/cpuvar.h>
47 #include <sys/rds.h>
48 #include <sys/containerof.h>
50 #include <sys/ib/clients/rdsv3/rdsv3.h>
51 #include <sys/ib/clients/rdsv3/ib.h>
52 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
54 static struct kmem_cache *rdsv3_ib_incoming_slab;
55 static atomic_t rdsv3_ib_allocation = ATOMIC_INIT(0);
57 void
58 rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic)
60 struct rdsv3_ib_recv_work *recv;
61 struct rdsv3_header *hdrp;
62 uint32_t i;
64 RDSV3_DPRINTF4("rdsv3_ib_recv_init_ring", "ic: %p", ic);
66 hdrp = ic->i_recv_hdrs;
67 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
68 recv->r_ibinc = NULL;
69 recv->r_frag = NULL;
71 /* initialize the hdr sgl permanently */
72 recv->r_sge[0].ds_va = (ib_vaddr_t)(uintptr_t)hdrp++;
73 recv->r_sge[0].ds_len = sizeof (struct rdsv3_header);
74 recv->r_sge[0].ds_key = ic->i_mr->lkey;
78 static void
79 rdsv3_ib_recv_clear_one(struct rdsv3_ib_connection *ic,
80 struct rdsv3_ib_recv_work *recv)
82 RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "ic: %p, recv: %p",
83 ic, recv);
85 if (recv->r_ibinc) {
86 rdsv3_inc_put(&recv->r_ibinc->ii_inc);
87 recv->r_ibinc = NULL;
90 if (recv->r_frag) {
91 kmem_cache_free(ic->rds_ibdev->ib_frag_slab, recv->r_frag);
92 recv->r_frag = NULL;
95 RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "Return: ic: %p, recv: %p",
96 ic, recv);
99 void
100 rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic)
102 uint32_t i;
104 RDSV3_DPRINTF4("rdsv3_ib_recv_clear_ring", "ic: %p", ic);
106 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
107 rdsv3_ib_recv_clear_one(ic, &ic->i_recvs[i]);
110 extern int atomic_add_unless(atomic_t *, uint_t, ulong_t);
112 static int
113 rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn,
114 struct rdsv3_ib_recv_work *recv)
116 struct rdsv3_ib_connection *ic = conn->c_transport_data;
117 ibt_mi_hdl_t mi_hdl;
118 ibt_iov_attr_t iov_attr;
119 ibt_iov_t iov_arr[1];
121 RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "conn: %p, recv: %p",
122 conn, recv);
124 if (!recv->r_ibinc) {
125 if (!atomic_add_unless(&rdsv3_ib_allocation, 1,
126 ic->i_max_recv_alloc)) {
127 rdsv3_ib_stats_inc(s_ib_rx_alloc_limit);
128 goto out;
130 recv->r_ibinc = kmem_cache_alloc(rdsv3_ib_incoming_slab,
131 KM_NOSLEEP);
132 if (recv->r_ibinc == NULL) {
133 atomic_dec_32(&rdsv3_ib_allocation);
134 goto out;
136 rdsv3_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
137 recv->r_ibinc->ii_ibdev = ic->rds_ibdev;
138 recv->r_ibinc->ii_pool = ic->rds_ibdev->inc_pool;
141 if (!recv->r_frag) {
142 recv->r_frag = kmem_cache_alloc(ic->rds_ibdev->ib_frag_slab,
143 KM_NOSLEEP);
144 if (!recv->r_frag)
145 goto out;
148 /* Data sge, structure copy */
149 recv->r_sge[1] = recv->r_frag->f_sge;
151 RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "Return: conn: %p, recv: %p",
152 conn, recv);
154 return (0);
155 out:
156 if (recv->r_ibinc) {
157 kmem_cache_free(rdsv3_ib_incoming_slab, recv->r_ibinc);
158 atomic_dec_32(&rdsv3_ib_allocation);
159 recv->r_ibinc = NULL;
161 return (-ENOMEM);
165 * This tries to allocate and post unused work requests after making sure that
166 * they have all the allocations they need to queue received fragments into
167 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
168 * pairs don't go unmatched.
170 * -1 is returned if posting fails due to temporary resource exhaustion.
173 rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill)
175 struct rdsv3_ib_connection *ic = conn->c_transport_data;
176 struct rdsv3_ib_recv_work *recv;
177 unsigned int posted = 0;
178 int ret = 0, avail;
179 uint32_t pos, i;
181 RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "conn: %p, prefill: %d",
182 conn, prefill);
184 if (prefill || rdsv3_conn_up(conn)) {
185 uint_t w_nr = ic->i_recv_ring.w_nr;
187 avail = rdsv3_ib_ring_alloc(&ic->i_recv_ring, w_nr, &pos);
188 if ((avail <= 0) || (pos >= w_nr)) {
189 RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
190 "Argh - ring alloc returned pos=%u, avail: %d",
191 pos, avail);
192 return (-EINVAL);
195 /* populate the WRs */
196 for (i = 0; i < avail; i++) {
197 recv = &ic->i_recvs[pos];
198 ret = rdsv3_ib_recv_refill_one(conn, recv);
199 if (ret) {
200 rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
201 avail - i);
202 break;
204 ic->i_recv_wrs[i].wr_id = (ibt_wrid_t)pos;
205 ic->i_recv_wrs[i].wr_nds = RDSV3_IB_RECV_SGE;
206 ic->i_recv_wrs[i].wr_sgl = &recv->r_sge[0];
208 pos = (pos + 1) % w_nr;
211 if (i) {
212 /* post the WRs at one shot */
213 ret = ibt_post_recv(ib_get_ibt_channel_hdl(ic->i_cm_id),
214 &ic->i_recv_wrs[0], i, &posted);
215 RDSV3_DPRINTF3("rdsv3_ib_recv_refill",
216 "attempted: %d posted: %d WRs ret %d",
217 i, posted, ret);
218 if (ret) {
219 RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
220 "disconnecting and reconnecting\n",
221 NIPQUAD(conn->c_faddr), ret);
222 rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
223 i - posted);
224 rdsv3_conn_drop(conn);
229 /* We're doing flow control - update the window. */
230 if (ic->i_flowctl && posted)
231 rdsv3_ib_advertise_credits(conn, posted);
233 RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "Return: conn: %p, posted: %d",
234 conn, posted);
235 return (ret);
239 * delayed freed incoming's
241 struct rdsv3_inc_pool {
242 list_t f_list; /* list of freed incoming */
243 kmutex_t f_lock; /* lock of fmr pool */
244 int32_t f_listcnt;
247 void
248 rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *rds_ibdev)
250 struct rdsv3_inc_pool *pool = rds_ibdev->inc_pool;
252 if (pool) {
253 list_destroy(&pool->f_list);
254 kmem_free((void *) pool, sizeof (*pool));
259 rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *rds_ibdev)
261 struct rdsv3_inc_pool *pool;
263 pool = kmem_zalloc(sizeof (*pool), KM_NOSLEEP);
264 if (pool == NULL) {
265 return (-ENOMEM);
267 list_create(&pool->f_list, sizeof (struct rdsv3_ib_incoming),
268 offsetof(struct rdsv3_ib_incoming, ii_obj));
269 mutex_init(&pool->f_lock, NULL, MUTEX_DRIVER, NULL);
270 rds_ibdev->inc_pool = pool;
271 return (0);
274 static void
275 rdsv3_ib_inc_drop(struct rdsv3_ib_incoming *ibinc)
277 struct rdsv3_page_frag *frag;
278 struct rdsv3_page_frag *pos;
280 RDSV3_FOR_EACH_LIST_NODE_SAFE(frag, pos, &ibinc->ii_frags, f_item) {
281 list_remove_node(&frag->f_item);
282 kmem_cache_free(ibinc->ii_ibdev->ib_frag_slab, frag);
285 ASSERT(list_is_empty(&ibinc->ii_frags));
286 kmem_cache_free(rdsv3_ib_incoming_slab, ibinc);
287 atomic_dec_uint(&rdsv3_ib_allocation);
290 void
291 rdsv3_ib_drain_inclist(void *data)
293 struct rdsv3_inc_pool *pool = (struct rdsv3_inc_pool *)data;
294 struct rdsv3_ib_incoming *ibinc;
295 list_t *listp = &pool->f_list;
296 kmutex_t *lockp = &pool->f_lock;
297 int i = 0;
299 for (;;) {
300 mutex_enter(lockp);
301 ibinc = (struct rdsv3_ib_incoming *)list_remove_head(listp);
302 if (ibinc)
303 pool->f_listcnt--;
304 mutex_exit(lockp);
305 if (!ibinc)
306 break;
307 i++;
308 rdsv3_ib_inc_drop(ibinc);
312 void
313 rdsv3_ib_inc_free(struct rdsv3_incoming *inc)
315 struct rdsv3_ib_incoming *ibinc;
316 rdsv3_af_thr_t *af_thr;
318 RDSV3_DPRINTF4("rdsv3_ib_inc_free", "inc: %p", inc);
320 ibinc = __containerof(inc, struct rdsv3_ib_incoming, ii_inc);
321 /* save af_thr in a local as ib_inc might be freed at mutex_exit */
322 af_thr = ibinc->ii_ibdev->inc_soft_cq;
324 mutex_enter(&ibinc->ii_pool->f_lock);
325 list_insert_tail(&ibinc->ii_pool->f_list, ibinc);
326 ibinc->ii_pool->f_listcnt++;
327 mutex_exit(&ibinc->ii_pool->f_lock);
329 rdsv3_af_thr_fire(af_thr);
333 rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
334 size_t size)
336 struct rdsv3_ib_incoming *ibinc;
337 struct rdsv3_page_frag *frag;
338 unsigned long to_copy;
339 unsigned long frag_off = 0;
340 int copied = 0;
341 int ret;
342 uint32_t len;
344 ibinc = __containerof(inc, struct rdsv3_ib_incoming, ii_inc);
345 frag = list_head(&ibinc->ii_frags);
346 len = ntohl(inc->i_hdr.h_len);
348 RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user", "inc: %p, size: %d len: %d",
349 inc, size, len);
351 while (copied < size && copied < len) {
352 if (frag_off == RDSV3_FRAG_SIZE) {
353 frag = list_next(&ibinc->ii_frags, frag);
354 frag_off = 0;
357 to_copy = min(len - copied, RDSV3_FRAG_SIZE - frag_off);
358 to_copy = min(size - copied, to_copy);
360 RDSV3_DPRINTF5("rdsv3_ib_inc_copy_to_user",
361 "%lu bytes to user %p from frag [%p, %u] + %lu",
362 to_copy, uiop,
363 frag->f_page, frag->f_offset, frag_off);
365 ret = uiomove((caddr_t)(frag->f_page +
366 frag->f_offset + frag_off),
367 to_copy, UIO_READ, uiop);
368 if (ret) {
369 RDSV3_DPRINTF2("rdsv3_ib_inc_copy_to_user",
370 "uiomove (%d) returned: %d", to_copy, ret);
371 break;
374 frag_off += to_copy;
375 copied += to_copy;
378 RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user",
379 "Return: inc: %p, copied: %d", inc, copied);
381 return (copied);
384 /* ic starts out kmem_zalloc()ed */
385 void
386 rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic)
388 ibt_send_wr_t *wr = &ic->i_ack_wr;
389 ibt_wr_ds_t *sge = &ic->i_ack_sge;
391 RDSV3_DPRINTF4("rdsv3_ib_recv_init_ack", "ic: %p", ic);
393 sge->ds_va = ic->i_ack_dma;
394 sge->ds_len = sizeof (struct rdsv3_header);
395 sge->ds_key = ic->i_mr->lkey;
397 wr->wr_sgl = sge;
398 wr->wr_nds = 1;
399 wr->wr_opcode = IBT_WRC_SEND;
400 wr->wr_id = RDSV3_IB_ACK_WR_ID;
401 wr->wr_flags = IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
405 * You'd think that with reliable IB connections you wouldn't need to ack
406 * messages that have been received. The problem is that IB hardware generates
407 * an ack message before it has DMAed the message into memory. This creates a
408 * potential message loss if the HCA is disabled for any reason between when it
409 * sends the ack and before the message is DMAed and processed. This is only a
410 * potential issue if another HCA is available for fail-over.
412 * When the remote host receives our ack they'll free the sent message from
413 * their send queue. To decrease the latency of this we always send an ack
414 * immediately after we've received messages.
416 * For simplicity, we only have one ack in flight at a time. This puts
417 * pressure on senders to have deep enough send queues to absorb the latency of
418 * a single ack frame being in flight. This might not be good enough.
420 * This is implemented by have a long-lived send_wr and sge which point to a
421 * statically allocated ack frame. This ack wr does not fall under the ring
422 * accounting that the tx and rx wrs do. The QP attribute specifically makes
423 * room for it beyond the ring size. Send completion notices its special
424 * wr_id and avoids working with the ring in that case.
426 void
427 rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
428 int ack_required)
430 RDSV3_DPRINTF4("rdsv3_ib_set_ack", "ic: %p, seq: %lld ack: %d",
431 ic, seq, ack_required);
433 mutex_enter(&ic->i_ack_lock);
434 ic->i_ack_next = seq;
435 if (ack_required)
436 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
437 mutex_exit(&ic->i_ack_lock);
440 static uint64_t
441 rdsv3_ib_get_ack(struct rdsv3_ib_connection *ic)
443 uint64_t seq;
445 RDSV3_DPRINTF4("rdsv3_ib_get_ack", "ic: %p", ic);
447 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
449 mutex_enter(&ic->i_ack_lock);
450 seq = ic->i_ack_next;
451 mutex_exit(&ic->i_ack_lock);
453 return (seq);
456 static void
457 rdsv3_ib_send_ack(struct rdsv3_ib_connection *ic, unsigned int adv_credits)
459 struct rdsv3_header *hdr = ic->i_ack;
460 uint64_t seq;
461 int ret;
463 RDSV3_DPRINTF4("rdsv3_ib_send_ack", "ic: %p adv_credits: %d",
464 ic, adv_credits);
466 seq = rdsv3_ib_get_ack(ic);
468 RDSV3_DPRINTF4("rdsv3_ib_send_ack", "send_ack: ic %p ack %llu",
469 ic, (unsigned long long) seq);
470 rdsv3_message_populate_header(hdr, 0, 0, 0);
471 hdr->h_ack = htonll(seq);
472 hdr->h_credit = adv_credits;
473 rdsv3_message_make_checksum(hdr);
474 ic->i_ack_queued = jiffies;
476 ret = ibt_post_send(RDSV3_QP2CHANHDL(ic->i_cm_id->qp), &ic->i_ack_wr, 1,
477 NULL);
478 if (ret) {
480 * Failed to send. Release the WR, and
481 * force another ACK.
483 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
484 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
485 rdsv3_ib_stats_inc(s_ib_ack_send_failure);
486 RDSV3_DPRINTF2("rdsv3_ib_send_ack", "sending ack failed\n");
487 rdsv3_conn_drop(ic->conn);
488 } else {
489 rdsv3_ib_stats_inc(s_ib_ack_sent);
491 RDSV3_DPRINTF4("rdsv3_ib_send_ack", "Return: ic: %p adv_credits: %d",
492 ic, adv_credits);
496 * There are 3 ways of getting acknowledgements to the peer:
497 * 1. We call rdsv3_ib_attempt_ack from the recv completion handler
498 * to send an ACK-only frame.
499 * However, there can be only one such frame in the send queue
500 * at any time, so we may have to postpone it.
501 * 2. When another (data) packet is transmitted while there's
502 * an ACK in the queue, we piggyback the ACK sequence number
503 * on the data packet.
504 * 3. If the ACK WR is done sending, we get called from the
505 * send queue completion handler, and check whether there's
506 * another ACK pending (postponed because the WR was on the
507 * queue). If so, we transmit it.
509 * We maintain 2 variables:
510 * - i_ack_flags, which keeps track of whether the ACK WR
511 * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
512 * - i_ack_next, which is the last sequence number we received
514 * Potentially, send queue and receive queue handlers can run concurrently.
515 * It would be nice to not have to use a spinlock to synchronize things,
516 * but the one problem that rules this out is that 64bit updates are
517 * not atomic on all platforms. Things would be a lot simpler if
518 * we had atomic64 or maybe cmpxchg64 everywhere.
520 * Reconnecting complicates this picture just slightly. When we
521 * reconnect, we may be seeing duplicate packets. The peer
522 * is retransmitting them, because it hasn't seen an ACK for
523 * them. It is important that we ACK these.
525 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
526 * this flag set *MUST* be acknowledged immediately.
530 * When we get here, we're called from the recv queue handler.
531 * Check whether we ought to transmit an ACK.
533 void
534 rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic)
536 unsigned int adv_credits;
538 RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "ic: %p", ic);
540 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
541 return;
543 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
544 rdsv3_ib_stats_inc(s_ib_ack_send_delayed);
545 return;
548 /* Can we get a send credit? */
549 if (!rdsv3_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
550 rdsv3_ib_stats_inc(s_ib_tx_throttle);
551 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
552 return;
555 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
556 rdsv3_ib_send_ack(ic, adv_credits);
558 RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "Return: ic: %p", ic);
562 * We get here from the send completion handler, when the
563 * adapter tells us the ACK frame was sent.
565 void
566 rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic)
568 RDSV3_DPRINTF4("rdsv3_ib_ack_send_complete", "ic: %p", ic);
569 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
570 rdsv3_ib_attempt_ack(ic);
574 * This is called by the regular xmit code when it wants to piggyback
575 * an ACK on an outgoing frame.
577 uint64_t
578 rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic)
580 RDSV3_DPRINTF4("rdsv3_ib_piggyb_ack", "ic: %p", ic);
581 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) {
582 rdsv3_ib_stats_inc(s_ib_ack_send_piggybacked);
584 return (rdsv3_ib_get_ack(ic));
588 * It's kind of lame that we're copying from the posted receive pages into
589 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
590 * them. But receiving new congestion bitmaps should be a *rare* event, so
591 * hopefully we won't need to invest that complexity in making it more
592 * efficient. By copying we can share a simpler core with TCP which has to
593 * copy.
595 static void
596 rdsv3_ib_cong_recv(struct rdsv3_connection *conn,
597 struct rdsv3_ib_incoming *ibinc)
599 struct rdsv3_cong_map *map;
600 unsigned int map_off;
601 unsigned int map_page;
602 struct rdsv3_page_frag *frag;
603 unsigned long frag_off;
604 unsigned long to_copy;
605 unsigned long copied;
606 uint64_t uncongested = 0;
607 caddr_t addr;
609 RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "conn: %p, ibinc: %p",
610 conn, ibinc);
612 /* catch completely corrupt packets */
613 if (ntohl(ibinc->ii_inc.i_hdr.h_len) != RDSV3_CONG_MAP_BYTES)
614 return;
616 map = conn->c_fcong;
617 map_page = 0;
618 map_off = 0;
620 frag = list_head(&ibinc->ii_frags);
621 frag_off = 0;
623 copied = 0;
625 while (copied < RDSV3_CONG_MAP_BYTES) {
626 uint64_t *src, *dst;
627 unsigned int k;
629 to_copy = min(RDSV3_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
630 ASSERT(!(to_copy & 7)); /* Must be 64bit aligned. */
632 addr = frag->f_page + frag->f_offset;
634 src = (uint64_t *)(addr + frag_off);
635 dst = (uint64_t *)(map->m_page_addrs[map_page] + map_off);
636 RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
637 "src: %p dst: %p copied: %d", src, dst, copied);
638 for (k = 0; k < to_copy; k += 8) {
640 * Record ports that became uncongested, ie
641 * bits that changed from 0 to 1.
643 uncongested |= ~(*src) & *dst;
644 *dst++ = *src++;
647 copied += to_copy;
648 RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
649 "src: %p dst: %p copied: %d", src, dst, copied);
651 map_off += to_copy;
652 if (map_off == PAGE_SIZE) {
653 map_off = 0;
654 map_page++;
657 frag_off += to_copy;
658 if (frag_off == RDSV3_FRAG_SIZE) {
659 frag = list_next(&ibinc->ii_frags, frag);
660 frag_off = 0;
664 #if 0
666 /* the congestion map is in little endian order */
667 uncongested = le64_to_cpu(uncongested);
668 #endif
670 rdsv3_cong_map_updated(map, uncongested);
672 RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "Return: conn: %p, ibinc: %p",
673 conn, ibinc);
676 static void
677 rdsv3_ib_process_recv(struct rdsv3_connection *conn,
678 struct rdsv3_ib_recv_work *recv, uint32_t data_len,
679 struct rdsv3_ib_ack_state *state)
681 struct rdsv3_ib_connection *ic = conn->c_transport_data;
682 struct rdsv3_ib_incoming *ibinc = ic->i_ibinc;
683 struct rdsv3_header *ihdr, *hdr;
685 /* XXX shut down the connection if port 0,0 are seen? */
687 RDSV3_DPRINTF5("rdsv3_ib_process_recv",
688 "ic %p ibinc %p recv %p byte len %u", ic, ibinc, recv, data_len);
690 if (data_len < sizeof (struct rdsv3_header)) {
691 RDSV3_DPRINTF2("rdsv3_ib_process_recv",
692 "incoming message from %u.%u.%u.%u didn't include a "
693 "header, disconnecting and reconnecting",
694 NIPQUAD(conn->c_faddr));
695 rdsv3_conn_drop(conn);
696 return;
698 data_len -= sizeof (struct rdsv3_header);
700 ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
702 /* Validate the checksum. */
703 if (!rdsv3_message_verify_checksum(ihdr)) {
704 RDSV3_DPRINTF2("rdsv3_ib_process_recv", "incoming message "
705 "from %u.%u.%u.%u has corrupted header - "
706 "forcing a reconnect",
707 NIPQUAD(conn->c_faddr));
708 rdsv3_conn_drop(conn);
709 rdsv3_stats_inc(s_recv_drop_bad_checksum);
710 return;
713 /* Process the ACK sequence which comes with every packet */
714 state->ack_recv = ntohll(ihdr->h_ack);
715 state->ack_recv_valid = 1;
717 /* Process the credits update if there was one */
718 if (ihdr->h_credit)
719 rdsv3_ib_send_add_credits(conn, ihdr->h_credit);
721 if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
723 * This is an ACK-only packet. The fact that it gets
724 * special treatment here is that historically, ACKs
725 * were rather special beasts.
727 rdsv3_ib_stats_inc(s_ib_ack_received);
728 return;
732 * If we don't already have an inc on the connection then this
733 * fragment has a header and starts a message.. copy its header
734 * into the inc and save the inc so we can hang upcoming fragments
735 * off its list.
737 if (!ibinc) {
738 ibinc = recv->r_ibinc;
739 recv->r_ibinc = NULL;
740 ic->i_ibinc = ibinc;
742 hdr = &ibinc->ii_inc.i_hdr;
743 (void) memcpy(hdr, ihdr, sizeof (*hdr));
744 ic->i_recv_data_rem = ntohl(hdr->h_len);
746 RDSV3_DPRINTF5("rdsv3_ib_process_recv",
747 "ic %p ibinc %p rem %u flag 0x%x", ic, ibinc,
748 ic->i_recv_data_rem, hdr->h_flags);
749 } else {
750 hdr = &ibinc->ii_inc.i_hdr;
752 * We can't just use memcmp here; fragments of a
753 * single message may carry different ACKs
755 if (hdr->h_sequence != ihdr->h_sequence ||
756 hdr->h_len != ihdr->h_len ||
757 hdr->h_sport != ihdr->h_sport ||
758 hdr->h_dport != ihdr->h_dport) {
759 RDSV3_DPRINTF2("rdsv3_ib_process_recv",
760 "fragment header mismatch; forcing reconnect");
761 rdsv3_conn_drop(conn);
762 return;
766 list_insert_tail(&ibinc->ii_frags, recv->r_frag);
767 recv->r_frag = NULL;
769 if (ic->i_recv_data_rem > RDSV3_FRAG_SIZE)
770 ic->i_recv_data_rem -= RDSV3_FRAG_SIZE;
771 else {
772 ic->i_recv_data_rem = 0;
773 ic->i_ibinc = NULL;
775 if (ibinc->ii_inc.i_hdr.h_flags == RDSV3_FLAG_CONG_BITMAP)
776 rdsv3_ib_cong_recv(conn, ibinc);
777 else {
778 rdsv3_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
779 &ibinc->ii_inc, KM_NOSLEEP);
780 state->ack_next = ntohll(hdr->h_sequence);
781 state->ack_next_valid = 1;
785 * Evaluate the ACK_REQUIRED flag *after* we received
786 * the complete frame, and after bumping the next_rx
787 * sequence.
789 if (hdr->h_flags & RDSV3_FLAG_ACK_REQUIRED) {
790 rdsv3_stats_inc(s_recv_ack_required);
791 state->ack_required = 1;
794 rdsv3_inc_put(&ibinc->ii_inc);
797 RDSV3_DPRINTF4("rdsv3_ib_process_recv",
798 "Return: conn: %p recv: %p len: %d state: %p",
799 conn, recv, data_len, state);
802 void
803 rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc,
804 struct rdsv3_ib_ack_state *state)
806 struct rdsv3_connection *conn = ic->conn;
807 struct rdsv3_ib_recv_work *recv;
808 struct rdsv3_ib_work_ring *recv_ringp = &ic->i_recv_ring;
810 RDSV3_DPRINTF4("rdsv3_ib_recv_cqe_handler",
811 "rwc wc_id 0x%llx status %u byte_len %u imm_data %u\n",
812 (unsigned long long)wc->wc_id, wc->wc_status,
813 wc->wc_bytes_xfer, ntohl(wc->wc_immed_data));
815 rdsv3_ib_stats_inc(s_ib_rx_cq_event);
817 recv = &ic->i_recvs[rdsv3_ib_ring_oldest(recv_ringp)];
820 * Also process recvs in connecting state because it is possible
821 * to get a recv completion _before_ the rdmacm ESTABLISHED
822 * event is processed.
824 if (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn)) {
825 /* We expect errors as the qp is drained during shutdown */
826 if (wc->wc_status == IBT_WC_SUCCESS) {
827 rdsv3_ib_process_recv(conn, recv,
828 wc->wc_bytes_xfer, state);
829 } else {
830 RDSV3_DPRINTF2("rdsv3_ib_recv_cqe_handler",
831 "recv completion on "
832 "%u.%u.%u.%u had status %u, "
833 "disconnecting and reconnecting\n",
834 NIPQUAD(conn->c_faddr),
835 wc->wc_status);
836 rdsv3_conn_drop(conn);
840 rdsv3_ib_ring_free(recv_ringp, 1);
843 * If we ever end up with a really empty receive ring, we're
844 * in deep trouble, as the sender will definitely see RNR
845 * timeouts.
847 if (rdsv3_ib_ring_empty(recv_ringp))
848 rdsv3_ib_stats_inc(s_ib_rx_ring_empty);
850 if (rdsv3_ib_ring_low(recv_ringp)) {
851 rdsv3_af_thr_fire(ic->i_refill_rq);
856 rdsv3_ib_recv(struct rdsv3_connection *conn)
858 struct rdsv3_ib_connection *ic = conn->c_transport_data;
859 int ret = 0;
861 RDSV3_DPRINTF4("rdsv3_ib_recv", "conn %p\n", conn);
863 if (rdsv3_conn_up(conn))
864 rdsv3_ib_attempt_ack(ic);
866 RDSV3_DPRINTF4("rdsv3_ib_recv", "Return: conn: %p", conn);
868 return (ret);
871 extern int rdsv3_ib_inc_constructor(void *buf, void *arg, int kmflags);
872 extern void rdsv3_ib_inc_destructor(void *buf, void *arg);
875 rdsv3_ib_recv_init(void)
877 RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Enter");
879 rdsv3_ib_incoming_slab = kmem_cache_create("rdsv3_ib_incoming",
880 sizeof (struct rdsv3_ib_incoming), 0, rdsv3_ib_inc_constructor,
881 rdsv3_ib_inc_destructor, NULL, NULL, NULL, 0);
882 if (!rdsv3_ib_incoming_slab) {
883 RDSV3_DPRINTF2("rdsv3_ib_recv_init", "kmem_cache_create "
884 "failed");
885 return (-ENOMEM);
888 RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Return");
889 return (0);
892 void
893 rdsv3_ib_recv_exit(void)
895 RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Enter");
896 kmem_cache_destroy(rdsv3_ib_incoming_slab);
897 RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Return");