/*
 * Copyright (C) 2017, Microsoft Corporation.
 *
 * Author(s): Long Li <longli@microsoft.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 * the GNU General Public License for more details.
 */
#include <linux/module.h>
#include <linux/highmem.h>
#include "smbdirect.h"
#include "cifs_debug.h"

static struct smbd_response *get_empty_queue_buffer(
		struct smbd_connection *info);
static struct smbd_response *get_receive_buffer(
		struct smbd_connection *info);
static void put_receive_buffer(
		struct smbd_connection *info,
		struct smbd_response *response);
static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
static void destroy_receive_buffers(struct smbd_connection *info);

static void put_empty_packet(
		struct smbd_connection *info, struct smbd_response *response);
static void enqueue_reassembly(
		struct smbd_connection *info,
		struct smbd_response *response, int data_length);
static struct smbd_response *_get_first_reassembly(
		struct smbd_connection *info);

static int smbd_post_recv(
		struct smbd_connection *info,
		struct smbd_response *response);

static int smbd_post_send_empty(struct smbd_connection *info);
static int smbd_post_send_data(
		struct smbd_connection *info,
		struct kvec *iov, int n_vec, int remaining_data_length);
static int smbd_post_send_page(struct smbd_connection *info,
		struct page *page, unsigned long offset,
		size_t size, int remaining_data_length);

static void destroy_mr_list(struct smbd_connection *info);
static int allocate_mr_list(struct smbd_connection *info);

/* SMBD version number */
#define SMBD_V1	0x0100

/* Port numbers for SMBD transport */
#define SMB_PORT	445
#define SMBD_PORT	5445

/* Address lookup and resolve timeout in ms */
#define RDMA_RESOLVE_TIMEOUT	5000

/* SMBD negotiation timeout in seconds */
#define SMBD_NEGOTIATE_TIMEOUT	120

/* SMBD minimum receive size and fragmented size defined in [MS-SMBD] */
#define SMBD_MIN_RECEIVE_SIZE		128
#define SMBD_MIN_FRAGMENTED_SIZE	131072

/*
 * Default maximum number of RDMA read/write outstanding on this connection
 * This value is possibly decreased during QP creation on hardware limit
 */
#define SMBD_CM_RESPONDER_RESOURCES	32

/* Maximum number of retries on data transfer operations */
#define SMBD_CM_RETRY			6
/* No need to retry on Receiver Not Ready since SMBD manages credits */
#define SMBD_CM_RNR_RETRY		0
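
/*
 * Note on the two ports above: smbd_get_connection() below first attempts
 * SMBD_PORT and, if that connection fails, retries on SMB_PORT, so a server
 * listening for SMB Direct on either port can be reached.
 */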

/*
 * User configurable initial values per SMBD transport connection
 * as defined in [MS-SMBD] 3.1.1.1
 * Those may change after a SMBD negotiation
 */
/* The local peer's maximum number of credits to grant to the peer */
int smbd_receive_credit_max = 255;

/* The remote peer's credit request of local peer */
int smbd_send_credit_target = 255;

/* The maximum single-message size that can be sent to the remote peer */
int smbd_max_send_size = 1364;

/* The maximum fragmented upper-layer payload receive size supported */
int smbd_max_fragmented_recv_size = 1024 * 1024;

/* The maximum single-message size that can be received */
int smbd_max_receive_size = 8192;

/* The timeout to initiate send of a keepalive message on idle */
int smbd_keep_alive_interval = 120;

/*
 * User configurable initial values for RDMA transport
 * The actual values used may be lower and are limited to hardware capabilities
 */
/* Default maximum number of SGEs in a RDMA write/read */
int smbd_max_frmr_depth = 2048;

/* If the payload is smaller than this threshold (in bytes), use RDMA
 * send/recv instead of RDMA read/write
 */
int rdma_readwrite_threshold = 4096;

/* Transport logging functions
 * Logging is defined as classes. They can be OR'ed to define the actual
 * logging level via the module parameter smbd_logging_class
 * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
 * log_rdma_event()
 */
#define LOG_OUTGOING			0x1
#define LOG_INCOMING			0x2
#define LOG_READ			0x4
#define LOG_WRITE			0x8
#define LOG_RDMA_SEND			0x10
#define LOG_RDMA_RECV			0x20
#define LOG_KEEP_ALIVE			0x40
#define LOG_RDMA_EVENT			0x80
#define LOG_RDMA_MR			0x100
static unsigned int smbd_logging_class;
module_param(smbd_logging_class, uint, 0644);
MODULE_PARM_DESC(smbd_logging_class,
	"Logging class for SMBD transport 0x0 to 0x100");

#define ERR		0x0
#define INFO		0x1
static unsigned int smbd_logging_level = ERR;
module_param(smbd_logging_level, uint, 0644);
MODULE_PARM_DESC(smbd_logging_level,
	"Logging level for SMBD transport, 0 (default): error, 1: info");

#define log_rdma(level, class, fmt, args...)				\
do {									\
	if (level <= smbd_logging_level || class & smbd_logging_class) \
		cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
} while (0)

#define log_outgoing(level, fmt, args...) \
		log_rdma(level, LOG_OUTGOING, fmt, ##args)
#define log_incoming(level, fmt, args...) \
		log_rdma(level, LOG_INCOMING, fmt, ##args)
#define log_read(level, fmt, args...)	log_rdma(level, LOG_READ, fmt, ##args)
#define log_write(level, fmt, args...)	log_rdma(level, LOG_WRITE, fmt, ##args)
#define log_rdma_send(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
#define log_rdma_recv(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
#define log_keep_alive(level, fmt, args...) \
		log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
#define log_rdma_event(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
#define log_rdma_mr(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_MR, fmt, ##args)
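
/*
 * Example of combining the class bits above: loading the module with
 * smbd_logging_class=0x30 ORs LOG_RDMA_SEND (0x10) and LOG_RDMA_RECV (0x20),
 * so every log_rdma_send() and log_rdma_recv() message is emitted regardless
 * of smbd_logging_level, because a class match alone satisfies log_rdma().
 */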

/*
 * Destroy the transport and related RDMA and memory resources
 * Need to go through all the pending counters and make sure no one is using
 * the transport while it is destroyed
 */
static void smbd_destroy_rdma_work(struct work_struct *work)
{
	struct smbd_response *response;
	struct smbd_connection *info =
		container_of(work, struct smbd_connection, destroy_work);
	unsigned long flags;

	log_rdma_event(INFO, "destroying qp\n");
	ib_drain_qp(info->id->qp);
	rdma_destroy_qp(info->id);

	/* Unblock all I/O waiting on the send queue */
	wake_up_interruptible_all(&info->wait_send_queue);

	log_rdma_event(INFO, "cancelling idle timer\n");
	cancel_delayed_work_sync(&info->idle_timer_work);
	log_rdma_event(INFO, "cancelling send immediate work\n");
	cancel_delayed_work_sync(&info->send_immediate_work);

	log_rdma_event(INFO, "wait for all send to finish\n");
	wait_event(info->wait_smbd_send_pending,
		info->smbd_send_pending == 0);

	log_rdma_event(INFO, "wait for all recv to finish\n");
	wake_up_interruptible(&info->wait_reassembly_queue);
	wait_event(info->wait_smbd_recv_pending,
		info->smbd_recv_pending == 0);

	log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
	wait_event(info->wait_send_pending,
		atomic_read(&info->send_pending) == 0);
	wait_event(info->wait_send_payload_pending,
		atomic_read(&info->send_payload_pending) == 0);

	log_rdma_event(INFO, "freeing mr list\n");
	wake_up_interruptible_all(&info->wait_mr);
	wait_event(info->wait_for_mr_cleanup,
		atomic_read(&info->mr_used_count) == 0);
	destroy_mr_list(info);

	/* It's not possible for upper layer to get to reassembly */
	log_rdma_event(INFO, "drain the reassembly queue\n");
	do {
		spin_lock_irqsave(&info->reassembly_queue_lock, flags);
		response = _get_first_reassembly(info);
		if (response) {
			list_del(&response->list);
			spin_unlock_irqrestore(
				&info->reassembly_queue_lock, flags);
			put_receive_buffer(info, response);
		} else
			spin_unlock_irqrestore(&info->reassembly_queue_lock,
				flags);
	} while (response);

	info->reassembly_data_length = 0;

	log_rdma_event(INFO, "free receive buffers\n");
	wait_event(info->wait_receive_queues,
		info->count_receive_queue + info->count_empty_packet_queue
			== info->receive_credit_max);
	destroy_receive_buffers(info);

	ib_free_cq(info->send_cq);
	ib_free_cq(info->recv_cq);
	ib_dealloc_pd(info->pd);
	rdma_destroy_id(info->id);

	/* free mempools */
	mempool_destroy(info->request_mempool);
	kmem_cache_destroy(info->request_cache);

	mempool_destroy(info->response_mempool);
	kmem_cache_destroy(info->response_cache);

	info->transport_status = SMBD_DESTROYED;
	wake_up_all(&info->wait_destroy);
}

static int smbd_process_disconnected(struct smbd_connection *info)
{
	schedule_work(&info->destroy_work);
	return 0;
}

static void smbd_disconnect_rdma_work(struct work_struct *work)
{
	struct smbd_connection *info =
		container_of(work, struct smbd_connection, disconnect_work);

	if (info->transport_status == SMBD_CONNECTED) {
		info->transport_status = SMBD_DISCONNECTING;
		rdma_disconnect(info->id);
	}
}

static void smbd_disconnect_rdma_connection(struct smbd_connection *info)
{
	queue_work(info->workqueue, &info->disconnect_work);
}

/* Upcall from RDMA CM */
static int smbd_conn_upcall(
		struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct smbd_connection *info = id->context;

	log_rdma_event(INFO, "event=%d status=%d\n",
		event->event, event->status);

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		info->ri_rc = 0;
		complete(&info->ri_done);
		break;

	case RDMA_CM_EVENT_ADDR_ERROR:
		info->ri_rc = -EHOSTUNREACH;
		complete(&info->ri_done);
		break;

	case RDMA_CM_EVENT_ROUTE_ERROR:
		info->ri_rc = -ENETUNREACH;
		complete(&info->ri_done);
		break;

	case RDMA_CM_EVENT_ESTABLISHED:
		log_rdma_event(INFO, "connected event=%d\n", event->event);
		info->transport_status = SMBD_CONNECTED;
		wake_up_interruptible(&info->conn_wait);
		break;

	case RDMA_CM_EVENT_CONNECT_ERROR:
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_REJECTED:
		log_rdma_event(INFO, "connecting failed event=%d\n",
			event->event);
		info->transport_status = SMBD_DISCONNECTED;
		wake_up_interruptible(&info->conn_wait);
		break;

	case RDMA_CM_EVENT_DEVICE_REMOVAL:
	case RDMA_CM_EVENT_DISCONNECTED:
		/* This happens when we fail the negotiation */
		if (info->transport_status == SMBD_NEGOTIATE_FAILED) {
			info->transport_status = SMBD_DISCONNECTED;
			wake_up(&info->conn_wait);
			break;
		}

		info->transport_status = SMBD_DISCONNECTED;
		smbd_process_disconnected(info);
		break;

	default:
		break;
	}

	return 0;
}

/* Upcall from RDMA QP */
static void
smbd_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct smbd_connection *info = context;

	log_rdma_event(ERR, "%s on device %s info %p\n",
		ib_event_msg(event->event), event->device->name, info);

	switch (event->event) {
	case IB_EVENT_CQ_ERR:
	case IB_EVENT_QP_FATAL:
		smbd_disconnect_rdma_connection(info);
		break;

	default:
		break;
	}
}

static inline void *smbd_request_payload(struct smbd_request *request)
{
	return (void *)request->packet;
}

static inline void *smbd_response_payload(struct smbd_response *response)
{
	return (void *)response->packet;
}

/* Called when a RDMA send is done */
static void send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	int i;
	struct smbd_request *request =
		container_of(wc->wr_cqe, struct smbd_request, cqe);

	log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n",
		request, wc->status);

	if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
		log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n",
			wc->status, wc->opcode);
		smbd_disconnect_rdma_connection(request->info);
	}

	for (i = 0; i < request->num_sge; i++)
		ib_dma_unmap_single(request->info->id->device,
			request->sge[i].addr,
			request->sge[i].length,
			DMA_TO_DEVICE);

	if (request->has_payload) {
		if (atomic_dec_and_test(&request->info->send_payload_pending))
			wake_up(&request->info->wait_send_payload_pending);
	} else {
		if (atomic_dec_and_test(&request->info->send_pending))
			wake_up(&request->info->wait_send_pending);
	}

	mempool_free(request, request->info->request_mempool);
}
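
/*
 * Two separate counters are kept for in-flight sends: send_pending counts
 * header-only/control sends and send_payload_pending counts sends carrying
 * upper-layer payload. send_done() above decrements whichever counter the
 * request was charged to, and smbd_destroy_rdma_work() waits on both before
 * tearing down the queue pair.
 */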

static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp)
{
	log_rdma_event(INFO, "resp message min_version %u max_version %u "
		"negotiated_version %u credits_requested %u "
		"credits_granted %u status %u max_readwrite_size %u "
		"preferred_send_size %u max_receive_size %u "
		"max_fragmented_size %u\n",
		resp->min_version, resp->max_version, resp->negotiated_version,
		resp->credits_requested, resp->credits_granted, resp->status,
		resp->max_readwrite_size, resp->preferred_send_size,
		resp->max_receive_size, resp->max_fragmented_size);
}

/*
 * Process a negotiation response message, according to [MS-SMBD] 3.1.5.7
 * response, packet_length: the negotiation response message
 * return value: true if negotiation is a success, false if failed
 */
static bool process_negotiation_response(
		struct smbd_response *response, int packet_length)
{
	struct smbd_connection *info = response->info;
	struct smbd_negotiate_resp *packet = smbd_response_payload(response);

	if (packet_length < sizeof(struct smbd_negotiate_resp)) {
		log_rdma_event(ERR,
			"error: packet_length=%d\n", packet_length);
		return false;
	}

	if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) {
		log_rdma_event(ERR, "error: negotiated_version=%x\n",
			le16_to_cpu(packet->negotiated_version));
		return false;
	}
	info->protocol = le16_to_cpu(packet->negotiated_version);

	if (packet->credits_requested == 0) {
		log_rdma_event(ERR, "error: credits_requested==0\n");
		return false;
	}
	info->receive_credit_target = le16_to_cpu(packet->credits_requested);

	if (packet->credits_granted == 0) {
		log_rdma_event(ERR, "error: credits_granted==0\n");
		return false;
	}
	atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted));

	atomic_set(&info->receive_credits, 0);

	if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) {
		log_rdma_event(ERR, "error: preferred_send_size=%d\n",
			le32_to_cpu(packet->preferred_send_size));
		return false;
	}
	info->max_receive_size = le32_to_cpu(packet->preferred_send_size);

	if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
		log_rdma_event(ERR, "error: max_receive_size=%d\n",
			le32_to_cpu(packet->max_receive_size));
		return false;
	}
	info->max_send_size = min_t(int, info->max_send_size,
		le32_to_cpu(packet->max_receive_size));

	if (le32_to_cpu(packet->max_fragmented_size) <
			SMBD_MIN_FRAGMENTED_SIZE) {
		log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
			le32_to_cpu(packet->max_fragmented_size));
		return false;
	}
	info->max_fragmented_send_size =
		le32_to_cpu(packet->max_fragmented_size);
	info->rdma_readwrite_threshold =
		rdma_readwrite_threshold > info->max_fragmented_send_size ?
		info->max_fragmented_send_size :
		rdma_readwrite_threshold;

	info->max_readwrite_size = min_t(u32,
			le32_to_cpu(packet->max_readwrite_size),
			info->max_frmr_depth * PAGE_SIZE);
	info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE;

	return true;
}
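
/*
 * Worked example of the limits computed above, assuming the default module
 * parameters, a 4K PAGE_SIZE and a device that supports the requested FRMR
 * depth: if the peer advertises max_readwrite_size of 1MB, then with
 * smbd_max_frmr_depth = 2048 the cap is min(1MB, 2048 * 4096 = 8MB) = 1MB,
 * and max_frmr_depth is then trimmed to 1MB / 4096 = 256 pages per memory
 * registration.
 */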

/*
 * Check and schedule to send an immediate packet
 * This is used to extend credits to remote peer to keep the transport busy
 */
static void check_and_send_immediate(struct smbd_connection *info)
{
	if (info->transport_status != SMBD_CONNECTED)
		return;

	info->send_immediate = true;

	/*
	 * Promptly send a packet if our peer is running low on receive
	 * credits
	 */
	if (atomic_read(&info->receive_credits) <
		info->receive_credit_target - 1)
		queue_delayed_work(
			info->workqueue, &info->send_immediate_work, 0);
}

static void smbd_post_send_credits(struct work_struct *work)
{
	int ret = 0;
	int use_receive_queue = 1;
	int rc;
	struct smbd_response *response;
	struct smbd_connection *info =
		container_of(work, struct smbd_connection,
			post_send_credits_work);

	if (info->transport_status != SMBD_CONNECTED) {
		wake_up(&info->wait_receive_queues);
		return;
	}

	if (info->receive_credit_target >
		atomic_read(&info->receive_credits)) {
		while (true) {
			if (use_receive_queue)
				response = get_receive_buffer(info);
			else
				response = get_empty_queue_buffer(info);
			if (!response) {
				/* now switch to empty packet queue */
				if (use_receive_queue) {
					use_receive_queue = 0;
					continue;
				} else
					break;
			}

			response->type = SMBD_TRANSFER_DATA;
			response->first_segment = false;
			rc = smbd_post_recv(info, response);
			if (rc) {
				log_rdma_recv(ERR,
					"post_recv failed rc=%d\n", rc);
				put_receive_buffer(info, response);
				break;
			}

			ret++;
		}
	}

	spin_lock(&info->lock_new_credits_offered);
	info->new_credits_offered += ret;
	spin_unlock(&info->lock_new_credits_offered);

	atomic_add(ret, &info->receive_credits);

	/* Check if we can post new receive and grant credits to peer */
	check_and_send_immediate(info);
}
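
/*
 * Credit flow on the receive side: every receive buffer successfully posted
 * above is accumulated in new_credits_offered; manage_credits_prior_sending()
 * later folds that count into the credits_granted field of the next outgoing
 * packet header, which is what actually extends the credits to the peer.
 */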

static void smbd_recv_done_work(struct work_struct *work)
{
	struct smbd_connection *info =
		container_of(work, struct smbd_connection, recv_done_work);

	/*
	 * We may have new send credits granted from remote peer
	 * If any sender is blocked on lack of credits, unblock it
	 */
	if (atomic_read(&info->send_credits))
		wake_up_interruptible(&info->wait_send_queue);

	/*
	 * Check if we need to send something to remote peer to
	 * grant more credits or respond to KEEP_ALIVE packet
	 */
	check_and_send_immediate(info);
}

/* Called from softirq, when recv is done */
static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbd_data_transfer *data_transfer;
	struct smbd_response *response =
		container_of(wc->wr_cqe, struct smbd_response, cqe);
	struct smbd_connection *info = response->info;
	int data_length = 0;

	log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d "
		"byte_len=%d pkey_index=%x\n",
		response, response->type, wc->status, wc->opcode,
		wc->byte_len, wc->pkey_index);

	if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
		log_rdma_recv(INFO, "wc->status=%d opcode=%d\n",
			wc->status, wc->opcode);
		smbd_disconnect_rdma_connection(info);
		goto error;
	}

	ib_dma_sync_single_for_cpu(
		wc->qp->device,
		response->sge.addr,
		response->sge.length,
		DMA_FROM_DEVICE);

	switch (response->type) {
	/* SMBD negotiation response */
	case SMBD_NEGOTIATE_RESP:
		dump_smbd_negotiate_resp(smbd_response_payload(response));
		info->full_packet_received = true;
		info->negotiate_done =
			process_negotiation_response(response, wc->byte_len);
		complete(&info->negotiate_completion);
		break;

	/* SMBD data transfer packet */
	case SMBD_TRANSFER_DATA:
		data_transfer = smbd_response_payload(response);
		data_length = le32_to_cpu(data_transfer->data_length);

		/*
		 * If this is a packet with data payload place the data in
		 * reassembly queue and wake up the reading thread
		 */
		if (data_length) {
			if (info->full_packet_received)
				response->first_segment = true;

			if (le32_to_cpu(data_transfer->remaining_data_length))
				info->full_packet_received = false;
			else
				info->full_packet_received = true;

			enqueue_reassembly(
				info,
				response,
				data_length);
		} else
			put_empty_packet(info, response);

		if (data_length)
			wake_up_interruptible(&info->wait_reassembly_queue);

		atomic_dec(&info->receive_credits);
		info->receive_credit_target =
			le16_to_cpu(data_transfer->credits_requested);
		atomic_add(le16_to_cpu(data_transfer->credits_granted),
			&info->send_credits);

		log_incoming(INFO, "data flags %d data_offset %d "
			"data_length %d remaining_data_length %d\n",
			le16_to_cpu(data_transfer->flags),
			le32_to_cpu(data_transfer->data_offset),
			le32_to_cpu(data_transfer->data_length),
			le32_to_cpu(data_transfer->remaining_data_length));

		/* Send a KEEP_ALIVE response right away if requested */
		info->keep_alive_requested = KEEP_ALIVE_NONE;
		if (le16_to_cpu(data_transfer->flags) &
				SMB_DIRECT_RESPONSE_REQUESTED) {
			info->keep_alive_requested = KEEP_ALIVE_PENDING;
		}

		queue_work(info->workqueue, &info->recv_done_work);
		return;

	default:
		log_rdma_recv(ERR,
			"unexpected response type=%d\n", response->type);
	}

error:
	put_receive_buffer(info, response);
}

static struct rdma_cm_id *smbd_create_id(
		struct smbd_connection *info,
		struct sockaddr *dstaddr, int port)
{
	struct rdma_cm_id *id;
	int rc;
	__be16 *sport;

	id = rdma_create_id(&init_net, smbd_conn_upcall, info,
		RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
		return id;
	}

	if (dstaddr->sa_family == AF_INET6)
		sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
	else
		sport = &((struct sockaddr_in *)dstaddr)->sin_port;

	*sport = htons(port);

	init_completion(&info->ri_done);
	info->ri_rc = -ETIMEDOUT;

	rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
		RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(
		&info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
	rc = info->ri_rc;
	if (rc) {
		log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
		goto out;
	}

	info->ri_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(
		&info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
	rc = info->ri_rc;
	if (rc) {
		log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
		goto out;
	}

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Test if FRWR (Fast Registration Work Requests) is supported on the device
 * This implementation requires FRWR on RDMA read/write
 * return value: true if it is supported
 */
static bool frwr_is_supported(struct ib_device_attr *attrs)
{
	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
		return false;
	if (attrs->max_fast_reg_page_list_len == 0)
		return false;
	return true;
}

static int smbd_ia_open(
		struct smbd_connection *info,
		struct sockaddr *dstaddr, int port)
{
	int rc;

	info->id = smbd_create_id(info, dstaddr, port);
	if (IS_ERR(info->id)) {
		rc = PTR_ERR(info->id);
		goto out1;
	}

	if (!frwr_is_supported(&info->id->device->attrs)) {
		log_rdma_event(ERR,
			"Fast Registration Work Requests "
			"(FRWR) is not supported\n");
		log_rdma_event(ERR,
			"Device capability flags = %llx "
			"max_fast_reg_page_list_len = %u\n",
			info->id->device->attrs.device_cap_flags,
			info->id->device->attrs.max_fast_reg_page_list_len);
		rc = -EPROTONOSUPPORT;
		goto out2;
	}
	info->max_frmr_depth = min_t(int,
		smbd_max_frmr_depth,
		info->id->device->attrs.max_fast_reg_page_list_len);
	info->mr_type = IB_MR_TYPE_MEM_REG;
	if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
		info->mr_type = IB_MR_TYPE_SG_GAPS;

	info->pd = ib_alloc_pd(info->id->device, 0);
	if (IS_ERR(info->pd)) {
		rc = PTR_ERR(info->pd);
		log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
		goto out2;
	}

	return 0;

out2:
	rdma_destroy_id(info->id);

out1:
	return rc;
}

/*
 * Send a negotiation request message to the peer
 * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
 * After negotiation, the transport is connected and ready for
 * carrying upper layer SMB payload
 */
static int smbd_post_send_negotiate_req(struct smbd_connection *info)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	int rc = -ENOMEM;
	struct smbd_request *request;
	struct smbd_negotiate_req *packet;

	request = mempool_alloc(info->request_mempool, GFP_KERNEL);
	if (!request)
		return rc;

	request->info = info;

	packet = smbd_request_payload(request);
	packet->min_version = cpu_to_le16(SMBD_V1);
	packet->max_version = cpu_to_le16(SMBD_V1);
	packet->reserved = 0;
	packet->credits_requested = cpu_to_le16(info->send_credit_target);
	packet->preferred_send_size = cpu_to_le32(info->max_send_size);
	packet->max_receive_size = cpu_to_le32(info->max_receive_size);
	packet->max_fragmented_size =
		cpu_to_le32(info->max_fragmented_recv_size);

	request->num_sge = 1;
	request->sge[0].addr = ib_dma_map_single(
				info->id->device, (void *)packet,
				sizeof(*packet), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
		rc = -EIO;
		goto dma_mapping_failed;
	}

	request->sge[0].length = sizeof(*packet);
	request->sge[0].lkey = info->pd->local_dma_lkey;

	ib_dma_sync_single_for_device(
		info->id->device, request->sge[0].addr,
		request->sge[0].length, DMA_TO_DEVICE);

	request->cqe.done = send_done;

	send_wr.next = NULL;
	send_wr.wr_cqe = &request->cqe;
	send_wr.sg_list = request->sge;
	send_wr.num_sge = request->num_sge;
	send_wr.opcode = IB_WR_SEND;
	send_wr.send_flags = IB_SEND_SIGNALED;

	log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n",
		request->sge[0].addr,
		request->sge[0].length, request->sge[0].lkey);

	request->has_payload = false;
	atomic_inc(&info->send_pending);
	rc = ib_post_send(info->id->qp, &send_wr, &send_wr_fail);
	if (!rc)
		return 0;

	/* if we reach here, post send failed */
	log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
	atomic_dec(&info->send_pending);
	ib_dma_unmap_single(info->id->device, request->sge[0].addr,
		request->sge[0].length, DMA_TO_DEVICE);

dma_mapping_failed:
	mempool_free(request, info->request_mempool);
	return rc;
}

/*
 * Extend the credits to remote peer
 * This implements [MS-SMBD] 3.1.5.9
 * The idea is that we should extend credits to remote peer as quickly as
 * it's allowed, to maintain data flow. We allocate as much receive
 * buffer as possible, and extend the receive credits to remote peer
 * return value: the new credits being granted.
 */
static int manage_credits_prior_sending(struct smbd_connection *info)
{
	int new_credits;

	spin_lock(&info->lock_new_credits_offered);
	new_credits = info->new_credits_offered;
	info->new_credits_offered = 0;
	spin_unlock(&info->lock_new_credits_offered);

	return new_credits;
}

/*
 * Check if we need to send a KEEP_ALIVE message
 * The idle connection timer triggers a KEEP_ALIVE message when it expires
 * SMB_DIRECT_RESPONSE_REQUESTED is set in the message flag to have peer send
 * back a response
 * return value:
 * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set
 * 0: otherwise
 */
static int manage_keep_alive_before_sending(struct smbd_connection *info)
{
	if (info->keep_alive_requested == KEEP_ALIVE_PENDING) {
		info->keep_alive_requested = KEEP_ALIVE_SENT;
		return 1;
	}
	return 0;
}

/*
 * Build and prepare the SMBD packet header
 * This function waits for available send credits and builds a SMBD packet
 * header. The caller can then optionally append a payload to the packet
 * after the header
 * input values
 * size: the size of the payload
 * remaining_data_length: remaining data to send if this is part of a
 * fragmented packet
 * output values
 * request_out: the request allocated from this function
 * return values: 0 on success, otherwise actual error code returned
 */
static int smbd_create_header(struct smbd_connection *info,
		int size, int remaining_data_length,
		struct smbd_request **request_out)
{
	struct smbd_request *request;
	struct smbd_data_transfer *packet;
	int header_length;
	int rc;

	/* Wait for send credits. A SMBD packet needs one credit */
	rc = wait_event_interruptible(info->wait_send_queue,
		atomic_read(&info->send_credits) > 0 ||
		info->transport_status != SMBD_CONNECTED);
	if (rc)
		return rc;

	if (info->transport_status != SMBD_CONNECTED) {
		log_outgoing(ERR, "disconnected not sending\n");
		return -ENOENT;
	}
	atomic_dec(&info->send_credits);

	request = mempool_alloc(info->request_mempool, GFP_KERNEL);
	if (!request) {
		rc = -ENOMEM;
		goto err;
	}

	request->info = info;

	/* Fill in the packet header */
	packet = smbd_request_payload(request);
	packet->credits_requested = cpu_to_le16(info->send_credit_target);
	packet->credits_granted =
		cpu_to_le16(manage_credits_prior_sending(info));
	info->send_immediate = false;

	packet->flags = 0;
	if (manage_keep_alive_before_sending(info))
		packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);

	packet->reserved = 0;
	if (!size)
		packet->data_offset = 0;
	else
		packet->data_offset = cpu_to_le32(24);
	packet->data_length = cpu_to_le32(size);
	packet->remaining_data_length = cpu_to_le32(remaining_data_length);
	packet->padding = 0;

	log_outgoing(INFO, "credits_requested=%d credits_granted=%d "
		"data_offset=%d data_length=%d remaining_data_length=%d\n",
		le16_to_cpu(packet->credits_requested),
		le16_to_cpu(packet->credits_granted),
		le32_to_cpu(packet->data_offset),
		le32_to_cpu(packet->data_length),
		le32_to_cpu(packet->remaining_data_length));

	/* Map the packet to DMA */
	header_length = sizeof(struct smbd_data_transfer);
	/* If this is a packet without payload, don't send padding */
	if (!size)
		header_length = offsetof(struct smbd_data_transfer, padding);

	request->num_sge = 1;
	request->sge[0].addr = ib_dma_map_single(info->id->device,
						 (void *)packet,
						 header_length,
						 DMA_TO_DEVICE);
	if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
		mempool_free(request, info->request_mempool);
		rc = -EIO;
		goto err;
	}

	request->sge[0].length = header_length;
	request->sge[0].lkey = info->pd->local_dma_lkey;

	*request_out = request;
	return 0;

err:
	atomic_inc(&info->send_credits);
	return rc;
}
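
/*
 * The data_offset of 24 used above corresponds to the size of the SMB Direct
 * data transfer header including its padding field (the header proper is
 * offsetof(struct smbd_data_transfer, padding) bytes, see smbdirect.h);
 * [MS-SMBD] requires the payload to start 8-byte aligned, which offset 24
 * satisfies.
 */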

static void smbd_destroy_header(struct smbd_connection *info,
		struct smbd_request *request)
{
	ib_dma_unmap_single(info->id->device,
			    request->sge[0].addr,
			    request->sge[0].length,
			    DMA_TO_DEVICE);
	mempool_free(request, info->request_mempool);
	atomic_inc(&info->send_credits);
}

/* Post the send request */
static int smbd_post_send(struct smbd_connection *info,
		struct smbd_request *request, bool has_payload)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	int rc, i;

	for (i = 0; i < request->num_sge; i++) {
		log_rdma_send(INFO,
			"rdma_request sge[%d] addr=%llu length=%u\n",
			i, request->sge[i].addr, request->sge[i].length);
		ib_dma_sync_single_for_device(
			info->id->device,
			request->sge[i].addr,
			request->sge[i].length,
			DMA_TO_DEVICE);
	}

	request->cqe.done = send_done;

	send_wr.next = NULL;
	send_wr.wr_cqe = &request->cqe;
	send_wr.sg_list = request->sge;
	send_wr.num_sge = request->num_sge;
	send_wr.opcode = IB_WR_SEND;
	send_wr.send_flags = IB_SEND_SIGNALED;

	if (has_payload) {
		request->has_payload = true;
		atomic_inc(&info->send_payload_pending);
	} else {
		request->has_payload = false;
		atomic_inc(&info->send_pending);
	}

	rc = ib_post_send(info->id->qp, &send_wr, &send_wr_fail);
	if (rc) {
		log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
		if (has_payload) {
			if (atomic_dec_and_test(&info->send_payload_pending))
				wake_up(&info->wait_send_payload_pending);
		} else {
			if (atomic_dec_and_test(&info->send_pending))
				wake_up(&info->wait_send_pending);
		}
	} else
		/* Reset timer for idle connection after packet is sent */
		mod_delayed_work(info->workqueue, &info->idle_timer_work,
			info->keep_alive_interval*HZ);

	return rc;
}
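
/*
 * Send path layering: callers such as smbd_send() use smbd_post_send_data()
 * and smbd_post_send_page(), which build a scatterlist and hand it to
 * smbd_post_send_sgl() below; that in turn obtains a header and a send
 * credit from smbd_create_header() and posts the work request through
 * smbd_post_send().
 */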

static int smbd_post_send_sgl(struct smbd_connection *info,
	struct scatterlist *sgl, int data_length, int remaining_data_length)
{
	int num_sgs;
	int i, rc;
	struct smbd_request *request;
	struct scatterlist *sg;

	rc = smbd_create_header(
		info, data_length, remaining_data_length, &request);
	if (rc)
		return rc;

	num_sgs = sgl ? sg_nents(sgl) : 0;
	for_each_sg(sgl, sg, num_sgs, i) {
		request->sge[i+1].addr =
			ib_dma_map_page(info->id->device, sg_page(sg),
			       sg->offset, sg->length, DMA_BIDIRECTIONAL);
		if (ib_dma_mapping_error(
				info->id->device, request->sge[i+1].addr)) {
			rc = -EIO;
			request->sge[i+1].addr = 0;
			goto dma_mapping_failure;
		}
		request->sge[i+1].length = sg->length;
		request->sge[i+1].lkey = info->pd->local_dma_lkey;
		request->num_sge++;
	}

	rc = smbd_post_send(info, request, data_length);
	if (!rc)
		return 0;

dma_mapping_failure:
	for (i = 1; i < request->num_sge; i++)
		if (request->sge[i].addr)
			ib_dma_unmap_single(info->id->device,
					    request->sge[i].addr,
					    request->sge[i].length,
					    DMA_TO_DEVICE);
	smbd_destroy_header(info, request);
	return rc;
}

/*
 * Send a page
 * page: the page to send
 * offset: offset in the page to send
 * size: length in the page to send
 * remaining_data_length: remaining data to send in this payload
 */
static int smbd_post_send_page(struct smbd_connection *info, struct page *page,
		unsigned long offset, size_t size, int remaining_data_length)
{
	struct scatterlist sgl;

	sg_init_table(&sgl, 1);
	sg_set_page(&sgl, page, size, offset);

	return smbd_post_send_sgl(info, &sgl, size, remaining_data_length);
}

/*
 * Send an empty message
 * An empty message is used to extend credits to the peer and to keep the
 * connection alive while there is no upper layer payload to send at the time
 */
static int smbd_post_send_empty(struct smbd_connection *info)
{
	info->count_send_empty++;
	return smbd_post_send_sgl(info, NULL, 0, 0);
}

/*
 * Send a data buffer
 * iov: the iov array describing the data buffers
 * n_vec: number of iov array
 * remaining_data_length: remaining data to send following this packet
 * in segmented SMBD packet
 */
static int smbd_post_send_data(
	struct smbd_connection *info, struct kvec *iov, int n_vec,
	int remaining_data_length)
{
	int i;
	u32 data_length = 0;
	struct scatterlist sgl[SMBDIRECT_MAX_SGE];

	if (n_vec > SMBDIRECT_MAX_SGE) {
		cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
		return -ENOMEM;
	}

	sg_init_table(sgl, n_vec);
	for (i = 0; i < n_vec; i++) {
		data_length += iov[i].iov_len;
		sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len);
	}

	return smbd_post_send_sgl(info, sgl, data_length,
		remaining_data_length);
}

/*
 * Post a receive request to the transport
 * The remote peer can only send data when a receive request is posted
 * The interaction is controlled by send/receive credit system
 */
static int smbd_post_recv(
		struct smbd_connection *info, struct smbd_response *response)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail = NULL;
	int rc = -EIO;

	response->sge.addr = ib_dma_map_single(
				info->id->device, response->packet,
				info->max_receive_size, DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(info->id->device, response->sge.addr))
		return rc;

	response->sge.length = info->max_receive_size;
	response->sge.lkey = info->pd->local_dma_lkey;

	response->cqe.done = recv_done;

	recv_wr.wr_cqe = &response->cqe;
	recv_wr.next = NULL;
	recv_wr.sg_list = &response->sge;
	recv_wr.num_sge = 1;

	rc = ib_post_recv(info->id->qp, &recv_wr, &recv_wr_fail);
	if (rc) {
		ib_dma_unmap_single(info->id->device, response->sge.addr,
				    response->sge.length, DMA_FROM_DEVICE);
		log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
	}

	return rc;
}

/* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
static int smbd_negotiate(struct smbd_connection *info)
{
	int rc;
	struct smbd_response *response = get_receive_buffer(info);

	response->type = SMBD_NEGOTIATE_RESP;
	rc = smbd_post_recv(info, response);
	log_rdma_event(INFO,
		"smbd_post_recv rc=%d iov.addr=%llx iov.length=%x "
		"iov.lkey=%x\n",
		rc, response->sge.addr,
		response->sge.length, response->sge.lkey);
	if (rc)
		return rc;

	init_completion(&info->negotiate_completion);
	info->negotiate_done = false;
	rc = smbd_post_send_negotiate_req(info);
	if (rc)
		return rc;

	rc = wait_for_completion_interruptible_timeout(
		&info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ);
	log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc);

	if (info->negotiate_done)
		return 0;

	if (rc == 0)
		rc = -ETIMEDOUT;
	else if (rc == -ERESTARTSYS)
		rc = -EINTR;
	else
		rc = -ENOTCONN;

	return rc;
}
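
/*
 * Negotiation sequence implemented above: one receive is posted for the
 * negotiate response, the negotiate request is sent, and the caller then
 * blocks until recv_done() processes the response (which completes
 * negotiate_completion) or SMBD_NEGOTIATE_TIMEOUT expires.
 */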

static void put_empty_packet(
		struct smbd_connection *info, struct smbd_response *response)
{
	spin_lock(&info->empty_packet_queue_lock);
	list_add_tail(&response->list, &info->empty_packet_queue);
	info->count_empty_packet_queue++;
	spin_unlock(&info->empty_packet_queue_lock);

	queue_work(info->workqueue, &info->post_send_credits_work);
}

/*
 * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
 * This is a queue for reassembling upper layer payload and presenting it to
 * the upper layer. All the incoming payload goes to the reassembly queue,
 * regardless of if reassembly is required. The upper layer code reads from
 * the queue for all incoming payloads.
 * Put a received packet to the reassembly queue
 * response: the packet received
 * data_length: the size of payload in this packet
 */
static void enqueue_reassembly(
	struct smbd_connection *info,
	struct smbd_response *response,
	int data_length)
{
	spin_lock(&info->reassembly_queue_lock);
	list_add_tail(&response->list, &info->reassembly_queue);
	info->reassembly_queue_length++;
	/*
	 * Make sure reassembly_data_length is updated after list and
	 * reassembly_queue_length are updated. On the dequeue side
	 * reassembly_data_length is checked without a lock to determine
	 * if reassembly_queue_length and list is up to date
	 */
	virt_wmb();
	info->reassembly_data_length += data_length;
	spin_unlock(&info->reassembly_queue_lock);
	info->count_reassembly_queue++;
	info->count_enqueue_reassembly_queue++;
}
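
/*
 * The write barrier above pairs with the read barrier in smbd_recv_buf():
 * the reader checks reassembly_data_length without taking the queue lock,
 * so the list and queue length must be visible before the length update is.
 */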

/*
 * Get the first entry at the front of reassembly queue
 * Caller is responsible for locking
 * return value: the first entry if any, NULL if queue is empty
 */
static struct smbd_response *_get_first_reassembly(struct smbd_connection *info)
{
	struct smbd_response *ret = NULL;

	if (!list_empty(&info->reassembly_queue)) {
		ret = list_first_entry(
			&info->reassembly_queue,
			struct smbd_response, list);
	}
	return ret;
}

static struct smbd_response *get_empty_queue_buffer(
		struct smbd_connection *info)
{
	struct smbd_response *ret = NULL;
	unsigned long flags;

	spin_lock_irqsave(&info->empty_packet_queue_lock, flags);
	if (!list_empty(&info->empty_packet_queue)) {
		ret = list_first_entry(
			&info->empty_packet_queue,
			struct smbd_response, list);
		list_del(&ret->list);
		info->count_empty_packet_queue--;
	}
	spin_unlock_irqrestore(&info->empty_packet_queue_lock, flags);

	return ret;
}

/*
 * Get a receive buffer
 * For each remote send, we need to post a receive. The receive buffers are
 * pre-allocated in advance.
 * return value: the receive buffer, NULL if none is available
 */
static struct smbd_response *get_receive_buffer(struct smbd_connection *info)
{
	struct smbd_response *ret = NULL;
	unsigned long flags;

	spin_lock_irqsave(&info->receive_queue_lock, flags);
	if (!list_empty(&info->receive_queue)) {
		ret = list_first_entry(
			&info->receive_queue,
			struct smbd_response, list);
		list_del(&ret->list);
		info->count_receive_queue--;
		info->count_get_receive_buffer++;
	}
	spin_unlock_irqrestore(&info->receive_queue_lock, flags);

	return ret;
}

/*
 * Return a receive buffer
 * Upon returning of a receive buffer, we can post new receive and extend
 * more receive credits to remote peer. This is done immediately after a
 * receive buffer is returned.
 */
static void put_receive_buffer(
	struct smbd_connection *info, struct smbd_response *response)
{
	unsigned long flags;

	ib_dma_unmap_single(info->id->device, response->sge.addr,
		response->sge.length, DMA_FROM_DEVICE);

	spin_lock_irqsave(&info->receive_queue_lock, flags);
	list_add_tail(&response->list, &info->receive_queue);
	info->count_receive_queue++;
	info->count_put_receive_buffer++;
	spin_unlock_irqrestore(&info->receive_queue_lock, flags);

	queue_work(info->workqueue, &info->post_send_credits_work);
}
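
/*
 * Receive buffer life cycle: buffers start on the receive queue, are taken
 * by get_receive_buffer() to be posted to the QP, land on the reassembly
 * queue (with payload) or the empty packet queue (without) after completion,
 * and return here once the data is consumed, which also kicks
 * post_send_credits_work to repost them and offer new credits.
 */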

/* Preallocate all receive buffers on transport establishment */
static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
{
	int i;
	struct smbd_response *response;

	INIT_LIST_HEAD(&info->reassembly_queue);
	spin_lock_init(&info->reassembly_queue_lock);
	info->reassembly_data_length = 0;
	info->reassembly_queue_length = 0;

	INIT_LIST_HEAD(&info->receive_queue);
	spin_lock_init(&info->receive_queue_lock);
	info->count_receive_queue = 0;

	INIT_LIST_HEAD(&info->empty_packet_queue);
	spin_lock_init(&info->empty_packet_queue_lock);
	info->count_empty_packet_queue = 0;

	init_waitqueue_head(&info->wait_receive_queues);

	for (i = 0; i < num_buf; i++) {
		response = mempool_alloc(info->response_mempool, GFP_KERNEL);
		if (!response)
			goto allocate_failed;

		response->info = info;
		list_add_tail(&response->list, &info->receive_queue);
		info->count_receive_queue++;
	}

	return 0;

allocate_failed:
	while (!list_empty(&info->receive_queue)) {
		response = list_first_entry(
				&info->receive_queue,
				struct smbd_response, list);
		list_del(&response->list);
		info->count_receive_queue--;

		mempool_free(response, info->response_mempool);
	}
	return -ENOMEM;
}

static void destroy_receive_buffers(struct smbd_connection *info)
{
	struct smbd_response *response;

	while ((response = get_receive_buffer(info)))
		mempool_free(response, info->response_mempool);

	while ((response = get_empty_queue_buffer(info)))
		mempool_free(response, info->response_mempool);
}

/*
 * Check and send an immediate or keep alive packet
 * The condition to send those packets are defined in [MS-SMBD] 3.1.1.1
 * Connection.KeepaliveRequested and Connection.SendImmediate
 * The idea is to extend credits to server as soon as it becomes available
 */
static void send_immediate_work(struct work_struct *work)
{
	struct smbd_connection *info = container_of(
					work, struct smbd_connection,
					send_immediate_work.work);

	if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
	    info->send_immediate) {
		log_keep_alive(INFO, "send an empty message\n");
		smbd_post_send_empty(info);
	}
}

/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
static void idle_connection_timer(struct work_struct *work)
{
	struct smbd_connection *info = container_of(
					work, struct smbd_connection,
					idle_timer_work.work);

	if (info->keep_alive_requested != KEEP_ALIVE_NONE) {
		log_keep_alive(ERR,
			"error status info->keep_alive_requested=%d\n",
			info->keep_alive_requested);
		smbd_disconnect_rdma_connection(info);
		return;
	}

	log_keep_alive(INFO, "about to send an empty idle message\n");
	smbd_post_send_empty(info);

	/* Setup the next idle timeout work */
	queue_delayed_work(info->workqueue, &info->idle_timer_work,
			info->keep_alive_interval*HZ);
}

/* Destroy this SMBD connection, called from upper layer */
void smbd_destroy(struct smbd_connection *info)
{
	log_rdma_event(INFO, "destroying rdma session\n");

	/* Kick off the disconnection process */
	smbd_disconnect_rdma_connection(info);

	log_rdma_event(INFO, "wait for transport being destroyed\n");
	wait_event(info->wait_destroy,
		info->transport_status == SMBD_DESTROYED);

	destroy_workqueue(info->workqueue);
	kfree(info);
}

/*
 * Reconnect this SMBD connection, called from upper layer
 * return value: 0 on success, or actual error code
 */
int smbd_reconnect(struct TCP_Server_Info *server)
{
	log_rdma_event(INFO, "reconnecting rdma session\n");

	if (!server->smbd_conn) {
		log_rdma_event(ERR, "rdma session already destroyed\n");
		return -EINVAL;
	}

	/*
	 * This is possible if transport is disconnected and we haven't
	 * received notification from RDMA, but upper layer has detected
	 * a timeout
	 */
	if (server->smbd_conn->transport_status == SMBD_CONNECTED) {
		log_rdma_event(INFO, "disconnecting transport\n");
		smbd_disconnect_rdma_connection(server->smbd_conn);
	}

	/* wait until the transport is destroyed */
	wait_event(server->smbd_conn->wait_destroy,
		server->smbd_conn->transport_status == SMBD_DESTROYED);

	destroy_workqueue(server->smbd_conn->workqueue);
	kfree(server->smbd_conn);

	log_rdma_event(INFO, "creating rdma session\n");
	server->smbd_conn = smbd_get_connection(
		server, (struct sockaddr *) &server->dstaddr);

	return server->smbd_conn ? 0 : -ENOENT;
}

static void destroy_caches_and_workqueue(struct smbd_connection *info)
{
	destroy_receive_buffers(info);
	destroy_workqueue(info->workqueue);
	mempool_destroy(info->response_mempool);
	kmem_cache_destroy(info->response_cache);
	mempool_destroy(info->request_mempool);
	kmem_cache_destroy(info->request_cache);
}

#define MAX_NAME_LEN	80
static int allocate_caches_and_workqueue(struct smbd_connection *info)
{
	char name[MAX_NAME_LEN];
	int rc;

	snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
	info->request_cache =
		kmem_cache_create(
			name,
			sizeof(struct smbd_request) +
				sizeof(struct smbd_data_transfer),
			0, SLAB_HWCACHE_ALIGN, NULL);
	if (!info->request_cache)
		return -ENOMEM;

	info->request_mempool =
		mempool_create(info->send_credit_target, mempool_alloc_slab,
			mempool_free_slab, info->request_cache);
	if (!info->request_mempool)
		goto out1;

	snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
	info->response_cache =
		kmem_cache_create(
			name,
			sizeof(struct smbd_response) +
				info->max_receive_size,
			0, SLAB_HWCACHE_ALIGN, NULL);
	if (!info->response_cache)
		goto out2;

	info->response_mempool =
		mempool_create(info->receive_credit_max, mempool_alloc_slab,
			mempool_free_slab, info->response_cache);
	if (!info->response_mempool)
		goto out3;

	snprintf(name, MAX_NAME_LEN, "smbd_%p", info);
	info->workqueue = create_workqueue(name);
	if (!info->workqueue)
		goto out4;

	rc = allocate_receive_buffers(info, info->receive_credit_max);
	if (rc) {
		log_rdma_event(ERR, "failed to allocate receive buffers\n");
		goto out5;
	}

	return 0;

out5:
	destroy_workqueue(info->workqueue);
out4:
	mempool_destroy(info->response_mempool);
out3:
	kmem_cache_destroy(info->response_cache);
out2:
	mempool_destroy(info->request_mempool);
out1:
	kmem_cache_destroy(info->request_cache);
	return -ENOMEM;
}

/* Create a SMBD connection, called by upper layer */
static struct smbd_connection *_smbd_get_connection(
	struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
{
	int rc;
	struct smbd_connection *info;
	struct rdma_conn_param conn_param;
	struct ib_qp_init_attr qp_attr;
	struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
	struct ib_port_immutable port_immutable;
	u8 ird_ord_hdr[8];

	info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
	if (!info)
		return NULL;

	info->transport_status = SMBD_CONNECTING;
	rc = smbd_ia_open(info, dstaddr, port);
	if (rc) {
		log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
		goto create_id_failed;
	}

	if (smbd_send_credit_target > info->id->device->attrs.max_cqe ||
	    smbd_send_credit_target > info->id->device->attrs.max_qp_wr) {
		log_rdma_event(ERR,
			"consider lowering send_credit_target = %d. "
			"Possible CQE overrun, device "
			"reporting max_cqe %d max_qp_wr %d\n",
			smbd_send_credit_target,
			info->id->device->attrs.max_cqe,
			info->id->device->attrs.max_qp_wr);
		goto config_failed;
	}

	if (smbd_receive_credit_max > info->id->device->attrs.max_cqe ||
	    smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) {
		log_rdma_event(ERR,
			"consider lowering receive_credit_max = %d. "
			"Possible CQE overrun, device "
			"reporting max_cqe %d max_qp_wr %d\n",
			smbd_receive_credit_max,
			info->id->device->attrs.max_cqe,
			info->id->device->attrs.max_qp_wr);
		goto config_failed;
	}

	info->receive_credit_max = smbd_receive_credit_max;
	info->send_credit_target = smbd_send_credit_target;
	info->max_send_size = smbd_max_send_size;
	info->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
	info->max_receive_size = smbd_max_receive_size;
	info->keep_alive_interval = smbd_keep_alive_interval;

	if (info->id->device->attrs.max_sge < SMBDIRECT_MAX_SGE) {
		log_rdma_event(ERR, "warning: device max_sge = %d too small\n",
			info->id->device->attrs.max_sge);
		log_rdma_event(ERR, "Queue Pair creation may fail\n");
	}

	info->send_cq = NULL;
	info->recv_cq = NULL;
	info->send_cq = ib_alloc_cq(info->id->device, info,
			info->send_credit_target, 0, IB_POLL_SOFTIRQ);
	if (IS_ERR(info->send_cq)) {
		info->send_cq = NULL;
		goto alloc_cq_failed;
	}

	info->recv_cq = ib_alloc_cq(info->id->device, info,
			info->receive_credit_max, 0, IB_POLL_SOFTIRQ);
	if (IS_ERR(info->recv_cq)) {
		info->recv_cq = NULL;
		goto alloc_cq_failed;
	}

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.event_handler = smbd_qp_async_error_upcall;
	qp_attr.qp_context = info;
	qp_attr.cap.max_send_wr = info->send_credit_target;
	qp_attr.cap.max_recv_wr = info->receive_credit_max;
	qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE;
	qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE;
	qp_attr.cap.max_inline_data = 0;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	qp_attr.send_cq = info->send_cq;
	qp_attr.recv_cq = info->recv_cq;
	qp_attr.port_num = ~0;

	rc = rdma_create_qp(info->id, info->pd, &qp_attr);
	if (rc) {
		log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
		goto create_qp_failed;
	}

	memset(&conn_param, 0, sizeof(conn_param));
	conn_param.initiator_depth = 0;

	conn_param.responder_resources =
		info->id->device->attrs.max_qp_rd_atom
			< SMBD_CM_RESPONDER_RESOURCES ?
		info->id->device->attrs.max_qp_rd_atom :
		SMBD_CM_RESPONDER_RESOURCES;
	info->responder_resources = conn_param.responder_resources;
	log_rdma_mr(INFO, "responder_resources=%d\n",
		info->responder_resources);

	/* Need to send IRD/ORD in private data for iWARP */
	info->id->device->get_port_immutable(
		info->id->device, info->id->port_num, &port_immutable);
	if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
		ird_ord_hdr[0] = info->responder_resources;
		ird_ord_hdr[1] = 1;
		conn_param.private_data = ird_ord_hdr;
		conn_param.private_data_len = sizeof(ird_ord_hdr);
	} else {
		conn_param.private_data = NULL;
		conn_param.private_data_len = 0;
	}

	conn_param.retry_count = SMBD_CM_RETRY;
	conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
	conn_param.flow_control = 0;
	init_waitqueue_head(&info->wait_destroy);

	log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
		&addr_in->sin_addr, port);

	init_waitqueue_head(&info->conn_wait);
	rc = rdma_connect(info->id, &conn_param);
	if (rc) {
		log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
		goto rdma_connect_failed;
	}

	wait_event_interruptible(
		info->conn_wait, info->transport_status != SMBD_CONNECTING);

	if (info->transport_status != SMBD_CONNECTED) {
		log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
		goto rdma_connect_failed;
	}

	log_rdma_event(INFO, "rdma_connect connected\n");

	rc = allocate_caches_and_workqueue(info);
	if (rc) {
		log_rdma_event(ERR, "cache allocation failed\n");
		goto allocate_cache_failed;
	}

	init_waitqueue_head(&info->wait_send_queue);
	init_waitqueue_head(&info->wait_reassembly_queue);

	INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
	INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work);
	queue_delayed_work(info->workqueue, &info->idle_timer_work,
		info->keep_alive_interval*HZ);

	init_waitqueue_head(&info->wait_smbd_send_pending);
	info->smbd_send_pending = 0;

	init_waitqueue_head(&info->wait_smbd_recv_pending);
	info->smbd_recv_pending = 0;

	init_waitqueue_head(&info->wait_send_pending);
	atomic_set(&info->send_pending, 0);

	init_waitqueue_head(&info->wait_send_payload_pending);
	atomic_set(&info->send_payload_pending, 0);

	INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
	INIT_WORK(&info->destroy_work, smbd_destroy_rdma_work);
	INIT_WORK(&info->recv_done_work, smbd_recv_done_work);
	INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
	info->new_credits_offered = 0;
	spin_lock_init(&info->lock_new_credits_offered);

	rc = smbd_negotiate(info);
	if (rc) {
		log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
		goto negotiation_failed;
	}

	rc = allocate_mr_list(info);
	if (rc) {
		log_rdma_mr(ERR, "memory registration allocation failed\n");
		goto allocate_mr_failed;
	}

	return info;

allocate_mr_failed:
	/* At this point, need a full transport shutdown */
	smbd_destroy(info);
	return NULL;

negotiation_failed:
	cancel_delayed_work_sync(&info->idle_timer_work);
	destroy_caches_and_workqueue(info);
	info->transport_status = SMBD_NEGOTIATE_FAILED;
	init_waitqueue_head(&info->conn_wait);
	rdma_disconnect(info->id);
	wait_event(info->conn_wait,
		info->transport_status == SMBD_DISCONNECTED);

allocate_cache_failed:
rdma_connect_failed:
	rdma_destroy_qp(info->id);

create_qp_failed:
alloc_cq_failed:
	if (info->send_cq)
		ib_free_cq(info->send_cq);
	if (info->recv_cq)
		ib_free_cq(info->recv_cq);

config_failed:
	ib_dealloc_pd(info->pd);
	rdma_destroy_id(info->id);

create_id_failed:
	kfree(info);
	return NULL;
}

struct smbd_connection *smbd_get_connection(
	struct TCP_Server_Info *server, struct sockaddr *dstaddr)
{
	struct smbd_connection *ret;
	int port = SMBD_PORT;

try_again:
	ret = _smbd_get_connection(server, dstaddr, port);

	/* Try SMB_PORT if SMBD_PORT doesn't work */
	if (!ret && port == SMBD_PORT) {
		port = SMB_PORT;
		goto try_again;
	}
	return ret;
}
/*
 * Receive data from the receive reassembly queue
 * All incoming data packets are placed in the reassembly queue
 * buf: the buffer to read data into
 * size: the length of data to read
 * return value: actual data read
 * Note: this implementation copies the data from the reassembly queue to the
 * receive buffers used by the upper layer. This is not the optimal code path.
 * A better way to do it is to not have the upper layer allocate its receive
 * buffers, but rather borrow the buffer from the reassembly queue and return
 * it after the data is consumed. But this will require more changes to upper
 * layer code, and also needs to consider packet boundaries while they are
 * still being reassembled.
 */
static int smbd_recv_buf(struct smbd_connection *info, char *buf,
		unsigned int size)
{
	struct smbd_response *response;
	struct smbd_data_transfer *data_transfer;
	int to_copy, to_read, data_read, offset;
	u32 data_length, remaining_data_length, data_offset;
	int rc;

again:
	if (info->transport_status != SMBD_CONNECTED) {
		log_read(ERR, "disconnected\n");
		return -ENODEV;
	}

	/*
	 * No need to hold the reassembly queue lock all the time as we are
	 * the only one reading from the front of the queue. The transport
	 * may add more entries to the back of the queue at the same time
	 */
	log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size,
		info->reassembly_data_length);
	if (info->reassembly_data_length >= size) {
		int queue_length;
		int queue_removed = 0;

		/*
		 * Need to make sure reassembly_data_length is read before
		 * reading reassembly_queue_length and calling
		 * _get_first_reassembly. This call is lock free
		 * as we never read at the end of the queue which is being
		 * updated in SOFTIRQ as more data is received
		 */
		virt_rmb();
		queue_length = info->reassembly_queue_length;
		data_read = 0;
		to_read = size;
		offset = info->first_entry_offset;
		while (data_read < size) {
			response = _get_first_reassembly(info);
			data_transfer = smbd_response_payload(response);
			data_length = le32_to_cpu(data_transfer->data_length);
			remaining_data_length =
				le32_to_cpu(
					data_transfer->remaining_data_length);
			data_offset = le32_to_cpu(data_transfer->data_offset);

			/*
			 * The upper layer expects the RFC1002 length at the
			 * beginning of the payload. Return it to indicate
			 * the total length of the packet. This minimizes the
			 * change to the upper layer packet processing logic.
			 * This will eventually be removed when an
			 * intermediate transport layer is added
			 */
			if (response->first_segment && size == 4) {
				unsigned int rfc1002_len =
					data_length + remaining_data_length;
				*((__be32 *)buf) = cpu_to_be32(rfc1002_len);
				data_read = 4;
				response->first_segment = false;
				log_read(INFO, "returning rfc1002 length %d\n",
					rfc1002_len);
				goto read_rfc1002_done;
			}

			to_copy = min_t(int, data_length - offset, to_read);
			memcpy(
				buf + data_read,
				(char *)data_transfer + data_offset + offset,
				to_copy);

			/* move on to the next buffer? */
			if (to_copy == data_length - offset) {
				queue_length--;
				/*
				 * No need to lock if we are not at the
				 * end of the queue
				 */
				if (queue_length)
					list_del(&response->list);
				else {
					spin_lock_irq(
						&info->reassembly_queue_lock);
					list_del(&response->list);
					spin_unlock_irq(
						&info->reassembly_queue_lock);
				}
				queue_removed++;
				info->count_reassembly_queue--;
				info->count_dequeue_reassembly_queue++;
				put_receive_buffer(info, response);
				offset = 0;
				log_read(INFO, "put_receive_buffer offset=0\n");
			} else
				offset += to_copy;

			to_read -= to_copy;
			data_read += to_copy;

			log_read(INFO, "_get_first_reassembly memcpy %d bytes "
				"data_transfer_length-offset=%d after that "
				"to_read=%d data_read=%d offset=%d\n",
				to_copy, data_length - offset,
				to_read, data_read, offset);
		}

		spin_lock_irq(&info->reassembly_queue_lock);
		info->reassembly_data_length -= data_read;
		info->reassembly_queue_length -= queue_removed;
		spin_unlock_irq(&info->reassembly_queue_lock);

		info->first_entry_offset = offset;
		log_read(INFO, "returning to thread data_read=%d "
			"reassembly_data_length=%d first_entry_offset=%d\n",
			data_read, info->reassembly_data_length,
			info->first_entry_offset);
read_rfc1002_done:
		return data_read;
	}

	log_read(INFO, "wait_event on more data\n");
	rc = wait_event_interruptible(
		info->wait_reassembly_queue,
		info->reassembly_data_length >= size ||
			info->transport_status != SMBD_CONNECTED);
	/* Don't return any data if interrupted */
	if (rc)
		return -ENODEV;

	goto again;
}
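
/*
 * Illustrative sketch (not part of the driver): how an upper layer that
 * still thinks in terms of RFC1002 framing might consume smbd_recv_buf().
 * It first reads the 4-byte length that smbd_recv_buf() synthesizes for the
 * first segment, then reads the SMB2 packet body. The helper name and the
 * caller-supplied buffer are assumptions made for the example only.
 */
#if 0	/* example only, not compiled */
static int example_read_one_packet(struct smbd_connection *info, char *buf,
		unsigned int buflen)
{
	__be32 rfc1002_hdr;
	unsigned int pdu_len;
	int rc;

	/* smbd_recv_buf() returns the synthesized RFC1002 length for size==4 */
	rc = smbd_recv_buf(info, (char *)&rfc1002_hdr, 4);
	if (rc != 4)
		return rc < 0 ? rc : -EIO;

	pdu_len = be32_to_cpu(rfc1002_hdr);
	if (pdu_len > buflen)
		return -EMSGSIZE;

	/* then read the packet body, possibly spanning several segments */
	return smbd_recv_buf(info, buf, pdu_len);
}
#endif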
/*
 * Receive a page from the receive reassembly queue
 * page: the page to read data into
 * to_read: the length of data to read
 * return value: actual data read
 */
static int smbd_recv_page(struct smbd_connection *info,
		struct page *page, unsigned int to_read)
{
	int ret;
	char *to_address;

	/* make sure we have the page ready for read */
	ret = wait_event_interruptible(
		info->wait_reassembly_queue,
		info->reassembly_data_length >= to_read ||
			info->transport_status != SMBD_CONNECTED);
	if (ret)
		return ret;

	/* now we can read from reassembly queue and not sleep */
	to_address = kmap_atomic(page);

	log_read(INFO, "reading from page=%p address=%p to_read=%d\n",
		page, to_address, to_read);

	ret = smbd_recv_buf(info, to_address, to_read);
	kunmap_atomic(to_address);

	return ret;
}
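
/*
 * Illustrative sketch (not part of the driver): receiving directly into a
 * page, as the ITER_BVEC path in smbd_recv() does. Note that
 * smbd_recv_page() waits for the data *before* taking the atomic kmap, so
 * the copy from the reassembly queue never sleeps while the page is mapped.
 * The page allocation and error handling here are assumptions for the
 * example only.
 */
#if 0	/* example only, not compiled */
static int example_recv_into_page(struct smbd_connection *info,
		unsigned int len)
{
	struct page *page;
	int rc;

	if (len > PAGE_SIZE)
		return -EINVAL;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* blocks until len bytes are reassembled or the transport drops */
	rc = smbd_recv_page(info, page, len);

	/* ... hand the page to the upper layer, then ... */
	__free_page(page);
	return rc;
}
#endif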
/*
 * Receive data from transport
 * msg: a msghdr pointing to the buffer, can be ITER_KVEC or ITER_BVEC
 * return: total bytes read, or 0. SMB Direct will not do a partial read.
 */
int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
{
	char *buf;
	struct page *page;
	unsigned int to_read;
	int rc;

	info->smbd_recv_pending++;

	switch (msg->msg_iter.type) {
	case READ | ITER_KVEC:
		buf = msg->msg_iter.kvec->iov_base;
		to_read = msg->msg_iter.kvec->iov_len;
		rc = smbd_recv_buf(info, buf, to_read);
		break;

	case READ | ITER_BVEC:
		page = msg->msg_iter.bvec->bv_page;
		to_read = msg->msg_iter.bvec->bv_len;
		rc = smbd_recv_page(info, page, to_read);
		break;

	default:
		/* It's a bug in upper layer to get there */
		cifs_dbg(VFS, "CIFS: invalid msg type %d\n",
			msg->msg_iter.type);
		rc = -EINVAL;
	}

	info->smbd_recv_pending--;
	wake_up(&info->wait_smbd_recv_pending);

	/* SMBDirect will read it all or nothing */
	if (rc > 0)
		msg->msg_iter.count = 0;

	return rc;
}
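
/*
 * Illustrative sketch (not part of the driver): how a caller builds an
 * ITER_KVEC msghdr for smbd_recv(). This mirrors what the CIFS demultiplex
 * thread does through its transport abstraction; the helper name and buffer
 * are assumptions for the example only.
 */
#if 0	/* example only, not compiled */
static int example_recv_kvec(struct smbd_connection *info, void *buf,
		size_t len)
{
	struct kvec vec = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = { 0 };

	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, len);

	/* returns len on success; SMB Direct never does a partial read */
	return smbd_recv(info, &msg);
}
#endif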
/*
 * Send data to transport
 * Each rqst is transported as an SMBDirect payload
 * rqst: the data to write
 * return value: 0 if successfully written, otherwise error code
 */
int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
{
	struct kvec vec;
	int nvecs;
	int size;
	int buflen = 0, remaining_data_length;
	int start, i, j;
	int max_iov_size =
		info->max_send_size - sizeof(struct smbd_data_transfer);
	struct kvec iov[SMBDIRECT_MAX_SGE];
	int rc;

	info->smbd_send_pending++;
	if (info->transport_status != SMBD_CONNECTED) {
		rc = -ENODEV;
		goto done;
	}

	/*
	 * This usually means a configuration error
	 * We use RDMA read/write for packet size > rdma_readwrite_threshold
	 * as long as it's properly configured we should never get into this
	 * situation
	 */
	if (rqst->rq_nvec + rqst->rq_npages > SMBDIRECT_MAX_SGE) {
		log_write(ERR, "maximum send segment %x exceeding %x\n",
			rqst->rq_nvec + rqst->rq_npages, SMBDIRECT_MAX_SGE);
		rc = -EINVAL;
		goto done;
	}

	/*
	 * Remove the RFC1002 length defined in MS-SMB2 section 2.1
	 * It is used only for the TCP transport
	 * In the future we may want to add a transport layer under the
	 * protocol layer so this will only be issued to the TCP transport
	 */
	iov[0].iov_base = (char *)rqst->rq_iov[0].iov_base + 4;
	iov[0].iov_len = rqst->rq_iov[0].iov_len - 4;
	buflen += iov[0].iov_len;

	/* total up iov array first */
	for (i = 1; i < rqst->rq_nvec; i++) {
		iov[i].iov_base = rqst->rq_iov[i].iov_base;
		iov[i].iov_len = rqst->rq_iov[i].iov_len;
		buflen += iov[i].iov_len;
	}

	/* add in the page array if there is one */
	if (rqst->rq_npages) {
		buflen += rqst->rq_pagesz * (rqst->rq_npages - 1);
		buflen += rqst->rq_tailsz;
	}

	if (buflen + sizeof(struct smbd_data_transfer) >
		info->max_fragmented_send_size) {
		log_write(ERR, "payload size %d > max size %d\n",
			buflen, info->max_fragmented_send_size);
		rc = -EINVAL;
		goto done;
	}

	remaining_data_length = buflen;

	log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d "
		"rq_tailsz=%d buflen=%d\n",
		rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
		rqst->rq_tailsz, buflen);

	start = i = iov[0].iov_len ? 0 : 1;
	buflen = 0;
	while (true) {
		buflen += iov[i].iov_len;
		if (buflen > max_iov_size) {
			if (i > start) {
				remaining_data_length -=
					(buflen-iov[i].iov_len);
				log_write(INFO, "sending iov[] from start=%d "
					"i=%d nvecs=%d "
					"remaining_data_length=%d\n",
					start, i, i-start,
					remaining_data_length);
				rc = smbd_post_send_data(
					info, &iov[start], i-start,
					remaining_data_length);
				if (rc)
					goto done;
			} else {
				/* iov[start] is too big, break it */
				nvecs = (buflen+max_iov_size-1)/max_iov_size;
				log_write(INFO, "iov[%d] iov_base=%p buflen=%d"
					" break to %d vectors\n",
					start, iov[start].iov_base,
					buflen, nvecs);
				for (j = 0; j < nvecs; j++) {
					vec.iov_base =
						(char *)iov[start].iov_base +
						j*max_iov_size;
					vec.iov_len = max_iov_size;
					if (j == nvecs-1)
						vec.iov_len =
							buflen -
							max_iov_size*(nvecs-1);
					remaining_data_length -= vec.iov_len;
					log_write(INFO,
						"sending vec j=%d iov_base=%p"
						" iov_len=%zu "
						"remaining_data_length=%d\n",
						j, vec.iov_base, vec.iov_len,
						remaining_data_length);
					rc = smbd_post_send_data(
						info, &vec, 1,
						remaining_data_length);
					if (rc)
						goto done;
				}
				i++;
				if (i == rqst->rq_nvec)
					break;
			}
			start = i;
			buflen = 0;
		} else {
			i++;
			if (i == rqst->rq_nvec) {
				/* send out all remaining vecs */
				remaining_data_length -= buflen;
				log_write(INFO,
					"sending iov[] from start=%d i=%d "
					"nvecs=%d remaining_data_length=%d\n",
					start, i, i-start,
					remaining_data_length);
				rc = smbd_post_send_data(info, &iov[start],
					i-start, remaining_data_length);
				if (rc)
					goto done;
				break;
			}
		}
		log_write(INFO, "looping i=%d buflen=%d\n", i, buflen);
	}

	/* now sending pages if there are any */
	for (i = 0; i < rqst->rq_npages; i++) {
		buflen = (i == rqst->rq_npages-1) ?
			rqst->rq_tailsz : rqst->rq_pagesz;
		nvecs = (buflen + max_iov_size - 1) / max_iov_size;
		log_write(INFO, "sending pages buflen=%d nvecs=%d\n",
			buflen, nvecs);
		for (j = 0; j < nvecs; j++) {
			size = max_iov_size;
			if (j == nvecs-1)
				size = buflen - j*max_iov_size;
			remaining_data_length -= size;
			log_write(INFO, "sending pages i=%d offset=%d size=%d"
				" remaining_data_length=%d\n",
				i, j*max_iov_size, size, remaining_data_length);
			rc = smbd_post_send_page(
				info, rqst->rq_pages[i], j*max_iov_size,
				size, remaining_data_length);
			if (rc)
				goto done;
		}
	}

done:
	/*
	 * As an optimization, we don't wait for individual I/O to finish
	 * before sending the next one.
	 * Send them all and wait for the pending send count to get to 0,
	 * which means all the I/Os have gone out and we are good to return
	 */
	wait_event(info->wait_send_payload_pending,
		atomic_read(&info->send_payload_pending) == 0);

	info->smbd_send_pending--;
	wake_up(&info->wait_smbd_send_pending);

	return rc;
}
static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbd_mr *mr;
	struct ib_cqe *cqe;

	if (wc->status) {
		log_rdma_mr(ERR, "status=%d\n", wc->status);
		cqe = wc->wr_cqe;
		mr = container_of(cqe, struct smbd_mr, cqe);
		smbd_disconnect_rdma_connection(mr->conn);
	}
}
/*
 * The work queue function that recovers MRs
 * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
 * again. Both calls are slow, so finish them in a workqueue. This will not
 * block the I/O path.
 * There is one workqueue that recovers MRs, so there is no need to lock, as
 * the I/O requests calling smbd_register_mr will never update the links in
 * the mr_list.
 */
static void smbd_mr_recovery_work(struct work_struct *work)
{
	struct smbd_connection *info =
		container_of(work, struct smbd_connection, mr_recovery_work);
	struct smbd_mr *smbdirect_mr;
	int rc;

	list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
		if (smbdirect_mr->state == MR_INVALIDATED ||
			smbdirect_mr->state == MR_ERROR) {

			if (smbdirect_mr->state == MR_INVALIDATED) {
				ib_dma_unmap_sg(
					info->id->device, smbdirect_mr->sgl,
					smbdirect_mr->sgl_count,
					smbdirect_mr->dir);
				smbdirect_mr->state = MR_READY;
			} else if (smbdirect_mr->state == MR_ERROR) {

				/* recover this MR entry */
				rc = ib_dereg_mr(smbdirect_mr->mr);
				if (rc) {
					log_rdma_mr(ERR,
						"ib_dereg_mr failed rc=%x\n",
						rc);
					smbd_disconnect_rdma_connection(info);
				}

				smbdirect_mr->mr = ib_alloc_mr(
					info->pd, info->mr_type,
					info->max_frmr_depth);
				if (IS_ERR(smbdirect_mr->mr)) {
					log_rdma_mr(ERR,
						"ib_alloc_mr failed mr_type=%x "
						"max_frmr_depth=%x\n",
						info->mr_type,
						info->max_frmr_depth);
					smbd_disconnect_rdma_connection(info);
				}

				smbdirect_mr->state = MR_READY;
			}
			/* smbdirect_mr->state is updated by this function
			 * and is read and updated by I/O issuing CPUs trying
			 * to get an MR. The call to atomic_inc_return
			 * implies a memory barrier and guarantees this
			 * value is updated before waking up any calls to
			 * get_mr() from the I/O issuing CPUs
			 */
			if (atomic_inc_return(&info->mr_ready_count) == 1)
				wake_up_interruptible(&info->wait_mr);
		}
	}
}
static void destroy_mr_list(struct smbd_connection *info)
{
	struct smbd_mr *mr, *tmp;

	cancel_work_sync(&info->mr_recovery_work);
	list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
		if (mr->state == MR_INVALIDATED)
			ib_dma_unmap_sg(info->id->device, mr->sgl,
				mr->sgl_count, mr->dir);
		ib_dereg_mr(mr->mr);
		kfree(mr->sgl);
		kfree(mr);
	}
}
/*
 * Allocate MRs used for RDMA read/write
 * The number of MRs will not exceed the hardware capability in
 * responder_resources
 * All MRs are kept in mr_list. An MR can be recovered after it's used
 * Recovery is done in smbd_mr_recovery_work. The content of a list entry
 * changes as MRs are used and recovered for I/O, but the list links will not
 * change
 */
static int allocate_mr_list(struct smbd_connection *info)
{
	int i;
	struct smbd_mr *smbdirect_mr, *tmp;

	INIT_LIST_HEAD(&info->mr_list);
	init_waitqueue_head(&info->wait_mr);
	spin_lock_init(&info->mr_list_lock);
	atomic_set(&info->mr_ready_count, 0);
	atomic_set(&info->mr_used_count, 0);
	init_waitqueue_head(&info->wait_for_mr_cleanup);
	/* Allocate more MRs (2x) than hardware responder_resources */
	for (i = 0; i < info->responder_resources * 2; i++) {
		smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
		if (!smbdirect_mr)
			goto out;
		smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type,
					info->max_frmr_depth);
		if (IS_ERR(smbdirect_mr->mr)) {
			log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x "
				"max_frmr_depth=%x\n",
				info->mr_type, info->max_frmr_depth);
			goto out;
		}
		smbdirect_mr->sgl = kcalloc(
					info->max_frmr_depth,
					sizeof(struct scatterlist),
					GFP_KERNEL);
		if (!smbdirect_mr->sgl) {
			log_rdma_mr(ERR, "failed to allocate sgl\n");
			ib_dereg_mr(smbdirect_mr->mr);
			goto out;
		}
		smbdirect_mr->state = MR_READY;
		smbdirect_mr->conn = info;

		list_add_tail(&smbdirect_mr->list, &info->mr_list);
		atomic_inc(&info->mr_ready_count);
	}
	INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
	return 0;

out:
	kfree(smbdirect_mr);

	list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
		ib_dereg_mr(smbdirect_mr->mr);
		kfree(smbdirect_mr->sgl);
		kfree(smbdirect_mr);
	}
	return -ENOMEM;
}
/*
 * Get an MR from mr_list. This function waits until there is at least one
 * MR available in the list. It may access the list while
 * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
 * as they never modify the same places. However, there may be several CPUs
 * issuing I/O trying to get an MR at the same time; mr_list_lock is used to
 * protect this situation.
 */
static struct smbd_mr *get_mr(struct smbd_connection *info)
{
	struct smbd_mr *ret;
	int rc;
again:
	rc = wait_event_interruptible(info->wait_mr,
		atomic_read(&info->mr_ready_count) ||
		info->transport_status != SMBD_CONNECTED);
	if (rc) {
		log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
		return NULL;
	}

	if (info->transport_status != SMBD_CONNECTED) {
		log_rdma_mr(ERR, "info->transport_status=%x\n",
			info->transport_status);
		return NULL;
	}

	spin_lock(&info->mr_list_lock);
	list_for_each_entry(ret, &info->mr_list, list) {
		if (ret->state == MR_READY) {
			ret->state = MR_REGISTERED;
			spin_unlock(&info->mr_list_lock);
			atomic_dec(&info->mr_ready_count);
			atomic_inc(&info->mr_used_count);
			return ret;
		}
	}

	spin_unlock(&info->mr_list_lock);
	/*
	 * It is possible that we could fail to get an MR because other
	 * processes may try to acquire an MR at the same time. If this is
	 * the case, retry it.
	 */
	goto again;
}
/*
 * Register memory for RDMA read/write
 * pages[]: the list of pages to register memory with
 * num_pages: the number of pages to register
 * tailsz: if non-zero, the bytes to register in the last page
 * writing: true if this is a RDMA write (SMB read), false for RDMA read
 * need_invalidate: true if this MR needs to be locally invalidated after I/O
 * return value: the MR registered, NULL if failed.
 */
struct smbd_mr *smbd_register_mr(
	struct smbd_connection *info, struct page *pages[], int num_pages,
	int tailsz, bool writing, bool need_invalidate)
{
	struct smbd_mr *smbdirect_mr;
	int rc, i;
	enum dma_data_direction dir;
	struct ib_reg_wr *reg_wr;
	struct ib_send_wr *bad_wr;

	if (num_pages > info->max_frmr_depth) {
		log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
			num_pages, info->max_frmr_depth);
		return NULL;
	}

	smbdirect_mr = get_mr(info);
	if (!smbdirect_mr) {
		log_rdma_mr(ERR, "get_mr returning NULL\n");
		return NULL;
	}
	smbdirect_mr->need_invalidate = need_invalidate;
	smbdirect_mr->sgl_count = num_pages;
	sg_init_table(smbdirect_mr->sgl, num_pages);

	for (i = 0; i < num_pages - 1; i++)
		sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0);

	sg_set_page(&smbdirect_mr->sgl[i], pages[i],
		tailsz ? tailsz : PAGE_SIZE, 0);

	dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	smbdirect_mr->dir = dir;
	rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir);
	if (!rc) {
		log_rdma_mr(INFO, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
			num_pages, dir, rc);
		goto dma_map_error;
	}

	rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages,
		NULL, PAGE_SIZE);
	if (rc != num_pages) {
		log_rdma_mr(INFO,
			"ib_map_mr_sg failed rc = %x num_pages = %x\n",
			rc, num_pages);
		goto map_mr_error;
	}

	ib_update_fast_reg_key(smbdirect_mr->mr,
		ib_inc_rkey(smbdirect_mr->mr->rkey));
	reg_wr = &smbdirect_mr->wr;
	reg_wr->wr.opcode = IB_WR_REG_MR;
	smbdirect_mr->cqe.done = register_mr_done;
	reg_wr->wr.wr_cqe = &smbdirect_mr->cqe;
	reg_wr->wr.num_sge = 0;
	reg_wr->wr.send_flags = IB_SEND_SIGNALED;
	reg_wr->mr = smbdirect_mr->mr;
	reg_wr->key = smbdirect_mr->mr->rkey;
	reg_wr->access = writing ?
			IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
			IB_ACCESS_REMOTE_READ;

	/*
	 * There is no need to wait for completion of ib_post_send
	 * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
	 * on the next ib_post_send when we actually send I/O to the remote
	 * peer
	 */
	rc = ib_post_send(info->id->qp, &reg_wr->wr, &bad_wr);
	if (!rc)
		return smbdirect_mr;

	log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
		rc, reg_wr->key);

	/* If all failed, attempt to recover this MR by setting it MR_ERROR */
map_mr_error:
	ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl,
		smbdirect_mr->sgl_count, smbdirect_mr->dir);

dma_map_error:
	smbdirect_mr->state = MR_ERROR;
	if (atomic_dec_and_test(&info->mr_used_count))
		wake_up(&info->wait_for_mr_cleanup);

	return NULL;
}
static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbd_mr *smbdirect_mr;
	struct ib_cqe *cqe;

	cqe = wc->wr_cqe;
	smbdirect_mr = container_of(cqe, struct smbd_mr, cqe);
	smbdirect_mr->state = MR_INVALIDATED;
	if (wc->status != IB_WC_SUCCESS) {
		log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
		smbdirect_mr->state = MR_ERROR;
	}
	complete(&smbdirect_mr->invalidate_done);
}
/*
 * Deregister an MR after I/O is done
 * This function may wait if remote invalidation is not used
 * and we have to locally invalidate the buffer to prevent the data from
 * being modified by the remote peer after the upper layer consumes it
 */
int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
{
	struct ib_send_wr *wr, *bad_wr;
	struct smbd_connection *info = smbdirect_mr->conn;
	int rc = 0;

	if (smbdirect_mr->need_invalidate) {
		/* Need to finish local invalidation before returning */
		wr = &smbdirect_mr->inv_wr;
		wr->opcode = IB_WR_LOCAL_INV;
		smbdirect_mr->cqe.done = local_inv_done;
		wr->wr_cqe = &smbdirect_mr->cqe;
		wr->num_sge = 0;
		wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey;
		wr->send_flags = IB_SEND_SIGNALED;

		init_completion(&smbdirect_mr->invalidate_done);
		rc = ib_post_send(info->id->qp, wr, &bad_wr);
		if (rc) {
			log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
			smbd_disconnect_rdma_connection(info);
			goto done;
		}
		wait_for_completion(&smbdirect_mr->invalidate_done);
		smbdirect_mr->need_invalidate = false;
	} else
		/*
		 * For remote invalidation, just set it to MR_INVALIDATED
		 * and defer to mr_recovery_work to recover the MR for next use
		 */
		smbdirect_mr->state = MR_INVALIDATED;

	/*
	 * Schedule the work to do MR recovery for future I/Os
	 * MR recovery is slow and we don't want it to block the current I/O
	 */
done:
	queue_work(info->workqueue, &info->mr_recovery_work);

	if (atomic_dec_and_test(&info->mr_used_count))
		wake_up(&info->wait_for_mr_cleanup);

	return rc;
}
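
/*
 * Illustrative sketch (not part of the driver): the register/deregister
 * pairing for a direct (RDMA) read of SMB data, i.e. the server writes into
 * our pages. The page array, its management, and the helper name are
 * assumptions for the example; error handling is reduced to the essentials.
 */
#if 0	/* example only, not compiled */
static int example_rdma_read_into_pages(struct smbd_connection *info,
		struct page **pages, int num_pages, int tailsz)
{
	struct smbd_mr *mr;

	/* writing=true: the remote peer RDMA-writes into these pages */
	mr = smbd_register_mr(info, pages, num_pages, tailsz,
			      true /* writing */, true /* need_invalidate */);
	if (!mr)
		return -EAGAIN;

	/*
	 * ... send the SMB2 READ with the descriptor built from mr
	 * (see the descriptor sketch above), wait for the response ...
	 */

	/* waits for local invalidation since need_invalidate was set */
	return smbd_deregister_mr(mr);
}
#endif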
);