fs/cifs/smbdirect.c

   1 /*
   2  *   Copyright (C) 2017, Microsoft Corporation.
   3  *
   4  *   Author(s): Long Li <longli@microsoft.com>
   5  *
   6  *   This program is free software;  you can redistribute it and/or modify
   7  *   it under the terms of the GNU General Public License as published by
   8  *   the Free Software Foundation; either version 2 of the License, or
   9  *   (at your option) any later version.
  10  *
  11  *   This program is distributed in the hope that it will be useful,
  12  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  13  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  14  *   the GNU General Public License for more details.
  15  */
  16 #include <linux/module.h>
  17 #include <linux/highmem.h>
  18 #include "smbdirect.h"
  19 #include "cifs_debug.h"
  20
  21 static struct smbd_response *get_empty_queue_buffer(
  22                 struct smbd_connection *info);
  23 static struct smbd_response *get_receive_buffer(
  24                 struct smbd_connection *info);
  25 static void put_receive_buffer(
  26                 struct smbd_connection *info,
  27                 struct smbd_response *response);
  28 static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
  29 static void destroy_receive_buffers(struct smbd_connection *info);
  30
  31 static void put_empty_packet(
  32                 struct smbd_connection *info, struct smbd_response *response);
  33 static void enqueue_reassembly(
  34                 struct smbd_connection *info,
  35                 struct smbd_response *response, int data_length);
  36 static struct smbd_response *_get_first_reassembly(
  37                 struct smbd_connection *info);
  38
  39 static int smbd_post_recv(
  40                 struct smbd_connection *info,
  41                 struct smbd_response *response);
  42
  43 static int smbd_post_send_empty(struct smbd_connection *info);
  44 static int smbd_post_send_data(
  45                 struct smbd_connection *info,
  46                 struct kvec *iov, int n_vec, int remaining_data_length);
  47 static int smbd_post_send_page(struct smbd_connection *info,
  48                 struct page *page, unsigned long offset,
  49                 size_t size, int remaining_data_length);
  50
  51 static void destroy_mr_list(struct smbd_connection *info);
  52 static int allocate_mr_list(struct smbd_connection *info);
  53
  54 /* SMBD version number */
  55 #define SMBD_V1 0x0100
  56
  57 /* Port numbers for SMBD transport */
  58 #define SMB_PORT        445
  59 #define SMBD_PORT       5445
  60
  61 /* Address lookup and resolve timeout in ms */
  62 #define RDMA_RESOLVE_TIMEOUT    5000
  63
  64 /* SMBD negotiation timeout in seconds */
  65 #define SMBD_NEGOTIATE_TIMEOUT  120
  66
  67 /* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */
  68 #define SMBD_MIN_RECEIVE_SIZE           128
  69 #define SMBD_MIN_FRAGMENTED_SIZE        131072
  70
  71 /*
  72  * Default maximum number of RDMA read/write outstanding on this connection
  73  * This value is possibly decreased during QP creation on hardware limit
  74  */
  75 #define SMBD_CM_RESPONDER_RESOURCES     32
  76
  77 /* Maximum number of retries on data transfer operations */
  78 #define SMBD_CM_RETRY                   6
  79 /* No need to retry on Receiver Not Ready since SMBD manages credits */
  80 #define SMBD_CM_RNR_RETRY               0
  81
  82 /*
  83  * User configurable initial values per SMBD transport connection
  84  * as defined in [MS-SMBD] 3.1.1.1
  85  * Those may change after a SMBD negotiation
  86  */
  87 /* The local peer's maximum number of credits to grant to the peer */
  88 int smbd_receive_credit_max = 255;
  89
  90 /* The remote peer's credit request of local peer */
  91 int smbd_send_credit_target = 255;
  92
  93 /* The maximum single message size can be sent to remote peer */
  94 int smbd_max_send_size = 1364;
  95
  96 /*  The maximum fragmented upper-layer payload receive size supported */
  97 int smbd_max_fragmented_recv_size = 1024 * 1024;
  98
  99 /*  The maximum single-message size which can be received */
 100 int smbd_max_receive_size = 8192;
 101
 102 /* The timeout to initiate send of a keepalive message on idle */
 103 int smbd_keep_alive_interval = 120;
 104
 105 /*
 106  * User configurable initial values for RDMA transport
 107  * The actual values used may be lower and are limited to hardware capabilities
 108  */
 109 /* Default maximum number of SGEs in a RDMA write/read */
 110 int smbd_max_frmr_depth = 2048;
 111
 112 /* If payload is less than this byte, use RDMA send/recv not read/write */
 113 int rdma_readwrite_threshold = 4096;
 114
 115 /* Transport logging functions
 116  * Logging are defined as classes. They can be OR'ed to define the actual
 117  * logging level via module parameter smbd_logging_class
 118  * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
 119  * log_rdma_event()
 120  */
 121 #define LOG_OUTGOING                    0x1
 122 #define LOG_INCOMING                    0x2
 123 #define LOG_READ                        0x4
 124 #define LOG_WRITE                       0x8
 125 #define LOG_RDMA_SEND                   0x10
 126 #define LOG_RDMA_RECV                   0x20
 127 #define LOG_KEEP_ALIVE                  0x40
 128 #define LOG_RDMA_EVENT                  0x80
 129 #define LOG_RDMA_MR                     0x100
 130 static unsigned int smbd_logging_class;
 131 module_param(smbd_logging_class, uint, 0644);
 132 MODULE_PARM_DESC(smbd_logging_class,
 133         "Logging class for SMBD transport 0x0 to 0x100");
 134
 135 #define ERR             0x0
 136 #define INFO            0x1
 137 static unsigned int smbd_logging_level = ERR;
 138 module_param(smbd_logging_level, uint, 0644);
 139 MODULE_PARM_DESC(smbd_logging_level,
 140         "Logging level for SMBD transport, 0 (default): error, 1: info");
 141
 142 #define log_rdma(level, class, fmt, args...)                            \
 143 do {                                                                    \
 144         if (level <= smbd_logging_level || class & smbd_logging_class)  \
 145                 cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
 146 } while (0)
 147
 148 #define log_outgoing(level, fmt, args...) \
 149                 log_rdma(level, LOG_OUTGOING, fmt, ##args)
 150 #define log_incoming(level, fmt, args...) \
 151                 log_rdma(level, LOG_INCOMING, fmt, ##args)
 152 #define log_read(level, fmt, args...)   log_rdma(level, LOG_READ, fmt, ##args)
 153 #define log_write(level, fmt, args...)  log_rdma(level, LOG_WRITE, fmt, ##args)
 154 #define log_rdma_send(level, fmt, args...) \
 155                 log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
 156 #define log_rdma_recv(level, fmt, args...) \
 157                 log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
 158 #define log_keep_alive(level, fmt, args...) \
 159                 log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
 160 #define log_rdma_event(level, fmt, args...) \
 161                 log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
 162 #define log_rdma_mr(level, fmt, args...) \
 163                 log_rdma(level, LOG_RDMA_MR, fmt, ##args)
 164
 165 /*
 166  * Destroy the transport and related RDMA and memory resources
 167  * Need to go through all the pending counters and make sure on one is using
 168  * the transport while it is destroyed
 169  */
 170 static void smbd_destroy_rdma_work(struct work_struct *work)
 171 {
 172         struct smbd_response *response;
 173         struct smbd_connection *info =
 174                 container_of(work, struct smbd_connection, destroy_work);
 175         unsigned long flags;
 176
 177         log_rdma_event(INFO, "destroying qp\n");
 178         ib_drain_qp(info->id->qp);
 179         rdma_destroy_qp(info->id);
 180
 181         /* Unblock all I/O waiting on the send queue */
 182         wake_up_interruptible_all(&info->wait_send_queue);
 183
 184         log_rdma_event(INFO, "cancelling idle timer\n");
 185         cancel_delayed_work_sync(&info->idle_timer_work);
 186         log_rdma_event(INFO, "cancelling send immediate work\n");
 187         cancel_delayed_work_sync(&info->send_immediate_work);
 188
 189         log_rdma_event(INFO, "wait for all send to finish\n");
 190         wait_event(info->wait_smbd_send_pending,
 191                 info->smbd_send_pending == 0);
 192
 193         log_rdma_event(INFO, "wait for all recv to finish\n");
 194         wake_up_interruptible(&info->wait_reassembly_queue);
 195         wait_event(info->wait_smbd_recv_pending,
 196                 info->smbd_recv_pending == 0);
 197
 198         log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
 199         wait_event(info->wait_send_pending,
 200                 atomic_read(&info->send_pending) == 0);
 201         wait_event(info->wait_send_payload_pending,
 202                 atomic_read(&info->send_payload_pending) == 0);
 203
 204         log_rdma_event(INFO, "freeing mr list\n");
 205         wake_up_interruptible_all(&info->wait_mr);
 206         wait_event(info->wait_for_mr_cleanup,
 207                 atomic_read(&info->mr_used_count) == 0);
 208         destroy_mr_list(info);
 209
 210         /* It's not posssible for upper layer to get to reassembly */
 211         log_rdma_event(INFO, "drain the reassembly queue\n");
 212         do {
 213                 spin_lock_irqsave(&info->reassembly_queue_lock, flags);
 214                 response = _get_first_reassembly(info);
 215                 if (response) {
 216                         list_del(&response->list);
 217                         spin_unlock_irqrestore(
 218                                 &info->reassembly_queue_lock, flags);
 219                         put_receive_buffer(info, response);
 220                 } else
 221                         spin_unlock_irqrestore(&info->reassembly_queue_lock, flags);
 222         } while (response);
 223
 224         info->reassembly_data_length = 0;
 225
 226         log_rdma_event(INFO, "free receive buffers\n");
 227         wait_event(info->wait_receive_queues,
 228                 info->count_receive_queue + info->count_empty_packet_queue
 229                         == info->receive_credit_max);
 230         destroy_receive_buffers(info);
 231
 232         ib_free_cq(info->send_cq);
 233         ib_free_cq(info->recv_cq);
 234         ib_dealloc_pd(info->pd);
 235         rdma_destroy_id(info->id);
 236
 237         /* free mempools */
 238         mempool_destroy(info->request_mempool);
 239         kmem_cache_destroy(info->request_cache);
 240
 241         mempool_destroy(info->response_mempool);
 242         kmem_cache_destroy(info->response_cache);
 243
 244         info->transport_status = SMBD_DESTROYED;
 245         wake_up_all(&info->wait_destroy);
 246 }
 247
 248 static int smbd_process_disconnected(struct smbd_connection *info)
 249 {
 250         schedule_work(&info->destroy_work);
 251         return 0;
 252 }
 253
 254 static void smbd_disconnect_rdma_work(struct work_struct *work)
 255 {
 256         struct smbd_connection *info =
 257                 container_of(work, struct smbd_connection, disconnect_work);
 258
 259         if (info->transport_status == SMBD_CONNECTED) {
 260                 info->transport_status = SMBD_DISCONNECTING;
 261                 rdma_disconnect(info->id);
 262         }
 263 }
 264
 265 static void smbd_disconnect_rdma_connection(struct smbd_connection *info)
 266 {
 267         queue_work(info->workqueue, &info->disconnect_work);
 268 }
 269
 270 /* Upcall from RDMA CM */
 271 static int smbd_conn_upcall(
 272                 struct rdma_cm_id *id, struct rdma_cm_event *event)
 273 {
 274         struct smbd_connection *info = id->context;
 275
 276         log_rdma_event(INFO, "event=%d status=%d\n",
 277                 event->event, event->status);
 278
 279         switch (event->event) {
 280         case RDMA_CM_EVENT_ADDR_RESOLVED:
 281         case RDMA_CM_EVENT_ROUTE_RESOLVED:
 282                 info->ri_rc = 0;
 283                 complete(&info->ri_done);
 284                 break;
 285
 286         case RDMA_CM_EVENT_ADDR_ERROR:
 287                 info->ri_rc = -EHOSTUNREACH;
 288                 complete(&info->ri_done);
 289                 break;
 290
 291         case RDMA_CM_EVENT_ROUTE_ERROR:
 292                 info->ri_rc = -ENETUNREACH;
 293                 complete(&info->ri_done);
 294                 break;
 295
 296         case RDMA_CM_EVENT_ESTABLISHED:
 297                 log_rdma_event(INFO, "connected event=%d\n", event->event);
 298                 info->transport_status = SMBD_CONNECTED;
 299                 wake_up_interruptible(&info->conn_wait);
 300                 break;
 301
 302         case RDMA_CM_EVENT_CONNECT_ERROR:
 303         case RDMA_CM_EVENT_UNREACHABLE:
 304         case RDMA_CM_EVENT_REJECTED:
 305                 log_rdma_event(INFO, "connecting failed event=%d\n", event->event);
 306                 info->transport_status = SMBD_DISCONNECTED;
 307                 wake_up_interruptible(&info->conn_wait);
 308                 break;
 309
 310         case RDMA_CM_EVENT_DEVICE_REMOVAL:
 311         case RDMA_CM_EVENT_DISCONNECTED:
 312                 /* This happenes when we fail the negotiation */
 313                 if (info->transport_status == SMBD_NEGOTIATE_FAILED) {
 314                         info->transport_status = SMBD_DISCONNECTED;
 315                         wake_up(&info->conn_wait);
 316                         break;
 317                 }
 318
 319                 info->transport_status = SMBD_DISCONNECTED;
 320                 smbd_process_disconnected(info);
 321                 break;
 322
 323         default:
 324                 break;
 325         }
 326
 327         return 0;
 328 }
 329
 330 /* Upcall from RDMA QP */
 331 static void
 332 smbd_qp_async_error_upcall(struct ib_event *event, void *context)
 333 {
 334         struct smbd_connection *info = context;
 335
 336         log_rdma_event(ERR, "%s on device %s info %p\n",
 337                 ib_event_msg(event->event), event->device->name, info);
 338
 339         switch (event->event) {
 340         case IB_EVENT_CQ_ERR:
 341         case IB_EVENT_QP_FATAL:
 342                 smbd_disconnect_rdma_connection(info);
 343
 344         default:
 345                 break;
 346         }
 347 }
 348
 349 static inline void *smbd_request_payload(struct smbd_request *request)
 350 {
 351         return (void *)request->packet;
 352 }
 353
 354 static inline void *smbd_response_payload(struct smbd_response *response)
 355 {
 356         return (void *)response->packet;
 357 }
 358
 359 /* Called when a RDMA send is done */
 360 static void send_done(struct ib_cq *cq, struct ib_wc *wc)
 361 {
 362         int i;
 363         struct smbd_request *request =
 364                 container_of(wc->wr_cqe, struct smbd_request, cqe);
 365
 366         log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n",
 367                 request, wc->status);
 368
 369         if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
 370                 log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n",
 371                         wc->status, wc->opcode);
 372                 smbd_disconnect_rdma_connection(request->info);
 373         }
 374
 375         for (i = 0; i < request->num_sge; i++)
 376                 ib_dma_unmap_single(request->info->id->device,
 377                         request->sge[i].addr,
 378                         request->sge[i].length,
 379                         DMA_TO_DEVICE);
 380
 381         if (request->has_payload) {
 382                 if (atomic_dec_and_test(&request->info->send_payload_pending))
 383                         wake_up(&request->info->wait_send_payload_pending);
 384         } else {
 385                 if (atomic_dec_and_test(&request->info->send_pending))
 386                         wake_up(&request->info->wait_send_pending);
 387         }
 388
 389         mempool_free(request, request->info->request_mempool);
 390 }
 391
 392 static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp)
 393 {
 394         log_rdma_event(INFO, "resp message min_version %u max_version %u "
 395                 "negotiated_version %u credits_requested %u "
 396                 "credits_granted %u status %u max_readwrite_size %u "
 397                 "preferred_send_size %u max_receive_size %u "
 398                 "max_fragmented_size %u\n",
 399                 resp->min_version, resp->max_version, resp->negotiated_version,
 400                 resp->credits_requested, resp->credits_granted, resp->status,
 401                 resp->max_readwrite_size, resp->preferred_send_size,
 402                 resp->max_receive_size, resp->max_fragmented_size);
 403 }
 404
 405 /*
 406  * Process a negotiation response message, according to [MS-SMBD]3.1.5.7
 407  * response, packet_length: the negotiation response message
 408  * return value: true if negotiation is a success, false if failed
 409  */
 410 static bool process_negotiation_response(
 411                 struct smbd_response *response, int packet_length)
 412 {
 413         struct smbd_connection *info = response->info;
 414         struct smbd_negotiate_resp *packet = smbd_response_payload(response);
 415
 416         if (packet_length < sizeof(struct smbd_negotiate_resp)) {
 417                 log_rdma_event(ERR,
 418                         "error: packet_length=%d\n", packet_length);
 419                 return false;
 420         }
 421
 422         if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) {
 423                 log_rdma_event(ERR, "error: negotiated_version=%x\n",
 424                         le16_to_cpu(packet->negotiated_version));
 425                 return false;
 426         }
 427         info->protocol = le16_to_cpu(packet->negotiated_version);
 428
 429         if (packet->credits_requested == 0) {
 430                 log_rdma_event(ERR, "error: credits_requested==0\n");
 431                 return false;
 432         }
 433         info->receive_credit_target = le16_to_cpu(packet->credits_requested);
 434
 435         if (packet->credits_granted == 0) {
 436                 log_rdma_event(ERR, "error: credits_granted==0\n");
 437                 return false;
 438         }
 439         atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted));
 440
 441         atomic_set(&info->receive_credits, 0);
 442
 443         if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) {
 444                 log_rdma_event(ERR, "error: preferred_send_size=%d\n",
 445                         le32_to_cpu(packet->preferred_send_size));
 446                 return false;
 447         }
 448         info->max_receive_size = le32_to_cpu(packet->preferred_send_size);
 449
 450         if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
 451                 log_rdma_event(ERR, "error: max_receive_size=%d\n",
 452                         le32_to_cpu(packet->max_receive_size));
 453                 return false;
 454         }
 455         info->max_send_size = min_t(int, info->max_send_size,
 456                                         le32_to_cpu(packet->max_receive_size));
 457
 458         if (le32_to_cpu(packet->max_fragmented_size) <
 459                         SMBD_MIN_FRAGMENTED_SIZE) {
 460                 log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
 461                         le32_to_cpu(packet->max_fragmented_size));
 462                 return false;
 463         }
 464         info->max_fragmented_send_size =
 465                 le32_to_cpu(packet->max_fragmented_size);
 466         info->rdma_readwrite_threshold =
 467                 rdma_readwrite_threshold > info->max_fragmented_send_size ?
 468                 info->max_fragmented_send_size :
 469                 rdma_readwrite_threshold;
 470
 471
 472         info->max_readwrite_size = min_t(u32,
 473                         le32_to_cpu(packet->max_readwrite_size),
 474                         info->max_frmr_depth * PAGE_SIZE);
 475         info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE;
 476
 477         return true;
 478 }
 479
 480 /*
 481  * Check and schedule to send an immediate packet
 482  * This is used to extend credtis to remote peer to keep the transport busy
 483  */
 484 static void check_and_send_immediate(struct smbd_connection *info)
 485 {
 486         if (info->transport_status != SMBD_CONNECTED)
 487                 return;
 488
 489         info->send_immediate = true;
 490
 491         /*
 492          * Promptly send a packet if our peer is running low on receive
 493          * credits
 494          */
 495         if (atomic_read(&info->receive_credits) <
 496                 info->receive_credit_target - 1)
 497                 queue_delayed_work(
 498                         info->workqueue, &info->send_immediate_work, 0);
 499 }
 500
 501 static void smbd_post_send_credits(struct work_struct *work)
 502 {
 503         int ret = 0;
 504         int use_receive_queue = 1;
 505         int rc;
 506         struct smbd_response *response;
 507         struct smbd_connection *info =
 508                 container_of(work, struct smbd_connection,
 509                         post_send_credits_work);
 510
 511         if (info->transport_status != SMBD_CONNECTED) {
 512                 wake_up(&info->wait_receive_queues);
 513                 return;
 514         }
 515
 516         if (info->receive_credit_target >
 517                 atomic_read(&info->receive_credits)) {
 518                 while (true) {
 519                         if (use_receive_queue)
 520                                 response = get_receive_buffer(info);
 521                         else
 522                                 response = get_empty_queue_buffer(info);
 523                         if (!response) {
 524                                 /* now switch to emtpy packet queue */
 525                                 if (use_receive_queue) {
 526                                         use_receive_queue = 0;
 527                                         continue;
 528                                 } else
 529                                         break;
 530                         }
 531
 532                         response->type = SMBD_TRANSFER_DATA;
 533                         response->first_segment = false;
 534                         rc = smbd_post_recv(info, response);
 535                         if (rc) {
 536                                 log_rdma_recv(ERR,
 537                                         "post_recv failed rc=%d\n", rc);
 538                                 put_receive_buffer(info, response);
 539                                 break;
 540                         }
 541
 542                         ret++;
 543                 }
 544         }
 545
 546         spin_lock(&info->lock_new_credits_offered);
 547         info->new_credits_offered += ret;
 548         spin_unlock(&info->lock_new_credits_offered);
 549
 550         atomic_add(ret, &info->receive_credits);
 551
 552         /* Check if we can post new receive and grant credits to peer */
 553         check_and_send_immediate(info);
 554 }
 555
 556 static void smbd_recv_done_work(struct work_struct *work)
 557 {
 558         struct smbd_connection *info =
 559                 container_of(work, struct smbd_connection, recv_done_work);
 560
 561         /*
 562          * We may have new send credits granted from remote peer
 563          * If any sender is blcoked on lack of credets, unblock it
 564          */
 565         if (atomic_read(&info->send_credits))
 566                 wake_up_interruptible(&info->wait_send_queue);
 567
 568         /*
 569          * Check if we need to send something to remote peer to
 570          * grant more credits or respond to KEEP_ALIVE packet
 571          */
 572         check_and_send_immediate(info);
 573 }
 574
 575 /* Called from softirq, when recv is done */
 576 static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
 577 {
 578         struct smbd_data_transfer *data_transfer;
 579         struct smbd_response *response =
 580                 container_of(wc->wr_cqe, struct smbd_response, cqe);
 581         struct smbd_connection *info = response->info;
 582         int data_length = 0;
 583
 584         log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d "
 585                       "byte_len=%d pkey_index=%x\n",
 586                 response, response->type, wc->status, wc->opcode,
 587                 wc->byte_len, wc->pkey_index);
 588
 589         if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
 590                 log_rdma_recv(INFO, "wc->status=%d opcode=%d\n",
 591                         wc->status, wc->opcode);
 592                 smbd_disconnect_rdma_connection(info);
 593                 goto error;
 594         }
 595
 596         ib_dma_sync_single_for_cpu(
 597                 wc->qp->device,
 598                 response->sge.addr,
 599                 response->sge.length,
 600                 DMA_FROM_DEVICE);
 601
 602         switch (response->type) {
 603         /* SMBD negotiation response */
 604         case SMBD_NEGOTIATE_RESP:
 605                 dump_smbd_negotiate_resp(smbd_response_payload(response));
 606                 info->full_packet_received = true;
 607                 info->negotiate_done =
 608                         process_negotiation_response(response, wc->byte_len);
 609                 complete(&info->negotiate_completion);
 610                 break;
 611
 612         /* SMBD data transfer packet */
 613         case SMBD_TRANSFER_DATA:
 614                 data_transfer = smbd_response_payload(response);
 615                 data_length = le32_to_cpu(data_transfer->data_length);
 616
 617                 /*
 618                  * If this is a packet with data playload place the data in
 619                  * reassembly queue and wake up the reading thread
 620                  */
 621                 if (data_length) {
 622                         if (info->full_packet_received)
 623                                 response->first_segment = true;
 624
 625                         if (le32_to_cpu(data_transfer->remaining_data_length))
 626                                 info->full_packet_received = false;
 627                         else
 628                                 info->full_packet_received = true;
 629
 630                         enqueue_reassembly(
 631                                 info,
 632                                 response,
 633                                 data_length);
 634                 } else
 635                         put_empty_packet(info, response);
 636
 637                 if (data_length)
 638                         wake_up_interruptible(&info->wait_reassembly_queue);
 639
 640                 atomic_dec(&info->receive_credits);
 641                 info->receive_credit_target =
 642                         le16_to_cpu(data_transfer->credits_requested);
 643                 atomic_add(le16_to_cpu(data_transfer->credits_granted),
 644                         &info->send_credits);
 645
 646                 log_incoming(INFO, "data flags %d data_offset %d "
 647                         "data_length %d remaining_data_length %d\n",
 648                         le16_to_cpu(data_transfer->flags),
 649                         le32_to_cpu(data_transfer->data_offset),
 650                         le32_to_cpu(data_transfer->data_length),
 651                         le32_to_cpu(data_transfer->remaining_data_length));
 652
 653                 /* Send a KEEP_ALIVE response right away if requested */
 654                 info->keep_alive_requested = KEEP_ALIVE_NONE;
 655                 if (le16_to_cpu(data_transfer->flags) &
 656                                 SMB_DIRECT_RESPONSE_REQUESTED) {
 657                         info->keep_alive_requested = KEEP_ALIVE_PENDING;
 658                 }
 659
 660                 queue_work(info->workqueue, &info->recv_done_work);
 661                 return;
 662
 663         default:
 664                 log_rdma_recv(ERR,
 665                         "unexpected response type=%d\n", response->type);
 666         }
 667
 668 error:
 669         put_receive_buffer(info, response);
 670 }
 671
 672 static struct rdma_cm_id *smbd_create_id(
 673                 struct smbd_connection *info,
 674                 struct sockaddr *dstaddr, int port)
 675 {
 676         struct rdma_cm_id *id;
 677         int rc;
 678         __be16 *sport;
 679
 680         id = rdma_create_id(&init_net, smbd_conn_upcall, info,
 681                 RDMA_PS_TCP, IB_QPT_RC);
 682         if (IS_ERR(id)) {
 683                 rc = PTR_ERR(id);
 684                 log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
 685                 return id;
 686         }
 687
 688         if (dstaddr->sa_family == AF_INET6)
 689                 sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
 690         else
 691                 sport = &((struct sockaddr_in *)dstaddr)->sin_port;
 692
 693         *sport = htons(port);
 694
 695         init_completion(&info->ri_done);
 696         info->ri_rc = -ETIMEDOUT;
 697
 698         rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
 699                 RDMA_RESOLVE_TIMEOUT);
 700         if (rc) {
 701                 log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
 702                 goto out;
 703         }
 704         wait_for_completion_interruptible_timeout(
 705                 &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
 706         rc = info->ri_rc;
 707         if (rc) {
 708                 log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
 709                 goto out;
 710         }
 711
 712         info->ri_rc = -ETIMEDOUT;
 713         rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
 714         if (rc) {
 715                 log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
 716                 goto out;
 717         }
 718         wait_for_completion_interruptible_timeout(
 719                 &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
 720         rc = info->ri_rc;
 721         if (rc) {
 722                 log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
 723                 goto out;
 724         }
 725
 726         return id;
 727
 728 out:
 729         rdma_destroy_id(id);
 730         return ERR_PTR(rc);
 731 }
 732
 733 /*
 734  * Test if FRWR (Fast Registration Work Requests) is supported on the device
 735  * This implementation requries FRWR on RDMA read/write
 736  * return value: true if it is supported
 737  */
 738 static bool frwr_is_supported(struct ib_device_attr *attrs)
 739 {
 740         if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
 741                 return false;
 742         if (attrs->max_fast_reg_page_list_len == 0)
 743                 return false;
 744         return true;
 745 }
 746
 747 static int smbd_ia_open(
 748                 struct smbd_connection *info,
 749                 struct sockaddr *dstaddr, int port)
 750 {
 751         int rc;
 752
 753         info->id = smbd_create_id(info, dstaddr, port);
 754         if (IS_ERR(info->id)) {
 755                 rc = PTR_ERR(info->id);
 756                 goto out1;
 757         }
 758
 759         if (!frwr_is_supported(&info->id->device->attrs)) {
 760                 log_rdma_event(ERR,
 761                         "Fast Registration Work Requests "
 762                         "(FRWR) is not supported\n");
 763                 log_rdma_event(ERR,
 764                         "Device capability flags = %llx "
 765                         "max_fast_reg_page_list_len = %u\n",
 766                         info->id->device->attrs.device_cap_flags,
 767                         info->id->device->attrs.max_fast_reg_page_list_len);
 768                 rc = -EPROTONOSUPPORT;
 769                 goto out2;
 770         }
 771         info->max_frmr_depth = min_t(int,
 772                 smbd_max_frmr_depth,
 773                 info->id->device->attrs.max_fast_reg_page_list_len);
 774         info->mr_type = IB_MR_TYPE_MEM_REG;
 775         if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
 776                 info->mr_type = IB_MR_TYPE_SG_GAPS;
 777
 778         info->pd = ib_alloc_pd(info->id->device, 0);
 779         if (IS_ERR(info->pd)) {
 780                 rc = PTR_ERR(info->pd);
 781                 log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
 782                 goto out2;
 783         }
 784
 785         return 0;
 786
 787 out2:
 788         rdma_destroy_id(info->id);
 789         info->id = NULL;
 790
 791 out1:
 792         return rc;
 793 }
 794
 795 /*
 796  * Send a negotiation request message to the peer
 797  * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
 798  * After negotiation, the transport is connected and ready for
 799  * carrying upper layer SMB payload
 800  */
 801 static int smbd_post_send_negotiate_req(struct smbd_connection *info)
 802 {
 803         struct ib_send_wr send_wr, *send_wr_fail;
 804         int rc = -ENOMEM;
 805         struct smbd_request *request;
 806         struct smbd_negotiate_req *packet;
 807
 808         request = mempool_alloc(info->request_mempool, GFP_KERNEL);
 809         if (!request)
 810                 return rc;
 811
 812         request->info = info;
 813
 814         packet = smbd_request_payload(request);
 815         packet->min_version = cpu_to_le16(SMBD_V1);
 816         packet->max_version = cpu_to_le16(SMBD_V1);
 817         packet->reserved = 0;
 818         packet->credits_requested = cpu_to_le16(info->send_credit_target);
 819         packet->preferred_send_size = cpu_to_le32(info->max_send_size);
 820         packet->max_receive_size = cpu_to_le32(info->max_receive_size);
 821         packet->max_fragmented_size =
 822                 cpu_to_le32(info->max_fragmented_recv_size);
 823
 824         request->num_sge = 1;
 825         request->sge[0].addr = ib_dma_map_single(
 826                                 info->id->device, (void *)packet,
 827                                 sizeof(*packet), DMA_TO_DEVICE);
 828         if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
 829                 rc = -EIO;
 830                 goto dma_mapping_failed;
 831         }
 832
 833         request->sge[0].length = sizeof(*packet);
 834         request->sge[0].lkey = info->pd->local_dma_lkey;
 835
 836         ib_dma_sync_single_for_device(
 837                 info->id->device, request->sge[0].addr,
 838                 request->sge[0].length, DMA_TO_DEVICE);
 839
 840         request->cqe.done = send_done;
 841
 842         send_wr.next = NULL;
 843         send_wr.wr_cqe = &request->cqe;
 844         send_wr.sg_list = request->sge;
 845         send_wr.num_sge = request->num_sge;
 846         send_wr.opcode = IB_WR_SEND;
 847         send_wr.send_flags = IB_SEND_SIGNALED;
 848
 849         log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n",
 850                 request->sge[0].addr,
 851                 request->sge[0].length, request->sge[0].lkey);
 852
 853         request->has_payload = false;
 854         atomic_inc(&info->send_pending);
 855         rc = ib_post_send(info->id->qp, &send_wr, &send_wr_fail);
 856         if (!rc)
 857                 return 0;
 858
 859         /* if we reach here, post send failed */
 860         log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
 861         atomic_dec(&info->send_pending);
 862         ib_dma_unmap_single(info->id->device, request->sge[0].addr,
 863                 request->sge[0].length, DMA_TO_DEVICE);
 864
 865 dma_mapping_failed:
 866         mempool_free(request, info->request_mempool);
 867         return rc;
 868 }
 869
 870 /*
 871  * Extend the credits to remote peer
 872  * This implements [MS-SMBD] 3.1.5.9
 873  * The idea is that we should extend credits to remote peer as quickly as
 874  * it's allowed, to maintain data flow. We allocate as much receive
 875  * buffer as possible, and extend the receive credits to remote peer
 876  * return value: the new credtis being granted.
 877  */
 878 static int manage_credits_prior_sending(struct smbd_connection *info)
 879 {
 880         int new_credits;
 881
 882         spin_lock(&info->lock_new_credits_offered);
 883         new_credits = info->new_credits_offered;
 884         info->new_credits_offered = 0;
 885         spin_unlock(&info->lock_new_credits_offered);
 886
 887         return new_credits;
 888 }
 889
 890 /*
 891  * Check if we need to send a KEEP_ALIVE message
 892  * The idle connection timer triggers a KEEP_ALIVE message when expires
 893  * SMB_DIRECT_RESPONSE_REQUESTED is set in the message flag to have peer send
 894  * back a response.
 895  * return value:
 896  * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set
 897  * 0: otherwise
 898  */
 899 static int manage_keep_alive_before_sending(struct smbd_connection *info)
 900 {
 901         if (info->keep_alive_requested == KEEP_ALIVE_PENDING) {
 902                 info->keep_alive_requested = KEEP_ALIVE_SENT;
 903                 return 1;
 904         }
 905         return 0;
 906 }
 907
 908 /*
 909  * Build and prepare the SMBD packet header
 910  * This function waits for avaialbe send credits and build a SMBD packet
 911  * header. The caller then optional append payload to the packet after
 912  * the header
 913  * intput values
 914  * size: the size of the payload
 915  * remaining_data_length: remaining data to send if this is part of a
 916  * fragmented packet
 917  * output values
 918  * request_out: the request allocated from this function
 919  * return values: 0 on success, otherwise actual error code returned
 920  */
 921 static int smbd_create_header(struct smbd_connection *info,
 922                 int size, int remaining_data_length,
 923                 struct smbd_request **request_out)
 924 {
 925         struct smbd_request *request;
 926         struct smbd_data_transfer *packet;
 927         int header_length;
 928         int rc;
 929
 930         /* Wait for send credits. A SMBD packet needs one credit */
 931         rc = wait_event_interruptible(info->wait_send_queue,
 932                 atomic_read(&info->send_credits) > 0 ||
 933                 info->transport_status != SMBD_CONNECTED);
 934         if (rc)
 935                 return rc;
 936
 937         if (info->transport_status != SMBD_CONNECTED) {
 938                 log_outgoing(ERR, "disconnected not sending\n");
 939                 return -ENOENT;
 940         }
 941         atomic_dec(&info->send_credits);
 942
 943         request = mempool_alloc(info->request_mempool, GFP_KERNEL);
 944         if (!request) {
 945                 rc = -ENOMEM;
 946                 goto err;
 947         }
 948
 949         request->info = info;
 950
 951         /* Fill in the packet header */
 952         packet = smbd_request_payload(request);
 953         packet->credits_requested = cpu_to_le16(info->send_credit_target);
 954         packet->credits_granted =
 955                 cpu_to_le16(manage_credits_prior_sending(info));
 956         info->send_immediate = false;
 957
 958         packet->flags = 0;
 959         if (manage_keep_alive_before_sending(info))
 960                 packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);
 961
 962         packet->reserved = 0;
 963         if (!size)
 964                 packet->data_offset = 0;
 965         else
 966                 packet->data_offset = cpu_to_le32(24);
 967         packet->data_length = cpu_to_le32(size);
 968         packet->remaining_data_length = cpu_to_le32(remaining_data_length);
 969         packet->padding = 0;
 970
 971         log_outgoing(INFO, "credits_requested=%d credits_granted=%d "
 972                 "data_offset=%d data_length=%d remaining_data_length=%d\n",
 973                 le16_to_cpu(packet->credits_requested),
 974                 le16_to_cpu(packet->credits_granted),
 975                 le32_to_cpu(packet->data_offset),
 976                 le32_to_cpu(packet->data_length),
 977                 le32_to_cpu(packet->remaining_data_length));
 978
 979         /* Map the packet to DMA */
 980         header_length = sizeof(struct smbd_data_transfer);
 981         /* If this is a packet without payload, don't send padding */
 982         if (!size)
 983                 header_length = offsetof(struct smbd_data_transfer, padding);
 984
 985         request->num_sge = 1;
 986         request->sge[0].addr = ib_dma_map_single(info->id->device,
 987                                                  (void *)packet,
 988                                                  header_length,
 989                                                  DMA_BIDIRECTIONAL);
 990         if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
 991                 mempool_free(request, info->request_mempool);
 992                 rc = -EIO;
 993                 goto err;
 994         }
 995
 996         request->sge[0].length = header_length;
 997         request->sge[0].lkey = info->pd->local_dma_lkey;
 998
 999         *request_out = request;
1000         return 0;
1001
1002 err:
1003         atomic_inc(&info->send_credits);
1004         return rc;
1005 }
1006
1007 static void smbd_destroy_header(struct smbd_connection *info,
1008                 struct smbd_request *request)
1009 {
1010
1011         ib_dma_unmap_single(info->id->device,
1012                             request->sge[0].addr,
1013                             request->sge[0].length,
1014                             DMA_TO_DEVICE);
1015         mempool_free(request, info->request_mempool);
1016         atomic_inc(&info->send_credits);
1017 }
1018
1019 /* Post the send request */
1020 static int smbd_post_send(struct smbd_connection *info,
1021                 struct smbd_request *request, bool has_payload)
1022 {
1023         struct ib_send_wr send_wr, *send_wr_fail;
1024         int rc, i;
1025
1026         for (i = 0; i < request->num_sge; i++) {
1027                 log_rdma_send(INFO,
1028                         "rdma_request sge[%d] addr=%llu legnth=%u\n",
1029                         i, request->sge[0].addr, request->sge[0].length);
1030                 ib_dma_sync_single_for_device(
1031                         info->id->device,
1032                         request->sge[i].addr,
1033                         request->sge[i].length,
1034                         DMA_TO_DEVICE);
1035         }
1036
1037         request->cqe.done = send_done;
1038
1039         send_wr.next = NULL;
1040         send_wr.wr_cqe = &request->cqe;
1041         send_wr.sg_list = request->sge;
1042         send_wr.num_sge = request->num_sge;
1043         send_wr.opcode = IB_WR_SEND;
1044         send_wr.send_flags = IB_SEND_SIGNALED;
1045
1046         if (has_payload) {
1047                 request->has_payload = true;
1048                 atomic_inc(&info->send_payload_pending);
1049         } else {
1050                 request->has_payload = false;
1051                 atomic_inc(&info->send_pending);
1052         }
1053
1054         rc = ib_post_send(info->id->qp, &send_wr, &send_wr_fail);
1055         if (rc) {
1056                 log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
1057                 if (has_payload) {
1058                         if (atomic_dec_and_test(&info->send_payload_pending))
1059                                 wake_up(&info->wait_send_payload_pending);
1060                 } else {
1061                         if (atomic_dec_and_test(&info->send_pending))
1062                                 wake_up(&info->wait_send_pending);
1063                 }
1064         } else
1065                 /* Reset timer for idle connection after packet is sent */
1066                 mod_delayed_work(info->workqueue, &info->idle_timer_work,
1067                         info->keep_alive_interval*HZ);
1068
1069         return rc;
1070 }
1071
1072 static int smbd_post_send_sgl(struct smbd_connection *info,
1073         struct scatterlist *sgl, int data_length, int remaining_data_length)
1074 {
1075         int num_sgs;
1076         int i, rc;
1077         struct smbd_request *request;
1078         struct scatterlist *sg;
1079
1080         rc = smbd_create_header(
1081                 info, data_length, remaining_data_length, &request);
1082         if (rc)
1083                 return rc;
1084
1085         num_sgs = sgl ? sg_nents(sgl) : 0;
1086         for_each_sg(sgl, sg, num_sgs, i) {
1087                 request->sge[i+1].addr =
1088                         ib_dma_map_page(info->id->device, sg_page(sg),
1089                                sg->offset, sg->length, DMA_BIDIRECTIONAL);
1090                 if (ib_dma_mapping_error(
1091                                 info->id->device, request->sge[i+1].addr)) {
1092                         rc = -EIO;
1093                         request->sge[i+1].addr = 0;
1094                         goto dma_mapping_failure;
1095                 }
1096                 request->sge[i+1].length = sg->length;
1097                 request->sge[i+1].lkey = info->pd->local_dma_lkey;
1098                 request->num_sge++;
1099         }
1100
1101         rc = smbd_post_send(info, request, data_length);
1102         if (!rc)
1103                 return 0;
1104
1105 dma_mapping_failure:
1106         for (i = 1; i < request->num_sge; i++)
1107                 if (request->sge[i].addr)
1108                         ib_dma_unmap_single(info->id->device,
1109                                             request->sge[i].addr,
1110                                             request->sge[i].length,
1111                                             DMA_TO_DEVICE);
1112         smbd_destroy_header(info, request);
1113         return rc;
1114 }
1115
1116 /*
1117  * Send a page
1118  * page: the page to send
1119  * offset: offset in the page to send
1120  * size: length in the page to send
1121  * remaining_data_length: remaining data to send in this payload
1122  */
1123 static int smbd_post_send_page(struct smbd_connection *info, struct page *page,
1124                 unsigned long offset, size_t size, int remaining_data_length)
1125 {
1126         struct scatterlist sgl;
1127
1128         sg_init_table(&sgl, 1);
1129         sg_set_page(&sgl, page, size, offset);
1130
1131         return smbd_post_send_sgl(info, &sgl, size, remaining_data_length);
1132 }
1133
1134 /*
1135  * Send an empty message
1136  * Empty message is used to extend credits to peer to for keep live
1137  * while there is no upper layer payload to send at the time
1138  */
1139 static int smbd_post_send_empty(struct smbd_connection *info)
1140 {
1141         info->count_send_empty++;
1142         return smbd_post_send_sgl(info, NULL, 0, 0);
1143 }
1144
1145 /*
1146  * Send a data buffer
1147  * iov: the iov array describing the data buffers
1148  * n_vec: number of iov array
1149  * remaining_data_length: remaining data to send following this packet
1150  * in segmented SMBD packet
1151  */
1152 static int smbd_post_send_data(
1153         struct smbd_connection *info, struct kvec *iov, int n_vec,
1154         int remaining_data_length)
1155 {
1156         int i;
1157         u32 data_length = 0;
1158         struct scatterlist sgl[SMBDIRECT_MAX_SGE];
1159
1160         if (n_vec > SMBDIRECT_MAX_SGE) {
1161                 cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
1162                 return -ENOMEM;
1163         }
1164
1165         sg_init_table(sgl, n_vec);
1166         for (i = 0; i < n_vec; i++) {
1167                 data_length += iov[i].iov_len;
1168                 sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len);
1169         }
1170
1171         return smbd_post_send_sgl(info, sgl, data_length, remaining_data_length);
1172 }
1173
1174 /*
1175  * Post a receive request to the transport
1176  * The remote peer can only send data when a receive request is posted
1177  * The interaction is controlled by send/receive credit system
1178  */
1179 static int smbd_post_recv(
1180                 struct smbd_connection *info, struct smbd_response *response)
1181 {
1182         struct ib_recv_wr recv_wr, *recv_wr_fail = NULL;
1183         int rc = -EIO;
1184
1185         response->sge.addr = ib_dma_map_single(
1186                                 info->id->device, response->packet,
1187                                 info->max_receive_size, DMA_FROM_DEVICE);
1188         if (ib_dma_mapping_error(info->id->device, response->sge.addr))
1189                 return rc;
1190
1191         response->sge.length = info->max_receive_size;
1192         response->sge.lkey = info->pd->local_dma_lkey;
1193
1194         response->cqe.done = recv_done;
1195
1196         recv_wr.wr_cqe = &response->cqe;
1197         recv_wr.next = NULL;
1198         recv_wr.sg_list = &response->sge;
1199         recv_wr.num_sge = 1;
1200
1201         rc = ib_post_recv(info->id->qp, &recv_wr, &recv_wr_fail);
1202         if (rc) {
1203                 ib_dma_unmap_single(info->id->device, response->sge.addr,
1204                                     response->sge.length, DMA_FROM_DEVICE);
1205
1206                 log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
1207         }
1208
1209         return rc;
1210 }
1211
1212 /* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
1213 static int smbd_negotiate(struct smbd_connection *info)
1214 {
1215         int rc;
1216         struct smbd_response *response = get_receive_buffer(info);
1217
1218         response->type = SMBD_NEGOTIATE_RESP;
1219         rc = smbd_post_recv(info, response);
1220         log_rdma_event(INFO,
1221                 "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x "
1222                 "iov.lkey=%x\n",
1223                 rc, response->sge.addr,
1224                 response->sge.length, response->sge.lkey);
1225         if (rc)
1226                 return rc;
1227
1228         init_completion(&info->negotiate_completion);
1229         info->negotiate_done = false;
1230         rc = smbd_post_send_negotiate_req(info);
1231         if (rc)
1232                 return rc;
1233
1234         rc = wait_for_completion_interruptible_timeout(
1235                 &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ);
1236         log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc);
1237
1238         if (info->negotiate_done)
1239                 return 0;
1240
1241         if (rc == 0)
1242                 rc = -ETIMEDOUT;
1243         else if (rc == -ERESTARTSYS)
1244                 rc = -EINTR;
1245         else
1246                 rc = -ENOTCONN;
1247
1248         return rc;
1249 }
1250
1251 static void put_empty_packet(
1252                 struct smbd_connection *info, struct smbd_response *response)
1253 {
1254         spin_lock(&info->empty_packet_queue_lock);
1255         list_add_tail(&response->list, &info->empty_packet_queue);
1256         info->count_empty_packet_queue++;
1257         spin_unlock(&info->empty_packet_queue_lock);
1258
1259         queue_work(info->workqueue, &info->post_send_credits_work);
1260 }
1261
1262 /*
1263  * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
1264  * This is a queue for reassembling upper layer payload and present to upper
1265  * layer. All the inncoming payload go to the reassembly queue, regardless of
1266  * if reassembly is required. The uuper layer code reads from the queue for all
1267  * incoming payloads.
1268  * Put a received packet to the reassembly queue
1269  * response: the packet received
1270  * data_length: the size of payload in this packet
1271  */
1272 static void enqueue_reassembly(
1273         struct smbd_connection *info,
1274         struct smbd_response *response,
1275         int data_length)
1276 {
1277         spin_lock(&info->reassembly_queue_lock);
1278         list_add_tail(&response->list, &info->reassembly_queue);
1279         info->reassembly_queue_length++;
1280         /*
1281          * Make sure reassembly_data_length is updated after list and
1282          * reassembly_queue_length are updated. On the dequeue side
1283          * reassembly_data_length is checked without a lock to determine
1284          * if reassembly_queue_length and list is up to date
1285          */
1286         virt_wmb();
1287         info->reassembly_data_length += data_length;
1288         spin_unlock(&info->reassembly_queue_lock);
1289         info->count_reassembly_queue++;
1290         info->count_enqueue_reassembly_queue++;
1291 }
1292
1293 /*
1294  * Get the first entry at the front of reassembly queue
1295  * Caller is responsible for locking
1296  * return value: the first entry if any, NULL if queue is empty
1297  */
1298 static struct smbd_response *_get_first_reassembly(struct smbd_connection *info)
1299 {
1300         struct smbd_response *ret = NULL;
1301
1302         if (!list_empty(&info->reassembly_queue)) {
1303                 ret = list_first_entry(
1304                         &info->reassembly_queue,
1305                         struct smbd_response, list);
1306         }
1307         return ret;
1308 }
1309
1310 static struct smbd_response *get_empty_queue_buffer(
1311                 struct smbd_connection *info)
1312 {
1313         struct smbd_response *ret = NULL;
1314         unsigned long flags;
1315
1316         spin_lock_irqsave(&info->empty_packet_queue_lock, flags);
1317         if (!list_empty(&info->empty_packet_queue)) {
1318                 ret = list_first_entry(
1319                         &info->empty_packet_queue,
1320                         struct smbd_response, list);
1321                 list_del(&ret->list);
1322                 info->count_empty_packet_queue--;
1323         }
1324         spin_unlock_irqrestore(&info->empty_packet_queue_lock, flags);
1325
1326         return ret;
1327 }
1328
1329 /*
1330  * Get a receive buffer
1331  * For each remote send, we need to post a receive. The receive buffers are
1332  * pre-allocated in advance.
1333  * return value: the receive buffer, NULL if none is available
1334  */
1335 static struct smbd_response *get_receive_buffer(struct smbd_connection *info)
1336 {
1337         struct smbd_response *ret = NULL;
1338         unsigned long flags;
1339
1340         spin_lock_irqsave(&info->receive_queue_lock, flags);
1341         if (!list_empty(&info->receive_queue)) {
1342                 ret = list_first_entry(
1343                         &info->receive_queue,
1344                         struct smbd_response, list);
1345                 list_del(&ret->list);
1346                 info->count_receive_queue--;
1347                 info->count_get_receive_buffer++;
1348         }
1349         spin_unlock_irqrestore(&info->receive_queue_lock, flags);
1350
1351         return ret;
1352 }
1353
1354 /*
1355  * Return a receive buffer
1356  * Upon returning of a receive buffer, we can post new receive and extend
1357  * more receive credits to remote peer. This is done immediately after a
1358  * receive buffer is returned.
1359  */
1360 static void put_receive_buffer(
1361         struct smbd_connection *info, struct smbd_response *response)
1362 {
1363         unsigned long flags;
1364
1365         ib_dma_unmap_single(info->id->device, response->sge.addr,
1366                 response->sge.length, DMA_FROM_DEVICE);
1367
1368         spin_lock_irqsave(&info->receive_queue_lock, flags);
1369         list_add_tail(&response->list, &info->receive_queue);
1370         info->count_receive_queue++;
1371         info->count_put_receive_buffer++;
1372         spin_unlock_irqrestore(&info->receive_queue_lock, flags);
1373
1374         queue_work(info->workqueue, &info->post_send_credits_work);
1375 }
1376
1377 /* Preallocate all receive buffer on transport establishment */
1378 static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
1379 {
1380         int i;
1381         struct smbd_response *response;
1382
1383         INIT_LIST_HEAD(&info->reassembly_queue);
1384         spin_lock_init(&info->reassembly_queue_lock);
1385         info->reassembly_data_length = 0;
1386         info->reassembly_queue_length = 0;
1387
1388         INIT_LIST_HEAD(&info->receive_queue);
1389         spin_lock_init(&info->receive_queue_lock);
1390         info->count_receive_queue = 0;
1391
1392         INIT_LIST_HEAD(&info->empty_packet_queue);
1393         spin_lock_init(&info->empty_packet_queue_lock);
1394         info->count_empty_packet_queue = 0;
1395
1396         init_waitqueue_head(&info->wait_receive_queues);
1397
1398         for (i = 0; i < num_buf; i++) {
1399                 response = mempool_alloc(info->response_mempool, GFP_KERNEL);
1400                 if (!response)
1401                         goto allocate_failed;
1402
1403                 response->info = info;
1404                 list_add_tail(&response->list, &info->receive_queue);
1405                 info->count_receive_queue++;
1406         }
1407
1408         return 0;
1409
1410 allocate_failed:
1411         while (!list_empty(&info->receive_queue)) {
1412                 response = list_first_entry(
1413                                 &info->receive_queue,
1414                                 struct smbd_response, list);
1415                 list_del(&response->list);
1416                 info->count_receive_queue--;
1417
1418                 mempool_free(response, info->response_mempool);
1419         }
1420         return -ENOMEM;
1421 }
1422
1423 static void destroy_receive_buffers(struct smbd_connection *info)
1424 {
1425         struct smbd_response *response;
1426
1427         while ((response = get_receive_buffer(info)))
1428                 mempool_free(response, info->response_mempool);
1429
1430         while ((response = get_empty_queue_buffer(info)))
1431                 mempool_free(response, info->response_mempool);
1432 }
1433
1434 /*
1435  * Check and send an immediate or keep alive packet
1436  * The condition to send those packets are defined in [MS-SMBD] 3.1.1.1
1437  * Connection.KeepaliveRequested and Connection.SendImmediate
1438  * The idea is to extend credits to server as soon as it becomes available
1439  */
1440 static void send_immediate_work(struct work_struct *work)
1441 {
1442         struct smbd_connection *info = container_of(
1443                                         work, struct smbd_connection,
1444                                         send_immediate_work.work);
1445
1446         if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
1447             info->send_immediate) {
1448                 log_keep_alive(INFO, "send an empty message\n");
1449                 smbd_post_send_empty(info);
1450         }
1451 }
1452
1453 /* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
1454 static void idle_connection_timer(struct work_struct *work)
1455 {
1456         struct smbd_connection *info = container_of(
1457                                         work, struct smbd_connection,
1458                                         idle_timer_work.work);
1459
1460         if (info->keep_alive_requested != KEEP_ALIVE_NONE) {
1461                 log_keep_alive(ERR,
1462                         "error status info->keep_alive_requested=%d\n",
1463                         info->keep_alive_requested);
1464                 smbd_disconnect_rdma_connection(info);
1465                 return;
1466         }
1467
1468         log_keep_alive(INFO, "about to send an empty idle message\n");
1469         smbd_post_send_empty(info);
1470
1471         /* Setup the next idle timeout work */
1472         queue_delayed_work(info->workqueue, &info->idle_timer_work,
1473                         info->keep_alive_interval*HZ);
1474 }
1475
1476 /* Destroy this SMBD connection, called from upper layer */
1477 void smbd_destroy(struct smbd_connection *info)
1478 {
1479         log_rdma_event(INFO, "destroying rdma session\n");
1480
1481         /* Kick off the disconnection process */
1482         smbd_disconnect_rdma_connection(info);
1483
1484         log_rdma_event(INFO, "wait for transport being destroyed\n");
1485         wait_event(info->wait_destroy,
1486                 info->transport_status == SMBD_DESTROYED);
1487
1488         destroy_workqueue(info->workqueue);
1489         kfree(info);
1490 }
1491
1492 /*
1493  * Reconnect this SMBD connection, called from upper layer
1494  * return value: 0 on success, or actual error code
1495  */
1496 int smbd_reconnect(struct TCP_Server_Info *server)
1497 {
1498         log_rdma_event(INFO, "reconnecting rdma session\n");
1499
1500         if (!server->smbd_conn) {
1501                 log_rdma_event(ERR, "rdma session already destroyed\n");
1502                 return -EINVAL;
1503         }
1504
1505         /*
1506          * This is possible if transport is disconnected and we haven't received
1507          * notification from RDMA, but upper layer has detected timeout
1508          */
1509         if (server->smbd_conn->transport_status == SMBD_CONNECTED) {
1510                 log_rdma_event(INFO, "disconnecting transport\n");
1511                 smbd_disconnect_rdma_connection(server->smbd_conn);
1512         }
1513
1514         /* wait until the transport is destroyed */
1515         wait_event(server->smbd_conn->wait_destroy,
1516                 server->smbd_conn->transport_status == SMBD_DESTROYED);
1517
1518         destroy_workqueue(server->smbd_conn->workqueue);
1519         kfree(server->smbd_conn);
1520
1521         log_rdma_event(INFO, "creating rdma session\n");
1522         server->smbd_conn = smbd_get_connection(
1523                 server, (struct sockaddr *) &server->dstaddr);
1524
1525         return server->smbd_conn ? 0 : -ENOENT;
1526 }
1527
1528 static void destroy_caches_and_workqueue(struct smbd_connection *info)
1529 {
1530         destroy_receive_buffers(info);
1531         destroy_workqueue(info->workqueue);
1532         mempool_destroy(info->response_mempool);
1533         kmem_cache_destroy(info->response_cache);
1534         mempool_destroy(info->request_mempool);
1535         kmem_cache_destroy(info->request_cache);
1536 }
1537
1538 #define MAX_NAME_LEN    80
1539 static int allocate_caches_and_workqueue(struct smbd_connection *info)
1540 {
1541         char name[MAX_NAME_LEN];
1542         int rc;
1543
1544         snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
1545         info->request_cache =
1546                 kmem_cache_create(
1547                         name,
1548                         sizeof(struct smbd_request) +
1549                                 sizeof(struct smbd_data_transfer),
1550                         0, SLAB_HWCACHE_ALIGN, NULL);
1551         if (!info->request_cache)
1552                 return -ENOMEM;
1553
1554         info->request_mempool =
1555                 mempool_create(info->send_credit_target, mempool_alloc_slab,
1556                         mempool_free_slab, info->request_cache);
1557         if (!info->request_mempool)
1558                 goto out1;
1559
1560         snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
1561         info->response_cache =
1562                 kmem_cache_create(
1563                         name,
1564                         sizeof(struct smbd_response) +
1565                                 info->max_receive_size,
1566                         0, SLAB_HWCACHE_ALIGN, NULL);
1567         if (!info->response_cache)
1568                 goto out2;
1569
1570         info->response_mempool =
1571                 mempool_create(info->receive_credit_max, mempool_alloc_slab,
1572                        mempool_free_slab, info->response_cache);
1573         if (!info->response_mempool)
1574                 goto out3;
1575
1576         snprintf(name, MAX_NAME_LEN, "smbd_%p", info);
1577         info->workqueue = create_workqueue(name);
1578         if (!info->workqueue)
1579                 goto out4;
1580
1581         rc = allocate_receive_buffers(info, info->receive_credit_max);
1582         if (rc) {
1583                 log_rdma_event(ERR, "failed to allocate receive buffers\n");
1584                 goto out5;
1585         }
1586
1587         return 0;
1588
1589 out5:
1590         destroy_workqueue(info->workqueue);
1591 out4:
1592         mempool_destroy(info->response_mempool);
1593 out3:
1594         kmem_cache_destroy(info->response_cache);
1595 out2:
1596         mempool_destroy(info->request_mempool);
1597 out1:
1598         kmem_cache_destroy(info->request_cache);
1599         return -ENOMEM;
1600 }
1601
1602 /* Create a SMBD connection, called by upper layer */
1603 static struct smbd_connection *_smbd_get_connection(
1604         struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
1605 {
1606         int rc;
1607         struct smbd_connection *info;
1608         struct rdma_conn_param conn_param;
1609         struct ib_qp_init_attr qp_attr;
1610         struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
1611         struct ib_port_immutable port_immutable;
1612         u32 ird_ord_hdr[2];
1613
1614         info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
1615         if (!info)
1616                 return NULL;
1617
1618         info->transport_status = SMBD_CONNECTING;
1619         rc = smbd_ia_open(info, dstaddr, port);
1620         if (rc) {
1621                 log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
1622                 goto create_id_failed;
1623         }
1624
1625         if (smbd_send_credit_target > info->id->device->attrs.max_cqe ||
1626             smbd_send_credit_target > info->id->device->attrs.max_qp_wr) {
1627                 log_rdma_event(ERR,
1628                         "consider lowering send_credit_target = %d. "
1629                         "Possible CQE overrun, device "
1630                         "reporting max_cpe %d max_qp_wr %d\n",
1631                         smbd_send_credit_target,
1632                         info->id->device->attrs.max_cqe,
1633                         info->id->device->attrs.max_qp_wr);
1634                 goto config_failed;
1635         }
1636
1637         if (smbd_receive_credit_max > info->id->device->attrs.max_cqe ||
1638             smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) {
1639                 log_rdma_event(ERR,
1640                         "consider lowering receive_credit_max = %d. "
1641                         "Possible CQE overrun, device "
1642                         "reporting max_cpe %d max_qp_wr %d\n",
1643                         smbd_receive_credit_max,
1644                         info->id->device->attrs.max_cqe,
1645                         info->id->device->attrs.max_qp_wr);
1646                 goto config_failed;
1647         }
1648
1649         info->receive_credit_max = smbd_receive_credit_max;
1650         info->send_credit_target = smbd_send_credit_target;
1651         info->max_send_size = smbd_max_send_size;
1652         info->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
1653         info->max_receive_size = smbd_max_receive_size;
1654         info->keep_alive_interval = smbd_keep_alive_interval;
1655
1656         if (info->id->device->attrs.max_sge < SMBDIRECT_MAX_SGE) {
1657                 log_rdma_event(ERR, "warning: device max_sge = %d too small\n",
1658                         info->id->device->attrs.max_sge);
1659                 log_rdma_event(ERR, "Queue Pair creation may fail\n");
1660         }
1661
1662         info->send_cq = NULL;
1663         info->recv_cq = NULL;
1664         info->send_cq = ib_alloc_cq(info->id->device, info,
1665                         info->send_credit_target, 0, IB_POLL_SOFTIRQ);
1666         if (IS_ERR(info->send_cq)) {
1667                 info->send_cq = NULL;
1668                 goto alloc_cq_failed;
1669         }
1670
1671         info->recv_cq = ib_alloc_cq(info->id->device, info,
1672                         info->receive_credit_max, 0, IB_POLL_SOFTIRQ);
1673         if (IS_ERR(info->recv_cq)) {
1674                 info->recv_cq = NULL;
1675                 goto alloc_cq_failed;
1676         }
1677
1678         memset(&qp_attr, 0, sizeof(qp_attr));
1679         qp_attr.event_handler = smbd_qp_async_error_upcall;
1680         qp_attr.qp_context = info;
1681         qp_attr.cap.max_send_wr = info->send_credit_target;
1682         qp_attr.cap.max_recv_wr = info->receive_credit_max;
1683         qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE;
1684         qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE;
1685         qp_attr.cap.max_inline_data = 0;
1686         qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
1687         qp_attr.qp_type = IB_QPT_RC;
1688         qp_attr.send_cq = info->send_cq;
1689         qp_attr.recv_cq = info->recv_cq;
1690         qp_attr.port_num = ~0;
1691
1692         rc = rdma_create_qp(info->id, info->pd, &qp_attr);
1693         if (rc) {
1694                 log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
1695                 goto create_qp_failed;
1696         }
1697
1698         memset(&conn_param, 0, sizeof(conn_param));
1699         conn_param.initiator_depth = 0;
1700
1701         conn_param.responder_resources =
1702                 info->id->device->attrs.max_qp_rd_atom
1703                         < SMBD_CM_RESPONDER_RESOURCES ?
1704                 info->id->device->attrs.max_qp_rd_atom :
1705                 SMBD_CM_RESPONDER_RESOURCES;
1706         info->responder_resources = conn_param.responder_resources;
1707         log_rdma_mr(INFO, "responder_resources=%d\n",
1708                 info->responder_resources);
1709
1710         /* Need to send IRD/ORD in private data for iWARP */
1711         info->id->device->get_port_immutable(
1712                 info->id->device, info->id->port_num, &port_immutable);
1713         if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
1714                 ird_ord_hdr[0] = info->responder_resources;
1715                 ird_ord_hdr[1] = 1;
1716                 conn_param.private_data = ird_ord_hdr;
1717                 conn_param.private_data_len = sizeof(ird_ord_hdr);
1718         } else {
1719                 conn_param.private_data = NULL;
1720                 conn_param.private_data_len = 0;
1721         }
1722
1723         conn_param.retry_count = SMBD_CM_RETRY;
1724         conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
1725         conn_param.flow_control = 0;
1726         init_waitqueue_head(&info->wait_destroy);
1727
1728         log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
1729                 &addr_in->sin_addr, port);
1730
1731         init_waitqueue_head(&info->conn_wait);
1732         rc = rdma_connect(info->id, &conn_param);
1733         if (rc) {
1734                 log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
1735                 goto rdma_connect_failed;
1736         }
1737
1738         wait_event_interruptible(
1739                 info->conn_wait, info->transport_status != SMBD_CONNECTING);
1740
1741         if (info->transport_status != SMBD_CONNECTED) {
1742                 log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
1743                 goto rdma_connect_failed;
1744         }
1745
1746         log_rdma_event(INFO, "rdma_connect connected\n");
1747
1748         rc = allocate_caches_and_workqueue(info);
1749         if (rc) {
1750                 log_rdma_event(ERR, "cache allocation failed\n");
1751                 goto allocate_cache_failed;
1752         }
1753
1754         init_waitqueue_head(&info->wait_send_queue);
1755         init_waitqueue_head(&info->wait_reassembly_queue);
1756
1757         INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
1758         INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work);
1759         queue_delayed_work(info->workqueue, &info->idle_timer_work,
1760                 info->keep_alive_interval*HZ);
1761
1762         init_waitqueue_head(&info->wait_smbd_send_pending);
1763         info->smbd_send_pending = 0;
1764
1765         init_waitqueue_head(&info->wait_smbd_recv_pending);
1766         info->smbd_recv_pending = 0;
1767
1768         init_waitqueue_head(&info->wait_send_pending);
1769         atomic_set(&info->send_pending, 0);
1770
1771         init_waitqueue_head(&info->wait_send_payload_pending);
1772         atomic_set(&info->send_payload_pending, 0);
1773
1774         INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
1775         INIT_WORK(&info->destroy_work, smbd_destroy_rdma_work);
1776         INIT_WORK(&info->recv_done_work, smbd_recv_done_work);
1777         INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
1778         info->new_credits_offered = 0;
1779         spin_lock_init(&info->lock_new_credits_offered);
1780
1781         rc = smbd_negotiate(info);
1782         if (rc) {
1783                 log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
1784                 goto negotiation_failed;
1785         }
1786
1787         rc = allocate_mr_list(info);
1788         if (rc) {
1789                 log_rdma_mr(ERR, "memory registration allocation failed\n");
1790                 goto allocate_mr_failed;
1791         }
1792
1793         return info;
1794
1795 allocate_mr_failed:
1796         /* At this point, need to a full transport shutdown */
1797         smbd_destroy(info);
1798         return NULL;
1799
1800 negotiation_failed:
1801         cancel_delayed_work_sync(&info->idle_timer_work);
1802         destroy_caches_and_workqueue(info);
1803         info->transport_status = SMBD_NEGOTIATE_FAILED;
1804         init_waitqueue_head(&info->conn_wait);
1805         rdma_disconnect(info->id);
1806         wait_event(info->conn_wait,
1807                 info->transport_status == SMBD_DISCONNECTED);
1808
1809 allocate_cache_failed:
1810 rdma_connect_failed:
1811         rdma_destroy_qp(info->id);
1812
1813 create_qp_failed:
1814 alloc_cq_failed:
1815         if (info->send_cq)
1816                 ib_free_cq(info->send_cq);
1817         if (info->recv_cq)
1818                 ib_free_cq(info->recv_cq);
1819
1820 config_failed:
1821         ib_dealloc_pd(info->pd);
1822         rdma_destroy_id(info->id);
1823
1824 create_id_failed:
1825         kfree(info);
1826         return NULL;
1827 }
1828
1829 struct smbd_connection *smbd_get_connection(
1830         struct TCP_Server_Info *server, struct sockaddr *dstaddr)
1831 {
1832         struct smbd_connection *ret;
1833         int port = SMBD_PORT;
1834
1835 try_again:
1836         ret = _smbd_get_connection(server, dstaddr, port);
1837
1838         /* Try SMB_PORT if SMBD_PORT doesn't work */
1839         if (!ret && port == SMBD_PORT) {
1840                 port = SMB_PORT;
1841                 goto try_again;
1842         }
1843         return ret;
1844 }
1845
1846 /*
1847  * Receive data from receive reassembly queue
1848  * All the incoming data packets are placed in reassembly queue
1849  * buf: the buffer to read data into
1850  * size: the length of data to read
1851  * return value: actual data read
1852  * Note: this implementation copies the data from reassebmly queue to receive
1853  * buffers used by upper layer. This is not the optimal code path. A better way
1854  * to do it is to not have upper layer allocate its receive buffers but rather
1855  * borrow the buffer from reassembly queue, and return it after data is
1856  * consumed. But this will require more changes to upper layer code, and also
1857  * need to consider packet boundaries while they still being reassembled.
1858  */
1859 static int smbd_recv_buf(struct smbd_connection *info, char *buf,
1860                 unsigned int size)
1861 {
1862         struct smbd_response *response;
1863         struct smbd_data_transfer *data_transfer;
1864         int to_copy, to_read, data_read, offset;
1865         u32 data_length, remaining_data_length, data_offset;
1866         int rc;
1867
1868 again:
1869         if (info->transport_status != SMBD_CONNECTED) {
1870                 log_read(ERR, "disconnected\n");
1871                 return -ENODEV;
1872         }
1873
1874         /*
1875          * No need to hold the reassembly queue lock all the time as we are
1876          * the only one reading from the front of the queue. The transport
1877          * may add more entries to the back of the queue at the same time
1878          */
1879         log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size,
1880                 info->reassembly_data_length);
1881         if (info->reassembly_data_length >= size) {
1882                 int queue_length;
1883                 int queue_removed = 0;
1884
1885                 /*
1886                  * Need to make sure reassembly_data_length is read before
1887                  * reading reassembly_queue_length and calling
1888                  * _get_first_reassembly. This call is lock free
1889                  * as we never read at the end of the queue which are being
1890                  * updated in SOFTIRQ as more data is received
1891                  */
1892                 virt_rmb();
1893                 queue_length = info->reassembly_queue_length;
1894                 data_read = 0;
1895                 to_read = size;
1896                 offset = info->first_entry_offset;
1897                 while (data_read < size) {
1898                         response = _get_first_reassembly(info);
1899                         data_transfer = smbd_response_payload(response);
1900                         data_length = le32_to_cpu(data_transfer->data_length);
1901                         remaining_data_length =
1902                                 le32_to_cpu(
1903                                         data_transfer->remaining_data_length);
1904                         data_offset = le32_to_cpu(data_transfer->data_offset);
1905
1906                         /*
1907                          * The upper layer expects RFC1002 length at the
1908                          * beginning of the payload. Return it to indicate
1909                          * the total length of the packet. This minimize the
1910                          * change to upper layer packet processing logic. This
1911                          * will be eventually remove when an intermediate
1912                          * transport layer is added
1913                          */
1914                         if (response->first_segment && size == 4) {
1915                                 unsigned int rfc1002_len =
1916                                         data_length + remaining_data_length;
1917                                 *((__be32 *)buf) = cpu_to_be32(rfc1002_len);
1918                                 data_read = 4;
1919                                 response->first_segment = false;
1920                                 log_read(INFO, "returning rfc1002 length %d\n",
1921                                         rfc1002_len);
1922                                 goto read_rfc1002_done;
1923                         }
1924
1925                         to_copy = min_t(int, data_length - offset, to_read);
1926                         memcpy(
1927                                 buf + data_read,
1928                                 (char *)data_transfer + data_offset + offset,
1929                                 to_copy);
1930
1931                         /* move on to the next buffer? */
1932                         if (to_copy == data_length - offset) {
1933                                 queue_length--;
1934                                 /*
1935                                  * No need to lock if we are not at the
1936                                  * end of the queue
1937                                  */
1938                                 if (queue_length)
1939                                         list_del(&response->list);
1940                                 else {
1941                                         spin_lock_irq(
1942                                                 &info->reassembly_queue_lock);
1943                                         list_del(&response->list);
1944                                         spin_unlock_irq(
1945                                                 &info->reassembly_queue_lock);
1946                                 }
1947                                 queue_removed++;
1948                                 info->count_reassembly_queue--;
1949                                 info->count_dequeue_reassembly_queue++;
1950                                 put_receive_buffer(info, response);
1951                                 offset = 0;
1952                                 log_read(INFO, "put_receive_buffer offset=0\n");
1953                         } else
1954                                 offset += to_copy;
1955
1956                         to_read -= to_copy;
1957                         data_read += to_copy;
1958
1959                         log_read(INFO, "_get_first_reassembly memcpy %d bytes "
1960                                 "data_transfer_length-offset=%d after that "
1961                                 "to_read=%d data_read=%d offset=%d\n",
1962                                 to_copy, data_length - offset,
1963                                 to_read, data_read, offset);
1964                 }
1965
1966                 spin_lock_irq(&info->reassembly_queue_lock);
1967                 info->reassembly_data_length -= data_read;
1968                 info->reassembly_queue_length -= queue_removed;
1969                 spin_unlock_irq(&info->reassembly_queue_lock);
1970
1971                 info->first_entry_offset = offset;
1972                 log_read(INFO, "returning to thread data_read=%d "
1973                         "reassembly_data_length=%d first_entry_offset=%d\n",
1974                         data_read, info->reassembly_data_length,
1975                         info->first_entry_offset);
1976 read_rfc1002_done:
1977                 return data_read;
1978         }
1979
1980         log_read(INFO, "wait_event on more data\n");
1981         rc = wait_event_interruptible(
1982                 info->wait_reassembly_queue,
1983                 info->reassembly_data_length >= size ||
1984                         info->transport_status != SMBD_CONNECTED);
1985         /* Don't return any data if interrupted */
1986         if (rc)
1987                 return -ENODEV;
1988
1989         goto again;
1990 }
1991
1992 /*
1993  * Receive a page from receive reassembly queue
1994  * page: the page to read data into
1995  * to_read: the length of data to read
1996  * return value: actual data read
1997  */
1998 static int smbd_recv_page(struct smbd_connection *info,
1999                 struct page *page, unsigned int to_read)
2000 {
2001         int ret;
2002         char *to_address;
2003
2004         /* make sure we have the page ready for read */
2005         ret = wait_event_interruptible(
2006                 info->wait_reassembly_queue,
2007                 info->reassembly_data_length >= to_read ||
2008                         info->transport_status != SMBD_CONNECTED);
2009         if (ret)
2010                 return 0;
2011
2012         /* now we can read from reassembly queue and not sleep */
2013         to_address = kmap_atomic(page);
2014
2015         log_read(INFO, "reading from page=%p address=%p to_read=%d\n",
2016                 page, to_address, to_read);
2017
2018         ret = smbd_recv_buf(info, to_address, to_read);
2019         kunmap_atomic(to_address);
2020
2021         return ret;
2022 }
2023
2024 /*
2025  * Receive data from transport
2026  * msg: a msghdr point to the buffer, can be ITER_KVEC or ITER_BVEC
2027  * return: total bytes read, or 0. SMB Direct will not do partial read.
2028  */
2029 int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
2030 {
2031         char *buf;
2032         struct page *page;
2033         unsigned int to_read;
2034         int rc;
2035
2036         info->smbd_recv_pending++;
2037
2038         switch (msg->msg_iter.type) {
2039         case READ | ITER_KVEC:
2040                 buf = msg->msg_iter.kvec->iov_base;
2041                 to_read = msg->msg_iter.kvec->iov_len;
2042                 rc = smbd_recv_buf(info, buf, to_read);
2043                 break;
2044
2045         case READ | ITER_BVEC:
2046                 page = msg->msg_iter.bvec->bv_page;
2047                 to_read = msg->msg_iter.bvec->bv_len;
2048                 rc = smbd_recv_page(info, page, to_read);
2049                 break;
2050
2051         default:
2052                 /* It's a bug in upper layer to get there */
2053                 cifs_dbg(VFS, "CIFS: invalid msg type %d\n",
2054                         msg->msg_iter.type);
2055                 rc = -EIO;
2056         }
2057
2058         info->smbd_recv_pending--;
2059         wake_up(&info->wait_smbd_recv_pending);
2060
2061         /* SMBDirect will read it all or nothing */
2062         if (rc > 0)
2063                 msg->msg_iter.count = 0;
2064         return rc;
2065 }
2066
2067 /*
2068  * Send data to transport
2069  * Each rqst is transported as a SMBDirect payload
2070  * rqst: the data to write
2071  * return value: 0 if successfully write, otherwise error code
2072  */
2073 int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
2074 {
2075         struct kvec vec;
2076         int nvecs;
2077         int size;
2078         int buflen = 0, remaining_data_length;
2079         int start, i, j;
2080         int max_iov_size =
2081                 info->max_send_size - sizeof(struct smbd_data_transfer);
2082         struct kvec iov[SMBDIRECT_MAX_SGE];
2083         int rc;
2084
2085         info->smbd_send_pending++;
2086         if (info->transport_status != SMBD_CONNECTED) {
2087                 rc = -ENODEV;
2088                 goto done;
2089         }
2090
2091         /*
2092          * This usually means a configuration error
2093          * We use RDMA read/write for packet size > rdma_readwrite_threshold
2094          * as long as it's properly configured we should never get into this
2095          * situation
2096          */
2097         if (rqst->rq_nvec + rqst->rq_npages > SMBDIRECT_MAX_SGE) {
2098                 log_write(ERR, "maximum send segment %x exceeding %x\n",
2099                          rqst->rq_nvec + rqst->rq_npages, SMBDIRECT_MAX_SGE);
2100                 rc = -EINVAL;
2101                 goto done;
2102         }
2103
2104         /*
2105          * Remove the RFC1002 length defined in MS-SMB2 section 2.1
2106          * It is used only for TCP transport
2107          * In future we may want to add a transport layer under protocol
2108          * layer so this will only be issued to TCP transport
2109          */
2110         iov[0].iov_base = (char *)rqst->rq_iov[0].iov_base + 4;
2111         iov[0].iov_len = rqst->rq_iov[0].iov_len - 4;
2112         buflen += iov[0].iov_len;
2113
2114         /* total up iov array first */
2115         for (i = 1; i < rqst->rq_nvec; i++) {
2116                 iov[i].iov_base = rqst->rq_iov[i].iov_base;
2117                 iov[i].iov_len = rqst->rq_iov[i].iov_len;
2118                 buflen += iov[i].iov_len;
2119         }
2120
2121         /* add in the page array if there is one */
2122         if (rqst->rq_npages) {
2123                 buflen += rqst->rq_pagesz * (rqst->rq_npages - 1);
2124                 buflen += rqst->rq_tailsz;
2125         }
2126
2127         if (buflen + sizeof(struct smbd_data_transfer) >
2128                 info->max_fragmented_send_size) {
2129                 log_write(ERR, "payload size %d > max size %d\n",
2130                         buflen, info->max_fragmented_send_size);
2131                 rc = -EINVAL;
2132                 goto done;
2133         }
2134
2135         remaining_data_length = buflen;
2136
2137         log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d "
2138                 "rq_tailsz=%d buflen=%d\n",
2139                 rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
2140                 rqst->rq_tailsz, buflen);
2141
2142         start = i = iov[0].iov_len ? 0 : 1;
2143         buflen = 0;
2144         while (true) {
2145                 buflen += iov[i].iov_len;
2146                 if (buflen > max_iov_size) {
2147                         if (i > start) {
2148                                 remaining_data_length -=
2149                                         (buflen-iov[i].iov_len);
2150                                 log_write(INFO, "sending iov[] from start=%d "
2151                                         "i=%d nvecs=%d "
2152                                         "remaining_data_length=%d\n",
2153                                         start, i, i-start,
2154                                         remaining_data_length);
2155                                 rc = smbd_post_send_data(
2156                                         info, &iov[start], i-start,
2157                                         remaining_data_length);
2158                                 if (rc)
2159                                         goto done;
2160                         } else {
2161                                 /* iov[start] is too big, break it */
2162                                 nvecs = (buflen+max_iov_size-1)/max_iov_size;
2163                                 log_write(INFO, "iov[%d] iov_base=%p buflen=%d"
2164                                         " break to %d vectors\n",
2165                                         start, iov[start].iov_base,
2166                                         buflen, nvecs);
2167                                 for (j = 0; j < nvecs; j++) {
2168                                         vec.iov_base =
2169                                                 (char *)iov[start].iov_base +
2170                                                 j*max_iov_size;
2171                                         vec.iov_len = max_iov_size;
2172                                         if (j == nvecs-1)
2173                                                 vec.iov_len =
2174                                                         buflen -
2175                                                         max_iov_size*(nvecs-1);
2176                                         remaining_data_length -= vec.iov_len;
2177                                         log_write(INFO,
2178                                                 "sending vec j=%d iov_base=%p"
2179                                                 " iov_len=%zu "
2180                                                 "remaining_data_length=%d\n",
2181                                                 j, vec.iov_base, vec.iov_len,
2182                                                 remaining_data_length);
2183                                         rc = smbd_post_send_data(
2184                                                 info, &vec, 1,
2185                                                 remaining_data_length);
2186                                         if (rc)
2187                                                 goto done;
2188                                 }
2189                                 i++;
2190                         }
2191                         start = i;
2192                         buflen = 0;
2193                 } else {
2194                         i++;
2195                         if (i == rqst->rq_nvec) {
2196                                 /* send out all remaining vecs */
2197                                 remaining_data_length -= buflen;
2198                                 log_write(INFO,
2199                                         "sending iov[] from start=%d i=%d "
2200                                         "nvecs=%d remaining_data_length=%d\n",
2201                                         start, i, i-start,
2202                                         remaining_data_length);
2203                                 rc = smbd_post_send_data(info, &iov[start],
2204                                         i-start, remaining_data_length);
2205                                 if (rc)
2206                                         goto done;
2207                                 break;
2208                         }
2209                 }
2210                 log_write(INFO, "looping i=%d buflen=%d\n", i, buflen);
2211         }
2212
2213         /* now sending pages if there are any */
2214         for (i = 0; i < rqst->rq_npages; i++) {
2215                 buflen = (i == rqst->rq_npages-1) ?
2216                         rqst->rq_tailsz : rqst->rq_pagesz;
2217                 nvecs = (buflen + max_iov_size - 1) / max_iov_size;
2218                 log_write(INFO, "sending pages buflen=%d nvecs=%d\n",
2219                         buflen, nvecs);
2220                 for (j = 0; j < nvecs; j++) {
2221                         size = max_iov_size;
2222                         if (j == nvecs-1)
2223                                 size = buflen - j*max_iov_size;
2224                         remaining_data_length -= size;
2225                         log_write(INFO, "sending pages i=%d offset=%d size=%d"
2226                                 " remaining_data_length=%d\n",
2227                                 i, j*max_iov_size, size, remaining_data_length);
2228                         rc = smbd_post_send_page(
2229                                 info, rqst->rq_pages[i], j*max_iov_size,
2230                                 size, remaining_data_length);
2231                         if (rc)
2232                                 goto done;
2233                 }
2234         }
2235
2236 done:
2237         /*
2238          * As an optimization, we don't wait for individual I/O to finish
2239          * before sending the next one.
2240          * Send them all and wait for pending send count to get to 0
2241          * that means all the I/Os have been out and we are good to return
2242          */
2243
2244         wait_event(info->wait_send_payload_pending,
2245                 atomic_read(&info->send_payload_pending) == 0);
2246
2247         info->smbd_send_pending--;
2248         wake_up(&info->wait_smbd_send_pending);
2249
2250         return rc;
2251 }
2252
2253 static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
2254 {
2255         struct smbd_mr *mr;
2256         struct ib_cqe *cqe;
2257
2258         if (wc->status) {
2259                 log_rdma_mr(ERR, "status=%d\n", wc->status);
2260                 cqe = wc->wr_cqe;
2261                 mr = container_of(cqe, struct smbd_mr, cqe);
2262                 smbd_disconnect_rdma_connection(mr->conn);
2263         }
2264 }
2265
2266 /*
2267  * The work queue function that recovers MRs
2268  * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
2269  * again. Both calls are slow, so finish them in a workqueue. This will not
2270  * block I/O path.
2271  * There is one workqueue that recovers MRs, there is no need to lock as the
2272  * I/O requests calling smbd_register_mr will never update the links in the
2273  * mr_list.
2274  */
2275 static void smbd_mr_recovery_work(struct work_struct *work)
2276 {
2277         struct smbd_connection *info =
2278                 container_of(work, struct smbd_connection, mr_recovery_work);
2279         struct smbd_mr *smbdirect_mr;
2280         int rc;
2281
2282         list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
2283                 if (smbdirect_mr->state == MR_INVALIDATED ||
2284                         smbdirect_mr->state == MR_ERROR) {
2285
2286                         if (smbdirect_mr->state == MR_INVALIDATED) {
2287                                 ib_dma_unmap_sg(
2288                                         info->id->device, smbdirect_mr->sgl,
2289                                         smbdirect_mr->sgl_count,
2290                                         smbdirect_mr->dir);
2291                                 smbdirect_mr->state = MR_READY;
2292                         } else if (smbdirect_mr->state == MR_ERROR) {
2293
2294                                 /* recover this MR entry */
2295                                 rc = ib_dereg_mr(smbdirect_mr->mr);
2296                                 if (rc) {
2297                                         log_rdma_mr(ERR,
2298                                                 "ib_dereg_mr faield rc=%x\n",
2299                                                 rc);
2300                                         smbd_disconnect_rdma_connection(info);
2301                                 }
2302
2303                                 smbdirect_mr->mr = ib_alloc_mr(
2304                                         info->pd, info->mr_type,
2305                                         info->max_frmr_depth);
2306                                 if (IS_ERR(smbdirect_mr->mr)) {
2307                                         log_rdma_mr(ERR,
2308                                                 "ib_alloc_mr failed mr_type=%x "
2309                                                 "max_frmr_depth=%x\n",
2310                                                 info->mr_type,
2311                                                 info->max_frmr_depth);
2312                                         smbd_disconnect_rdma_connection(info);
2313                                 }
2314
2315                                 smbdirect_mr->state = MR_READY;
2316                         }
2317                         /* smbdirect_mr->state is updated by this function
2318                          * and is read and updated by I/O issuing CPUs trying
2319                          * to get a MR, the call to atomic_inc_return
2320                          * implicates a memory barrier and guarantees this
2321                          * value is updated before waking up any calls to
2322                          * get_mr() from the I/O issuing CPUs
2323                          */
2324                         if (atomic_inc_return(&info->mr_ready_count) == 1)
2325                                 wake_up_interruptible(&info->wait_mr);
2326                 }
2327         }
2328 }
2329
2330 static void destroy_mr_list(struct smbd_connection *info)
2331 {
2332         struct smbd_mr *mr, *tmp;
2333
2334         cancel_work_sync(&info->mr_recovery_work);
2335         list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
2336                 if (mr->state == MR_INVALIDATED)
2337                         ib_dma_unmap_sg(info->id->device, mr->sgl,
2338                                 mr->sgl_count, mr->dir);
2339                 ib_dereg_mr(mr->mr);
2340                 kfree(mr->sgl);
2341                 kfree(mr);
2342         }
2343 }
2344
2345 /*
2346  * Allocate MRs used for RDMA read/write
2347  * The number of MRs will not exceed hardware capability in responder_resources
2348  * All MRs are kept in mr_list. The MR can be recovered after it's used
2349  * Recovery is done in smbd_mr_recovery_work. The content of list entry changes
2350  * as MRs are used and recovered for I/O, but the list links will not change
2351  */
2352 static int allocate_mr_list(struct smbd_connection *info)
2353 {
2354         int i;
2355         struct smbd_mr *smbdirect_mr, *tmp;
2356
2357         INIT_LIST_HEAD(&info->mr_list);
2358         init_waitqueue_head(&info->wait_mr);
2359         spin_lock_init(&info->mr_list_lock);
2360         atomic_set(&info->mr_ready_count, 0);
2361         atomic_set(&info->mr_used_count, 0);
2362         init_waitqueue_head(&info->wait_for_mr_cleanup);
2363         /* Allocate more MRs (2x) than hardware responder_resources */
2364         for (i = 0; i < info->responder_resources * 2; i++) {
2365                 smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
2366                 if (!smbdirect_mr)
2367                         goto out;
2368                 smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type,
2369                                         info->max_frmr_depth);
2370                 if (IS_ERR(smbdirect_mr->mr)) {
2371                         log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x "
2372                                 "max_frmr_depth=%x\n",
2373                                 info->mr_type, info->max_frmr_depth);
2374                         goto out;
2375                 }
2376                 smbdirect_mr->sgl = kcalloc(
2377                                         info->max_frmr_depth,
2378                                         sizeof(struct scatterlist),
2379                                         GFP_KERNEL);
2380                 if (!smbdirect_mr->sgl) {
2381                         log_rdma_mr(ERR, "failed to allocate sgl\n");
2382                         ib_dereg_mr(smbdirect_mr->mr);
2383                         goto out;
2384                 }
2385                 smbdirect_mr->state = MR_READY;
2386                 smbdirect_mr->conn = info;
2387
2388                 list_add_tail(&smbdirect_mr->list, &info->mr_list);
2389                 atomic_inc(&info->mr_ready_count);
2390         }
2391         INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
2392         return 0;
2393
2394 out:
2395         kfree(smbdirect_mr);
2396
2397         list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
2398                 ib_dereg_mr(smbdirect_mr->mr);
2399                 kfree(smbdirect_mr->sgl);
2400                 kfree(smbdirect_mr);
2401         }
2402         return -ENOMEM;
2403 }
2404
2405 /*
2406  * Get a MR from mr_list. This function waits until there is at least one
2407  * MR available in the list. It may access the list while the
2408  * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
2409  * as they never modify the same places. However, there may be several CPUs
2410  * issueing I/O trying to get MR at the same time, mr_list_lock is used to
2411  * protect this situation.
2412  */
2413 static struct smbd_mr *get_mr(struct smbd_connection *info)
2414 {
2415         struct smbd_mr *ret;
2416         int rc;
2417 again:
2418         rc = wait_event_interruptible(info->wait_mr,
2419                 atomic_read(&info->mr_ready_count) ||
2420                 info->transport_status != SMBD_CONNECTED);
2421         if (rc) {
2422                 log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
2423                 return NULL;
2424         }
2425
2426         if (info->transport_status != SMBD_CONNECTED) {
2427                 log_rdma_mr(ERR, "info->transport_status=%x\n",
2428                         info->transport_status);
2429                 return NULL;
2430         }
2431
2432         spin_lock(&info->mr_list_lock);
2433         list_for_each_entry(ret, &info->mr_list, list) {
2434                 if (ret->state == MR_READY) {
2435                         ret->state = MR_REGISTERED;
2436                         spin_unlock(&info->mr_list_lock);
2437                         atomic_dec(&info->mr_ready_count);
2438                         atomic_inc(&info->mr_used_count);
2439                         return ret;
2440                 }
2441         }
2442
2443         spin_unlock(&info->mr_list_lock);
2444         /*
2445          * It is possible that we could fail to get MR because other processes may
2446          * try to acquire a MR at the same time. If this is the case, retry it.
2447          */
2448         goto again;
2449 }
2450
2451 /*
2452  * Register memory for RDMA read/write
2453  * pages[]: the list of pages to register memory with
2454  * num_pages: the number of pages to register
2455  * tailsz: if non-zero, the bytes to register in the last page
2456  * writing: true if this is a RDMA write (SMB read), false for RDMA read
2457  * need_invalidate: true if this MR needs to be locally invalidated after I/O
2458  * return value: the MR registered, NULL if failed.
2459  */
2460 struct smbd_mr *smbd_register_mr(
2461         struct smbd_connection *info, struct page *pages[], int num_pages,
2462         int tailsz, bool writing, bool need_invalidate)
2463 {
2464         struct smbd_mr *smbdirect_mr;
2465         int rc, i;
2466         enum dma_data_direction dir;
2467         struct ib_reg_wr *reg_wr;
2468         struct ib_send_wr *bad_wr;
2469
2470         if (num_pages > info->max_frmr_depth) {
2471                 log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
2472                         num_pages, info->max_frmr_depth);
2473                 return NULL;
2474         }
2475
2476         smbdirect_mr = get_mr(info);
2477         if (!smbdirect_mr) {
2478                 log_rdma_mr(ERR, "get_mr returning NULL\n");
2479                 return NULL;
2480         }
2481         smbdirect_mr->need_invalidate = need_invalidate;
2482         smbdirect_mr->sgl_count = num_pages;
2483         sg_init_table(smbdirect_mr->sgl, num_pages);
2484
2485         for (i = 0; i < num_pages - 1; i++)
2486                 sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0);
2487
2488         sg_set_page(&smbdirect_mr->sgl[i], pages[i],
2489                 tailsz ? tailsz : PAGE_SIZE, 0);
2490
2491         dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
2492         smbdirect_mr->dir = dir;
2493         rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir);
2494         if (!rc) {
2495                 log_rdma_mr(INFO, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
2496                         num_pages, dir, rc);
2497                 goto dma_map_error;
2498         }
2499
2500         rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages,
2501                 NULL, PAGE_SIZE);
2502         if (rc != num_pages) {
2503                 log_rdma_mr(INFO,
2504                         "ib_map_mr_sg failed rc = %x num_pages = %x\n",
2505                         rc, num_pages);
2506                 goto map_mr_error;
2507         }
2508
2509         ib_update_fast_reg_key(smbdirect_mr->mr,
2510                 ib_inc_rkey(smbdirect_mr->mr->rkey));
2511         reg_wr = &smbdirect_mr->wr;
2512         reg_wr->wr.opcode = IB_WR_REG_MR;
2513         smbdirect_mr->cqe.done = register_mr_done;
2514         reg_wr->wr.wr_cqe = &smbdirect_mr->cqe;
2515         reg_wr->wr.num_sge = 0;
2516         reg_wr->wr.send_flags = IB_SEND_SIGNALED;
2517         reg_wr->mr = smbdirect_mr->mr;
2518         reg_wr->key = smbdirect_mr->mr->rkey;
2519         reg_wr->access = writing ?
2520                         IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
2521                         IB_ACCESS_REMOTE_READ;
2522
2523         /*
2524          * There is no need for waiting for complemtion on ib_post_send
2525          * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
2526          * on the next ib_post_send when we actaully send I/O to remote peer
2527          */
2528         rc = ib_post_send(info->id->qp, &reg_wr->wr, &bad_wr);
2529         if (!rc)
2530                 return smbdirect_mr;
2531
2532         log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
2533                 rc, reg_wr->key);
2534
2535         /* If all failed, attempt to recover this MR by setting it MR_ERROR*/
2536 map_mr_error:
2537         ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl,
2538                 smbdirect_mr->sgl_count, smbdirect_mr->dir);
2539
2540 dma_map_error:
2541         smbdirect_mr->state = MR_ERROR;
2542         if (atomic_dec_and_test(&info->mr_used_count))
2543                 wake_up(&info->wait_for_mr_cleanup);
2544
2545         return NULL;
2546 }
2547
2548 static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
2549 {
2550         struct smbd_mr *smbdirect_mr;
2551         struct ib_cqe *cqe;
2552
2553         cqe = wc->wr_cqe;
2554         smbdirect_mr = container_of(cqe, struct smbd_mr, cqe);
2555         smbdirect_mr->state = MR_INVALIDATED;
2556         if (wc->status != IB_WC_SUCCESS) {
2557                 log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
2558                 smbdirect_mr->state = MR_ERROR;
2559         }
2560         complete(&smbdirect_mr->invalidate_done);
2561 }
2562
2563 /*
2564  * Deregister a MR after I/O is done
2565  * This function may wait if remote invalidation is not used
2566  * and we have to locally invalidate the buffer to prevent data is being
2567  * modified by remote peer after upper layer consumes it
2568  */
2569 int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
2570 {
2571         struct ib_send_wr *wr, *bad_wr;
2572         struct smbd_connection *info = smbdirect_mr->conn;
2573         int rc = 0;
2574
2575         if (smbdirect_mr->need_invalidate) {
2576                 /* Need to finish local invalidation before returning */
2577                 wr = &smbdirect_mr->inv_wr;
2578                 wr->opcode = IB_WR_LOCAL_INV;
2579                 smbdirect_mr->cqe.done = local_inv_done;
2580                 wr->wr_cqe = &smbdirect_mr->cqe;
2581                 wr->num_sge = 0;
2582                 wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey;
2583                 wr->send_flags = IB_SEND_SIGNALED;
2584
2585                 init_completion(&smbdirect_mr->invalidate_done);
2586                 rc = ib_post_send(info->id->qp, wr, &bad_wr);
2587                 if (rc) {
2588                         log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
2589                         smbd_disconnect_rdma_connection(info);
2590                         goto done;
2591                 }
2592                 wait_for_completion(&smbdirect_mr->invalidate_done);
2593                 smbdirect_mr->need_invalidate = false;
2594         } else
2595                 /*
2596                  * For remote invalidation, just set it to MR_INVALIDATED
2597                  * and defer to mr_recovery_work to recover the MR for next use
2598                  */
2599                 smbdirect_mr->state = MR_INVALIDATED;
2600
2601         /*
2602          * Schedule the work to do MR recovery for future I/Os
2603          * MR recovery is slow and we don't want it to block the current I/O
2604          */
2605         queue_work(info->workqueue, &info->mr_recovery_work);
2606
2607 done:
2608         if (atomic_dec_and_test(&info->mr_used_count))
2609                 wake_up(&info->wait_for_mr_cleanup);
2610
2611         return rc;
2612 }