4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
26 * Copyright (c) 2007, The Ohio State University. All rights reserved.
28 * Portions of this source code is developed by the team members of
29 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
30 * headed by Professor Dhabaleswar K. (DK) Panda.
 * Acknowledgements to contributions from developers:
33 * Ranjit Noronha: noronha@cse.ohio-state.edu
34 * Lei Chai : chail@cse.ohio-state.edu
35 * Weikuan Yu : yuw@cse.ohio-state.edu
39 #ifndef _RPC_RPC_RDMA_H
40 #define _RPC_RPC_RDMA_H
43 #include <rpc/rpc_sztypes.h>
44 #include <sys/sunddi.h>
45 #include <sys/sunldi.h>
/*
 * Version numbers for the RPC over RDMA protocol and for the RDMATF
 * interface that RPC uses to reach RDMA.
 */
#define	RPCRDMA_VERS	1	/* RPC over RDMA protocol version */
#define	RDMATF_VERS	1	/* version of the API used by RPC for RDMA */
#define	RDMATF_VERS_1	1	/* current RDMATF version */
/*
 * The size of an RPC call or reply message.
 */
#define	RPC_MSG_SZ	1024

/*
 * NOTE(review): the comment describing RDMA_MINCHUNK is not visible in
 * this view; presumably the minimum chunk size -- confirm.
 */
#define	RDMA_MINCHUNK	1024

/*
 * Storage for a chunk list.
 */
#define	RPC_CL_SZ	1024

/*
 * Size of receive buffer.
 */
#define	RPC_BUF_SIZE	2048
/*
 * Completion-wait selectors.
 */
#define	NOWAIT	0	/* don't wait for the operation to complete */
#define	WAIT	1	/* wait and ensure that the operation is complete */
/*
 * RDMA xdr buffer control and other control flags.  Add new flags here,
 * and set them in the private structure for xdr over RDMA in xdr_rdma.c.
 * Each flag occupies its own bit.
 */
#define	XDR_RDMA_CHUNK		0x1
#define	XDR_RDMA_WLIST_REG	0x2
#define	XDR_RDMA_RLIST_REG	0x4
/*
 * Length constants for long replies and for the write/read chunk list
 * buffers.
 */
#define	LONG_REPLY_LEN	65536
#define	WCL_BUF_LEN	32768
#define	RCL_BUF_LEN	32768
/*
 * Buffer counts exchanged between client and server.
 */
#define	RDMA_BUFS_RQST	34	/* number of buffers requested by client */
#define	RDMA_BUFS_GRANT	32	/* number of buffers granted by server */
/*
 * Takes no arguments and returns a pointer to a struct xdr_ops.
 */
extern struct xdr_ops *xdrrdma_xops(void);
102 * Credit Control Structures.
104 typedef enum rdma_cc_type
{
105 RDMA_CC_CLNT
, /* CONN is for a client */
106 RDMA_CC_SRV
/* CONN is for a server */
110 * Client side credit control data structure.
112 typedef struct rdma_clnt_cred_ctrl
{
113 uint32_t clnt_cc_granted_ops
;
114 uint32_t clnt_cc_in_flight_ops
;
115 kcondvar_t clnt_cc_cv
;
116 } rdma_clnt_cred_ctrl_t
;
/*
 * Server side credit control data structure.
 */
typedef struct rdma_srv_cred_ctrl {
	uint32_t	srv_cc_buffers_granted;
	uint32_t	srv_cc_cur_buffers_used;
	uint32_t	srv_cc_posted;
	uint32_t	srv_cc_max_buf_size;	/* to be determined by CCP */
	uint32_t	srv_cc_cur_buf_size;	/* to be determined by CCP */
} rdma_srv_cred_ctrl_t;
136 CLIST_REG_SOURCE
= 1,
141 * Return codes from RDMA operations
145 RDMA_SUCCESS
= 0, /* successful operation */
147 RDMA_INVAL
= 1, /* invalid parameter */
148 RDMA_TIMEDOUT
= 2, /* operation timed out */
149 RDMA_INTR
= 3, /* operation interrupted */
150 RDMA_NORESOURCE
= 4, /* insufficient resource */
154 RDMA_REJECT
= 5, /* connection req rejected */
155 RDMA_NOLISTENER
= 6, /* no listener on server */
156 RDMA_UNREACHABLE
= 7, /* host unreachable */
157 RDMA_CONNLOST
= 8, /* connection lost */
159 RDMA_XPRTFAILED
= 9, /* RDMA transport failed */
160 RDMA_PROTECTERR
= 10, /* memory protection error */
161 RDMA_OVERRUN
= 11, /* transport overrun */
162 RDMA_RECVQEMPTY
= 12, /* incoming pkt dropped, recv q empty */
163 RDMA_PROTFAILED
= 13, /* RDMA protocol failed */
164 RDMA_NOTSUPP
= 14, /* requested feature not supported */
165 RDMA_REMOTERR
= 15, /* error at remote end */
169 RDMA_BADVERS
= 16, /* mismatch RDMATF versions */
170 RDMA_REG_EXIST
= 17, /* RDMATF registration already exists */
171 RDMA_HCA_ATTACH
= 18,
172 RDMA_HCA_DETACH
= 19,
177 RDMA_FAILED
= 20 /* generic error */
181 * Memory region context. This is an RDMA provider generated
182 * handle for a registered arbitrary size contiguous virtual
183 * memory. The RDMA Interface Adapter needs this for local or
184 * remote memory access.
186 * The mrc_rmr field holds the remote memory region context
187 * which is sent over-the-wire to provide the remote host
188 * with RDMA access to the memory region.
191 uint32_t mrc_rmr
; /* Remote MR context, sent OTW */
194 uint32_t lmr
; /* Local MR context */
195 uint64_t linfo
; /* Local memory info */
/*
 * Shorthand member names: each expands to the nested lhdl.mr member of
 * the same suffix, so "x.mrc_lmr" reads "x.lhdl.mr.lmr".
 */
#define	mrc_lmr		lhdl.mr.lmr
#define	mrc_linfo	lhdl.mr.linfo
204 * Memory management for the RDMA buffers
210 SEND_BUFFER
, /* buf for send msg */
211 SEND_DESCRIPTOR
, /* buf used for send msg descriptor in plugins only */
212 RECV_BUFFER
, /* buf for recv msg */
213 RECV_DESCRIPTOR
, /* buf used for recv msg descriptor in plugins only */
214 RDMA_LONG_BUFFER
/* chunk buf used in RDMATF only and not in plugins */
218 * RDMA buffer information
220 typedef struct rdma_buf
{
221 rdma_btype type
; /* buffer type */
222 uint_t len
; /* length of buffer */
223 caddr_t addr
; /* buffer address */
224 struct mrc handle
; /* buffer registration handle */
230 * The XDR offset value is used by the XDR
231 * routine to identify the position in the
232 * RPC message where the opaque object would
233 * normally occur. Neither the data content
234 * of the chunk, nor its size field are included
235 * in the RPC message. The XDR offset is calculated
236 * as if the chunks were present.
238 * The remaining fields identify the chunk of data
239 * on the sender. The c_memhandle identifies a
240 * registered RDMA memory region and the c_addr
241 * and c_len fields identify the chunk within it.
244 uint32 c_xdroff
; /* XDR offset */
245 uint32 c_len
; /* Length */
246 clist_dstsrc c_regtype
; /* type of registration */
247 struct mrc c_smemhandle
; /* src memory handle */
248 uint64 c_ssynchandle
; /* src sync handle */
250 uint64 c_saddr
; /* src address */
253 struct mrc c_dmemhandle
; /* dst memory handle */
254 uint64 c_dsynchandle
; /* dst sync handle */
256 uint64 c_daddr
; /* dst address */
259 struct as
*c_adspc
; /* address space for saddr/daddr */
260 rdma_buf_t rb_longbuf
; /* used for long requests/replies */
261 struct clist
*c_next
; /* Next chunk */
/* Lets struct clist be referred to by the bare type name "clist". */
typedef struct clist clist;
/*
 * Maximum wlist transfer size: 4M.
 * This is defined here because the rfs3_tsize service requires an
 * svc_req struct, which is not available in krecv.
 */
#define	MAX_SVC_XFER_SIZE	(4 * 1024 * 1024)
274 RDMA_MSG
= 0, /* chunk list and RPC msg follow */
275 RDMA_NOMSG
= 1, /* only chunk list follows */
276 RDMA_MSGP
= 2, /* chunk list and RPC msg with padding follow */
277 RDMA_DONE
= 3 /* signal completion of chunk transfer */
281 * Listener information for a service
283 struct rdma_svc_data
{
284 queue_t q
; /* queue_t to place incoming pkts */
285 int active
; /* If active, after registration startup */
286 rdma_stat err_code
; /* Error code from plugin layer */
287 int32_t svcid
; /* RDMA based service identifier */
291 * Per RDMA plugin module information.
292 * Will be populated by each plugin
293 * module during its initialization.
295 typedef struct rdma_mod
{
296 char *rdma_api
; /* "kvipl", "ibtf", etc */
297 uint_t rdma_version
; /* RDMATF API version */
298 int rdma_count
; /* # of devices */
299 struct rdmaops
*rdma_ops
; /* rdma op vector for api */
303 * Registry of RDMA plugins
305 typedef struct rdma_registry
{
306 rdma_mod_t
*r_mod
; /* plugin mod info */
307 uint32_t r_mod_state
;
308 struct rdma_registry
*r_next
; /* next registered RDMA plugin */
/*
 * RDMA module state flags (r_mod_state).
 */
#define	RDMA_MOD_ACTIVE		1
#define	RDMA_MOD_INACTIVE	0
318 * RDMA transport information
320 typedef struct rdma_info
{
321 uint_t addrlen
; /* address length */
322 uint_t mts
; /* max transfer size */
323 uint_t mtu
; /* native mtu size of underlying network */
328 C_CONN_PEND
= 0x00000002,
329 C_CONNECTED
= 0x00000004,
330 C_ERROR_CONN
= 0x00000008,
331 C_DISCONN_PEND
= 0x00000010,
332 C_REMOTE_DOWN
= 0x00000020
/*
 * Close-handling flag bits.
 */
#define	C_CLOSE_NOTNEEDED	0x00000001	/* just free the channel */
#define	C_CLOSE_PENDING		0x00000002	/* a close in progress */
340 * RDMA Connection information
342 typedef struct conn
{
343 rdma_mod_t
*c_rdmamod
; /* RDMA transport info for conn */
344 char *c_netid
; /* tcp or tcp6 token */
345 struct netbuf c_raddr
; /* remote address */
346 struct netbuf c_laddr
; /* local address */
347 struct netbuf c_addrmask
; /* Address Mask */
348 int c_ref
; /* no. of clients of connection */
349 struct conn
*c_next
; /* next in list of connections */
350 struct conn
*c_prev
; /* prev in list of connections */
351 caddr_t c_private
; /* transport specific stuff */
352 conn_c_state c_state
; /* state of connection */
353 int c_flags
; /* flags for connection management */
354 rdma_cc_type_t c_cc_type
; /* client or server, for credit cntrl */
356 rdma_clnt_cred_ctrl_t c_clnt_cc
;
357 rdma_srv_cred_ctrl_t c_srv_cc
;
358 } rdma_conn_cred_ctrl_u
;
359 kmutex_t c_lock
; /* protect c_state and c_ref fields */
360 kcondvar_t c_cv
; /* to signal when pending is done */
361 timeout_id_t c_timeout
; /* timeout id for untimeout() */
362 time_t c_last_used
; /* last time any activity on the conn */
367 * Data transferred from plugin interrupt to svc_queuereq()
369 typedef struct rdma_recv_data
{
375 /* structure used to pass information for READ over rdma write */
377 RCI_WRITE_UIO_CHUNK
= 1,
378 RCI_WRITE_ADDR_CHUNK
= 2,
385 struct uio
*rci_uiop
;
389 struct clist
**rci_clpp
; /* point to write chunk list in readargs */
395 } rdma_chunkinfo_lengths_t
;
398 struct clist
*rwci_wlist
;
400 } rdma_wlist_conn_info_t
;
403 * Operations vector for RDMA transports.
405 typedef struct rdmaops
{
407 rdma_stat (*rdma_reachable
)(int addr_type
, struct netbuf
*,
410 rdma_stat (*rdma_get_conn
)(struct netbuf
*, struct netbuf
*,
411 int addr_type
, void *, CONN
**);
412 rdma_stat (*rdma_rel_conn
)(CONN
*);
/* Server side listener start and stop routines */
414 void (*rdma_svc_listen
)(struct rdma_svc_data
*);
415 void (*rdma_svc_stop
)(struct rdma_svc_data
*);
417 rdma_stat (*rdma_regmem
)(CONN
*, caddr_t
, caddr_t
,
418 uint_t
, struct mrc
*);
419 rdma_stat (*rdma_deregmem
)(CONN
*, caddr_t
, struct mrc
);
420 rdma_stat (*rdma_regmemsync
)(CONN
*, caddr_t
, caddr_t
, uint_t
,
421 struct mrc
*, void **, void *);
422 rdma_stat (*rdma_deregmemsync
)(CONN
*, caddr_t
, struct mrc
,
424 rdma_stat (*rdma_syncmem
)(CONN
*, void *, caddr_t
, int, int);
426 rdma_stat (*rdma_buf_alloc
)(CONN
*, rdma_buf_t
*);
427 void (*rdma_buf_free
)(CONN
*, rdma_buf_t
*);
429 rdma_stat (*rdma_send
)(CONN
*, clist
*, uint32_t);
430 rdma_stat (*rdma_send_resp
)(CONN
*, clist
*, uint32_t);
431 rdma_stat (*rdma_clnt_recvbuf
)(CONN
*, clist
*, uint32_t);
432 rdma_stat (*rdma_clnt_recvbuf_remove
)(CONN
*, uint32_t);
433 rdma_stat (*rdma_svc_recvbuf
)(CONN
*, clist
*);
434 rdma_stat (*rdma_recv
)(CONN
*, clist
**, uint32_t);
436 rdma_stat (*rdma_read
)(CONN
*, clist
*, int);
437 rdma_stat (*rdma_write
)(CONN
*, clist
*, int);
439 rdma_stat (*rdma_getinfo
)(rdma_info_t
*info
);
442 typedef struct rdma_svc_wait
{
448 extern rdma_svc_wait_t rdma_wait
;
/*
 * Connection management wrappers.
 *
 * RDMA_REACHABLE and RDMA_GET_CONN call through an ops vector supplied by
 * the caller.  RDMA_REL_CONN finds the ops vector through the
 * connection's c_rdmamod and passes the connection as the only argument;
 * it therefore evaluates "conn" twice, so the argument must be free of
 * side effects.
 */
#define	RDMA_REACHABLE(rdma_ops, addr_type, addr, handle)	\
	((rdma_ops)->rdma_reachable((addr_type), (addr), (handle)))

#define	RDMA_GET_CONN(rdma_ops, saddr, daddr, addr_type, handle, conn)	\
	((rdma_ops)->rdma_get_conn((saddr), (daddr), (addr_type),	\
	    (handle), (conn)))

#define	RDMA_REL_CONN(conn)	\
	((conn)->c_rdmamod->rdma_ops->rdma_rel_conn((conn)))
462 #define RDMA_REGMEM(conn, adsp, buff, len, handle) \
463 (*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, adsp, \
/*
 * Memory deregistration, synchronized registration/deregistration and
 * memory sync wrappers.  Each one looks up the named entry in the ops
 * vector reached through the connection's c_rdmamod and forwards its
 * arguments unchanged, with the connection first.  "conn" is evaluated
 * twice, so the argument must be free of side effects.
 */
#define	RDMA_DEREGMEM(conn, buff, handle)	\
	((conn)->c_rdmamod->rdma_ops->rdma_deregmem((conn), (buff),	\
	    (handle)))

#define	RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle, lrc)	\
	((conn)->c_rdmamod->rdma_ops->rdma_regmemsync((conn), (adsp),	\
	    (buff), (len), (handle), (synchandle), (lrc)))

#define	RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle, lrc)	\
	((conn)->c_rdmamod->rdma_ops->rdma_deregmemsync((conn), (buff),	\
	    (handle), (synchandle), (lrc)))

#define	RDMA_SYNCMEM(conn, handle, buff, len, direction)	\
	((conn)->c_rdmamod->rdma_ops->rdma_syncmem((conn), (handle),	\
	    (buff), (len), (direction)))
/*
 * Buffer wrappers: call the rdma_buf_alloc / rdma_buf_free entries of the
 * ops vector reached through the connection's c_rdmamod, passing the
 * connection and the buffer argument.  "conn" is evaluated twice, so the
 * argument must be free of side effects.
 */
#define	RDMA_BUF_ALLOC(conn, rbuf)	\
	((conn)->c_rdmamod->rdma_ops->rdma_buf_alloc((conn), (rbuf)))

#define	RDMA_BUF_FREE(conn, rbuf)	\
	((conn)->c_rdmamod->rdma_ops->rdma_buf_free((conn), (rbuf)))
/*
 * Send and receive wrappers.  Each one calls the like-named entry of the
 * ops vector reached through the connection's c_rdmamod, passing the
 * connection first and the remaining arguments unchanged.  "conn" is
 * evaluated twice, so the argument must be free of side effects.
 */
#define	RDMA_SEND(conn, sendlist, xid)	\
	((conn)->c_rdmamod->rdma_ops->rdma_send((conn), (sendlist), (xid)))

#define	RDMA_SEND_RESP(conn, sendlist, xid)	\
	((conn)->c_rdmamod->rdma_ops->rdma_send_resp((conn), (sendlist), \
	    (xid)))

#define	RDMA_CLNT_RECVBUF(conn, cl, xid)	\
	((conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf((conn), (cl), (xid)))

#define	RDMA_CLNT_RECVBUF_REMOVE(conn, xid)	\
	((conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf_remove((conn), (xid)))

#define	RDMA_SVC_RECVBUF(conn, cl)	\
	((conn)->c_rdmamod->rdma_ops->rdma_svc_recvbuf((conn), (cl)))

#define	RDMA_RECV(conn, recvlist, xid)	\
	((conn)->c_rdmamod->rdma_ops->rdma_recv((conn), (recvlist), (xid)))
/*
 * RDMA_READ and RDMA_WRITE call the rdma_read / rdma_write entries of the
 * ops vector reached through the connection's c_rdmamod; "conn" is
 * evaluated twice, so the argument must be free of side effects.
 * RDMA_GETINFO calls rdma_getinfo through the ops vector of the given
 * module and passes only "info".
 */
#define	RDMA_READ(conn, cl, wait)	\
	((conn)->c_rdmamod->rdma_ops->rdma_read((conn), (cl), (wait)))

#define	RDMA_WRITE(conn, cl, wait)	\
	((conn)->c_rdmamod->rdma_ops->rdma_write((conn), (cl), (wait)))

#define	RDMA_GETINFO(rdma_mod, info)	\
	((rdma_mod)->rdma_ops->rdma_getinfo((info)))
515 extern rdma_registry_t
*rdma_mod_head
;
516 extern krwlock_t rdma_lock
; /* protects rdma_mod_head list */
517 extern int rdma_modloaded
; /* flag for loading RDMA plugins */
518 extern int rdma_dev_available
; /* rdma device is loaded or not */
519 extern kmutex_t rdma_modload_lock
; /* protects rdma_modloaded flag */
520 extern uint_t rdma_minchunk
;
521 extern ldi_ident_t rpcmod_li
; /* needed by layed driver framework */
524 * General RDMA routines
526 extern struct clist
*clist_alloc(void);
527 extern void clist_add(struct clist
**, uint32_t, int,
528 struct mrc
*, caddr_t
, struct mrc
*, caddr_t
);
529 extern void clist_free(struct clist
*);
530 extern uint32_t clist_len(struct clist
*);
531 extern void clist_zero_len(struct clist
*);
532 extern rdma_stat
clist_register(CONN
*conn
, struct clist
*cl
, clist_dstsrc
);
533 extern rdma_stat
clist_deregister(CONN
*conn
, struct clist
*cl
);
534 extern rdma_stat
clist_syncmem(CONN
*conn
, struct clist
*cl
, clist_dstsrc
);
535 extern rdma_stat
rdma_clnt_postrecv(CONN
*conn
, uint32_t xid
);
536 extern rdma_stat
rdma_clnt_postrecv_remove(CONN
*conn
, uint32_t xid
);
537 extern rdma_stat
rdma_svc_postrecv(CONN
*conn
);
538 extern rdma_stat
rdma_register_mod(rdma_mod_t
*mod
);
539 extern rdma_stat
rdma_unregister_mod(rdma_mod_t
*mod
);
540 extern rdma_stat
rdma_buf_alloc(CONN
*, rdma_buf_t
*);
541 extern void rdma_buf_free(CONN
*, rdma_buf_t
*);
/*
 * Takes no arguments and returns an int.  The parameter list is spelled
 * (void) so that this is a real prototype: an empty "()" in C leaves the
 * parameters unspecified and disables argument checking at call sites.
 */
extern int rdma_modload(void);
543 extern bool_t
rdma_get_wchunk(struct svc_req
*, iovec_t
*, struct clist
*);
544 extern rdma_stat
rdma_kwait(void);
545 extern int rdma_setup_read_chunks(struct clist
*, uint32_t, int *);
550 extern void xdrrdma_create(XDR
*, caddr_t
, uint_t
, int, struct clist
*,
551 enum xdr_op
, CONN
*);
552 extern void xdrrdma_destroy(XDR
*);
554 extern uint_t
xdrrdma_getpos(XDR
*);
555 extern bool_t
xdrrdma_setpos(XDR
*, uint_t
);
556 extern bool_t
xdr_clist(XDR
*, clist
*);
557 extern bool_t
xdr_do_clist(XDR
*, clist
**);
558 extern uint_t
xdr_getbufsize(XDR
*);
559 extern unsigned int xdrrdma_sizeof(xdrproc_t
, void *, int, uint_t
*, uint_t
*);
560 extern unsigned int xdrrdma_authsize(AUTH
*, struct cred
*, int);
562 extern void xdrrdma_store_wlist(XDR
*, struct clist
*);
563 extern struct clist
*xdrrdma_wclist(XDR
*);
564 extern bool_t
xdr_decode_reply_wchunk(XDR
*, struct clist
**);
565 extern bool_t
xdr_decode_wlist(XDR
*xdrs
, struct clist
**, bool_t
*);
566 extern bool_t
xdr_decode_wlist_svc(XDR
*xdrs
, struct clist
**, bool_t
*,
568 extern bool_t
xdr_encode_rlist_svc(XDR
*, clist
*);
569 extern bool_t
xdr_encode_wlist(XDR
*, clist
*);
570 extern bool_t
xdr_encode_reply_wchunk(XDR
*, struct clist
*,
571 uint32_t seg_array_len
);
572 bool_t
xdrrdma_getrdmablk(XDR
*, struct clist
**, uint_t
*,
573 CONN
**conn
, const uint_t
);
574 bool_t
xdrrdma_read_from_client(struct clist
*, CONN
**, uint_t
);
575 bool_t
xdrrdma_send_read_data(XDR
*, uint_t
, struct clist
*);
576 bool_t
xdrrdma_free_clist(CONN
*, struct clist
*);
583 #endif /* _RPC_RPC_RDMA_H */