Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / usr / src / uts / common / rpc / rpc_rdma.h
blob0cbb8ef45b4282c98aaf96e72748a3fc27ade0f8
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code were developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements to contributions from developers:
 *   Ranjit Noronha: noronha@cse.ohio-state.edu
 *   Lei Chai      : chail@cse.ohio-state.edu
 *   Weikuan Yu    : yuw@cse.ohio-state.edu
 */
39 #ifndef _RPC_RPC_RDMA_H
40 #define _RPC_RPC_RDMA_H
42 #include <rpc/rpc.h>
43 #include <rpc/rpc_sztypes.h>
44 #include <sys/sunddi.h>
45 #include <sys/sunldi.h>
47 #ifdef __cplusplus
48 extern "C" {
49 #endif
/*
 * Protocol and API version numbers.
 */
#define	RPCRDMA_VERS	1	/* Version of the RPC over RDMA protocol */
#define	RDMATF_VERS	1	/* Version of the API used by RPC for RDMA */
#define	RDMATF_VERS_1	1	/* Current version of RDMATF */
/*
 * The size of an RPC call or reply message
 */
#define	RPC_MSG_SZ	1024

/*
 * RDMA chunk size
 */
#define	RDMA_MINCHUNK	1024

/*
 * Storage for a chunk list
 */
#define	RPC_CL_SZ	1024

/*
 * Chunk size
 */
#define	MINCHUNK	1024

/*
 * Size of receive buffer
 */
#define	RPC_BUF_SIZE	2048
/*
 * Completion behaviour requested from an RDMA operation.
 */
#define	NOWAIT	0	/* don't wait for operation to complete */
#define	WAIT	1	/* wait and ensure that operation is complete */
/*
 * RDMA xdr buffer control and other control flags. Add new flags here,
 * set them in private structure for xdr over RDMA in xdr_rdma.c
 *
 * The values are distinct single bits, so they can be OR-ed together.
 */
#define	XDR_RDMA_CHUNK			0x1
#define	XDR_RDMA_WLIST_REG		0x2
#define	XDR_RDMA_RLIST_REG		0x4
/*
 * Buffer lengths for long replies and for the write/read chunk lists
 * (byte units presumed -- confirm against the users in xdr_rdma.c).
 */
#define	LONG_REPLY_LEN	65536
#define	WCL_BUF_LEN	32768
#define	RCL_BUF_LEN	32768

/*
 * Receive buffer credits.
 */
#define	RDMA_BUFS_RQST	34	/* Num bufs requested by client */
#define	RDMA_BUFS_GRANT	32	/* Num bufs granted by server */
/*
 * Accessor returning a pointer to a struct xdr_ops; presumably the XDR
 * operations vector used for xdr over RDMA (defined outside this header --
 * confirm in xdr_rdma.c).
 */
struct xdr_ops *xdrrdma_xops(void);
/*
 * Credit Control Structures.
 *
 * rdma_cc_type_t records which side of a connection the credit control
 * state describes.
 */
typedef enum rdma_cc_type {
	RDMA_CC_CLNT,	/* CONN is for a client */
	RDMA_CC_SRV	/* CONN is for a server */
} rdma_cc_type_t;
110 * Client side credit control data structure.
112 typedef struct rdma_clnt_cred_ctrl {
113 uint32_t clnt_cc_granted_ops;
114 uint32_t clnt_cc_in_flight_ops;
115 kcondvar_t clnt_cc_cv;
116 } rdma_clnt_cred_ctrl_t;
/*
 * Server side credit control data structure.
 */
typedef struct rdma_srv_cred_ctrl {
	uint32_t	srv_cc_buffers_granted;
	uint32_t	srv_cc_cur_buffers_used;
	uint32_t	srv_cc_posted;
	uint32_t	srv_cc_max_buf_size;	/* to be determined by CCP */
	uint32_t	srv_cc_cur_buf_size;	/* to be determined by CCP */
} rdma_srv_cred_ctrl_t;
/*
 * Write handling selected for an RPC call. Enumerators take the default
 * values 0, 1 and 2.
 */
typedef enum {
	RPCCALL_WLIST,
	RPCCALL_WCHUNK,
	RPCCALL_NOWRITE
} rpccall_write_t;
/*
 * Which side of a chunk list entry a registration applies to.
 * Numbering starts at 1, so 0 is not a valid value.
 */
typedef enum {
	CLIST_REG_SOURCE = 1,
	CLIST_REG_DST
} clist_dstsrc;
/*
 * Return codes from RDMA operations
 */
typedef enum {

	RDMA_SUCCESS = 0,	/* successful operation */

	RDMA_INVAL = 1,		/* invalid parameter */
	RDMA_TIMEDOUT = 2,	/* operation timed out */
	RDMA_INTR = 3,		/* operation interrupted */
	RDMA_NORESOURCE = 4,	/* insufficient resource */
	/*
	 * connection errors
	 */
	RDMA_REJECT = 5,	/* connection req rejected */
	RDMA_NOLISTENER = 6,	/* no listener on server */
	RDMA_UNREACHABLE = 7,	/* host unreachable */
	RDMA_CONNLOST = 8,	/* connection lost */

	RDMA_XPRTFAILED = 9,	/* RDMA transport failed */
	RDMA_PROTECTERR = 10,	/* memory protection error */
	RDMA_OVERRUN = 11,	/* transport overrun */
	RDMA_RECVQEMPTY = 12,	/* incoming pkt dropped, recv q empty */
	RDMA_PROTFAILED = 13,	/* RDMA protocol failed */
	RDMA_NOTSUPP = 14,	/* requested feature not supported */
	RDMA_REMOTERR = 15,	/* error at remote end */
	/*
	 * RDMATF errors
	 */
	RDMA_BADVERS = 16,	/* mismatch RDMATF versions */
	RDMA_REG_EXIST = 17,	/* RDMATF registration already exists */
	RDMA_HCA_ATTACH = 18,
	RDMA_HCA_DETACH = 19,

	/*
	 * fallback error
	 */
	RDMA_FAILED = 20	/* generic error */
} rdma_stat;
/*
 * Memory region context. This is an RDMA provider generated
 * handle for a registered arbitrary size contiguous virtual
 * memory. The RDMA Interface Adapter needs this for local or
 * remote memory access.
 *
 * The mrc_rmr field holds the remote memory region context
 * which is sent over-the-wire to provide the remote host
 * with RDMA access to the memory region.
 */
struct mrc {
	uint32_t	mrc_rmr;	/* Remote MR context, sent OTW */
	union {
		struct mr {
			uint32_t	lmr;	/* Local MR context */
			uint64_t	linfo;	/* Local memory info */
		} mr;
	} lhdl;
};

/* Shorthand accessors for the local handle nested inside struct mrc. */
#define	mrc_lmr		lhdl.mr.lmr
#define	mrc_linfo	lhdl.mr.linfo
/*
 * Memory management for the RDMA buffers
 */
/*
 * RDMA buffer types
 */
typedef enum {
	SEND_BUFFER,	/* buf for send msg */
	SEND_DESCRIPTOR, /* buf used for send msg descriptor in plugins only */
	RECV_BUFFER,	/* buf for recv msg */
	RECV_DESCRIPTOR, /* buf used for recv msg descriptor in plugins only */
	RDMA_LONG_BUFFER /* chunk buf used in RDMATF only and not in plugins */
} rdma_btype;
218 * RDMA buffer information
220 typedef struct rdma_buf {
221 rdma_btype type; /* buffer type */
222 uint_t len; /* length of buffer */
223 caddr_t addr; /* buffer address */
224 struct mrc handle; /* buffer registration handle */
225 caddr_t rb_private;
226 } rdma_buf_t;
230 * The XDR offset value is used by the XDR
231 * routine to identify the position in the
232 * RPC message where the opaque object would
233 * normally occur. Neither the data content
234 * of the chunk, nor its size field are included
235 * in the RPC message. The XDR offset is calculated
236 * as if the chunks were present.
238 * The remaining fields identify the chunk of data
239 * on the sender. The c_memhandle identifies a
240 * registered RDMA memory region and the c_addr
241 * and c_len fields identify the chunk within it.
243 struct clist {
244 uint32 c_xdroff; /* XDR offset */
245 uint32 c_len; /* Length */
246 clist_dstsrc c_regtype; /* type of registration */
247 struct mrc c_smemhandle; /* src memory handle */
248 uint64 c_ssynchandle; /* src sync handle */
249 union {
250 uint64 c_saddr; /* src address */
251 caddr_t c_saddr3;
252 } w;
253 struct mrc c_dmemhandle; /* dst memory handle */
254 uint64 c_dsynchandle; /* dst sync handle */
255 union {
256 uint64 c_daddr; /* dst address */
257 caddr_t c_daddr3;
258 } u;
259 struct as *c_adspc; /* address space for saddr/daddr */
260 rdma_buf_t rb_longbuf; /* used for long requests/replies */
261 struct clist *c_next; /* Next chunk */
264 typedef struct clist clist;
/*
 * max 4M wlist xfer size
 * This is defined because the rfs3_tsize service requires a
 * svc_req struct (which we don't have in krecv).
 */
#define	MAX_SVC_XFER_SIZE (4*1024*1024)
/*
 * Message type discriminator for an RPC-over-RDMA message.
 */
enum rdma_proc {
	RDMA_MSG	= 0,	/* chunk list and RPC msg follow */
	RDMA_NOMSG	= 1,	/* only chunk list follows */
	RDMA_MSGP	= 2,	/* chunk list and RPC msg with padding follow */
	RDMA_DONE	= 3	/* signal completion of chunk transfer */
};
281 * Listener information for a service
283 struct rdma_svc_data {
284 queue_t q; /* queue_t to place incoming pkts */
285 int active; /* If active, after registeration startup */
286 rdma_stat err_code; /* Error code from plugin layer */
287 int32_t svcid; /* RDMA based service identifier */
291 * Per RDMA plugin module information.
292 * Will be populated by each plugin
293 * module during its initialization.
295 typedef struct rdma_mod {
296 char *rdma_api; /* "kvipl", "ibtf", etc */
297 uint_t rdma_version; /* RDMATF API version */
298 int rdma_count; /* # of devices */
299 struct rdmaops *rdma_ops; /* rdma op vector for api */
300 } rdma_mod_t;
303 * Registry of RDMA plugins
305 typedef struct rdma_registry {
306 rdma_mod_t *r_mod; /* plugin mod info */
307 uint32_t r_mod_state;
308 struct rdma_registry *r_next; /* next registered RDMA plugin */
309 } rdma_registry_t;
/*
 * RDMA MODULE state flags (r_mod_state).
 */
#define	RDMA_MOD_ACTIVE		1
#define	RDMA_MOD_INACTIVE	0
318 * RDMA transport information
320 typedef struct rdma_info {
321 uint_t addrlen; /* address length */
322 uint_t mts; /* max transfer size */
323 uint_t mtu; /* native mtu size of unlerlying network */
324 } rdma_info_t;
/*
 * Connection states (stored in c_state of CONN). Each state is a distinct
 * single bit.
 */
typedef enum {
	C_IDLE		= 0x00000001,
	C_CONN_PEND	= 0x00000002,
	C_CONNECTED	= 0x00000004,
	C_ERROR_CONN	= 0x00000008,
	C_DISCONN_PEND	= 0x00000010,
	C_REMOTE_DOWN	= 0x00000020
} conn_c_state;
/* c_flags */
#define	C_CLOSE_NOTNEEDED	0x00000001	/* just free the channel */
#define	C_CLOSE_PENDING		0x00000002	/* a close in progress */
340 * RDMA Connection information
342 typedef struct conn {
343 rdma_mod_t *c_rdmamod; /* RDMA transport info for conn */
344 char *c_netid; /* tcp or tcp6 token */
345 struct netbuf c_raddr; /* remote address */
346 struct netbuf c_laddr; /* local address */
347 struct netbuf c_addrmask; /* Address Mask */
348 int c_ref; /* no. of clients of connection */
349 struct conn *c_next; /* next in list of connections */
350 struct conn *c_prev; /* prev in list of connections */
351 caddr_t c_private; /* transport specific stuff */
352 conn_c_state c_state; /* state of connection */
353 int c_flags; /* flags for connection management */
354 rdma_cc_type_t c_cc_type; /* client or server, for credit cntrl */
355 union {
356 rdma_clnt_cred_ctrl_t c_clnt_cc;
357 rdma_srv_cred_ctrl_t c_srv_cc;
358 } rdma_conn_cred_ctrl_u;
359 kmutex_t c_lock; /* protect c_state and c_ref fields */
360 kcondvar_t c_cv; /* to signal when pending is done */
361 timeout_id_t c_timeout; /* timeout id for untimeout() */
362 time_t c_last_used; /* last time any activity on the conn */
363 } CONN;
367 * Data transferred from plugin interrupt to svc_queuereq()
369 typedef struct rdma_recv_data {
370 CONN *conn;
371 int status;
372 rdma_buf_t rpcmsg;
373 } rdma_recv_data_t;
/* structure used to pass information for READ over rdma write */
typedef enum {
	RCI_WRITE_UIO_CHUNK = 1,
	RCI_WRITE_ADDR_CHUNK = 2,
	RCI_REPLY_CHUNK = 3
} rci_type_t;
382 typedef struct {
383 rci_type_t rci_type;
384 union {
385 struct uio *rci_uiop;
386 caddr_t rci_addr;
387 } rci_a;
388 uint32 rci_len;
389 struct clist **rci_clpp; /* point to write chunk list in readargs */
390 } rdma_chunkinfo_t;
392 typedef struct {
393 uint_t rcil_len;
394 uint_t rcil_len_alt;
395 } rdma_chunkinfo_lengths_t;
397 typedef struct {
398 struct clist *rwci_wlist;
399 CONN *rwci_conn;
400 } rdma_wlist_conn_info_t;
403 * Operations vector for RDMA transports.
405 typedef struct rdmaops {
406 /* Network */
407 rdma_stat (*rdma_reachable)(int addr_type, struct netbuf *,
408 void **handle);
409 /* Connection */
410 rdma_stat (*rdma_get_conn)(struct netbuf *, struct netbuf *,
411 int addr_type, void *, CONN **);
412 rdma_stat (*rdma_rel_conn)(CONN *);
413 /* Server side listner start and stop routines */
414 void (*rdma_svc_listen)(struct rdma_svc_data *);
415 void (*rdma_svc_stop)(struct rdma_svc_data *);
416 /* Memory */
417 rdma_stat (*rdma_regmem)(CONN *, caddr_t, caddr_t,
418 uint_t, struct mrc *);
419 rdma_stat (*rdma_deregmem)(CONN *, caddr_t, struct mrc);
420 rdma_stat (*rdma_regmemsync)(CONN *, caddr_t, caddr_t, uint_t,
421 struct mrc *, void **, void *);
422 rdma_stat (*rdma_deregmemsync)(CONN *, caddr_t, struct mrc,
423 void *, void *);
424 rdma_stat (*rdma_syncmem)(CONN *, void *, caddr_t, int, int);
425 /* Buffer */
426 rdma_stat (*rdma_buf_alloc)(CONN *, rdma_buf_t *);
427 void (*rdma_buf_free)(CONN *, rdma_buf_t *);
428 /* Transfer */
429 rdma_stat (*rdma_send)(CONN *, clist *, uint32_t);
430 rdma_stat (*rdma_send_resp)(CONN *, clist *, uint32_t);
431 rdma_stat (*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t);
432 rdma_stat (*rdma_clnt_recvbuf_remove)(CONN *, uint32_t);
433 rdma_stat (*rdma_svc_recvbuf)(CONN *, clist *);
434 rdma_stat (*rdma_recv)(CONN *, clist **, uint32_t);
435 /* RDMA */
436 rdma_stat (*rdma_read)(CONN *, clist *, int);
437 rdma_stat (*rdma_write)(CONN *, clist *, int);
438 /* INFO */
439 rdma_stat (*rdma_getinfo)(rdma_info_t *info);
440 } rdmaops_t;
442 typedef struct rdma_svc_wait {
443 kmutex_t svc_lock;
444 kcondvar_t svc_cv;
445 rdma_stat svc_stat;
446 } rdma_svc_wait_t;
448 extern rdma_svc_wait_t rdma_wait;
/*
 * RDMA operations.
 *
 * Each macro calls the matching function pointer in an operations vector.
 * RDMA_REACHABLE and RDMA_GET_CONN take the vector directly, RDMA_GETINFO
 * takes the module (rdma_mod->rdma_ops), and all others reach the vector
 * through the connection (conn->c_rdmamod->rdma_ops). The conn argument is
 * evaluated twice by those macros, so it must be free of side effects.
 */
#define	RDMA_REACHABLE(rdma_ops, addr_type, addr, handle)	\
	(*(rdma_ops)->rdma_reachable)(addr_type, addr, handle)

#define	RDMA_GET_CONN(rdma_ops, saddr, daddr, addr_type, handle, conn)	\
	(*(rdma_ops)->rdma_get_conn)(saddr, daddr, addr_type, handle, conn)

#define	RDMA_REL_CONN(conn)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn)

#define	RDMA_REGMEM(conn, adsp, buff, len, handle)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, adsp,	\
		buff, len, handle)

#define	RDMA_DEREGMEM(conn, buff, handle)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_deregmem)(conn, buff, handle)

#define	RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle, lrc)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \
	len, handle, synchandle, lrc)

#define	RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle, lrc)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff,	\
	handle, synchandle, lrc)

#define	RDMA_SYNCMEM(conn, handle, buff, len, direction)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_syncmem)(conn, handle, \
	    buff, len, direction)

#define	RDMA_BUF_ALLOC(conn, rbuf)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_buf_alloc)(conn, rbuf)

#define	RDMA_BUF_FREE(conn, rbuf)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_buf_free)(conn, rbuf)

#define	RDMA_SEND(conn, sendlist, xid)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_send)(conn, sendlist, xid)

#define	RDMA_SEND_RESP(conn, sendlist, xid)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_send_resp)(conn, sendlist, xid)

#define	RDMA_CLNT_RECVBUF(conn, cl, xid)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf)(conn, cl, xid)

#define	RDMA_CLNT_RECVBUF_REMOVE(conn, xid)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf_remove)(conn, xid)

#define	RDMA_SVC_RECVBUF(conn, cl)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_svc_recvbuf)(conn, cl)

#define	RDMA_RECV(conn, recvlist, xid)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_recv)(conn, recvlist, xid)

#define	RDMA_READ(conn, cl, wait)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_read)(conn, cl, wait)

#define	RDMA_WRITE(conn, cl, wait)	\
	(*(conn)->c_rdmamod->rdma_ops->rdma_write)(conn, cl, wait)

#define	RDMA_GETINFO(rdma_mod, info)	\
	(*(rdma_mod)->rdma_ops->rdma_getinfo)(info)
#ifdef _KERNEL
extern rdma_registry_t	*rdma_mod_head;
extern krwlock_t	rdma_lock;		/* protects rdma_mod_head list */
extern int		rdma_modloaded;		/* flag for loading RDMA plugins */
extern int		rdma_dev_available;	/* rdma device is loaded or not */
extern kmutex_t		rdma_modload_lock;	/* protects rdma_modloaded flag */
extern uint_t		rdma_minchunk;
extern ldi_ident_t	rpcmod_li;	/* needed by layered driver framework */

/*
 * General RDMA routines
 */
extern struct clist *clist_alloc(void);
extern void clist_add(struct clist **, uint32_t, int,
	struct mrc *, caddr_t, struct mrc *, caddr_t);
extern void clist_free(struct clist *);
extern uint32_t clist_len(struct clist *);
extern void clist_zero_len(struct clist *);
extern rdma_stat clist_register(CONN *conn, struct clist *cl, clist_dstsrc);
extern rdma_stat clist_deregister(CONN *conn, struct clist *cl);
extern rdma_stat clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc);
extern rdma_stat rdma_clnt_postrecv(CONN *conn, uint32_t xid);
extern rdma_stat rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid);
extern rdma_stat rdma_svc_postrecv(CONN *conn);
extern rdma_stat rdma_register_mod(rdma_mod_t *mod);
extern rdma_stat rdma_unregister_mod(rdma_mod_t *mod);
extern rdma_stat rdma_buf_alloc(CONN *, rdma_buf_t *);
extern void rdma_buf_free(CONN *, rdma_buf_t *);
/* (void) gives a real prototype; an empty list would leave args unchecked */
extern int rdma_modload(void);
extern bool_t   rdma_get_wchunk(struct svc_req *, iovec_t *, struct clist *);
extern rdma_stat rdma_kwait(void);
extern int rdma_setup_read_chunks(struct clist *, uint32_t, int *);

/*
 * RDMA XDR
 */
extern void xdrrdma_create(XDR *, caddr_t, uint_t, int, struct clist *,
	enum xdr_op, CONN *);
extern void xdrrdma_destroy(XDR *);

extern uint_t xdrrdma_getpos(XDR *);
extern bool_t xdrrdma_setpos(XDR *, uint_t);
extern bool_t xdr_clist(XDR *, clist *);
extern bool_t xdr_do_clist(XDR *, clist **);
extern uint_t xdr_getbufsize(XDR *);
extern unsigned int xdrrdma_sizeof(xdrproc_t, void *, int, uint_t *, uint_t *);
extern unsigned int xdrrdma_authsize(AUTH *, struct cred *, int);

extern void xdrrdma_store_wlist(XDR *, struct clist *);
extern struct clist *xdrrdma_wclist(XDR *);
extern bool_t xdr_decode_reply_wchunk(XDR *, struct clist **);
extern bool_t xdr_decode_wlist(XDR *xdrs, struct clist **, bool_t *);
extern bool_t xdr_decode_wlist_svc(XDR *xdrs, struct clist **, bool_t *,
	uint32_t *, CONN *);
extern bool_t xdr_encode_rlist_svc(XDR *, clist *);
extern bool_t xdr_encode_wlist(XDR *, clist *);
extern bool_t xdr_encode_reply_wchunk(XDR *, struct clist *,
		uint32_t seg_array_len);
bool_t xdrrdma_getrdmablk(XDR *, struct clist **, uint_t *,
	CONN **conn, const uint_t);
bool_t xdrrdma_read_from_client(struct clist *, CONN **, uint_t);
bool_t xdrrdma_send_read_data(XDR *, uint_t, struct clist *);
bool_t xdrrdma_free_clist(CONN *, struct clist *);
#endif /* _KERNEL */
/* Closes the extern "C" linkage block opened near the top of this header. */
#ifdef	__cplusplus
}
#endif
583 #endif /* _RPC_RPC_RDMA_H */