epan/dissectors/pidl/ C99 drsuapi
[wireshark-sm.git] / epan / dissectors / packet-rpcrdma.c
blob24c18b98a27c58a463adadc69b7e52e0cb49d627
/* packet-rpcordma.c
 * Routines for RPC over RDMA dissection (RFC 5666)
 * Copyright 2014-2015, Mellanox Technologies Ltd.
 * Code by Yan Burman.
 *
 * Wireshark - Network traffic analyzer
 * By Gerald Combs <gerald@wireshark.org>
 * Copyright 1998 Gerald Combs
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */
13 #include "config.h"
15 #include <stdlib.h>
17 #include <epan/packet.h>
18 #include <epan/prefs.h>
19 #include <epan/exceptions.h>
20 #include <epan/proto_data.h>
21 #include <epan/reassemble.h>
22 #include <epan/conversation.h>
23 #include <epan/addr_resolv.h>
25 #include "packet-rpcrdma.h"
26 #include "packet-frame.h"
27 #include "packet-infiniband.h"
28 #include "packet-iwarp-ddp-rdmap.h"
30 #define MIN_RPCRDMA_HDR_SZ 16
31 #define MIN_RPCRDMA_MSG_SZ (MIN_RPCRDMA_HDR_SZ + 12)
32 #define MIN_RPCRDMA_MSGP_SZ (MIN_RPCRDMA_MSG_SZ + 8)
34 #define SID_ULP_MASK 0x00000000FF000000
35 #define SID_PROTO_MASK 0x0000000000FF0000
36 #define SID_PORT_MASK 0x000000000000FFFF
38 #define SID_ULP 0x01
39 #define SID_PROTO_TCP 0x06
40 #define TCP_PORT_RPCRDMA_RANGE "20049,2050"
42 #define SID_MASK (SID_ULP_MASK | SID_PROTO_MASK)
43 #define SID_ULP_TCP ((SID_ULP << 3 * 8) | (SID_PROTO_TCP << 2 * 8))
45 void proto_reg_handoff_rpcordma(void);
46 void proto_register_rpcordma(void);
48 static int proto_rpcordma;
49 static dissector_handle_t rpcordma_handle;
50 static dissector_handle_t rpc_handler;
52 /* RPCoRDMA Header */
53 static int hf_rpcordma_xid;
54 static int hf_rpcordma_vers;
55 static int hf_rpcordma_flow_control;
56 static int hf_rpcordma_message_type;
58 /* chunks */
59 static int hf_rpcordma_reads_count;
60 static int hf_rpcordma_writes_count;
61 static int hf_rpcordma_reply_count;
63 static int hf_rpcordma_position;
64 static int hf_rpcordma_segment_count;
66 /* rdma_segment */
67 static int hf_rpcordma_rdma_handle;
68 static int hf_rpcordma_rdma_length;
69 static int hf_rpcordma_rdma_offset;
71 static int hf_rpcordma_rdma_align;
72 static int hf_rpcordma_rdma_thresh;
74 static int hf_rpcordma_errcode;
75 static int hf_rpcordma_vers_high;
76 static int hf_rpcordma_vers_low;
78 /* Initialize the subtree pointers */
79 static int ett_rpcordma;
80 static int ett_rpcordma_chunk;
81 static int ett_rpcordma_read_list;
82 static int ett_rpcordma_read_chunk;
83 static int ett_rpcordma_write_list;
84 static int ett_rpcordma_write_chunk;
85 static int ett_rpcordma_reply_chunk;
86 static int ett_rpcordma_segment;
88 /* Fragmentation */
89 static int hf_rpcordma_fragments;
90 static int hf_rpcordma_fragment;
91 static int hf_rpcordma_fragment_overlap;
92 static int hf_rpcordma_fragment_overlap_conflicts;
93 static int hf_rpcordma_fragment_multiple_tails;
94 static int hf_rpcordma_fragment_too_long_fragment;
95 static int hf_rpcordma_fragment_error;
96 static int hf_rpcordma_fragment_count;
97 static int hf_rpcordma_reassembled_in;
98 static int hf_rpcordma_reassembled_length;
99 static int hf_rpcordma_reassembled_data;
101 static int ett_rpcordma_fragment;
102 static int ett_rpcordma_fragments;
104 static const fragment_items rpcordma_frag_items = {
105 /* Fragment subtrees */
106 &ett_rpcordma_fragment,
107 &ett_rpcordma_fragments,
108 /* Fragment fields */
109 &hf_rpcordma_fragments,
110 &hf_rpcordma_fragment,
111 &hf_rpcordma_fragment_overlap,
112 &hf_rpcordma_fragment_overlap_conflicts,
113 &hf_rpcordma_fragment_multiple_tails,
114 &hf_rpcordma_fragment_too_long_fragment,
115 &hf_rpcordma_fragment_error,
116 &hf_rpcordma_fragment_count,
117 /* Reassembled in field */
118 &hf_rpcordma_reassembled_in,
119 /* Reassembled length field */
120 &hf_rpcordma_reassembled_length,
121 /* Reassembled data field */
122 &hf_rpcordma_reassembled_data,
123 /* Tag */
124 "RPCoRDMA fragments"
127 /* Reassembly table */
128 static reassembly_table rpcordma_reassembly_table;
/* RPC-over-RDMA message types, numbered consecutively from zero */
enum MSG_TYPE {
    RDMA_MSG,
    RDMA_NOMSG,
    RDMA_MSGP,
    RDMA_DONE,
    RDMA_ERROR
};
138 static const value_string rpcordma_message_type[] = {
139 {RDMA_MSG, "RDMA_MSG"},
140 {RDMA_NOMSG, "RDMA_NOMSG"},
141 {RDMA_MSGP, "RDMA_MSGP"},
142 {RDMA_DONE, "RDMA_DONE"},
143 {RDMA_ERROR, "RDMA_ERROR"},
144 {0, NULL}
147 #define ERR_VERS 1
148 #define ERR_CHUNK 2
150 static const value_string rpcordma_err[] = {
151 {ERR_VERS, "ERR_VERS"},
152 {ERR_CHUNK, "ERR_CHUNK"},
153 {0, NULL}
/* RDMA chunk type */
typedef enum {
    RDMA_READ_CHUNK,
    RDMA_WRITE_CHUNK,
    RDMA_REPLY_CHUNK
} chunk_type_t;
/* RDMA segment */
typedef struct {
    uint32_t xdrpos;   /* Position in XDR stream -- RDMA read only */
    uint32_t handle;   /* Registered memory handle */
    uint32_t length;   /* Length of segment in bytes */
    uint64_t offset;   /* Segment virtual address or offset */
} rdma_segment_t;
171 /* RDMA chunk */
172 typedef struct {
173 chunk_type_t type; /* Chunk type */
174 uint32_t length; /* Length of chunk in bytes */
175 wmem_array_t *segments; /* List of segments for chunk */
176 } rdma_chunk_t;
178 /* RPC-over-RDMA lists */
179 typedef struct {
180 wmem_array_t *p_read_list; /* List of RDMA read chunks */
181 wmem_array_t *p_write_list; /* List of RDMA write chunks */
182 wmem_array_t *p_reply_list; /* List of RDMA reply chunks */
183 } rdma_lists_t;
186 * Segment information for RDMA I/O
187 * All segments belonging to the same chunk list have the same message ID
188 * A segment could have multiple I/O requests
190 typedef struct {
191 uint32_t handle; /* Handle or remote key of segment */
192 uint64_t offset; /* Segment virtual address or offset */
193 uint32_t msgid; /* ID for fragments belonging together */
194 uint32_t msgno; /* Message number base so fragments are
195 consecutive within segment requests */
196 chunk_type_t type; /* Chunk type for segment */
197 uint32_t xdrpos; /* Position in XDR stream -- RDMA read only */
198 uint32_t length; /* Length of segment in bytes */
199 uint32_t rbytes; /* Number of bytes added to reassembly table */
200 } segment_info_t;
202 typedef struct {
203 uint32_t psn; /* First PSN for request */
204 uint32_t length; /* Request length */
205 uint64_t offset; /* Request offset */
206 segment_info_t *segment; /* Segment info for RDMA I/O */
207 } ib_request_t;
/* Send reassembly info structure */
typedef struct {
    uint32_t msgid;  /* ID for fragments belonging together */
    uint32_t msgno;  /* Message number base */
    uint32_t rsize;  /* Number of bytes added to reassembly table */
} send_info_t;
216 /* State structure per conversation */
217 typedef struct {
218 wmem_tree_t *segment_list; /* Binary tree of segments searched by handle */
219 wmem_tree_t *psn_list; /* Binary tree of IB requests searched by PSN */
220 wmem_tree_t *msgid_list; /* Binary tree of segments with same message id */
221 wmem_tree_t *request_list; /* Binary tree of iWarp read requests for mapping sink -> source */
222 wmem_tree_t *send_list; /* Binary tree for mapping PSN -> msgid (IB) */
223 wmem_tree_t *msn_list; /* Binary tree for mapping MSN -> msgid (iWarp) */
224 segment_info_t *segment_info; /* Current READ/WRITE/REPLY segment info */
225 uint32_t iosize; /* Maximum size of data transferred in a
226 single packet */
227 } rdma_conv_info_t;
/* Proto data keys */
enum {
    RPCRDMA_MSG_ID,
    RPCRDMA_FRAG_HEAD,
    RPCRDMA_WRITE_SIZE,
};
/*
 * Return the number of fragments of size 'b' needed to hold 'a' bytes,
 * i.e. a/b rounded up. 'b' must be non-zero; both arguments are evaluated
 * more than once.
 */
#define NFRAGS(a,b) (((a) / (b)) + ((((a) % (b)) != 0) ? 1 : 0))
/*
 * Global variable set for every InfiniBand packet. This is used because
 * the arguments in dissect_rpcrdma are fixed and cannot be changed to pass
 * an extra argument to differentiate between InfiniBand and iWarp.
 * Reassembly is only supported for InfiniBand packets.
 */
245 static struct infinibandinfo *gp_infiniband_info;
247 /* Global variable set for every iWarp packet */
248 static rdmap_info_t *gp_rdmap_info;
250 /* Call process_reassembled_data just once per frame */
251 static bool g_needs_reassembly;
253 /* Array of offsets for reduced data in write chunks */
254 static wmem_array_t *gp_rdma_write_offsets;
256 /* Signal upper layer(s) the current frame's data has been reduced by DDP */
257 static bool g_rpcrdma_reduced;
260 * Signal upper layer(s) the current frame's data has been reduced by DDP
261 * (direct data placement) in which large data chunks have been removed from
262 * the XDR data so these data chunks could be transferred using RDMA writes.
263 * This is only used on RDMA write chunks because there is no way to know
264 * where each write chunk must be inserted into the XDR data.
265 * Read chunks have the xdrpos because the client needs to notify the server
266 * how to reassemble the reduced message and their chunks. On the other hand,
267 * write chunks do not have this information because the client knows exactly
268 * how to reassemble the reply with the use of the virtual address in the chunk,
269 * but this virtual address is internal to the client -- there is no way to
270 * map the virtual address to an offset within the XDR data.
272 bool rpcrdma_is_reduced(void)
274 return g_rpcrdma_reduced;
278 * Insert offset in the reduced data write chunk array.
279 * Offset is relative to the reduced message from the end of the reported
280 * buffer because the upper layer is dealing with the reduced XDR message
281 * so it is easier to report this offset back and calculate the correct XDR
282 * position in this layer before reassembly starts for a reduced message
284 void rpcrdma_insert_offset(int offset)
286 wmem_array_append_one(gp_rdma_write_offsets, offset);
290 * Reset the array of write offsets at the end of the frame. These
291 * are packet scoped, so they don't need to be freed, but we want
292 * to ensure that the global doesn't point to no longer allocated
293 * memory in a later packet.
295 static void
296 reset_write_offsets(void)
298 gp_rdma_write_offsets = NULL;
301 /* Get conversation state, it is created if it does not exist */
302 static rdma_conv_info_t *get_rdma_conv_info(packet_info *pinfo)
304 conversation_t *p_conversation;
305 rdma_conv_info_t *p_rdma_conv_info;
307 /* Find or create conversation info */
308 p_conversation = find_or_create_conversation(pinfo);
310 /* Get state structure for this conversation */
311 p_rdma_conv_info = (rdma_conv_info_t *)conversation_get_proto_data(p_conversation, proto_rpcordma);
312 if (p_rdma_conv_info == NULL) {
313 /* Add state structure for this conversation */
314 p_rdma_conv_info = wmem_new(wmem_file_scope(), rdma_conv_info_t);
315 p_rdma_conv_info->segment_list = wmem_tree_new(wmem_file_scope());
316 p_rdma_conv_info->psn_list = wmem_tree_new(wmem_file_scope());
317 p_rdma_conv_info->msgid_list = wmem_tree_new(wmem_file_scope());
318 p_rdma_conv_info->send_list = wmem_tree_new(wmem_file_scope());
319 p_rdma_conv_info->msn_list = wmem_tree_new(wmem_file_scope());
320 p_rdma_conv_info->request_list = wmem_tree_new(wmem_file_scope());
321 p_rdma_conv_info->segment_info = NULL;
322 p_rdma_conv_info->iosize = 1;
323 conversation_add_proto_data(p_conversation, proto_rpcordma, p_rdma_conv_info);
325 return p_rdma_conv_info;
328 /* Set RDMA maximum I/O size for conversation */
329 static void set_max_iosize(rdma_conv_info_t *p_rdma_conv_info, unsigned size)
331 p_rdma_conv_info->iosize = MAX(p_rdma_conv_info->iosize, size);
/*
 * Return a unique non-zero message ID.
 * IDs come from a counter that persists across calls; zero is skipped
 * when the counter wraps around.
 */
static uint32_t get_msg_id(void)
{
    static uint32_t msg_id = 0;

    msg_id++;
    if (msg_id == 0) {
        /* Message ID has wrapped around so increment again */
        msg_id++;
    }
    return msg_id;
}
345 /* Find segment info for the given handle and offset */
346 static segment_info_t *find_segment_info(rdma_conv_info_t *p_rdma_conv_info, uint32_t handle, uint64_t offset)
348 segment_info_t *p_segment_info;
350 p_segment_info = (segment_info_t *)wmem_tree_lookup32(p_rdma_conv_info->segment_list, handle);
351 if (p_segment_info && offset >= p_segment_info->offset && \
352 offset < p_segment_info->offset + p_segment_info->length)
353 return p_segment_info;
354 return NULL;
357 /* Add Infiniband request info for the correct segment */
358 static void add_request_info(rdma_conv_info_t *p_rdma_conv_info, packet_info *pinfo)
360 segment_info_t *p_segment_info;
361 ib_request_t *p_ib_request;
363 if (!pinfo->fd->visited) {
364 p_segment_info = find_segment_info(p_rdma_conv_info, gp_infiniband_info->reth_remote_key, gp_infiniband_info->reth_remote_address);
365 if (p_segment_info) {
366 /* Add request to list */
367 p_ib_request = wmem_new(wmem_file_scope(), ib_request_t);
368 p_ib_request->psn = gp_infiniband_info->packet_seq_num;
369 p_ib_request->offset = gp_infiniband_info->reth_remote_address;
370 p_ib_request->length = gp_infiniband_info->reth_dma_length;
371 p_ib_request->segment = p_segment_info;
372 wmem_tree_insert32(p_rdma_conv_info->psn_list, gp_infiniband_info->packet_seq_num, p_ib_request);
378 * Return if reassembly is done by checking all bytes in each segment have
379 * been added to the reassembly table. It could be more than requested
380 * because of padding bytes.
382 static bool is_reassembly_done(rdma_conv_info_t *p_rdma_conv_info, uint32_t msgid)
384 uint32_t message_size = 0;
385 uint32_t reassembled_size = 0;
386 wmem_list_frame_t *item;
387 wmem_list_t *msgid_segments;
388 segment_info_t *p_segment_info;
389 bool ret = false; /* Make sure there is at least one segment */
390 int segment_type = -1;
392 /* Get all segments for the given msgid */
393 msgid_segments = wmem_tree_lookup32(p_rdma_conv_info->msgid_list, msgid);
394 if (msgid_segments) {
395 for (item = wmem_list_head(msgid_segments); item != NULL; item = wmem_list_frame_next(item)) {
396 p_segment_info = wmem_list_frame_data(item);
397 segment_type = p_segment_info->type;
398 if (p_segment_info->rbytes < p_segment_info->length) {
399 /* Not all bytes have been received for this request */
400 return false;
402 /* At least one segment is done, check the rest */
403 ret = true;
404 message_size += p_segment_info->length;
405 reassembled_size += p_segment_info->rbytes;
408 if (ret && segment_type == RDMA_READ_CHUNK) {
410 * Make sure all bytes are added to the reassembly table. Since the
411 * reassembly is done on the READ_RESPONSE_LAST, a read request could
412 * happen after the last read response for the previous request, in
413 * this case this will give a false positive so check the total size
414 * of all chunks (all segments required for the message)
416 return (reassembled_size >= message_size);
418 return ret;
422 * Get the fragment head from the cache
423 * Returns NULL if still missing fragments
425 static fragment_head *get_fragment_head(packet_info *pinfo)
427 return (fragment_head *)p_get_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_FRAG_HEAD);
430 /* Save the fragment head on the proto data cache */
431 static void set_fragment_head(fragment_head *fd_head, packet_info *pinfo)
433 if (fd_head && fd_head != get_fragment_head(pinfo)) {
434 /* Add the fragment head to the packet cache */
435 p_add_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_FRAG_HEAD, fd_head);
440 * Get the fragment head for the current frame
441 * Returns non-NULL if this frame is a fragment
443 static fragment_head *get_reassembled_id(packet_info *pinfo)
445 uint32_t *p_msgid;
446 p_msgid = (uint32_t *)p_get_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_MSG_ID);
447 if (p_msgid) {
448 return fragment_get_reassembled_id(&rpcordma_reassembly_table, pinfo, *p_msgid);
450 return NULL;
453 /* Get the reassembled data, returns NULL if still missing fragments */
454 static tvbuff_t *get_reassembled_data(tvbuff_t *tvb, unsigned offset,
455 packet_info *pinfo, proto_tree *tree)
457 tvbuff_t *new_tvb = NULL;
458 fragment_head *fd_head;
460 if (g_needs_reassembly) {
461 /* Get fragment head for fragment to display "Reassembled in" message */
462 fd_head = get_reassembled_id(pinfo);
463 if (!fd_head) {
464 /* Get fragment head on frame where reassembly has been completed */
465 fd_head = get_fragment_head(pinfo);
467 if (fd_head) {
468 new_tvb = process_reassembled_data(tvb, offset, pinfo,
469 "Reassembled RPCoRDMA Message", fd_head, &rpcordma_frag_items,
470 NULL, tree);
471 /* Call process_reassembled_data just once per frame */
472 g_needs_reassembly = false;
475 return new_tvb;
479 * Complete reassembly:
480 * 1. If p_rdma_conv_info is NULL then complete reassembly.
481 * 2. If p_rdma_conv_info is non-NULL then complete reassembly only if
482 * reassembly is really done by making sure all data has been received.
484 static fragment_head *end_reassembly(uint32_t msgid,
485 rdma_conv_info_t *p_rdma_conv_info, packet_info *pinfo)
487 fragment_head *fd_head = NULL;
489 /* Check if reassembly is really done only if p_rdma_conv_info is non-NULL */
490 if (!p_rdma_conv_info || is_reassembly_done(p_rdma_conv_info, msgid)) {
491 /* Complete the reassembly */
492 fd_head = fragment_end_seq_next(&rpcordma_reassembly_table, pinfo, msgid, NULL);
493 set_fragment_head(fd_head, pinfo);
495 return fd_head;
499 * Add a fragment to the reassembly table and return the reassembled data
500 * if all fragments have been added
502 static tvbuff_t *add_fragment(tvbuff_t *tvb, int offset, uint32_t msgid,
503 int32_t msg_num, bool more_frags, rdma_conv_info_t *p_rdma_conv_info,
504 packet_info *pinfo, proto_tree *tree)
506 uint8_t pad_count = 0;
507 uint32_t nbytes, frag_size;
508 tvbuff_t *new_tvb = NULL;
509 fragment_head *fd_head = NULL;
510 uint32_t *p_msgid;
512 if (gp_infiniband_info) {
513 pad_count = gp_infiniband_info->pad_count;
516 /* Get fragment head if reassembly has been completed */
517 fd_head = get_fragment_head(pinfo);
518 if (fd_head == NULL) {
519 /* Reassembly has not been completed yet */
520 if (msg_num >= 0) {
521 nbytes = tvb_captured_length_remaining(tvb, offset);
522 if (nbytes > 0 || more_frags) {
523 /* Add message fragment to reassembly table */
524 if (pad_count > 0 && p_rdma_conv_info && \
525 p_rdma_conv_info->segment_info != NULL && \
526 p_rdma_conv_info->segment_info->type == RDMA_READ_CHUNK && \
527 p_rdma_conv_info->segment_info->xdrpos == 0) {
528 /* Do not include any padding bytes inserted by Infiniband
529 * layer if this is a PZRC (Position-Zero Read Chunk) since
530 * payload stream already has any necessary padding bytes */
531 frag_size = tvb_reported_length_remaining(tvb, offset) - pad_count;
532 if (frag_size < nbytes) {
533 nbytes = frag_size;
536 fd_head = fragment_add_seq_check(&rpcordma_reassembly_table,
537 tvb, offset, pinfo,
538 msgid, NULL, (uint32_t)msg_num,
539 nbytes, more_frags);
540 /* Save the msgid in the proto data cache */
541 p_msgid = wmem_new(wmem_file_scope(), uint32_t);
542 *p_msgid = msgid;
543 p_add_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_MSG_ID, p_msgid);
544 } else if (p_rdma_conv_info) {
545 /* No data in this frame, so just complete the reassembly
546 * if reassembly is really done */
547 fd_head = end_reassembly(msgid, p_rdma_conv_info, pinfo);
549 /* Add the fragment head to the packet cache */
550 set_fragment_head(fd_head, pinfo);
554 /* Get reassembled data */
555 new_tvb = get_reassembled_data(tvb, 0, pinfo, tree);
557 return new_tvb;
561 * Add an Infiniband fragment to the reassembly table and return the
562 * reassembled data if all fragments have been added
564 static tvbuff_t *add_ib_fragment(tvbuff_t *tvb,
565 rdma_conv_info_t *p_rdma_conv_info, bool only_frag,
566 packet_info *pinfo, proto_tree *tree)
568 uint32_t msgid, msg_num, msg_off;
569 uint32_t nfrags, psndelta = 0;
570 tvbuff_t *new_tvb = NULL;
571 ib_request_t *p_ib_request;
572 segment_info_t *p_segment_info = NULL;
573 uint32_t iosize = p_rdma_conv_info->iosize;
574 uint64_t va_offset;
576 if (pinfo->fd->visited) {
577 return get_reassembled_data(tvb, 0, pinfo, tree);
578 } else if (only_frag) {
579 /* Write Only: no request so use segment info */
580 p_segment_info = find_segment_info(p_rdma_conv_info, gp_infiniband_info->reth_remote_key, gp_infiniband_info->reth_remote_address);
581 va_offset = gp_infiniband_info->reth_remote_address;
582 } else {
583 p_rdma_conv_info->segment_info = NULL;
584 /* Get correct request */
585 p_ib_request = (ib_request_t *)wmem_tree_lookup32_le(p_rdma_conv_info->psn_list, gp_infiniband_info->packet_seq_num);
586 if (p_ib_request) {
587 psndelta = gp_infiniband_info->packet_seq_num - p_ib_request->psn;
588 nfrags = NFRAGS((p_ib_request->length), iosize);
589 if (psndelta < nfrags) {
590 /* This is the correct request */
591 p_segment_info = p_ib_request->segment;
592 /* Make message number relative to request */
593 va_offset = p_ib_request->offset;
597 if (p_segment_info) {
598 p_rdma_conv_info->segment_info = p_segment_info;
599 p_segment_info->rbytes += tvb_reported_length(tvb);
600 /* Make message number relative to request or segment(write only) */
601 msg_off = (uint32_t)NFRAGS((va_offset - p_segment_info->offset), iosize) + psndelta;
602 msgid = p_segment_info->msgid;
603 msg_num = p_segment_info->msgno + 1 + msg_off;
604 new_tvb = add_fragment(tvb, 0, msgid, msg_num, true, p_rdma_conv_info, pinfo, tree);
606 return new_tvb;
610 * Add padding bytes as a separate fragment when last fragment's data is not
611 * on a four-byte boundary. The MPA layer removes the padding bytes from all
612 * iWarp Reads/Writes. The iWarp Send messages are padded correctly.
614 static void add_iwarp_padding(tvbuff_t *tvb, int offset,
615 uint32_t msgid, uint32_t msgno, packet_info *pinfo)
617 char *pbuf;
618 tvbuff_t *pad_tvb;
619 /* Size of payload data for current iWarp Read/Write */
620 uint32_t bsize = tvb_reported_length_remaining(tvb, offset);
621 /* Number of padding bytes needed */
622 uint32_t padding = (4 - (bsize%4)) % 4;
624 if (padding > 0) {
625 /* Allocate buffer for the number of padding bytes that will be added */
626 pbuf = (char *)wmem_alloc(pinfo->pool, padding);
627 memset(pbuf, 0, padding);
628 /* Create tvb buffer */
629 pad_tvb = tvb_new_real_data(pbuf, padding, padding);
630 /* Add padding fragment to the reassembly table */
631 fragment_add_seq_check(&rpcordma_reassembly_table, pad_tvb, 0,
632 pinfo, msgid, NULL, msgno, padding, true);
637 * Add an iWarp fragment to the reassembly table and return the
638 * reassembled data if all fragments have been added
640 static tvbuff_t *add_iwarp_fragment(tvbuff_t *tvb,
641 rdma_conv_info_t *p_rdma_conv_info, packet_info *pinfo,
642 proto_tree *tree)
644 uint32_t sbytes = 0; /* Total bytes for all segments in current reassembly */
645 uint32_t rbytes = 0; /* Total bytes received so far */
646 uint32_t msgno; /* Message number for this fragment */
647 uint32_t steering_tag;
648 uint64_t tagged_offset;
649 bool more_frags = true;
650 wmem_list_t *msgid_segments;
651 wmem_list_frame_t *item;
652 segment_info_t *p_seginfo;
653 segment_info_t *p_segment_info;
654 rdmap_request_t *p_read_request = NULL;
655 tvbuff_t *new_tvb = NULL;
657 if (pinfo->fd->visited) {
658 return get_reassembled_data(tvb, 0, pinfo, tree);
659 } else if (gp_rdmap_info->opcode == RDMA_READ_RESPONSE) {
660 /* Read fragment: map sink -> source using the request info */
661 p_read_request = wmem_tree_lookup32(p_rdma_conv_info->request_list, gp_rdmap_info->steering_tag);
662 if (p_read_request) {
663 /* Map Read Response STag to segment STag */
664 steering_tag = p_read_request->source_stag;
665 /* Map Read Response offset to segment offset */
666 tagged_offset = gp_rdmap_info->tagged_offset - p_read_request->sink_toffset + p_read_request->source_toffset;
667 } else {
668 return NULL;
670 } else {
671 /* Write fragment: no need for mapping, use steering tag and offset */
672 steering_tag = gp_rdmap_info->steering_tag;
673 tagged_offset = gp_rdmap_info->tagged_offset;
676 p_rdma_conv_info->segment_info = NULL;
677 p_segment_info = find_segment_info(p_rdma_conv_info, steering_tag, tagged_offset);
678 if (p_segment_info) {
679 /* Message number is relative with respect to chunk, adding
680 * one since msgno = 0 is reserved for the reduced message */
681 msgno = (uint32_t)(tagged_offset - p_segment_info->offset) + p_segment_info->msgno + 1;
682 p_rdma_conv_info->segment_info = p_segment_info;
684 /* Include this fragment's data */
685 p_segment_info->rbytes += tvb_captured_length_remaining(tvb, 0);
687 if (gp_rdmap_info->last_flag) {
688 /* This is a last fragment so go through all segments
689 * to calculate sbytes and rbytes */
690 msgid_segments = wmem_tree_lookup32(p_rdma_conv_info->msgid_list, p_segment_info->msgid);
691 if (msgid_segments) {
692 for (item = wmem_list_head(msgid_segments); item != NULL; item = wmem_list_frame_next(item)) {
693 p_seginfo = wmem_list_frame_data(item);
694 sbytes += p_seginfo->length;
695 rbytes += p_seginfo->rbytes;
698 if (p_read_request && rbytes == sbytes) {
699 /* Complete read chunk reassembly since all fragments
700 * have been received */
701 more_frags = false;
704 new_tvb = add_fragment(tvb, 0, p_segment_info->msgid, msgno, true, p_rdma_conv_info, pinfo, tree);
705 if ((!new_tvb && !more_frags) || (gp_rdmap_info->last_flag && !p_read_request && rbytes == sbytes)) {
706 /* This is the very last fragment, include any padding if needed */
707 add_iwarp_padding(tvb, 0, p_segment_info->msgid, msgno+1, pinfo);
709 if (!new_tvb && !more_frags) {
710 /* Complete reassembly */
711 end_reassembly(p_segment_info->msgid, p_rdma_conv_info, pinfo);
712 new_tvb = get_reassembled_data(tvb, 0, pinfo, tree);
715 return new_tvb;
718 static unsigned get_read_list_size(tvbuff_t *tvb, unsigned max_offset, unsigned offset)
720 uint32_t value_follows;
721 unsigned start = offset;
723 while (1) {
724 value_follows = tvb_get_ntohl(tvb, offset);
725 offset += 4;
726 if (offset > max_offset)
727 return 0;
728 if (!value_follows)
729 break;
731 offset += 20;
732 if (offset > max_offset)
733 return 0;
736 return offset - start;
739 static unsigned get_read_list_chunk_count(tvbuff_t *tvb, unsigned offset)
741 uint32_t value_follows;
742 unsigned num_chunks;
744 num_chunks = 0;
745 while (1) {
746 value_follows = tvb_get_ntohl(tvb, offset);
747 offset += 4;
748 if (!value_follows)
749 break;
751 num_chunks++;
752 offset += 20;
754 return num_chunks;
757 static unsigned get_write_chunk_size(tvbuff_t *tvb, unsigned offset)
759 unsigned segment_count;
760 unsigned max_count = (unsigned)tvb_reported_length_remaining(tvb, offset + 4) / 16;
762 segment_count = tvb_get_ntohl(tvb, offset);
763 if (segment_count > max_count) {
764 /* XXX We should throw an exception here. */
765 segment_count = max_count;
767 return 4 + (segment_count * 16);
770 static unsigned get_write_list_size(tvbuff_t *tvb, unsigned max_offset, unsigned offset)
772 uint32_t value_follows;
773 unsigned chunk_size, start = offset;
775 while (1) {
776 value_follows = tvb_get_ntohl(tvb, offset);
777 offset += 4;
778 if (offset > max_offset)
779 return 0;
780 if (!value_follows)
781 break;
783 chunk_size = get_write_chunk_size(tvb, offset);
784 if (max_offset - offset < chunk_size)
785 return 0;
786 offset += chunk_size;
789 return offset - start;
792 static unsigned get_write_list_chunk_count(tvbuff_t *tvb, unsigned offset)
794 uint32_t value_follows;
795 unsigned num_chunks, chunk_size;
797 num_chunks = 0;
798 while (1) {
799 value_follows = tvb_get_ntohl(tvb, offset);
800 offset += 4;
801 if (!value_follows)
802 break;
804 num_chunks++;
805 chunk_size = get_write_chunk_size(tvb, offset);
806 if (chunk_size == 0)
807 break;
808 offset += chunk_size;
811 return num_chunks;
814 static unsigned get_reply_chunk_size(tvbuff_t *tvb, unsigned max_offset, unsigned offset)
816 uint32_t value_follows;
817 unsigned start = offset;
819 value_follows = tvb_get_ntohl(tvb, offset);
820 offset += 4;
821 if (offset > max_offset)
822 return 0;
824 if (value_follows) {
825 offset += get_write_chunk_size(tvb, offset);
826 if (offset > max_offset)
827 return 0;
830 return offset - start;
833 static unsigned get_reply_chunk_count(tvbuff_t *tvb, unsigned offset)
835 uint32_t value_follows;
837 value_follows = tvb_get_ntohl(tvb, offset);
838 return value_follows ? 1 : 0;
842 * The RDMA read list is given as a list of read segments in the protocol.
843 * In order to create a list of chunks, all segments having the same XDR
844 * position will be part of an RDMA read chunk.
846 static void add_rdma_read_segment(wmem_array_t *p_read_list,
847 rdma_segment_t *p_rdma_segment)
849 unsigned i;
850 rdma_segment_t *p_segment;
851 rdma_chunk_t *p_rdma_chunk = NULL;
853 /* Look for correct chunk where to insert the segment */
854 for (i=0; i<wmem_array_get_count(p_read_list); i++) {
855 p_rdma_chunk = (rdma_chunk_t *)wmem_array_index(p_read_list, i);
856 p_segment = (rdma_segment_t *)wmem_array_index(p_rdma_chunk->segments, 0);
857 if (p_segment->xdrpos == p_rdma_segment->xdrpos) {
858 /* Found correct read chunk */
859 break;
860 } else {
861 p_rdma_chunk = NULL;
865 if (p_rdma_chunk == NULL) {
866 /* No read chunk was found so initialize a new chunk */
867 p_rdma_chunk = wmem_new(wmem_packet_scope(), rdma_chunk_t);
868 p_rdma_chunk->type = RDMA_READ_CHUNK;
869 p_rdma_chunk->segments = wmem_array_new(wmem_packet_scope(), sizeof(rdma_segment_t));
870 /* Add read chunk to the RDMA read list */
871 wmem_array_append(p_read_list, p_rdma_chunk, 1);
874 /* Add segment to the read chunk */
875 wmem_array_append(p_rdma_chunk->segments, p_rdma_segment, 1);
878 static unsigned dissect_rpcrdma_read_chunk(proto_tree *read_list,
879 tvbuff_t *tvb, unsigned offset, wmem_array_t *p_read_list)
881 proto_tree *read_chunk;
882 uint32_t position;
883 rdma_segment_t *p_rdma_segment;
885 /* Initialize read segment */
886 p_rdma_segment = wmem_new(wmem_packet_scope(), rdma_segment_t);
888 position = tvb_get_ntohl(tvb, offset);
889 p_rdma_segment->xdrpos = position;
890 read_chunk = proto_tree_add_subtree_format(read_list, tvb,
891 offset, 20, ett_rpcordma_read_chunk, NULL,
892 "Read chunk: (position %u)", position);
894 proto_tree_add_item(read_chunk, hf_rpcordma_position, tvb,
895 offset, 4, ENC_BIG_ENDIAN);
896 offset += 4;
897 proto_tree_add_item_ret_uint(read_chunk, hf_rpcordma_rdma_handle, tvb,
898 offset, 4, ENC_BIG_ENDIAN, &p_rdma_segment->handle);
899 offset += 4;
900 proto_tree_add_item_ret_uint(read_chunk, hf_rpcordma_rdma_length, tvb,
901 offset, 4, ENC_BIG_ENDIAN, &p_rdma_segment->length);
902 offset += 4;
903 proto_tree_add_item_ret_uint64(read_chunk, hf_rpcordma_rdma_offset, tvb,
904 offset, 8, ENC_BIG_ENDIAN, &p_rdma_segment->offset);
906 add_rdma_read_segment(p_read_list, p_rdma_segment);
907 return offset + 8;
910 static unsigned dissect_rpcrdma_read_list(tvbuff_t *tvb, unsigned offset,
911 proto_tree *tree, rdma_lists_t *rdma_lists)
913 unsigned chunk_count, start = offset;
914 proto_tree *read_list;
915 uint32_t value_follows;
916 proto_item *item;
918 chunk_count = get_read_list_chunk_count(tvb, offset);
919 item = proto_tree_add_uint_format(tree, hf_rpcordma_reads_count,
920 tvb, offset, 0, chunk_count,
921 "Read list (count: %u)", chunk_count);
923 read_list = proto_item_add_subtree(item, ett_rpcordma_read_list);
925 while (1) {
926 value_follows = tvb_get_ntohl(tvb, offset);
927 offset += 4;
928 if (!value_follows)
929 break;
931 if (rdma_lists->p_read_list == NULL) {
932 /* Initialize RDMA read list */
933 rdma_lists->p_read_list = wmem_array_new(wmem_packet_scope(), sizeof(rdma_chunk_t));
935 offset = dissect_rpcrdma_read_chunk(read_list, tvb, offset, rdma_lists->p_read_list);
938 proto_item_set_len(item, offset - start);
939 return offset;
942 static unsigned dissect_rpcrdma_segment(proto_tree *write_chunk, tvbuff_t *tvb,
943 unsigned offset, uint32_t i, wmem_array_t *p_segments)
945 proto_tree *segment;
946 rdma_segment_t *p_rdma_segment;
948 /* Initialize write segment */
949 p_rdma_segment = wmem_new(wmem_packet_scope(), rdma_segment_t);
950 p_rdma_segment->xdrpos = 0; /* Not used in write segments */
952 segment = proto_tree_add_subtree_format(write_chunk, tvb,
953 offset, 16, ett_rpcordma_segment, NULL,
954 "RDMA segment %u", i);
956 proto_tree_add_item_ret_uint(segment, hf_rpcordma_rdma_handle, tvb,
957 offset, 4, ENC_BIG_ENDIAN, &p_rdma_segment->handle);
958 offset += 4;
959 proto_tree_add_item_ret_uint(segment, hf_rpcordma_rdma_length, tvb,
960 offset, 4, ENC_BIG_ENDIAN, &p_rdma_segment->length);
961 offset += 4;
962 proto_tree_add_item_ret_uint64(segment, hf_rpcordma_rdma_offset, tvb,
963 offset, 8, ENC_BIG_ENDIAN, &p_rdma_segment->offset);
965 /* Add segment to the write chunk */
966 wmem_array_append(p_segments, p_rdma_segment, 1);
967 return offset + 8;
970 static unsigned dissect_rpcrdma_write_chunk(proto_tree *write_list, tvbuff_t *tvb,
971 unsigned offset, chunk_type_t chunk_type, wmem_array_t *p_rdma_list)
973 uint32_t i, segment_count;
974 proto_tree *write_chunk;
975 unsigned selection_size;
976 rdma_chunk_t *p_rdma_chunk;
978 selection_size = get_write_chunk_size(tvb, offset);
979 segment_count = tvb_get_ntohl(tvb, offset);
980 write_chunk = proto_tree_add_subtree_format(write_list, tvb,
981 offset, selection_size,
982 ett_rpcordma_write_chunk, NULL,
983 "Write chunk (%u segment%s)", segment_count,
984 segment_count == 1 ? "" : "s");
985 proto_tree_add_item(write_chunk, hf_rpcordma_segment_count,
986 tvb, offset, 4, ENC_BIG_ENDIAN);
987 offset += 4;
989 /* Initialize write chunk */
990 p_rdma_chunk = wmem_new(wmem_packet_scope(), rdma_chunk_t);
991 p_rdma_chunk->type = chunk_type;
992 p_rdma_chunk->segments = wmem_array_new(wmem_packet_scope(), sizeof(rdma_segment_t));
994 /* Add chunk to the write/reply list */
995 wmem_array_append(p_rdma_list, p_rdma_chunk, 1);
997 for (i = 0; i < segment_count; ++i)
998 offset = dissect_rpcrdma_segment(write_chunk, tvb, offset, i, p_rdma_chunk->segments);
1000 return offset;
1003 static unsigned dissect_rpcrdma_write_list(tvbuff_t *tvb, unsigned offset,
1004 proto_tree *tree, rdma_lists_t *rdma_lists)
1006 unsigned chunk_count, start = offset;
1007 proto_tree *write_list;
1008 uint32_t value_follows;
1009 proto_item *item;
1011 chunk_count = get_write_list_chunk_count(tvb, offset);
1012 item = proto_tree_add_uint_format(tree, hf_rpcordma_writes_count,
1013 tvb, offset, 0, chunk_count,
1014 "Write list (count: %u)", chunk_count);
1016 write_list = proto_item_add_subtree(item, ett_rpcordma_write_list);
1018 while (1) {
1019 value_follows = tvb_get_ntohl(tvb, offset);
1020 offset += 4;
1021 if (!value_follows)
1022 break;
1024 if (rdma_lists->p_write_list == NULL) {
1025 /* Initialize RDMA write list */
1026 rdma_lists->p_write_list = wmem_array_new(wmem_packet_scope(), sizeof(rdma_chunk_t));
1028 offset = dissect_rpcrdma_write_chunk(write_list, tvb, offset, RDMA_WRITE_CHUNK, rdma_lists->p_write_list);
1031 proto_item_set_len(item, offset - start);
1032 return offset;
1035 static unsigned dissect_rpcrdma_reply_chunk(tvbuff_t *tvb, unsigned offset,
1036 proto_tree *tree, rdma_lists_t *rdma_lists)
1038 uint32_t chunk_count, start = offset;
1039 proto_tree *reply_chunk;
1040 uint32_t value_follows;
1041 proto_item *item;
1043 chunk_count = get_reply_chunk_count(tvb, offset);
1044 item = proto_tree_add_uint_format(tree, hf_rpcordma_reply_count,
1045 tvb, offset, 4, chunk_count,
1046 "Reply chunk (count: %u)", chunk_count);
1048 reply_chunk = proto_item_add_subtree(item, ett_rpcordma_reply_chunk);
1050 value_follows = tvb_get_ntohl(tvb, offset);
1051 offset += 4;
1052 if (!value_follows)
1053 return offset;
1055 /* Initialize RDMA reply list */
1056 rdma_lists->p_reply_list = wmem_array_new(wmem_packet_scope(), sizeof(rdma_chunk_t));
1058 offset = dissect_rpcrdma_write_chunk(reply_chunk, tvb, offset, RDMA_REPLY_CHUNK, rdma_lists->p_reply_list);
1059 proto_item_set_len(item, offset - start);
1060 return offset;
1063 static unsigned parse_rdma_header(tvbuff_t *tvb, unsigned offset, proto_tree *tree,
1064 rdma_lists_t *rdma_lists)
1066 offset = dissect_rpcrdma_read_list(tvb, offset, tree, rdma_lists);
1067 offset = dissect_rpcrdma_write_list(tvb, offset, tree, rdma_lists);
1068 return dissect_rpcrdma_reply_chunk(tvb, offset, tree, rdma_lists);
1071 static unsigned get_chunk_lists_size(tvbuff_t *tvb, unsigned max_offset, unsigned offset)
1073 unsigned size, start = offset;
1075 size = get_read_list_size(tvb, max_offset, offset);
1076 if (!size)
1077 return 0;
1078 offset += size;
1080 size = get_write_list_size(tvb, max_offset, offset);
1081 if (!size)
1082 return 0;
1083 offset += size;
1085 size = get_reply_chunk_size(tvb, max_offset, offset);
1086 if (!size)
1087 return 0;
1088 offset += size;
1090 return offset - start;
1094 * Return the total number of bytes for the given RDMA chunk list
1095 * Returns 0 when called on an RPC call message because that is where the
1096 * segments are set up. On an RPC reply message the total number of bytes
1097 * added to the reassembly table is returned. This is only valid for RDMA
1098 * writes since there is no RPC-over-RDMA layer for RDMA reads on an RPC reply.
1100 static unsigned
1101 get_rdma_list_size(wmem_array_t *p_list, packet_info *pinfo)
1103 unsigned i, j, size = 0;
1104 uint32_t *p_size;
1105 rdma_chunk_t *p_rdma_chunk;
1106 rdma_segment_t *p_rdma_segment;
1107 segment_info_t *p_segment_info;
1108 rdma_conv_info_t *p_rdma_conv_info;
1110 if (p_list) {
1111 /* Get size from cache */
1112 p_size = (uint32_t *)p_get_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_WRITE_SIZE);
1113 if (p_size) {
1114 return *p_size;
1116 /* Get conversation state */
1117 p_rdma_conv_info = get_rdma_conv_info(pinfo);
1118 for (i=0; i<wmem_array_get_count(p_list); i++) {
1119 p_rdma_chunk = (rdma_chunk_t *)wmem_array_index(p_list, i);
1120 for (j=0; j<wmem_array_get_count(p_rdma_chunk->segments); j++) {
1121 p_rdma_segment = (rdma_segment_t *)wmem_array_index(p_rdma_chunk->segments, j);
1122 p_segment_info = find_segment_info(p_rdma_conv_info, p_rdma_segment->handle, p_rdma_segment->offset);
1123 if (p_segment_info) {
1124 size += p_segment_info->rbytes;
1129 if (size > 0) {
1130 /* Save size on the proto data cache */
1131 p_size = wmem_new(wmem_file_scope(), uint32_t);
1132 *p_size = size;
1133 p_add_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_WRITE_SIZE, p_size);
1135 return size;
1138 /* Process an RDMA chunk list (read, write or reply) */
1139 static tvbuff_t *
1140 process_rdma_list(tvbuff_t *tvb, unsigned offset, wmem_array_t *p_list,
1141 packet_info *pinfo, proto_tree *tree)
1143 unsigned i, j, size;
1144 uint32_t msgid = 0;
1145 uint32_t xdrpos = 0;
1146 uint32_t xdrprev = 0;
1147 uint32_t lenprev = 0;
1148 uint32_t msg_num = 0;
1149 uint32_t msg_off = 0;
1150 unsigned *p_offset = NULL;
1151 tvbuff_t *tmp_tvb;
1152 tvbuff_t *new_tvb = NULL;
1153 fragment_head *fd_head;
1154 rdma_segment_t *p_rdma_segment;
1155 rdma_chunk_t *p_rdma_chunk = NULL;
1156 segment_info_t *p_segment_info = NULL;
1157 bool setup = false;
1158 wmem_list_t *msgid_segments = NULL;
1159 rdma_conv_info_t *p_rdma_conv_info;
1161 if (p_list) {
1162 /* Get conversation state */
1163 p_rdma_conv_info = get_rdma_conv_info(pinfo);
1165 for (i=0; i<wmem_array_get_count(p_list); i++) {
1166 p_rdma_chunk = (rdma_chunk_t *)wmem_array_index(p_list, i);
1167 p_rdma_chunk->length = 0;
1168 p_offset = NULL;
1170 if (p_rdma_chunk->type == RDMA_WRITE_CHUNK) {
1171 /* Process any write chunk offsets from reduced message */
1172 if (gp_rdma_write_offsets && wmem_array_get_count(gp_rdma_write_offsets) == wmem_array_get_count(p_list)) {
1173 p_offset = (unsigned *)wmem_array_index(gp_rdma_write_offsets, i);
1174 /* Convert reduced offset to xdr position */
1175 xdrpos = tvb_reported_length_remaining(tvb, offset) - *p_offset + msg_off;
1179 for (j=0; j<wmem_array_get_count(p_rdma_chunk->segments); j++) {
1180 p_rdma_segment = (rdma_segment_t *)wmem_array_index(p_rdma_chunk->segments, j);
1181 if (p_rdma_chunk->type == RDMA_READ_CHUNK) {
1182 xdrpos = p_rdma_segment->xdrpos;
1184 p_segment_info = find_segment_info(p_rdma_conv_info, p_rdma_segment->handle, p_rdma_segment->offset);
1185 if (p_segment_info) {
1186 /* This must be the reply, change segment size */
1187 p_segment_info->length = p_rdma_segment->length;
1188 } else {
1189 if (msgid == 0) {
1190 /* Create new message ID */
1191 msgid = get_msg_id();
1192 msgid_segments = wmem_list_new(wmem_file_scope());
1193 wmem_tree_insert32(p_rdma_conv_info->msgid_list, msgid, msgid_segments);
1195 /* Create new segment info */
1196 p_segment_info = wmem_new(wmem_file_scope(), segment_info_t);
1197 p_segment_info->handle = p_rdma_segment->handle;
1198 p_segment_info->length = p_rdma_segment->length;
1199 p_segment_info->offset = p_rdma_segment->offset;
1200 p_segment_info->msgid = msgid;
1201 p_segment_info->msgno = p_rdma_chunk->length;
1202 p_segment_info->type = p_rdma_chunk->type;
1203 p_segment_info->xdrpos = xdrpos;
1204 p_segment_info->rbytes = 0;
1205 /* Add segment to the list of segments */
1206 wmem_tree_insert32(p_rdma_conv_info->segment_list, p_rdma_segment->handle, p_segment_info);
1207 wmem_list_append(msgid_segments, p_segment_info);
1208 setup = true;
1210 /* Calculate the number of bytes for the whole chunk */
1211 p_rdma_chunk->length += p_rdma_segment->length;
1214 /* Add chunk length to correctly calculate xdrpos */
1215 msg_off += p_rdma_chunk->length;
1218 * Add reduced data before each chunk data for either the
1219 * read chunk or write chunk (p_offset != NULL)
1221 if (p_rdma_chunk->type == RDMA_READ_CHUNK || p_offset) {
1223 * Payload data in this frame (e.g., two chunks)
1224 * where chunk data is sent separately using RDMA:
1225 * +----------------+----------------+----------------+
1226 * | xdrdata1 | xdrdata2 | xdrdata3 |
1227 * +----------------+----------------+----------------+
1228 * chunk data1 --^ chunk data2 --^
1230 * Reassembled message should look like the following in which
1231 * the xdrpos specifies where the chunk data must be inserted.
1232 * The xdrpos is relative to the reassembled message and NOT
1233 * relative to the reduced data (data in this frame):
1234 * +----------+-------------+----------+-------------+----------+
1235 * | xdrdata1 | chunk data1 | xdrdata2 | chunk data2 | xdrdata3 |
1236 * +----------+-------------+----------+-------------+----------+
1237 * xdrpos1 ---^ xdrpos2 --^
1240 /* Add data before the xdr position */
1241 size = xdrpos - xdrprev - lenprev;
1242 if (size > 0 && tvb_captured_length_remaining(tvb, offset) > 0 && p_segment_info) {
1243 tmp_tvb = tvb_new_subset_length(tvb, offset, size);
1244 add_fragment(tmp_tvb, 0, p_segment_info->msgid, msg_num, true, p_rdma_conv_info, pinfo, tree);
1245 /* Message number for fragment after read/write chunk */
1246 msg_num += p_rdma_chunk->length;
1247 /* Save rest of data for next fragment */
1248 tvb = tvb_new_subset_remaining(tvb, offset+size);
1249 offset = 0;
1252 xdrprev = xdrpos;
1253 lenprev = p_rdma_chunk->length;
1257 fd_head = get_fragment_head(pinfo);
1258 if (fd_head == NULL) {
1259 if (p_segment_info == NULL) {
1260 return NULL;
1261 } else if (p_rdma_chunk->type == RDMA_REPLY_CHUNK && !setup &&
1262 !pinfo->fd->visited && p_rdma_chunk->length > 0) {
1263 /* Only reassemble if reply chunk size is non-zero to avoid
1264 * reassembly of a single fragment. The RPC-over-RDMA reply
1265 * has no data when the reply chunk size is non-zero but it
1266 * needs to reassemble all fragments (more_frags = false)
1267 * in this frame. On the other hand when the reply chunk
1268 * size is zero, the whole message is given in this frame
1269 * therefore there is no need to reassemble. */
1270 new_tvb = add_fragment(tvb, offset, p_segment_info->msgid, 0, false, p_rdma_conv_info, pinfo, tree);
1271 } else if (p_rdma_chunk->type == RDMA_READ_CHUNK && tvb_captured_length_remaining(tvb, offset) > 0) {
1272 /* Add data after the last read chunk */
1273 add_fragment(tvb, offset, p_segment_info->msgid, msg_num, true, p_rdma_conv_info, pinfo, tree);
1274 } else if (p_offset && tvb_reported_length_remaining(tvb, offset) > 0) {
1275 /* Add data after the last write chunk */
1276 new_tvb = add_fragment(tvb, offset, p_segment_info->msgid, msg_num, true, p_rdma_conv_info, pinfo, tree);
1281 return new_tvb;
1284 /* Process all RDMA chunk lists (read, write and reply) */
1285 static tvbuff_t *
1286 process_rdma_lists(tvbuff_t *tvb, unsigned offset, rdma_lists_t *rdma_lists,
1287 packet_info *pinfo, proto_tree *tree)
1289 tvbuff_t *new_tvb;
1290 tvbuff_t *ret_tvb;
1292 new_tvb = get_reassembled_data(tvb, offset, pinfo, tree);
1293 if (new_tvb) {
1294 /* Reassembled message has already been cached */
1295 return new_tvb;
1299 * Reassembly is not done here, process the rdma list to set up the
1300 * expected read chunks and their respective segments
1301 * Reassembly is done on the last read response
1302 * - Used for a large RPC call which has at least one large opaque,
1303 * e.g., NFS WRITE
1304 * - The RPC call packet is used only to set up the RDMA read chunk list.
1305 * It also has the reduced message data which includes the first fragment
1306 * (XDR data up to and including the opaque length), but it could also
1307 * have fragments between each read chunk and the last fragment after
1308 * the last read chunk data. The reduced message is then broken down
1309 * into fragments and inserted into the reassembly table.
1310 * - The opaque data is transferred via RDMA reads, once all fragments are
1311 * accounted for they are reassembled and the whole RPC call is dissected
1312 * in the last read response -- there is no RPCoRDMA layer
1314 * - Packet sent order, the reduced RPC call is sent first, then the RDMA
1315 * reads, e.g., showing only for a single chunk:
1316 * +----------------+-------------+-----------+-----------+-----+-----------+
1317 * | WRITE call XDR | opaque size | GETATTR | RDMA read | ... | RDMA read |
1318 * +----------------+-------------+-----------+-----------+-----+-----------+
1319 * |<-------------- First frame ------------->|<-------- chunk data ------->|
1320 * Each RDMA read could be a single RDMA_READ_RESPONSE_ONLY or a series of
1321 * RDMA_READ_RESPONSE_FIRST, RDMA_READ_RESPONSE_MIDDLE, ...,
1322 * RDMA_READ_RESPONSE_LAST
1324 * - NFS WRITE call, this is how it should be reassembled:
1325 * +----------------+-------------+-----------+-----+-----------+-----------+
1326 * | WRITE call XDR | opaque size | RDMA read | ... | RDMA read | GETATTR |
1327 * +----------------+-------------+-----------+-----+-----------+-----------+
1328 * |<--- opaque (chunk) data --->|
1330 process_rdma_list(tvb, offset, rdma_lists->p_read_list, pinfo, tree);
1333 * Reassembly is done on the reply message (RDMA_NOMSG)
1334 * Process the rdma list on the call message to set up the reply
1335 * chunk and its respective segments expected by the reply
1336 * - Used for a large RPC reply which does not fit into a single SEND
1337 * operation and does not have a single large opaque, e.g., NFS READDIR
1338 * - The RPC call packet is used only to set up the RDMA reply chunk list
1339 * - The whole RPC reply is transferred via RDMA writes
1340 * - The RPC reply packet has no data (RDMA_NOMSG) but fragments are
1341 * reassembled and the whole RPC reply is dissected
1343 * - Packet sent order, this is the whole XDR data for the RPC reply:
1344 * +--------------------------+------------------+--------------------------+
1345 * | RDMA write | ... | RDMA write |
1346 * +--------------------------+------------------+--------------------------+
1347 * Each RDMA write could be a single RDMA_WRITE_ONLY or a series of
1348 * RDMA_WRITE_FIRST, RDMA_WRITE_MIDDLE, ..., RDMA_WRITE_LAST
1350 new_tvb = process_rdma_list(tvb, offset, rdma_lists->p_reply_list, pinfo, tree);
1353 * Reassembly is done on the reply message (RDMA_MSG)
1354 * Process the rdma list on the call message to set up the write
1355 * chunks and their respective segments expected by the reply
1356 * - Used for a large RPC reply which has at least one large opaque,
1357 * e.g., NFS READ
1358 * - The RPC call packet is used only to set up the RDMA write chunk list
1359 * - The opaque data is transferred via RDMA writes
1360 * - The RPC reply packet has the reduced message data which includes the
1361 * first fragment (XDR data up to and including the opaque length), but
1362 * it could also have fragments between each write chunk and the last
1363 * fragment after the last write chunk data. The reduced message is
1364 * then broken down into fragments and inserted into the reassembly table.
1365 * Fragments are then reassembled and the whole RPC reply is dissected
1366 * - Packet sent order, the RDMA writes are sent first, then the reduced RPC
1367 * reply, e.g., showing only for a single chunk:
1368 * +------------+-----+------------+----------------+-------------+---------+
1369 * | RDMA write | ... | RDMA write | READ reply XDR | opaque size | GETATTR |
1370 * +------------+-----+------------+----------------+-------------+---------+
1371 * |<-------- write chunk -------->|<------------- Last frame ------------->|
1372 * Each RDMA write could be a single RDMA_WRITE_ONLY or a series of
1373 * RDMA_WRITE_FIRST, RDMA_WRITE_MIDDLE, ..., RDMA_WRITE_LAST
1375 * - NFS READ reply, this is how it should be reassembled:
1376 * +----------------+-------------+------------+-----+------------+---------+
1377 * | READ reply XDR | opaque size | RDMA write | ... | RDMA write | GETATTR |
1378 * +----------------+-------------+------------+-----+------------+---------+
1379 * |<---- opaque (chunk) data ---->|
1381 ret_tvb = process_rdma_list(tvb, offset, rdma_lists->p_write_list, pinfo, tree);
1384 * Either the reply chunk or the write chunks should be reassembled here
1385 * but not both
1387 new_tvb = (new_tvb && ret_tvb) ? NULL : (ret_tvb ? ret_tvb : new_tvb);
1389 return new_tvb;
1393 * Add a fragment to the SEND reassembly table and return the reassembled data
1394 * if all fragments have been added
1396 static tvbuff_t *add_send_fragment(rdma_conv_info_t *p_rdma_conv_info,
1397 tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree)
1399 uint32_t msgid = 0;
1400 int32_t msgno = -1;
1401 tvbuff_t *new_tvb = NULL;
1402 bool first_frag = false;
1403 bool middle_frag = false;
1404 bool last_frag = false;
1405 send_info_t *p_send_info = NULL;
1407 if (gp_infiniband_info) {
1408 first_frag = gp_infiniband_info->opCode == RC_SEND_FIRST;
1409 middle_frag = gp_infiniband_info->opCode == RC_SEND_MIDDLE;
1410 last_frag = (gp_infiniband_info->opCode == RC_SEND_LAST || \
1411 gp_infiniband_info->opCode == RC_SEND_LAST_INVAL);
1412 } else if (gp_rdmap_info) {
1413 first_frag = !gp_rdmap_info->last_flag && gp_rdmap_info->message_offset == 0;
1414 middle_frag = !gp_rdmap_info->last_flag && gp_rdmap_info->message_offset > 0;
1415 last_frag = gp_rdmap_info->last_flag && gp_rdmap_info->message_offset > 0;
1418 if (!first_frag && !middle_frag && !last_frag) {
1419 /* Only one SEND fragment, no need to reassemble */
1420 return tvb;
1421 } else if (pinfo->fd->visited) {
1422 return get_reassembled_data(tvb, 0, pinfo, tree);
1423 } else if (first_frag) {
1424 /* Start of multi-SEND message */
1425 p_send_info = wmem_new(wmem_file_scope(), send_info_t);
1426 p_send_info->msgid = get_msg_id();
1427 p_send_info->rsize = 0;
1429 if (gp_infiniband_info) {
1430 /* Message numbers are relative with respect to current PSN */
1431 p_send_info->msgno = gp_infiniband_info->packet_seq_num;
1432 wmem_tree_insert32(p_rdma_conv_info->send_list, gp_infiniband_info->packet_seq_num, p_send_info);
1433 } else if (gp_rdmap_info) {
1434 /* Message numbers are given by the RDMAP offset -- msgno is not used */
1435 p_send_info->msgno = 0;
1436 wmem_tree_insert32(p_rdma_conv_info->msn_list, gp_rdmap_info->message_seq_num, p_send_info);
1438 } else {
1439 /* SEND fragment, get the send reassembly info structure */
1440 if (gp_infiniband_info) {
1441 p_send_info = wmem_tree_lookup32_le(p_rdma_conv_info->send_list, gp_infiniband_info->packet_seq_num);
1442 } else if (gp_rdmap_info) {
1443 p_send_info = wmem_tree_lookup32(p_rdma_conv_info->msn_list, gp_rdmap_info->message_seq_num);
1446 if (p_send_info) {
1447 p_send_info->rsize += tvb_reported_length(tvb);
1448 msgid = p_send_info->msgid;
1449 if (gp_infiniband_info) {
1450 /* Message numbers are consecutive starting at zero */
1451 msgno = gp_infiniband_info->packet_seq_num - p_send_info->msgno;
1452 } else if (gp_rdmap_info) {
1453 /* Message numbers are given by the RDMAP offset */
1454 msgno = gp_rdmap_info->message_offset;
1457 if (msgid > 0 && msgno >= 0) {
1458 new_tvb = add_fragment(tvb, 0, msgid, msgno, !last_frag, p_rdma_conv_info, pinfo, tree);
1459 if (last_frag && !new_tvb && gp_rdmap_info) {
1460 /* Since message numbers are not consecutive for iWarp,
1461 * verify there are no missing fragments */
1462 if (p_send_info->rsize == msgno + tvb_reported_length(tvb)) {
1463 end_reassembly(msgid, NULL, pinfo);
1464 new_tvb = get_reassembled_data(tvb, 0, pinfo, tree);
1468 if (new_tvb) {
1469 /* This is the last fragment, data has been reassembled
1470 * and ready to be dissected */
1471 return new_tvb;
1473 return tvb;
1477 * We need to differentiate between RPC messages inside RDMA and regular send messages.
1478 * In order to do that (as well as extra validation) we want to verify that for RDMA_MSG
1479 * and RDMA_MSGP types, RPC call or RPC reply header follows. We can do this by comparing
1480 * XID in RPC and RPCoRDMA headers.
1482 static bool
1483 packet_is_rpcordma(tvbuff_t *tvb)
1485 unsigned size, len = tvb_reported_length(tvb);
1486 uint32_t xid_rpc;
1487 uint32_t xid = tvb_get_ntohl(tvb, 0);
1488 uint32_t msg_type = tvb_get_ntohl(tvb, 12);
1489 unsigned offset;
1491 if (len < MIN_RPCRDMA_HDR_SZ)
1492 return 0;
1494 switch (msg_type) {
1495 case RDMA_MSG:
1496 if (len < MIN_RPCRDMA_MSG_SZ)
1497 return false;
1498 offset = MIN_RPCRDMA_HDR_SZ;
1499 size = get_chunk_lists_size(tvb, len, offset);
1500 if (!size)
1501 return false;
1502 offset += size;
1504 if (offset + 4 > len)
1505 return false;
1506 xid_rpc = tvb_get_ntohl(tvb, offset);
1507 if (xid != xid_rpc)
1508 return false;
1509 break;
1511 case RDMA_MSGP:
1512 if (len < MIN_RPCRDMA_MSGP_SZ)
1513 return false;
1514 offset = MIN_RPCRDMA_HDR_SZ + 8;
1515 size = get_chunk_lists_size(tvb, len, offset);
1516 if (!size)
1517 return false;
1518 offset += size;
1520 if (offset + 4 > len)
1521 return false;
1522 xid_rpc = tvb_get_ntohl(tvb, offset);
1523 if (xid != xid_rpc)
1524 return false;
1525 break;
1527 case RDMA_NOMSG:
1528 case RDMA_DONE:
1529 case RDMA_ERROR:
1530 break;
1532 default:
1533 return false;
1536 return true;
1539 static int
1540 dissect_rpcrdma(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, void* data _U_)
1542 tvbuff_t *volatile next_tvb;
1543 tvbuff_t *frag_tvb;
1544 proto_item *ti;
1545 proto_tree *rpcordma_tree;
1546 unsigned offset;
1547 uint32_t msg_type;
1548 uint32_t xid;
1549 uint32_t val;
1550 uint32_t *p_msgid;
1551 unsigned write_size;
1552 int save_visited;
1553 rdma_lists_t rdma_lists = { NULL, NULL, NULL };
1555 /* tvb_get_ntohl() should not throw an exception while checking if
1556 this is an rpcrdma packet */
1557 if (tvb_captured_length(tvb) < MIN_RPCRDMA_HDR_SZ)
1558 return 0;
1560 if (tvb_get_ntohl(tvb, 4) != 1) /* vers */
1561 return 0;
1563 xid = tvb_get_ntohl(tvb, 0);
1564 msg_type = tvb_get_ntohl(tvb, 12);
1566 col_set_str(pinfo->cinfo, COL_PROTOCOL, "RPCoRDMA");
1567 col_add_fstr(pinfo->cinfo, COL_INFO, "%s XID 0x%x",
1568 val_to_str(msg_type, rpcordma_message_type, "Unknown (%d)"), xid);
1570 ti = proto_tree_add_item(tree, proto_rpcordma, tvb, 0, MIN_RPCRDMA_HDR_SZ, ENC_NA);
1572 rpcordma_tree = proto_item_add_subtree(ti, ett_rpcordma);
1574 offset = 0;
1575 proto_tree_add_item(rpcordma_tree, hf_rpcordma_xid, tvb,
1576 offset, 4, ENC_BIG_ENDIAN);
1577 offset += 4;
1578 proto_tree_add_item(rpcordma_tree, hf_rpcordma_vers, tvb,
1579 offset, 4, ENC_BIG_ENDIAN);
1580 offset += 4;
1581 proto_tree_add_item(rpcordma_tree, hf_rpcordma_flow_control, tvb,
1582 offset, 4, ENC_BIG_ENDIAN);
1583 offset += 4;
1584 proto_tree_add_item(rpcordma_tree, hf_rpcordma_message_type, tvb,
1585 offset, 4, ENC_BIG_ENDIAN);
1586 offset += 4;
1588 switch (msg_type) {
1589 case RDMA_MSG:
1590 /* Parse rpc_rdma_header */
1591 offset = parse_rdma_header(tvb, offset, rpcordma_tree, &rdma_lists);
1593 proto_item_set_len(ti, offset);
1595 frag_tvb = get_reassembled_data(tvb, offset, pinfo, tree);
1596 if (frag_tvb) {
1597 /* Reassembled message has already been cached -- call upper dissector */
1598 return call_dissector(rpc_handler, frag_tvb, pinfo, tree);
1599 } else if (pinfo->fd->visited && !g_needs_reassembly && rdma_lists.p_read_list) {
1600 /* This frame has already been added as a read fragment */
1601 return 0;
1602 } else {
1603 next_tvb = tvb_new_subset_remaining(tvb, offset);
1606 * Get the total number of bytes for the write chunk list.
1607 * It returns 0 if there is no write chunk list, or this is an
1608 * RPC call (list has just been set up) or it is an RPC reply but
1609 * there is an error so the reply message has not been reduced.
1611 write_size = get_rdma_list_size(rdma_lists.p_write_list, pinfo);
1613 if (write_size > 0 && !pinfo->fd->visited) {
1614 /* Initialize array of write chunk offsets */
1615 gp_rdma_write_offsets = wmem_array_new(wmem_packet_scope(), sizeof(int));
1616 register_frame_end_routine(pinfo, reset_write_offsets);
1617 TRY {
1619 * Call the upper layer dissector to get a list of offsets
1620 * where message has been reduced.
1621 * This is done on the first pass (visited = 0)
1623 g_rpcrdma_reduced = true;
1624 call_dissector(rpc_handler, next_tvb, pinfo, tree);
1626 FINALLY {
1627 /* Make sure to disable reduced data processing */
1628 g_rpcrdma_reduced = false;
1630 ENDTRY;
1631 } else if (write_size > 0 && pinfo->fd->visited) {
1633 * Reassembly is done on the second pass (visited = 1)
1634 * This is done because dissecting the upper layer(s) again
1635 * causes the upper layer(s) to be displayed twice if it is
1636 * done on the same pass.
1638 p_msgid = (uint32_t *)p_get_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_MSG_ID);
1639 if (p_msgid) {
1641 * All fragments were added during the first pass,
1642 * reassembly just needs to be completed here
1644 save_visited = pinfo->fd->visited;
1645 pinfo->fd->visited = 0;
1646 end_reassembly(*p_msgid, NULL, pinfo);
1647 pinfo->fd->visited = save_visited;
1652 * If there is a write chunk list, process_rdma_lists will convert
1653 * the offsets returned by the upper layer into xdr positions
1654 * and break the current reduced message into separate fragments
1655 * and insert them into the reassembly table in the first pass.
1656 * On the second pass, the reassembly has just been done so
1657 * process_rdma_lists should only call process_reassembled_data
1658 * to get the reassembled data and call the dissector for the
1659 * upper layer with the reassembled message.
1661 frag_tvb = process_rdma_lists(next_tvb, 0, &rdma_lists, pinfo, tree);
1662 gp_rdma_write_offsets = NULL;
1663 if (rdma_lists.p_read_list) {
1665 * If there is a read chunk list, do not dissect upper layer
1666 * just label rest of packet as "Data" since the reassembly
1667 * will be done on the last read response.
1669 call_data_dissector(next_tvb, pinfo, tree);
1670 break;
1671 } else if (frag_tvb) {
1672 /* Replace current frame data with the reassembled data */
1673 next_tvb = frag_tvb;
1676 return call_dissector(rpc_handler, next_tvb, pinfo, tree);
1678 case RDMA_NOMSG:
1679 /* Parse rpc_rdma_header_nomsg */
1680 offset = parse_rdma_header(tvb, offset, rpcordma_tree, &rdma_lists);
1681 if (pinfo->fd->visited) {
1682 /* Reassembly was done on the first pass, so just get the reassembled data */
1683 next_tvb = get_reassembled_data(tvb, offset, pinfo, tree);
1684 } else {
1685 next_tvb = process_rdma_lists(tvb, offset, &rdma_lists, pinfo, tree);
1687 if (next_tvb) {
1689 * Even though there is no data in this frame, reassembly for
1690 * the reply chunk is done in this frame so dissect upper layer
1692 call_dissector(rpc_handler, next_tvb, pinfo, tree);
1694 break;
1696 case RDMA_MSGP:
1697 /* Parse rpc_rdma_header_padded */
1698 proto_tree_add_item(rpcordma_tree, hf_rpcordma_rdma_align, tvb,
1699 offset, 4, ENC_BIG_ENDIAN);
1700 offset += 4;
1702 proto_tree_add_item(rpcordma_tree, hf_rpcordma_rdma_thresh, tvb,
1703 offset, 4, ENC_BIG_ENDIAN);
1704 offset += 4;
1706 offset = parse_rdma_header(tvb, offset, rpcordma_tree, &rdma_lists);
1708 proto_item_set_len(ti, offset);
1709 next_tvb = tvb_new_subset_remaining(tvb, offset);
1710 return call_dissector(rpc_handler, next_tvb, pinfo, tree);
1712 case RDMA_DONE:
1713 break;
1715 case RDMA_ERROR:
1716 /* rpc_rdma_errcode */
1717 val = tvb_get_ntohl(tvb, offset);
1718 proto_tree_add_item(rpcordma_tree, hf_rpcordma_errcode, tvb,
1719 offset, 4, ENC_BIG_ENDIAN);
1720 offset += 4;
1722 switch (val) {
1723 case ERR_VERS:
1724 proto_tree_add_item(rpcordma_tree, hf_rpcordma_vers_low, tvb,
1725 offset, 4, ENC_BIG_ENDIAN);
1726 offset += 4;
1727 proto_tree_add_item(rpcordma_tree, hf_rpcordma_vers_high, tvb,
1728 offset, 4, ENC_BIG_ENDIAN);
1729 offset += 4;
1730 break;
1732 case ERR_CHUNK:
1733 break;
1735 default:
1736 proto_item_set_len(ti, offset);
1737 next_tvb = tvb_new_subset_remaining(tvb, offset);
1738 return call_data_dissector(next_tvb, pinfo, tree);
1740 break;
1743 proto_item_set_len(ti, offset);
1744 return offset;
1747 static bool
1748 dissect_rpcrdma_ib_heur(tvbuff_t *tvb, packet_info *pinfo,
1749 proto_tree *tree, void *data)
1751 tvbuff_t *new_tvb = NULL;
1752 bool more_frags = false;
1753 rdma_conv_info_t *p_rdma_conv_info;
1755 /* Initialize global variables for InfiniBand reassembly */
1756 g_rpcrdma_reduced = false;
1757 g_needs_reassembly = true;
1758 gp_rdmap_info = NULL;
1759 gp_infiniband_info = (struct infinibandinfo *)data;
1761 if (!gp_infiniband_info)
1762 return false;
1764 /* Get conversation state */
1765 p_rdma_conv_info = get_rdma_conv_info(pinfo);
1767 switch (gp_infiniband_info->opCode) {
1768 case RC_SEND_ONLY:
1769 case RC_SEND_ONLY_INVAL:
1770 break;
1771 case RC_SEND_FIRST:
1772 case RC_SEND_MIDDLE:
1773 add_send_fragment(p_rdma_conv_info, tvb, pinfo, tree);
1774 return false;
1775 case RC_SEND_LAST:
1776 case RC_SEND_LAST_INVAL:
1777 tvb = add_send_fragment(p_rdma_conv_info, tvb, pinfo, tree);
1778 break;
1779 case RC_RDMA_WRITE_ONLY:
1780 case RC_RDMA_WRITE_ONLY_IMM:
1781 set_max_iosize(p_rdma_conv_info, tvb_reported_length(tvb));
1782 add_ib_fragment(tvb, p_rdma_conv_info, true, pinfo, tree);
1783 return false;
1784 case RC_RDMA_WRITE_FIRST:
1785 set_max_iosize(p_rdma_conv_info, tvb_reported_length(tvb));
1786 add_request_info(p_rdma_conv_info, pinfo);
1787 /* fall through */
1788 case RC_RDMA_WRITE_MIDDLE:
1789 case RC_RDMA_WRITE_LAST:
1790 case RC_RDMA_WRITE_LAST_IMM:
1791 /* Add fragment to the reassembly table */
1792 add_ib_fragment(tvb, p_rdma_conv_info, false, pinfo, tree);
1793 /* Do not dissect here, dissection is done on RDMA_MSG or RDMA_NOMSG */
1794 return false;
1795 case RC_RDMA_READ_REQUEST:
1796 add_request_info(p_rdma_conv_info, pinfo);
1797 return false;
1798 case RC_RDMA_READ_RESPONSE_FIRST:
1799 set_max_iosize(p_rdma_conv_info, tvb_reported_length(tvb));
1800 /* fall through */
1801 case RC_RDMA_READ_RESPONSE_MIDDLE:
1802 more_frags = true;
1803 /* fall through */
1804 case RC_RDMA_READ_RESPONSE_LAST:
1805 case RC_RDMA_READ_RESPONSE_ONLY:
1806 /* Add fragment to the reassembly table */
1807 new_tvb = add_ib_fragment(tvb, p_rdma_conv_info, false, pinfo, tree);
1808 if (!new_tvb && !more_frags && p_rdma_conv_info->segment_info) {
1810 * Reassembled data has not been cached (new_tvb==NULL) yet,
1811 * so make sure reassembly is really done if more_frags==false,
1812 * (for the READ_RESPONSE_LAST or ONLY case).
1813 * Do not add any more data, just complete the reassembly
1815 end_reassembly(p_rdma_conv_info->segment_info->msgid, p_rdma_conv_info, pinfo);
1816 new_tvb = get_reassembled_data(tvb, 0, pinfo, tree);
1818 if (new_tvb) {
1819 /* This is the last fragment, data has been reassembled and ready to dissect */
1820 return call_dissector(rpc_handler, new_tvb, pinfo, tree);
1822 return false;
1823 default:
1824 return false;
1827 if (!packet_is_rpcordma(tvb))
1828 return false;
1829 dissect_rpcrdma(tvb, pinfo, tree, NULL);
1830 return true;
1833 static bool
1834 dissect_rpcrdma_iwarp_heur(tvbuff_t *tvb, packet_info *pinfo,
1835 proto_tree *tree, void *data)
1837 tvbuff_t *new_tvb;
1838 rdma_conv_info_t *p_rdma_conv_info;
1839 rdmap_request_t *p_read_request;
1841 /* Initialize global variables for iWarp reassembly */
1842 g_rpcrdma_reduced = false;
1843 g_needs_reassembly = true;
1844 gp_infiniband_info = NULL;
1845 gp_rdmap_info = (rdmap_info_t *)data;
1847 if (!gp_rdmap_info)
1848 return false;
1850 /* Get conversation state */
1851 p_rdma_conv_info = get_rdma_conv_info(pinfo);
1853 switch (gp_rdmap_info->opcode) {
1854 case RDMA_SEND:
1855 case RDMA_SEND_INVALIDATE:
1856 tvb = add_send_fragment(p_rdma_conv_info, tvb, pinfo, tree);
1857 if (!gp_rdmap_info->last_flag) {
1858 /* This is a SEND fragment, do not dissect yet */
1859 return false;
1861 break;
1862 case RDMA_WRITE:
1863 add_iwarp_fragment(tvb, p_rdma_conv_info, pinfo, tree);
1864 /* Do not dissect here, dissection is done on RDMA_MSG or RDMA_NOMSG */
1865 return false;
1866 case RDMA_READ_REQUEST:
1867 if (!pinfo->fd->visited && gp_rdmap_info->read_request) {
1868 p_read_request = wmem_new(wmem_file_scope(), rdmap_request_t);
1869 memcpy(p_read_request, gp_rdmap_info->read_request, sizeof(rdmap_request_t));
1870 wmem_tree_insert32(p_rdma_conv_info->request_list, gp_rdmap_info->read_request->sink_stag, p_read_request);
1872 return false;
1873 case RDMA_READ_RESPONSE:
1874 new_tvb = add_iwarp_fragment(tvb, p_rdma_conv_info, pinfo, tree);
1875 if (new_tvb) {
1876 /* This is the last fragment, data has been reassembled and ready to dissect */
1877 return call_dissector(rpc_handler, new_tvb, pinfo, tree);
1879 return false;
1880 default:
1881 return false;
1884 if (!packet_is_rpcordma(tvb))
1885 return false;
1887 dissect_rpcrdma(tvb, pinfo, tree, NULL);
1888 return true;
1891 void
1892 proto_register_rpcordma(void)
1894 module_t *rpcordma_module;
1895 static hf_register_info hf[] = {
1896 { &hf_rpcordma_xid,
1897 { "XID", "rpcordma.xid",
1898 FT_UINT32, BASE_HEX,
1899 NULL, 0x0, NULL, HFILL}
1901 { &hf_rpcordma_vers,
1902 { "Version", "rpcordma.version",
1903 FT_UINT32, BASE_DEC,
1904 NULL, 0x0, NULL, HFILL}
1906 { &hf_rpcordma_flow_control,
1907 { "Flow Control", "rpcordma.flow_control",
1908 FT_UINT32, BASE_DEC,
1909 NULL, 0x0, NULL, HFILL}
1911 { &hf_rpcordma_message_type,
1912 { "Message Type", "rpcordma.msg_type",
1913 FT_UINT32, BASE_DEC,
1914 VALS(rpcordma_message_type), 0x0, NULL, HFILL}
1916 { &hf_rpcordma_reads_count,
1917 { "Read list", "rpcordma.reads_count",
1918 FT_UINT32, BASE_DEC,
1919 NULL, 0, NULL, HFILL }
1921 { &hf_rpcordma_writes_count,
1922 { "Write list", "rpcordma.writes_count",
1923 FT_UINT32, BASE_DEC,
1924 NULL, 0, NULL, HFILL }
1926 { &hf_rpcordma_reply_count,
1927 { "Reply list", "rpcordma.reply_count",
1928 FT_UINT32, BASE_DEC,
1929 NULL, 0, NULL, HFILL }
1931 { &hf_rpcordma_rdma_handle,
1932 { "RDMA handle", "rpcordma.rdma_handle",
1933 FT_UINT32, BASE_HEX,
1934 NULL, 0, NULL, HFILL }
1936 { &hf_rpcordma_rdma_length,
1937 { "RDMA length", "rpcordma.rdma_length",
1938 FT_UINT32, BASE_DEC,
1939 NULL, 0, NULL, HFILL }
1941 { &hf_rpcordma_rdma_offset,
1942 { "RDMA offset", "rpcordma.rdma_offset",
1943 FT_UINT64, BASE_HEX,
1944 NULL, 0, NULL, HFILL }
1946 { &hf_rpcordma_position,
1947 { "Position in XDR", "rpcordma.position",
1948 FT_UINT32, BASE_DEC,
1949 NULL, 0, NULL, HFILL }
1951 { &hf_rpcordma_segment_count,
1952 { "Write chunk segment count", "rpcordma.segment_count",
1953 FT_UINT32, BASE_DEC,
1954 NULL, 0, NULL, HFILL }
1956 { &hf_rpcordma_rdma_align,
1957 { "RDMA align", "rpcordma.rdma_align",
1958 FT_UINT32, BASE_DEC,
1959 NULL, 0, NULL, HFILL }
1961 { &hf_rpcordma_rdma_thresh,
1962 { "RDMA threshold", "rpcordma.rdma_thresh",
1963 FT_UINT32, BASE_DEC,
1964 NULL, 0, NULL, HFILL }
1966 { &hf_rpcordma_errcode,
1967 { "Error code", "rpcordma.errcode",
1968 FT_UINT32, BASE_DEC,
1969 VALS(rpcordma_err), 0, NULL, HFILL }
1971 { &hf_rpcordma_vers_low,
1972 { "Version low", "rpcordma.vers_low",
1973 FT_UINT32, BASE_DEC,
1974 NULL, 0, NULL, HFILL }
1976 { &hf_rpcordma_vers_high,
1977 { "Version high", "rpcordma.vers_high",
1978 FT_UINT32, BASE_DEC,
1979 NULL, 0, NULL, HFILL }
1981 /* Fragment entries */
1982 { &hf_rpcordma_fragments,
1983 { "Reassembled RPCoRDMA fragments", "rpcordma.fragments",
1984 FT_NONE, BASE_NONE, NULL, 0x00, NULL, HFILL}
1986 { &hf_rpcordma_fragment,
1987 { "RPCoRDMA fragment", "rpcordma.fragment",
1988 FT_FRAMENUM, BASE_NONE, NULL, 0x00, NULL, HFILL}
1990 { &hf_rpcordma_fragment_overlap,
1991 { "Fragment overlap", "rpcordma.fragment.overlap",
1992 FT_BOOLEAN, BASE_NONE, NULL, 0x00, NULL, HFILL}
1994 { &hf_rpcordma_fragment_overlap_conflicts,
1995 { "Fragment overlapping with conflicting data", "rpcordma.fragment.overlap.conflicts",
1996 FT_BOOLEAN, BASE_NONE, NULL, 0x00, NULL, HFILL}
1998 { &hf_rpcordma_fragment_multiple_tails,
1999 { "Multiple tail fragments found", "rpcordma.fragment.multiple_tails",
2000 FT_BOOLEAN, BASE_NONE, NULL, 0x00, NULL, HFILL}
2002 { &hf_rpcordma_fragment_too_long_fragment,
2003 { "Fragment too long", "rpcordma.fragment.too_long_fragment",
2004 FT_BOOLEAN, BASE_NONE, NULL, 0x00, NULL, HFILL}
2006 { &hf_rpcordma_fragment_error,
2007 { "Defragmentation error", "rpcordma.fragment.error",
2008 FT_FRAMENUM, BASE_NONE, NULL, 0x00, NULL, HFILL}
2010 { &hf_rpcordma_fragment_count,
2011 { "Fragment count", "rpcordma.fragment.count",
2012 FT_UINT32, BASE_DEC, NULL, 0x00, NULL, HFILL}
2014 { &hf_rpcordma_reassembled_in,
2015 { "Reassembled PDU in frame", "rpcordma.reassembled.in",
2016 FT_FRAMENUM, BASE_NONE, NULL, 0x00, NULL, HFILL}
2018 { &hf_rpcordma_reassembled_length,
2019 { "Reassembled RPCoRDMA length", "rpcordma.reassembled.length",
2020 FT_UINT32, BASE_DEC, NULL, 0x00, NULL, HFILL}
2022 { &hf_rpcordma_reassembled_data,
2023 { "Reassembled RPCoRDMA data", "rpcordma.reassembled.data",
2024 FT_BYTES, BASE_NONE, NULL, 0, NULL, HFILL }
2028 static int *ett[] = {
2029 &ett_rpcordma,
2030 &ett_rpcordma_chunk,
2031 &ett_rpcordma_read_list,
2032 &ett_rpcordma_read_chunk,
2033 &ett_rpcordma_write_list,
2034 &ett_rpcordma_write_chunk,
2035 &ett_rpcordma_reply_chunk,
2036 &ett_rpcordma_segment,
2037 &ett_rpcordma_fragment,
2038 &ett_rpcordma_fragments,
2041 proto_rpcordma = proto_register_protocol (
2042 "RPC over RDMA", /* name */
2043 "RPCoRDMA", /* short name */
2044 "rpcordma" /* abbrev */
2047 proto_register_field_array(proto_rpcordma, hf, array_length(hf));
2048 proto_register_subtree_array(ett, array_length(ett));
2049 reassembly_table_register(&rpcordma_reassembly_table, &addresses_ports_reassembly_table_functions);
2051 /* Register dissector handle */
2052 rpcordma_handle = register_dissector("rpcordma", dissect_rpcrdma, proto_rpcordma);
2054 /* Register preferences */
2055 rpcordma_module = prefs_register_protocol_obsolete(proto_rpcordma);
2057 prefs_register_obsolete_preference(rpcordma_module, "manual_en");
2058 prefs_register_obsolete_preference(rpcordma_module, "addr_a");
2059 prefs_register_obsolete_preference(rpcordma_module, "addr_a_type");
2060 prefs_register_obsolete_preference(rpcordma_module, "addr_a_id");
2061 prefs_register_obsolete_preference(rpcordma_module, "addr_a_qp");
2062 prefs_register_obsolete_preference(rpcordma_module, "addr_b");
2063 prefs_register_obsolete_preference(rpcordma_module, "addr_b_type");
2064 prefs_register_obsolete_preference(rpcordma_module, "addr_b_id");
2065 prefs_register_obsolete_preference(rpcordma_module, "addr_b_qp");
2066 prefs_register_obsolete_preference(rpcordma_module, "target_ports");
2069 void
2070 proto_reg_handoff_rpcordma(void)
2072 heur_dissector_add("infiniband.payload", dissect_rpcrdma_ib_heur, "RPC-over-RDMA on Infiniband",
2073 "rpcrdma_infiniband", proto_rpcordma, HEURISTIC_ENABLE);
2074 dissector_add_for_decode_as("infiniband", rpcordma_handle);
2076 heur_dissector_add("iwarp_ddp_rdmap", dissect_rpcrdma_iwarp_heur, "RPC-over-RDMA on iWARP",
2077 "rpcrdma_iwarp", proto_rpcordma, HEURISTIC_ENABLE);
2079 rpc_handler = find_dissector_add_dependency("rpc", proto_rpcordma);
/*
 * Editor modelines - https://www.wireshark.org/tools/modelines.html
 *
 * Local variables:
 * c-basic-offset: 4
 * tab-width: 8
 * indent-tabs-mode: nil
 * End:
 *
 * vi: set shiftwidth=4 tabstop=8 expandtab:
 * :indentSize=4:tabSize=8:noTabs=true:
 */