/*
 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef IPATH_VERBS_H
#define IPATH_VERBS_H

#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <rdma/ib_pack.h>

#include "ipath_layer.h"
#include "verbs_debug.h"
#define QPN_MAX                 (1 << 24)
#define QPNMAP_ENTRIES          (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
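/*
 * (With a common 4 KB PAGE_SIZE and BITS_PER_BYTE == 8, each map page
 * tracks 4096 * 8 = 32768 QPNs, so QPNMAP_ENTRIES works out to
 * (1 << 24) / 4096 / 8 = 512 bitmap pages.)
 */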
/*
 * Increment this value if any changes that break userspace ABI
 * compatibility are made.
 */
#define IPATH_UVERBS_ABI_VERSION       1
/*
 * Define an ib_cq_notify value that is not valid so we know when CQ
 * notifications are armed.
 */
#define IB_CQ_NONE	(IB_CQ_NEXT_COMP + 1)
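/*
 * Usage sketch (illustration only; the exact test is hypothetical): a
 * CQ's notify state can then live in a plain field (see ipath_cq.notify
 * below), with IB_CQ_NONE meaning "not armed":
 *
 *	if (cq->notify == IB_CQ_NEXT_COMP ||
 *	    (cq->notify == IB_CQ_SOLICITED && solicited))
 *		... generate a completion event ...
 */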
#define IB_RNR_NAK			0x20
#define IB_NAK_PSN_ERROR		0x60
#define IB_NAK_INVALID_REQUEST		0x61
#define IB_NAK_REMOTE_ACCESS_ERROR	0x62
#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
#define IB_NAK_INVALID_RD_REQUEST	0x64
#define IPATH_POST_SEND_OK		0x01
#define IPATH_POST_RECV_OK		0x02
#define IPATH_PROCESS_RECV_OK		0x04
#define IPATH_PROCESS_SEND_OK		0x08
/* IB Performance Manager status values */
#define IB_PMA_SAMPLE_STATUS_DONE	0x00
#define IB_PMA_SAMPLE_STATUS_STARTED	0x01
#define IB_PMA_SAMPLE_STATUS_RUNNING	0x02
/* Mandatory IB performance counter select values. */
#define IB_PMA_PORT_XMIT_DATA	__constant_htons(0x0001)
#define IB_PMA_PORT_RCV_DATA	__constant_htons(0x0002)
#define IB_PMA_PORT_XMIT_PKTS	__constant_htons(0x0003)
#define IB_PMA_PORT_RCV_PKTS	__constant_htons(0x0004)
#define IB_PMA_PORT_XMIT_WAIT	__constant_htons(0x0005)
struct ib_reth {
	__be64 vaddr;
	__be32 rkey;
	__be32 length;
} __attribute__ ((packed));
struct ib_atomic_eth {
	__be64 vaddr;
	__be32 rkey;
	__be64 swap_data;
	__be64 compare_data;
} __attribute__ ((packed));
struct ipath_other_headers {
	__be32 bth[3];
	union {
		struct {
			__be32 deth[2];
			__be32 imm_data;
		} ud;
		struct {
			struct ib_reth reth;
			__be32 imm_data;
		} rc;
		struct {
			__be32 aeth;
			__be64 atomic_ack_eth;
		} at;
		__be32 imm_data;
		__be32 aeth;
		struct ib_atomic_eth atomic_eth;
	} u;
} __attribute__ ((packed));
/*
 * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
 * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
 * will be in the eager header buffer.  The remaining 12 or 16 bytes
 * are in the data buffer.
 */
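/*
 * (For reference, 8+40+12+8 matches the standard InfiniBand header
 * sizes: LRH 8 bytes + GRH 40 bytes + BTH 12 bytes + DETH 8 bytes =
 * 68 bytes, plus 4 more if immediate data is present.)
 */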
struct ipath_ib_header {
	__be16 lrh[4];
	union {
		struct {
			struct ib_grh grh;
			struct ipath_other_headers oth;
		} l;
		struct ipath_other_headers oth;
	} u;
} __attribute__ ((packed));
/*
 * There is one struct ipath_mcast for each multicast GID.
 * All attached QPs are then stored as a list of
 * struct ipath_mcast_qp.
 */
struct ipath_mcast_qp {
	struct list_head list;
	struct ipath_qp *qp;
};

struct ipath_mcast {
	struct rb_node rb_node;
	union ib_gid mgid;
	struct list_head qp_list;
	wait_queue_head_t wait;
	atomic_t refcount;
};
/* Memory region */
struct ipath_mr {
	struct ib_mr ibmr;
	struct ipath_mregion mr;	/* must be last */
};

/* Fast memory region */
struct ipath_fmr {
	struct ib_fmr ibfmr;
	u8 page_shift;
	struct ipath_mregion mr;	/* must be last */
};

/* Protection domain */
struct ipath_pd {
	struct ib_pd ibpd;
	int user;		/* non-zero if created from user space */
};

/* Address Handle */
struct ipath_ah {
	struct ib_ah ibah;
	struct ib_ah_attr attr;
};
/*
 * Quick description of our CQ/QP locking scheme:
 *
 * We have one global lock that protects dev->cq/qp_table.  Each
 * struct ipath_cq/qp also has its own lock.  An individual qp lock
 * may be taken inside of an individual cq lock.  Both cqs attached to
 * a qp may be locked, with the send cq locked first.  No other
 * nesting should be done.
 *
 * Each struct ipath_cq/qp also has an atomic_t ref count.  The
 * pointer from the cq/qp_table to the struct counts as one reference.
 * This reference also is good for access through the consumer API, so
 * modifying the CQ/QP etc doesn't need to take another reference.
 * Access because of a completion being polled does need a reference.
 *
 * Finally, each struct ipath_cq/qp has a wait_queue_head_t for the
 * destroy function to sleep on.
 *
 * This means that access from the consumer API requires nothing but
 * taking the struct's lock.
 *
 * Access because of a completion event should go as follows:
 * - lock cq/qp_table and look up struct
 * - increment ref count in struct
 * - drop cq/qp_table lock
 * - lock struct, do your thing, and unlock struct
 * - decrement ref count; if zero, wake up waiters
 *
 * To destroy a CQ/QP, we can do the following:
 * - lock cq/qp_table, remove pointer, unlock cq/qp_table lock
 * - decrement ref count
 * - wait_event until ref count is zero
 *
 * It is the consumer's responsibility to make sure that no QP
 * operations (WQE posting or state modification) are pending when the
 * QP is destroyed.  Also, the consumer must make sure that calls to
 * qp_modify are serialized.
 *
 * Possible optimizations (wait for profile data to see if/where we
 * have locks bouncing between CPUs):
 * - split cq/qp table lock into n separate (cache-aligned) locks,
 *   indexed (say) by the page in the table
 */
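/*
 * The completion-event steps above as a minimal sketch in code.  This
 * is illustration only (hence the #if 0): example_poll_event() is a
 * hypothetical caller, and it assumes ipath_lookup_qpn() takes the
 * qp_table lock and increments qp->refcount before returning.
 */
#if 0
static void example_poll_event(struct ipath_qp_table *qpt, u32 qpn)
{
	struct ipath_qp *qp;
	unsigned long flags;

	/* lock cq/qp_table, look up struct, increment ref, drop lock */
	qp = ipath_lookup_qpn(qpt, qpn);
	if (!qp)
		return;

	/* lock struct, do your thing, and unlock struct */
	spin_lock_irqsave(&qp->s_lock, flags);
	/* ... process the completion ... */
	spin_unlock_irqrestore(&qp->s_lock, flags);

	/* decrement ref count; if zero, wake up waiters */
	if (atomic_dec_and_test(&qp->refcount))
		wake_up(&qp->wait);
}
#endif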
struct ipath_cq {
	struct ib_cq ibcq;
	struct tasklet_struct comptask;
	spinlock_t lock;
	u8 notify;
	u8 triggered;
	u32 head;		/* new records added to the head */
	u32 tail;		/* poll_cq() reads from here. */
	struct ib_wc *queue;	/* this is actually ibcq.cqe + 1 */
};
/*
 * Send work request queue entry.
 * The size of the sg_list is determined when the QP is created and stored
 * in qp->s_max_sge.
 */
struct ipath_swqe {
	struct ib_send_wr wr;	/* don't use wr.sg_list */
	u32 psn;		/* first packet sequence number */
	u32 lpsn;		/* last packet sequence number */
	u32 ssn;		/* send sequence number */
	u32 length;		/* total length of data in sg_list */
	struct ipath_sge sg_list[0];
};
/*
 * Receive work request queue entry.
 * The size of the sg_list is determined when the QP is created and stored
 * in qp->r_rq.max_sge.
 */
struct ipath_rwqe {
	u64 wr_id;
	u8 num_sge;
	u32 length;		/* total length of data in sg_list */
	struct ipath_sge sg_list[0];
};
struct ipath_rq {
	spinlock_t lock;
	u32 head;		/* new work requests posted to the head */
	u32 tail;		/* receives pull requests from here. */
	u32 size;		/* size of RWQE array */
	u8 max_sge;
	struct ipath_rwqe *wq;	/* RWQE array */
};
struct ipath_srq {
	struct ib_srq ibsrq;
	struct ipath_rq rq;
	/* send signal when number of RWQEs < limit */
	u32 limit;
};
/*
 * Variables prefixed with s_ are for the requester (sender).
 * Variables prefixed with r_ are for the responder (receiver).
 * Variables prefixed with ack_ are for responder replies.
 *
 * Common variables are protected by both r_rq.lock and s_lock in that
 * order, which only happens in modify_qp() or changing the QP 'state'.
 */
struct ipath_qp {
	struct ib_qp ibqp;
	struct ipath_qp *next;		/* link list for QPN hash table */
	struct ipath_qp *timer_next;	/* link list for ipath_ib_timer() */
	struct list_head piowait;	/* link for wait PIO buf */
	struct list_head timerwait;	/* link for waiting for timeouts */
	struct ib_ah_attr remote_ah_attr;
	struct ipath_ib_header s_hdr;	/* next packet header to send */
	atomic_t refcount;
	wait_queue_head_t wait;
	struct tasklet_struct s_task;
	struct ipath_sge_state *s_cur_sge;
	struct ipath_sge_state s_sge;	/* current send request data */
	/* current RDMA read send data */
	struct ipath_sge_state s_rdma_sge;
	struct ipath_sge_state r_sge;	/* current receive data */
	spinlock_t s_lock;
	unsigned long s_flags;
	u32 s_hdrwords;		/* size of s_hdr in 32 bit words */
	u32 s_cur_size;		/* size of send packet in bytes */
	u32 s_len;		/* total length of s_sge */
	u32 s_rdma_len;		/* total length of s_rdma_sge */
	u32 s_next_psn;		/* PSN for next request */
	u32 s_last_psn;		/* last response PSN processed */
	u32 s_psn;		/* current packet sequence number */
	u32 s_rnr_timeout;	/* number of milliseconds for RNR timeout */
	u32 s_ack_psn;		/* PSN for next ACK or RDMA_READ */
	u64 s_ack_atomic;	/* data for atomic ACK */
	u64 r_wr_id;		/* ID for current receive WQE */
	u64 r_atomic_data;	/* data for last atomic op */
	u32 r_atomic_psn;	/* PSN of last atomic op */
	u32 r_len;		/* total length of r_sge */
	u32 r_rcv_len;		/* receive data len processed */
	u32 r_psn;		/* expected rcv packet sequence number */
	u8 state;		/* QP state */
	u8 s_state;		/* opcode of last packet sent */
	u8 s_ack_state;		/* opcode of packet to ACK */
	u8 s_nak_state;		/* non-zero if NAK is pending */
	u8 r_state;		/* opcode of last packet received */
	u8 r_reuse_sge;		/* for UC receive errors */
	u8 r_sge_inx;		/* current index into sg_list */
	u8 s_max_sge;		/* size of s_wq->sg_list */
	u8 s_retry_cnt;		/* number of times to retry */
	u8 s_retry;		/* requester retry counter */
	u8 s_rnr_retry;		/* requester RNR retry counter */
	u8 s_pkey_index;	/* PKEY index to use */
	enum ib_mtu path_mtu;
	atomic_t msn;		/* message sequence number */
	u32 qkey;		/* QKEY for this QP (for UD or RD) */
	u32 s_size;		/* send work queue size */
	u32 s_head;		/* new entries added here */
	u32 s_tail;		/* next entry to process */
	u32 s_cur;		/* current work queue entry */
	u32 s_last;		/* last un-ACK'ed entry */
	u32 s_ssn;		/* SSN of tail entry */
	u32 s_lsn;		/* limit sequence number (credit) */
	struct ipath_swqe *s_wq;	/* send work queue */
	struct ipath_rq r_rq;		/* receive work queue */
};
/*
 * Bit definitions for s_flags.
 */
#define IPATH_S_BUSY		0
#define IPATH_S_SIGNAL_REQ_WR	1
/*
 * Since struct ipath_swqe is not a fixed size, we can't simply index into
 * struct ipath_qp.s_wq.  This function does the array index computation.
 */
static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp,
					      unsigned n)
{
	return (struct ipath_swqe *)((char *)qp->s_wq +
				     (sizeof(struct ipath_swqe) +
				      qp->s_max_sge *
				      sizeof(struct ipath_sge)) * n);
}
/*
 * Since struct ipath_rwqe is not a fixed size, we can't simply index into
 * struct ipath_rq.wq.  This function does the array index computation.
 */
static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq,
					      unsigned n)
{
	return (struct ipath_rwqe *)
		((char *) rq->wq +
		 (sizeof(struct ipath_rwqe) +
		  rq->max_sge * sizeof(struct ipath_sge)) * n);
}
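/*
 * For example (sizes assumed for illustration): with rq->max_sge == 4,
 * each slot of rq->wq occupies sizeof(struct ipath_rwqe) plus
 * 4 * sizeof(struct ipath_sge) bytes, and get_rwqe_ptr(rq, n) steps n
 * such variable-sized slots past rq->wq.  get_swqe_ptr() above works
 * the same way over qp->s_wq using qp->s_max_sge.
 */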
/*
 * QPN-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated.  This way,
 * large bitmaps are not allocated unless large numbers of QPs are used.
 */
struct qpn_map {
	atomic_t n_free;
	void *page;
};
struct ipath_qp_table {
	spinlock_t lock;
	u32 last;		/* last QP number allocated */
	u32 max;		/* size of the hash table */
	u32 nmaps;		/* size of the map table */
	struct ipath_qp **table;
	/* bit map of free numbers */
	struct qpn_map map[QPNMAP_ENTRIES];
};
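/*
 * A sketch of the lazy allocation described above, for illustration
 * only (hence the #if 0): the helper name is hypothetical and the use
 * of qpt->lock to serialize installation is an assumption.
 */
#if 0
static void example_get_map_page(struct ipath_qp_table *qpt,
				 struct qpn_map *map)
{
	unsigned long page = get_zeroed_page(GFP_KERNEL);

	if (!page)
		return;
	/* free our page if someone else raced us to install one */
	spin_lock(&qpt->lock);
	if (map->page)
		free_page(page);
	else
		map->page = (void *) page;
	spin_unlock(&qpt->lock);
}
#endif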
struct ipath_lkey_table {
	spinlock_t lock;
	u32 next;		/* next unused index (speeds search) */
	u32 gen;		/* generation count */
	u32 max;		/* size of the table */
	struct ipath_mregion **table;
};
struct ipath_opcode_stats {
	u64 n_packets;		/* number of packets */
	u64 n_bytes;		/* total number of bytes */
};
struct ipath_ibdev {
	struct ib_device ibdev;
	struct list_head dev_list;
	struct ipath_devdata *dd;
	int ib_unit;		/* This is the device number */
	u16 sm_lid;		/* in host order */
	u8 mkeyprot_resv_lmc;
	/* non-zero when timer is set */
	unsigned long mkey_lease_timeout;

	/* The following fields are really per port. */
	struct ipath_qp_table qp_table;
	struct ipath_lkey_table lk_table;
	struct list_head pending[3];	/* FIFO of QPs waiting for ACKs */
	struct list_head piowait;	/* list for wait PIO buf */
	/* list of QPs waiting for RNR timer */
	struct list_head rnrwait;
	spinlock_t pending_lock;
	__be64 sys_image_guid;	/* in network order */
	__be64 gid_prefix;	/* in network order */

	u64 ipath_sword;	/* total dwords sent (sample result) */
	u64 ipath_rword;	/* total dwords received (sample result) */
	u64 ipath_spkts;	/* total packets sent (sample result) */
	u64 ipath_rpkts;	/* total packets received (sample result) */
	/* # of ticks no data sent (sample result) */
	u64 ipath_xmit_wait;
	u64 rcv_errors;		/* # of packets with SW detected rcv errs */
	u64 n_unicast_xmit;	/* total unicast packets sent */
	u64 n_unicast_rcv;	/* total unicast packets received */
	u64 n_multicast_xmit;	/* total multicast packets sent */
	u64 n_multicast_rcv;	/* total multicast packets received */
	u64 n_symbol_error_counter;	/* starting count for PMA */
	u64 n_link_error_recovery_counter;	/* starting count for PMA */
	u64 n_link_downed_counter;	/* starting count for PMA */
	u64 n_port_rcv_errors;	/* starting count for PMA */
	u64 n_port_rcv_remphys_errors;	/* starting count for PMA */
	u64 n_port_xmit_discards;	/* starting count for PMA */
	u64 n_port_xmit_data;	/* starting count for PMA */
	u64 n_port_rcv_data;	/* starting count for PMA */
	u64 n_port_xmit_packets;	/* starting count for PMA */
	u64 n_port_rcv_packets;	/* starting count for PMA */
	u32 n_pkey_violations;	/* starting count for PMA */
	u32 pma_sample_start;
	u32 pma_sample_interval;
	__be16 pma_counter_select[5];
	u16 mkey_lease_period;
	u16 pending_index;	/* which pending queue is active */
	u8 pma_sample_status;
	u8 link_width_enabled;
	struct ipath_opcode_stats opstats[128];
};
struct ipath_ucontext {
	struct ib_ucontext ibucontext;
};
static inline struct ipath_mr *to_imr(struct ib_mr *ibmr)
{
	return container_of(ibmr, struct ipath_mr, ibmr);
}

static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr)
{
	return container_of(ibfmr, struct ipath_fmr, ibfmr);
}

static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd)
{
	return container_of(ibpd, struct ipath_pd, ibpd);
}

static inline struct ipath_ah *to_iah(struct ib_ah *ibah)
{
	return container_of(ibah, struct ipath_ah, ibah);
}

static inline struct ipath_cq *to_icq(struct ib_cq *ibcq)
{
	return container_of(ibcq, struct ipath_cq, ibcq);
}

static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq)
{
	return container_of(ibsrq, struct ipath_srq, ibsrq);
}

static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp)
{
	return container_of(ibqp, struct ipath_qp, ibqp);
}

static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev)
{
	return container_of(ibdev, struct ipath_ibdev, ibdev);
}
int ipath_process_mad(struct ib_device *ibdev,
		      int mad_flags,
		      u8 port_num,
		      struct ib_wc *in_wc,
		      struct ib_grh *in_grh,
		      struct ib_mad *in_mad, struct ib_mad *out_mad);
static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
						  *ibucontext)
{
	return container_of(ibucontext, struct ipath_ucontext, ibucontext);
}
/*
 * Compare the lower 24 bits of the two values.
 * Returns an integer <, ==, or > than zero.
 */
static inline int ipath_cmp24(u32 a, u32 b)
{
	return (((int) a) - ((int) b)) << 8;
}
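/*
 * The shift by 8 discards the upper 8 bits of the difference, so the
 * sign of the result is bit 23 of (a - b) and the comparison wraps
 * modulo 2^24 (this relies on the two's-complement wraparound the
 * kernel builds with).  For example, ipath_cmp24(0, 0xffffff) > 0:
 * PSN 0 is "one ahead of" PSN 0xffffff.
 */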
struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid);

int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);

int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);

int ipath_mcast_tree_empty(void);
__be32 ipath_compute_aeth(struct ipath_qp *qp);

struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn);
struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
			      struct ib_qp_init_attr *init_attr,
			      struct ib_udata *udata);

int ipath_destroy_qp(struct ib_qp *ibqp);
int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
		    int attr_mask);
int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
		   int attr_mask, struct ib_qp_init_attr *init_attr);
void ipath_free_all_qps(struct ipath_qp_table *qpt);

int ipath_init_qp_table(struct ipath_ibdev *idev, int size);

void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc);

void ipath_get_credit(struct ipath_qp *qp, u32 aeth);

void ipath_do_rc_send(unsigned long data);

void ipath_do_uc_send(unsigned long data);

void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss,
		  u32 len, u64 vaddr, u32 rkey, int acc);

int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge,
		  struct ib_sge *sge, int acc);

void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);

void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);

int ipath_post_rc_send(struct ipath_qp *qp, struct ib_send_wr *wr);
void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);

void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);

void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc);

int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr);

void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);

int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
		     struct ipath_mregion *mr);

void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey);
int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
			   struct ib_recv_wr **bad_wr);

struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
				struct ib_srq_init_attr *srq_init_attr,
				struct ib_udata *udata);

int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
		     enum ib_srq_attr_mask attr_mask);

int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);

int ipath_destroy_srq(struct ib_srq *ibsrq);
int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);

struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries,
			      struct ib_ucontext *context,
			      struct ib_udata *udata);

int ipath_destroy_cq(struct ib_cq *ibcq);

int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify);

int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc);

struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
				struct ib_phys_buf *buffer_list,
				int num_phys_buf, int acc, u64 *iova_start);
struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
				int mr_access_flags,
				struct ib_udata *udata);

int ipath_dereg_mr(struct ib_mr *ibmr);
struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
			       struct ib_fmr_attr *fmr_attr);

int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
		       int list_len, u64 iova);

int ipath_unmap_fmr(struct list_head *fmr_list);

int ipath_dealloc_fmr(struct ib_fmr *ibfmr);
void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev);

void ipath_insert_rnr_queue(struct ipath_qp *qp);

int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only);

void ipath_ruc_loopback(struct ipath_qp *sqp, struct ib_wc *wc);
extern const enum ib_wc_opcode ib_ipath_wc_opcode[];

extern const u8 ipath_cvt_physportstate[];

extern const int ib_ipath_state_ops[];

extern unsigned int ib_ipath_lkey_table_size;

extern const u32 ib_ipath_rnr_table[];
#endif				/* IPATH_VERBS_H */