2 * Copyright (c) 2006 Intel Corporation. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 #include <linux/completion.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/err.h>
36 #include <linux/interrupt.h>
37 #include <linux/bitops.h>
38 #include <linux/random.h>
40 #include <rdma/ib_cache.h>
/*
 * Forward declarations of the two ib_client callbacks that are defined
 * further down in this file.
 */
43 static void mcast_add_one(struct ib_device
*device
);
44 static void mcast_remove_one(struct ib_device
*device
);
/*
 * Client descriptor handed to ib_register_client() during module setup
 * and used as the key for ib_get_client_data()/ib_set_client_data().
 * NOTE(review): only the .name and .remove initializers are visible in
 * this listing.
 */
46 static struct ib_client mcast_client
= {
47 .name
= "ib_multicast",
49 .remove
= mcast_remove_one
/* SA client handle passed to the ib_sa_mcmember_rec_query() calls below. */
52 static struct ib_sa_client sa_client
;
/* Workqueue on which each group's work item is queued. */
53 static struct workqueue_struct
*mcast_wq
;
/*
 * Reference GID that join_handler() and acquire_group() memcmp() an MGID
 * against.  It has no initializer, so as a static object it is all zeroes.
 */
54 static union ib_gid mgid0
;
/*
 * Member declarations of the per-port and per-device bookkeeping
 * structures.
 * NOTE(review): the enclosing struct headers and several members are not
 * visible in this listing.  Judging by the accesses elsewhere in this
 * file, `dev` and `comp` are reached through struct mcast_port pointers
 * (port->dev, port->comp), while `device`, `event_handler` and `port` are
 * reached through struct mcast_device pointers (dev->device,
 * dev->event_handler, dev->port[i]).
 */
59 struct mcast_device
*dev
;
63 struct completion comp
;
68 struct ib_device
*device
;
69 struct ib_event_handler event_handler
;
/* Trailing array; mcast_add_one() sizes the allocation to hold it. */
72 struct mcast_port port
[0];
/*
 * Member declarations that the rest of this file reaches through
 * struct mcast_group pointers (group->rec, group->port, group->work, ...).
 * NOTE(review): the struct header and some members (for example the ones
 * used as group->node, group->lock, group->refcount and group->members)
 * are not visible in this listing.
 */
86 struct ib_sa_mcmember_rec rec
;
88 struct mcast_port
*port
;
90 struct work_struct work
;
/* Members queued by queue_join() that have not been processed yet. */
91 struct list_head pending_list
;
/* Members moved here by join_group(). */
92 struct list_head active_list
;
/* Member recorded by send_join() when it issues a join query. */
93 struct mcast_member
*last_join
;
96 enum mcast_state state
;
97 struct ib_sa_query
*query
;
/*
 * One user's membership request for a multicast group.  The embedded
 * `multicast` member is what callers of this file hold; the code recovers
 * the containing mcast_member from it with container_of().
 * NOTE(review): at least one member (the one used as member->refcount) and
 * the closing brace are not visible in this listing.
 */
101 struct mcast_member
{
102 struct ib_sa_multicast multicast
;
103 struct ib_sa_client
*client
;
104 struct mcast_group
*group
;
/* Links the member into its group's pending_list or active_list. */
105 struct list_head list
;
106 enum mcast_state state
;
/* Signalled by deref_member() once the reference count drops to zero. */
108 struct completion comp
;
/*
 * Forward declarations of the two SA query completion callbacks; they are
 * referenced by send_join() and send_leave() before their definitions.
 * NOTE(review): the tail of each parameter list is not visible in this
 * listing.
 */
111 static void join_handler(int status
, struct ib_sa_mcmember_rec
*rec
,
113 static void leave_handler(int status
, struct ib_sa_mcmember_rec
*rec
,
/*
 * Look a group up in the red-black tree rooted at port->table.
 *
 * Each visited node is converted to its containing struct mcast_group and
 * the raw bytes of the searched MGID are memcmp()'d against
 * group->rec.mgid.raw; the walk then continues with the node's left or
 * right child.  The callers visible in this file invoke it with
 * port->lock held.
 * NOTE(review): the loop header, the tests on `ret` and the return
 * statements are not visible in this listing.
 */
116 static struct mcast_group
*mcast_find(struct mcast_port
*port
,
119 struct rb_node
*node
= port
->table
.rb_node
;
120 struct mcast_group
*group
;
124 group
= rb_entry(node
, struct mcast_group
, node
);
125 ret
= memcmp(mgid
->raw
, group
->rec
.mgid
.raw
, sizeof *mgid
);
130 node
= node
->rb_left
;
132 node
= node
->rb_right
;
/*
 * Link @group into the red-black tree rooted at port->table.
 *
 * The descent compares group->rec.mgid.raw with the MGID of each group
 * already in the tree and follows the left or right link; the
 * `else if (allow_duplicates)` branch also descends to the left.  Once the
 * insertion slot is found the node is linked with rb_link_node() and the
 * tree is rebalanced with rb_insert_color().
 * NOTE(review): the conditions selecting left/right, the handling of an
 * equal key when @allow_duplicates is zero, and the return statements are
 * not visible in this listing.
 */
137 static struct mcast_group
*mcast_insert(struct mcast_port
*port
,
138 struct mcast_group
*group
,
139 int allow_duplicates
)
141 struct rb_node
**link
= &port
->table
.rb_node
;
142 struct rb_node
*parent
= NULL
;
143 struct mcast_group
*cur_group
;
148 cur_group
= rb_entry(parent
, struct mcast_group
, node
);
150 ret
= memcmp(group
->rec
.mgid
.raw
, cur_group
->rec
.mgid
.raw
,
151 sizeof group
->rec
.mgid
);
153 link
= &(*link
)->rb_left
;
155 link
= &(*link
)->rb_right
;
156 else if (allow_duplicates
)
157 link
= &(*link
)->rb_left
;
161 rb_link_node(&group
->node
, parent
, link
);
162 rb_insert_color(&group
->node
, &port
->table
);
166 static void deref_port(struct mcast_port
*port
)
168 if (atomic_dec_and_test(&port
->refcount
))
169 complete(&port
->comp
);
/*
 * Drop one reference on @group.
 *
 * The decrement is done with port->lock held (interrupt state saved in
 * `flags`).  If this was the last reference the group is erased from
 * port->table before the lock is released.
 * NOTE(review): the declaration of `flags` and the statements between the
 * two spin_unlock_irqrestore() calls (what happens to the group after it
 * is unlinked, and the else branch) are not visible in this listing.
 */
172 static void release_group(struct mcast_group
*group
)
174 struct mcast_port
*port
= group
->port
;
177 spin_lock_irqsave(&port
->lock
, flags
);
178 if (atomic_dec_and_test(&group
->refcount
)) {
179 rb_erase(&group
->node
, &port
->table
);
180 spin_unlock_irqrestore(&port
->lock
, flags
);
184 spin_unlock_irqrestore(&port
->lock
, flags
);
187 static void deref_member(struct mcast_member
*member
)
189 if (atomic_dec_and_test(&member
->refcount
))
190 complete(&member
->comp
);
193 static void queue_join(struct mcast_member
*member
)
195 struct mcast_group
*group
= member
->group
;
198 spin_lock_irqsave(&group
->lock
, flags
);
199 list_add_tail(&member
->list
, &group
->pending_list
);
200 if (group
->state
== MCAST_IDLE
) {
201 group
->state
= MCAST_BUSY
;
202 atomic_inc(&group
->refcount
);
203 queue_work(mcast_wq
, &group
->work
);
205 spin_unlock_irqrestore(&group
->lock
, flags
);
209 * A multicast group has three types of members: full member, non member, and
210 * send only member. We need to keep track of the number of members of each
211 * type based on their join state. Adjust the number of members that belong to
212 * the specified join states.
214 static void adjust_membership(struct mcast_group
*group
, u8 join_state
, int inc
)
218 for (i
= 0; i
< 3; i
++, join_state
>>= 1)
219 if (join_state
& 0x1)
220 group
->members
[i
] += inc
;
224 * If a multicast group has zero members left for a particular join state, but
225 * the group is still a member with the SA, we need to leave that join state.
226 * Determine which join states we still belong to, but that do not have any
229 static u8
get_leave_state(struct mcast_group
*group
)
234 for (i
= 0; i
< 3; i
++)
235 if (!group
->members
[i
])
236 leave_state
|= (0x1 << i
);
238 return leave_state
& group
->rec
.join_state
;
/*
 * Compare @src_value with @dst_value under the rule chosen by @selector.
 *
 * The first visible test fires when either @selector_mask or @value_mask
 * is absent from @comp_mask.  Three comparison forms follow, each storing
 * its result in `err`: src <= dst, src >= dst and src != dst.
 * NOTE(review): the statement taken when a mask bit is missing, the
 * dispatch that maps @selector values onto the three comparisons and the
 * return statement are not visible in this listing.
 */
241 static int check_selector(ib_sa_comp_mask comp_mask
,
242 ib_sa_comp_mask selector_mask
,
243 ib_sa_comp_mask value_mask
,
244 u8 selector
, u8 src_value
, u8 dst_value
)
248 if (!(comp_mask
& selector_mask
) || !(comp_mask
& value_mask
))
253 err
= (src_value
<= dst_value
);
256 err
= (src_value
>= dst_value
);
259 err
= (src_value
!= dst_value
);
/*
 * Compare two MCMemberRecords field by field, looking only at the fields
 * whose bit is set in @comp_mask.
 *
 * Plain fields (port GID, Q_Key, MLID, traffic class, P_Key, SL, flow
 * label, hop limit, scope) are tested for inequality; MTU, rate and packet
 * life time go through check_selector() using the selector stored in @dst.
 * mcast_work_handler() calls this with the group's record as @src and the
 * requesting member's record as @dst.
 * NOTE(review): the statement executed when a test fires and the final
 * return are not visible in this listing, nor is the tail of the MTU
 * check_selector() call.
 */
269 static int cmp_rec(struct ib_sa_mcmember_rec
*src
,
270 struct ib_sa_mcmember_rec
*dst
, ib_sa_comp_mask comp_mask
)
272 /* MGID must already match */
274 if (comp_mask
& IB_SA_MCMEMBER_REC_PORT_GID
&&
275 memcmp(&src
->port_gid
, &dst
->port_gid
, sizeof src
->port_gid
))
277 if (comp_mask
& IB_SA_MCMEMBER_REC_QKEY
&& src
->qkey
!= dst
->qkey
)
279 if (comp_mask
& IB_SA_MCMEMBER_REC_MLID
&& src
->mlid
!= dst
->mlid
)
281 if (check_selector(comp_mask
, IB_SA_MCMEMBER_REC_MTU_SELECTOR
,
282 IB_SA_MCMEMBER_REC_MTU
, dst
->mtu_selector
,
285 if (comp_mask
& IB_SA_MCMEMBER_REC_TRAFFIC_CLASS
&&
286 src
->traffic_class
!= dst
->traffic_class
)
288 if (comp_mask
& IB_SA_MCMEMBER_REC_PKEY
&& src
->pkey
!= dst
->pkey
)
290 if (check_selector(comp_mask
, IB_SA_MCMEMBER_REC_RATE_SELECTOR
,
291 IB_SA_MCMEMBER_REC_RATE
, dst
->rate_selector
,
292 src
->rate
, dst
->rate
))
294 if (check_selector(comp_mask
,
295 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR
,
296 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME
,
297 dst
->packet_life_time_selector
,
298 src
->packet_life_time
, dst
->packet_life_time
))
300 if (comp_mask
& IB_SA_MCMEMBER_REC_SL
&& src
->sl
!= dst
->sl
)
302 if (comp_mask
& IB_SA_MCMEMBER_REC_FLOW_LABEL
&&
303 src
->flow_label
!= dst
->flow_label
)
305 if (comp_mask
& IB_SA_MCMEMBER_REC_HOP_LIMIT
&&
306 src
->hop_limit
!= dst
->hop_limit
)
308 if (comp_mask
& IB_SA_MCMEMBER_REC_SCOPE
&& src
->scope
!= dst
->scope
)
311 /* join_state checked separately, proxy_join ignored */
/*
 * Issue the SA query that joins the group on behalf of @member.
 *
 * @member is first recorded as group->last_join, which is what
 * process_join_error() later compares against.  The query uses method
 * IB_MGMT_METHOD_SET with the member's own record and component mask,
 * GFP_KERNEL allocation, and join_handler() as completion callback with
 * @group as its context.  The call's result is stored in group->query_id.
 * NOTE(review): the last query argument, the condition guarding the
 * query_id assignment and the return statement are not visible in this
 * listing; the literal 3000 is presumably a timeout in milliseconds -
 * confirm against ib_sa_mcmember_rec_query().
 */
316 static int send_join(struct mcast_group
*group
, struct mcast_member
*member
)
318 struct mcast_port
*port
= group
->port
;
321 group
->last_join
= member
;
322 ret
= ib_sa_mcmember_rec_query(&sa_client
, port
->dev
->device
,
323 port
->port_num
, IB_MGMT_METHOD_SET
,
324 &member
->multicast
.rec
,
325 member
->multicast
.comp_mask
,
326 3000, GFP_KERNEL
, join_handler
, group
,
329 group
->query_id
= ret
;
/*
 * Issue the SA query that leaves the join states in @leave_state.
 *
 * A local record is sent with its join_state set to @leave_state, using
 * method IB_SA_METHOD_DELETE and a component mask of MGID, port GID and
 * join state.  leave_handler() is the completion callback with @group as
 * context, and the query handle is stored through &group->query.  The
 * call's result is stored in group->query_id.
 * NOTE(review): how the rest of `rec` is initialised, the condition
 * guarding the query_id assignment and the return statement are not
 * visible in this listing.
 */
335 static int send_leave(struct mcast_group
*group
, u8 leave_state
)
337 struct mcast_port
*port
= group
->port
;
338 struct ib_sa_mcmember_rec rec
;
342 rec
.join_state
= leave_state
;
344 ret
= ib_sa_mcmember_rec_query(&sa_client
, port
->dev
->device
,
345 port
->port_num
, IB_SA_METHOD_DELETE
, &rec
,
346 IB_SA_MCMEMBER_REC_MGID
|
347 IB_SA_MCMEMBER_REC_PORT_GID
|
348 IB_SA_MCMEMBER_REC_JOIN_STATE
,
349 3000, GFP_KERNEL
, leave_handler
,
350 group
, &group
->query
);
352 group
->query_id
= ret
;
/*
 * Record @member as an active member of @group.
 *
 * The member is marked MCAST_MEMBER, the per-join-state counters are
 * bumped, and the join state is OR-ed into the group's record.  The
 * member's own record is then overwritten with a copy of the group's
 * record, except that join_state is reset to the value passed in, and the
 * member is moved onto group->active_list.  The caller visible in this
 * file (mcast_work_handler()) invokes it with group->lock held.
 * NOTE(review): the second line of the parameter list, declaring
 * `join_state`, is not visible in this listing.
 */
358 static void join_group(struct mcast_group
*group
, struct mcast_member
*member
,
361 member
->state
= MCAST_MEMBER
;
362 adjust_membership(group
, join_state
, 1);
363 group
->rec
.join_state
|= join_state
;
364 member
->multicast
.rec
= group
->rec
;
365 member
->multicast
.rec
.join_state
= join_state
;
366 list_move(&member
->list
, &group
->active_list
);
/*
 * Report a failed join to the user.
 *
 * The member is unlinked from whatever list it is on under group->lock,
 * and then, with the lock dropped, the user's callback is invoked with
 * `status`.  Returns whatever the callback returns.
 * NOTE(review): the second line of the parameter list, declaring
 * `status`, is not visible in this listing.
 */
369 static int fail_join(struct mcast_group
*group
, struct mcast_member
*member
,
372 spin_lock_irq(&group
->lock
);
373 list_del_init(&member
->list
);
374 spin_unlock_irq(&group
->lock
);
375 return member
->multicast
.callback(status
, &member
->multicast
);
/*
 * Tear down every active membership of @group after an error.
 *
 * For each member on group->active_list, under group->lock: a member
 * reference is taken, the member is unlinked, its join state is subtracted
 * from the group's counters and it is marked MCAST_ERROR.  The lock is
 * then dropped while the user's callback is invoked with -ENETRESET, and
 * the temporary reference is released.  After the loop the group's
 * join_state is cleared and its state is set to MCAST_BUSY.
 * NOTE(review): the second callback argument and the condition under which
 * ib_sa_free_multicast() is called (presumably the callback's return
 * value) are not visible in this listing.
 */
378 static void process_group_error(struct mcast_group
*group
)
380 struct mcast_member
*member
;
383 spin_lock_irq(&group
->lock
);
384 while (!list_empty(&group
->active_list
)) {
385 member
= list_entry(group
->active_list
.next
,
386 struct mcast_member
, list
);
387 atomic_inc(&member
->refcount
);
388 list_del_init(&member
->list
);
389 adjust_membership(group
, member
->multicast
.rec
.join_state
, -1);
390 member
->state
= MCAST_ERROR
;
391 spin_unlock_irq(&group
->lock
);
393 ret
= member
->multicast
.callback(-ENETRESET
,
395 deref_member(member
);
397 ib_sa_free_multicast(&member
->multicast
);
398 spin_lock_irq(&group
->lock
);
401 group
->rec
.join_state
= 0;
402 group
->state
= MCAST_BUSY
;
403 spin_unlock_irq(&group
->lock
);
/*
 * Work item that drives a group's state machine.  It is queued on
 * mcast_wq and is also called directly from join_handler() and
 * leave_handler().
 *
 * While the group has pending members or is in MCAST_ERROR:
 *  - an errored group is handed to process_group_error() with the lock
 *    dropped;
 *  - otherwise the first pending member is taken (with a temporary
 *    reference).  If every join state it asks for is already set in the
 *    group's record, its record is checked with cmp_rec() and the user's
 *    callback is invoked with the lock dropped; join_group() is the path
 *    that makes it an active member.  If not, a join query is sent with
 *    send_join().
 * Once the loop ends, get_leave_state() decides whether join states must
 * be left; those bits are cleared from the group's record and send_leave()
 * is issued.  Finally the group is marked MCAST_IDLE and release_group()
 * drops a reference.
 * NOTE(review): several conditions, `continue`/`return`/`goto` statements
 * and braces are not visible in this listing, so the exact branch
 * structure must be confirmed against the complete source.
 */
406 static void mcast_work_handler(struct work_struct
*work
)
408 struct mcast_group
*group
;
409 struct mcast_member
*member
;
410 struct ib_sa_multicast
*multicast
;
414 group
= container_of(work
, typeof(*group
), work
);
416 spin_lock_irq(&group
->lock
);
417 while (!list_empty(&group
->pending_list
) ||
418 (group
->state
== MCAST_ERROR
)) {
420 if (group
->state
== MCAST_ERROR
) {
421 spin_unlock_irq(&group
->lock
);
422 process_group_error(group
);
426 member
= list_entry(group
->pending_list
.next
,
427 struct mcast_member
, list
);
428 multicast
= &member
->multicast
;
429 join_state
= multicast
->rec
.join_state
;
430 atomic_inc(&member
->refcount
);
432 if (join_state
== (group
->rec
.join_state
& join_state
)) {
433 status
= cmp_rec(&group
->rec
, &multicast
->rec
,
434 multicast
->comp_mask
);
436 join_group(group
, member
, join_state
);
438 list_del_init(&member
->list
);
439 spin_unlock_irq(&group
->lock
);
440 ret
= multicast
->callback(status
, multicast
);
442 spin_unlock_irq(&group
->lock
);
443 status
= send_join(group
, member
);
445 deref_member(member
);
448 ret
= fail_join(group
, member
, status
);
451 deref_member(member
);
453 ib_sa_free_multicast(&member
->multicast
);
454 spin_lock_irq(&group
->lock
);
457 join_state
= get_leave_state(group
);
459 group
->rec
.join_state
&= ~join_state
;
460 spin_unlock_irq(&group
->lock
);
461 if (send_leave(group
, join_state
))
464 group
->state
= MCAST_IDLE
;
465 spin_unlock_irq(&group
->lock
);
466 release_group(group
);
471 * Fail a join request if it is still active - at the head of the pending queue.
/*
 * Report a failed join query to the member that issued it.
 *
 * Under group->lock the first entry of the pending list is compared with
 * group->last_join (set by send_join()).  Only if they are the same member
 * is it unlinked and, with the lock dropped and a temporary reference
 * held, notified through its callback with @status.
 * NOTE(review): the condition under which ib_sa_free_multicast() is called
 * (presumably the callback's return value) and the else keyword before the
 * final unlock are not visible in this listing.
 */
473 static void process_join_error(struct mcast_group
*group
, int status
)
475 struct mcast_member
*member
;
478 spin_lock_irq(&group
->lock
);
479 member
= list_entry(group
->pending_list
.next
,
480 struct mcast_member
, list
);
481 if (group
->last_join
== member
) {
482 atomic_inc(&member
->refcount
);
483 list_del_init(&member
->list
);
484 spin_unlock_irq(&group
->lock
);
485 ret
= member
->multicast
.callback(status
, &member
->multicast
);
486 deref_member(member
);
488 ib_sa_free_multicast(&member
->multicast
);
490 spin_unlock_irq(&group
->lock
);
/*
 * Completion callback for the join query issued by send_join(); the
 * query's context is the group.
 *
 * Failures are routed to process_join_error().  On the other visible path,
 * under the port lock, a group whose stored MGID still equals mgid0 is
 * erased from the port's tree and re-inserted with duplicates allowed.
 * The handler ends by running mcast_work_handler() directly to continue
 * processing the group.
 * NOTE(review): the tail of the parameter list, the test on `status` and
 * the statement that updates the group's record from @rec are not visible
 * in this listing.
 */
493 static void join_handler(int status
, struct ib_sa_mcmember_rec
*rec
,
496 struct mcast_group
*group
= context
;
499 process_join_error(group
, status
);
501 spin_lock_irq(&group
->port
->lock
);
503 if (!memcmp(&mgid0
, &group
->rec
.mgid
, sizeof mgid0
)) {
504 rb_erase(&group
->node
, &group
->port
->table
);
505 mcast_insert(group
->port
, group
, 1);
507 spin_unlock_irq(&group
->port
->lock
);
509 mcast_work_handler(&group
->work
);
/*
 * Completion callback for the leave query issued by send_leave(); the
 * query's context is the group.  The visible body simply resumes group
 * processing by running mcast_work_handler() directly.
 * NOTE(review): the tail of the parameter list is not visible in this
 * listing.
 */
512 static void leave_handler(int status
, struct ib_sa_mcmember_rec
*rec
,
515 struct mcast_group
*group
= context
;
517 mcast_work_handler(&group
->work
);
/*
 * Find the group for @mgid on @port, creating it if necessary.
 *
 * An existing group is looked up with mcast_find() under port->lock.
 * Otherwise a zeroed group is allocated with @gfp_mask outside the lock,
 * initialised (MGID, lists, work item, lock) and inserted with
 * mcast_insert() under the lock; duplicates are allowed only when @mgid
 * equals mgid0.  A port reference is taken on the creation path, and a
 * group reference is taken before the lock is released.
 * NOTE(review): the conditions around the lookup result, the allocation
 * failure path, the handling of the group returned by mcast_insert() and
 * the return statements are not visible in this listing.
 */
520 static struct mcast_group
*acquire_group(struct mcast_port
*port
,
521 union ib_gid
*mgid
, gfp_t gfp_mask
)
523 struct mcast_group
*group
, *cur_group
;
527 is_mgid0
= !memcmp(&mgid0
, mgid
, sizeof mgid0
);
529 spin_lock_irqsave(&port
->lock
, flags
);
530 group
= mcast_find(port
, mgid
);
533 spin_unlock_irqrestore(&port
->lock
, flags
);
536 group
= kzalloc(sizeof *group
, gfp_mask
);
541 group
->rec
.mgid
= *mgid
;
542 INIT_LIST_HEAD(&group
->pending_list
);
543 INIT_LIST_HEAD(&group
->active_list
);
544 INIT_WORK(&group
->work
, mcast_work_handler
);
545 spin_lock_init(&group
->lock
);
547 spin_lock_irqsave(&port
->lock
, flags
);
548 cur_group
= mcast_insert(port
, group
, is_mgid0
);
553 atomic_inc(&port
->refcount
);
555 atomic_inc(&group
->refcount
);
556 spin_unlock_irqrestore(&port
->lock
, flags
);
561 * We serialize all join requests to a single group to make our lives much
562 * easier. Otherwise, two users could try to join the same group
563 * simultaneously, with different configurations, one could leave while the
564 * join is in progress, etc., which makes locking around error recovery
/*
 * Start joining a multicast group on behalf of @client.
 *
 * Looks up this file's per-device data for @device, allocates a member
 * with @gfp_mask, takes a reference on @client, and fills the member from
 * @rec, @comp_mask, @callback and the caller's context.  The member starts
 * with one reference and in state MCAST_JOINING, and is attached to the
 * group returned by acquire_group() for the port at index
 * (port_num - dev->start_port).
 *
 * Returns ERR_PTR(-ENODEV) when no per-device data exists and
 * ERR_PTR(-ENOMEM) when the member cannot be allocated.  On the visible
 * error path the client reference is dropped again.
 * NOTE(review): the last parameter line, the NULL checks before the two
 * ERR_PTR returns, the body of the acquire_group() failure branch, the
 * call that queues the join and the final return are not visible in this
 * listing.
 */
567 struct ib_sa_multicast
*
568 ib_sa_join_multicast(struct ib_sa_client
*client
,
569 struct ib_device
*device
, u8 port_num
,
570 struct ib_sa_mcmember_rec
*rec
,
571 ib_sa_comp_mask comp_mask
, gfp_t gfp_mask
,
572 int (*callback
)(int status
,
573 struct ib_sa_multicast
*multicast
),
576 struct mcast_device
*dev
;
577 struct mcast_member
*member
;
578 struct ib_sa_multicast
*multicast
;
581 dev
= ib_get_client_data(device
, &mcast_client
);
583 return ERR_PTR(-ENODEV
);
585 member
= kmalloc(sizeof *member
, gfp_mask
);
587 return ERR_PTR(-ENOMEM
);
589 ib_sa_client_get(client
);
590 member
->client
= client
;
591 member
->multicast
.rec
= *rec
;
592 member
->multicast
.comp_mask
= comp_mask
;
593 member
->multicast
.callback
= callback
;
594 member
->multicast
.context
= context
;
595 init_completion(&member
->comp
);
596 atomic_set(&member
->refcount
, 1);
597 member
->state
= MCAST_JOINING
;
599 member
->group
= acquire_group(&dev
->port
[port_num
- dev
->start_port
],
600 &rec
->mgid
, gfp_mask
);
601 if (!member
->group
) {
607 * The user will get the multicast structure in their callback. They
608 * could then free the multicast structure before we can return from
609 * this routine. So we save the pointer to return before queuing
612 multicast
= &member
->multicast
;
617 ib_sa_client_put(client
);
621 EXPORT_SYMBOL(ib_sa_join_multicast
);
/*
 * Release a membership obtained from ib_sa_join_multicast().
 *
 * Under group->lock: if the member had reached MCAST_MEMBER its join state
 * is subtracted from the group's counters, and the member is unlinked from
 * its list.  If the group is idle it is marked MCAST_BUSY and its work
 * item is queued, keeping the member's group reference for that work;
 * otherwise the reference is dropped with release_group().
 *
 * The function then drops the member's own reference and blocks in
 * wait_for_completion() until every other reference is gone, after which
 * the client reference is released.
 * NOTE(review): the else/brace structure and any statement after
 * ib_sa_client_put() (presumably freeing the member) are not visible in
 * this listing.
 */
623 void ib_sa_free_multicast(struct ib_sa_multicast
*multicast
)
625 struct mcast_member
*member
;
626 struct mcast_group
*group
;
628 member
= container_of(multicast
, struct mcast_member
, multicast
);
629 group
= member
->group
;
631 spin_lock_irq(&group
->lock
);
632 if (member
->state
== MCAST_MEMBER
)
633 adjust_membership(group
, multicast
->rec
.join_state
, -1);
635 list_del_init(&member
->list
);
637 if (group
->state
== MCAST_IDLE
) {
638 group
->state
= MCAST_BUSY
;
639 spin_unlock_irq(&group
->lock
);
640 /* Continue to hold reference on group until callback */
641 queue_work(mcast_wq
, &group
->work
);
643 spin_unlock_irq(&group
->lock
);
644 release_group(group
);
647 deref_member(member
);
648 wait_for_completion(&member
->comp
);
649 ib_sa_client_put(member
->client
);
652 EXPORT_SYMBOL(ib_sa_free_multicast
);
/*
 * Look up the group identified by @mgid on port @port_num of @device.
 *
 * The per-device data is fetched with ib_get_client_data(), the port is
 * selected at index (port_num - dev->start_port), and the group is
 * searched with mcast_find() while holding port->lock.
 * -EADDRNOTAVAIL is one of the values assigned to the result.
 * NOTE(review): the check of `dev`, the branch that handles a found group
 * (presumably copying its record into @rec) and the return statement are
 * not visible in this listing.
 */
654 int ib_sa_get_mcmember_rec(struct ib_device
*device
, u8 port_num
,
655 union ib_gid
*mgid
, struct ib_sa_mcmember_rec
*rec
)
657 struct mcast_device
*dev
;
658 struct mcast_port
*port
;
659 struct mcast_group
*group
;
663 dev
= ib_get_client_data(device
, &mcast_client
);
667 port
= &dev
->port
[port_num
- dev
->start_port
];
668 spin_lock_irqsave(&port
->lock
, flags
);
669 group
= mcast_find(port
, mgid
);
673 ret
= -EADDRNOTAVAIL
;
674 spin_unlock_irqrestore(&port
->lock
, flags
);
678 EXPORT_SYMBOL(ib_sa_get_mcmember_rec
);
/*
 * Fill @ah_attr from the multicast member record @rec.
 *
 * The record's port GID is looked up in the device's GID cache to obtain
 * the source GID index.  @ah_attr is then zeroed and populated: dlid from
 * the MLID (converted from big-endian), sl, port_num, static_rate, and a
 * GRH (ah_flags = IB_AH_GRH) whose dgid is the MGID and whose flow label
 * (converted from big-endian), hop limit and traffic class come from @rec.
 * NOTE(review): the local declarations, the handling of a failed cache
 * lookup and the return statements are not visible in this listing.
 */
680 int ib_init_ah_from_mcmember(struct ib_device
*device
, u8 port_num
,
681 struct ib_sa_mcmember_rec
*rec
,
682 struct ib_ah_attr
*ah_attr
)
688 ret
= ib_find_cached_gid(device
, &rec
->port_gid
, &p
, &gid_index
);
692 memset(ah_attr
, 0, sizeof *ah_attr
);
693 ah_attr
->dlid
= be16_to_cpu(rec
->mlid
);
694 ah_attr
->sl
= rec
->sl
;
695 ah_attr
->port_num
= port_num
;
696 ah_attr
->static_rate
= rec
->rate
;
698 ah_attr
->ah_flags
= IB_AH_GRH
;
699 ah_attr
->grh
.dgid
= rec
->mgid
;
701 ah_attr
->grh
.sgid_index
= (u8
) gid_index
;
702 ah_attr
->grh
.flow_label
= be32_to_cpu(rec
->flow_label
);
703 ah_attr
->grh
.hop_limit
= rec
->hop_limit
;
704 ah_attr
->grh
.traffic_class
= rec
->traffic_class
;
708 EXPORT_SYMBOL(ib_init_ah_from_mcmember
);
/*
 * Put every group on @port into the error state.
 *
 * The port's tree is walked from rb_first() with rb_next() while holding
 * port->lock.  For each group, under its own lock, an idle group gets an
 * extra reference and has its work item queued, and the group's state is
 * set to MCAST_ERROR so that mcast_work_handler() will run
 * process_group_error() on it.
 * NOTE(review): the declaration of `flags` and the closing braces are not
 * visible in this listing.
 */
710 static void mcast_groups_lost(struct mcast_port
*port
)
712 struct mcast_group
*group
;
713 struct rb_node
*node
;
716 spin_lock_irqsave(&port
->lock
, flags
);
717 for (node
= rb_first(&port
->table
); node
; node
= rb_next(node
)) {
718 group
= rb_entry(node
, struct mcast_group
, node
);
719 spin_lock(&group
->lock
);
720 if (group
->state
== MCAST_IDLE
) {
721 atomic_inc(&group
->refcount
);
722 queue_work(mcast_wq
, &group
->work
);
724 group
->state
= MCAST_ERROR
;
725 spin_unlock(&group
->lock
);
727 spin_unlock_irqrestore(&port
->lock
, flags
);
/*
 * Asynchronous event handler registered by mcast_add_one().
 *
 * The owning mcast_device is recovered from @handler with container_of().
 * For port error, LID change, SM change and client re-register events,
 * mcast_groups_lost() is called for the port named in the event.
 * NOTE(review): the remainder of the port index expression and the rest of
 * the switch statement are not visible in this listing.
 */
730 static void mcast_event_handler(struct ib_event_handler
*handler
,
731 struct ib_event
*event
)
733 struct mcast_device
*dev
;
735 dev
= container_of(handler
, struct mcast_device
, event_handler
);
737 switch (event
->event
) {
738 case IB_EVENT_PORT_ERR
:
739 case IB_EVENT_LID_CHANGE
:
740 case IB_EVENT_SM_CHANGE
:
741 case IB_EVENT_CLIENT_REREGISTER
:
742 mcast_groups_lost(&dev
->port
[event
->element
.port_num
-
/*
 * ib_client add callback: set up per-device multicast state.
 *
 * The device's transport is tested against RDMA_TRANSPORT_IB first.  A
 * mcast_device is allocated with room for phys_port_cnt trailing
 * mcast_port entries.  Switches use the single port number 0; otherwise
 * end_port is the device's phys_port_cnt.  Each port entry gets its port
 * number, lock, empty tree, completion and an initial reference count of
 * one.  Finally the structure is stored as this client's data for the
 * device and mcast_event_handler() is registered for its events.
 * NOTE(review): the statement taken for non-IB devices, the allocation
 * flags and failure check, the non-switch start_port assignment and one
 * per-port assignment are not visible in this listing.
 */
750 static void mcast_add_one(struct ib_device
*device
)
752 struct mcast_device
*dev
;
753 struct mcast_port
*port
;
756 if (rdma_node_get_transport(device
->node_type
) != RDMA_TRANSPORT_IB
)
759 dev
= kmalloc(sizeof *dev
+ device
->phys_port_cnt
* sizeof *port
,
764 if (device
->node_type
== RDMA_NODE_IB_SWITCH
)
765 dev
->start_port
= dev
->end_port
= 0;
768 dev
->end_port
= device
->phys_port_cnt
;
771 for (i
= 0; i
<= dev
->end_port
- dev
->start_port
; i
++) {
772 port
= &dev
->port
[i
];
774 port
->port_num
= dev
->start_port
+ i
;
775 spin_lock_init(&port
->lock
);
776 port
->table
= RB_ROOT
;
777 init_completion(&port
->comp
);
778 atomic_set(&port
->refcount
, 1);
781 dev
->device
= device
;
782 ib_set_client_data(device
, &mcast_client
, dev
);
784 INIT_IB_EVENT_HANDLER(&dev
->event_handler
, device
, mcast_event_handler
);
785 ib_register_event_handler(&dev
->event_handler
);
/*
 * ib_client remove callback: tear down per-device multicast state.
 *
 * The event handler is unregistered and mcast_wq is flushed, after which
 * the function blocks on each port's completion, which deref_port()
 * signals when the port's reference count reaches zero.
 * NOTE(review): the check of `dev`, the statement that drops the initial
 * port reference before waiting and the final release of `dev` are not
 * visible in this listing.
 */
788 static void mcast_remove_one(struct ib_device
*device
)
790 struct mcast_device
*dev
;
791 struct mcast_port
*port
;
794 dev
= ib_get_client_data(device
, &mcast_client
);
798 ib_unregister_event_handler(&dev
->event_handler
);
799 flush_workqueue(mcast_wq
);
801 for (i
= 0; i
<= dev
->end_port
- dev
->start_port
; i
++) {
802 port
= &dev
->port
[i
];
804 wait_for_completion(&port
->comp
);
/*
 * Module set-up sequence: create the single-threaded "ib_mcast" workqueue,
 * register the SA client, then register the ib_client.  The last two calls
 * undo the first two steps in reverse order.
 * NOTE(review): the enclosing function header, the failure checks that
 * select the unwind path and the return statements are not visible in this
 * listing; presumably this is the init counterpart of mcast_cleanup().
 */
814 mcast_wq
= create_singlethread_workqueue("ib_mcast");
818 ib_sa_register_client(&sa_client
);
820 ret
= ib_register_client(&mcast_client
);
826 ib_sa_unregister_client(&sa_client
);
827 destroy_workqueue(mcast_wq
);
831 void mcast_cleanup(void)
833 ib_unregister_client(&mcast_client
);
834 ib_sa_unregister_client(&sa_client
);
835 destroy_workqueue(mcast_wq
);