/*
 * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/netdevice.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/vport.h>
#include "mlx5_core.h"

enum {
	MLX5_LAG_FLAG_BONDED = 1 << 0,
};

struct lag_func {
	struct mlx5_core_dev *dev;
	struct net_device    *netdev;
};

/* Used for collection of netdev event info. */
struct lag_tracker {
	enum   netdev_lag_tx_type           tx_type;
	struct netdev_lag_lower_state_info  netdev_state[MLX5_MAX_PORTS];
	bool is_bonded;
};

/* LAG data of a ConnectX card.
 * It serves both its phys functions.
 */
struct mlx5_lag {
	u8                        flags;
	u8                        v2p_map[MLX5_MAX_PORTS];
	struct lag_func           pf[MLX5_MAX_PORTS];
	struct lag_tracker        tracker;
	struct delayed_work       bond_work;
	struct notifier_block     nb;

	/* Admin state. Allow lag only if allowed is true
	 * even if network conditions for lag were met.
	 */
	bool                      allowed;
};

/* General purpose, use for short periods of time.
 * Beware of lock dependencies (preferably, no locks should be acquired
 * under it).
 */
static DEFINE_MUTEX(lag_mutex);

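/* Thin wrappers around the LAG firmware commands: CREATE_LAG programs the
 * initial tx port affinity, MODIFY_LAG (field_select 0x1) updates it on an
 * existing LAG, and DESTROY_LAG tears it down.
 */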
static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 remap_port1,
			       u8 remap_port2)
{
	u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {0};
	u32 out[MLX5_ST_SZ_DW(create_lag_out)] = {0};
	void *lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx);

	MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG);

	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1);
	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}

static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 remap_port1,
			       u8 remap_port2)
{
	u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {0};
	u32 out[MLX5_ST_SZ_DW(modify_lag_out)] = {0};
	void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);

	MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
	MLX5_SET(modify_lag_in, in, field_select, 0x1);

	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1);
	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}

static int mlx5_cmd_destroy_lag(struct mlx5_core_dev *dev)
{
	u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {0};
	u32 out[MLX5_ST_SZ_DW(destroy_lag_out)] = {0};

	MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}

int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev)
{
	u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {0};
	u32 out[MLX5_ST_SZ_DW(create_vport_lag_out)] = {0};

	MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
EXPORT_SYMBOL(mlx5_cmd_create_vport_lag);

int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
{
	u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {0};
	u32 out[MLX5_ST_SZ_DW(destroy_vport_lag_out)] = {0};

	MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);

static int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev,
				       bool reset, void *out, int out_size)
{
	u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = { };

	MLX5_SET(query_cong_statistics_in, in, opcode,
		 MLX5_CMD_OP_QUERY_CONG_STATISTICS);
	MLX5_SET(query_cong_statistics_in, in, clear, reset);
	return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size);
}

static struct mlx5_lag *mlx5_lag_dev_get(struct mlx5_core_dev *dev)
{
	return dev->priv.lag;
}

static int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
				       struct net_device *ndev)
{
	int i;

	for (i = 0; i < MLX5_MAX_PORTS; i++)
		if (ldev->pf[i].netdev == ndev)
			return i;

	return -1;
}

static bool mlx5_lag_is_bonded(struct mlx5_lag *ldev)
{
	return !!(ldev->flags & MLX5_LAG_FLAG_BONDED);
}

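/* Infer the (1-based) virtual to physical port mapping from the tracked
 * lower state: each port normally maps to itself, and is remapped to its
 * peer when it is not tx-enabled or its link is down.
 */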
static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
					   u8 *port1, u8 *port2)
{
	*port1 = 1;
	*port2 = 2;
	if (!tracker->netdev_state[0].tx_enabled ||
	    !tracker->netdev_state[0].link_up) {
		*port1 = 2;
		return;
	}

	if (!tracker->netdev_state[1].tx_enabled ||
	    !tracker->netdev_state[1].link_up)
		*port2 = 1;
}

static void mlx5_activate_lag(struct mlx5_lag *ldev,
			      struct lag_tracker *tracker)
{
	struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
	int err;

	ldev->flags |= MLX5_LAG_FLAG_BONDED;

	mlx5_infer_tx_affinity_mapping(tracker, &ldev->v2p_map[0],
				       &ldev->v2p_map[1]);

	err = mlx5_cmd_create_lag(dev0, ldev->v2p_map[0], ldev->v2p_map[1]);
	if (err)
		mlx5_core_err(dev0,
			      "Failed to create LAG (%d)\n",
			      err);
}

static void mlx5_deactivate_lag(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
	int err;

	ldev->flags &= ~MLX5_LAG_FLAG_BONDED;

	err = mlx5_cmd_destroy_lag(dev0);
	if (err)
		mlx5_core_err(dev0,
			      "Failed to destroy LAG (%d)\n",
			      err);
}

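/* Re-evaluate the bond: activate LAG when the tracker reports bonding and
 * the admin state allows it, refresh the tx affinity mapping while bonded,
 * and deactivate once the conditions no longer hold. The IB interface is
 * removed from both PFs before activation and re-added only for PF0, since
 * a bonded card exposes a single IB device.
 */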
static void mlx5_do_bond(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[1].dev;
	struct lag_tracker tracker;
	u8 v2p_port1, v2p_port2;
	int i, err;
	bool do_bond;

	if (!dev0 || !dev1)
		return;

	mutex_lock(&lag_mutex);
	tracker = ldev->tracker;
	mutex_unlock(&lag_mutex);

	do_bond = tracker.is_bonded && ldev->allowed;

	if (do_bond && !mlx5_lag_is_bonded(ldev)) {
		for (i = 0; i < MLX5_MAX_PORTS; i++)
			mlx5_remove_dev_by_protocol(ldev->pf[i].dev,
						    MLX5_INTERFACE_PROTOCOL_IB);

		mlx5_activate_lag(ldev, &tracker);

		mlx5_add_dev_by_protocol(dev0, MLX5_INTERFACE_PROTOCOL_IB);
		mlx5_nic_vport_enable_roce(dev1);
	} else if (do_bond && mlx5_lag_is_bonded(ldev)) {
		mlx5_infer_tx_affinity_mapping(&tracker, &v2p_port1,
					       &v2p_port2);

		if ((v2p_port1 != ldev->v2p_map[0]) ||
		    (v2p_port2 != ldev->v2p_map[1])) {
			ldev->v2p_map[0] = v2p_port1;
			ldev->v2p_map[1] = v2p_port2;

			err = mlx5_cmd_modify_lag(dev0, v2p_port1, v2p_port2);
			if (err)
				mlx5_core_err(dev0,
					      "Failed to modify LAG (%d)\n",
					      err);
		}
	} else if (!do_bond && mlx5_lag_is_bonded(ldev)) {
		mlx5_remove_dev_by_protocol(dev0, MLX5_INTERFACE_PROTOCOL_IB);
		mlx5_nic_vport_disable_roce(dev1);

		mlx5_deactivate_lag(ldev);

		for (i = 0; i < MLX5_MAX_PORTS; i++)
			if (ldev->pf[i].dev)
				mlx5_add_dev_by_protocol(ldev->pf[i].dev,
							 MLX5_INTERFACE_PROTOCOL_IB);
	}
}

static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay)
{
	schedule_delayed_work(&ldev->bond_work, delay);
}

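/* Deferred bond work: the device list lock is taken with a trylock so the
 * worker never blocks on it; on contention the work is re-queued with a
 * one second (HZ) delay.
 */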
static void mlx5_do_bond_work(struct work_struct *work)
{
	struct delayed_work *delayed_work = to_delayed_work(work);
	struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
					     bond_work);
	int status;

	status = mlx5_dev_list_trylock();
	if (!status) {
		/* 1 sec delay. */
		mlx5_queue_bond_work(ldev, HZ);
		return;
	}

	mlx5_do_bond(ldev);
	mlx5_dev_list_unlock();
}

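/* Returns 1 if the tracker state changed and the bond work should be
 * scheduled, 0 otherwise.
 */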
static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
					 struct lag_tracker *tracker,
					 struct net_device *ndev,
					 struct netdev_notifier_changeupper_info *info)
{
	struct net_device *upper = info->upper_dev, *ndev_tmp;
	struct netdev_lag_upper_info *lag_upper_info = NULL;
	bool is_bonded;
	int bond_status = 0;
	int num_slaves = 0;
	int idx;

	if (!netif_is_lag_master(upper))
		return 0;

	if (info->linking)
		lag_upper_info = info->upper_info;

	/* The event may still be of interest if the slave does not belong to
	 * us, but is enslaved to a master which has one or more of our netdevs
	 * as slaves (e.g., if a new slave is added to a master that bonds two
	 * of our netdevs, we should unbond).
	 */
	rcu_read_lock();
	for_each_netdev_in_bond_rcu(upper, ndev_tmp) {
		idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
		if (idx > -1)
			bond_status |= (1 << idx);

		num_slaves++;
	}
	rcu_read_unlock();

	/* None of this lagdev's netdevs are slaves of this master. */
	if (!(bond_status & 0x3))
		return 0;

	if (lag_upper_info)
		tracker->tx_type = lag_upper_info->tx_type;

	/* Determine bonding status:
	 * A device is considered bonded if both its physical ports are slaves
	 * of the same lag master, and only them.
	 * Lag mode must be activebackup or hash.
	 */
	is_bonded = (num_slaves == MLX5_MAX_PORTS) &&
		    (bond_status == 0x3) &&
		    ((tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) ||
		     (tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH));

	if (tracker->is_bonded != is_bonded) {
		tracker->is_bonded = is_bonded;
		return 1;
	}

	return 0;
}

static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev,
					      struct lag_tracker *tracker,
					      struct net_device *ndev,
					      struct netdev_notifier_changelowerstate_info *info)
{
	struct netdev_lag_lower_state_info *lag_lower_info;
	int idx;

	if (!netif_is_lag_port(ndev))
		return 0;

	idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev);
	if (idx == -1)
		return 0;

	/* This information is used to determine virtual to physical
	 * port mapping.
	 */
	lag_lower_info = info->lower_state_info;
	if (!lag_lower_info)
		return 0;

	tracker->netdev_state[idx] = *lag_lower_info;

	return 1;
}

static int mlx5_lag_netdev_event(struct notifier_block *this,
				 unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
	struct lag_tracker tracker;
	struct mlx5_lag *ldev;
	int changed = 0;

	if (!net_eq(dev_net(ndev), &init_net))
		return NOTIFY_DONE;

	if ((event != NETDEV_CHANGEUPPER) && (event != NETDEV_CHANGELOWERSTATE))
		return NOTIFY_DONE;

	ldev    = container_of(this, struct mlx5_lag, nb);
	tracker = ldev->tracker;

	switch (event) {
	case NETDEV_CHANGEUPPER:
		changed = mlx5_handle_changeupper_event(ldev, &tracker, ndev,
							ptr);
		break;
	case NETDEV_CHANGELOWERSTATE:
		changed = mlx5_handle_changelowerstate_event(ldev, &tracker,
							     ndev, ptr);
		break;
	}

	mutex_lock(&lag_mutex);
	ldev->tracker = tracker;
	mutex_unlock(&lag_mutex);

	if (changed)
		mlx5_queue_bond_work(ldev, 0);

	return NOTIFY_DONE;
}

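/* LAG is not allowed while SR-IOV is enabled on either PF. */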
static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
{
	if ((ldev->pf[0].dev && mlx5_sriov_is_enabled(ldev->pf[0].dev)) ||
	    (ldev->pf[1].dev && mlx5_sriov_is_enabled(ldev->pf[1].dev)))
		return false;
	else
		return true;
}

static struct mlx5_lag *mlx5_lag_dev_alloc(void)
{
	struct mlx5_lag *ldev;

	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
	if (!ldev)
		return NULL;

	INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);
	ldev->allowed = mlx5_lag_check_prereq(ldev);

	return ldev;
}

static void mlx5_lag_dev_free(struct mlx5_lag *ldev)
{
	kfree(ldev);
}

static void mlx5_lag_dev_add_pf(struct mlx5_lag *ldev,
				struct mlx5_core_dev *dev,
				struct net_device *netdev)
{
	unsigned int fn = PCI_FUNC(dev->pdev->devfn);

	if (fn >= MLX5_MAX_PORTS)
		return;

	mutex_lock(&lag_mutex);
	ldev->pf[fn].dev    = dev;
	ldev->pf[fn].netdev = netdev;
	ldev->tracker.netdev_state[fn].link_up = 0;
	ldev->tracker.netdev_state[fn].tx_enabled = 0;

	ldev->allowed = mlx5_lag_check_prereq(ldev);
	dev->priv.lag = ldev;

	mutex_unlock(&lag_mutex);
}

static void mlx5_lag_dev_remove_pf(struct mlx5_lag *ldev,
				   struct mlx5_core_dev *dev)
{
	int i;

	for (i = 0; i < MLX5_MAX_PORTS; i++)
		if (ldev->pf[i].dev == dev)
			break;

	if (i == MLX5_MAX_PORTS)
		return;

	mutex_lock(&lag_mutex);
	memset(&ldev->pf[i], 0, sizeof(*ldev->pf));

	dev->priv.lag = NULL;
	ldev->allowed = mlx5_lag_check_prereq(ldev);
	mutex_unlock(&lag_mutex);
}

/* Must be called with intf_mutex held */
void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
{
	struct mlx5_lag *ldev = NULL;
	struct mlx5_core_dev *tmp_dev;

	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
	    !MLX5_CAP_GEN(dev, lag_master) ||
	    (MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_MAX_PORTS))
		return;

	tmp_dev = mlx5_get_next_phys_dev(dev);
	if (tmp_dev)
		ldev = tmp_dev->priv.lag;

	if (!ldev) {
		ldev = mlx5_lag_dev_alloc();
		if (!ldev) {
			mlx5_core_err(dev, "Failed to alloc lag dev\n");
			return;
		}
	}

	mlx5_lag_dev_add_pf(ldev, dev, netdev);

	if (!ldev->nb.notifier_call) {
		ldev->nb.notifier_call = mlx5_lag_netdev_event;
		if (register_netdevice_notifier(&ldev->nb)) {
			ldev->nb.notifier_call = NULL;
			mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
		}
	}
}

/* Must be called with intf_mutex held */
void mlx5_lag_remove(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	int i;

	ldev = mlx5_lag_dev_get(dev);
	if (!ldev)
		return;

	if (mlx5_lag_is_bonded(ldev))
		mlx5_deactivate_lag(ldev);

	mlx5_lag_dev_remove_pf(ldev, dev);

	for (i = 0; i < MLX5_MAX_PORTS; i++)
		if (ldev->pf[i].dev)
			break;

	if (i == MLX5_MAX_PORTS) {
		if (ldev->nb.notifier_call)
			unregister_netdevice_notifier(&ldev->nb);
		cancel_delayed_work_sync(&ldev->bond_work);
		mlx5_lag_dev_free(ldev);
	}
}

bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	bool res;

	mutex_lock(&lag_mutex);
	ldev = mlx5_lag_dev_get(dev);
	res  = ldev && mlx5_lag_is_bonded(ldev);
	mutex_unlock(&lag_mutex);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_active);

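/* Admin state control: mlx5_lag_forbid()/mlx5_lag_allow() toggle
 * ldev->allowed under the device list lock; a state change re-runs
 * mlx5_do_bond(), so forbidding LAG tears down an active bond.
 */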
static int mlx5_lag_set_state(struct mlx5_core_dev *dev, bool allow)
{
	struct mlx5_lag *ldev;
	bool lag_active;
	int ret = 0;

	mlx5_dev_list_lock();

	ldev = mlx5_lag_dev_get(dev);
	if (!ldev) {
		ret = -ENODEV;
		goto unlock;
	}
	lag_active = mlx5_lag_is_bonded(ldev);
	if (!mlx5_lag_check_prereq(ldev) && allow) {
		ret = -EINVAL;
		goto unlock;
	}
	if (ldev->allowed == allow)
		goto unlock;
	ldev->allowed = allow;
	if ((lag_active && !allow) || allow)
		mlx5_do_bond(ldev);
unlock:
	mlx5_dev_list_unlock();
	return ret;
}

int mlx5_lag_forbid(struct mlx5_core_dev *dev)
{
	return mlx5_lag_set_state(dev, false);
}

int mlx5_lag_allow(struct mlx5_core_dev *dev)
{
	return mlx5_lag_set_state(dev, true);
}

struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
{
	struct net_device *ndev = NULL;
	struct mlx5_lag *ldev;

	mutex_lock(&lag_mutex);
	ldev = mlx5_lag_dev_get(dev);

	if (!(ldev && mlx5_lag_is_bonded(ldev)))
		goto unlock;

	if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
		ndev = ldev->tracker.netdev_state[0].tx_enabled ?
		       ldev->pf[0].netdev : ldev->pf[1].netdev;
	} else {
		ndev = ldev->pf[0].netdev;
	}
	if (ndev)
		dev_hold(ndev);

unlock:
	mutex_unlock(&lag_mutex);

	return ndev;
}
EXPORT_SYMBOL(mlx5_lag_get_roce_netdev);

bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv)
{
	struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev,
						 priv);
	struct mlx5_lag *ldev;

	if (intf->protocol != MLX5_INTERFACE_PROTOCOL_IB)
		return true;

	ldev = mlx5_lag_dev_get(dev);
	if (!ldev || !mlx5_lag_is_bonded(ldev) || ldev->pf[0].dev == dev)
		return true;

	/* If bonded, we do not add an IB device for PF1. */
	return false;
}

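/* When bonded, congestion statistics are queried from both PFs and summed
 * per counter; otherwise only this device is queried.
 */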
int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
				 u64 *values,
				 int num_counters,
				 size_t *offsets)
{
	int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
	struct mlx5_core_dev *mdev[MLX5_MAX_PORTS];
	struct mlx5_lag *ldev;
	int num_ports;
	int ret, i, j;
	void *out;

	out = kvzalloc(outlen, GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	memset(values, 0, sizeof(*values) * num_counters);

	mutex_lock(&lag_mutex);
	ldev = mlx5_lag_dev_get(dev);
	if (ldev && mlx5_lag_is_bonded(ldev)) {
		num_ports = MLX5_MAX_PORTS;
		mdev[0] = ldev->pf[0].dev;
		mdev[1] = ldev->pf[1].dev;
	} else {
		num_ports = 1;
		mdev[0] = dev;
	}

	for (i = 0; i < num_ports; ++i) {
		ret = mlx5_cmd_query_cong_counter(mdev[i], false, out, outlen);
		if (ret)
			goto unlock;

		for (j = 0; j < num_counters; ++j)
			values[j] += be64_to_cpup((__be64 *)(out + offsets[j]));
	}

unlock:
	mutex_unlock(&lag_mutex);
	kvfree(out);
	return ret;
}
EXPORT_SYMBOL(mlx5_lag_query_cong_counters);