/*
 * net/sched/sch_mqprio.c
 *
 * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 */
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/module.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
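/* mqprio maps skb priorities to hardware traffic classes, each backed by a
 * contiguous range of TX queues. An illustrative userspace invocation
 * (iproute2 syntax; the interface name and queue layout are examples only):
 *
 *   tc qdisc add dev eth0 root mqprio num_tc 4 \
 *      map 0 0 1 1 2 2 3 3 0 0 0 0 0 0 0 0 \
 *      queues 2@0 2@2 2@4 2@6 hw 0
 *
 * This defines four traffic classes of two queues each (count@offset) and
 * maps the sixteen priority values onto them in software (hw 0).
 */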
struct mqprio_sched {
	struct Qdisc		**qdiscs;
	u16 mode;
	u16 shaper;
	int hw_offload;
	u32 flags;
	u64 min_rate[TC_QOPT_MAX_QUEUE];
	u64 max_rate[TC_QOPT_MAX_QUEUE];
};
static void mqprio_destroy(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mqprio_sched *priv = qdisc_priv(sch);
	unsigned int ntx;

	if (priv->qdiscs) {
		for (ntx = 0;
		     ntx < dev->num_tx_queues && priv->qdiscs[ntx];
		     ntx++)
			qdisc_destroy(priv->qdiscs[ntx]);
		kfree(priv->qdiscs);
	}

	if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) {
		struct tc_mqprio_qopt_offload mqprio = { { 0 } };

		switch (priv->mode) {
		case TC_MQPRIO_MODE_DCB:
		case TC_MQPRIO_MODE_CHANNEL:
			dev->netdev_ops->ndo_setup_tc(dev,
						      TC_SETUP_QDISC_MQPRIO,
						      &mqprio);
			break;
		default:
			return;
		}
	} else {
		netdev_set_num_tc(dev, 0);
	}
}
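/* Note: in the offload case above, passing a zeroed tc_mqprio_qopt_offload
 * to ndo_setup_tc asks the driver to tear down its traffic-class state; the
 * software path instead clears the netdev mapping via netdev_set_num_tc().
 */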
static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
{
	int i, j;

	/* Verify num_tc is not out of max range */
	if (qopt->num_tc > TC_MAX_QUEUE)
		return -EINVAL;

	/* Verify priority mapping uses valid tcs */
	for (i = 0; i < TC_BITMASK + 1; i++) {
		if (qopt->prio_tc_map[i] >= qopt->num_tc)
			return -EINVAL;
	}

	/* Limit qopt->hw to maximum supported offload value. Drivers have
	 * the option of overriding this later if they don't support a given
	 * offload type.
	 */
	if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX)
		qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX;

	/* If hardware offload is requested we will leave it to the device
	 * to either populate the queue counts itself or to validate the
	 * provided queue counts. If ndo_setup_tc is not present then
	 * hardware doesn't support offload and we should return an error.
	 */
	if (qopt->hw)
		return dev->netdev_ops->ndo_setup_tc ? 0 : -EINVAL;

	for (i = 0; i < qopt->num_tc; i++) {
		unsigned int last = qopt->offset[i] + qopt->count[i];

		/* Verify the queue count is in tx range; being equal to
		 * real_num_tx_queues indicates the last queue is in use.
		 */
		if (qopt->offset[i] >= dev->real_num_tx_queues ||
		    !qopt->count[i] ||
		    last > dev->real_num_tx_queues)
			return -EINVAL;

		/* Verify that the offset and counts do not overlap */
		for (j = i + 1; j < qopt->num_tc; j++) {
			if (last > qopt->offset[j])
				return -EINVAL;
		}
	}

	return 0;
}
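/* Worked example (hypothetical values): on a device with
 * real_num_tx_queues = 4, a config of num_tc = 2, count = {2, 2},
 * offset = {0, 2} passes: last(0) = 2 <= offset[1] and last(1) = 4 <= 4.
 * Changing offset[1] to 1 would fail the overlap check, since
 * last(0) = 2 > 1.
 */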
static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = {
	[TCA_MQPRIO_MODE]	= { .len = sizeof(u16) },
	[TCA_MQPRIO_SHAPER]	= { .len = sizeof(u16) },
	[TCA_MQPRIO_MIN_RATE64]	= { .type = NLA_NESTED },
	[TCA_MQPRIO_MAX_RATE64]	= { .type = NLA_NESTED },
};
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy, NULL);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}
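/* parse_attr() exists because TCA_OPTIONS for mqprio is not a conventional
 * nested attribute: a legacy struct tc_mqprio_qopt comes first, and any
 * netlink attributes (mode, shaper, rates) follow it in the same payload:
 *
 *   | struct tc_mqprio_qopt | pad | nlattr | nlattr | ...
 *
 * so parsing must skip NLA_ALIGN(sizeof(struct tc_mqprio_qopt)) bytes
 * before handing the remainder to nla_parse().
 */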
static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
		       struct netlink_ext_ack *extack)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mqprio_sched *priv = qdisc_priv(sch);
	struct netdev_queue *dev_queue;
	struct Qdisc *qdisc;
	int i, err = -EOPNOTSUPP;
	struct tc_mqprio_qopt *qopt = NULL;
	struct nlattr *tb[TCA_MQPRIO_MAX + 1];
	struct nlattr *attr;
	int rem;
	int len;

	BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
	BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);

	if (sch->parent != TC_H_ROOT)
		return -EOPNOTSUPP;

	if (!netif_is_multiqueue(dev))
		return -EOPNOTSUPP;

	/* make certain can allocate enough classids to handle queues */
	if (dev->num_tx_queues >= TC_H_MIN_PRIORITY)
		return -ENOMEM;

	if (!opt || nla_len(opt) < sizeof(*qopt))
		return -EINVAL;

	qopt = nla_data(opt);
	if (mqprio_parse_opt(dev, qopt))
		return -EINVAL;

	len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt));
	if (len > 0) {
		err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy,
				 sizeof(*qopt));
		if (err < 0)
			return err;

		if (!qopt->hw)
			return -EINVAL;

		if (tb[TCA_MQPRIO_MODE]) {
			priv->flags |= TC_MQPRIO_F_MODE;
			priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]);
		}

		if (tb[TCA_MQPRIO_SHAPER]) {
			priv->flags |= TC_MQPRIO_F_SHAPER;
			priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]);
		}

		if (tb[TCA_MQPRIO_MIN_RATE64]) {
			if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
				return -EINVAL;
			i = 0;
			nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64],
					    rem) {
				if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64)
					return -EINVAL;
				if (i >= qopt->num_tc)
					break;
				priv->min_rate[i] = *(u64 *)nla_data(attr);
				i++;
			}
			priv->flags |= TC_MQPRIO_F_MIN_RATE;
		}

		if (tb[TCA_MQPRIO_MAX_RATE64]) {
			if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
				return -EINVAL;
			i = 0;
			nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64],
					    rem) {
				if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64)
					return -EINVAL;
				if (i >= qopt->num_tc)
					break;
				priv->max_rate[i] = *(u64 *)nla_data(attr);
				i++;
			}
			priv->flags |= TC_MQPRIO_F_MAX_RATE;
		}
	}

	/* pre-allocate qdisc, attachment can't fail */
	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
			       GFP_KERNEL);
	if (!priv->qdiscs)
		return -ENOMEM;

	for (i = 0; i < dev->num_tx_queues; i++) {
		dev_queue = netdev_get_tx_queue(dev, i);
		qdisc = qdisc_create_dflt(dev_queue,
					  get_default_qdisc_ops(dev, i),
					  TC_H_MAKE(TC_H_MAJ(sch->handle),
						    TC_H_MIN(i + 1)), extack);
		if (!qdisc)
			return -ENOMEM;

		priv->qdiscs[i] = qdisc;
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
	}

	/* If the mqprio options indicate that hardware should own
	 * the queue mapping then run ndo_setup_tc otherwise use the
	 * supplied and verified mapping
	 */
	if (qopt->hw) {
		struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt};

		switch (priv->mode) {
		case TC_MQPRIO_MODE_DCB:
			if (priv->shaper != TC_MQPRIO_SHAPER_DCB)
				return -EINVAL;
			break;
		case TC_MQPRIO_MODE_CHANNEL:
			mqprio.flags = priv->flags;
			if (priv->flags & TC_MQPRIO_F_MODE)
				mqprio.mode = priv->mode;
			if (priv->flags & TC_MQPRIO_F_SHAPER)
				mqprio.shaper = priv->shaper;
			if (priv->flags & TC_MQPRIO_F_MIN_RATE)
				for (i = 0; i < mqprio.qopt.num_tc; i++)
					mqprio.min_rate[i] = priv->min_rate[i];
			if (priv->flags & TC_MQPRIO_F_MAX_RATE)
				for (i = 0; i < mqprio.qopt.num_tc; i++)
					mqprio.max_rate[i] = priv->max_rate[i];
			break;
		default:
			return -EINVAL;
		}
		err = dev->netdev_ops->ndo_setup_tc(dev,
						    TC_SETUP_QDISC_MQPRIO,
						    &mqprio);
		if (err)
			return err;

		priv->hw_offload = mqprio.qopt.hw;
	} else {
		netdev_set_num_tc(dev, qopt->num_tc);
		for (i = 0; i < qopt->num_tc; i++)
			netdev_set_tc_queue(dev, i,
					    qopt->count[i], qopt->offset[i]);
	}

	/* Always use supplied priority mappings */
	for (i = 0; i < TC_BITMASK + 1; i++)
		netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);

	sch->flags |= TCQ_F_MQROOT;

	return 0;
}
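/* Illustrative full-offload invocation exercising the channel mode and
 * bw_rlimit shaper handled above (iproute2 syntax; the device and rates
 * are examples, assuming a driver that implements this offload):
 *
 *   tc qdisc add dev eth0 root mqprio num_tc 2 \
 *      map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 \
 *      queues 4@0 4@4 hw 1 mode channel \
 *      shaper bw_rlimit min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit
 *
 * With hw 1 the queue layout and rates are handed to the driver through
 * ndo_setup_tc(TC_SETUP_QDISC_MQPRIO) rather than applied in software.
 */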
static void mqprio_attach(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mqprio_sched *priv = qdisc_priv(sch);
	struct Qdisc *qdisc, *old;
	unsigned int ntx;

	/* Attach underlying qdisc */
	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
		qdisc = priv->qdiscs[ntx];
		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
		if (old)
			qdisc_destroy(old);
		if (ntx < dev->real_num_tx_queues)
			qdisc_hash_add(qdisc, false);
	}
	kfree(priv->qdiscs);
	priv->qdiscs = NULL;
}
static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
					     unsigned long cl)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned long ntx = cl - 1;

	if (ntx >= dev->num_tx_queues)
		return NULL;
	return netdev_get_tx_queue(dev, ntx);
}
static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
			struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct net_device *dev = qdisc_dev(sch);
	struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);

	if (!dev_queue)
		return -EINVAL;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	*old = dev_graft_qdisc(dev_queue, new);

	if (new)
		new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return 0;
}
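/* Grafting swaps the per-queue qdisc pointer, so if the device is up the
 * TX path is quiesced with dev_deactivate() first and restarted with
 * dev_activate() afterwards; otherwise a queue could transmit through a
 * qdisc that is in the middle of being replaced.
 */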
static int dump_rates(struct mqprio_sched *priv,
		      struct tc_mqprio_qopt *opt, struct sk_buff *skb)
{
	struct nlattr *nest;
	int i;

	if (priv->flags & TC_MQPRIO_F_MIN_RATE) {
		nest = nla_nest_start(skb, TCA_MQPRIO_MIN_RATE64);
		if (!nest)
			goto nla_put_failure;

		for (i = 0; i < opt->num_tc; i++) {
			if (nla_put(skb, TCA_MQPRIO_MIN_RATE64,
				    sizeof(priv->min_rate[i]),
				    &priv->min_rate[i]))
				goto nla_put_failure;
		}
		nla_nest_end(skb, nest);
	}

	if (priv->flags & TC_MQPRIO_F_MAX_RATE) {
		nest = nla_nest_start(skb, TCA_MQPRIO_MAX_RATE64);
		if (!nest)
			goto nla_put_failure;

		for (i = 0; i < opt->num_tc; i++) {
			if (nla_put(skb, TCA_MQPRIO_MAX_RATE64,
				    sizeof(priv->max_rate[i]),
				    &priv->max_rate[i]))
				goto nla_put_failure;
		}
		nla_nest_end(skb, nest);
	}
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mqprio_sched *priv = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb);
	struct tc_mqprio_qopt opt = { 0 };
	struct Qdisc *qdisc;
	unsigned int ntx, tc;

	sch->q.qlen = 0;
	memset(&sch->bstats, 0, sizeof(sch->bstats));
	memset(&sch->qstats, 0, sizeof(sch->qstats));

	/* MQ supports lockless qdiscs. However, statistics accounting needs
	 * to account for all, none, or a mix of locked and unlocked child
	 * qdiscs. Percpu stats are added to counters in-band and locking
	 * qdisc totals are added at end.
	 */
	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
		qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
		spin_lock_bh(qdisc_lock(qdisc));

		if (qdisc_is_percpu_stats(qdisc)) {
			__u32 qlen = qdisc_qlen_sum(qdisc);

			__gnet_stats_copy_basic(NULL, &sch->bstats,
						qdisc->cpu_bstats,
						&qdisc->bstats);
			__gnet_stats_copy_queue(&sch->qstats,
						qdisc->cpu_qstats,
						&qdisc->qstats, qlen);
		} else {
			sch->q.qlen		+= qdisc->q.qlen;
			sch->bstats.bytes	+= qdisc->bstats.bytes;
			sch->bstats.packets	+= qdisc->bstats.packets;
			sch->qstats.backlog	+= qdisc->qstats.backlog;
			sch->qstats.drops	+= qdisc->qstats.drops;
			sch->qstats.requeues	+= qdisc->qstats.requeues;
			sch->qstats.overlimits	+= qdisc->qstats.overlimits;
		}

		spin_unlock_bh(qdisc_lock(qdisc));
	}

	opt.num_tc = netdev_get_num_tc(dev);
	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
	opt.hw = priv->hw_offload;

	for (tc = 0; tc < netdev_get_num_tc(dev); tc++) {
		opt.count[tc] = dev->tc_to_txq[tc].count;
		opt.offset[tc] = dev->tc_to_txq[tc].offset;
	}

	if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt))
		goto nla_put_failure;

	if ((priv->flags & TC_MQPRIO_F_MODE) &&
	    nla_put_u16(skb, TCA_MQPRIO_MODE, priv->mode))
		goto nla_put_failure;

	if ((priv->flags & TC_MQPRIO_F_SHAPER) &&
	    nla_put_u16(skb, TCA_MQPRIO_SHAPER, priv->shaper))
		goto nla_put_failure;

	if ((priv->flags & TC_MQPRIO_F_MIN_RATE ||
	     priv->flags & TC_MQPRIO_F_MAX_RATE) &&
	    (dump_rates(priv, &opt, skb) != 0))
		goto nla_put_failure;

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}
static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
{
	struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);

	if (!dev_queue)
		return NULL;

	return dev_queue->qdisc_sleeping;
}
static unsigned long mqprio_find(struct Qdisc *sch, u32 classid)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned int ntx = TC_H_MIN(classid);

	/* There are essentially two regions here that have valid classid
	 * values. The first region will have a classid value of 1 through
	 * num_tx_queues. All of these are backed by actual Qdiscs.
	 */
	if (ntx < TC_H_MIN_PRIORITY)
		return (ntx <= dev->num_tx_queues) ? ntx : 0;

	/* The second region represents the hardware traffic classes. These
	 * are represented by classid values of TC_H_MIN_PRIORITY through
	 * TC_H_MIN_PRIORITY + netdev_get_num_tc - 1.
	 */
	return ((ntx - TC_H_MIN_PRIORITY) < netdev_get_num_tc(dev)) ? ntx : 0;
}
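/* Classid layout used by mqprio_find() and the class ops below, e.g. for
 * a 4-queue device with 2 traffic classes (TC_H_MIN_PRIORITY is 0xFFE0):
 *
 *   :1 .. :4          per-queue child qdiscs
 *   :0xffe0, :0xffe1  virtual classes, one per traffic class
 */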
static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
			     struct sk_buff *skb, struct tcmsg *tcm)
{
	if (cl < TC_H_MIN_PRIORITY) {
		struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
		struct net_device *dev = qdisc_dev(sch);
		int tc = netdev_txq_to_tc(dev, cl - 1);

		tcm->tcm_parent = (tc < 0) ? 0 :
			TC_H_MAKE(TC_H_MAJ(sch->handle),
				  TC_H_MIN(tc + TC_H_MIN_PRIORITY));
		tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
	} else {
		tcm->tcm_parent = TC_H_ROOT;
		tcm->tcm_info = 0;
	}
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}
static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				   struct gnet_dump *d)
	__releases(d->lock)
	__acquires(d->lock)
{
	if (cl >= TC_H_MIN_PRIORITY) {
		int i;
		__u32 qlen = 0;
		struct gnet_stats_queue qstats = {0};
		struct gnet_stats_basic_packed bstats = {0};
		struct net_device *dev = qdisc_dev(sch);
		struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK];

		/* Drop lock here; it will be reclaimed before touching
		 * statistics. This is required because the d->lock we
		 * hold here is the lock on dev_queue->qdisc_sleeping,
		 * which is also acquired below.
		 */
		if (d->lock)
			spin_unlock_bh(d->lock);

		for (i = tc.offset; i < tc.offset + tc.count; i++) {
			struct netdev_queue *q = netdev_get_tx_queue(dev, i);
			struct Qdisc *qdisc = rtnl_dereference(q->qdisc);
			struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
			struct gnet_stats_queue __percpu *cpu_qstats = NULL;

			spin_lock_bh(qdisc_lock(qdisc));
			if (qdisc_is_percpu_stats(qdisc)) {
				cpu_bstats = qdisc->cpu_bstats;
				cpu_qstats = qdisc->cpu_qstats;
			}

			qlen = qdisc_qlen_sum(qdisc);
			__gnet_stats_copy_basic(NULL, &sch->bstats,
						cpu_bstats, &qdisc->bstats);
			__gnet_stats_copy_queue(&sch->qstats,
						cpu_qstats,
						&qdisc->qstats,
						qlen);
			spin_unlock_bh(qdisc_lock(qdisc));
		}

		/* Reclaim root sleeping lock before completing stats */
		if (d->lock)
			spin_lock_bh(d->lock);
		if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 ||
		    gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0)
			return -1;
	} else {
		struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);

		sch = dev_queue->qdisc_sleeping;
		if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
					  d, NULL, &sch->bstats) < 0 ||
		    gnet_stats_copy_queue(d, NULL,
					  &sch->qstats, sch->q.qlen) < 0)
			return -1;
	}
	return 0;
}
static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned long ntx;

	if (arg->stop)
		return;

	/* Walk hierarchy with a virtual class per tc */
	arg->count = arg->skip;
	for (ntx = arg->skip; ntx < netdev_get_num_tc(dev); ntx++) {
		if (arg->fn(sch, ntx + TC_H_MIN_PRIORITY, arg) < 0) {
			arg->stop = 1;
			return;
		}
		arg->count++;
	}

	/* Pad the values and skip over unused traffic classes */
	if (ntx < TC_MAX_QUEUE) {
		arg->count = TC_MAX_QUEUE;
		ntx = TC_MAX_QUEUE;
	}

	/* Reset offset, sort out remaining per-queue qdiscs */
	for (ntx -= TC_MAX_QUEUE; ntx < dev->num_tx_queues; ntx++) {
		if (arg->fn(sch, ntx + 1, arg) < 0) {
			arg->stop = 1;
			return;
		}
		arg->count++;
	}
}
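/* The walk visits the virtual traffic-class classes first, then pads
 * arg->count up to TC_MAX_QUEUE so that arg->skip values beyond the
 * configured tcs resume cleanly in the second loop over the per-queue
 * qdiscs (classids :1 .. :num_tx_queues).
 */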
static struct netdev_queue *mqprio_select_queue(struct Qdisc *sch,
						struct tcmsg *tcm)
{
	return mqprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
}
static const struct Qdisc_class_ops mqprio_class_ops = {
	.graft		= mqprio_graft,
	.leaf		= mqprio_leaf,
	.find		= mqprio_find,
	.walk		= mqprio_walk,
	.dump		= mqprio_dump_class,
	.dump_stats	= mqprio_dump_class_stats,
	.select_queue	= mqprio_select_queue,
};
static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
	.cl_ops		= &mqprio_class_ops,
	.id		= "mqprio",
	.priv_size	= sizeof(struct mqprio_sched),
	.init		= mqprio_init,
	.destroy	= mqprio_destroy,
	.attach		= mqprio_attach,
	.dump		= mqprio_dump,
	.owner		= THIS_MODULE,
};
static int __init mqprio_module_init(void)
{
	return register_qdisc(&mqprio_qdisc_ops);
}

static void __exit mqprio_module_exit(void)
{
	unregister_qdisc(&mqprio_qdisc_ops);
}

module_init(mqprio_module_init);
module_exit(mqprio_module_exit);

MODULE_LICENSE("GPL");