1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * net/sched/sch_api.c Packet scheduler API.
5 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
29 #include <net/net_namespace.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
35 #include <trace/events/qdisc.h>
42 This file consists of two interrelated parts:
44 1. queueing disciplines manager frontend.
45 2. traffic classes manager frontend.
47 Generally, queueing discipline ("qdisc") is a black box,
48 which is able to enqueue packets and to dequeue them (when
49 device is ready to send something) in order and at times
50 determined by algorithm hidden in it.
52 qdiscs are divided into two categories:
53 - "queues", which have no internal structure visible from outside.
54 - "schedulers", which split all the packets into "traffic classes",
55 using "packet classifiers" (look at cls_api.c)
57 In turn, classes may have child qdiscs (as a rule, queues)
58 attached to them etc. etc. etc.
60 The goal of the routines in this file is to translate
61 information supplied by the user in the form of handles
62 into a form more intelligible to the kernel, to perform sanity
63 checks and the part of the work that is common to all qdiscs,
64 and to provide rtnetlink notifications.
66 All real intelligent work is done inside qdisc modules.
70 Every discipline has two major routines: enqueue and dequeue.
74 dequeue usually returns an skb to send. It is allowed to return NULL,
75 but that does not mean that the queue is empty; it just means that the
76 discipline does not want to send anything this time.
77 The queue is really empty if q->q.qlen == 0.
78 For complicated disciplines with multiple queues, q->q is not a
79 real packet queue, but q->q.qlen must nevertheless be valid.
83 enqueue returns 0, if packet was enqueued successfully.
84 If packet (this one or another one) was dropped, it returns
86 NET_XMIT_DROP - this packet dropped
87 Expected action: do not backoff, but wait until queue will clear.
88 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
89 Expected action: backoff or ignore
95 like dequeue but without removing a packet from the queue
99 returns qdisc to initial state: purge all buffers, clear all
100 timers, counters (except for statistics) etc.
104 initializes newly created qdisc.
108 destroys resources allocated by init and during lifetime of qdisc.
112 changes qdisc parameters.
115 /* Protects the list of registered TC modules. It is a pure SMP lock. */
116 static DEFINE_RWLOCK(qdisc_mod_lock
);
119 /************************************************
120 * Queueing disciplines manipulation. *
121 ************************************************/
124 /* The list of all installed queueing disciplines. */
126 static struct Qdisc_ops
*qdisc_base
;
128 /* Register/unregister queueing discipline */
130 int register_qdisc(struct Qdisc_ops
*qops
)
132 struct Qdisc_ops
*q
, **qp
;
135 write_lock(&qdisc_mod_lock
);
136 for (qp
= &qdisc_base
; (q
= *qp
) != NULL
; qp
= &q
->next
)
137 if (!strcmp(qops
->id
, q
->id
))
140 if (qops
->enqueue
== NULL
)
141 qops
->enqueue
= noop_qdisc_ops
.enqueue
;
142 if (qops
->peek
== NULL
) {
143 if (qops
->dequeue
== NULL
)
144 qops
->peek
= noop_qdisc_ops
.peek
;
148 if (qops
->dequeue
== NULL
)
149 qops
->dequeue
= noop_qdisc_ops
.dequeue
;
152 const struct Qdisc_class_ops
*cops
= qops
->cl_ops
;
154 if (!(cops
->find
&& cops
->walk
&& cops
->leaf
))
157 if (cops
->tcf_block
&& !(cops
->bind_tcf
&& cops
->unbind_tcf
))
165 write_unlock(&qdisc_mod_lock
);
172 EXPORT_SYMBOL(register_qdisc
);
174 int unregister_qdisc(struct Qdisc_ops
*qops
)
176 struct Qdisc_ops
*q
, **qp
;
179 write_lock(&qdisc_mod_lock
);
180 for (qp
= &qdisc_base
; (q
= *qp
) != NULL
; qp
= &q
->next
)
188 write_unlock(&qdisc_mod_lock
);
191 EXPORT_SYMBOL(unregister_qdisc
);
193 /* Get default qdisc if not otherwise specified */
194 void qdisc_get_default(char *name
, size_t len
)
196 read_lock(&qdisc_mod_lock
);
197 strlcpy(name
, default_qdisc_ops
->id
, len
);
198 read_unlock(&qdisc_mod_lock
);
201 static struct Qdisc_ops
*qdisc_lookup_default(const char *name
)
203 struct Qdisc_ops
*q
= NULL
;
205 for (q
= qdisc_base
; q
; q
= q
->next
) {
206 if (!strcmp(name
, q
->id
)) {
207 if (!try_module_get(q
->owner
))
216 /* Set new default qdisc to use */
217 int qdisc_set_default(const char *name
)
219 const struct Qdisc_ops
*ops
;
221 if (!capable(CAP_NET_ADMIN
))
224 write_lock(&qdisc_mod_lock
);
225 ops
= qdisc_lookup_default(name
);
227 /* Not found, drop lock and try to load module */
228 write_unlock(&qdisc_mod_lock
);
229 request_module("sch_%s", name
);
230 write_lock(&qdisc_mod_lock
);
232 ops
= qdisc_lookup_default(name
);
236 /* Set new default */
237 module_put(default_qdisc_ops
->owner
);
238 default_qdisc_ops
= ops
;
240 write_unlock(&qdisc_mod_lock
);
242 return ops
? 0 : -ENOENT
;
245 #ifdef CONFIG_NET_SCH_DEFAULT
246 /* Set default value from kernel config */
247 static int __init
sch_default_qdisc(void)
249 return qdisc_set_default(CONFIG_DEFAULT_NET_SCH
);
251 late_initcall(sch_default_qdisc
);
254 /* We know the handle. Find the qdisc among all qdiscs attached to the device
255 * (root qdisc, all its children, children of children etc.)
256 * Note: caller either uses rtnl or rcu_read_lock()
259 static struct Qdisc
*qdisc_match_from_root(struct Qdisc
*root
, u32 handle
)
263 if (!qdisc_dev(root
))
264 return (root
->handle
== handle
? root
: NULL
);
266 if (!(root
->flags
& TCQ_F_BUILTIN
) &&
267 root
->handle
== handle
)
270 hash_for_each_possible_rcu(qdisc_dev(root
)->qdisc_hash
, q
, hash
, handle
,
271 lockdep_rtnl_is_held()) {
272 if (q
->handle
== handle
)
278 void qdisc_hash_add(struct Qdisc
*q
, bool invisible
)
280 if ((q
->parent
!= TC_H_ROOT
) && !(q
->flags
& TCQ_F_INGRESS
)) {
282 hash_add_rcu(qdisc_dev(q
)->qdisc_hash
, &q
->hash
, q
->handle
);
284 q
->flags
|= TCQ_F_INVISIBLE
;
287 EXPORT_SYMBOL(qdisc_hash_add
);
289 void qdisc_hash_del(struct Qdisc
*q
)
291 if ((q
->parent
!= TC_H_ROOT
) && !(q
->flags
& TCQ_F_INGRESS
)) {
293 hash_del_rcu(&q
->hash
);
296 EXPORT_SYMBOL(qdisc_hash_del
);
298 struct Qdisc
*qdisc_lookup(struct net_device
*dev
, u32 handle
)
304 q
= qdisc_match_from_root(dev
->qdisc
, handle
);
308 if (dev_ingress_queue(dev
))
309 q
= qdisc_match_from_root(
310 dev_ingress_queue(dev
)->qdisc_sleeping
,
316 struct Qdisc
*qdisc_lookup_rcu(struct net_device
*dev
, u32 handle
)
318 struct netdev_queue
*nq
;
323 q
= qdisc_match_from_root(dev
->qdisc
, handle
);
327 nq
= dev_ingress_queue_rcu(dev
);
329 q
= qdisc_match_from_root(nq
->qdisc_sleeping
, handle
);
334 static struct Qdisc
*qdisc_leaf(struct Qdisc
*p
, u32 classid
)
337 const struct Qdisc_class_ops
*cops
= p
->ops
->cl_ops
;
341 cl
= cops
->find(p
, classid
);
345 return cops
->leaf(p
, cl
);
348 /* Find queueing discipline by name */
350 static struct Qdisc_ops
*qdisc_lookup_ops(struct nlattr
*kind
)
352 struct Qdisc_ops
*q
= NULL
;
355 read_lock(&qdisc_mod_lock
);
356 for (q
= qdisc_base
; q
; q
= q
->next
) {
357 if (nla_strcmp(kind
, q
->id
) == 0) {
358 if (!try_module_get(q
->owner
))
363 read_unlock(&qdisc_mod_lock
);
368 /* The linklayer setting was not transferred from iproute2 in older
369 * versions, and the rate table lookup systems have been dropped in
370 * the kernel. To stay backward compatible with older iproute2 tc
371 * utils, we detect the linklayer setting by detecting whether the rate
372 * table was modified.
374 * For linklayer ATM table entries, the rate table will be aligned to
375 * 48 bytes, thus some table entries will contain the same value. The
376 * mpu (min packet unit) is also encoded into the old rate table, thus
377 * starting from the mpu, we find low and high table entries for
378 * mapping this cell. If these entries contain the same value, then
379 * the rate tables have been modified for linklayer ATM.
381 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
382 * and then roundup to the next cell, calc the table entry one below,
385 static __u8
__detect_linklayer(struct tc_ratespec
*r
, __u32
*rtab
)
387 int low
= roundup(r
->mpu
, 48);
388 int high
= roundup(low
+1, 48);
389 int cell_low
= low
>> r
->cell_log
;
390 int cell_high
= (high
>> r
->cell_log
) - 1;
392 /* rtab is too inaccurate at rates > 100Mbit/s */
393 if ((r
->rate
> (100000000/8)) || (rtab
[0] == 0)) {
394 pr_debug("TC linklayer: Giving up ATM detection\n");
395 return TC_LINKLAYER_ETHERNET
;
398 if ((cell_high
> cell_low
) && (cell_high
< 256)
399 && (rtab
[cell_low
] == rtab
[cell_high
])) {
400 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
401 cell_low
, cell_high
, rtab
[cell_high
]);
402 return TC_LINKLAYER_ATM
;
404 return TC_LINKLAYER_ETHERNET
;
407 static struct qdisc_rate_table
*qdisc_rtab_list
;
409 struct qdisc_rate_table
*qdisc_get_rtab(struct tc_ratespec
*r
,
411 struct netlink_ext_ack
*extack
)
413 struct qdisc_rate_table
*rtab
;
415 if (tab
== NULL
|| r
->rate
== 0 || r
->cell_log
== 0 ||
416 nla_len(tab
) != TC_RTAB_SIZE
) {
417 NL_SET_ERR_MSG(extack
, "Invalid rate table parameters for searching");
421 for (rtab
= qdisc_rtab_list
; rtab
; rtab
= rtab
->next
) {
422 if (!memcmp(&rtab
->rate
, r
, sizeof(struct tc_ratespec
)) &&
423 !memcmp(&rtab
->data
, nla_data(tab
), 1024)) {
429 rtab
= kmalloc(sizeof(*rtab
), GFP_KERNEL
);
433 memcpy(rtab
->data
, nla_data(tab
), 1024);
434 if (r
->linklayer
== TC_LINKLAYER_UNAWARE
)
435 r
->linklayer
= __detect_linklayer(r
, rtab
->data
);
436 rtab
->next
= qdisc_rtab_list
;
437 qdisc_rtab_list
= rtab
;
439 NL_SET_ERR_MSG(extack
, "Failed to allocate new qdisc rate table");
443 EXPORT_SYMBOL(qdisc_get_rtab
);
445 void qdisc_put_rtab(struct qdisc_rate_table
*tab
)
447 struct qdisc_rate_table
*rtab
, **rtabp
;
449 if (!tab
|| --tab
->refcnt
)
452 for (rtabp
= &qdisc_rtab_list
;
453 (rtab
= *rtabp
) != NULL
;
454 rtabp
= &rtab
->next
) {
462 EXPORT_SYMBOL(qdisc_put_rtab
);
464 static LIST_HEAD(qdisc_stab_list
);
466 static const struct nla_policy stab_policy
[TCA_STAB_MAX
+ 1] = {
467 [TCA_STAB_BASE
] = { .len
= sizeof(struct tc_sizespec
) },
468 [TCA_STAB_DATA
] = { .type
= NLA_BINARY
},
471 static struct qdisc_size_table
*qdisc_get_stab(struct nlattr
*opt
,
472 struct netlink_ext_ack
*extack
)
474 struct nlattr
*tb
[TCA_STAB_MAX
+ 1];
475 struct qdisc_size_table
*stab
;
476 struct tc_sizespec
*s
;
477 unsigned int tsize
= 0;
481 err
= nla_parse_nested_deprecated(tb
, TCA_STAB_MAX
, opt
, stab_policy
,
485 if (!tb
[TCA_STAB_BASE
]) {
486 NL_SET_ERR_MSG(extack
, "Size table base attribute is missing");
487 return ERR_PTR(-EINVAL
);
490 s
= nla_data(tb
[TCA_STAB_BASE
]);
493 if (!tb
[TCA_STAB_DATA
]) {
494 NL_SET_ERR_MSG(extack
, "Size table data attribute is missing");
495 return ERR_PTR(-EINVAL
);
497 tab
= nla_data(tb
[TCA_STAB_DATA
]);
498 tsize
= nla_len(tb
[TCA_STAB_DATA
]) / sizeof(u16
);
501 if (tsize
!= s
->tsize
|| (!tab
&& tsize
> 0)) {
502 NL_SET_ERR_MSG(extack
, "Invalid size of size table");
503 return ERR_PTR(-EINVAL
);
506 list_for_each_entry(stab
, &qdisc_stab_list
, list
) {
507 if (memcmp(&stab
->szopts
, s
, sizeof(*s
)))
509 if (tsize
> 0 && memcmp(stab
->data
, tab
, tsize
* sizeof(u16
)))
515 stab
= kmalloc(sizeof(*stab
) + tsize
* sizeof(u16
), GFP_KERNEL
);
517 return ERR_PTR(-ENOMEM
);
522 memcpy(stab
->data
, tab
, tsize
* sizeof(u16
));
524 list_add_tail(&stab
->list
, &qdisc_stab_list
);
529 void qdisc_put_stab(struct qdisc_size_table
*tab
)
534 if (--tab
->refcnt
== 0) {
535 list_del(&tab
->list
);
539 EXPORT_SYMBOL(qdisc_put_stab
);
541 static int qdisc_dump_stab(struct sk_buff
*skb
, struct qdisc_size_table
*stab
)
545 nest
= nla_nest_start_noflag(skb
, TCA_STAB
);
547 goto nla_put_failure
;
548 if (nla_put(skb
, TCA_STAB_BASE
, sizeof(stab
->szopts
), &stab
->szopts
))
549 goto nla_put_failure
;
550 nla_nest_end(skb
, nest
);
558 void __qdisc_calculate_pkt_len(struct sk_buff
*skb
,
559 const struct qdisc_size_table
*stab
)
563 pkt_len
= skb
->len
+ stab
->szopts
.overhead
;
564 if (unlikely(!stab
->szopts
.tsize
))
567 slot
= pkt_len
+ stab
->szopts
.cell_align
;
568 if (unlikely(slot
< 0))
571 slot
>>= stab
->szopts
.cell_log
;
572 if (likely(slot
< stab
->szopts
.tsize
))
573 pkt_len
= stab
->data
[slot
];
575 pkt_len
= stab
->data
[stab
->szopts
.tsize
- 1] *
576 (slot
/ stab
->szopts
.tsize
) +
577 stab
->data
[slot
% stab
->szopts
.tsize
];
579 pkt_len
<<= stab
->szopts
.size_log
;
581 if (unlikely(pkt_len
< 1))
583 qdisc_skb_cb(skb
)->pkt_len
= pkt_len
;
585 EXPORT_SYMBOL(__qdisc_calculate_pkt_len
);
587 void qdisc_warn_nonwc(const char *txt
, struct Qdisc
*qdisc
)
589 if (!(qdisc
->flags
& TCQ_F_WARN_NONWC
)) {
590 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
591 txt
, qdisc
->ops
->id
, qdisc
->handle
>> 16);
592 qdisc
->flags
|= TCQ_F_WARN_NONWC
;
595 EXPORT_SYMBOL(qdisc_warn_nonwc
);
597 static enum hrtimer_restart
qdisc_watchdog(struct hrtimer
*timer
)
599 struct qdisc_watchdog
*wd
= container_of(timer
, struct qdisc_watchdog
,
603 __netif_schedule(qdisc_root(wd
->qdisc
));
606 return HRTIMER_NORESTART
;
609 void qdisc_watchdog_init_clockid(struct qdisc_watchdog
*wd
, struct Qdisc
*qdisc
,
612 hrtimer_init(&wd
->timer
, clockid
, HRTIMER_MODE_ABS_PINNED
);
613 wd
->timer
.function
= qdisc_watchdog
;
616 EXPORT_SYMBOL(qdisc_watchdog_init_clockid
);
618 void qdisc_watchdog_init(struct qdisc_watchdog
*wd
, struct Qdisc
*qdisc
)
620 qdisc_watchdog_init_clockid(wd
, qdisc
, CLOCK_MONOTONIC
);
622 EXPORT_SYMBOL(qdisc_watchdog_init
);
624 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog
*wd
, u64 expires
,
627 if (test_bit(__QDISC_STATE_DEACTIVATED
,
628 &qdisc_root_sleeping(wd
->qdisc
)->state
))
631 if (hrtimer_is_queued(&wd
->timer
)) {
632 /* If timer is already set in [expires, expires + delta_ns],
633 * do not reprogram it.
635 if (wd
->last_expires
- expires
<= delta_ns
)
639 wd
->last_expires
= expires
;
640 hrtimer_start_range_ns(&wd
->timer
,
641 ns_to_ktime(expires
),
643 HRTIMER_MODE_ABS_PINNED
);
645 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns
);
647 void qdisc_watchdog_cancel(struct qdisc_watchdog
*wd
)
649 hrtimer_cancel(&wd
->timer
);
651 EXPORT_SYMBOL(qdisc_watchdog_cancel
);
653 static struct hlist_head
*qdisc_class_hash_alloc(unsigned int n
)
655 struct hlist_head
*h
;
658 h
= kvmalloc_array(n
, sizeof(struct hlist_head
), GFP_KERNEL
);
661 for (i
= 0; i
< n
; i
++)
662 INIT_HLIST_HEAD(&h
[i
]);
667 void qdisc_class_hash_grow(struct Qdisc
*sch
, struct Qdisc_class_hash
*clhash
)
669 struct Qdisc_class_common
*cl
;
670 struct hlist_node
*next
;
671 struct hlist_head
*nhash
, *ohash
;
672 unsigned int nsize
, nmask
, osize
;
675 /* Rehash when load factor exceeds 0.75 */
676 if (clhash
->hashelems
* 4 <= clhash
->hashsize
* 3)
678 nsize
= clhash
->hashsize
* 2;
680 nhash
= qdisc_class_hash_alloc(nsize
);
684 ohash
= clhash
->hash
;
685 osize
= clhash
->hashsize
;
688 for (i
= 0; i
< osize
; i
++) {
689 hlist_for_each_entry_safe(cl
, next
, &ohash
[i
], hnode
) {
690 h
= qdisc_class_hash(cl
->classid
, nmask
);
691 hlist_add_head(&cl
->hnode
, &nhash
[h
]);
694 clhash
->hash
= nhash
;
695 clhash
->hashsize
= nsize
;
696 clhash
->hashmask
= nmask
;
697 sch_tree_unlock(sch
);
701 EXPORT_SYMBOL(qdisc_class_hash_grow
);
703 int qdisc_class_hash_init(struct Qdisc_class_hash
*clhash
)
705 unsigned int size
= 4;
707 clhash
->hash
= qdisc_class_hash_alloc(size
);
710 clhash
->hashsize
= size
;
711 clhash
->hashmask
= size
- 1;
712 clhash
->hashelems
= 0;
715 EXPORT_SYMBOL(qdisc_class_hash_init
);
717 void qdisc_class_hash_destroy(struct Qdisc_class_hash
*clhash
)
719 kvfree(clhash
->hash
);
721 EXPORT_SYMBOL(qdisc_class_hash_destroy
);
723 void qdisc_class_hash_insert(struct Qdisc_class_hash
*clhash
,
724 struct Qdisc_class_common
*cl
)
728 INIT_HLIST_NODE(&cl
->hnode
);
729 h
= qdisc_class_hash(cl
->classid
, clhash
->hashmask
);
730 hlist_add_head(&cl
->hnode
, &clhash
->hash
[h
]);
733 EXPORT_SYMBOL(qdisc_class_hash_insert
);
735 void qdisc_class_hash_remove(struct Qdisc_class_hash
*clhash
,
736 struct Qdisc_class_common
*cl
)
738 hlist_del(&cl
->hnode
);
741 EXPORT_SYMBOL(qdisc_class_hash_remove
);
743 /* Allocate a unique handle from the space managed by the kernel
744 * Possible range is [8000-FFFF]:0000 (0x8000 values)
746 static u32
qdisc_alloc_handle(struct net_device
*dev
)
749 static u32 autohandle
= TC_H_MAKE(0x80000000U
, 0);
752 autohandle
+= TC_H_MAKE(0x10000U
, 0);
753 if (autohandle
== TC_H_MAKE(TC_H_ROOT
, 0))
754 autohandle
= TC_H_MAKE(0x80000000U
, 0);
755 if (!qdisc_lookup(dev
, autohandle
))
763 void qdisc_tree_reduce_backlog(struct Qdisc
*sch
, int n
, int len
)
765 bool qdisc_is_offloaded
= sch
->flags
& TCQ_F_OFFLOADED
;
766 const struct Qdisc_class_ops
*cops
;
772 if (n
== 0 && len
== 0)
774 drops
= max_t(int, n
, 0);
776 while ((parentid
= sch
->parent
)) {
777 if (TC_H_MAJ(parentid
) == TC_H_MAJ(TC_H_INGRESS
))
780 if (sch
->flags
& TCQ_F_NOPARENT
)
782 /* Notify parent qdisc only if child qdisc becomes empty.
784 * If child was empty even before update then backlog
785 * counter is screwed and we skip notification because
786 * parent class is already passive.
788 * If the original child was offloaded then it is allowed
789 * to be seen as empty, so the parent is notified anyway.
791 notify
= !sch
->q
.qlen
&& !WARN_ON_ONCE(!n
&&
792 !qdisc_is_offloaded
);
793 /* TODO: perform the search on a per txq basis */
794 sch
= qdisc_lookup(qdisc_dev(sch
), TC_H_MAJ(parentid
));
796 WARN_ON_ONCE(parentid
!= TC_H_ROOT
);
799 cops
= sch
->ops
->cl_ops
;
800 if (notify
&& cops
->qlen_notify
) {
801 cl
= cops
->find(sch
, parentid
);
802 cops
->qlen_notify(sch
, cl
);
805 sch
->qstats
.backlog
-= len
;
806 __qdisc_qstats_drop(sch
, drops
);
810 EXPORT_SYMBOL(qdisc_tree_reduce_backlog
);
812 int qdisc_offload_dump_helper(struct Qdisc
*sch
, enum tc_setup_type type
,
815 struct net_device
*dev
= qdisc_dev(sch
);
818 sch
->flags
&= ~TCQ_F_OFFLOADED
;
819 if (!tc_can_offload(dev
) || !dev
->netdev_ops
->ndo_setup_tc
)
822 err
= dev
->netdev_ops
->ndo_setup_tc(dev
, type
, type_data
);
823 if (err
== -EOPNOTSUPP
)
827 sch
->flags
|= TCQ_F_OFFLOADED
;
831 EXPORT_SYMBOL(qdisc_offload_dump_helper
);
833 void qdisc_offload_graft_helper(struct net_device
*dev
, struct Qdisc
*sch
,
834 struct Qdisc
*new, struct Qdisc
*old
,
835 enum tc_setup_type type
, void *type_data
,
836 struct netlink_ext_ack
*extack
)
838 bool any_qdisc_is_offloaded
;
841 if (!tc_can_offload(dev
) || !dev
->netdev_ops
->ndo_setup_tc
)
844 err
= dev
->netdev_ops
->ndo_setup_tc(dev
, type
, type_data
);
846 /* Don't report error if the graft is part of destroy operation. */
847 if (!err
|| !new || new == &noop_qdisc
)
850 /* Don't report error if the parent, the old child and the new
851 * one are not offloaded.
853 any_qdisc_is_offloaded
= new->flags
& TCQ_F_OFFLOADED
;
854 any_qdisc_is_offloaded
|= sch
&& sch
->flags
& TCQ_F_OFFLOADED
;
855 any_qdisc_is_offloaded
|= old
&& old
->flags
& TCQ_F_OFFLOADED
;
857 if (any_qdisc_is_offloaded
)
858 NL_SET_ERR_MSG(extack
, "Offloading graft operation failed.");
860 EXPORT_SYMBOL(qdisc_offload_graft_helper
);
862 static void qdisc_offload_graft_root(struct net_device
*dev
,
863 struct Qdisc
*new, struct Qdisc
*old
,
864 struct netlink_ext_ack
*extack
)
866 struct tc_root_qopt_offload graft_offload
= {
867 .command
= TC_ROOT_GRAFT
,
868 .handle
= new ? new->handle
: 0,
869 .ingress
= (new && new->flags
& TCQ_F_INGRESS
) ||
870 (old
&& old
->flags
& TCQ_F_INGRESS
),
873 qdisc_offload_graft_helper(dev
, NULL
, new, old
,
874 TC_SETUP_ROOT_QDISC
, &graft_offload
, extack
);
877 static int tc_fill_qdisc(struct sk_buff
*skb
, struct Qdisc
*q
, u32 clid
,
878 u32 portid
, u32 seq
, u16 flags
, int event
)
880 struct gnet_stats_basic_cpu __percpu
*cpu_bstats
= NULL
;
881 struct gnet_stats_queue __percpu
*cpu_qstats
= NULL
;
883 struct nlmsghdr
*nlh
;
884 unsigned char *b
= skb_tail_pointer(skb
);
886 struct qdisc_size_table
*stab
;
891 nlh
= nlmsg_put(skb
, portid
, seq
, event
, sizeof(*tcm
), flags
);
894 tcm
= nlmsg_data(nlh
);
895 tcm
->tcm_family
= AF_UNSPEC
;
898 tcm
->tcm_ifindex
= qdisc_dev(q
)->ifindex
;
899 tcm
->tcm_parent
= clid
;
900 tcm
->tcm_handle
= q
->handle
;
901 tcm
->tcm_info
= refcount_read(&q
->refcnt
);
902 if (nla_put_string(skb
, TCA_KIND
, q
->ops
->id
))
903 goto nla_put_failure
;
904 if (q
->ops
->ingress_block_get
) {
905 block_index
= q
->ops
->ingress_block_get(q
);
907 nla_put_u32(skb
, TCA_INGRESS_BLOCK
, block_index
))
908 goto nla_put_failure
;
910 if (q
->ops
->egress_block_get
) {
911 block_index
= q
->ops
->egress_block_get(q
);
913 nla_put_u32(skb
, TCA_EGRESS_BLOCK
, block_index
))
914 goto nla_put_failure
;
916 if (q
->ops
->dump
&& q
->ops
->dump(q
, skb
) < 0)
917 goto nla_put_failure
;
918 if (nla_put_u8(skb
, TCA_HW_OFFLOAD
, !!(q
->flags
& TCQ_F_OFFLOADED
)))
919 goto nla_put_failure
;
920 qlen
= qdisc_qlen_sum(q
);
922 stab
= rtnl_dereference(q
->stab
);
923 if (stab
&& qdisc_dump_stab(skb
, stab
) < 0)
924 goto nla_put_failure
;
926 if (gnet_stats_start_copy_compat(skb
, TCA_STATS2
, TCA_STATS
, TCA_XSTATS
,
927 NULL
, &d
, TCA_PAD
) < 0)
928 goto nla_put_failure
;
930 if (q
->ops
->dump_stats
&& q
->ops
->dump_stats(q
, &d
) < 0)
931 goto nla_put_failure
;
933 if (qdisc_is_percpu_stats(q
)) {
934 cpu_bstats
= q
->cpu_bstats
;
935 cpu_qstats
= q
->cpu_qstats
;
938 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q
),
939 &d
, cpu_bstats
, &q
->bstats
) < 0 ||
940 gnet_stats_copy_rate_est(&d
, &q
->rate_est
) < 0 ||
941 gnet_stats_copy_queue(&d
, cpu_qstats
, &q
->qstats
, qlen
) < 0)
942 goto nla_put_failure
;
944 if (gnet_stats_finish_copy(&d
) < 0)
945 goto nla_put_failure
;
947 nlh
->nlmsg_len
= skb_tail_pointer(skb
) - b
;
956 static bool tc_qdisc_dump_ignore(struct Qdisc
*q
, bool dump_invisible
)
958 if (q
->flags
& TCQ_F_BUILTIN
)
960 if ((q
->flags
& TCQ_F_INVISIBLE
) && !dump_invisible
)
966 static int qdisc_notify(struct net
*net
, struct sk_buff
*oskb
,
967 struct nlmsghdr
*n
, u32 clid
,
968 struct Qdisc
*old
, struct Qdisc
*new)
971 u32 portid
= oskb
? NETLINK_CB(oskb
).portid
: 0;
973 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
977 if (old
&& !tc_qdisc_dump_ignore(old
, false)) {
978 if (tc_fill_qdisc(skb
, old
, clid
, portid
, n
->nlmsg_seq
,
979 0, RTM_DELQDISC
) < 0)
982 if (new && !tc_qdisc_dump_ignore(new, false)) {
983 if (tc_fill_qdisc(skb
, new, clid
, portid
, n
->nlmsg_seq
,
984 old
? NLM_F_REPLACE
: 0, RTM_NEWQDISC
) < 0)
989 return rtnetlink_send(skb
, net
, portid
, RTNLGRP_TC
,
990 n
->nlmsg_flags
& NLM_F_ECHO
);
997 static void notify_and_destroy(struct net
*net
, struct sk_buff
*skb
,
998 struct nlmsghdr
*n
, u32 clid
,
999 struct Qdisc
*old
, struct Qdisc
*new)
1002 qdisc_notify(net
, skb
, n
, clid
, old
, new);
1008 static void qdisc_clear_nolock(struct Qdisc
*sch
)
1010 sch
->flags
&= ~TCQ_F_NOLOCK
;
1011 if (!(sch
->flags
& TCQ_F_CPUSTATS
))
1014 free_percpu(sch
->cpu_bstats
);
1015 free_percpu(sch
->cpu_qstats
);
1016 sch
->cpu_bstats
= NULL
;
1017 sch
->cpu_qstats
= NULL
;
1018 sch
->flags
&= ~TCQ_F_CPUSTATS
;
1021 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1024 * When appropriate send a netlink notification using 'skb'
1027 * On success, destroy old qdisc.
1030 static int qdisc_graft(struct net_device
*dev
, struct Qdisc
*parent
,
1031 struct sk_buff
*skb
, struct nlmsghdr
*n
, u32 classid
,
1032 struct Qdisc
*new, struct Qdisc
*old
,
1033 struct netlink_ext_ack
*extack
)
1035 struct Qdisc
*q
= old
;
1036 struct net
*net
= dev_net(dev
);
1038 if (parent
== NULL
) {
1039 unsigned int i
, num_q
, ingress
;
1042 num_q
= dev
->num_tx_queues
;
1043 if ((q
&& q
->flags
& TCQ_F_INGRESS
) ||
1044 (new && new->flags
& TCQ_F_INGRESS
)) {
1047 if (!dev_ingress_queue(dev
)) {
1048 NL_SET_ERR_MSG(extack
, "Device does not have an ingress queue");
1053 if (dev
->flags
& IFF_UP
)
1054 dev_deactivate(dev
);
1056 qdisc_offload_graft_root(dev
, new, old
, extack
);
1058 if (new && new->ops
->attach
)
1061 for (i
= 0; i
< num_q
; i
++) {
1062 struct netdev_queue
*dev_queue
= dev_ingress_queue(dev
);
1065 dev_queue
= netdev_get_tx_queue(dev
, i
);
1067 old
= dev_graft_qdisc(dev_queue
, new);
1069 qdisc_refcount_inc(new);
1077 notify_and_destroy(net
, skb
, n
, classid
,
1079 if (new && !new->ops
->attach
)
1080 qdisc_refcount_inc(new);
1081 dev
->qdisc
= new ? : &noop_qdisc
;
1083 if (new && new->ops
->attach
)
1084 new->ops
->attach(new);
1086 notify_and_destroy(net
, skb
, n
, classid
, old
, new);
1089 if (dev
->flags
& IFF_UP
)
1092 const struct Qdisc_class_ops
*cops
= parent
->ops
->cl_ops
;
1096 /* Only support running class lockless if parent is lockless */
1097 if (new && (new->flags
& TCQ_F_NOLOCK
) && !(parent
->flags
& TCQ_F_NOLOCK
))
1098 qdisc_clear_nolock(new);
1100 if (!cops
|| !cops
->graft
)
1103 cl
= cops
->find(parent
, classid
);
1105 NL_SET_ERR_MSG(extack
, "Specified class not found");
1109 err
= cops
->graft(parent
, cl
, new, &old
, extack
);
1112 notify_and_destroy(net
, skb
, n
, classid
, old
, new);
1117 static int qdisc_block_indexes_set(struct Qdisc
*sch
, struct nlattr
**tca
,
1118 struct netlink_ext_ack
*extack
)
1122 if (tca
[TCA_INGRESS_BLOCK
]) {
1123 block_index
= nla_get_u32(tca
[TCA_INGRESS_BLOCK
]);
1126 NL_SET_ERR_MSG(extack
, "Ingress block index cannot be 0");
1129 if (!sch
->ops
->ingress_block_set
) {
1130 NL_SET_ERR_MSG(extack
, "Ingress block sharing is not supported");
1133 sch
->ops
->ingress_block_set(sch
, block_index
);
1135 if (tca
[TCA_EGRESS_BLOCK
]) {
1136 block_index
= nla_get_u32(tca
[TCA_EGRESS_BLOCK
]);
1139 NL_SET_ERR_MSG(extack
, "Egress block index cannot be 0");
1142 if (!sch
->ops
->egress_block_set
) {
1143 NL_SET_ERR_MSG(extack
, "Egress block sharing is not supported");
1146 sch
->ops
->egress_block_set(sch
, block_index
);
1152 Allocate and initialize new qdisc.
1154 Parameters are passed via opt.
1157 static struct Qdisc
*qdisc_create(struct net_device
*dev
,
1158 struct netdev_queue
*dev_queue
,
1159 struct Qdisc
*p
, u32 parent
, u32 handle
,
1160 struct nlattr
**tca
, int *errp
,
1161 struct netlink_ext_ack
*extack
)
1164 struct nlattr
*kind
= tca
[TCA_KIND
];
1166 struct Qdisc_ops
*ops
;
1167 struct qdisc_size_table
*stab
;
1169 ops
= qdisc_lookup_ops(kind
);
1170 #ifdef CONFIG_MODULES
1171 if (ops
== NULL
&& kind
!= NULL
) {
1172 char name
[IFNAMSIZ
];
1173 if (nla_strscpy(name
, kind
, IFNAMSIZ
) >= 0) {
1174 /* We dropped the RTNL semaphore in order to
1175 * perform the module load. So, even if we
1176 * succeeded in loading the module we have to
1177 * tell the caller to replay the request. We
1178 * indicate this using -EAGAIN.
1179 * We replay the request because the device may
1180 * go away in the mean time.
1183 request_module("sch_%s", name
);
1185 ops
= qdisc_lookup_ops(kind
);
1187 /* We will try again qdisc_lookup_ops,
1188 * so don't keep a reference.
1190 module_put(ops
->owner
);
1200 NL_SET_ERR_MSG(extack
, "Specified qdisc not found");
1204 sch
= qdisc_alloc(dev_queue
, ops
, extack
);
1210 sch
->parent
= parent
;
1212 if (handle
== TC_H_INGRESS
) {
1213 sch
->flags
|= TCQ_F_INGRESS
;
1214 handle
= TC_H_MAKE(TC_H_INGRESS
, 0);
1217 handle
= qdisc_alloc_handle(dev
);
1219 NL_SET_ERR_MSG(extack
, "Maximum number of qdisc handles was exceeded");
1224 if (!netif_is_multiqueue(dev
))
1225 sch
->flags
|= TCQ_F_ONETXQUEUE
;
1228 sch
->handle
= handle
;
1230 /* This exists to stay backward compatible with a userspace
1231 * loophole, which allowed userspace to get the IFF_NO_QUEUE
1232 * facility on older kernels by setting tx_queue_len=0 (prior
1233 * to qdisc init), and then forgetting to reinit tx_queue_len
1234 * before again attaching a qdisc.
1236 if ((dev
->priv_flags
& IFF_NO_QUEUE
) && (dev
->tx_queue_len
== 0)) {
1237 dev
->tx_queue_len
= DEFAULT_TX_QUEUE_LEN
;
1238 netdev_info(dev
, "Caught tx_queue_len zero misconfig\n");
1241 err
= qdisc_block_indexes_set(sch
, tca
, extack
);
1246 err
= ops
->init(sch
, tca
[TCA_OPTIONS
], extack
);
1251 if (tca
[TCA_STAB
]) {
1252 stab
= qdisc_get_stab(tca
[TCA_STAB
], extack
);
1254 err
= PTR_ERR(stab
);
1257 rcu_assign_pointer(sch
->stab
, stab
);
1259 if (tca
[TCA_RATE
]) {
1260 seqcount_t
*running
;
1263 if (sch
->flags
& TCQ_F_MQROOT
) {
1264 NL_SET_ERR_MSG(extack
, "Cannot attach rate estimator to a multi-queue root qdisc");
1268 if (sch
->parent
!= TC_H_ROOT
&&
1269 !(sch
->flags
& TCQ_F_INGRESS
) &&
1270 (!p
|| !(p
->flags
& TCQ_F_MQROOT
)))
1271 running
= qdisc_root_sleeping_running(sch
);
1273 running
= &sch
->running
;
1275 err
= gen_new_estimator(&sch
->bstats
,
1282 NL_SET_ERR_MSG(extack
, "Failed to generate new estimator");
1287 qdisc_hash_add(sch
, false);
1288 trace_qdisc_create(ops
, dev
, parent
);
1293 /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1300 module_put(ops
->owner
);
1307 * Any broken qdiscs that would require a ops->reset() here?
1308 * The qdisc was never in action so it shouldn't be necessary.
1310 qdisc_put_stab(rtnl_dereference(sch
->stab
));
1316 static int qdisc_change(struct Qdisc
*sch
, struct nlattr
**tca
,
1317 struct netlink_ext_ack
*extack
)
1319 struct qdisc_size_table
*ostab
, *stab
= NULL
;
1322 if (tca
[TCA_OPTIONS
]) {
1323 if (!sch
->ops
->change
) {
1324 NL_SET_ERR_MSG(extack
, "Change operation not supported by specified qdisc");
1327 if (tca
[TCA_INGRESS_BLOCK
] || tca
[TCA_EGRESS_BLOCK
]) {
1328 NL_SET_ERR_MSG(extack
, "Change of blocks is not supported");
1331 err
= sch
->ops
->change(sch
, tca
[TCA_OPTIONS
], extack
);
1336 if (tca
[TCA_STAB
]) {
1337 stab
= qdisc_get_stab(tca
[TCA_STAB
], extack
);
1339 return PTR_ERR(stab
);
1342 ostab
= rtnl_dereference(sch
->stab
);
1343 rcu_assign_pointer(sch
->stab
, stab
);
1344 qdisc_put_stab(ostab
);
1346 if (tca
[TCA_RATE
]) {
1347 /* NB: ignores errors from replace_estimator
1348 because change can't be undone. */
1349 if (sch
->flags
& TCQ_F_MQROOT
)
1351 gen_replace_estimator(&sch
->bstats
,
1355 qdisc_root_sleeping_running(sch
),
1362 struct check_loop_arg
{
1363 struct qdisc_walker w
;
1368 static int check_loop_fn(struct Qdisc
*q
, unsigned long cl
,
1369 struct qdisc_walker
*w
);
1371 static int check_loop(struct Qdisc
*q
, struct Qdisc
*p
, int depth
)
1373 struct check_loop_arg arg
;
1375 if (q
->ops
->cl_ops
== NULL
)
1378 arg
.w
.stop
= arg
.w
.skip
= arg
.w
.count
= 0;
1379 arg
.w
.fn
= check_loop_fn
;
1382 q
->ops
->cl_ops
->walk(q
, &arg
.w
);
1383 return arg
.w
.stop
? -ELOOP
: 0;
1387 check_loop_fn(struct Qdisc
*q
, unsigned long cl
, struct qdisc_walker
*w
)
1390 const struct Qdisc_class_ops
*cops
= q
->ops
->cl_ops
;
1391 struct check_loop_arg
*arg
= (struct check_loop_arg
*)w
;
1393 leaf
= cops
->leaf(q
, cl
);
1395 if (leaf
== arg
->p
|| arg
->depth
> 7)
1397 return check_loop(leaf
, arg
->p
, arg
->depth
+ 1);
1402 const struct nla_policy rtm_tca_policy
[TCA_MAX
+ 1] = {
1403 [TCA_KIND
] = { .type
= NLA_STRING
},
1404 [TCA_RATE
] = { .type
= NLA_BINARY
,
1405 .len
= sizeof(struct tc_estimator
) },
1406 [TCA_STAB
] = { .type
= NLA_NESTED
},
1407 [TCA_DUMP_INVISIBLE
] = { .type
= NLA_FLAG
},
1408 [TCA_CHAIN
] = { .type
= NLA_U32
},
1409 [TCA_INGRESS_BLOCK
] = { .type
= NLA_U32
},
1410 [TCA_EGRESS_BLOCK
] = { .type
= NLA_U32
},
1417 static int tc_get_qdisc(struct sk_buff
*skb
, struct nlmsghdr
*n
,
1418 struct netlink_ext_ack
*extack
)
1420 struct net
*net
= sock_net(skb
->sk
);
1421 struct tcmsg
*tcm
= nlmsg_data(n
);
1422 struct nlattr
*tca
[TCA_MAX
+ 1];
1423 struct net_device
*dev
;
1425 struct Qdisc
*q
= NULL
;
1426 struct Qdisc
*p
= NULL
;
1429 if ((n
->nlmsg_type
!= RTM_GETQDISC
) &&
1430 !netlink_ns_capable(skb
, net
->user_ns
, CAP_NET_ADMIN
))
1433 err
= nlmsg_parse_deprecated(n
, sizeof(*tcm
), tca
, TCA_MAX
,
1434 rtm_tca_policy
, extack
);
1438 dev
= __dev_get_by_index(net
, tcm
->tcm_ifindex
);
1442 clid
= tcm
->tcm_parent
;
1444 if (clid
!= TC_H_ROOT
) {
1445 if (TC_H_MAJ(clid
) != TC_H_MAJ(TC_H_INGRESS
)) {
1446 p
= qdisc_lookup(dev
, TC_H_MAJ(clid
));
1448 NL_SET_ERR_MSG(extack
, "Failed to find qdisc with specified classid");
1451 q
= qdisc_leaf(p
, clid
);
1452 } else if (dev_ingress_queue(dev
)) {
1453 q
= dev_ingress_queue(dev
)->qdisc_sleeping
;
1459 NL_SET_ERR_MSG(extack
, "Cannot find specified qdisc on specified device");
1463 if (tcm
->tcm_handle
&& q
->handle
!= tcm
->tcm_handle
) {
1464 NL_SET_ERR_MSG(extack
, "Invalid handle");
1468 q
= qdisc_lookup(dev
, tcm
->tcm_handle
);
1470 NL_SET_ERR_MSG(extack
, "Failed to find qdisc with specified handle");
1475 if (tca
[TCA_KIND
] && nla_strcmp(tca
[TCA_KIND
], q
->ops
->id
)) {
1476 NL_SET_ERR_MSG(extack
, "Invalid qdisc name");
1480 if (n
->nlmsg_type
== RTM_DELQDISC
) {
1482 NL_SET_ERR_MSG(extack
, "Classid cannot be zero");
1485 if (q
->handle
== 0) {
1486 NL_SET_ERR_MSG(extack
, "Cannot delete qdisc with handle of zero");
1489 err
= qdisc_graft(dev
, p
, skb
, n
, clid
, NULL
, q
, extack
);
1493 qdisc_notify(net
, skb
, n
, clid
, NULL
, q
);
1499 * Create/change qdisc.
1502 static int tc_modify_qdisc(struct sk_buff
*skb
, struct nlmsghdr
*n
,
1503 struct netlink_ext_ack
*extack
)
1505 struct net
*net
= sock_net(skb
->sk
);
1507 struct nlattr
*tca
[TCA_MAX
+ 1];
1508 struct net_device
*dev
;
1510 struct Qdisc
*q
, *p
;
1513 if (!netlink_ns_capable(skb
, net
->user_ns
, CAP_NET_ADMIN
))
1517 /* Reinit, just in case something touches this. */
1518 err
= nlmsg_parse_deprecated(n
, sizeof(*tcm
), tca
, TCA_MAX
,
1519 rtm_tca_policy
, extack
);
1523 tcm
= nlmsg_data(n
);
1524 clid
= tcm
->tcm_parent
;
1527 dev
= __dev_get_by_index(net
, tcm
->tcm_ifindex
);
1533 if (clid
!= TC_H_ROOT
) {
1534 if (clid
!= TC_H_INGRESS
) {
1535 p
= qdisc_lookup(dev
, TC_H_MAJ(clid
));
1537 NL_SET_ERR_MSG(extack
, "Failed to find specified qdisc");
1540 q
= qdisc_leaf(p
, clid
);
1541 } else if (dev_ingress_queue_create(dev
)) {
1542 q
= dev_ingress_queue(dev
)->qdisc_sleeping
;
1548 /* It may be default qdisc, ignore it */
1549 if (q
&& q
->handle
== 0)
1552 if (!q
|| !tcm
->tcm_handle
|| q
->handle
!= tcm
->tcm_handle
) {
1553 if (tcm
->tcm_handle
) {
1554 if (q
&& !(n
->nlmsg_flags
& NLM_F_REPLACE
)) {
1555 NL_SET_ERR_MSG(extack
, "NLM_F_REPLACE needed to override");
1558 if (TC_H_MIN(tcm
->tcm_handle
)) {
1559 NL_SET_ERR_MSG(extack
, "Invalid minor handle");
1562 q
= qdisc_lookup(dev
, tcm
->tcm_handle
);
1564 goto create_n_graft
;
1565 if (n
->nlmsg_flags
& NLM_F_EXCL
) {
1566 NL_SET_ERR_MSG(extack
, "Exclusivity flag on, cannot override");
1569 if (tca
[TCA_KIND
] &&
1570 nla_strcmp(tca
[TCA_KIND
], q
->ops
->id
)) {
1571 NL_SET_ERR_MSG(extack
, "Invalid qdisc name");
1575 (p
&& check_loop(q
, p
, 0))) {
1576 NL_SET_ERR_MSG(extack
, "Qdisc parent/child loop detected");
1579 qdisc_refcount_inc(q
);
1583 goto create_n_graft
;
1585 /* This magic test requires explanation.
1587 * We know, that some child q is already
1588 * attached to this parent and have choice:
1589 * either to change it or to create/graft new one.
1591 * 1. We are allowed to create/graft only
1592 * if CREATE and REPLACE flags are set.
1594 * 2. If EXCL is set, requestor wanted to say,
1595 * that qdisc tcm_handle is not expected
1596 * to exist, so that we choose create/graft too.
1598 * 3. The last case is when no flags are set.
1599 * Alas, it is sort of hole in API, we
1600 * cannot decide what to do unambiguously.
1601 * For now we select create/graft, if
1602 * user gave KIND, which does not match existing.
1604 if ((n
->nlmsg_flags
& NLM_F_CREATE
) &&
1605 (n
->nlmsg_flags
& NLM_F_REPLACE
) &&
1606 ((n
->nlmsg_flags
& NLM_F_EXCL
) ||
1608 nla_strcmp(tca
[TCA_KIND
], q
->ops
->id
))))
1609 goto create_n_graft
;
1613 if (!tcm
->tcm_handle
) {
1614 NL_SET_ERR_MSG(extack
, "Handle cannot be zero");
1617 q
= qdisc_lookup(dev
, tcm
->tcm_handle
);
1620 /* Change qdisc parameters */
1622 NL_SET_ERR_MSG(extack
, "Specified qdisc not found");
1625 if (n
->nlmsg_flags
& NLM_F_EXCL
) {
1626 NL_SET_ERR_MSG(extack
, "Exclusivity flag on, cannot modify");
1629 if (tca
[TCA_KIND
] && nla_strcmp(tca
[TCA_KIND
], q
->ops
->id
)) {
1630 NL_SET_ERR_MSG(extack
, "Invalid qdisc name");
1633 err
= qdisc_change(q
, tca
, extack
);
1635 qdisc_notify(net
, skb
, n
, clid
, NULL
, q
);
1639 if (!(n
->nlmsg_flags
& NLM_F_CREATE
)) {
1640 NL_SET_ERR_MSG(extack
, "Qdisc not found. To create specify NLM_F_CREATE flag");
1643 if (clid
== TC_H_INGRESS
) {
1644 if (dev_ingress_queue(dev
)) {
1645 q
= qdisc_create(dev
, dev_ingress_queue(dev
), p
,
1646 tcm
->tcm_parent
, tcm
->tcm_parent
,
1649 NL_SET_ERR_MSG(extack
, "Cannot find ingress queue for specified device");
1653 struct netdev_queue
*dev_queue
;
1655 if (p
&& p
->ops
->cl_ops
&& p
->ops
->cl_ops
->select_queue
)
1656 dev_queue
= p
->ops
->cl_ops
->select_queue(p
, tcm
);
1658 dev_queue
= p
->dev_queue
;
1660 dev_queue
= netdev_get_tx_queue(dev
, 0);
1662 q
= qdisc_create(dev
, dev_queue
, p
,
1663 tcm
->tcm_parent
, tcm
->tcm_handle
,
1673 err
= qdisc_graft(dev
, p
, skb
, n
, clid
, q
, NULL
, extack
);
1683 static int tc_dump_qdisc_root(struct Qdisc
*root
, struct sk_buff
*skb
,
1684 struct netlink_callback
*cb
,
1685 int *q_idx_p
, int s_q_idx
, bool recur
,
1686 bool dump_invisible
)
1688 int ret
= 0, q_idx
= *q_idx_p
;
1696 if (q_idx
< s_q_idx
) {
1699 if (!tc_qdisc_dump_ignore(q
, dump_invisible
) &&
1700 tc_fill_qdisc(skb
, q
, q
->parent
, NETLINK_CB(cb
->skb
).portid
,
1701 cb
->nlh
->nlmsg_seq
, NLM_F_MULTI
,
1707 /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1708 * itself has already been dumped.
1710 * If we've already dumped the top-level (ingress) qdisc above and the global
1711 * qdisc hashtable, we don't want to hit it again
1713 if (!qdisc_dev(root
) || !recur
)
1716 hash_for_each(qdisc_dev(root
)->qdisc_hash
, b
, q
, hash
) {
1717 if (q_idx
< s_q_idx
) {
1721 if (!tc_qdisc_dump_ignore(q
, dump_invisible
) &&
1722 tc_fill_qdisc(skb
, q
, q
->parent
, NETLINK_CB(cb
->skb
).portid
,
1723 cb
->nlh
->nlmsg_seq
, NLM_F_MULTI
,
1737 static int tc_dump_qdisc(struct sk_buff
*skb
, struct netlink_callback
*cb
)
1739 struct net
*net
= sock_net(skb
->sk
);
1742 struct net_device
*dev
;
1743 const struct nlmsghdr
*nlh
= cb
->nlh
;
1744 struct nlattr
*tca
[TCA_MAX
+ 1];
1747 s_idx
= cb
->args
[0];
1748 s_q_idx
= q_idx
= cb
->args
[1];
1753 err
= nlmsg_parse_deprecated(nlh
, sizeof(struct tcmsg
), tca
, TCA_MAX
,
1754 rtm_tca_policy
, cb
->extack
);
1758 for_each_netdev(net
, dev
) {
1759 struct netdev_queue
*dev_queue
;
1767 if (tc_dump_qdisc_root(dev
->qdisc
, skb
, cb
, &q_idx
, s_q_idx
,
1768 true, tca
[TCA_DUMP_INVISIBLE
]) < 0)
1771 dev_queue
= dev_ingress_queue(dev
);
1773 tc_dump_qdisc_root(dev_queue
->qdisc_sleeping
, skb
, cb
,
1774 &q_idx
, s_q_idx
, false,
1775 tca
[TCA_DUMP_INVISIBLE
]) < 0)
1784 cb
->args
[1] = q_idx
;
1791 /************************************************
1792 * Traffic classes manipulation. *
1793 ************************************************/
1795 static int tc_fill_tclass(struct sk_buff
*skb
, struct Qdisc
*q
,
1797 u32 portid
, u32 seq
, u16 flags
, int event
)
1800 struct nlmsghdr
*nlh
;
1801 unsigned char *b
= skb_tail_pointer(skb
);
1803 const struct Qdisc_class_ops
*cl_ops
= q
->ops
->cl_ops
;
1806 nlh
= nlmsg_put(skb
, portid
, seq
, event
, sizeof(*tcm
), flags
);
1808 goto out_nlmsg_trim
;
1809 tcm
= nlmsg_data(nlh
);
1810 tcm
->tcm_family
= AF_UNSPEC
;
1813 tcm
->tcm_ifindex
= qdisc_dev(q
)->ifindex
;
1814 tcm
->tcm_parent
= q
->handle
;
1815 tcm
->tcm_handle
= q
->handle
;
1817 if (nla_put_string(skb
, TCA_KIND
, q
->ops
->id
))
1818 goto nla_put_failure
;
1819 if (cl_ops
->dump
&& cl_ops
->dump(q
, cl
, skb
, tcm
) < 0)
1820 goto nla_put_failure
;
1822 if (gnet_stats_start_copy_compat(skb
, TCA_STATS2
, TCA_STATS
, TCA_XSTATS
,
1823 NULL
, &d
, TCA_PAD
) < 0)
1824 goto nla_put_failure
;
1826 if (cl_ops
->dump_stats
&& cl_ops
->dump_stats(q
, cl
, &d
) < 0)
1827 goto nla_put_failure
;
1829 if (gnet_stats_finish_copy(&d
) < 0)
1830 goto nla_put_failure
;
1832 nlh
->nlmsg_len
= skb_tail_pointer(skb
) - b
;
1841 static int tclass_notify(struct net
*net
, struct sk_buff
*oskb
,
1842 struct nlmsghdr
*n
, struct Qdisc
*q
,
1843 unsigned long cl
, int event
)
1845 struct sk_buff
*skb
;
1846 u32 portid
= oskb
? NETLINK_CB(oskb
).portid
: 0;
1849 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
1853 if (tc_fill_tclass(skb
, q
, cl
, portid
, n
->nlmsg_seq
, 0, event
) < 0) {
1858 err
= rtnetlink_send(skb
, net
, portid
, RTNLGRP_TC
,
1859 n
->nlmsg_flags
& NLM_F_ECHO
);
1865 static int tclass_del_notify(struct net
*net
,
1866 const struct Qdisc_class_ops
*cops
,
1867 struct sk_buff
*oskb
, struct nlmsghdr
*n
,
1868 struct Qdisc
*q
, unsigned long cl
)
1870 u32 portid
= oskb
? NETLINK_CB(oskb
).portid
: 0;
1871 struct sk_buff
*skb
;
1877 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
1881 if (tc_fill_tclass(skb
, q
, cl
, portid
, n
->nlmsg_seq
, 0,
1882 RTM_DELTCLASS
) < 0) {
1887 err
= cops
->delete(q
, cl
);
1893 err
= rtnetlink_send(skb
, net
, portid
, RTNLGRP_TC
,
1894 n
->nlmsg_flags
& NLM_F_ECHO
);
1900 #ifdef CONFIG_NET_CLS
1902 struct tcf_bind_args
{
1903 struct tcf_walker w
;
1909 static int tcf_node_bind(struct tcf_proto
*tp
, void *n
, struct tcf_walker
*arg
)
1911 struct tcf_bind_args
*a
= (void *)arg
;
1913 if (tp
->ops
->bind_class
) {
1914 struct Qdisc
*q
= tcf_block_q(tp
->chain
->block
);
1917 tp
->ops
->bind_class(n
, a
->classid
, a
->cl
, q
, a
->base
);
1923 struct tc_bind_class_args
{
1924 struct qdisc_walker w
;
1925 unsigned long new_cl
;
1930 static int tc_bind_class_walker(struct Qdisc
*q
, unsigned long cl
,
1931 struct qdisc_walker
*w
)
1933 struct tc_bind_class_args
*a
= (struct tc_bind_class_args
*)w
;
1934 const struct Qdisc_class_ops
*cops
= q
->ops
->cl_ops
;
1935 struct tcf_block
*block
;
1936 struct tcf_chain
*chain
;
1938 block
= cops
->tcf_block(q
, cl
, NULL
);
1941 for (chain
= tcf_get_next_chain(block
, NULL
);
1943 chain
= tcf_get_next_chain(block
, chain
)) {
1944 struct tcf_proto
*tp
;
1946 for (tp
= tcf_get_next_proto(chain
, NULL
);
1947 tp
; tp
= tcf_get_next_proto(chain
, tp
)) {
1948 struct tcf_bind_args arg
= {};
1950 arg
.w
.fn
= tcf_node_bind
;
1951 arg
.classid
= a
->clid
;
1954 tp
->ops
->walk(tp
, &arg
.w
, true);
1961 static void tc_bind_tclass(struct Qdisc
*q
, u32 portid
, u32 clid
,
1962 unsigned long new_cl
)
1964 const struct Qdisc_class_ops
*cops
= q
->ops
->cl_ops
;
1965 struct tc_bind_class_args args
= {};
1967 if (!cops
->tcf_block
)
1969 args
.portid
= portid
;
1971 args
.new_cl
= new_cl
;
1972 args
.w
.fn
= tc_bind_class_walker
;
1973 q
->ops
->cl_ops
->walk(q
, &args
.w
);
1978 static void tc_bind_tclass(struct Qdisc
*q
, u32 portid
, u32 clid
,
1979 unsigned long new_cl
)
1985 static int tc_ctl_tclass(struct sk_buff
*skb
, struct nlmsghdr
*n
,
1986 struct netlink_ext_ack
*extack
)
1988 struct net
*net
= sock_net(skb
->sk
);
1989 struct tcmsg
*tcm
= nlmsg_data(n
);
1990 struct nlattr
*tca
[TCA_MAX
+ 1];
1991 struct net_device
*dev
;
1992 struct Qdisc
*q
= NULL
;
1993 const struct Qdisc_class_ops
*cops
;
1994 unsigned long cl
= 0;
1995 unsigned long new_cl
;
2001 if ((n
->nlmsg_type
!= RTM_GETTCLASS
) &&
2002 !netlink_ns_capable(skb
, net
->user_ns
, CAP_NET_ADMIN
))
2005 err
= nlmsg_parse_deprecated(n
, sizeof(*tcm
), tca
, TCA_MAX
,
2006 rtm_tca_policy
, extack
);
2010 dev
= __dev_get_by_index(net
, tcm
->tcm_ifindex
);
2015 parent == TC_H_UNSPEC - unspecified parent.
2016 parent == TC_H_ROOT - class is root, which has no parent.
2017 parent == X:0 - parent is root class.
2018 parent == X:Y - parent is a node in hierarchy.
2019 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
2021 handle == 0:0 - generate handle from kernel pool.
2022 handle == 0:Y - class is X:Y, where X:0 is qdisc.
2023 handle == X:Y - clear.
2024 handle == X:0 - root class.
2027 /* Step 1. Determine qdisc handle X:0 */
2029 portid
= tcm
->tcm_parent
;
2030 clid
= tcm
->tcm_handle
;
2031 qid
= TC_H_MAJ(clid
);
2033 if (portid
!= TC_H_ROOT
) {
2034 u32 qid1
= TC_H_MAJ(portid
);
2037 /* If both majors are known, they must be identical. */
2042 } else if (qid
== 0)
2043 qid
= dev
->qdisc
->handle
;
2045 /* Now qid is genuine qdisc handle consistent
2046 * both with parent and child.
2048 * TC_H_MAJ(portid) still may be unspecified, complete it now.
2051 portid
= TC_H_MAKE(qid
, portid
);
2054 qid
= dev
->qdisc
->handle
;
2057 /* OK. Locate qdisc */
2058 q
= qdisc_lookup(dev
, qid
);
2062 /* An check that it supports classes */
2063 cops
= q
->ops
->cl_ops
;
2067 /* Now try to get class */
2069 if (portid
== TC_H_ROOT
)
2072 clid
= TC_H_MAKE(qid
, clid
);
2075 cl
= cops
->find(q
, clid
);
2079 if (n
->nlmsg_type
!= RTM_NEWTCLASS
||
2080 !(n
->nlmsg_flags
& NLM_F_CREATE
))
2083 switch (n
->nlmsg_type
) {
2086 if (n
->nlmsg_flags
& NLM_F_EXCL
)
2090 err
= tclass_del_notify(net
, cops
, skb
, n
, q
, cl
);
2091 /* Unbind the class with flilters with 0 */
2092 tc_bind_tclass(q
, portid
, clid
, 0);
2095 err
= tclass_notify(net
, skb
, n
, q
, cl
, RTM_NEWTCLASS
);
2103 if (tca
[TCA_INGRESS_BLOCK
] || tca
[TCA_EGRESS_BLOCK
]) {
2104 NL_SET_ERR_MSG(extack
, "Shared blocks are not supported for classes");
2111 err
= cops
->change(q
, clid
, portid
, tca
, &new_cl
, extack
);
2113 tclass_notify(net
, skb
, n
, q
, new_cl
, RTM_NEWTCLASS
);
2114 /* We just create a new class, need to do reverse binding. */
2116 tc_bind_tclass(q
, portid
, clid
, new_cl
);
2122 struct qdisc_dump_args
{
2123 struct qdisc_walker w
;
2124 struct sk_buff
*skb
;
2125 struct netlink_callback
*cb
;
2128 static int qdisc_class_dump(struct Qdisc
*q
, unsigned long cl
,
2129 struct qdisc_walker
*arg
)
2131 struct qdisc_dump_args
*a
= (struct qdisc_dump_args
*)arg
;
2133 return tc_fill_tclass(a
->skb
, q
, cl
, NETLINK_CB(a
->cb
->skb
).portid
,
2134 a
->cb
->nlh
->nlmsg_seq
, NLM_F_MULTI
,
2138 static int tc_dump_tclass_qdisc(struct Qdisc
*q
, struct sk_buff
*skb
,
2139 struct tcmsg
*tcm
, struct netlink_callback
*cb
,
2142 struct qdisc_dump_args arg
;
2144 if (tc_qdisc_dump_ignore(q
, false) ||
2145 *t_p
< s_t
|| !q
->ops
->cl_ops
||
2147 TC_H_MAJ(tcm
->tcm_parent
) != q
->handle
)) {
2152 memset(&cb
->args
[1], 0, sizeof(cb
->args
)-sizeof(cb
->args
[0]));
2153 arg
.w
.fn
= qdisc_class_dump
;
2157 arg
.w
.skip
= cb
->args
[1];
2159 q
->ops
->cl_ops
->walk(q
, &arg
.w
);
2160 cb
->args
[1] = arg
.w
.count
;
2167 static int tc_dump_tclass_root(struct Qdisc
*root
, struct sk_buff
*skb
,
2168 struct tcmsg
*tcm
, struct netlink_callback
*cb
,
2177 if (tc_dump_tclass_qdisc(root
, skb
, tcm
, cb
, t_p
, s_t
) < 0)
2180 if (!qdisc_dev(root
))
2183 if (tcm
->tcm_parent
) {
2184 q
= qdisc_match_from_root(root
, TC_H_MAJ(tcm
->tcm_parent
));
2185 if (q
&& q
!= root
&&
2186 tc_dump_tclass_qdisc(q
, skb
, tcm
, cb
, t_p
, s_t
) < 0)
2190 hash_for_each(qdisc_dev(root
)->qdisc_hash
, b
, q
, hash
) {
2191 if (tc_dump_tclass_qdisc(q
, skb
, tcm
, cb
, t_p
, s_t
) < 0)
2198 static int tc_dump_tclass(struct sk_buff
*skb
, struct netlink_callback
*cb
)
2200 struct tcmsg
*tcm
= nlmsg_data(cb
->nlh
);
2201 struct net
*net
= sock_net(skb
->sk
);
2202 struct netdev_queue
*dev_queue
;
2203 struct net_device
*dev
;
2206 if (nlmsg_len(cb
->nlh
) < sizeof(*tcm
))
2208 dev
= dev_get_by_index(net
, tcm
->tcm_ifindex
);
2215 if (tc_dump_tclass_root(dev
->qdisc
, skb
, tcm
, cb
, &t
, s_t
) < 0)
2218 dev_queue
= dev_ingress_queue(dev
);
2220 tc_dump_tclass_root(dev_queue
->qdisc_sleeping
, skb
, tcm
, cb
,
2231 #ifdef CONFIG_PROC_FS
2232 static int psched_show(struct seq_file
*seq
, void *v
)
2234 seq_printf(seq
, "%08x %08x %08x %08x\n",
2235 (u32
)NSEC_PER_USEC
, (u32
)PSCHED_TICKS2NS(1),
2237 (u32
)NSEC_PER_SEC
/ hrtimer_resolution
);
2242 static int __net_init
psched_net_init(struct net
*net
)
2244 struct proc_dir_entry
*e
;
2246 e
= proc_create_single("psched", 0, net
->proc_net
, psched_show
);
2253 static void __net_exit
psched_net_exit(struct net
*net
)
2255 remove_proc_entry("psched", net
->proc_net
);
2258 static int __net_init
psched_net_init(struct net
*net
)
2263 static void __net_exit
psched_net_exit(struct net
*net
)
2268 static struct pernet_operations psched_net_ops
= {
2269 .init
= psched_net_init
,
2270 .exit
= psched_net_exit
,
2273 static int __init
pktsched_init(void)
2277 err
= register_pernet_subsys(&psched_net_ops
);
2279 pr_err("pktsched_init: "
2280 "cannot initialize per netns operations\n");
2284 register_qdisc(&pfifo_fast_ops
);
2285 register_qdisc(&pfifo_qdisc_ops
);
2286 register_qdisc(&bfifo_qdisc_ops
);
2287 register_qdisc(&pfifo_head_drop_qdisc_ops
);
2288 register_qdisc(&mq_qdisc_ops
);
2289 register_qdisc(&noqueue_qdisc_ops
);
2291 rtnl_register(PF_UNSPEC
, RTM_NEWQDISC
, tc_modify_qdisc
, NULL
, 0);
2292 rtnl_register(PF_UNSPEC
, RTM_DELQDISC
, tc_get_qdisc
, NULL
, 0);
2293 rtnl_register(PF_UNSPEC
, RTM_GETQDISC
, tc_get_qdisc
, tc_dump_qdisc
,
2295 rtnl_register(PF_UNSPEC
, RTM_NEWTCLASS
, tc_ctl_tclass
, NULL
, 0);
2296 rtnl_register(PF_UNSPEC
, RTM_DELTCLASS
, tc_ctl_tclass
, NULL
, 0);
2297 rtnl_register(PF_UNSPEC
, RTM_GETTCLASS
, tc_ctl_tclass
, tc_dump_tclass
,
2303 subsys_initcall(pktsched_init
);