bpf: Prevent memory disambiguation attack
[linux/fpc-iii.git] / net / sched / sch_api.c
blobd512f49ee83c29143c5aea7315c9d4f2304c71bf
1 /*
2 * net/sched/sch_api.c Packet scheduler API.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 * Fixes:
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38 #include <net/pkt_cls.h>
42 Short review.
43 -------------
45 This file consists of two interrelated parts:
47 1. queueing disciplines manager frontend.
48 2. traffic classes manager frontend.
50 Generally, queueing discipline ("qdisc") is a black box,
51 which is able to enqueue packets and to dequeue them (when
52 device is ready to send something) in order and at times
53 determined by algorithm hidden in it.
55 Qdiscs are divided into two categories:
56 - "queues", which have no internal structure visible from outside.
57 - "schedulers", which split all the packets to "traffic classes",
58 using "packet classifiers" (look at cls_api.c)
60 In turn, classes may have child qdiscs (as rule, queues)
61 attached to them etc. etc. etc.
63 The goal of the routines in this file is to translate
64 information supplied by the user in the form of handles
65 into a form more intelligible to the kernel, to perform sanity
66 checks and the part of the work that is common to all qdiscs,
67 and to provide rtnetlink notifications.
69 All real intelligent work is done inside qdisc modules.
73 Every discipline has two major routines: enqueue and dequeue.
75 ---dequeue
77 dequeue usually returns an skb to send. It is allowed to return NULL,
78 but that does not mean that the queue is empty; it just means that the
79 discipline does not want to send anything this time.
80 The queue is really empty only if q->q.qlen == 0.
81 For complicated disciplines with multiple queues q->q is not a
82 real packet queue; however, q->q.qlen must still be valid.
84 ---enqueue
86 enqueue returns 0 if the packet was enqueued successfully.
87 If a packet (this one or another one) was dropped, it returns
88 a non-zero error code.
89 NET_XMIT_DROP - this packet dropped
90 Expected action: do not backoff, but wait until queue will clear.
91 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
92 Expected action: backoff or ignore
94 Auxiliary routines:
96 ---peek
98 like dequeue but without removing a packet from the queue
100 ---reset
102 returns qdisc to initial state: purge all buffers, clear all
103 timers, counters (except for statistics) etc.
105 ---init
107 initializes newly created qdisc.
109 ---destroy
111 destroys resources allocated by init and during lifetime of qdisc.
113 ---change
115 changes qdisc parameters.
/* Guards the list of registered TC modules below. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* Head of the singly linked list of all installed queueing disciplines. */
static struct Qdisc_ops *qdisc_base;
131 /* Register/unregister queueing discipline */
133 int register_qdisc(struct Qdisc_ops *qops)
135 struct Qdisc_ops *q, **qp;
136 int rc = -EEXIST;
138 write_lock(&qdisc_mod_lock);
139 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
140 if (!strcmp(qops->id, q->id))
141 goto out;
143 if (qops->enqueue == NULL)
144 qops->enqueue = noop_qdisc_ops.enqueue;
145 if (qops->peek == NULL) {
146 if (qops->dequeue == NULL)
147 qops->peek = noop_qdisc_ops.peek;
148 else
149 goto out_einval;
151 if (qops->dequeue == NULL)
152 qops->dequeue = noop_qdisc_ops.dequeue;
154 if (qops->cl_ops) {
155 const struct Qdisc_class_ops *cops = qops->cl_ops;
157 if (!(cops->find && cops->walk && cops->leaf))
158 goto out_einval;
160 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
161 goto out_einval;
164 qops->next = NULL;
165 *qp = qops;
166 rc = 0;
167 out:
168 write_unlock(&qdisc_mod_lock);
169 return rc;
171 out_einval:
172 rc = -EINVAL;
173 goto out;
175 EXPORT_SYMBOL(register_qdisc);
177 int unregister_qdisc(struct Qdisc_ops *qops)
179 struct Qdisc_ops *q, **qp;
180 int err = -ENOENT;
182 write_lock(&qdisc_mod_lock);
183 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
184 if (q == qops)
185 break;
186 if (q) {
187 *qp = q->next;
188 q->next = NULL;
189 err = 0;
191 write_unlock(&qdisc_mod_lock);
192 return err;
194 EXPORT_SYMBOL(unregister_qdisc);
196 /* Get default qdisc if not otherwise specified */
197 void qdisc_get_default(char *name, size_t len)
199 read_lock(&qdisc_mod_lock);
200 strlcpy(name, default_qdisc_ops->id, len);
201 read_unlock(&qdisc_mod_lock);
204 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
206 struct Qdisc_ops *q = NULL;
208 for (q = qdisc_base; q; q = q->next) {
209 if (!strcmp(name, q->id)) {
210 if (!try_module_get(q->owner))
211 q = NULL;
212 break;
216 return q;
219 /* Set new default qdisc to use */
220 int qdisc_set_default(const char *name)
222 const struct Qdisc_ops *ops;
224 if (!capable(CAP_NET_ADMIN))
225 return -EPERM;
227 write_lock(&qdisc_mod_lock);
228 ops = qdisc_lookup_default(name);
229 if (!ops) {
230 /* Not found, drop lock and try to load module */
231 write_unlock(&qdisc_mod_lock);
232 request_module("sch_%s", name);
233 write_lock(&qdisc_mod_lock);
235 ops = qdisc_lookup_default(name);
238 if (ops) {
239 /* Set new default */
240 module_put(default_qdisc_ops->owner);
241 default_qdisc_ops = ops;
243 write_unlock(&qdisc_mod_lock);
245 return ops ? 0 : -ENOENT;
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the default qdisc named by the kernel config. */
static int __init sch_default_qdisc(void)
{
	const char *dflt = CONFIG_DEFAULT_NET_SCH;

	return qdisc_set_default(dflt);
}
late_initcall(sch_default_qdisc);
#endif
257 /* We know handle. Find qdisc among all qdisc's attached to device
258 * (root qdisc, all its children, children of children etc.)
259 * Note: caller either uses rtnl or rcu_read_lock()
262 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
264 struct Qdisc *q;
266 if (!qdisc_dev(root))
267 return (root->handle == handle ? root : NULL);
269 if (!(root->flags & TCQ_F_BUILTIN) &&
270 root->handle == handle)
271 return root;
273 hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
274 if (q->handle == handle)
275 return q;
277 return NULL;
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
282 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283 ASSERT_RTNL();
284 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285 if (invisible)
286 q->flags |= TCQ_F_INVISIBLE;
289 EXPORT_SYMBOL(qdisc_hash_add);
291 void qdisc_hash_del(struct Qdisc *q)
293 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294 ASSERT_RTNL();
295 hash_del_rcu(&q->hash);
298 EXPORT_SYMBOL(qdisc_hash_del);
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
302 struct Qdisc *q;
304 if (!handle)
305 return NULL;
306 q = qdisc_match_from_root(dev->qdisc, handle);
307 if (q)
308 goto out;
310 if (dev_ingress_queue(dev))
311 q = qdisc_match_from_root(
312 dev_ingress_queue(dev)->qdisc_sleeping,
313 handle);
314 out:
315 return q;
318 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
320 unsigned long cl;
321 struct Qdisc *leaf;
322 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
324 if (cops == NULL)
325 return NULL;
326 cl = cops->find(p, classid);
328 if (cl == 0)
329 return NULL;
330 leaf = cops->leaf(p, cl);
331 return leaf;
334 /* Find queueing discipline by name */
336 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
338 struct Qdisc_ops *q = NULL;
340 if (kind) {
341 read_lock(&qdisc_mod_lock);
342 for (q = qdisc_base; q; q = q->next) {
343 if (nla_strcmp(kind, q->id) == 0) {
344 if (!try_module_get(q->owner))
345 q = NULL;
346 break;
349 read_unlock(&qdisc_mod_lock);
351 return q;
354 /* The linklayer setting were not transferred from iproute2, in older
355 * versions, and the rate tables lookup systems have been dropped in
356 * the kernel. To keep backward compatible with older iproute2 tc
357 * utils, we detect the linklayer setting by detecting if the rate
358 * table were modified.
360 * For linklayer ATM table entries, the rate table will be aligned to
361 * 48 bytes, thus some table entries will contain the same value. The
362 * mpu (min packet unit) is also encoded into the old rate table, thus
363 * starting from the mpu, we find low and high table entries for
364 * mapping this cell. If these entries contain the same value, when
365 * the rate tables have been modified for linklayer ATM.
367 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
368 * and then roundup to the next cell, calc the table entry one below,
369 * and compare.
371 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
373 int low = roundup(r->mpu, 48);
374 int high = roundup(low+1, 48);
375 int cell_low = low >> r->cell_log;
376 int cell_high = (high >> r->cell_log) - 1;
378 /* rtab is too inaccurate at rates > 100Mbit/s */
379 if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
380 pr_debug("TC linklayer: Giving up ATM detection\n");
381 return TC_LINKLAYER_ETHERNET;
384 if ((cell_high > cell_low) && (cell_high < 256)
385 && (rtab[cell_low] == rtab[cell_high])) {
386 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
387 cell_low, cell_high, rtab[cell_high]);
388 return TC_LINKLAYER_ATM;
390 return TC_LINKLAYER_ETHERNET;
393 static struct qdisc_rate_table *qdisc_rtab_list;
395 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
396 struct nlattr *tab,
397 struct netlink_ext_ack *extack)
399 struct qdisc_rate_table *rtab;
401 if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
402 nla_len(tab) != TC_RTAB_SIZE) {
403 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
404 return NULL;
407 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
408 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
409 !memcmp(&rtab->data, nla_data(tab), 1024)) {
410 rtab->refcnt++;
411 return rtab;
415 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
416 if (rtab) {
417 rtab->rate = *r;
418 rtab->refcnt = 1;
419 memcpy(rtab->data, nla_data(tab), 1024);
420 if (r->linklayer == TC_LINKLAYER_UNAWARE)
421 r->linklayer = __detect_linklayer(r, rtab->data);
422 rtab->next = qdisc_rtab_list;
423 qdisc_rtab_list = rtab;
424 } else {
425 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
427 return rtab;
429 EXPORT_SYMBOL(qdisc_get_rtab);
431 void qdisc_put_rtab(struct qdisc_rate_table *tab)
433 struct qdisc_rate_table *rtab, **rtabp;
435 if (!tab || --tab->refcnt)
436 return;
438 for (rtabp = &qdisc_rtab_list;
439 (rtab = *rtabp) != NULL;
440 rtabp = &rtab->next) {
441 if (rtab == tab) {
442 *rtabp = rtab->next;
443 kfree(rtab);
444 return;
448 EXPORT_SYMBOL(qdisc_put_rtab);
450 static LIST_HEAD(qdisc_stab_list);
452 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
453 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
454 [TCA_STAB_DATA] = { .type = NLA_BINARY },
457 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
458 struct netlink_ext_ack *extack)
460 struct nlattr *tb[TCA_STAB_MAX + 1];
461 struct qdisc_size_table *stab;
462 struct tc_sizespec *s;
463 unsigned int tsize = 0;
464 u16 *tab = NULL;
465 int err;
467 err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
468 if (err < 0)
469 return ERR_PTR(err);
470 if (!tb[TCA_STAB_BASE]) {
471 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
472 return ERR_PTR(-EINVAL);
475 s = nla_data(tb[TCA_STAB_BASE]);
477 if (s->tsize > 0) {
478 if (!tb[TCA_STAB_DATA]) {
479 NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
480 return ERR_PTR(-EINVAL);
482 tab = nla_data(tb[TCA_STAB_DATA]);
483 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
486 if (tsize != s->tsize || (!tab && tsize > 0)) {
487 NL_SET_ERR_MSG(extack, "Invalid size of size table");
488 return ERR_PTR(-EINVAL);
491 list_for_each_entry(stab, &qdisc_stab_list, list) {
492 if (memcmp(&stab->szopts, s, sizeof(*s)))
493 continue;
494 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
495 continue;
496 stab->refcnt++;
497 return stab;
500 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
501 if (!stab)
502 return ERR_PTR(-ENOMEM);
504 stab->refcnt = 1;
505 stab->szopts = *s;
506 if (tsize > 0)
507 memcpy(stab->data, tab, tsize * sizeof(u16));
509 list_add_tail(&stab->list, &qdisc_stab_list);
511 return stab;
514 static void stab_kfree_rcu(struct rcu_head *head)
516 kfree(container_of(head, struct qdisc_size_table, rcu));
519 void qdisc_put_stab(struct qdisc_size_table *tab)
521 if (!tab)
522 return;
524 if (--tab->refcnt == 0) {
525 list_del(&tab->list);
526 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
529 EXPORT_SYMBOL(qdisc_put_stab);
531 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
533 struct nlattr *nest;
535 nest = nla_nest_start(skb, TCA_STAB);
536 if (nest == NULL)
537 goto nla_put_failure;
538 if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
539 goto nla_put_failure;
540 nla_nest_end(skb, nest);
542 return skb->len;
544 nla_put_failure:
545 return -1;
548 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
549 const struct qdisc_size_table *stab)
551 int pkt_len, slot;
553 pkt_len = skb->len + stab->szopts.overhead;
554 if (unlikely(!stab->szopts.tsize))
555 goto out;
557 slot = pkt_len + stab->szopts.cell_align;
558 if (unlikely(slot < 0))
559 slot = 0;
561 slot >>= stab->szopts.cell_log;
562 if (likely(slot < stab->szopts.tsize))
563 pkt_len = stab->data[slot];
564 else
565 pkt_len = stab->data[stab->szopts.tsize - 1] *
566 (slot / stab->szopts.tsize) +
567 stab->data[slot % stab->szopts.tsize];
569 pkt_len <<= stab->szopts.size_log;
570 out:
571 if (unlikely(pkt_len < 1))
572 pkt_len = 1;
573 qdisc_skb_cb(skb)->pkt_len = pkt_len;
575 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
577 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
579 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
580 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
581 txt, qdisc->ops->id, qdisc->handle >> 16);
582 qdisc->flags |= TCQ_F_WARN_NONWC;
585 EXPORT_SYMBOL(qdisc_warn_nonwc);
587 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
589 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
590 timer);
592 rcu_read_lock();
593 __netif_schedule(qdisc_root(wd->qdisc));
594 rcu_read_unlock();
596 return HRTIMER_NORESTART;
599 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
601 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
602 wd->timer.function = qdisc_watchdog;
603 wd->qdisc = qdisc;
605 EXPORT_SYMBOL(qdisc_watchdog_init);
607 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
609 if (test_bit(__QDISC_STATE_DEACTIVATED,
610 &qdisc_root_sleeping(wd->qdisc)->state))
611 return;
613 if (wd->last_expires == expires)
614 return;
616 wd->last_expires = expires;
617 hrtimer_start(&wd->timer,
618 ns_to_ktime(expires),
619 HRTIMER_MODE_ABS_PINNED);
621 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
623 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
625 hrtimer_cancel(&wd->timer);
627 EXPORT_SYMBOL(qdisc_watchdog_cancel);
629 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
631 struct hlist_head *h;
632 unsigned int i;
634 h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
636 if (h != NULL) {
637 for (i = 0; i < n; i++)
638 INIT_HLIST_HEAD(&h[i]);
640 return h;
643 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
645 struct Qdisc_class_common *cl;
646 struct hlist_node *next;
647 struct hlist_head *nhash, *ohash;
648 unsigned int nsize, nmask, osize;
649 unsigned int i, h;
651 /* Rehash when load factor exceeds 0.75 */
652 if (clhash->hashelems * 4 <= clhash->hashsize * 3)
653 return;
654 nsize = clhash->hashsize * 2;
655 nmask = nsize - 1;
656 nhash = qdisc_class_hash_alloc(nsize);
657 if (nhash == NULL)
658 return;
660 ohash = clhash->hash;
661 osize = clhash->hashsize;
663 sch_tree_lock(sch);
664 for (i = 0; i < osize; i++) {
665 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
666 h = qdisc_class_hash(cl->classid, nmask);
667 hlist_add_head(&cl->hnode, &nhash[h]);
670 clhash->hash = nhash;
671 clhash->hashsize = nsize;
672 clhash->hashmask = nmask;
673 sch_tree_unlock(sch);
675 kvfree(ohash);
677 EXPORT_SYMBOL(qdisc_class_hash_grow);
679 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
681 unsigned int size = 4;
683 clhash->hash = qdisc_class_hash_alloc(size);
684 if (!clhash->hash)
685 return -ENOMEM;
686 clhash->hashsize = size;
687 clhash->hashmask = size - 1;
688 clhash->hashelems = 0;
689 return 0;
691 EXPORT_SYMBOL(qdisc_class_hash_init);
693 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
695 kvfree(clhash->hash);
697 EXPORT_SYMBOL(qdisc_class_hash_destroy);
699 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
700 struct Qdisc_class_common *cl)
702 unsigned int h;
704 INIT_HLIST_NODE(&cl->hnode);
705 h = qdisc_class_hash(cl->classid, clhash->hashmask);
706 hlist_add_head(&cl->hnode, &clhash->hash[h]);
707 clhash->hashelems++;
709 EXPORT_SYMBOL(qdisc_class_hash_insert);
711 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
712 struct Qdisc_class_common *cl)
714 hlist_del(&cl->hnode);
715 clhash->hashelems--;
717 EXPORT_SYMBOL(qdisc_class_hash_remove);
719 /* Allocate an unique handle from space managed by kernel
720 * Possible range is [8000-FFFF]:0000 (0x8000 values)
722 static u32 qdisc_alloc_handle(struct net_device *dev)
724 int i = 0x8000;
725 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
727 do {
728 autohandle += TC_H_MAKE(0x10000U, 0);
729 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
730 autohandle = TC_H_MAKE(0x80000000U, 0);
731 if (!qdisc_lookup(dev, autohandle))
732 return autohandle;
733 cond_resched();
734 } while (--i > 0);
736 return 0;
739 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
740 unsigned int len)
742 const struct Qdisc_class_ops *cops;
743 unsigned long cl;
744 u32 parentid;
745 bool notify;
746 int drops;
748 if (n == 0 && len == 0)
749 return;
750 drops = max_t(int, n, 0);
751 rcu_read_lock();
752 while ((parentid = sch->parent)) {
753 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
754 break;
756 if (sch->flags & TCQ_F_NOPARENT)
757 break;
758 /* Notify parent qdisc only if child qdisc becomes empty.
760 * If child was empty even before update then backlog
761 * counter is screwed and we skip notification because
762 * parent class is already passive.
764 notify = !sch->q.qlen && !WARN_ON_ONCE(!n);
765 /* TODO: perform the search on a per txq basis */
766 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
767 if (sch == NULL) {
768 WARN_ON_ONCE(parentid != TC_H_ROOT);
769 break;
771 cops = sch->ops->cl_ops;
772 if (notify && cops->qlen_notify) {
773 cl = cops->find(sch, parentid);
774 cops->qlen_notify(sch, cl);
776 sch->q.qlen -= n;
777 sch->qstats.backlog -= len;
778 __qdisc_qstats_drop(sch, drops);
780 rcu_read_unlock();
782 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
784 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
785 u32 portid, u32 seq, u16 flags, int event)
787 struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
788 struct gnet_stats_queue __percpu *cpu_qstats = NULL;
789 struct tcmsg *tcm;
790 struct nlmsghdr *nlh;
791 unsigned char *b = skb_tail_pointer(skb);
792 struct gnet_dump d;
793 struct qdisc_size_table *stab;
794 u32 block_index;
795 __u32 qlen;
797 cond_resched();
798 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
799 if (!nlh)
800 goto out_nlmsg_trim;
801 tcm = nlmsg_data(nlh);
802 tcm->tcm_family = AF_UNSPEC;
803 tcm->tcm__pad1 = 0;
804 tcm->tcm__pad2 = 0;
805 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
806 tcm->tcm_parent = clid;
807 tcm->tcm_handle = q->handle;
808 tcm->tcm_info = refcount_read(&q->refcnt);
809 if (nla_put_string(skb, TCA_KIND, q->ops->id))
810 goto nla_put_failure;
811 if (q->ops->ingress_block_get) {
812 block_index = q->ops->ingress_block_get(q);
813 if (block_index &&
814 nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
815 goto nla_put_failure;
817 if (q->ops->egress_block_get) {
818 block_index = q->ops->egress_block_get(q);
819 if (block_index &&
820 nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
821 goto nla_put_failure;
823 if (q->ops->dump && q->ops->dump(q, skb) < 0)
824 goto nla_put_failure;
825 if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
826 goto nla_put_failure;
827 qlen = qdisc_qlen_sum(q);
829 stab = rtnl_dereference(q->stab);
830 if (stab && qdisc_dump_stab(skb, stab) < 0)
831 goto nla_put_failure;
833 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
834 NULL, &d, TCA_PAD) < 0)
835 goto nla_put_failure;
837 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
838 goto nla_put_failure;
840 if (qdisc_is_percpu_stats(q)) {
841 cpu_bstats = q->cpu_bstats;
842 cpu_qstats = q->cpu_qstats;
845 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
846 &d, cpu_bstats, &q->bstats) < 0 ||
847 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
848 gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
849 goto nla_put_failure;
851 if (gnet_stats_finish_copy(&d) < 0)
852 goto nla_put_failure;
854 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
855 return skb->len;
857 out_nlmsg_trim:
858 nla_put_failure:
859 nlmsg_trim(skb, b);
860 return -1;
863 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
865 if (q->flags & TCQ_F_BUILTIN)
866 return true;
867 if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
868 return true;
870 return false;
873 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
874 struct nlmsghdr *n, u32 clid,
875 struct Qdisc *old, struct Qdisc *new)
877 struct sk_buff *skb;
878 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
880 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
881 if (!skb)
882 return -ENOBUFS;
884 if (old && !tc_qdisc_dump_ignore(old, false)) {
885 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
886 0, RTM_DELQDISC) < 0)
887 goto err_out;
889 if (new && !tc_qdisc_dump_ignore(new, false)) {
890 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
891 old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
892 goto err_out;
895 if (skb->len)
896 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
897 n->nlmsg_flags & NLM_F_ECHO);
899 err_out:
900 kfree_skb(skb);
901 return -EINVAL;
904 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
905 struct nlmsghdr *n, u32 clid,
906 struct Qdisc *old, struct Qdisc *new)
908 if (new || old)
909 qdisc_notify(net, skb, n, clid, old, new);
911 if (old)
912 qdisc_destroy(old);
915 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
916 * to device "dev".
918 * When appropriate send a netlink notification using 'skb'
919 * and "n".
921 * On success, destroy old qdisc.
924 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
925 struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
926 struct Qdisc *new, struct Qdisc *old,
927 struct netlink_ext_ack *extack)
929 struct Qdisc *q = old;
930 struct net *net = dev_net(dev);
931 int err = 0;
933 if (parent == NULL) {
934 unsigned int i, num_q, ingress;
936 ingress = 0;
937 num_q = dev->num_tx_queues;
938 if ((q && q->flags & TCQ_F_INGRESS) ||
939 (new && new->flags & TCQ_F_INGRESS)) {
940 num_q = 1;
941 ingress = 1;
942 if (!dev_ingress_queue(dev)) {
943 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
944 return -ENOENT;
948 if (dev->flags & IFF_UP)
949 dev_deactivate(dev);
951 if (new && new->ops->attach)
952 goto skip;
954 for (i = 0; i < num_q; i++) {
955 struct netdev_queue *dev_queue = dev_ingress_queue(dev);
957 if (!ingress)
958 dev_queue = netdev_get_tx_queue(dev, i);
960 old = dev_graft_qdisc(dev_queue, new);
961 if (new && i > 0)
962 qdisc_refcount_inc(new);
964 if (!ingress)
965 qdisc_destroy(old);
968 skip:
969 if (!ingress) {
970 notify_and_destroy(net, skb, n, classid,
971 dev->qdisc, new);
972 if (new && !new->ops->attach)
973 qdisc_refcount_inc(new);
974 dev->qdisc = new ? : &noop_qdisc;
976 if (new && new->ops->attach)
977 new->ops->attach(new);
978 } else {
979 notify_and_destroy(net, skb, n, classid, old, new);
982 if (dev->flags & IFF_UP)
983 dev_activate(dev);
984 } else {
985 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
987 /* Only support running class lockless if parent is lockless */
988 if (new && (new->flags & TCQ_F_NOLOCK) &&
989 parent && !(parent->flags & TCQ_F_NOLOCK))
990 new->flags &= ~TCQ_F_NOLOCK;
992 err = -EOPNOTSUPP;
993 if (cops && cops->graft) {
994 unsigned long cl = cops->find(parent, classid);
996 if (cl) {
997 err = cops->graft(parent, cl, new, &old,
998 extack);
999 } else {
1000 NL_SET_ERR_MSG(extack, "Specified class not found");
1001 err = -ENOENT;
1004 if (!err)
1005 notify_and_destroy(net, skb, n, classid, old, new);
1007 return err;
1010 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1011 struct netlink_ext_ack *extack)
1013 u32 block_index;
1015 if (tca[TCA_INGRESS_BLOCK]) {
1016 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1018 if (!block_index) {
1019 NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1020 return -EINVAL;
1022 if (!sch->ops->ingress_block_set) {
1023 NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1024 return -EOPNOTSUPP;
1026 sch->ops->ingress_block_set(sch, block_index);
1028 if (tca[TCA_EGRESS_BLOCK]) {
1029 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1031 if (!block_index) {
1032 NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1033 return -EINVAL;
1035 if (!sch->ops->egress_block_set) {
1036 NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1037 return -EOPNOTSUPP;
1039 sch->ops->egress_block_set(sch, block_index);
1041 return 0;
1044 /* lockdep annotation is needed for ingress; egress gets it only for name */
1045 static struct lock_class_key qdisc_tx_lock;
1046 static struct lock_class_key qdisc_rx_lock;
1049 Allocate and initialize new qdisc.
1051 Parameters are passed via opt.
1054 static struct Qdisc *qdisc_create(struct net_device *dev,
1055 struct netdev_queue *dev_queue,
1056 struct Qdisc *p, u32 parent, u32 handle,
1057 struct nlattr **tca, int *errp,
1058 struct netlink_ext_ack *extack)
1060 int err;
1061 struct nlattr *kind = tca[TCA_KIND];
1062 struct Qdisc *sch;
1063 struct Qdisc_ops *ops;
1064 struct qdisc_size_table *stab;
1066 ops = qdisc_lookup_ops(kind);
1067 #ifdef CONFIG_MODULES
1068 if (ops == NULL && kind != NULL) {
1069 char name[IFNAMSIZ];
1070 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1071 /* We dropped the RTNL semaphore in order to
1072 * perform the module load. So, even if we
1073 * succeeded in loading the module we have to
1074 * tell the caller to replay the request. We
1075 * indicate this using -EAGAIN.
1076 * We replay the request because the device may
1077 * go away in the mean time.
1079 rtnl_unlock();
1080 request_module("sch_%s", name);
1081 rtnl_lock();
1082 ops = qdisc_lookup_ops(kind);
1083 if (ops != NULL) {
1084 /* We will try again qdisc_lookup_ops,
1085 * so don't keep a reference.
1087 module_put(ops->owner);
1088 err = -EAGAIN;
1089 goto err_out;
1093 #endif
1095 err = -ENOENT;
1096 if (!ops) {
1097 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1098 goto err_out;
1101 sch = qdisc_alloc(dev_queue, ops, extack);
1102 if (IS_ERR(sch)) {
1103 err = PTR_ERR(sch);
1104 goto err_out2;
1107 sch->parent = parent;
1109 if (handle == TC_H_INGRESS) {
1110 sch->flags |= TCQ_F_INGRESS;
1111 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1112 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
1113 } else {
1114 if (handle == 0) {
1115 handle = qdisc_alloc_handle(dev);
1116 err = -ENOMEM;
1117 if (handle == 0)
1118 goto err_out3;
1120 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
1121 if (!netif_is_multiqueue(dev))
1122 sch->flags |= TCQ_F_ONETXQUEUE;
1125 sch->handle = handle;
1127 /* This exist to keep backward compatible with a userspace
1128 * loophole, what allowed userspace to get IFF_NO_QUEUE
1129 * facility on older kernels by setting tx_queue_len=0 (prior
1130 * to qdisc init), and then forgot to reinit tx_queue_len
1131 * before again attaching a qdisc.
1133 if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1134 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1135 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1138 err = qdisc_block_indexes_set(sch, tca, extack);
1139 if (err)
1140 goto err_out3;
1142 if (ops->init) {
1143 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1144 if (err != 0)
1145 goto err_out5;
1148 if (tca[TCA_STAB]) {
1149 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1150 if (IS_ERR(stab)) {
1151 err = PTR_ERR(stab);
1152 goto err_out4;
1154 rcu_assign_pointer(sch->stab, stab);
1156 if (tca[TCA_RATE]) {
1157 seqcount_t *running;
1159 err = -EOPNOTSUPP;
1160 if (sch->flags & TCQ_F_MQROOT) {
1161 NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1162 goto err_out4;
1165 if (sch->parent != TC_H_ROOT &&
1166 !(sch->flags & TCQ_F_INGRESS) &&
1167 (!p || !(p->flags & TCQ_F_MQROOT)))
1168 running = qdisc_root_sleeping_running(sch);
1169 else
1170 running = &sch->running;
1172 err = gen_new_estimator(&sch->bstats,
1173 sch->cpu_bstats,
1174 &sch->rate_est,
1175 NULL,
1176 running,
1177 tca[TCA_RATE]);
1178 if (err) {
1179 NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1180 goto err_out4;
1184 qdisc_hash_add(sch, false);
1186 return sch;
1188 err_out5:
1189 /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1190 if (ops->destroy)
1191 ops->destroy(sch);
1192 err_out3:
1193 dev_put(dev);
1194 qdisc_free(sch);
1195 err_out2:
1196 module_put(ops->owner);
1197 err_out:
1198 *errp = err;
1199 return NULL;
1201 err_out4:
1203 * Any broken qdiscs that would require a ops->reset() here?
1204 * The qdisc was never in action so it shouldn't be necessary.
1206 qdisc_put_stab(rtnl_dereference(sch->stab));
1207 if (ops->destroy)
1208 ops->destroy(sch);
1209 goto err_out3;
1212 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1213 struct netlink_ext_ack *extack)
1215 struct qdisc_size_table *ostab, *stab = NULL;
1216 int err = 0;
1218 if (tca[TCA_OPTIONS]) {
1219 if (!sch->ops->change) {
1220 NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1221 return -EINVAL;
1223 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1224 NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1225 return -EOPNOTSUPP;
1227 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1228 if (err)
1229 return err;
1232 if (tca[TCA_STAB]) {
1233 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1234 if (IS_ERR(stab))
1235 return PTR_ERR(stab);
1238 ostab = rtnl_dereference(sch->stab);
1239 rcu_assign_pointer(sch->stab, stab);
1240 qdisc_put_stab(ostab);
1242 if (tca[TCA_RATE]) {
1243 /* NB: ignores errors from replace_estimator
1244 because change can't be undone. */
1245 if (sch->flags & TCQ_F_MQROOT)
1246 goto out;
1247 gen_replace_estimator(&sch->bstats,
1248 sch->cpu_bstats,
1249 &sch->rate_est,
1250 NULL,
1251 qdisc_root_sleeping_running(sch),
1252 tca[TCA_RATE]);
1254 out:
1255 return 0;
1258 struct check_loop_arg {
1259 struct qdisc_walker w;
1260 struct Qdisc *p;
1261 int depth;
1264 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1265 struct qdisc_walker *w);
1267 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1269 struct check_loop_arg arg;
1271 if (q->ops->cl_ops == NULL)
1272 return 0;
1274 arg.w.stop = arg.w.skip = arg.w.count = 0;
1275 arg.w.fn = check_loop_fn;
1276 arg.depth = depth;
1277 arg.p = p;
1278 q->ops->cl_ops->walk(q, &arg.w);
1279 return arg.w.stop ? -ELOOP : 0;
1282 static int
1283 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1285 struct Qdisc *leaf;
1286 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1287 struct check_loop_arg *arg = (struct check_loop_arg *)w;
1289 leaf = cops->leaf(q, cl);
1290 if (leaf) {
1291 if (leaf == arg->p || arg->depth > 7)
1292 return -ELOOP;
1293 return check_loop(leaf, arg->p, arg->depth + 1);
1295 return 0;
1299 * Delete/get qdisc.
1302 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1303 struct netlink_ext_ack *extack)
1305 struct net *net = sock_net(skb->sk);
1306 struct tcmsg *tcm = nlmsg_data(n);
1307 struct nlattr *tca[TCA_MAX + 1];
1308 struct net_device *dev;
1309 u32 clid;
1310 struct Qdisc *q = NULL;
1311 struct Qdisc *p = NULL;
1312 int err;
1314 if ((n->nlmsg_type != RTM_GETQDISC) &&
1315 !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1316 return -EPERM;
1318 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1319 if (err < 0)
1320 return err;
1322 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1323 if (!dev)
1324 return -ENODEV;
1326 clid = tcm->tcm_parent;
1327 if (clid) {
1328 if (clid != TC_H_ROOT) {
1329 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1330 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1331 if (!p) {
1332 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1333 return -ENOENT;
1335 q = qdisc_leaf(p, clid);
1336 } else if (dev_ingress_queue(dev)) {
1337 q = dev_ingress_queue(dev)->qdisc_sleeping;
1339 } else {
1340 q = dev->qdisc;
1342 if (!q) {
1343 NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1344 return -ENOENT;
1347 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1348 NL_SET_ERR_MSG(extack, "Invalid handle");
1349 return -EINVAL;
1351 } else {
1352 q = qdisc_lookup(dev, tcm->tcm_handle);
1353 if (!q) {
1354 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1355 return -ENOENT;
1359 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1360 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1361 return -EINVAL;
1364 if (n->nlmsg_type == RTM_DELQDISC) {
1365 if (!clid) {
1366 NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1367 return -EINVAL;
1369 if (q->handle == 0) {
1370 NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1371 return -ENOENT;
1373 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1374 if (err != 0)
1375 return err;
1376 } else {
1377 qdisc_notify(net, skb, n, clid, NULL, q);
1379 return 0;
1383 * Create/change qdisc.
1386 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1387 struct netlink_ext_ack *extack)
1389 struct net *net = sock_net(skb->sk);
1390 struct tcmsg *tcm;
1391 struct nlattr *tca[TCA_MAX + 1];
1392 struct net_device *dev;
1393 u32 clid;
1394 struct Qdisc *q, *p;
1395 int err;
1397 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1398 return -EPERM;
1400 replay:
1401 /* Reinit, just in case something touches this. */
1402 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1403 if (err < 0)
1404 return err;
1406 tcm = nlmsg_data(n);
1407 clid = tcm->tcm_parent;
1408 q = p = NULL;
1410 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1411 if (!dev)
1412 return -ENODEV;
1415 if (clid) {
1416 if (clid != TC_H_ROOT) {
1417 if (clid != TC_H_INGRESS) {
1418 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1419 if (!p) {
1420 NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1421 return -ENOENT;
1423 q = qdisc_leaf(p, clid);
1424 } else if (dev_ingress_queue_create(dev)) {
1425 q = dev_ingress_queue(dev)->qdisc_sleeping;
1427 } else {
1428 q = dev->qdisc;
1431 /* It may be default qdisc, ignore it */
1432 if (q && q->handle == 0)
1433 q = NULL;
1435 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1436 if (tcm->tcm_handle) {
1437 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1438 NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1439 return -EEXIST;
1441 if (TC_H_MIN(tcm->tcm_handle)) {
1442 NL_SET_ERR_MSG(extack, "Invalid minor handle");
1443 return -EINVAL;
1445 q = qdisc_lookup(dev, tcm->tcm_handle);
1446 if (!q)
1447 goto create_n_graft;
1448 if (n->nlmsg_flags & NLM_F_EXCL) {
1449 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1450 return -EEXIST;
1452 if (tca[TCA_KIND] &&
1453 nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1454 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1455 return -EINVAL;
1457 if (q == p ||
1458 (p && check_loop(q, p, 0))) {
1459 NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1460 return -ELOOP;
1462 qdisc_refcount_inc(q);
1463 goto graft;
1464 } else {
1465 if (!q)
1466 goto create_n_graft;
1468 /* This magic test requires explanation.
1470 * We know, that some child q is already
1471 * attached to this parent and have choice:
1472 * either to change it or to create/graft new one.
1474 * 1. We are allowed to create/graft only
1475 * if CREATE and REPLACE flags are set.
1477 * 2. If EXCL is set, requestor wanted to say,
1478 * that qdisc tcm_handle is not expected
1479 * to exist, so that we choose create/graft too.
1481 * 3. The last case is when no flags are set.
1482 * Alas, it is sort of hole in API, we
1483 * cannot decide what to do unambiguously.
1484 * For now we select create/graft, if
1485 * user gave KIND, which does not match existing.
1487 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1488 (n->nlmsg_flags & NLM_F_REPLACE) &&
1489 ((n->nlmsg_flags & NLM_F_EXCL) ||
1490 (tca[TCA_KIND] &&
1491 nla_strcmp(tca[TCA_KIND], q->ops->id))))
1492 goto create_n_graft;
1495 } else {
1496 if (!tcm->tcm_handle) {
1497 NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1498 return -EINVAL;
1500 q = qdisc_lookup(dev, tcm->tcm_handle);
1503 /* Change qdisc parameters */
1504 if (!q) {
1505 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1506 return -ENOENT;
1508 if (n->nlmsg_flags & NLM_F_EXCL) {
1509 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1510 return -EEXIST;
1512 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1513 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1514 return -EINVAL;
1516 err = qdisc_change(q, tca, extack);
1517 if (err == 0)
1518 qdisc_notify(net, skb, n, clid, NULL, q);
1519 return err;
1521 create_n_graft:
1522 if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1523 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1524 return -ENOENT;
1526 if (clid == TC_H_INGRESS) {
1527 if (dev_ingress_queue(dev)) {
1528 q = qdisc_create(dev, dev_ingress_queue(dev), p,
1529 tcm->tcm_parent, tcm->tcm_parent,
1530 tca, &err, extack);
1531 } else {
1532 NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1533 err = -ENOENT;
1535 } else {
1536 struct netdev_queue *dev_queue;
1538 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1539 dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1540 else if (p)
1541 dev_queue = p->dev_queue;
1542 else
1543 dev_queue = netdev_get_tx_queue(dev, 0);
1545 q = qdisc_create(dev, dev_queue, p,
1546 tcm->tcm_parent, tcm->tcm_handle,
1547 tca, &err, extack);
1549 if (q == NULL) {
1550 if (err == -EAGAIN)
1551 goto replay;
1552 return err;
1555 graft:
1556 err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1557 if (err) {
1558 if (q)
1559 qdisc_destroy(q);
1560 return err;
1563 return 0;
1566 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1567 struct netlink_callback *cb,
1568 int *q_idx_p, int s_q_idx, bool recur,
1569 bool dump_invisible)
1571 int ret = 0, q_idx = *q_idx_p;
1572 struct Qdisc *q;
1573 int b;
1575 if (!root)
1576 return 0;
1578 q = root;
1579 if (q_idx < s_q_idx) {
1580 q_idx++;
1581 } else {
1582 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1583 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1584 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1585 RTM_NEWQDISC) <= 0)
1586 goto done;
1587 q_idx++;
1590 /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1591 * itself has already been dumped.
1593 * If we've already dumped the top-level (ingress) qdisc above and the global
1594 * qdisc hashtable, we don't want to hit it again
1596 if (!qdisc_dev(root) || !recur)
1597 goto out;
1599 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1600 if (q_idx < s_q_idx) {
1601 q_idx++;
1602 continue;
1604 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1605 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1606 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1607 RTM_NEWQDISC) <= 0)
1608 goto done;
1609 q_idx++;
1612 out:
1613 *q_idx_p = q_idx;
1614 return ret;
1615 done:
1616 ret = -1;
1617 goto out;
1620 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1622 struct net *net = sock_net(skb->sk);
1623 int idx, q_idx;
1624 int s_idx, s_q_idx;
1625 struct net_device *dev;
1626 const struct nlmsghdr *nlh = cb->nlh;
1627 struct nlattr *tca[TCA_MAX + 1];
1628 int err;
1630 s_idx = cb->args[0];
1631 s_q_idx = q_idx = cb->args[1];
1633 idx = 0;
1634 ASSERT_RTNL();
1636 err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL);
1637 if (err < 0)
1638 return err;
1640 for_each_netdev(net, dev) {
1641 struct netdev_queue *dev_queue;
1643 if (idx < s_idx)
1644 goto cont;
1645 if (idx > s_idx)
1646 s_q_idx = 0;
1647 q_idx = 0;
1649 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1650 true, tca[TCA_DUMP_INVISIBLE]) < 0)
1651 goto done;
1653 dev_queue = dev_ingress_queue(dev);
1654 if (dev_queue &&
1655 tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1656 &q_idx, s_q_idx, false,
1657 tca[TCA_DUMP_INVISIBLE]) < 0)
1658 goto done;
1660 cont:
1661 idx++;
1664 done:
1665 cb->args[0] = idx;
1666 cb->args[1] = q_idx;
1668 return skb->len;
1673 /************************************************
1674 * Traffic classes manipulation. *
1675 ************************************************/
1677 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1678 unsigned long cl,
1679 u32 portid, u32 seq, u16 flags, int event)
1681 struct tcmsg *tcm;
1682 struct nlmsghdr *nlh;
1683 unsigned char *b = skb_tail_pointer(skb);
1684 struct gnet_dump d;
1685 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1687 cond_resched();
1688 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1689 if (!nlh)
1690 goto out_nlmsg_trim;
1691 tcm = nlmsg_data(nlh);
1692 tcm->tcm_family = AF_UNSPEC;
1693 tcm->tcm__pad1 = 0;
1694 tcm->tcm__pad2 = 0;
1695 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1696 tcm->tcm_parent = q->handle;
1697 tcm->tcm_handle = q->handle;
1698 tcm->tcm_info = 0;
1699 if (nla_put_string(skb, TCA_KIND, q->ops->id))
1700 goto nla_put_failure;
1701 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1702 goto nla_put_failure;
1704 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1705 NULL, &d, TCA_PAD) < 0)
1706 goto nla_put_failure;
1708 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1709 goto nla_put_failure;
1711 if (gnet_stats_finish_copy(&d) < 0)
1712 goto nla_put_failure;
1714 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1715 return skb->len;
1717 out_nlmsg_trim:
1718 nla_put_failure:
1719 nlmsg_trim(skb, b);
1720 return -1;
1723 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1724 struct nlmsghdr *n, struct Qdisc *q,
1725 unsigned long cl, int event)
1727 struct sk_buff *skb;
1728 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1730 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1731 if (!skb)
1732 return -ENOBUFS;
1734 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1735 kfree_skb(skb);
1736 return -EINVAL;
1739 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1740 n->nlmsg_flags & NLM_F_ECHO);
1743 static int tclass_del_notify(struct net *net,
1744 const struct Qdisc_class_ops *cops,
1745 struct sk_buff *oskb, struct nlmsghdr *n,
1746 struct Qdisc *q, unsigned long cl)
1748 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1749 struct sk_buff *skb;
1750 int err = 0;
1752 if (!cops->delete)
1753 return -EOPNOTSUPP;
1755 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1756 if (!skb)
1757 return -ENOBUFS;
1759 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1760 RTM_DELTCLASS) < 0) {
1761 kfree_skb(skb);
1762 return -EINVAL;
1765 err = cops->delete(q, cl);
1766 if (err) {
1767 kfree_skb(skb);
1768 return err;
1771 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1772 n->nlmsg_flags & NLM_F_ECHO);
1775 #ifdef CONFIG_NET_CLS
1777 struct tcf_bind_args {
1778 struct tcf_walker w;
1779 u32 classid;
1780 unsigned long cl;
1783 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1785 struct tcf_bind_args *a = (void *)arg;
1787 if (tp->ops->bind_class) {
1788 struct Qdisc *q = tcf_block_q(tp->chain->block);
1790 sch_tree_lock(q);
1791 tp->ops->bind_class(n, a->classid, a->cl);
1792 sch_tree_unlock(q);
1794 return 0;
1797 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1798 unsigned long new_cl)
1800 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1801 struct tcf_block *block;
1802 struct tcf_chain *chain;
1803 unsigned long cl;
1805 cl = cops->find(q, portid);
1806 if (!cl)
1807 return;
1808 block = cops->tcf_block(q, cl, NULL);
1809 if (!block)
1810 return;
1811 list_for_each_entry(chain, &block->chain_list, list) {
1812 struct tcf_proto *tp;
1814 for (tp = rtnl_dereference(chain->filter_chain);
1815 tp; tp = rtnl_dereference(tp->next)) {
1816 struct tcf_bind_args arg = {};
1818 arg.w.fn = tcf_node_bind;
1819 arg.classid = clid;
1820 arg.cl = new_cl;
1821 tp->ops->walk(tp, &arg.w);
1826 #else
1828 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1829 unsigned long new_cl)
1833 #endif
1835 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1836 struct netlink_ext_ack *extack)
1838 struct net *net = sock_net(skb->sk);
1839 struct tcmsg *tcm = nlmsg_data(n);
1840 struct nlattr *tca[TCA_MAX + 1];
1841 struct net_device *dev;
1842 struct Qdisc *q = NULL;
1843 const struct Qdisc_class_ops *cops;
1844 unsigned long cl = 0;
1845 unsigned long new_cl;
1846 u32 portid;
1847 u32 clid;
1848 u32 qid;
1849 int err;
1851 if ((n->nlmsg_type != RTM_GETTCLASS) &&
1852 !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1853 return -EPERM;
1855 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1856 if (err < 0)
1857 return err;
1859 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1860 if (!dev)
1861 return -ENODEV;
1864 parent == TC_H_UNSPEC - unspecified parent.
1865 parent == TC_H_ROOT - class is root, which has no parent.
1866 parent == X:0 - parent is root class.
1867 parent == X:Y - parent is a node in hierarchy.
1868 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
1870 handle == 0:0 - generate handle from kernel pool.
1871 handle == 0:Y - class is X:Y, where X:0 is qdisc.
1872 handle == X:Y - clear.
1873 handle == X:0 - root class.
1876 /* Step 1. Determine qdisc handle X:0 */
1878 portid = tcm->tcm_parent;
1879 clid = tcm->tcm_handle;
1880 qid = TC_H_MAJ(clid);
1882 if (portid != TC_H_ROOT) {
1883 u32 qid1 = TC_H_MAJ(portid);
1885 if (qid && qid1) {
1886 /* If both majors are known, they must be identical. */
1887 if (qid != qid1)
1888 return -EINVAL;
1889 } else if (qid1) {
1890 qid = qid1;
1891 } else if (qid == 0)
1892 qid = dev->qdisc->handle;
1894 /* Now qid is genuine qdisc handle consistent
1895 * both with parent and child.
1897 * TC_H_MAJ(portid) still may be unspecified, complete it now.
1899 if (portid)
1900 portid = TC_H_MAKE(qid, portid);
1901 } else {
1902 if (qid == 0)
1903 qid = dev->qdisc->handle;
1906 /* OK. Locate qdisc */
1907 q = qdisc_lookup(dev, qid);
1908 if (!q)
1909 return -ENOENT;
1911 /* An check that it supports classes */
1912 cops = q->ops->cl_ops;
1913 if (cops == NULL)
1914 return -EINVAL;
1916 /* Now try to get class */
1917 if (clid == 0) {
1918 if (portid == TC_H_ROOT)
1919 clid = qid;
1920 } else
1921 clid = TC_H_MAKE(qid, clid);
1923 if (clid)
1924 cl = cops->find(q, clid);
1926 if (cl == 0) {
1927 err = -ENOENT;
1928 if (n->nlmsg_type != RTM_NEWTCLASS ||
1929 !(n->nlmsg_flags & NLM_F_CREATE))
1930 goto out;
1931 } else {
1932 switch (n->nlmsg_type) {
1933 case RTM_NEWTCLASS:
1934 err = -EEXIST;
1935 if (n->nlmsg_flags & NLM_F_EXCL)
1936 goto out;
1937 break;
1938 case RTM_DELTCLASS:
1939 err = tclass_del_notify(net, cops, skb, n, q, cl);
1940 /* Unbind the class with flilters with 0 */
1941 tc_bind_tclass(q, portid, clid, 0);
1942 goto out;
1943 case RTM_GETTCLASS:
1944 err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1945 goto out;
1946 default:
1947 err = -EINVAL;
1948 goto out;
1952 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1953 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
1954 return -EOPNOTSUPP;
1957 new_cl = cl;
1958 err = -EOPNOTSUPP;
1959 if (cops->change)
1960 err = cops->change(q, clid, portid, tca, &new_cl, extack);
1961 if (err == 0) {
1962 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1963 /* We just create a new class, need to do reverse binding. */
1964 if (cl != new_cl)
1965 tc_bind_tclass(q, portid, clid, new_cl);
1967 out:
1968 return err;
1971 struct qdisc_dump_args {
1972 struct qdisc_walker w;
1973 struct sk_buff *skb;
1974 struct netlink_callback *cb;
1977 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
1978 struct qdisc_walker *arg)
1980 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1982 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1983 a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
1984 RTM_NEWTCLASS);
1987 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1988 struct tcmsg *tcm, struct netlink_callback *cb,
1989 int *t_p, int s_t)
1991 struct qdisc_dump_args arg;
1993 if (tc_qdisc_dump_ignore(q, false) ||
1994 *t_p < s_t || !q->ops->cl_ops ||
1995 (tcm->tcm_parent &&
1996 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1997 (*t_p)++;
1998 return 0;
2000 if (*t_p > s_t)
2001 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2002 arg.w.fn = qdisc_class_dump;
2003 arg.skb = skb;
2004 arg.cb = cb;
2005 arg.w.stop = 0;
2006 arg.w.skip = cb->args[1];
2007 arg.w.count = 0;
2008 q->ops->cl_ops->walk(q, &arg.w);
2009 cb->args[1] = arg.w.count;
2010 if (arg.w.stop)
2011 return -1;
2012 (*t_p)++;
2013 return 0;
2016 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2017 struct tcmsg *tcm, struct netlink_callback *cb,
2018 int *t_p, int s_t)
2020 struct Qdisc *q;
2021 int b;
2023 if (!root)
2024 return 0;
2026 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2027 return -1;
2029 if (!qdisc_dev(root))
2030 return 0;
2032 if (tcm->tcm_parent) {
2033 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2034 if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2035 return -1;
2036 return 0;
2038 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2039 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2040 return -1;
2043 return 0;
2046 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2048 struct tcmsg *tcm = nlmsg_data(cb->nlh);
2049 struct net *net = sock_net(skb->sk);
2050 struct netdev_queue *dev_queue;
2051 struct net_device *dev;
2052 int t, s_t;
2054 if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2055 return 0;
2056 dev = dev_get_by_index(net, tcm->tcm_ifindex);
2057 if (!dev)
2058 return 0;
2060 s_t = cb->args[0];
2061 t = 0;
2063 if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2064 goto done;
2066 dev_queue = dev_ingress_queue(dev);
2067 if (dev_queue &&
2068 tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2069 &t, s_t) < 0)
2070 goto done;
2072 done:
2073 cb->args[0] = t;
2075 dev_put(dev);
2076 return skb->len;
2079 #ifdef CONFIG_PROC_FS
2080 static int psched_show(struct seq_file *seq, void *v)
2082 seq_printf(seq, "%08x %08x %08x %08x\n",
2083 (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2084 1000000,
2085 (u32)NSEC_PER_SEC / hrtimer_resolution);
2087 return 0;
2090 static int psched_open(struct inode *inode, struct file *file)
2092 return single_open(file, psched_show, NULL);
2095 static const struct file_operations psched_fops = {
2096 .open = psched_open,
2097 .read = seq_read,
2098 .llseek = seq_lseek,
2099 .release = single_release,
2102 static int __net_init psched_net_init(struct net *net)
2104 struct proc_dir_entry *e;
2106 e = proc_create("psched", 0, net->proc_net, &psched_fops);
2107 if (e == NULL)
2108 return -ENOMEM;
2110 return 0;
2113 static void __net_exit psched_net_exit(struct net *net)
2115 remove_proc_entry("psched", net->proc_net);
2117 #else
2118 static int __net_init psched_net_init(struct net *net)
2120 return 0;
2123 static void __net_exit psched_net_exit(struct net *net)
2126 #endif
2128 static struct pernet_operations psched_net_ops = {
2129 .init = psched_net_init,
2130 .exit = psched_net_exit,
2133 static int __init pktsched_init(void)
2135 int err;
2137 err = register_pernet_subsys(&psched_net_ops);
2138 if (err) {
2139 pr_err("pktsched_init: "
2140 "cannot initialize per netns operations\n");
2141 return err;
2144 register_qdisc(&pfifo_fast_ops);
2145 register_qdisc(&pfifo_qdisc_ops);
2146 register_qdisc(&bfifo_qdisc_ops);
2147 register_qdisc(&pfifo_head_drop_qdisc_ops);
2148 register_qdisc(&mq_qdisc_ops);
2149 register_qdisc(&noqueue_qdisc_ops);
2151 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2152 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2153 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2155 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2156 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2157 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2160 return 0;
2163 subsys_initcall(pktsched_init);