/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/bitops.h>

#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the
   times determined by the algorithm hidden in it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make some sanity
   checks and do the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty, only that the
   discipline does not want to send anything at this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   the real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If the packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.
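
   As an illustration only (a sketch loosely modelled on the pfifo
   discipline in sch_fifo.c, not code from this file; the example_*
   names are invented), an enqueue/dequeue pair honouring this
   contract could look like:

	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
	{
		if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len)
			return __qdisc_enqueue_tail(skb, sch, &sch->q);
		return qdisc_drop(skb, sch);
	}

	static struct sk_buff *example_dequeue(struct Qdisc *sch)
	{
		return __qdisc_dequeue_head(sch, &sch->q);
	}

   Here qdisc_drop() accounts the drop and returns NET_XMIT_DROP, and
   a NULL from example_dequeue means only "nothing to send right now";
   emptiness is still signalled by sch->q.qlen == 0.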

   Auxiliary routines:

   ---requeue

   requeues a once dequeued packet. It is used for non-standard or
   just buggy devices, which can defer output even if dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
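
/*
 * For orientation only: a hypothetical discipline built from the routines
 * described above advertises them in a struct Qdisc_ops and hands it to
 * register_qdisc() (a sketch with invented example_* names; see
 * sch_fifo.c for a real minimal discipline):
 *
 *	static struct Qdisc_ops example_qdisc_ops = {
 *		.id		= "example",
 *		.priv_size	= 0,
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.requeue	= example_requeue,
 *		.init		= example_init,
 *		.reset		= example_reset,
 *		.destroy	= example_destroy,
 *		.change		= example_change,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	register_qdisc(&example_qdisc_ops);
 */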

/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);

/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/

/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (root qdisc, all its children, children of children etc.)
 */

static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	read_lock(&qdisc_tree_lock);
	q = __qdisc_lookup(dev, handle);
	read_unlock(&qdisc_tree_lock);
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while	(qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
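
/* Illustration: autohandle starts at TC_H_MAKE(0x80000000U, 0) and each
 * step adds TC_H_MAKE(0x10000U, 0), so kernel-allocated handles come out
 * as 8001:0, 8002:0, ... in tc's major:minor notation.
 */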

/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {  /* new */
			dev->qdisc_ingress = qdisc;
		}
	} else {
		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   The old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       u32 classid,
		       struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;

	if (parent == NULL) {
		if (q && q->flags&TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out3;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						sch->stats_lock,
						tca[TCA_RATE-1]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
#endif
		qdisc_lock_tree(dev);
		list_add_tail(&sch->list, &dev->qdisc_list);
		qdisc_unlock_tree(dev);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      sch->stats_lock, tca[TCA_RATE-1]);
#endif
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm;
	struct rtattr **tca;
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	tca = arg;
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor wanted to say
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API: we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the existing one.
				 */
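				/* Worked example (ours, not from this file):
				 * iproute2's "tc qdisc replace" sets
				 * NLM_F_CREATE|NLM_F_REPLACE, so an existing
				 * child of a different KIND is re-created and
				 * grafted below, while a bare "change" request
				 * with no flags and a matching KIND falls
				 * through to qdisc_change().
				 */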
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
		read_lock(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}


/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/

static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
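
	/* Worked example (ours): with root qdisc 1:0 installed, a request
	 * carrying parent=1:1 and handle=0:10 resolves below to qid=1:0
	 * and class handle 1:10; handle=0:0 instead asks for an identifier
	 * from the kernel pool.
	 */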

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle, consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);
	return err;
}

static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;
	struct gnet_dump d;
	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		t++;
	}
	read_unlock(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol, and asks the
   specific classifiers.
 */
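/* A typical call site (a sketch in our notation; compare the classify
 * helpers in sch_prio.c, where filter_list is that qdisc's private
 * filter chain):
 *
 *	struct tcf_result res;
 *	int result = tc_classify(skb, q->filter_list, &res);
 *
 *	if (result >= 0)
 *		... use res.classid (and res.class) to pick the class ...
 *	else
 *		... no filter matched; fall back to a default class ...
 */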
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
	struct tcf_result *res)
{
	int err = 0;
	__be16 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == __constant_htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (TC_ACT_RECLASSIFY == err) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
				tp = otp;

				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x reclassify is buggy, packet dropped\n",
					       tp->prio&0xffff, ntohs(tp->protocol));
					return TC_ACT_SHOT;
				}
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
				goto reclassify;
			} else {
				if (skb->tc_verd)
					skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
				return err;
			}
#else
			return err;
#endif
		}
	}
	return -1;
}

static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   psched_tick_per_us, psched_us_per_tick,
		   1000000, HZ);

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner		= THIS_MODULE,
	.open		= psched_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

#ifdef CONFIG_NET_SCH_CLK_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
EXPORT_SYMBOL(psched_clock_per_hz);
EXPORT_SYMBOL(psched_clock_scale);

psched_time_t psched_time_base;
cycles_t psched_time_mark;
EXPORT_SYMBOL(psched_time_mark);
EXPORT_SYMBOL(psched_time_base);

/*
 * Periodically adjust psched_time_base to avoid overflow
 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
 */
static void psched_tick(unsigned long);
static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
	if (sizeof(cycles_t) == sizeof(u32)) {
		psched_time_t dummy_stamp;
		PSCHED_GET_TIME(dummy_stamp);
		psched_timer.expires = jiffies + 1*HZ;
		add_timer(&psched_timer);
	}
}

int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;
	long rdelay;
	unsigned long stop;

	psched_tick(0);
	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {
		barrier();
	}
	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
	if (rdelay > delay)
		return -1;
	delay /= rdelay;
	psched_tick_per_us = delay;
	while ((delay >>= 1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1<<psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
	return 0;
}
#endif

static int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#ifdef CONFIG_NET_SCH_CLK_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Set up the rtnetlink links. This is done here to avoid
	   exporting a large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);

	return 0;
}

subsys_initcall(pktsched_init);

EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);
);