2 * net/sched/sch_api.c Packet scheduler API.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
18 #include <linux/config.h>
19 #include <linux/module.h>
20 #include <linux/types.h>
21 #include <linux/kernel.h>
22 #include <linux/sched.h>
23 #include <linux/string.h>
25 #include <linux/socket.h>
26 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/interrupt.h>
30 #include <linux/netdevice.h>
31 #include <linux/skbuff.h>
32 #include <linux/rtnetlink.h>
33 #include <linux/init.h>
34 #include <linux/proc_fs.h>
35 #include <linux/seq_file.h>
36 #include <linux/kmod.h>
37 #include <linux/list.h>
38 #include <linux/bitops.h>
41 #include <net/pkt_sched.h>
43 #include <asm/processor.h>
44 #include <asm/uaccess.h>
45 #include <asm/system.h>
47 static int qdisc_notify(struct sk_buff
*oskb
, struct nlmsghdr
*n
, u32 clid
,
48 struct Qdisc
*old
, struct Qdisc
*new);
49 static int tclass_notify(struct sk_buff
*oskb
, struct nlmsghdr
*n
,
50 struct Qdisc
*q
, unsigned long cl
, int event
);
57 This file consists of two interrelated parts:
59 1. queueing disciplines manager frontend.
60 2. traffic classes manager frontend.
62 Generally, queueing discipline ("qdisc") is a black box,
63 which is able to enqueue packets and to dequeue them (when
64 device is ready to send something) in order and at times
65 determined by algorithm hidden in it.
67    qdiscs are divided into two categories:
68 - "queues", which have no internal structure visible from outside.
69 - "schedulers", which split all the packets to "traffic classes",
70 using "packet classifiers" (look at cls_api.c)
72 In turn, classes may have child qdiscs (as rule, queues)
73 attached to them etc. etc. etc.
75 The goal of the routines in this file is to translate
76 information supplied by user in the form of handles
77 to more intelligible for kernel form, to make some sanity
78 checks and part of work, which is common to all qdiscs
79 and to provide rtnetlink notifications.
81 All real intelligent work is done inside qdisc modules.
85 Every discipline has two major routines: enqueue and dequeue.
89 dequeue usually returns a skb to send. It is allowed to return NULL,
90 but it does not mean that queue is empty, it just means that
91 discipline does not want to send anything this time.
92 Queue is really empty if q->q.qlen == 0.
93 For complicated disciplines with multiple queues q->q is not
94 real packet queue, but however q->q.qlen must be valid.
98 enqueue returns 0, if packet was enqueued successfully.
99 If packet (this one or another one) was dropped, it returns
101 NET_XMIT_DROP - this packet dropped
102 Expected action: do not backoff, but wait until queue will clear.
103 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
104 Expected action: backoff or ignore
105 NET_XMIT_POLICED - dropped by police.
106 Expected action: backoff or error to real-time apps.
112 requeues once dequeued packet. It is used for non-standard or
113 just buggy devices, which can defer output even if dev->tbusy=0.
117 returns qdisc to initial state: purge all buffers, clear all
118 timers, counters (except for statistics) etc.
122 initializes newly created qdisc.
126 destroys resources allocated by init and during lifetime of qdisc.
130 changes qdisc parameters.
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;
146 /* Register/unregister queueing discipline */
148 int register_qdisc(struct Qdisc_ops
*qops
)
150 struct Qdisc_ops
*q
, **qp
;
153 write_lock(&qdisc_mod_lock
);
154 for (qp
= &qdisc_base
; (q
= *qp
) != NULL
; qp
= &q
->next
)
155 if (!strcmp(qops
->id
, q
->id
))
158 if (qops
->enqueue
== NULL
)
159 qops
->enqueue
= noop_qdisc_ops
.enqueue
;
160 if (qops
->requeue
== NULL
)
161 qops
->requeue
= noop_qdisc_ops
.requeue
;
162 if (qops
->dequeue
== NULL
)
163 qops
->dequeue
= noop_qdisc_ops
.dequeue
;
169 write_unlock(&qdisc_mod_lock
);
173 int unregister_qdisc(struct Qdisc_ops
*qops
)
175 struct Qdisc_ops
*q
, **qp
;
178 write_lock(&qdisc_mod_lock
);
179 for (qp
= &qdisc_base
; (q
=*qp
)!=NULL
; qp
= &q
->next
)
187 write_unlock(&qdisc_mod_lock
);
191 /* We know handle. Find qdisc among all qdisc's attached to device
192 (root qdisc, all its children, children of children etc.)
195 struct Qdisc
*qdisc_lookup(struct net_device
*dev
, u32 handle
)
199 read_lock_bh(&qdisc_tree_lock
);
200 list_for_each_entry(q
, &dev
->qdisc_list
, list
) {
201 if (q
->handle
== handle
) {
202 read_unlock_bh(&qdisc_tree_lock
);
206 read_unlock_bh(&qdisc_tree_lock
);
210 static struct Qdisc
*qdisc_leaf(struct Qdisc
*p
, u32 classid
)
214 struct Qdisc_class_ops
*cops
= p
->ops
->cl_ops
;
218 cl
= cops
->get(p
, classid
);
222 leaf
= cops
->leaf(p
, cl
);
227 /* Find queueing discipline by name */
229 static struct Qdisc_ops
*qdisc_lookup_ops(struct rtattr
*kind
)
231 struct Qdisc_ops
*q
= NULL
;
234 read_lock(&qdisc_mod_lock
);
235 for (q
= qdisc_base
; q
; q
= q
->next
) {
236 if (rtattr_strcmp(kind
, q
->id
) == 0) {
237 if (!try_module_get(q
->owner
))
242 read_unlock(&qdisc_mod_lock
);
/* Cache of rate tables shared between qdisc instances (refcounted). */
static struct qdisc_rate_table *qdisc_rtab_list;
249 struct qdisc_rate_table
*qdisc_get_rtab(struct tc_ratespec
*r
, struct rtattr
*tab
)
251 struct qdisc_rate_table
*rtab
;
253 for (rtab
= qdisc_rtab_list
; rtab
; rtab
= rtab
->next
) {
254 if (memcmp(&rtab
->rate
, r
, sizeof(struct tc_ratespec
)) == 0) {
260 if (tab
== NULL
|| r
->rate
== 0 || r
->cell_log
== 0 || RTA_PAYLOAD(tab
) != 1024)
263 rtab
= kmalloc(sizeof(*rtab
), GFP_KERNEL
);
267 memcpy(rtab
->data
, RTA_DATA(tab
), 1024);
268 rtab
->next
= qdisc_rtab_list
;
269 qdisc_rtab_list
= rtab
;
274 void qdisc_put_rtab(struct qdisc_rate_table
*tab
)
276 struct qdisc_rate_table
*rtab
, **rtabp
;
278 if (!tab
|| --tab
->refcnt
)
281 for (rtabp
= &qdisc_rtab_list
; (rtab
=*rtabp
) != NULL
; rtabp
= &rtab
->next
) {
291 /* Allocate an unique handle from space managed by kernel */
293 static u32
qdisc_alloc_handle(struct net_device
*dev
)
296 static u32 autohandle
= TC_H_MAKE(0x80000000U
, 0);
299 autohandle
+= TC_H_MAKE(0x10000U
, 0);
300 if (autohandle
== TC_H_MAKE(TC_H_ROOT
, 0))
301 autohandle
= TC_H_MAKE(0x80000000U
, 0);
302 } while (qdisc_lookup(dev
, autohandle
) && --i
> 0);
304 return i
>0 ? autohandle
: 0;
307 /* Attach toplevel qdisc to device dev */
309 static struct Qdisc
*
310 dev_graft_qdisc(struct net_device
*dev
, struct Qdisc
*qdisc
)
312 struct Qdisc
*oqdisc
;
314 if (dev
->flags
& IFF_UP
)
317 qdisc_lock_tree(dev
);
318 if (qdisc
&& qdisc
->flags
&TCQ_F_INGRESS
) {
319 oqdisc
= dev
->qdisc_ingress
;
320 /* Prune old scheduler */
321 if (oqdisc
&& atomic_read(&oqdisc
->refcnt
) <= 1) {
324 dev
->qdisc_ingress
= NULL
;
326 dev
->qdisc_ingress
= qdisc
;
331 oqdisc
= dev
->qdisc_sleeping
;
333 /* Prune old scheduler */
334 if (oqdisc
&& atomic_read(&oqdisc
->refcnt
) <= 1)
337 /* ... and graft new one */
340 dev
->qdisc_sleeping
= qdisc
;
341 dev
->qdisc
= &noop_qdisc
;
344 qdisc_unlock_tree(dev
);
346 if (dev
->flags
& IFF_UP
)
353 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
356 Old qdisc is not destroyed but returned in *old.
359 static int qdisc_graft(struct net_device
*dev
, struct Qdisc
*parent
,
361 struct Qdisc
*new, struct Qdisc
**old
)
364 struct Qdisc
*q
= *old
;
367 if (parent
== NULL
) {
368 if (q
&& q
->flags
&TCQ_F_INGRESS
) {
369 *old
= dev_graft_qdisc(dev
, q
);
371 *old
= dev_graft_qdisc(dev
, new);
374 struct Qdisc_class_ops
*cops
= parent
->ops
->cl_ops
;
379 unsigned long cl
= cops
->get(parent
, classid
);
381 err
= cops
->graft(parent
, cl
, new, old
);
383 new->parent
= classid
;
384 cops
->put(parent
, cl
);
392 Allocate and initialize new qdisc.
394 Parameters are passed via opt.
397 static struct Qdisc
*
398 qdisc_create(struct net_device
*dev
, u32 handle
, struct rtattr
**tca
, int *errp
)
401 struct rtattr
*kind
= tca
[TCA_KIND
-1];
404 struct Qdisc_ops
*ops
;
407 ops
= qdisc_lookup_ops(kind
);
409 if (ops
== NULL
&& kind
!= NULL
) {
411 if (rtattr_strlcpy(name
, kind
, IFNAMSIZ
) < IFNAMSIZ
) {
412 /* We dropped the RTNL semaphore in order to
413 * perform the module load. So, even if we
414 * succeeded in loading the module we have to
415 * tell the caller to replay the request. We
416 * indicate this using -EAGAIN.
417 * We replay the request because the device may
418 * go away in the mean time.
421 request_module("sch_%s", name
);
423 ops
= qdisc_lookup_ops(kind
);
425 /* We will try again qdisc_lookup_ops,
426 * so don't keep a reference.
428 module_put(ops
->owner
);
440 /* ensure that the Qdisc and the private data are 32-byte aligned */
441 size
= ((sizeof(*sch
) + QDISC_ALIGN_CONST
) & ~QDISC_ALIGN_CONST
);
442 size
+= ops
->priv_size
+ QDISC_ALIGN_CONST
;
444 p
= kmalloc(size
, GFP_KERNEL
);
449 sch
= (struct Qdisc
*)(((unsigned long)p
+ QDISC_ALIGN_CONST
)
450 & ~QDISC_ALIGN_CONST
);
451 sch
->padded
= (char *)sch
- (char *)p
;
453 INIT_LIST_HEAD(&sch
->list
);
454 skb_queue_head_init(&sch
->q
);
456 if (handle
== TC_H_INGRESS
)
457 sch
->flags
|= TCQ_F_INGRESS
;
460 sch
->enqueue
= ops
->enqueue
;
461 sch
->dequeue
= ops
->dequeue
;
464 atomic_set(&sch
->refcnt
, 1);
465 sch
->stats_lock
= &dev
->queue_lock
;
467 handle
= qdisc_alloc_handle(dev
);
473 if (handle
== TC_H_INGRESS
)
474 sch
->handle
=TC_H_MAKE(TC_H_INGRESS
, 0);
476 sch
->handle
= handle
;
478 if (!ops
->init
|| (err
= ops
->init(sch
, tca
[TCA_OPTIONS
-1])) == 0) {
479 qdisc_lock_tree(dev
);
480 list_add_tail(&sch
->list
, &dev
->qdisc_list
);
481 qdisc_unlock_tree(dev
);
483 #ifdef CONFIG_NET_ESTIMATOR
485 gen_new_estimator(&sch
->bstats
, &sch
->rate_est
,
486 sch
->stats_lock
, tca
[TCA_RATE
-1]);
493 module_put(ops
->owner
);
501 static int qdisc_change(struct Qdisc
*sch
, struct rtattr
**tca
)
503 if (tca
[TCA_OPTIONS
-1]) {
506 if (sch
->ops
->change
== NULL
)
508 err
= sch
->ops
->change(sch
, tca
[TCA_OPTIONS
-1]);
512 #ifdef CONFIG_NET_ESTIMATOR
514 gen_replace_estimator(&sch
->bstats
, &sch
->rate_est
,
515 sch
->stats_lock
, tca
[TCA_RATE
-1]);
520 struct check_loop_arg
522 struct qdisc_walker w
;
527 static int check_loop_fn(struct Qdisc
*q
, unsigned long cl
, struct qdisc_walker
*w
);
529 static int check_loop(struct Qdisc
*q
, struct Qdisc
*p
, int depth
)
531 struct check_loop_arg arg
;
533 if (q
->ops
->cl_ops
== NULL
)
536 arg
.w
.stop
= arg
.w
.skip
= arg
.w
.count
= 0;
537 arg
.w
.fn
= check_loop_fn
;
540 q
->ops
->cl_ops
->walk(q
, &arg
.w
);
541 return arg
.w
.stop
? -ELOOP
: 0;
545 check_loop_fn(struct Qdisc
*q
, unsigned long cl
, struct qdisc_walker
*w
)
548 struct Qdisc_class_ops
*cops
= q
->ops
->cl_ops
;
549 struct check_loop_arg
*arg
= (struct check_loop_arg
*)w
;
551 leaf
= cops
->leaf(q
, cl
);
553 if (leaf
== arg
->p
|| arg
->depth
> 7)
555 return check_loop(leaf
, arg
->p
, arg
->depth
+ 1);
564 static int tc_get_qdisc(struct sk_buff
*skb
, struct nlmsghdr
*n
, void *arg
)
566 struct tcmsg
*tcm
= NLMSG_DATA(n
);
567 struct rtattr
**tca
= arg
;
568 struct net_device
*dev
;
569 u32 clid
= tcm
->tcm_parent
;
570 struct Qdisc
*q
= NULL
;
571 struct Qdisc
*p
= NULL
;
574 if ((dev
= __dev_get_by_index(tcm
->tcm_ifindex
)) == NULL
)
578 if (clid
!= TC_H_ROOT
) {
579 if (TC_H_MAJ(clid
) != TC_H_MAJ(TC_H_INGRESS
)) {
580 if ((p
= qdisc_lookup(dev
, TC_H_MAJ(clid
))) == NULL
)
582 q
= qdisc_leaf(p
, clid
);
583 } else { /* ingress */
584 q
= dev
->qdisc_ingress
;
587 q
= dev
->qdisc_sleeping
;
592 if (tcm
->tcm_handle
&& q
->handle
!= tcm
->tcm_handle
)
595 if ((q
= qdisc_lookup(dev
, tcm
->tcm_handle
)) == NULL
)
599 if (tca
[TCA_KIND
-1] && rtattr_strcmp(tca
[TCA_KIND
-1], q
->ops
->id
))
602 if (n
->nlmsg_type
== RTM_DELQDISC
) {
607 if ((err
= qdisc_graft(dev
, p
, clid
, NULL
, &q
)) != 0)
610 qdisc_notify(skb
, n
, clid
, q
, NULL
);
611 spin_lock_bh(&dev
->queue_lock
);
613 spin_unlock_bh(&dev
->queue_lock
);
616 qdisc_notify(skb
, n
, clid
, NULL
, q
);
625 static int tc_modify_qdisc(struct sk_buff
*skb
, struct nlmsghdr
*n
, void *arg
)
629 struct net_device
*dev
;
635 /* Reinit, just in case something touches this. */
638 clid
= tcm
->tcm_parent
;
641 if ((dev
= __dev_get_by_index(tcm
->tcm_ifindex
)) == NULL
)
645 if (clid
!= TC_H_ROOT
) {
646 if (clid
!= TC_H_INGRESS
) {
647 if ((p
= qdisc_lookup(dev
, TC_H_MAJ(clid
))) == NULL
)
649 q
= qdisc_leaf(p
, clid
);
650 } else { /*ingress */
651 q
= dev
->qdisc_ingress
;
654 q
= dev
->qdisc_sleeping
;
657 /* It may be default qdisc, ignore it */
658 if (q
&& q
->handle
== 0)
661 if (!q
|| !tcm
->tcm_handle
|| q
->handle
!= tcm
->tcm_handle
) {
662 if (tcm
->tcm_handle
) {
663 if (q
&& !(n
->nlmsg_flags
&NLM_F_REPLACE
))
665 if (TC_H_MIN(tcm
->tcm_handle
))
667 if ((q
= qdisc_lookup(dev
, tcm
->tcm_handle
)) == NULL
)
669 if (n
->nlmsg_flags
&NLM_F_EXCL
)
671 if (tca
[TCA_KIND
-1] && rtattr_strcmp(tca
[TCA_KIND
-1], q
->ops
->id
))
674 (p
&& check_loop(q
, p
, 0)))
676 atomic_inc(&q
->refcnt
);
682 /* This magic test requires explanation.
684 * We know, that some child q is already
685 * attached to this parent and have choice:
686 * either to change it or to create/graft new one.
688 * 1. We are allowed to create/graft only
689 * if CREATE and REPLACE flags are set.
691 * 2. If EXCL is set, requestor wanted to say,
692 * that qdisc tcm_handle is not expected
693 * to exist, so that we choose create/graft too.
695 * 3. The last case is when no flags are set.
696 * Alas, it is sort of hole in API, we
697 * cannot decide what to do unambiguously.
698 * For now we select create/graft, if
699 * user gave KIND, which does not match existing.
701 if ((n
->nlmsg_flags
&NLM_F_CREATE
) &&
702 (n
->nlmsg_flags
&NLM_F_REPLACE
) &&
703 ((n
->nlmsg_flags
&NLM_F_EXCL
) ||
705 rtattr_strcmp(tca
[TCA_KIND
-1], q
->ops
->id
))))
710 if (!tcm
->tcm_handle
)
712 q
= qdisc_lookup(dev
, tcm
->tcm_handle
);
715 /* Change qdisc parameters */
718 if (n
->nlmsg_flags
&NLM_F_EXCL
)
720 if (tca
[TCA_KIND
-1] && rtattr_strcmp(tca
[TCA_KIND
-1], q
->ops
->id
))
722 err
= qdisc_change(q
, tca
);
724 qdisc_notify(skb
, n
, clid
, NULL
, q
);
728 if (!(n
->nlmsg_flags
&NLM_F_CREATE
))
730 if (clid
== TC_H_INGRESS
)
731 q
= qdisc_create(dev
, tcm
->tcm_parent
, tca
, &err
);
733 q
= qdisc_create(dev
, tcm
->tcm_handle
, tca
, &err
);
742 struct Qdisc
*old_q
= NULL
;
743 err
= qdisc_graft(dev
, p
, clid
, q
, &old_q
);
746 spin_lock_bh(&dev
->queue_lock
);
748 spin_unlock_bh(&dev
->queue_lock
);
752 qdisc_notify(skb
, n
, clid
, old_q
, q
);
754 spin_lock_bh(&dev
->queue_lock
);
755 qdisc_destroy(old_q
);
756 spin_unlock_bh(&dev
->queue_lock
);
762 static int tc_fill_qdisc(struct sk_buff
*skb
, struct Qdisc
*q
, u32 clid
,
763 u32 pid
, u32 seq
, unsigned flags
, int event
)
766 struct nlmsghdr
*nlh
;
767 unsigned char *b
= skb
->tail
;
770 nlh
= NLMSG_PUT(skb
, pid
, seq
, event
, sizeof(*tcm
));
771 nlh
->nlmsg_flags
= flags
;
772 tcm
= NLMSG_DATA(nlh
);
773 tcm
->tcm_family
= AF_UNSPEC
;
774 tcm
->tcm_ifindex
= q
->dev
->ifindex
;
775 tcm
->tcm_parent
= clid
;
776 tcm
->tcm_handle
= q
->handle
;
777 tcm
->tcm_info
= atomic_read(&q
->refcnt
);
778 RTA_PUT(skb
, TCA_KIND
, IFNAMSIZ
, q
->ops
->id
);
779 if (q
->ops
->dump
&& q
->ops
->dump(q
, skb
) < 0)
781 q
->qstats
.qlen
= q
->q
.qlen
;
783 if (gnet_stats_start_copy_compat(skb
, TCA_STATS2
, TCA_STATS
,
784 TCA_XSTATS
, q
->stats_lock
, &d
) < 0)
787 if (q
->ops
->dump_stats
&& q
->ops
->dump_stats(q
, &d
) < 0)
790 if (gnet_stats_copy_basic(&d
, &q
->bstats
) < 0 ||
791 #ifdef CONFIG_NET_ESTIMATOR
792 gnet_stats_copy_rate_est(&d
, &q
->rate_est
) < 0 ||
794 gnet_stats_copy_queue(&d
, &q
->qstats
) < 0)
797 if (gnet_stats_finish_copy(&d
) < 0)
800 nlh
->nlmsg_len
= skb
->tail
- b
;
805 skb_trim(skb
, b
- skb
->data
);
809 static int qdisc_notify(struct sk_buff
*oskb
, struct nlmsghdr
*n
,
810 u32 clid
, struct Qdisc
*old
, struct Qdisc
*new)
813 u32 pid
= oskb
? NETLINK_CB(oskb
).pid
: 0;
815 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
819 if (old
&& old
->handle
) {
820 if (tc_fill_qdisc(skb
, old
, clid
, pid
, n
->nlmsg_seq
, 0, RTM_DELQDISC
) < 0)
824 if (tc_fill_qdisc(skb
, new, clid
, pid
, n
->nlmsg_seq
, old
? NLM_F_REPLACE
: 0, RTM_NEWQDISC
) < 0)
829 return rtnetlink_send(skb
, pid
, RTMGRP_TC
, n
->nlmsg_flags
&NLM_F_ECHO
);
836 static int tc_dump_qdisc(struct sk_buff
*skb
, struct netlink_callback
*cb
)
840 struct net_device
*dev
;
844 s_q_idx
= q_idx
= cb
->args
[1];
845 read_lock(&dev_base_lock
);
846 for (dev
=dev_base
, idx
=0; dev
; dev
= dev
->next
, idx
++) {
851 read_lock_bh(&qdisc_tree_lock
);
853 list_for_each_entry(q
, &dev
->qdisc_list
, list
) {
854 if (q_idx
< s_q_idx
) {
858 if (tc_fill_qdisc(skb
, q
, q
->parent
, NETLINK_CB(cb
->skb
).pid
,
859 cb
->nlh
->nlmsg_seq
, NLM_F_MULTI
, RTM_NEWQDISC
) <= 0) {
860 read_unlock_bh(&qdisc_tree_lock
);
865 read_unlock_bh(&qdisc_tree_lock
);
869 read_unlock(&dev_base_lock
);
879 /************************************************
880 * Traffic classes manipulation. *
881 ************************************************/
885 static int tc_ctl_tclass(struct sk_buff
*skb
, struct nlmsghdr
*n
, void *arg
)
887 struct tcmsg
*tcm
= NLMSG_DATA(n
);
888 struct rtattr
**tca
= arg
;
889 struct net_device
*dev
;
890 struct Qdisc
*q
= NULL
;
891 struct Qdisc_class_ops
*cops
;
892 unsigned long cl
= 0;
893 unsigned long new_cl
;
894 u32 pid
= tcm
->tcm_parent
;
895 u32 clid
= tcm
->tcm_handle
;
896 u32 qid
= TC_H_MAJ(clid
);
899 if ((dev
= __dev_get_by_index(tcm
->tcm_ifindex
)) == NULL
)
903 parent == TC_H_UNSPEC - unspecified parent.
904 parent == TC_H_ROOT - class is root, which has no parent.
905 parent == X:0 - parent is root class.
906 parent == X:Y - parent is a node in hierarchy.
907 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
909 handle == 0:0 - generate handle from kernel pool.
910 handle == 0:Y - class is X:Y, where X:0 is qdisc.
911 handle == X:Y - clear.
912 handle == X:0 - root class.
915 /* Step 1. Determine qdisc handle X:0 */
917 if (pid
!= TC_H_ROOT
) {
918 u32 qid1
= TC_H_MAJ(pid
);
921 /* If both majors are known, they must be identical. */
927 qid
= dev
->qdisc_sleeping
->handle
;
929 /* Now qid is genuine qdisc handle consistent
930 both with parent and child.
932 TC_H_MAJ(pid) still may be unspecified, complete it now.
935 pid
= TC_H_MAKE(qid
, pid
);
938 qid
= dev
->qdisc_sleeping
->handle
;
941 /* OK. Locate qdisc */
942 if ((q
= qdisc_lookup(dev
, qid
)) == NULL
)
945 /* An check that it supports classes */
946 cops
= q
->ops
->cl_ops
;
950 /* Now try to get class */
952 if (pid
== TC_H_ROOT
)
955 clid
= TC_H_MAKE(qid
, clid
);
958 cl
= cops
->get(q
, clid
);
962 if (n
->nlmsg_type
!= RTM_NEWTCLASS
|| !(n
->nlmsg_flags
&NLM_F_CREATE
))
965 switch (n
->nlmsg_type
) {
968 if (n
->nlmsg_flags
&NLM_F_EXCL
)
972 err
= cops
->delete(q
, cl
);
974 tclass_notify(skb
, n
, q
, cl
, RTM_DELTCLASS
);
977 err
= tclass_notify(skb
, n
, q
, cl
, RTM_NEWTCLASS
);
986 err
= cops
->change(q
, clid
, pid
, tca
, &new_cl
);
988 tclass_notify(skb
, n
, q
, new_cl
, RTM_NEWTCLASS
);
998 static int tc_fill_tclass(struct sk_buff
*skb
, struct Qdisc
*q
,
1000 u32 pid
, u32 seq
, unsigned flags
, int event
)
1003 struct nlmsghdr
*nlh
;
1004 unsigned char *b
= skb
->tail
;
1006 struct Qdisc_class_ops
*cl_ops
= q
->ops
->cl_ops
;
1008 nlh
= NLMSG_PUT(skb
, pid
, seq
, event
, sizeof(*tcm
));
1009 nlh
->nlmsg_flags
= flags
;
1010 tcm
= NLMSG_DATA(nlh
);
1011 tcm
->tcm_family
= AF_UNSPEC
;
1012 tcm
->tcm_ifindex
= q
->dev
->ifindex
;
1013 tcm
->tcm_parent
= q
->handle
;
1014 tcm
->tcm_handle
= q
->handle
;
1016 RTA_PUT(skb
, TCA_KIND
, IFNAMSIZ
, q
->ops
->id
);
1017 if (cl_ops
->dump
&& cl_ops
->dump(q
, cl
, skb
, tcm
) < 0)
1018 goto rtattr_failure
;
1020 if (gnet_stats_start_copy_compat(skb
, TCA_STATS2
, TCA_STATS
,
1021 TCA_XSTATS
, q
->stats_lock
, &d
) < 0)
1022 goto rtattr_failure
;
1024 if (cl_ops
->dump_stats
&& cl_ops
->dump_stats(q
, cl
, &d
) < 0)
1025 goto rtattr_failure
;
1027 if (gnet_stats_finish_copy(&d
) < 0)
1028 goto rtattr_failure
;
1030 nlh
->nlmsg_len
= skb
->tail
- b
;
1035 skb_trim(skb
, b
- skb
->data
);
1039 static int tclass_notify(struct sk_buff
*oskb
, struct nlmsghdr
*n
,
1040 struct Qdisc
*q
, unsigned long cl
, int event
)
1042 struct sk_buff
*skb
;
1043 u32 pid
= oskb
? NETLINK_CB(oskb
).pid
: 0;
1045 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
1049 if (tc_fill_tclass(skb
, q
, cl
, pid
, n
->nlmsg_seq
, 0, event
) < 0) {
1054 return rtnetlink_send(skb
, pid
, RTMGRP_TC
, n
->nlmsg_flags
&NLM_F_ECHO
);
1057 struct qdisc_dump_args
1059 struct qdisc_walker w
;
1060 struct sk_buff
*skb
;
1061 struct netlink_callback
*cb
;
1064 static int qdisc_class_dump(struct Qdisc
*q
, unsigned long cl
, struct qdisc_walker
*arg
)
1066 struct qdisc_dump_args
*a
= (struct qdisc_dump_args
*)arg
;
1068 return tc_fill_tclass(a
->skb
, q
, cl
, NETLINK_CB(a
->cb
->skb
).pid
,
1069 a
->cb
->nlh
->nlmsg_seq
, NLM_F_MULTI
, RTM_NEWTCLASS
);
1072 static int tc_dump_tclass(struct sk_buff
*skb
, struct netlink_callback
*cb
)
1076 struct net_device
*dev
;
1078 struct tcmsg
*tcm
= (struct tcmsg
*)NLMSG_DATA(cb
->nlh
);
1079 struct qdisc_dump_args arg
;
1081 if (cb
->nlh
->nlmsg_len
< NLMSG_LENGTH(sizeof(*tcm
)))
1083 if ((dev
= dev_get_by_index(tcm
->tcm_ifindex
)) == NULL
)
1089 read_lock_bh(&qdisc_tree_lock
);
1090 list_for_each_entry(q
, &dev
->qdisc_list
, list
) {
1091 if (t
< s_t
|| !q
->ops
->cl_ops
||
1093 TC_H_MAJ(tcm
->tcm_parent
) != q
->handle
)) {
1098 memset(&cb
->args
[1], 0, sizeof(cb
->args
)-sizeof(cb
->args
[0]));
1099 arg
.w
.fn
= qdisc_class_dump
;
1103 arg
.w
.skip
= cb
->args
[1];
1105 q
->ops
->cl_ops
->walk(q
, &arg
.w
);
1106 cb
->args
[1] = arg
.w
.count
;
1111 read_unlock_bh(&qdisc_tree_lock
);
1119 /* Main classifier routine: scans classifier chain attached
1120 to this qdisc, (optionally) tests for protocol and asks
1121 specific classifiers.
1123 int tc_classify(struct sk_buff
*skb
, struct tcf_proto
*tp
,
1124 struct tcf_result
*res
)
1127 u32 protocol
= skb
->protocol
;
1128 #ifdef CONFIG_NET_CLS_ACT
1129 struct tcf_proto
*otp
= tp
;
1132 protocol
= skb
->protocol
;
1134 for ( ; tp
; tp
= tp
->next
) {
1135 if ((tp
->protocol
== protocol
||
1136 tp
->protocol
== __constant_htons(ETH_P_ALL
)) &&
1137 (err
= tp
->classify(skb
, tp
, res
)) >= 0) {
1138 #ifdef CONFIG_NET_CLS_ACT
1139 if ( TC_ACT_RECLASSIFY
== err
) {
1140 __u32 verd
= (__u32
) G_TC_VERD(skb
->tc_verd
);
1143 if (MAX_REC_LOOP
< verd
++) {
1144 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1145 tp
->prio
&0xffff, ntohs(tp
->protocol
));
1148 skb
->tc_verd
= SET_TC_VERD(skb
->tc_verd
,verd
);
1152 skb
->tc_verd
= SET_TC_VERD(skb
->tc_verd
,0);
/* Scheduler clock parameters; filled in by pktsched_init(). */
static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;
1168 #ifdef CONFIG_PROC_FS
1169 static int psched_show(struct seq_file
*seq
, void *v
)
1171 seq_printf(seq
, "%08x %08x %08x %08x\n",
1172 psched_tick_per_us
, psched_us_per_tick
,
1178 static int psched_open(struct inode
*inode
, struct file
*file
)
1180 return single_open(file
, psched_show
, PDE(inode
)->data
);
1183 static struct file_operations psched_fops
= {
1184 .owner
= THIS_MODULE
,
1185 .open
= psched_open
,
1187 .llseek
= seq_lseek
,
1188 .release
= single_release
,
1192 #ifdef CONFIG_NET_SCH_CLK_CPU
1193 psched_tdiff_t psched_clock_per_hz
;
1194 int psched_clock_scale
;
1195 EXPORT_SYMBOL(psched_clock_per_hz
);
1196 EXPORT_SYMBOL(psched_clock_scale
);
1198 psched_time_t psched_time_base
;
1199 cycles_t psched_time_mark
;
1200 EXPORT_SYMBOL(psched_time_mark
);
1201 EXPORT_SYMBOL(psched_time_base
);
1204 * Periodically adjust psched_time_base to avoid overflow
1205 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
1207 static void psched_tick(unsigned long);
1208 static struct timer_list psched_timer
= TIMER_INITIALIZER(psched_tick
, 0, 0);
1210 static void psched_tick(unsigned long dummy
)
1212 if (sizeof(cycles_t
) == sizeof(u32
)) {
1213 psched_time_t dummy_stamp
;
1214 PSCHED_GET_TIME(dummy_stamp
);
1215 psched_timer
.expires
= jiffies
+ 1*HZ
;
1216 add_timer(&psched_timer
);
1220 int __init
psched_calibrate_clock(void)
1222 psched_time_t stamp
, stamp1
;
1223 struct timeval tv
, tv1
;
1224 psched_tdiff_t delay
;
1229 stop
= jiffies
+ HZ
/10;
1230 PSCHED_GET_TIME(stamp
);
1231 do_gettimeofday(&tv
);
1232 while (time_before(jiffies
, stop
)) {
1236 PSCHED_GET_TIME(stamp1
);
1237 do_gettimeofday(&tv1
);
1239 delay
= PSCHED_TDIFF(stamp1
, stamp
);
1240 rdelay
= tv1
.tv_usec
- tv
.tv_usec
;
1241 rdelay
+= (tv1
.tv_sec
- tv
.tv_sec
)*1000000;
1245 psched_tick_per_us
= delay
;
1246 while ((delay
>>=1) != 0)
1247 psched_clock_scale
++;
1248 psched_us_per_tick
= 1<<psched_clock_scale
;
1249 psched_clock_per_hz
= (psched_tick_per_us
*(1000000/HZ
))>>psched_clock_scale
;
1254 static int __init
pktsched_init(void)
1256 struct rtnetlink_link
*link_p
;
1258 #ifdef CONFIG_NET_SCH_CLK_CPU
1259 if (psched_calibrate_clock() < 0)
1261 #elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1262 psched_tick_per_us
= HZ
<<PSCHED_JSCALE
;
1263 psched_us_per_tick
= 1000000;
1266 link_p
= rtnetlink_links
[PF_UNSPEC
];
1268 /* Setup rtnetlink links. It is made here to avoid
1269 exporting large number of public symbols.
1273 link_p
[RTM_NEWQDISC
-RTM_BASE
].doit
= tc_modify_qdisc
;
1274 link_p
[RTM_DELQDISC
-RTM_BASE
].doit
= tc_get_qdisc
;
1275 link_p
[RTM_GETQDISC
-RTM_BASE
].doit
= tc_get_qdisc
;
1276 link_p
[RTM_GETQDISC
-RTM_BASE
].dumpit
= tc_dump_qdisc
;
1277 link_p
[RTM_NEWTCLASS
-RTM_BASE
].doit
= tc_ctl_tclass
;
1278 link_p
[RTM_DELTCLASS
-RTM_BASE
].doit
= tc_ctl_tclass
;
1279 link_p
[RTM_GETTCLASS
-RTM_BASE
].doit
= tc_ctl_tclass
;
1280 link_p
[RTM_GETTCLASS
-RTM_BASE
].dumpit
= tc_dump_tclass
;
1283 register_qdisc(&pfifo_qdisc_ops
);
1284 register_qdisc(&bfifo_qdisc_ops
);
1285 proc_net_fops_create("psched", 0, &psched_fops
);
subsys_initcall(pktsched_init);

EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);