/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/bitops.h>

#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);
/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box that is
   able to enqueue packets and to dequeue them (when the device is
   ready to send something) in an order and at times determined by the
   algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes"
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a form
   more intelligible to the kernel, to perform sanity checks and the
   part of the work that is common to all qdiscs, and to provide
   rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns a
   non-zero error code:
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or return an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a once-dequeued packet. It is used for non-standard or
   just buggy devices, which can defer output even if dev->tbusy == 0.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears
   all timers and counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of
   the qdisc.

   ---change

   changes qdisc parameters.
 */
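
/*
 * Illustrative sketch only (kept under #if 0 so it is never compiled):
 * how the routines described above plug into a Qdisc_ops. The qdisc
 * name "example" and its hard limit of 128 packets are hypothetical;
 * the qdisc_*() FIFO helpers are the generic ones from
 * include/net/sch_generic.h that sch_fifo.c itself uses.
 */
#if 0
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	/* enqueue: 0 on success, a NET_XMIT_* code otherwise. */
	if (likely(skb_queue_len(&sch->q) < 128))
		return qdisc_enqueue_tail(skb, sch);
	return qdisc_reshape_fail(skb, sch);	/* usually NET_XMIT_DROP */
}

static struct Qdisc_ops example_qdisc_ops = {
	.id		= "example",
	.priv_size	= 0,
	.enqueue	= example_enqueue,
	.dequeue	= qdisc_dequeue_head,	/* NULL when nothing to send */
	.requeue	= qdisc_requeue,	/* put a dequeued skb back */
	.drop		= qdisc_queue_drop,
	.reset		= qdisc_reset_queue,	/* purge buffers, clear timers */
	.owner		= THIS_MODULE,
};

/* A module would then call register_qdisc(&example_qdisc_ops) from its
 * init path and unregister_qdisc() on unload, just as pktsched_init()
 * below does for pfifo/bfifo.
 */
#endif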
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);
/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/

/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;
/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (root qdisc, all its children, children of children, etc.)
 */

static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	read_lock(&qdisc_tree_lock);
	q = __qdisc_lookup(dev, handle);
	read_unlock(&qdisc_tree_lock);
	return q;
}
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}
/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}
static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
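
/*
 * Sketch only (under #if 0, not compiled): how a shaping qdisc typically
 * pairs qdisc_get_rtab()/qdisc_put_rtab(). The attribute layout
 * (TCA_TBF_PARMS/TCA_TBF_RTAB) is the one sch_tbf.c uses; the "example_"
 * names and the single static pointer are hypothetical simplifications.
 */
#if 0
static struct qdisc_rate_table *example_rtab;

static int example_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct rtattr *tb[TCA_TBF_MAX];
	struct tc_tbf_qopt *qopt;

	if (opt == NULL ||
	    rtattr_parse_nested(tb, TCA_TBF_MAX, opt) < 0 ||
	    tb[TCA_TBF_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
		return -EINVAL;
	qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);

	/* Takes a reference; identical ratespecs share one table. */
	example_rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
	return example_rtab ? 0 : -EINVAL;
}

static void example_destroy(struct Qdisc *sch)
{
	qdisc_put_rtab(example_rtab);	/* drops the shared reference */
}
#endif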
/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
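
/* For example, successive kernel-allocated handles come out as
 * 8001:0, 8002:0, ... since adding TC_H_MAKE(0x10000U, 0) advances the
 * 16-bit major number by one each time.
 */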
/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {  /* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   Old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       u32 classid,
		       struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;


	if (parent == NULL) {
		if (q && q->flags&TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}
/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* The replayed request will call
				 * qdisc_lookup_ops again, so don't keep
				 * a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out3;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						sch->stats_lock,
						tca[TCA_RATE-1]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
#endif
		qdisc_lock_tree(dev);
		list_add_tail(&sch->list, &dev->qdisc_list);
		qdisc_unlock_tree(dev);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}
static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      sch->stats_lock, tca[TCA_RATE-1]);
#endif
	return 0;
}
struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}
/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm;
	struct rtattr **tca;
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	tca = arg;
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;
		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a
				 *   new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags are
				 *   set.
				 *
				 *   2. If EXCL is set, the requestor asserted
				 *   that the qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API: we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND that does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}
graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
		read_lock(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;
	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
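
	/* A concrete (hypothetical) example: with root qdisc 1:0 on the
	 * device, parent == 1:0 with handle == 0:10 resolves to class
	 * 1:10 of qdisc 1:0, and a bare parent == 0:20 is completed
	 * below to 1:20 once the qdisc major is known.
	 */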
	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate the qdisc. */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes. */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;
	/* Now try to get the class. */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;
	struct gnet_dump d;
	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
struct qdisc_dump_args
{
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}
	read_unlock(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol and asks
   specific classifiers.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == __constant_htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (TC_ACT_RECLASSIFY == err) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
				tp = otp;

				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x: reclassify loop is buggy, packet dropped\n",
					       tp->prio&0xffff, ntohs(tp->protocol));
					return TC_ACT_SHOT;
				}
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
				goto reclassify;
			} else {
				if (skb->tc_verd)
					skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
				return err;
			}
#else

			return err;
#endif
		}
	}
	return -1;
}
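
/*
 * Sketch only (under #if 0, not compiled): how a classful qdisc's
 * enqueue path typically consults tc_classify(), modeled on sch_prio.c.
 * The "example_band" name and the filter-list parameter are hypothetical.
 */
#if 0
static unsigned int example_band(struct sk_buff *skb, struct tcf_proto *fl)
{
	struct tcf_result res;

	/* A return >= 0 means some classifier in the chain matched. */
	if (tc_classify(skb, fl, &res) >= 0)
		return TC_H_MIN(res.classid);	/* minor number selects the band */
	return 0;				/* no match: default band */
}
#endif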
static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		      psched_tick_per_us, psched_us_per_tick,
		      1000000, HZ);

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif
#ifdef CONFIG_NET_SCH_CLK_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
EXPORT_SYMBOL(psched_clock_per_hz);
EXPORT_SYMBOL(psched_clock_scale);

psched_time_t psched_time_base;
cycles_t psched_time_mark;
EXPORT_SYMBOL(psched_time_mark);
EXPORT_SYMBOL(psched_time_base);

/*
 * Periodically adjust psched_time_base to avoid overflow
 * with 32-bit get_cycles(). Safe up to a 4 GHz CPU.
 */
static void psched_tick(unsigned long);
static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
	if (sizeof(cycles_t) == sizeof(u32)) {
		psched_time_t dummy_stamp;
		PSCHED_GET_TIME(dummy_stamp);
		psched_timer.expires = jiffies + 1*HZ;
		add_timer(&psched_timer);
	}
}
int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;
	long rdelay;
	unsigned long stop;

	psched_tick(0);
	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {
		barrier();
		cpu_relax();
	}
	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
	if (rdelay > delay)
		return -1;
	delay /= rdelay;
	psched_tick_per_us = delay;
	while ((delay >>= 1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1<<psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
	return 0;
}
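
/*
 * Worked example with hypothetical numbers (a 1 GHz cycle counter):
 * over the ~0.1 s busy-wait, delay ~= 10^8 ticks and rdelay ~= 10^5 us,
 * so psched_tick_per_us = 1000. Halving 1000 down to zero takes 9 shifts,
 * giving psched_clock_scale = 9 and psched_us_per_tick = 1 << 9 = 512;
 * with HZ = 1000, psched_clock_per_hz = (1000 * 1000) >> 9 = 1953.
 */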
#endif
static int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#ifdef CONFIG_NET_SCH_CLK_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Set up the rtnetlink links. It is done here to avoid
	   exporting a large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);

	return 0;
}

subsys_initcall(pktsched_init);

EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);