Linux 2.6.17.7
[linux/fpc-iii.git] / net / sched / sch_api.c
blob31570b9a6e9aa72cca60d1953c14e47938547ebe
1 /*
2 * net/sched/sch_api.c Packet scheduler API.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 * Fixes:
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
18 #include <linux/config.h>
19 #include <linux/module.h>
20 #include <linux/types.h>
21 #include <linux/kernel.h>
22 #include <linux/sched.h>
23 #include <linux/string.h>
24 #include <linux/mm.h>
25 #include <linux/socket.h>
26 #include <linux/sockios.h>
27 #include <linux/in.h>
28 #include <linux/errno.h>
29 #include <linux/interrupt.h>
30 #include <linux/netdevice.h>
31 #include <linux/skbuff.h>
32 #include <linux/rtnetlink.h>
33 #include <linux/init.h>
34 #include <linux/proc_fs.h>
35 #include <linux/seq_file.h>
36 #include <linux/kmod.h>
37 #include <linux/list.h>
38 #include <linux/bitops.h>
40 #include <net/sock.h>
41 #include <net/pkt_sched.h>
43 #include <asm/processor.h>
44 #include <asm/uaccess.h>
45 #include <asm/system.h>
47 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
48 struct Qdisc *old, struct Qdisc *new);
49 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
50 struct Qdisc *q, unsigned long cl, int event);
54 Short review.
55 -------------
57 This file consists of two interrelated parts:
59 1. queueing disciplines manager frontend.
60 2. traffic classes manager frontend.
62 Generally, queueing discipline ("qdisc") is a black box,
63 which is able to enqueue packets and to dequeue them (when
64 device is ready to send something) in order and at times
65 determined by algorithm hidden in it.
67 Qdiscs are divided into two categories:
68 - "queues", which have no internal structure visible from outside.
69 - "schedulers", which split all the packets to "traffic classes",
70 using "packet classifiers" (look at cls_api.c)
72 In turn, classes may have child qdiscs (as a rule, queues)
73 attached to them, and so on recursively.
75 The goal of the routines in this file is to translate
76 information supplied by the user in the form of handles
77 into a form more intelligible to the kernel, to perform sanity
78 checks and the part of the work that is common to all qdiscs,
79 and to provide rtnetlink notifications.
81 All real intelligent work is done inside qdisc modules.
85 Every discipline has two major routines: enqueue and dequeue.
87 ---dequeue
89 dequeue usually returns a skb to send. It is allowed to return NULL,
90 but it does not mean that queue is empty, it just means that
91 discipline does not want to send anything this time.
92 Queue is really empty if q->q.qlen == 0.
93 For complicated disciplines with multiple queues, q->q is not
94 a real packet queue; nevertheless q->q.qlen must be valid.
96 ---enqueue
98 enqueue returns 0 if the packet was enqueued successfully.
99 If a packet (this one or another one) was dropped, it returns
100 a non-zero error code.
101 NET_XMIT_DROP - this packet dropped
102 Expected action: do not backoff, but wait until queue will clear.
103 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
104 Expected action: backoff or ignore
105 NET_XMIT_POLICED - dropped by police.
106 Expected action: backoff or error to real-time apps.
108 Auxiliary routines:
110 ---requeue
112 requeues a packet that was previously dequeued. It is used for non-standard or
113 just buggy devices, which can defer output even if dev->tbusy=0.
115 ---reset
117 returns qdisc to initial state: purge all buffers, clear all
118 timers, counters (except for statistics) etc.
120 ---init
122 initializes newly created qdisc.
124 ---destroy
126 destroys resources allocated by init and during lifetime of qdisc.
128 ---change
130 changes qdisc parameters.
/* Guards the list of registered qdisc modules below. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/*
 * Head of the singly linked list (chained through ->next) of all
 * installed queueing disciplines.
 */
static struct Qdisc_ops *qdisc_base;
146 /* Register/uregister queueing discipline */
148 int register_qdisc(struct Qdisc_ops *qops)
150 struct Qdisc_ops *q, **qp;
151 int rc = -EEXIST;
153 write_lock(&qdisc_mod_lock);
154 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
155 if (!strcmp(qops->id, q->id))
156 goto out;
158 if (qops->enqueue == NULL)
159 qops->enqueue = noop_qdisc_ops.enqueue;
160 if (qops->requeue == NULL)
161 qops->requeue = noop_qdisc_ops.requeue;
162 if (qops->dequeue == NULL)
163 qops->dequeue = noop_qdisc_ops.dequeue;
165 qops->next = NULL;
166 *qp = qops;
167 rc = 0;
168 out:
169 write_unlock(&qdisc_mod_lock);
170 return rc;
173 int unregister_qdisc(struct Qdisc_ops *qops)
175 struct Qdisc_ops *q, **qp;
176 int err = -ENOENT;
178 write_lock(&qdisc_mod_lock);
179 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
180 if (q == qops)
181 break;
182 if (q) {
183 *qp = q->next;
184 q->next = NULL;
185 err = 0;
187 write_unlock(&qdisc_mod_lock);
188 return err;
191 /* We know handle. Find qdisc among all qdisc's attached to device
192 (root qdisc, all its children, children of children etc.)
195 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
197 struct Qdisc *q;
199 read_lock_bh(&qdisc_tree_lock);
200 list_for_each_entry(q, &dev->qdisc_list, list) {
201 if (q->handle == handle) {
202 read_unlock_bh(&qdisc_tree_lock);
203 return q;
206 read_unlock_bh(&qdisc_tree_lock);
207 return NULL;
210 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
212 unsigned long cl;
213 struct Qdisc *leaf;
214 struct Qdisc_class_ops *cops = p->ops->cl_ops;
216 if (cops == NULL)
217 return NULL;
218 cl = cops->get(p, classid);
220 if (cl == 0)
221 return NULL;
222 leaf = cops->leaf(p, cl);
223 cops->put(p, cl);
224 return leaf;
227 /* Find queueing discipline by name */
229 static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
231 struct Qdisc_ops *q = NULL;
233 if (kind) {
234 read_lock(&qdisc_mod_lock);
235 for (q = qdisc_base; q; q = q->next) {
236 if (rtattr_strcmp(kind, q->id) == 0) {
237 if (!try_module_get(q->owner))
238 q = NULL;
239 break;
242 read_unlock(&qdisc_mod_lock);
244 return q;
247 static struct qdisc_rate_table *qdisc_rtab_list;
249 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
251 struct qdisc_rate_table *rtab;
253 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
254 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
255 rtab->refcnt++;
256 return rtab;
260 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
261 return NULL;
263 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
264 if (rtab) {
265 rtab->rate = *r;
266 rtab->refcnt = 1;
267 memcpy(rtab->data, RTA_DATA(tab), 1024);
268 rtab->next = qdisc_rtab_list;
269 qdisc_rtab_list = rtab;
271 return rtab;
274 void qdisc_put_rtab(struct qdisc_rate_table *tab)
276 struct qdisc_rate_table *rtab, **rtabp;
278 if (!tab || --tab->refcnt)
279 return;
281 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
282 if (rtab == tab) {
283 *rtabp = rtab->next;
284 kfree(rtab);
285 return;
291 /* Allocate an unique handle from space managed by kernel */
293 static u32 qdisc_alloc_handle(struct net_device *dev)
295 int i = 0x10000;
296 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
298 do {
299 autohandle += TC_H_MAKE(0x10000U, 0);
300 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
301 autohandle = TC_H_MAKE(0x80000000U, 0);
302 } while (qdisc_lookup(dev, autohandle) && --i > 0);
304 return i>0 ? autohandle : 0;
307 /* Attach toplevel qdisc to device dev */
309 static struct Qdisc *
310 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
312 struct Qdisc *oqdisc;
314 if (dev->flags & IFF_UP)
315 dev_deactivate(dev);
317 qdisc_lock_tree(dev);
318 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
319 oqdisc = dev->qdisc_ingress;
320 /* Prune old scheduler */
321 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
322 /* delete */
323 qdisc_reset(oqdisc);
324 dev->qdisc_ingress = NULL;
325 } else { /* new */
326 dev->qdisc_ingress = qdisc;
329 } else {
331 oqdisc = dev->qdisc_sleeping;
333 /* Prune old scheduler */
334 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
335 qdisc_reset(oqdisc);
337 /* ... and graft new one */
338 if (qdisc == NULL)
339 qdisc = &noop_qdisc;
340 dev->qdisc_sleeping = qdisc;
341 dev->qdisc = &noop_qdisc;
344 qdisc_unlock_tree(dev);
346 if (dev->flags & IFF_UP)
347 dev_activate(dev);
349 return oqdisc;
353 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
354 to device "dev".
356 Old qdisc is not destroyed but returned in *old.
359 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
360 u32 classid,
361 struct Qdisc *new, struct Qdisc **old)
363 int err = 0;
364 struct Qdisc *q = *old;
367 if (parent == NULL) {
368 if (q && q->flags&TCQ_F_INGRESS) {
369 *old = dev_graft_qdisc(dev, q);
370 } else {
371 *old = dev_graft_qdisc(dev, new);
373 } else {
374 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
376 err = -EINVAL;
378 if (cops) {
379 unsigned long cl = cops->get(parent, classid);
380 if (cl) {
381 err = cops->graft(parent, cl, new, old);
382 if (new)
383 new->parent = classid;
384 cops->put(parent, cl);
388 return err;
392 Allocate and initialize new qdisc.
394 Parameters are passed via opt.
397 static struct Qdisc *
398 qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1];
402 struct Qdisc *sch;
403 struct Qdisc_ops *ops;
405 ops = qdisc_lookup_ops(kind);
406 #ifdef CONFIG_KMOD
407 if (ops == NULL && kind != NULL) {
408 char name[IFNAMSIZ];
409 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
410 /* We dropped the RTNL semaphore in order to
411 * perform the module load. So, even if we
412 * succeeded in loading the module we have to
413 * tell the caller to replay the request. We
414 * indicate this using -EAGAIN.
415 * We replay the request because the device may
416 * go away in the mean time.
418 rtnl_unlock();
419 request_module("sch_%s", name);
420 rtnl_lock();
421 ops = qdisc_lookup_ops(kind);
422 if (ops != NULL) {
423 /* We will try again qdisc_lookup_ops,
424 * so don't keep a reference.
426 module_put(ops->owner);
427 err = -EAGAIN;
428 goto err_out;
432 #endif
434 err = -EINVAL;
435 if (ops == NULL)
436 goto err_out;
438 sch = qdisc_alloc(dev, ops);
439 if (IS_ERR(sch)) {
440 err = PTR_ERR(sch);
441 goto err_out2;
444 if (handle == TC_H_INGRESS) {
445 sch->flags |= TCQ_F_INGRESS;
446 handle = TC_H_MAKE(TC_H_INGRESS, 0);
447 } else if (handle == 0) {
448 handle = qdisc_alloc_handle(dev);
449 err = -ENOMEM;
450 if (handle == 0)
451 goto err_out3;
454 sch->handle = handle;
456 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
457 #ifdef CONFIG_NET_ESTIMATOR
458 if (tca[TCA_RATE-1]) {
459 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
460 sch->stats_lock,
461 tca[TCA_RATE-1]);
462 if (err) {
464 * Any broken qdiscs that would require
465 * a ops->reset() here? The qdisc was never
466 * in action so it shouldn't be necessary.
468 if (ops->destroy)
469 ops->destroy(sch);
470 goto err_out3;
473 #endif
474 qdisc_lock_tree(dev);
475 list_add_tail(&sch->list, &dev->qdisc_list);
476 qdisc_unlock_tree(dev);
478 return sch;
480 err_out3:
481 dev_put(dev);
482 kfree((char *) sch - sch->padded);
483 err_out2:
484 module_put(ops->owner);
485 err_out:
486 *errp = err;
487 return NULL;
490 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
492 if (tca[TCA_OPTIONS-1]) {
493 int err;
495 if (sch->ops->change == NULL)
496 return -EINVAL;
497 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
498 if (err)
499 return err;
501 #ifdef CONFIG_NET_ESTIMATOR
502 if (tca[TCA_RATE-1])
503 gen_replace_estimator(&sch->bstats, &sch->rate_est,
504 sch->stats_lock, tca[TCA_RATE-1]);
505 #endif
506 return 0;
509 struct check_loop_arg
511 struct qdisc_walker w;
512 struct Qdisc *p;
513 int depth;
516 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
518 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
520 struct check_loop_arg arg;
522 if (q->ops->cl_ops == NULL)
523 return 0;
525 arg.w.stop = arg.w.skip = arg.w.count = 0;
526 arg.w.fn = check_loop_fn;
527 arg.depth = depth;
528 arg.p = p;
529 q->ops->cl_ops->walk(q, &arg.w);
530 return arg.w.stop ? -ELOOP : 0;
533 static int
534 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
536 struct Qdisc *leaf;
537 struct Qdisc_class_ops *cops = q->ops->cl_ops;
538 struct check_loop_arg *arg = (struct check_loop_arg *)w;
540 leaf = cops->leaf(q, cl);
541 if (leaf) {
542 if (leaf == arg->p || arg->depth > 7)
543 return -ELOOP;
544 return check_loop(leaf, arg->p, arg->depth + 1);
546 return 0;
550 * Delete/get qdisc.
553 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
555 struct tcmsg *tcm = NLMSG_DATA(n);
556 struct rtattr **tca = arg;
557 struct net_device *dev;
558 u32 clid = tcm->tcm_parent;
559 struct Qdisc *q = NULL;
560 struct Qdisc *p = NULL;
561 int err;
563 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
564 return -ENODEV;
566 if (clid) {
567 if (clid != TC_H_ROOT) {
568 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
569 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
570 return -ENOENT;
571 q = qdisc_leaf(p, clid);
572 } else { /* ingress */
573 q = dev->qdisc_ingress;
575 } else {
576 q = dev->qdisc_sleeping;
578 if (!q)
579 return -ENOENT;
581 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
582 return -EINVAL;
583 } else {
584 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
585 return -ENOENT;
588 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
589 return -EINVAL;
591 if (n->nlmsg_type == RTM_DELQDISC) {
592 if (!clid)
593 return -EINVAL;
594 if (q->handle == 0)
595 return -ENOENT;
596 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
597 return err;
598 if (q) {
599 qdisc_notify(skb, n, clid, q, NULL);
600 spin_lock_bh(&dev->queue_lock);
601 qdisc_destroy(q);
602 spin_unlock_bh(&dev->queue_lock);
604 } else {
605 qdisc_notify(skb, n, clid, NULL, q);
607 return 0;
611 Create/change qdisc.
614 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
616 struct tcmsg *tcm;
617 struct rtattr **tca;
618 struct net_device *dev;
619 u32 clid;
620 struct Qdisc *q, *p;
621 int err;
623 replay:
624 /* Reinit, just in case something touches this. */
625 tcm = NLMSG_DATA(n);
626 tca = arg;
627 clid = tcm->tcm_parent;
628 q = p = NULL;
630 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
631 return -ENODEV;
633 if (clid) {
634 if (clid != TC_H_ROOT) {
635 if (clid != TC_H_INGRESS) {
636 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
637 return -ENOENT;
638 q = qdisc_leaf(p, clid);
639 } else { /*ingress */
640 q = dev->qdisc_ingress;
642 } else {
643 q = dev->qdisc_sleeping;
646 /* It may be default qdisc, ignore it */
647 if (q && q->handle == 0)
648 q = NULL;
650 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
651 if (tcm->tcm_handle) {
652 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
653 return -EEXIST;
654 if (TC_H_MIN(tcm->tcm_handle))
655 return -EINVAL;
656 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
657 goto create_n_graft;
658 if (n->nlmsg_flags&NLM_F_EXCL)
659 return -EEXIST;
660 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
661 return -EINVAL;
662 if (q == p ||
663 (p && check_loop(q, p, 0)))
664 return -ELOOP;
665 atomic_inc(&q->refcnt);
666 goto graft;
667 } else {
668 if (q == NULL)
669 goto create_n_graft;
671 /* This magic test requires explanation.
673 * We know, that some child q is already
674 * attached to this parent and have choice:
675 * either to change it or to create/graft new one.
677 * 1. We are allowed to create/graft only
678 * if CREATE and REPLACE flags are set.
680 * 2. If EXCL is set, requestor wanted to say,
681 * that qdisc tcm_handle is not expected
682 * to exist, so that we choose create/graft too.
684 * 3. The last case is when no flags are set.
685 * Alas, it is sort of hole in API, we
686 * cannot decide what to do unambiguously.
687 * For now we select create/graft, if
688 * user gave KIND, which does not match existing.
690 if ((n->nlmsg_flags&NLM_F_CREATE) &&
691 (n->nlmsg_flags&NLM_F_REPLACE) &&
692 ((n->nlmsg_flags&NLM_F_EXCL) ||
693 (tca[TCA_KIND-1] &&
694 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
695 goto create_n_graft;
698 } else {
699 if (!tcm->tcm_handle)
700 return -EINVAL;
701 q = qdisc_lookup(dev, tcm->tcm_handle);
704 /* Change qdisc parameters */
705 if (q == NULL)
706 return -ENOENT;
707 if (n->nlmsg_flags&NLM_F_EXCL)
708 return -EEXIST;
709 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
710 return -EINVAL;
711 err = qdisc_change(q, tca);
712 if (err == 0)
713 qdisc_notify(skb, n, clid, NULL, q);
714 return err;
716 create_n_graft:
717 if (!(n->nlmsg_flags&NLM_F_CREATE))
718 return -ENOENT;
719 if (clid == TC_H_INGRESS)
720 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
721 else
722 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
723 if (q == NULL) {
724 if (err == -EAGAIN)
725 goto replay;
726 return err;
729 graft:
730 if (1) {
731 struct Qdisc *old_q = NULL;
732 err = qdisc_graft(dev, p, clid, q, &old_q);
733 if (err) {
734 if (q) {
735 spin_lock_bh(&dev->queue_lock);
736 qdisc_destroy(q);
737 spin_unlock_bh(&dev->queue_lock);
739 return err;
741 qdisc_notify(skb, n, clid, old_q, q);
742 if (old_q) {
743 spin_lock_bh(&dev->queue_lock);
744 qdisc_destroy(old_q);
745 spin_unlock_bh(&dev->queue_lock);
748 return 0;
751 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
752 u32 pid, u32 seq, u16 flags, int event)
754 struct tcmsg *tcm;
755 struct nlmsghdr *nlh;
756 unsigned char *b = skb->tail;
757 struct gnet_dump d;
759 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
760 tcm = NLMSG_DATA(nlh);
761 tcm->tcm_family = AF_UNSPEC;
762 tcm->tcm__pad1 = 0;
763 tcm->tcm__pad2 = 0;
764 tcm->tcm_ifindex = q->dev->ifindex;
765 tcm->tcm_parent = clid;
766 tcm->tcm_handle = q->handle;
767 tcm->tcm_info = atomic_read(&q->refcnt);
768 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
769 if (q->ops->dump && q->ops->dump(q, skb) < 0)
770 goto rtattr_failure;
771 q->qstats.qlen = q->q.qlen;
773 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
774 TCA_XSTATS, q->stats_lock, &d) < 0)
775 goto rtattr_failure;
777 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
778 goto rtattr_failure;
780 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
781 #ifdef CONFIG_NET_ESTIMATOR
782 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
783 #endif
784 gnet_stats_copy_queue(&d, &q->qstats) < 0)
785 goto rtattr_failure;
787 if (gnet_stats_finish_copy(&d) < 0)
788 goto rtattr_failure;
790 nlh->nlmsg_len = skb->tail - b;
791 return skb->len;
793 nlmsg_failure:
794 rtattr_failure:
795 skb_trim(skb, b - skb->data);
796 return -1;
799 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
800 u32 clid, struct Qdisc *old, struct Qdisc *new)
802 struct sk_buff *skb;
803 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
805 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
806 if (!skb)
807 return -ENOBUFS;
809 if (old && old->handle) {
810 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
811 goto err_out;
813 if (new) {
814 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
815 goto err_out;
818 if (skb->len)
819 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
821 err_out:
822 kfree_skb(skb);
823 return -EINVAL;
826 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
828 int idx, q_idx;
829 int s_idx, s_q_idx;
830 struct net_device *dev;
831 struct Qdisc *q;
833 s_idx = cb->args[0];
834 s_q_idx = q_idx = cb->args[1];
835 read_lock(&dev_base_lock);
836 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
837 if (idx < s_idx)
838 continue;
839 if (idx > s_idx)
840 s_q_idx = 0;
841 read_lock_bh(&qdisc_tree_lock);
842 q_idx = 0;
843 list_for_each_entry(q, &dev->qdisc_list, list) {
844 if (q_idx < s_q_idx) {
845 q_idx++;
846 continue;
848 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
849 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
850 read_unlock_bh(&qdisc_tree_lock);
851 goto done;
853 q_idx++;
855 read_unlock_bh(&qdisc_tree_lock);
858 done:
859 read_unlock(&dev_base_lock);
861 cb->args[0] = idx;
862 cb->args[1] = q_idx;
864 return skb->len;
869 /************************************************
870 * Traffic classes manipulation. *
871 ************************************************/
875 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
877 struct tcmsg *tcm = NLMSG_DATA(n);
878 struct rtattr **tca = arg;
879 struct net_device *dev;
880 struct Qdisc *q = NULL;
881 struct Qdisc_class_ops *cops;
882 unsigned long cl = 0;
883 unsigned long new_cl;
884 u32 pid = tcm->tcm_parent;
885 u32 clid = tcm->tcm_handle;
886 u32 qid = TC_H_MAJ(clid);
887 int err;
889 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
890 return -ENODEV;
893 parent == TC_H_UNSPEC - unspecified parent.
894 parent == TC_H_ROOT - class is root, which has no parent.
895 parent == X:0 - parent is root class.
896 parent == X:Y - parent is a node in hierarchy.
897 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
899 handle == 0:0 - generate handle from kernel pool.
900 handle == 0:Y - class is X:Y, where X:0 is qdisc.
901 handle == X:Y - clear.
902 handle == X:0 - root class.
905 /* Step 1. Determine qdisc handle X:0 */
907 if (pid != TC_H_ROOT) {
908 u32 qid1 = TC_H_MAJ(pid);
910 if (qid && qid1) {
911 /* If both majors are known, they must be identical. */
912 if (qid != qid1)
913 return -EINVAL;
914 } else if (qid1) {
915 qid = qid1;
916 } else if (qid == 0)
917 qid = dev->qdisc_sleeping->handle;
919 /* Now qid is genuine qdisc handle consistent
920 both with parent and child.
922 TC_H_MAJ(pid) still may be unspecified, complete it now.
924 if (pid)
925 pid = TC_H_MAKE(qid, pid);
926 } else {
927 if (qid == 0)
928 qid = dev->qdisc_sleeping->handle;
931 /* OK. Locate qdisc */
932 if ((q = qdisc_lookup(dev, qid)) == NULL)
933 return -ENOENT;
935 /* An check that it supports classes */
936 cops = q->ops->cl_ops;
937 if (cops == NULL)
938 return -EINVAL;
940 /* Now try to get class */
941 if (clid == 0) {
942 if (pid == TC_H_ROOT)
943 clid = qid;
944 } else
945 clid = TC_H_MAKE(qid, clid);
947 if (clid)
948 cl = cops->get(q, clid);
950 if (cl == 0) {
951 err = -ENOENT;
952 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
953 goto out;
954 } else {
955 switch (n->nlmsg_type) {
956 case RTM_NEWTCLASS:
957 err = -EEXIST;
958 if (n->nlmsg_flags&NLM_F_EXCL)
959 goto out;
960 break;
961 case RTM_DELTCLASS:
962 err = cops->delete(q, cl);
963 if (err == 0)
964 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
965 goto out;
966 case RTM_GETTCLASS:
967 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
968 goto out;
969 default:
970 err = -EINVAL;
971 goto out;
975 new_cl = cl;
976 err = cops->change(q, clid, pid, tca, &new_cl);
977 if (err == 0)
978 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
980 out:
981 if (cl)
982 cops->put(q, cl);
984 return err;
988 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
989 unsigned long cl,
990 u32 pid, u32 seq, u16 flags, int event)
992 struct tcmsg *tcm;
993 struct nlmsghdr *nlh;
994 unsigned char *b = skb->tail;
995 struct gnet_dump d;
996 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
998 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
999 tcm = NLMSG_DATA(nlh);
1000 tcm->tcm_family = AF_UNSPEC;
1001 tcm->tcm_ifindex = q->dev->ifindex;
1002 tcm->tcm_parent = q->handle;
1003 tcm->tcm_handle = q->handle;
1004 tcm->tcm_info = 0;
1005 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1006 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1007 goto rtattr_failure;
1009 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1010 TCA_XSTATS, q->stats_lock, &d) < 0)
1011 goto rtattr_failure;
1013 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1014 goto rtattr_failure;
1016 if (gnet_stats_finish_copy(&d) < 0)
1017 goto rtattr_failure;
1019 nlh->nlmsg_len = skb->tail - b;
1020 return skb->len;
1022 nlmsg_failure:
1023 rtattr_failure:
1024 skb_trim(skb, b - skb->data);
1025 return -1;
1028 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1029 struct Qdisc *q, unsigned long cl, int event)
1031 struct sk_buff *skb;
1032 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1034 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1035 if (!skb)
1036 return -ENOBUFS;
1038 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1039 kfree_skb(skb);
1040 return -EINVAL;
1043 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1046 struct qdisc_dump_args
1048 struct qdisc_walker w;
1049 struct sk_buff *skb;
1050 struct netlink_callback *cb;
1053 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1055 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1057 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1058 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1061 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1063 int t;
1064 int s_t;
1065 struct net_device *dev;
1066 struct Qdisc *q;
1067 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1068 struct qdisc_dump_args arg;
1070 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1071 return 0;
1072 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1073 return 0;
1075 s_t = cb->args[0];
1076 t = 0;
1078 read_lock_bh(&qdisc_tree_lock);
1079 list_for_each_entry(q, &dev->qdisc_list, list) {
1080 if (t < s_t || !q->ops->cl_ops ||
1081 (tcm->tcm_parent &&
1082 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1083 t++;
1084 continue;
1086 if (t > s_t)
1087 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1088 arg.w.fn = qdisc_class_dump;
1089 arg.skb = skb;
1090 arg.cb = cb;
1091 arg.w.stop = 0;
1092 arg.w.skip = cb->args[1];
1093 arg.w.count = 0;
1094 q->ops->cl_ops->walk(q, &arg.w);
1095 cb->args[1] = arg.w.count;
1096 if (arg.w.stop)
1097 break;
1098 t++;
1100 read_unlock_bh(&qdisc_tree_lock);
1102 cb->args[0] = t;
1104 dev_put(dev);
1105 return skb->len;
1108 /* Main classifier routine: scans classifier chain attached
1109 to this qdisc, (optionally) tests for protocol and asks
1110 specific classifiers.
1112 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1113 struct tcf_result *res)
1115 int err = 0;
1116 u32 protocol = skb->protocol;
1117 #ifdef CONFIG_NET_CLS_ACT
1118 struct tcf_proto *otp = tp;
1119 reclassify:
1120 #endif
1121 protocol = skb->protocol;
1123 for ( ; tp; tp = tp->next) {
1124 if ((tp->protocol == protocol ||
1125 tp->protocol == __constant_htons(ETH_P_ALL)) &&
1126 (err = tp->classify(skb, tp, res)) >= 0) {
1127 #ifdef CONFIG_NET_CLS_ACT
1128 if ( TC_ACT_RECLASSIFY == err) {
1129 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1130 tp = otp;
1132 if (MAX_REC_LOOP < verd++) {
1133 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1134 tp->prio&0xffff, ntohs(tp->protocol));
1135 return TC_ACT_SHOT;
1137 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1138 goto reclassify;
1139 } else {
1140 if (skb->tc_verd)
1141 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1142 return err;
1144 #else
1146 return err;
1147 #endif
1151 return -1;
/*
 * Conversion factors printed by psched_show() and set up by
 * pktsched_init()/psched_calibrate_clock(); both default to 1.
 */
static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;
1157 #ifdef CONFIG_PROC_FS
1158 static int psched_show(struct seq_file *seq, void *v)
1160 seq_printf(seq, "%08x %08x %08x %08x\n",
1161 psched_tick_per_us, psched_us_per_tick,
1162 1000000, HZ);
1164 return 0;
1167 static int psched_open(struct inode *inode, struct file *file)
1169 return single_open(file, psched_show, PDE(inode)->data);
1172 static struct file_operations psched_fops = {
1173 .owner = THIS_MODULE,
1174 .open = psched_open,
1175 .read = seq_read,
1176 .llseek = seq_lseek,
1177 .release = single_release,
1179 #endif
1181 #ifdef CONFIG_NET_SCH_CLK_CPU
1182 psched_tdiff_t psched_clock_per_hz;
1183 int psched_clock_scale;
1184 EXPORT_SYMBOL(psched_clock_per_hz);
1185 EXPORT_SYMBOL(psched_clock_scale);
1187 psched_time_t psched_time_base;
1188 cycles_t psched_time_mark;
1189 EXPORT_SYMBOL(psched_time_mark);
1190 EXPORT_SYMBOL(psched_time_base);
1193 * Periodically adjust psched_time_base to avoid overflow
1194 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
1196 static void psched_tick(unsigned long);
1197 static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);
1199 static void psched_tick(unsigned long dummy)
1201 if (sizeof(cycles_t) == sizeof(u32)) {
1202 psched_time_t dummy_stamp;
1203 PSCHED_GET_TIME(dummy_stamp);
1204 psched_timer.expires = jiffies + 1*HZ;
1205 add_timer(&psched_timer);
1209 int __init psched_calibrate_clock(void)
1211 psched_time_t stamp, stamp1;
1212 struct timeval tv, tv1;
1213 psched_tdiff_t delay;
1214 long rdelay;
1215 unsigned long stop;
1217 psched_tick(0);
1218 stop = jiffies + HZ/10;
1219 PSCHED_GET_TIME(stamp);
1220 do_gettimeofday(&tv);
1221 while (time_before(jiffies, stop)) {
1222 barrier();
1223 cpu_relax();
1225 PSCHED_GET_TIME(stamp1);
1226 do_gettimeofday(&tv1);
1228 delay = PSCHED_TDIFF(stamp1, stamp);
1229 rdelay = tv1.tv_usec - tv.tv_usec;
1230 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1231 if (rdelay > delay)
1232 return -1;
1233 delay /= rdelay;
1234 psched_tick_per_us = delay;
1235 while ((delay>>=1) != 0)
1236 psched_clock_scale++;
1237 psched_us_per_tick = 1<<psched_clock_scale;
1238 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1239 return 0;
1241 #endif
1243 static int __init pktsched_init(void)
1245 struct rtnetlink_link *link_p;
1247 #ifdef CONFIG_NET_SCH_CLK_CPU
1248 if (psched_calibrate_clock() < 0)
1249 return -1;
1250 #elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1251 psched_tick_per_us = HZ<<PSCHED_JSCALE;
1252 psched_us_per_tick = 1000000;
1253 #endif
1255 link_p = rtnetlink_links[PF_UNSPEC];
1257 /* Setup rtnetlink links. It is made here to avoid
1258 exporting large number of public symbols.
1261 if (link_p) {
1262 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1263 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1264 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1265 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1266 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1267 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1268 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1269 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1272 register_qdisc(&pfifo_qdisc_ops);
1273 register_qdisc(&bfifo_qdisc_ops);
1274 proc_net_fops_create("psched", 0, &psched_fops);
1276 return 0;
subsys_initcall(pktsched_init);

/* Symbols made available to other kernel code. */
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(tc_classify);