Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/sparc-2.6
[linux/fpc-iii.git] / net / sched / sch_api.c
blob0b6489291140f0324c73e8a8ddb0aad716c03593
1 /*
2 * net/sched/sch_api.c Packet scheduler API.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 * Fixes:
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/sched.h>
22 #include <linux/string.h>
23 #include <linux/mm.h>
24 #include <linux/socket.h>
25 #include <linux/sockios.h>
26 #include <linux/in.h>
27 #include <linux/errno.h>
28 #include <linux/interrupt.h>
29 #include <linux/netdevice.h>
30 #include <linux/skbuff.h>
31 #include <linux/rtnetlink.h>
32 #include <linux/init.h>
33 #include <linux/proc_fs.h>
34 #include <linux/seq_file.h>
35 #include <linux/kmod.h>
36 #include <linux/list.h>
37 #include <linux/bitops.h>
39 #include <net/sock.h>
40 #include <net/pkt_sched.h>
42 #include <asm/processor.h>
43 #include <asm/uaccess.h>
44 #include <asm/system.h>
46 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
47 struct Qdisc *old, struct Qdisc *new);
48 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
49 struct Qdisc *q, unsigned long cl, int event);
53 Short review.
54 -------------
56 This file consists of two interrelated parts:
58 1. queueing disciplines manager frontend.
59 2. traffic classes manager frontend.
61 Generally, queueing discipline ("qdisc") is a black box,
62 which is able to enqueue packets and to dequeue them (when
63 device is ready to send something) in order and at times
64 determined by algorithm hidden in it.
66 qdisc's are divided to two categories:
67 - "queues", which have no internal structure visible from outside.
68 - "schedulers", which split all the packets to "traffic classes",
69 using "packet classifiers" (look at cls_api.c)
71 In turn, classes may have child qdiscs (as rule, queues)
72 attached to them etc. etc. etc.
74 The goal of the routines in this file is to translate
75 information supplied by user in the form of handles
76 to more intelligible for kernel form, to make some sanity
77 checks and part of work, which is common to all qdiscs
78 and to provide rtnetlink notifications.
80 All real intelligent work is done inside qdisc modules.
84 Every discipline has two major routines: enqueue and dequeue.
86 ---dequeue
88 dequeue usually returns a skb to send. It is allowed to return NULL,
89 but it does not mean that queue is empty, it just means that
90 discipline does not want to send anything this time.
91 Queue is really empty if q->q.qlen == 0.
92 For complicated disciplines with multiple queues q->q is not
93 real packet queue, but however q->q.qlen must be valid.
95 ---enqueue
97 enqueue returns 0, if packet was enqueued successfully.
98 If packet (this one or another one) was dropped, it returns
99 not zero error code.
100 NET_XMIT_DROP - this packet dropped
101 Expected action: do not backoff, but wait until queue will clear.
102 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
103 Expected action: backoff or ignore
104 NET_XMIT_POLICED - dropped by police.
105 Expected action: backoff or error to real-time apps.
107 Auxiliary routines:
109 ---requeue
111 requeues once dequeued packet. It is used for non-standard or
112 just buggy devices, which can defer output even if dev->tbusy=0.
114 ---reset
116 returns qdisc to initial state: purge all buffers, clear all
117 timers, counters (except for statistics) etc.
119 ---init
121 initializes newly created qdisc.
123 ---destroy
125 destroys resources allocated by init and during lifetime of qdisc.
127 ---change
129 changes qdisc parameters.
132 /* Protects list of registered TC modules. It is pure SMP lock. */
133 static DEFINE_RWLOCK(qdisc_mod_lock);
136 /************************************************
137 * Queueing disciplines manipulation. *
138 ************************************************/
141 /* The list of all installed queueing disciplines. */
143 static struct Qdisc_ops *qdisc_base;
145 /* Register/uregister queueing discipline */
147 int register_qdisc(struct Qdisc_ops *qops)
149 struct Qdisc_ops *q, **qp;
150 int rc = -EEXIST;
152 write_lock(&qdisc_mod_lock);
153 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
154 if (!strcmp(qops->id, q->id))
155 goto out;
157 if (qops->enqueue == NULL)
158 qops->enqueue = noop_qdisc_ops.enqueue;
159 if (qops->requeue == NULL)
160 qops->requeue = noop_qdisc_ops.requeue;
161 if (qops->dequeue == NULL)
162 qops->dequeue = noop_qdisc_ops.dequeue;
164 qops->next = NULL;
165 *qp = qops;
166 rc = 0;
167 out:
168 write_unlock(&qdisc_mod_lock);
169 return rc;
172 int unregister_qdisc(struct Qdisc_ops *qops)
174 struct Qdisc_ops *q, **qp;
175 int err = -ENOENT;
177 write_lock(&qdisc_mod_lock);
178 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
179 if (q == qops)
180 break;
181 if (q) {
182 *qp = q->next;
183 q->next = NULL;
184 err = 0;
186 write_unlock(&qdisc_mod_lock);
187 return err;
190 /* We know handle. Find qdisc among all qdisc's attached to device
191 (root qdisc, all its children, children of children etc.)
194 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
196 struct Qdisc *q;
198 read_lock(&qdisc_tree_lock);
199 list_for_each_entry(q, &dev->qdisc_list, list) {
200 if (q->handle == handle) {
201 read_unlock(&qdisc_tree_lock);
202 return q;
205 read_unlock(&qdisc_tree_lock);
206 return NULL;
209 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
211 unsigned long cl;
212 struct Qdisc *leaf;
213 struct Qdisc_class_ops *cops = p->ops->cl_ops;
215 if (cops == NULL)
216 return NULL;
217 cl = cops->get(p, classid);
219 if (cl == 0)
220 return NULL;
221 leaf = cops->leaf(p, cl);
222 cops->put(p, cl);
223 return leaf;
226 /* Find queueing discipline by name */
228 static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
230 struct Qdisc_ops *q = NULL;
232 if (kind) {
233 read_lock(&qdisc_mod_lock);
234 for (q = qdisc_base; q; q = q->next) {
235 if (rtattr_strcmp(kind, q->id) == 0) {
236 if (!try_module_get(q->owner))
237 q = NULL;
238 break;
241 read_unlock(&qdisc_mod_lock);
243 return q;
246 static struct qdisc_rate_table *qdisc_rtab_list;
248 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
250 struct qdisc_rate_table *rtab;
252 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
253 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
254 rtab->refcnt++;
255 return rtab;
259 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
260 return NULL;
262 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
263 if (rtab) {
264 rtab->rate = *r;
265 rtab->refcnt = 1;
266 memcpy(rtab->data, RTA_DATA(tab), 1024);
267 rtab->next = qdisc_rtab_list;
268 qdisc_rtab_list = rtab;
270 return rtab;
273 void qdisc_put_rtab(struct qdisc_rate_table *tab)
275 struct qdisc_rate_table *rtab, **rtabp;
277 if (!tab || --tab->refcnt)
278 return;
280 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
281 if (rtab == tab) {
282 *rtabp = rtab->next;
283 kfree(rtab);
284 return;
290 /* Allocate an unique handle from space managed by kernel */
292 static u32 qdisc_alloc_handle(struct net_device *dev)
294 int i = 0x10000;
295 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
297 do {
298 autohandle += TC_H_MAKE(0x10000U, 0);
299 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
300 autohandle = TC_H_MAKE(0x80000000U, 0);
301 } while (qdisc_lookup(dev, autohandle) && --i > 0);
303 return i>0 ? autohandle : 0;
306 /* Attach toplevel qdisc to device dev */
308 static struct Qdisc *
309 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
311 struct Qdisc *oqdisc;
313 if (dev->flags & IFF_UP)
314 dev_deactivate(dev);
316 qdisc_lock_tree(dev);
317 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
318 oqdisc = dev->qdisc_ingress;
319 /* Prune old scheduler */
320 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
321 /* delete */
322 qdisc_reset(oqdisc);
323 dev->qdisc_ingress = NULL;
324 } else { /* new */
325 dev->qdisc_ingress = qdisc;
328 } else {
330 oqdisc = dev->qdisc_sleeping;
332 /* Prune old scheduler */
333 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
334 qdisc_reset(oqdisc);
336 /* ... and graft new one */
337 if (qdisc == NULL)
338 qdisc = &noop_qdisc;
339 dev->qdisc_sleeping = qdisc;
340 dev->qdisc = &noop_qdisc;
343 qdisc_unlock_tree(dev);
345 if (dev->flags & IFF_UP)
346 dev_activate(dev);
348 return oqdisc;
352 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
353 to device "dev".
355 Old qdisc is not destroyed but returned in *old.
358 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
359 u32 classid,
360 struct Qdisc *new, struct Qdisc **old)
362 int err = 0;
363 struct Qdisc *q = *old;
366 if (parent == NULL) {
367 if (q && q->flags&TCQ_F_INGRESS) {
368 *old = dev_graft_qdisc(dev, q);
369 } else {
370 *old = dev_graft_qdisc(dev, new);
372 } else {
373 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
375 err = -EINVAL;
377 if (cops) {
378 unsigned long cl = cops->get(parent, classid);
379 if (cl) {
380 err = cops->graft(parent, cl, new, old);
381 if (new)
382 new->parent = classid;
383 cops->put(parent, cl);
387 return err;
391 Allocate and initialize new qdisc.
393 Parameters are passed via opt.
396 static struct Qdisc *
397 qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399 int err;
400 struct rtattr *kind = tca[TCA_KIND-1];
401 struct Qdisc *sch;
402 struct Qdisc_ops *ops;
404 ops = qdisc_lookup_ops(kind);
405 #ifdef CONFIG_KMOD
406 if (ops == NULL && kind != NULL) {
407 char name[IFNAMSIZ];
408 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
409 /* We dropped the RTNL semaphore in order to
410 * perform the module load. So, even if we
411 * succeeded in loading the module we have to
412 * tell the caller to replay the request. We
413 * indicate this using -EAGAIN.
414 * We replay the request because the device may
415 * go away in the mean time.
417 rtnl_unlock();
418 request_module("sch_%s", name);
419 rtnl_lock();
420 ops = qdisc_lookup_ops(kind);
421 if (ops != NULL) {
422 /* We will try again qdisc_lookup_ops,
423 * so don't keep a reference.
425 module_put(ops->owner);
426 err = -EAGAIN;
427 goto err_out;
431 #endif
433 err = -ENOENT;
434 if (ops == NULL)
435 goto err_out;
437 sch = qdisc_alloc(dev, ops);
438 if (IS_ERR(sch)) {
439 err = PTR_ERR(sch);
440 goto err_out2;
443 if (handle == TC_H_INGRESS) {
444 sch->flags |= TCQ_F_INGRESS;
445 handle = TC_H_MAKE(TC_H_INGRESS, 0);
446 } else if (handle == 0) {
447 handle = qdisc_alloc_handle(dev);
448 err = -ENOMEM;
449 if (handle == 0)
450 goto err_out3;
453 sch->handle = handle;
455 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
456 #ifdef CONFIG_NET_ESTIMATOR
457 if (tca[TCA_RATE-1]) {
458 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
459 sch->stats_lock,
460 tca[TCA_RATE-1]);
461 if (err) {
463 * Any broken qdiscs that would require
464 * a ops->reset() here? The qdisc was never
465 * in action so it shouldn't be necessary.
467 if (ops->destroy)
468 ops->destroy(sch);
469 goto err_out3;
472 #endif
473 qdisc_lock_tree(dev);
474 list_add_tail(&sch->list, &dev->qdisc_list);
475 qdisc_unlock_tree(dev);
477 return sch;
479 err_out3:
480 dev_put(dev);
481 kfree((char *) sch - sch->padded);
482 err_out2:
483 module_put(ops->owner);
484 err_out:
485 *errp = err;
486 return NULL;
489 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
491 if (tca[TCA_OPTIONS-1]) {
492 int err;
494 if (sch->ops->change == NULL)
495 return -EINVAL;
496 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
497 if (err)
498 return err;
500 #ifdef CONFIG_NET_ESTIMATOR
501 if (tca[TCA_RATE-1])
502 gen_replace_estimator(&sch->bstats, &sch->rate_est,
503 sch->stats_lock, tca[TCA_RATE-1]);
504 #endif
505 return 0;
508 struct check_loop_arg
510 struct qdisc_walker w;
511 struct Qdisc *p;
512 int depth;
515 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
517 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
519 struct check_loop_arg arg;
521 if (q->ops->cl_ops == NULL)
522 return 0;
524 arg.w.stop = arg.w.skip = arg.w.count = 0;
525 arg.w.fn = check_loop_fn;
526 arg.depth = depth;
527 arg.p = p;
528 q->ops->cl_ops->walk(q, &arg.w);
529 return arg.w.stop ? -ELOOP : 0;
532 static int
533 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
535 struct Qdisc *leaf;
536 struct Qdisc_class_ops *cops = q->ops->cl_ops;
537 struct check_loop_arg *arg = (struct check_loop_arg *)w;
539 leaf = cops->leaf(q, cl);
540 if (leaf) {
541 if (leaf == arg->p || arg->depth > 7)
542 return -ELOOP;
543 return check_loop(leaf, arg->p, arg->depth + 1);
545 return 0;
549 * Delete/get qdisc.
552 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
554 struct tcmsg *tcm = NLMSG_DATA(n);
555 struct rtattr **tca = arg;
556 struct net_device *dev;
557 u32 clid = tcm->tcm_parent;
558 struct Qdisc *q = NULL;
559 struct Qdisc *p = NULL;
560 int err;
562 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
563 return -ENODEV;
565 if (clid) {
566 if (clid != TC_H_ROOT) {
567 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
568 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
569 return -ENOENT;
570 q = qdisc_leaf(p, clid);
571 } else { /* ingress */
572 q = dev->qdisc_ingress;
574 } else {
575 q = dev->qdisc_sleeping;
577 if (!q)
578 return -ENOENT;
580 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
581 return -EINVAL;
582 } else {
583 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
584 return -ENOENT;
587 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
588 return -EINVAL;
590 if (n->nlmsg_type == RTM_DELQDISC) {
591 if (!clid)
592 return -EINVAL;
593 if (q->handle == 0)
594 return -ENOENT;
595 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
596 return err;
597 if (q) {
598 qdisc_notify(skb, n, clid, q, NULL);
599 spin_lock_bh(&dev->queue_lock);
600 qdisc_destroy(q);
601 spin_unlock_bh(&dev->queue_lock);
603 } else {
604 qdisc_notify(skb, n, clid, NULL, q);
606 return 0;
610 Create/change qdisc.
613 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
615 struct tcmsg *tcm;
616 struct rtattr **tca;
617 struct net_device *dev;
618 u32 clid;
619 struct Qdisc *q, *p;
620 int err;
622 replay:
623 /* Reinit, just in case something touches this. */
624 tcm = NLMSG_DATA(n);
625 tca = arg;
626 clid = tcm->tcm_parent;
627 q = p = NULL;
629 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
630 return -ENODEV;
632 if (clid) {
633 if (clid != TC_H_ROOT) {
634 if (clid != TC_H_INGRESS) {
635 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
636 return -ENOENT;
637 q = qdisc_leaf(p, clid);
638 } else { /*ingress */
639 q = dev->qdisc_ingress;
641 } else {
642 q = dev->qdisc_sleeping;
645 /* It may be default qdisc, ignore it */
646 if (q && q->handle == 0)
647 q = NULL;
649 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
650 if (tcm->tcm_handle) {
651 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
652 return -EEXIST;
653 if (TC_H_MIN(tcm->tcm_handle))
654 return -EINVAL;
655 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
656 goto create_n_graft;
657 if (n->nlmsg_flags&NLM_F_EXCL)
658 return -EEXIST;
659 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
660 return -EINVAL;
661 if (q == p ||
662 (p && check_loop(q, p, 0)))
663 return -ELOOP;
664 atomic_inc(&q->refcnt);
665 goto graft;
666 } else {
667 if (q == NULL)
668 goto create_n_graft;
670 /* This magic test requires explanation.
672 * We know, that some child q is already
673 * attached to this parent and have choice:
674 * either to change it or to create/graft new one.
676 * 1. We are allowed to create/graft only
677 * if CREATE and REPLACE flags are set.
679 * 2. If EXCL is set, requestor wanted to say,
680 * that qdisc tcm_handle is not expected
681 * to exist, so that we choose create/graft too.
683 * 3. The last case is when no flags are set.
684 * Alas, it is sort of hole in API, we
685 * cannot decide what to do unambiguously.
686 * For now we select create/graft, if
687 * user gave KIND, which does not match existing.
689 if ((n->nlmsg_flags&NLM_F_CREATE) &&
690 (n->nlmsg_flags&NLM_F_REPLACE) &&
691 ((n->nlmsg_flags&NLM_F_EXCL) ||
692 (tca[TCA_KIND-1] &&
693 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
694 goto create_n_graft;
697 } else {
698 if (!tcm->tcm_handle)
699 return -EINVAL;
700 q = qdisc_lookup(dev, tcm->tcm_handle);
703 /* Change qdisc parameters */
704 if (q == NULL)
705 return -ENOENT;
706 if (n->nlmsg_flags&NLM_F_EXCL)
707 return -EEXIST;
708 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
709 return -EINVAL;
710 err = qdisc_change(q, tca);
711 if (err == 0)
712 qdisc_notify(skb, n, clid, NULL, q);
713 return err;
715 create_n_graft:
716 if (!(n->nlmsg_flags&NLM_F_CREATE))
717 return -ENOENT;
718 if (clid == TC_H_INGRESS)
719 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
720 else
721 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
722 if (q == NULL) {
723 if (err == -EAGAIN)
724 goto replay;
725 return err;
728 graft:
729 if (1) {
730 struct Qdisc *old_q = NULL;
731 err = qdisc_graft(dev, p, clid, q, &old_q);
732 if (err) {
733 if (q) {
734 spin_lock_bh(&dev->queue_lock);
735 qdisc_destroy(q);
736 spin_unlock_bh(&dev->queue_lock);
738 return err;
740 qdisc_notify(skb, n, clid, old_q, q);
741 if (old_q) {
742 spin_lock_bh(&dev->queue_lock);
743 qdisc_destroy(old_q);
744 spin_unlock_bh(&dev->queue_lock);
747 return 0;
750 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
751 u32 pid, u32 seq, u16 flags, int event)
753 struct tcmsg *tcm;
754 struct nlmsghdr *nlh;
755 unsigned char *b = skb->tail;
756 struct gnet_dump d;
758 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
759 tcm = NLMSG_DATA(nlh);
760 tcm->tcm_family = AF_UNSPEC;
761 tcm->tcm__pad1 = 0;
762 tcm->tcm__pad2 = 0;
763 tcm->tcm_ifindex = q->dev->ifindex;
764 tcm->tcm_parent = clid;
765 tcm->tcm_handle = q->handle;
766 tcm->tcm_info = atomic_read(&q->refcnt);
767 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
768 if (q->ops->dump && q->ops->dump(q, skb) < 0)
769 goto rtattr_failure;
770 q->qstats.qlen = q->q.qlen;
772 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
773 TCA_XSTATS, q->stats_lock, &d) < 0)
774 goto rtattr_failure;
776 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
777 goto rtattr_failure;
779 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
780 #ifdef CONFIG_NET_ESTIMATOR
781 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
782 #endif
783 gnet_stats_copy_queue(&d, &q->qstats) < 0)
784 goto rtattr_failure;
786 if (gnet_stats_finish_copy(&d) < 0)
787 goto rtattr_failure;
789 nlh->nlmsg_len = skb->tail - b;
790 return skb->len;
792 nlmsg_failure:
793 rtattr_failure:
794 skb_trim(skb, b - skb->data);
795 return -1;
798 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
799 u32 clid, struct Qdisc *old, struct Qdisc *new)
801 struct sk_buff *skb;
802 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
804 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
805 if (!skb)
806 return -ENOBUFS;
808 if (old && old->handle) {
809 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
810 goto err_out;
812 if (new) {
813 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
814 goto err_out;
817 if (skb->len)
818 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
820 err_out:
821 kfree_skb(skb);
822 return -EINVAL;
825 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
827 int idx, q_idx;
828 int s_idx, s_q_idx;
829 struct net_device *dev;
830 struct Qdisc *q;
832 s_idx = cb->args[0];
833 s_q_idx = q_idx = cb->args[1];
834 read_lock(&dev_base_lock);
835 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
836 if (idx < s_idx)
837 continue;
838 if (idx > s_idx)
839 s_q_idx = 0;
840 read_lock(&qdisc_tree_lock);
841 q_idx = 0;
842 list_for_each_entry(q, &dev->qdisc_list, list) {
843 if (q_idx < s_q_idx) {
844 q_idx++;
845 continue;
847 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
848 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
849 read_unlock(&qdisc_tree_lock);
850 goto done;
852 q_idx++;
854 read_unlock(&qdisc_tree_lock);
857 done:
858 read_unlock(&dev_base_lock);
860 cb->args[0] = idx;
861 cb->args[1] = q_idx;
863 return skb->len;
868 /************************************************
869 * Traffic classes manipulation. *
870 ************************************************/
874 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
876 struct tcmsg *tcm = NLMSG_DATA(n);
877 struct rtattr **tca = arg;
878 struct net_device *dev;
879 struct Qdisc *q = NULL;
880 struct Qdisc_class_ops *cops;
881 unsigned long cl = 0;
882 unsigned long new_cl;
883 u32 pid = tcm->tcm_parent;
884 u32 clid = tcm->tcm_handle;
885 u32 qid = TC_H_MAJ(clid);
886 int err;
888 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
889 return -ENODEV;
892 parent == TC_H_UNSPEC - unspecified parent.
893 parent == TC_H_ROOT - class is root, which has no parent.
894 parent == X:0 - parent is root class.
895 parent == X:Y - parent is a node in hierarchy.
896 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
898 handle == 0:0 - generate handle from kernel pool.
899 handle == 0:Y - class is X:Y, where X:0 is qdisc.
900 handle == X:Y - clear.
901 handle == X:0 - root class.
904 /* Step 1. Determine qdisc handle X:0 */
906 if (pid != TC_H_ROOT) {
907 u32 qid1 = TC_H_MAJ(pid);
909 if (qid && qid1) {
910 /* If both majors are known, they must be identical. */
911 if (qid != qid1)
912 return -EINVAL;
913 } else if (qid1) {
914 qid = qid1;
915 } else if (qid == 0)
916 qid = dev->qdisc_sleeping->handle;
918 /* Now qid is genuine qdisc handle consistent
919 both with parent and child.
921 TC_H_MAJ(pid) still may be unspecified, complete it now.
923 if (pid)
924 pid = TC_H_MAKE(qid, pid);
925 } else {
926 if (qid == 0)
927 qid = dev->qdisc_sleeping->handle;
930 /* OK. Locate qdisc */
931 if ((q = qdisc_lookup(dev, qid)) == NULL)
932 return -ENOENT;
934 /* An check that it supports classes */
935 cops = q->ops->cl_ops;
936 if (cops == NULL)
937 return -EINVAL;
939 /* Now try to get class */
940 if (clid == 0) {
941 if (pid == TC_H_ROOT)
942 clid = qid;
943 } else
944 clid = TC_H_MAKE(qid, clid);
946 if (clid)
947 cl = cops->get(q, clid);
949 if (cl == 0) {
950 err = -ENOENT;
951 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
952 goto out;
953 } else {
954 switch (n->nlmsg_type) {
955 case RTM_NEWTCLASS:
956 err = -EEXIST;
957 if (n->nlmsg_flags&NLM_F_EXCL)
958 goto out;
959 break;
960 case RTM_DELTCLASS:
961 err = cops->delete(q, cl);
962 if (err == 0)
963 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
964 goto out;
965 case RTM_GETTCLASS:
966 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
967 goto out;
968 default:
969 err = -EINVAL;
970 goto out;
974 new_cl = cl;
975 err = cops->change(q, clid, pid, tca, &new_cl);
976 if (err == 0)
977 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
979 out:
980 if (cl)
981 cops->put(q, cl);
983 return err;
987 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
988 unsigned long cl,
989 u32 pid, u32 seq, u16 flags, int event)
991 struct tcmsg *tcm;
992 struct nlmsghdr *nlh;
993 unsigned char *b = skb->tail;
994 struct gnet_dump d;
995 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
997 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
998 tcm = NLMSG_DATA(nlh);
999 tcm->tcm_family = AF_UNSPEC;
1000 tcm->tcm_ifindex = q->dev->ifindex;
1001 tcm->tcm_parent = q->handle;
1002 tcm->tcm_handle = q->handle;
1003 tcm->tcm_info = 0;
1004 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1005 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1006 goto rtattr_failure;
1008 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1009 TCA_XSTATS, q->stats_lock, &d) < 0)
1010 goto rtattr_failure;
1012 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1013 goto rtattr_failure;
1015 if (gnet_stats_finish_copy(&d) < 0)
1016 goto rtattr_failure;
1018 nlh->nlmsg_len = skb->tail - b;
1019 return skb->len;
1021 nlmsg_failure:
1022 rtattr_failure:
1023 skb_trim(skb, b - skb->data);
1024 return -1;
1027 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1028 struct Qdisc *q, unsigned long cl, int event)
1030 struct sk_buff *skb;
1031 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1033 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1034 if (!skb)
1035 return -ENOBUFS;
1037 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1038 kfree_skb(skb);
1039 return -EINVAL;
1042 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1045 struct qdisc_dump_args
1047 struct qdisc_walker w;
1048 struct sk_buff *skb;
1049 struct netlink_callback *cb;
1052 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1054 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1056 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1057 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1060 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1062 int t;
1063 int s_t;
1064 struct net_device *dev;
1065 struct Qdisc *q;
1066 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1067 struct qdisc_dump_args arg;
1069 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1070 return 0;
1071 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1072 return 0;
1074 s_t = cb->args[0];
1075 t = 0;
1077 read_lock(&qdisc_tree_lock);
1078 list_for_each_entry(q, &dev->qdisc_list, list) {
1079 if (t < s_t || !q->ops->cl_ops ||
1080 (tcm->tcm_parent &&
1081 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1082 t++;
1083 continue;
1085 if (t > s_t)
1086 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1087 arg.w.fn = qdisc_class_dump;
1088 arg.skb = skb;
1089 arg.cb = cb;
1090 arg.w.stop = 0;
1091 arg.w.skip = cb->args[1];
1092 arg.w.count = 0;
1093 q->ops->cl_ops->walk(q, &arg.w);
1094 cb->args[1] = arg.w.count;
1095 if (arg.w.stop)
1096 break;
1097 t++;
1099 read_unlock(&qdisc_tree_lock);
1101 cb->args[0] = t;
1103 dev_put(dev);
1104 return skb->len;
1107 /* Main classifier routine: scans classifier chain attached
1108 to this qdisc, (optionally) tests for protocol and asks
1109 specific classifiers.
1111 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1112 struct tcf_result *res)
1114 int err = 0;
1115 u32 protocol = skb->protocol;
1116 #ifdef CONFIG_NET_CLS_ACT
1117 struct tcf_proto *otp = tp;
1118 reclassify:
1119 #endif
1120 protocol = skb->protocol;
1122 for ( ; tp; tp = tp->next) {
1123 if ((tp->protocol == protocol ||
1124 tp->protocol == __constant_htons(ETH_P_ALL)) &&
1125 (err = tp->classify(skb, tp, res)) >= 0) {
1126 #ifdef CONFIG_NET_CLS_ACT
1127 if ( TC_ACT_RECLASSIFY == err) {
1128 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1129 tp = otp;
1131 if (MAX_REC_LOOP < verd++) {
1132 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1133 tp->prio&0xffff, ntohs(tp->protocol));
1134 return TC_ACT_SHOT;
1136 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1137 goto reclassify;
1138 } else {
1139 if (skb->tc_verd)
1140 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1141 return err;
1143 #else
1145 return err;
1146 #endif
1150 return -1;
1153 static int psched_us_per_tick = 1;
1154 static int psched_tick_per_us = 1;
1156 #ifdef CONFIG_PROC_FS
1157 static int psched_show(struct seq_file *seq, void *v)
1159 seq_printf(seq, "%08x %08x %08x %08x\n",
1160 psched_tick_per_us, psched_us_per_tick,
1161 1000000, HZ);
1163 return 0;
1166 static int psched_open(struct inode *inode, struct file *file)
1168 return single_open(file, psched_show, PDE(inode)->data);
1171 static struct file_operations psched_fops = {
1172 .owner = THIS_MODULE,
1173 .open = psched_open,
1174 .read = seq_read,
1175 .llseek = seq_lseek,
1176 .release = single_release,
1178 #endif
1180 #ifdef CONFIG_NET_SCH_CLK_CPU
1181 psched_tdiff_t psched_clock_per_hz;
1182 int psched_clock_scale;
1183 EXPORT_SYMBOL(psched_clock_per_hz);
1184 EXPORT_SYMBOL(psched_clock_scale);
1186 psched_time_t psched_time_base;
1187 cycles_t psched_time_mark;
1188 EXPORT_SYMBOL(psched_time_mark);
1189 EXPORT_SYMBOL(psched_time_base);
1192 * Periodically adjust psched_time_base to avoid overflow
1193 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
1195 static void psched_tick(unsigned long);
1196 static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);
1198 static void psched_tick(unsigned long dummy)
1200 if (sizeof(cycles_t) == sizeof(u32)) {
1201 psched_time_t dummy_stamp;
1202 PSCHED_GET_TIME(dummy_stamp);
1203 psched_timer.expires = jiffies + 1*HZ;
1204 add_timer(&psched_timer);
1208 int __init psched_calibrate_clock(void)
1210 psched_time_t stamp, stamp1;
1211 struct timeval tv, tv1;
1212 psched_tdiff_t delay;
1213 long rdelay;
1214 unsigned long stop;
1216 psched_tick(0);
1217 stop = jiffies + HZ/10;
1218 PSCHED_GET_TIME(stamp);
1219 do_gettimeofday(&tv);
1220 while (time_before(jiffies, stop)) {
1221 barrier();
1222 cpu_relax();
1224 PSCHED_GET_TIME(stamp1);
1225 do_gettimeofday(&tv1);
1227 delay = PSCHED_TDIFF(stamp1, stamp);
1228 rdelay = tv1.tv_usec - tv.tv_usec;
1229 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1230 if (rdelay > delay)
1231 return -1;
1232 delay /= rdelay;
1233 psched_tick_per_us = delay;
1234 while ((delay>>=1) != 0)
1235 psched_clock_scale++;
1236 psched_us_per_tick = 1<<psched_clock_scale;
1237 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1238 return 0;
1240 #endif
1242 static int __init pktsched_init(void)
1244 struct rtnetlink_link *link_p;
1246 #ifdef CONFIG_NET_SCH_CLK_CPU
1247 if (psched_calibrate_clock() < 0)
1248 return -1;
1249 #elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1250 psched_tick_per_us = HZ<<PSCHED_JSCALE;
1251 psched_us_per_tick = 1000000;
1252 #endif
1254 link_p = rtnetlink_links[PF_UNSPEC];
1256 /* Setup rtnetlink links. It is made here to avoid
1257 exporting large number of public symbols.
1260 if (link_p) {
1261 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1262 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1263 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1264 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1265 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1266 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1267 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1268 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1271 register_qdisc(&pfifo_qdisc_ops);
1272 register_qdisc(&bfifo_qdisc_ops);
1273 proc_net_fops_create("psched", 0, &psched_fops);
1275 return 0;
1278 subsys_initcall(pktsched_init);
1280 EXPORT_SYMBOL(qdisc_lookup);
1281 EXPORT_SYMBOL(qdisc_get_rtab);
1282 EXPORT_SYMBOL(qdisc_put_rtab);
1283 EXPORT_SYMBOL(register_qdisc);
1284 EXPORT_SYMBOL(unregister_qdisc);
1285 EXPORT_SYMBOL(tc_classify);