* better
[mascara-docs.git] / i386 / linux-2.3.21 / net / sched / sch_api.c
blob8a8e3276d2b95691349b1ebbaa474bebae35f7bc
1 /*
2 * net/sched/sch_api.c Packet scheduler API.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 * Fixes:
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
17 #include <linux/config.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/netdevice.h>
29 #include <linux/skbuff.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/init.h>
32 #include <linux/proc_fs.h>
33 #include <linux/kmod.h>
35 #include <net/sock.h>
36 #include <net/pkt_sched.h>
38 #include <asm/processor.h>
39 #include <asm/uaccess.h>
40 #include <asm/system.h>
41 #include <asm/bitops.h>
43 #ifdef CONFIG_RTNETLINK
44 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
45 struct Qdisc *old, struct Qdisc *new);
46 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
47 struct Qdisc *q, unsigned long cl, int event);
48 #endif
/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   device is ready to send something) in order and at times
   determined by algorithm hidden in it.

   qdisc's are divided to two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets to "traffic classes",
     using "packet classifiers" (look at cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   information supplied by user in the form of handles
   to a form more intelligible for the kernel, to make some sanity
   checks and part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.

   All real intelligent work is done inside qdisc modules.

   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but it does not mean that queue is empty, it just means that
   discipline does not want to send anything this time.
   Queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not a
   real packet queue, but however q->q.qlen must be valid.

   ---enqueue

   enqueue returns 0, if packet was enqueued successfully.
   If packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP	- this packet dropped
     Expected action: do not backoff, but wait until queue will clear.
   NET_XMIT_CN		- probably this packet enqueued, but another one dropped.
     Expected action: backoff or ignore
   NET_XMIT_POLICED	- dropped by police.
     Expected action: backoff or error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues once dequeued packet. It is used for non-standard or
   just buggy devices, which can defer output even if dev->tbusy=0.

   ---reset

   returns qdisc to initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes newly created qdisc.

   ---destroy

   destroys resources allocated by init and during lifetime of qdisc.

   ---change

   changes qdisc parameters.
 */
131 /* Protects list of registered TC modules. It is pure SMP lock. */
132 static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED;
135 /************************************************
136 * Queueing disciplines manipulation. *
137 ************************************************/
140 /* The list of all installed queueing disciplines. */
142 static struct Qdisc_ops *qdisc_base = NULL;
144 /* Register/uregister queueing discipline */
146 int register_qdisc(struct Qdisc_ops *qops)
148 struct Qdisc_ops *q, **qp;
150 write_lock(&qdisc_mod_lock);
151 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) {
152 if (strcmp(qops->id, q->id) == 0) {
153 write_unlock(&qdisc_mod_lock);
154 return -EEXIST;
158 if (qops->enqueue == NULL)
159 qops->enqueue = noop_qdisc_ops.enqueue;
160 if (qops->requeue == NULL)
161 qops->requeue = noop_qdisc_ops.requeue;
162 if (qops->dequeue == NULL)
163 qops->dequeue = noop_qdisc_ops.dequeue;
165 qops->next = NULL;
166 *qp = qops;
167 write_unlock(&qdisc_mod_lock);
168 return 0;
171 int unregister_qdisc(struct Qdisc_ops *qops)
173 struct Qdisc_ops *q, **qp;
174 int err = -ENOENT;
176 write_lock(&qdisc_mod_lock);
177 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
178 if (q == qops)
179 break;
180 if (q) {
181 *qp = q->next;
182 q->next = NULL;
183 err = 0;
185 write_unlock(&qdisc_mod_lock);
186 return err;
189 /* We know handle. Find qdisc among all qdisc's attached to device
190 (root qdisc, all its children, children of children etc.)
193 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
195 struct Qdisc *q;
197 for (q = dev->qdisc_list; q; q = q->next) {
198 if (q->handle == handle)
199 return q;
201 return NULL;
204 struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
206 unsigned long cl;
207 struct Qdisc *leaf;
208 struct Qdisc_class_ops *cops = p->ops->cl_ops;
210 if (cops == NULL)
211 return NULL;
212 cl = cops->get(p, classid);
213 if (cl == 0)
214 return NULL;
215 leaf = cops->leaf(p, cl);
216 cops->put(p, cl);
217 return leaf;
220 /* Find queueing discipline by name */
222 struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
224 struct Qdisc_ops *q = NULL;
226 if (kind) {
227 read_lock(&qdisc_mod_lock);
228 for (q = qdisc_base; q; q = q->next) {
229 if (rtattr_strcmp(kind, q->id) == 0)
230 break;
232 read_unlock(&qdisc_mod_lock);
234 return q;
237 static struct qdisc_rate_table *qdisc_rtab_list;
239 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
241 struct qdisc_rate_table *rtab;
243 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
244 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
245 rtab->refcnt++;
246 return rtab;
250 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
251 return NULL;
253 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
254 if (rtab) {
255 rtab->rate = *r;
256 rtab->refcnt = 1;
257 memcpy(rtab->data, RTA_DATA(tab), 1024);
258 rtab->next = qdisc_rtab_list;
259 qdisc_rtab_list = rtab;
261 return rtab;
264 void qdisc_put_rtab(struct qdisc_rate_table *tab)
266 struct qdisc_rate_table *rtab, **rtabp;
268 if (!tab || --tab->refcnt)
269 return;
271 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
272 if (rtab == tab) {
273 *rtabp = rtab->next;
274 kfree(rtab);
275 return;
281 /* Allocate an unique handle from space managed by kernel */
283 u32 qdisc_alloc_handle(struct net_device *dev)
285 int i = 0x10000;
286 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
288 do {
289 autohandle += TC_H_MAKE(0x10000U, 0);
290 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
291 autohandle = TC_H_MAKE(0x80000000U, 0);
292 } while (qdisc_lookup(dev, autohandle) && --i > 0);
294 return i>0 ? autohandle : 0;
297 /* Attach toplevel qdisc to device dev */
299 static struct Qdisc *
300 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
302 struct Qdisc *oqdisc;
304 if (dev->flags & IFF_UP)
305 dev_deactivate(dev);
307 write_lock(&qdisc_tree_lock);
308 spin_lock_bh(&dev->queue_lock);
309 oqdisc = dev->qdisc_sleeping;
311 /* Prune old scheduler */
312 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
313 qdisc_reset(oqdisc);
315 /* ... and graft new one */
316 if (qdisc == NULL)
317 qdisc = &noop_qdisc;
318 dev->qdisc_sleeping = qdisc;
319 dev->qdisc = &noop_qdisc;
320 spin_unlock_bh(&dev->queue_lock);
321 write_unlock(&qdisc_tree_lock);
323 if (dev->flags & IFF_UP)
324 dev_activate(dev);
326 return oqdisc;
330 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
331 to device "dev".
333 Old qdisc is not destroyed but returned in *old.
336 int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid,
337 struct Qdisc *new, struct Qdisc **old)
339 int err = 0;
341 if (parent == NULL) {
342 *old = dev_graft_qdisc(dev, new);
343 } else {
344 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
346 err = -EINVAL;
348 if (cops) {
349 unsigned long cl = cops->get(parent, classid);
350 if (cl) {
351 err = cops->graft(parent, cl, new, old);
352 cops->put(parent, cl);
356 return err;
359 #ifdef CONFIG_RTNETLINK
362 Allocate and initialize new qdisc.
364 Parameters are passed via opt.
367 static struct Qdisc *
368 qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
370 int err;
371 struct rtattr *kind = tca[TCA_KIND-1];
372 struct Qdisc *sch = NULL;
373 struct Qdisc_ops *ops;
374 int size;
376 ops = qdisc_lookup_ops(kind);
377 #ifdef CONFIG_KMOD
378 if (ops==NULL && tca[TCA_KIND-1] != NULL) {
379 char module_name[4 + IFNAMSIZ + 1];
381 if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
382 sprintf(module_name, "sch_%s", (char*)RTA_DATA(kind));
383 request_module (module_name);
384 ops = qdisc_lookup_ops(kind);
387 #endif
389 err = -EINVAL;
390 if (ops == NULL)
391 goto err_out;
393 size = sizeof(*sch) + ops->priv_size;
395 sch = kmalloc(size, GFP_KERNEL);
396 err = -ENOBUFS;
397 if (!sch)
398 goto err_out;
400 /* Grrr... Resolve race condition with module unload */
402 err = -EINVAL;
403 if (ops != qdisc_lookup_ops(kind))
404 goto err_out;
406 memset(sch, 0, size);
408 skb_queue_head_init(&sch->q);
409 sch->ops = ops;
410 sch->enqueue = ops->enqueue;
411 sch->dequeue = ops->dequeue;
412 sch->dev = dev;
413 atomic_set(&sch->refcnt, 1);
414 sch->stats.lock = &dev->queue_lock;
415 if (handle == 0) {
416 handle = qdisc_alloc_handle(dev);
417 err = -ENOMEM;
418 if (handle == 0)
419 goto err_out;
421 sch->handle = handle;
423 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
424 write_lock(&qdisc_tree_lock);
425 sch->next = dev->qdisc_list;
426 dev->qdisc_list = sch;
427 write_unlock(&qdisc_tree_lock);
428 #ifdef CONFIG_NET_ESTIMATOR
429 if (tca[TCA_RATE-1])
430 qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
431 #endif
432 return sch;
435 err_out:
436 *errp = err;
437 if (sch)
438 kfree(sch);
439 return NULL;
442 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
444 if (tca[TCA_OPTIONS-1]) {
445 int err;
447 if (sch->ops->change == NULL)
448 return -EINVAL;
449 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
450 if (err)
451 return err;
453 #ifdef CONFIG_NET_ESTIMATOR
454 if (tca[TCA_RATE-1]) {
455 qdisc_kill_estimator(&sch->stats);
456 qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
458 #endif
459 return 0;
462 struct check_loop_arg
464 struct qdisc_walker w;
465 struct Qdisc *p;
466 int depth;
469 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
471 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
473 struct check_loop_arg arg;
475 if (q->ops->cl_ops == NULL)
476 return 0;
478 arg.w.stop = arg.w.skip = arg.w.count = 0;
479 arg.w.fn = check_loop_fn;
480 arg.depth = depth;
481 arg.p = p;
482 q->ops->cl_ops->walk(q, &arg.w);
483 return arg.w.stop ? -ELOOP : 0;
486 static int
487 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
489 struct Qdisc *leaf;
490 struct Qdisc_class_ops *cops = q->ops->cl_ops;
491 struct check_loop_arg *arg = (struct check_loop_arg *)w;
493 leaf = cops->leaf(q, cl);
494 if (leaf) {
495 if (leaf == arg->p || arg->depth > 7)
496 return -ELOOP;
497 return check_loop(leaf, arg->p, arg->depth + 1);
499 return 0;
503 * Delete/get qdisc.
506 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
508 struct tcmsg *tcm = NLMSG_DATA(n);
509 struct rtattr **tca = arg;
510 struct net_device *dev;
511 u32 clid = tcm->tcm_parent;
512 struct Qdisc *q = NULL;
513 struct Qdisc *p = NULL;
514 int err;
516 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
517 return -ENODEV;
519 if (clid) {
520 if (clid != TC_H_ROOT) {
521 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
522 return -ENOENT;
523 q = qdisc_leaf(p, clid);
524 } else
525 q = dev->qdisc_sleeping;
527 if (!q)
528 return -ENOENT;
530 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
531 return -EINVAL;
532 } else {
533 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
534 return -ENOENT;
537 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
538 return -EINVAL;
540 if (n->nlmsg_type == RTM_DELQDISC) {
541 if (!clid)
542 return -EINVAL;
543 if (q->handle == 0)
544 return -ENOENT;
545 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
546 return err;
547 if (q) {
548 qdisc_notify(skb, n, clid, q, NULL);
549 spin_lock_bh(&dev->queue_lock);
550 qdisc_destroy(q);
551 spin_unlock_bh(&dev->queue_lock);
553 } else {
554 qdisc_notify(skb, n, clid, NULL, q);
556 return 0;
560 Create/change qdisc.
563 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
565 struct tcmsg *tcm = NLMSG_DATA(n);
566 struct rtattr **tca = arg;
567 struct net_device *dev;
568 u32 clid = tcm->tcm_parent;
569 struct Qdisc *q = NULL;
570 struct Qdisc *p = NULL;
571 int err;
573 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
574 return -ENODEV;
576 if (clid) {
577 if (clid != TC_H_ROOT) {
578 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
579 return -ENOENT;
580 q = qdisc_leaf(p, clid);
581 } else {
582 q = dev->qdisc_sleeping;
585 /* It may be default qdisc, ignore it */
586 if (q && q->handle == 0)
587 q = NULL;
589 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
590 if (tcm->tcm_handle) {
591 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
592 return -EEXIST;
593 if (TC_H_MIN(tcm->tcm_handle))
594 return -EINVAL;
595 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
596 goto create_n_graft;
597 if (n->nlmsg_flags&NLM_F_EXCL)
598 return -EEXIST;
599 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
600 return -EINVAL;
601 if (q == p ||
602 (p && check_loop(q, p, 0)))
603 return -ELOOP;
604 atomic_inc(&q->refcnt);
605 goto graft;
606 } else {
607 if (q == NULL)
608 goto create_n_graft;
610 /* This magic test requires explanation.
612 * We know, that some child q is already
613 * attached to this parent and have choice:
614 * either to change it or to create/graft new one.
616 * 1. We are allowed to create/graft only
617 * if CREATE and REPLACE flags are set.
619 * 2. If EXCL is set, requestor wanted to say,
620 * that qdisc tcm_handle is not expected
621 * to exist, so that we choose create/graft too.
623 * 3. The last case is when no flags are set.
624 * Alas, it is sort of hole in API, we
625 * cannot decide what to do unambiguously.
626 * For now we select create/graft, if
627 * user gave KIND, which does not match existing.
629 if ((n->nlmsg_flags&NLM_F_CREATE) &&
630 (n->nlmsg_flags&NLM_F_REPLACE) &&
631 ((n->nlmsg_flags&NLM_F_EXCL) ||
632 (tca[TCA_KIND-1] &&
633 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
634 goto create_n_graft;
637 } else {
638 if (!tcm->tcm_handle)
639 return -EINVAL;
640 q = qdisc_lookup(dev, tcm->tcm_handle);
643 /* Change qdisc parameters */
644 if (q == NULL)
645 return -ENOENT;
646 if (n->nlmsg_flags&NLM_F_EXCL)
647 return -EEXIST;
648 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
649 return -EINVAL;
650 err = qdisc_change(q, tca);
651 if (err == 0)
652 qdisc_notify(skb, n, clid, NULL, q);
653 return err;
655 create_n_graft:
656 if (!(n->nlmsg_flags&NLM_F_CREATE))
657 return -ENOENT;
658 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
659 if (q == NULL)
660 return err;
662 graft:
663 if (1) {
664 struct Qdisc *old_q = NULL;
665 err = qdisc_graft(dev, p, clid, q, &old_q);
666 if (err) {
667 if (q) {
668 spin_lock_bh(&dev->queue_lock);
669 qdisc_destroy(q);
670 spin_unlock_bh(&dev->queue_lock);
672 return err;
674 qdisc_notify(skb, n, clid, old_q, q);
675 if (old_q) {
676 spin_lock_bh(&dev->queue_lock);
677 qdisc_destroy(old_q);
678 spin_unlock_bh(&dev->queue_lock);
681 return 0;
684 int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st)
686 spin_lock_bh(st->lock);
687 RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st);
688 spin_unlock_bh(st->lock);
689 return 0;
691 rtattr_failure:
692 spin_unlock_bh(st->lock);
693 return -1;
697 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
698 u32 pid, u32 seq, unsigned flags, int event)
700 struct tcmsg *tcm;
701 struct nlmsghdr *nlh;
702 unsigned char *b = skb->tail;
704 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
705 nlh->nlmsg_flags = flags;
706 tcm = NLMSG_DATA(nlh);
707 tcm->tcm_family = AF_UNSPEC;
708 tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
709 tcm->tcm_parent = clid;
710 tcm->tcm_handle = q->handle;
711 tcm->tcm_info = atomic_read(&q->refcnt);
712 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
713 if (q->ops->dump && q->ops->dump(q, skb) < 0)
714 goto rtattr_failure;
715 q->stats.qlen = q->q.qlen;
716 if (qdisc_copy_stats(skb, &q->stats))
717 goto rtattr_failure;
718 nlh->nlmsg_len = skb->tail - b;
719 return skb->len;
721 nlmsg_failure:
722 rtattr_failure:
723 skb_trim(skb, b - skb->data);
724 return -1;
727 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
728 u32 clid, struct Qdisc *old, struct Qdisc *new)
730 struct sk_buff *skb;
731 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
733 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
734 if (!skb)
735 return -ENOBUFS;
737 if (old && old->handle) {
738 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
739 goto err_out;
741 if (new) {
742 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
743 goto err_out;
746 if (skb->len)
747 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
749 err_out:
750 kfree_skb(skb);
751 return -EINVAL;
754 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
756 int idx, q_idx;
757 int s_idx, s_q_idx;
758 struct net_device *dev;
759 struct Qdisc *q;
761 s_idx = cb->args[0];
762 s_q_idx = q_idx = cb->args[1];
763 read_lock(&dev_base_lock);
764 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
765 if (idx < s_idx)
766 continue;
767 if (idx > s_idx)
768 s_q_idx = 0;
769 read_lock(&qdisc_tree_lock);
770 for (q = dev->qdisc_list, q_idx = 0; q;
771 q = q->next, q_idx++) {
772 if (q_idx < s_q_idx)
773 continue;
774 if (tc_fill_qdisc(skb, q, 0, NETLINK_CB(cb->skb).pid,
775 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
776 read_unlock(&qdisc_tree_lock);
777 goto done;
780 read_unlock(&qdisc_tree_lock);
783 done:
784 read_unlock(&dev_base_lock);
786 cb->args[0] = idx;
787 cb->args[1] = q_idx;
789 return skb->len;
794 /************************************************
795 * Traffic classes manipulation. *
796 ************************************************/
800 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
802 struct tcmsg *tcm = NLMSG_DATA(n);
803 struct rtattr **tca = arg;
804 struct net_device *dev;
805 struct Qdisc *q = NULL;
806 struct Qdisc_class_ops *cops;
807 unsigned long cl = 0;
808 unsigned long new_cl;
809 u32 pid = tcm->tcm_parent;
810 u32 clid = tcm->tcm_handle;
811 u32 qid = TC_H_MAJ(clid);
812 int err;
814 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
815 return -ENODEV;
818 parent == TC_H_UNSPEC - unspecified parent.
819 parent == TC_H_ROOT - class is root, which has no parent.
820 parent == X:0 - parent is root class.
821 parent == X:Y - parent is a node in hierarchy.
822 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
824 handle == 0:0 - generate handle from kernel pool.
825 handle == 0:Y - class is X:Y, where X:0 is qdisc.
826 handle == X:Y - clear.
827 handle == X:0 - root class.
830 /* Step 1. Determine qdisc handle X:0 */
832 if (pid != TC_H_ROOT) {
833 u32 qid1 = TC_H_MAJ(pid);
835 if (qid && qid1) {
836 /* If both majors are known, they must be identical. */
837 if (qid != qid1)
838 return -EINVAL;
839 } else if (qid1) {
840 qid = qid1;
841 } else if (qid == 0)
842 qid = dev->qdisc_sleeping->handle;
844 /* Now qid is genuine qdisc handle consistent
845 both with parent and child.
847 TC_H_MAJ(pid) still may be unspecified, complete it now.
849 if (pid)
850 pid = TC_H_MAKE(qid, pid);
851 } else {
852 if (qid == 0)
853 qid = dev->qdisc_sleeping->handle;
856 /* OK. Locate qdisc */
857 if ((q = qdisc_lookup(dev, qid)) == NULL)
858 return -ENOENT;
860 /* An check that it supports classes */
861 cops = q->ops->cl_ops;
862 if (cops == NULL)
863 return -EINVAL;
865 /* Now try to get class */
866 if (clid == 0) {
867 if (pid == TC_H_ROOT)
868 clid = qid;
869 } else
870 clid = TC_H_MAKE(qid, clid);
872 if (clid)
873 cl = cops->get(q, clid);
875 if (cl == 0) {
876 err = -ENOENT;
877 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
878 goto out;
879 } else {
880 switch (n->nlmsg_type) {
881 case RTM_NEWTCLASS:
882 err = -EEXIST;
883 if (n->nlmsg_flags&NLM_F_EXCL)
884 goto out;
885 break;
886 case RTM_DELTCLASS:
887 err = cops->delete(q, cl);
888 if (err == 0)
889 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
890 goto out;
891 case RTM_GETTCLASS:
892 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
893 goto out;
894 default:
895 err = -EINVAL;
896 goto out;
900 new_cl = cl;
901 err = cops->change(q, clid, pid, tca, &new_cl);
902 if (err == 0)
903 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
905 out:
906 if (cl)
907 cops->put(q, cl);
909 return err;
913 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
914 unsigned long cl,
915 u32 pid, u32 seq, unsigned flags, int event)
917 struct tcmsg *tcm;
918 struct nlmsghdr *nlh;
919 unsigned char *b = skb->tail;
921 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
922 nlh->nlmsg_flags = flags;
923 tcm = NLMSG_DATA(nlh);
924 tcm->tcm_family = AF_UNSPEC;
925 tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
926 tcm->tcm_parent = q->handle;
927 tcm->tcm_handle = q->handle;
928 tcm->tcm_info = 0;
929 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
930 if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0)
931 goto rtattr_failure;
932 nlh->nlmsg_len = skb->tail - b;
933 return skb->len;
935 nlmsg_failure:
936 rtattr_failure:
937 skb_trim(skb, b - skb->data);
938 return -1;
941 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
942 struct Qdisc *q, unsigned long cl, int event)
944 struct sk_buff *skb;
945 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
947 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
948 if (!skb)
949 return -ENOBUFS;
951 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
952 kfree_skb(skb);
953 return -EINVAL;
956 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
959 struct qdisc_dump_args
961 struct qdisc_walker w;
962 struct sk_buff *skb;
963 struct netlink_callback *cb;
966 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
968 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
970 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
971 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
974 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
976 int t;
977 int s_t;
978 struct net_device *dev;
979 struct Qdisc *q;
980 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
981 struct qdisc_dump_args arg;
983 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
984 return 0;
985 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
986 return 0;
988 s_t = cb->args[0];
990 read_lock(&qdisc_tree_lock);
991 for (q=dev->qdisc_list, t=0; q; q = q->next, t++) {
992 if (t < s_t) continue;
993 if (!q->ops->cl_ops) continue;
994 if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)
995 continue;
996 if (t > s_t)
997 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
998 arg.w.fn = qdisc_class_dump;
999 arg.skb = skb;
1000 arg.cb = cb;
1001 arg.w.stop = 0;
1002 arg.w.skip = cb->args[1];
1003 arg.w.count = 0;
1004 q->ops->cl_ops->walk(q, &arg.w);
1005 cb->args[1] = arg.w.count;
1006 if (arg.w.stop)
1007 break;
1009 read_unlock(&qdisc_tree_lock);
1011 cb->args[0] = t;
1013 dev_put(dev);
1014 return skb->len;
1016 #endif
int psched_us_per_tick = 1;
int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
/* /proc/net/psched: report clock resolution parameters. */
static int psched_read_proc(char *buffer, char **start, off_t offset,
			    int length, int *eof, void *data)
{
	int len;

	len = sprintf(buffer, "%08x %08x %08x %08x\n",
		      psched_tick_per_us, psched_us_per_tick,
		      1000000, HZ);

	len -= offset;

	if (len > length)
		len = length;
	if (len < 0)
		len = 0;

	*start = buffer + offset;
	*eof = 1;

	return len;
}
#endif
#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
/* Convert a seconds delta to microseconds, clamped to "bound".
   Returns "bound" directly when the multiplication would overflow
   or when the bound is at most one second anyway. */
int psched_tod_diff(int delta_sec, int bound)
{
	int delta;

	if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1)
		return bound;
	delta = delta_sec * 1000000;
	if (delta > bound)
		delta = bound;
	return delta;
}
#endif
1059 psched_time_t psched_time_base;
1061 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1062 psched_tdiff_t psched_clock_per_hz;
1063 int psched_clock_scale;
1064 #endif
1066 #ifdef PSCHED_WATCHER
1067 PSCHED_WATCHER psched_time_mark;
1069 static void psched_tick(unsigned long);
1071 static struct timer_list psched_timer =
1072 { NULL, NULL, 0, 0L, psched_tick };
1074 static void psched_tick(unsigned long dummy)
1076 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1077 psched_time_t dummy_stamp;
1078 PSCHED_GET_TIME(dummy_stamp);
1079 /* It is OK up to 4GHz cpu */
1080 psched_timer.expires = jiffies + 1*HZ;
1081 #else
1082 unsigned long now = jiffies;
1083 psched_time_base = ((u64)now)<<PSCHED_JSCALE;
1084 psched_time_mark = now;
1085 psched_timer.expires = now + 60*60*HZ;
1086 #endif
1087 add_timer(&psched_timer);
1089 #endif
1091 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1092 int __init psched_calibrate_clock(void)
1094 psched_time_t stamp, stamp1;
1095 struct timeval tv, tv1;
1096 psched_tdiff_t delay;
1097 long rdelay;
1098 unsigned long stop;
1100 #if CPU == 586 || CPU == 686
1101 if (!(boot_cpu_data.x86_capability & X86_FEATURE_TSC))
1102 return -1;
1103 #endif
1105 #ifdef PSCHED_WATCHER
1106 psched_tick(0);
1107 #endif
1108 stop = jiffies + HZ/10;
1109 PSCHED_GET_TIME(stamp);
1110 do_gettimeofday(&tv);
1111 while (time_before(jiffies, stop))
1112 barrier();
1113 PSCHED_GET_TIME(stamp1);
1114 do_gettimeofday(&tv1);
1116 delay = PSCHED_TDIFF(stamp1, stamp);
1117 rdelay = tv1.tv_usec - tv.tv_usec;
1118 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1119 if (rdelay > delay)
1120 return -1;
1121 delay /= rdelay;
1122 psched_tick_per_us = delay;
1123 while ((delay>>=1) != 0)
1124 psched_clock_scale++;
1125 psched_us_per_tick = 1<<psched_clock_scale;
1126 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1127 return 0;
1129 #endif
1131 int __init pktsched_init(void)
1133 #ifdef CONFIG_RTNETLINK
1134 struct rtnetlink_link *link_p;
1135 #endif
1136 #ifdef CONFIG_PROC_FS
1137 struct proc_dir_entry *ent;
1138 #endif
1140 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1141 if (psched_calibrate_clock() < 0)
1142 return -1;
1143 #elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
1144 psched_tick_per_us = HZ<<PSCHED_JSCALE;
1145 psched_us_per_tick = 1000000;
1146 #ifdef PSCHED_WATCHER
1147 psched_tick(0);
1148 #endif
1149 #endif
1151 #ifdef CONFIG_RTNETLINK
1152 link_p = rtnetlink_links[PF_UNSPEC];
1154 /* Setup rtnetlink links. It is made here to avoid
1155 exporting large number of public symbols.
1158 if (link_p) {
1159 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1160 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1161 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1162 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1163 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1164 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1165 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1166 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1168 #endif
1170 #define INIT_QDISC(name) { \
1171 extern struct Qdisc_ops name##_qdisc_ops; \
1172 register_qdisc(&##name##_qdisc_ops); \
1175 INIT_QDISC(pfifo);
1176 INIT_QDISC(bfifo);
1178 #ifdef CONFIG_NET_SCH_CBQ
1179 INIT_QDISC(cbq);
1180 #endif
1181 #ifdef CONFIG_NET_SCH_CSZ
1182 INIT_QDISC(csz);
1183 #endif
1184 #ifdef CONFIG_NET_SCH_HPFQ
1185 INIT_QDISC(hpfq);
1186 #endif
1187 #ifdef CONFIG_NET_SCH_HFSC
1188 INIT_QDISC(hfsc);
1189 #endif
1190 #ifdef CONFIG_NET_SCH_RED
1191 INIT_QDISC(red);
1192 #endif
1193 #ifdef CONFIG_NET_SCH_GRED
1194 INIT_QDISC(gred);
1195 #endif
1196 #ifdef CONFIG_NET_SCH_DSMARK
1197 INIT_QDISC(dsmark);
1198 #endif
1199 #ifdef CONFIG_NET_SCH_SFQ
1200 INIT_QDISC(sfq);
1201 #endif
1202 #ifdef CONFIG_NET_SCH_TBF
1203 INIT_QDISC(tbf);
1204 #endif
1205 #ifdef CONFIG_NET_SCH_TEQL
1206 teql_init();
1207 #endif
1208 #ifdef CONFIG_NET_SCH_PRIO
1209 INIT_QDISC(prio);
1210 #endif
1211 #ifdef CONFIG_NET_SCH_ATM
1212 INIT_QDISC(atm);
1213 #endif
1214 #ifdef CONFIG_NET_CLS
1215 tc_filter_init();
1216 #endif
1218 #ifdef CONFIG_PROC_FS
1219 ent = create_proc_entry("net/psched", 0, 0);
1220 ent->read_proc = psched_read_proc;
1221 #endif
1223 return 0;