1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/config.h>
14 #include <linux/module.h>
15 #include <asm/uaccess.h>
16 #include <asm/system.h>
17 #include <asm/bitops.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/if_ether.h>
29 #include <linux/inet.h>
30 #include <linux/netdevice.h>
31 #include <linux/etherdevice.h>
32 #include <linux/notifier.h>
33 #include <net/ip.h>
34 #include <net/route.h>
35 #include <linux/skbuff.h>
36 #include <net/sock.h>
37 #include <net/pkt_sched.h>
40 /* Class-Based Queueing (CBQ) algorithm.
41 =======================================
43 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
44 Management Models for Packet Networks",
45 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
47 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
49 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
50 Parameters", 1996
52 [4] Sally Floyd and Michael Speer, "Experimental Results
53 for Class-Based Queueing", 1998, not published.
55 -----------------------------------------------------------------------
57 The algorithm skeleton was taken from the NS simulator's cbq.cc.
58 If someone wants to check this code against the LBL version,
59 they should take into account that ONLY the skeleton was borrowed;
60 the implementation is different. In particular:
62 --- The WRR algorithm is different. Our version looks more
63 reasonable (I hope) and works when quanta are allowed to be
64 less than MTU, which is always the case when real-time classes
65 have small rates. Note that the statement of [3] is
66 incomplete; the delay may actually be estimated even if the class
67 per-round allotment is less than MTU. Namely, if the per-round
68 allotment is W*r_i, and r_1+...+r_k = r < 1
70 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
72 In the worst case we have IntServ estimate with D = W*r+k*MTU
73 and C = MTU*r. The proof (if correct at all) is trivial.
76 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
77 interpret some places, which look like wrong translations
78 from NS. Anyone is advised to find these differences
79 and explain to me, why I am wrong 8).
81 --- Linux has no EOI event, so we cannot estimate true class
82 idle time. The workaround is to consider the next dequeue event
83 as a sign that the previous packet has finished. This is wrong because of
84 internal device queueing, but on a permanently loaded link it is true.
85 Moreover, combined with the clock integrator, this scheme looks
86 very close to an ideal solution. */
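/* A quick sanity check of the delay bound above, with purely illustrative
   figures (not taken from [1]-[4]): let MTU = 1500 bytes, link bandwidth
   B = 125000 bytes/sec (1 Mbit/sec), k = 2 classes with r_1 = r_2 = 0.25
   (so r = 0.5) and W = 3000 bytes. The per-round allotment is then
   W*r_i = 750 bytes < MTU, [MTU/(W*r_i)] = 2 and W*r = 1500 bytes, giving

	delay_i <= (2*1500 + 1500 + 2*1500)/125000 = 0.06 sec = 60 msec,

   so the bound stays finite even though the allotment is below MTU. */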
88 struct cbq_sched_data;
91 struct cbq_class
93 struct cbq_class *next; /* hash table link */
94 struct cbq_class *next_alive; /* next class with backlog in this priority band */
96 /* Parameters */
97 u32 classid;
98 unsigned char priority; /* class priority */
99 unsigned char priority2; /* priority to be used after overlimit */
100 unsigned char ewma_log; /* time constant for idle time calculation */
101 unsigned char ovl_strategy;
102 #ifdef CONFIG_NET_CLS_POLICE
103 unsigned char police;
104 #endif
106 u32 defmap;
108 /* Link-sharing scheduler parameters */
109 long maxidle; /* Class parameters: see below. */
110 long offtime;
111 long minidle;
112 u32 avpkt;
113 struct qdisc_rate_table *R_tab;
115 /* Overlimit strategy parameters */
116 void (*overlimit)(struct cbq_class *cl);
117 long penalty;
119 /* General scheduler (WRR) parameters */
120 long allot;
121 long quantum; /* Allotment per WRR round */
122 long weight; /* Relative allotment: see below */
124 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
125 struct cbq_class *split; /* Ptr to split node */
126 struct cbq_class *share; /* Ptr to LS parent in the class tree */
127 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
128 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
129 parent otherwise */
130 struct cbq_class *sibling; /* Sibling chain */
131 struct cbq_class *children; /* Pointer to children chain */
133 struct Qdisc *q; /* Elementary queueing discipline */
136 /* Variables */
137 unsigned char cpriority; /* Effective priority */
138 unsigned char delayed;
139 unsigned char level; /* level of the class in hierarchy:
140 0 for leaf classes, and maximal
141 level of children + 1 for nodes.
144 psched_time_t last; /* Last end of service */
145 psched_time_t undertime;
146 long avgidle;
147 long deficit; /* Saved deficit for WRR */
148 unsigned long penalized;
149 struct tc_stats stats;
150 struct tc_cbq_xstats xstats;
152 struct tcf_proto *filter_list;
154 int refcnt;
155 int filters;
157 struct cbq_class *defaults[TC_PRIO_MAX+1];
160 struct cbq_sched_data
162 struct cbq_class *classes[16]; /* Hash table of all classes */
163 int nclasses[TC_CBQ_MAXPRIO+1];
164 unsigned quanta[TC_CBQ_MAXPRIO+1];
166 struct cbq_class link;
168 unsigned activemask;
169 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
170 with backlog */
172 #ifdef CONFIG_NET_CLS_POLICE
173 struct cbq_class *rx_class;
174 #endif
175 struct cbq_class *tx_class;
176 struct cbq_class *tx_borrowed;
177 int tx_len;
178 psched_time_t now; /* Cached timestamp */
179 psched_time_t now_rt; /* Cached real time */
180 unsigned pmask;
182 struct timer_list delay_timer;
183 struct timer_list wd_timer; /* Watchdog timer,
184 started when CBQ has
185 backlog, but cannot
186 transmit just now */
187 long wd_expires;
188 int toplevel;
189 u32 hgenerator;
193 #define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
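/* L2T ("length to time"): the rate table attached to the class holds 256
   precomputed transmission times (in psched ticks), indexed by
   len >> cell_log, so L2T(cl, len) is the time the class needs to send a
   packet of `len' bytes at its configured rate. */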
196 static __inline__ unsigned cbq_hash(u32 h)
198 h ^= h>>8;
199 h ^= h>>4;
200 return h&0xF;
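/* cbq_hash() folds the 32-bit classid down to 4 bits, matching the
   16-bucket classes[] table in cbq_sched_data; cbq_class_lookup() below
   simply walks the ->next chain of the selected bucket. */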
203 static __inline__ struct cbq_class *
204 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
206 struct cbq_class *cl;
208 for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
209 if (cl->classid == classid)
210 return cl;
211 return NULL;
214 #ifdef CONFIG_NET_CLS_POLICE
216 static struct cbq_class *
217 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
219 struct cbq_class *cl, *new;
221 for (cl = this->tparent; cl; cl = cl->tparent)
222 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
223 return new;
225 return NULL;
228 #endif
230 /* Classify packet. The procedure is pretty complicated, but
231 it allows us to combine link sharing and priority scheduling
232 transparently.
234 Namely, you can put link-sharing rules (f.e. route based) at the root of CBQ,
235 so that they resolve to split nodes. Then packets are classified
236 by logical priority, or a more specific classifier may be attached
237 to the split node.
240 static struct cbq_class *
241 cbq_classify(struct sk_buff *skb, struct Qdisc *sch)
243 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
244 struct cbq_class *head = &q->link;
245 struct cbq_class **defmap;
246 struct cbq_class *cl = NULL;
247 u32 prio = skb->priority;
248 struct tcf_result res;
251 * Step 1. If skb->priority points to one of our classes, use it.
253 if (TC_H_MAJ(prio^sch->handle) == 0 &&
254 (cl = cbq_class_lookup(q, prio)) != NULL)
255 return cl;
257 for (;;) {
258 int result = 0;
260 defmap = head->defaults;
263 * Step 2+n. Apply classifier.
265 if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
266 goto fallback;
268 if ((cl = (void*)res.class) == NULL) {
269 if (TC_H_MAJ(res.classid))
270 cl = cbq_class_lookup(q, res.classid);
271 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
272 cl = defmap[TC_PRIO_BESTEFFORT];
274 if (cl == NULL || cl->level >= head->level)
275 goto fallback;
278 #ifdef CONFIG_NET_CLS_POLICE
279 switch (result) {
280 case TC_POLICE_RECLASSIFY:
281 return cbq_reclassify(skb, cl);
282 case TC_POLICE_SHOT:
283 return NULL;
284 default:
286 #endif
287 if (cl->level == 0)
288 return cl;
291 * Step 3+n. If the classifier selected a link-sharing class,
292 * apply the agency-specific classifier.
293 * Repeat this procedure until we hit a leaf node.
295 head = cl;
298 fallback:
299 cl = head;
302 * Step 4. No success...
304 if (TC_H_MAJ(prio) == 0 &&
305 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
306 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
307 return head;
309 return cl;
313 A packet has just been enqueued on an empty class.
314 cbq_activate_class adds it to the tail of the active class list
315 of its priority band.
318 static __inline__ void cbq_activate_class(struct cbq_class *cl)
320 struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
321 int prio = cl->cpriority;
322 struct cbq_class *cl_tail;
324 cl_tail = q->active[prio];
325 q->active[prio] = cl;
327 if (cl_tail != NULL) {
328 cl->next_alive = cl_tail->next_alive;
329 cl_tail->next_alive = cl;
330 } else {
331 cl->next_alive = cl;
332 q->activemask |= (1<<prio);
337 Unlink class from active chain.
338 Note that this same procedure is done directly in cbq_dequeue*
339 during round-robin procedure.
342 static void cbq_deactivate_class(struct cbq_class *this)
344 struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
345 int prio = this->cpriority;
346 struct cbq_class *cl;
347 struct cbq_class *cl_prev = q->active[prio];
349 do {
350 cl = cl_prev->next_alive;
351 if (cl == this) {
352 cl_prev->next_alive = cl->next_alive;
353 cl->next_alive = NULL;
355 if (cl == q->active[prio]) {
356 q->active[prio] = cl_prev;
357 if (cl == q->active[prio]) {
358 q->active[prio] = NULL;
359 q->activemask &= ~(1<<prio);
360 return;
364 cl = cl_prev->next_alive;
365 return;
367 } while ((cl_prev = cl) != q->active[prio]);
370 static void
371 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
373 int toplevel = q->toplevel;
375 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
376 psched_time_t now;
377 psched_tdiff_t incr;
379 PSCHED_GET_TIME(now);
380 incr = PSCHED_TDIFF(now, q->now_rt);
381 PSCHED_TADD2(q->now, incr, now);
383 do {
384 if (PSCHED_TLESS(cl->undertime, now)) {
385 q->toplevel = cl->level;
386 return;
388 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
392 static int
393 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
395 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
396 struct cbq_class *cl = cbq_classify(skb, sch);
397 int len = skb->len;
398 int ret = NET_XMIT_POLICED;
400 #ifdef CONFIG_NET_CLS_POLICE
401 q->rx_class = cl;
402 #endif
403 if (cl) {
404 #ifdef CONFIG_NET_CLS_POLICE
405 cl->q->__parent = sch;
406 #endif
407 if ((ret = cl->q->enqueue(skb, cl->q)) == 0) {
408 sch->q.qlen++;
409 sch->stats.packets++;
410 sch->stats.bytes+=len;
411 cbq_mark_toplevel(q, cl);
412 if (!cl->next_alive)
413 cbq_activate_class(cl);
414 return 0;
418 sch->stats.drops++;
419 if (cl == NULL)
420 kfree_skb(skb);
421 else {
422 cbq_mark_toplevel(q, cl);
423 cl->stats.drops++;
425 return ret;
428 static int
429 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
431 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
432 struct cbq_class *cl;
433 int ret;
435 if ((cl = q->tx_class) == NULL) {
436 kfree_skb(skb);
437 sch->stats.drops++;
438 return NET_XMIT_CN;
440 q->tx_class = NULL;
442 cbq_mark_toplevel(q, cl);
444 #ifdef CONFIG_NET_CLS_POLICE
445 q->rx_class = cl;
446 cl->q->__parent = sch;
447 #endif
448 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
449 sch->q.qlen++;
450 if (!cl->next_alive)
451 cbq_activate_class(cl);
452 return 0;
454 sch->stats.drops++;
455 cl->stats.drops++;
456 return ret;
459 /* Overlimit actions */
461 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
463 static void cbq_ovl_classic(struct cbq_class *cl)
465 struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
466 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
468 if (!cl->delayed) {
469 delay += cl->offtime;
472 The class goes to sleep, so that it will have no
473 chance to accumulate avgidle. Let's forgive it 8)
475 BTW cbq-2.0 has a bug in this
476 place; apparently they forgot to shift it by cl->ewma_log.
478 if (cl->avgidle < 0)
479 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
480 if (cl->avgidle < cl->minidle)
481 cl->avgidle = cl->minidle;
482 if (delay <= 0)
483 delay = 1;
484 PSCHED_TADD2(q->now, delay, cl->undertime);
486 cl->xstats.overactions++;
487 cl->delayed = 1;
489 if (q->wd_expires == 0 || q->wd_expires > delay)
490 q->wd_expires = delay;
492 /* Dirty work! We must schedule wakeups based on
493 real available rate, rather than leaf rate,
494 which may be tiny (even zero).
496 if (q->toplevel == TC_CBQ_MAXLEVEL) {
497 struct cbq_class *b;
498 psched_tdiff_t base_delay = q->wd_expires;
500 for (b = cl->borrow; b; b = b->borrow) {
501 delay = PSCHED_TDIFF(b->undertime, q->now);
502 if (delay < base_delay) {
503 if (delay <= 0)
504 delay = 1;
505 base_delay = delay;
509 q->wd_expires = delay;
513 /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
514 they go overlimit
517 static void cbq_ovl_rclassic(struct cbq_class *cl)
519 struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
520 struct cbq_class *this = cl;
522 do {
523 if (cl->level > q->toplevel) {
524 cl = NULL;
525 break;
527 } while ((cl = cl->borrow) != NULL);
529 if (cl == NULL)
530 cl = this;
531 cbq_ovl_classic(cl);
534 /* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
536 static void cbq_ovl_delay(struct cbq_class *cl)
538 struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
539 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
541 if (!cl->delayed) {
542 unsigned long sched = jiffies;
544 delay += cl->offtime;
545 if (cl->avgidle < 0)
546 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
547 if (cl->avgidle < cl->minidle)
548 cl->avgidle = cl->minidle;
549 PSCHED_TADD2(q->now, delay, cl->undertime);
551 if (delay > 0) {
552 sched += PSCHED_US2JIFFIE(delay) + cl->penalty;
553 cl->penalized = sched;
554 cl->cpriority = TC_CBQ_MAXPRIO;
555 q->pmask |= (1<<TC_CBQ_MAXPRIO);
556 if (del_timer(&q->delay_timer) &&
557 (long)(q->delay_timer.expires - sched) > 0)
558 q->delay_timer.expires = sched;
559 add_timer(&q->delay_timer);
560 cl->delayed = 1;
561 cl->xstats.overactions++;
562 return;
564 delay = 1;
566 if (q->wd_expires == 0 || q->wd_expires > delay)
567 q->wd_expires = delay;
570 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
572 static void cbq_ovl_lowprio(struct cbq_class *cl)
574 struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
576 cl->penalized = jiffies + cl->penalty;
578 if (cl->cpriority != cl->priority2) {
579 cl->cpriority = cl->priority2;
580 q->pmask |= (1<<cl->cpriority);
581 cl->xstats.overactions++;
583 cbq_ovl_classic(cl);
586 /* TC_CBQ_OVL_DROP: penalize class by dropping */
588 static void cbq_ovl_drop(struct cbq_class *cl)
590 if (cl->q->ops->drop)
591 if (cl->q->ops->drop(cl->q))
592 cl->qdisc->q.qlen--;
593 cl->xstats.overactions++;
594 cbq_ovl_classic(cl);
597 static void cbq_watchdog(unsigned long arg)
599 struct Qdisc *sch = (struct Qdisc*)arg;
600 sch->flags &= ~TCQ_F_THROTTLED;
601 qdisc_wakeup(sch->dev);
604 static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio)
606 struct cbq_class *cl;
607 struct cbq_class *cl_prev = q->active[prio];
608 unsigned long now = jiffies;
609 unsigned long sched = now;
611 if (cl_prev == NULL)
612 return now;
614 do {
615 cl = cl_prev->next_alive;
616 if ((long)(now - cl->penalized) > 0) {
617 cl_prev->next_alive = cl->next_alive;
618 cl->next_alive = NULL;
619 cl->cpriority = cl->priority;
620 cl->delayed = 0;
621 cbq_activate_class(cl);
623 if (cl == q->active[prio]) {
624 q->active[prio] = cl_prev;
625 if (cl == q->active[prio]) {
626 q->active[prio] = NULL;
627 return 0;
631 cl = cl_prev->next_alive;
632 } else if ((long)(sched - cl->penalized) > 0)
633 sched = cl->penalized;
634 } while ((cl_prev = cl) != q->active[prio]);
636 return (long)(sched - now);
639 static void cbq_undelay(unsigned long arg)
641 struct Qdisc *sch = (struct Qdisc*)arg;
642 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
643 long delay = 0;
644 unsigned pmask;
646 pmask = q->pmask;
647 q->pmask = 0;
649 while (pmask) {
650 int prio = ffz(~pmask);
651 long tmp;
653 pmask &= ~(1<<prio);
655 tmp = cbq_undelay_prio(q, prio);
656 if (tmp > 0) {
657 q->pmask |= 1<<prio;
658 if (tmp < delay || delay == 0)
659 delay = tmp;
663 if (delay) {
664 q->delay_timer.expires = jiffies + delay;
665 add_timer(&q->delay_timer);
668 sch->flags &= ~TCQ_F_THROTTLED;
669 qdisc_wakeup(sch->dev);
673 #ifdef CONFIG_NET_CLS_POLICE
675 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
677 int len = skb->len;
678 struct Qdisc *sch = child->__parent;
679 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
680 struct cbq_class *cl = q->rx_class;
682 q->rx_class = NULL;
684 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
686 cbq_mark_toplevel(q, cl);
688 q->rx_class = cl;
689 cl->q->__parent = sch;
691 if (cl->q->enqueue(skb, cl->q) == 0) {
692 sch->q.qlen++;
693 sch->stats.packets++;
694 sch->stats.bytes+=len;
695 if (!cl->next_alive)
696 cbq_activate_class(cl);
697 return 0;
699 sch->stats.drops++;
700 return 0;
703 sch->stats.drops++;
704 return -1;
706 #endif
709 This is a mission-critical procedure.
711 We "regenerate" the toplevel cutoff if the transmitting class
712 has backlog and is not regulated. It is not part of the
713 original CBQ description, but looks more reasonable.
714 Probably it is wrong; this question needs further investigation.
717 static __inline__ void
718 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
719 struct cbq_class *borrowed)
721 if (cl && q->toplevel >= borrowed->level) {
722 if (cl->q->q.qlen > 1) {
723 do {
724 if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
725 q->toplevel = borrowed->level;
726 return;
728 } while ((borrowed=borrowed->borrow) != NULL);
730 #if 0
731 /* It is not necessary now. Uncommenting it
732 will save CPU cycles, but decrease fairness.
734 q->toplevel = TC_CBQ_MAXLEVEL;
735 #endif
739 static void
740 cbq_update(struct cbq_sched_data *q)
742 struct cbq_class *this = q->tx_class;
743 struct cbq_class *cl = this;
744 int len = q->tx_len;
746 q->tx_class = NULL;
748 for ( ; cl; cl = cl->share) {
749 long avgidle = cl->avgidle;
750 long idle;
752 cl->stats.packets++;
753 cl->stats.bytes += len;
756 (now - last) is total time between packet right edges.
757 (last_pktlen/rate) is "virtual" busy time, so that
759 idle = (now - last) - last_pktlen/rate
762 idle = PSCHED_TDIFF(q->now, cl->last);
763 if ((unsigned long)idle > 128*1024*1024) {
764 avgidle = cl->maxidle;
765 } else {
766 idle -= L2T(cl, len);
768 /* true_avgidle := (1-W)*true_avgidle + W*idle,
769 where W=2^{-ewma_log}. But cl->avgidle is scaled:
770 cl->avgidle == true_avgidle/W,
771 hence:
773 avgidle += idle - (avgidle>>cl->ewma_log);
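/* The line above follows from dividing the EWMA by W:
	avgidle_new = (1-W)*avgidle + idle = avgidle + idle - W*avgidle,
   and W*avgidle is avgidle>>ewma_log (up to integer rounding),
   since W = 2^{-ewma_log}. */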
776 if (avgidle <= 0) {
777 /* Overlimit or at-limit */
779 if (avgidle < cl->minidle)
780 avgidle = cl->minidle;
782 cl->avgidle = avgidle;
784 /* Calculate the expected time when this class
785 will be allowed to send.
786 It will occur when:
787 (1-W)*true_avgidle + W*idle = 0, i.e.
788 idle = (1/W - 1)*(-true_avgidle)
790 idle = (1 - W)*(-cl->avgidle);
792 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
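/* This is the scaled form of the expression in the comment above:
   since true_avgidle == W*avgidle, (1/W - 1)*(-true_avgidle) equals
   (1 - W)*(-avgidle), i.e. (-avgidle) minus (-avgidle)>>ewma_log. */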
795 That is not all.
796 To maintain the rate allocated to the class,
797 we add to undertime the virtual clock time
798 necessary to complete the transmitted packet.
799 (len/phys_bandwidth has already been accounted for
800 by the time cbq_update is called)
803 idle -= L2T(&q->link, len);
804 idle += L2T(cl, len);
806 PSCHED_AUDIT_TDIFF(idle);
808 PSCHED_TADD2(q->now, idle, cl->undertime);
809 } else {
810 /* Underlimit */
812 PSCHED_SET_PASTPERFECT(cl->undertime);
813 if (avgidle > cl->maxidle)
814 cl->avgidle = cl->maxidle;
815 else
816 cl->avgidle = avgidle;
818 cl->last = q->now;
821 cbq_update_toplevel(q, this, q->tx_borrowed);
824 static __inline__ struct cbq_class *
825 cbq_under_limit(struct cbq_class *cl)
827 struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
828 struct cbq_class *this_cl = cl;
830 if (cl->tparent == NULL)
831 return cl;
833 if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
834 !PSCHED_TLESS(q->now, cl->undertime)) {
835 cl->delayed = 0;
836 return cl;
839 do {
840 /* This is a very suspicious place. Currently the overlimit
841 action is generated for non-bounded classes
842 only if the link is completely congested.
843 Though this agrees with the ancestor-only paradigm,
844 it looks very stupid. In particular,
845 it means that this chunk of code will either
846 never be called or result in strong amplification
847 of burstiness. Dangerous, silly, and yet
848 no other solution exists.
850 if ((cl = cl->borrow) == NULL) {
851 this_cl->stats.overlimits++;
852 this_cl->overlimit(this_cl);
853 return NULL;
855 if (cl->level > q->toplevel)
856 return NULL;
857 } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
858 PSCHED_TLESS(q->now, cl->undertime));
860 cl->delayed = 0;
861 return cl;
864 static __inline__ struct sk_buff *
865 cbq_dequeue_prio(struct Qdisc *sch, int prio)
867 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
868 struct cbq_class *cl_tail, *cl_prev, *cl;
869 struct sk_buff *skb;
870 int deficit;
872 cl_tail = cl_prev = q->active[prio];
873 cl = cl_prev->next_alive;
875 do {
876 deficit = 0;
878 /* Start round */
879 do {
880 struct cbq_class *borrow = NULL;
882 if (cl->q->q.qlen &&
883 (borrow = cbq_under_limit(cl)) == NULL)
884 goto skip_class;
886 if (cl->deficit <= 0) {
887 /* Class exhausted its allotment per
888 this round. Switch to the next one.
890 deficit = 1;
891 cl->deficit += cl->quantum;
892 goto next_class;
895 skb = cl->q->dequeue(cl->q);
897 /* Class did not give us any skb :-(
898 It could occur even if cl->q->q.qlen != 0
899 f.e. if cl->q == "tbf"
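The tbf qdisc, for example, may hold back queued packets until its
token bucket refills, so a non-empty child queue can legitimately
return NULL here.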
901 if (skb == NULL)
902 goto skip_class;
904 cl->deficit -= skb->len;
905 q->tx_class = cl;
906 q->tx_borrowed = borrow;
907 if (borrow != cl) {
908 #ifndef CBQ_XSTATS_BORROWS_BYTES
909 borrow->xstats.borrows++;
910 cl->xstats.borrows++;
911 #else
912 borrow->xstats.borrows += skb->len;
913 cl->xstats.borrows += skb->len;
914 #endif
916 q->tx_len = skb->len;
918 if (cl->deficit <= 0) {
919 q->active[prio] = cl;
920 cl = cl->next_alive;
921 cl->deficit += cl->quantum;
923 return skb;
925 skip_class:
926 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
927 /* Class is empty or penalized.
928 Unlink it from active chain.
930 cl_prev->next_alive = cl->next_alive;
931 cl->next_alive = NULL;
933 /* Did cl_tail point to it? */
934 if (cl == cl_tail) {
935 /* Repair it! */
936 cl_tail = cl_prev;
938 /* Was it the last class in this band? */
939 if (cl == cl_tail) {
940 /* Kill the band! */
941 q->active[prio] = NULL;
942 q->activemask &= ~(1<<prio);
943 if (cl->q->q.qlen)
944 cbq_activate_class(cl);
945 return NULL;
948 q->active[prio] = cl_tail;
950 if (cl->q->q.qlen)
951 cbq_activate_class(cl);
953 cl = cl_prev;
956 next_class:
957 cl_prev = cl;
958 cl = cl->next_alive;
959 } while (cl_prev != cl_tail);
960 } while (deficit);
962 q->active[prio] = cl_prev;
964 return NULL;
967 static __inline__ struct sk_buff *
968 cbq_dequeue_1(struct Qdisc *sch)
970 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
971 struct sk_buff *skb;
972 unsigned activemask;
974 activemask = q->activemask&0xFF;
975 while (activemask) {
976 int prio = ffz(~activemask);
977 activemask &= ~(1<<prio);
978 skb = cbq_dequeue_prio(sch, prio);
979 if (skb)
980 return skb;
982 return NULL;
985 static struct sk_buff *
986 cbq_dequeue(struct Qdisc *sch)
988 struct sk_buff *skb;
989 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
990 psched_time_t now;
991 psched_tdiff_t incr;
993 PSCHED_GET_TIME(now);
994 incr = PSCHED_TDIFF(now, q->now_rt);
996 if (q->tx_class) {
997 psched_tdiff_t incr2;
998 /* Time integrator. We calculate EOS time
999 by adding the expected packet transmission time.
1000 If real time is greater, we warp artificial clock,
1001 so that:
1003 cbq_time = max(real_time, work);
1005 incr2 = L2T(&q->link, q->tx_len);
1006 PSCHED_TADD(q->now, incr2);
1007 cbq_update(q);
1008 if ((incr -= incr2) < 0)
1009 incr = 0;
1011 PSCHED_TADD(q->now, incr);
1012 q->now_rt = now;
1014 for (;;) {
1015 q->wd_expires = 0;
1017 skb = cbq_dequeue_1(sch);
1018 if (skb) {
1019 sch->q.qlen--;
1020 sch->flags &= ~TCQ_F_THROTTLED;
1021 return skb;
1024 /* All the classes are overlimit.
1026 This is possible if:
1028 1. The scheduler is empty.
1029 2. The toplevel cutoff inhibited borrowing.
1030 3. The root class is overlimit.
1032 Reset the 2nd and 3rd conditions and retry.
1034 Note that NS and cbq-2.0 are buggy: peeking
1035 an arbitrary class is appropriate for ancestor-only
1036 sharing, but not for the toplevel algorithm.
1038 Our version is better but slower, because it requires
1039 two passes; this is unavoidable with top-level sharing.
1042 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1043 PSCHED_IS_PASTPERFECT(q->link.undertime))
1044 break;
1046 q->toplevel = TC_CBQ_MAXLEVEL;
1047 PSCHED_SET_PASTPERFECT(q->link.undertime);
1050 /* No packets in the scheduler, or nobody wants to give them to us :-(
1051 Sigh... start the watchdog timer in the latter case. */
1053 if (sch->q.qlen) {
1054 sch->stats.overlimits++;
1055 if (q->wd_expires && !sch->dev->tbusy) {
1056 long delay = PSCHED_US2JIFFIE(q->wd_expires);
1057 del_timer(&q->wd_timer);
1058 if (delay <= 0)
1059 delay = 1;
1060 q->wd_timer.expires = jiffies + delay;
1061 add_timer(&q->wd_timer);
1062 sch->flags |= TCQ_F_THROTTLED;
1065 return NULL;
1068 /* CBQ class maintenance routines */
1070 static void cbq_adjust_levels(struct cbq_class *this)
1072 if (this == NULL)
1073 return;
1075 do {
1076 int level = 0;
1077 struct cbq_class *cl;
1079 if ((cl = this->children) != NULL) {
1080 do {
1081 if (cl->level > level)
1082 level = cl->level;
1083 } while ((cl = cl->sibling) != this->children);
1085 this->level = level+1;
1086 } while ((this = this->tparent) != NULL);
1089 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1091 struct cbq_class *cl;
1092 unsigned h;
1094 if (q->quanta[prio] == 0)
1095 return;
1097 for (h=0; h<16; h++) {
1098 for (cl = q->classes[h]; cl; cl = cl->next) {
1099 /* BUGGGG... Beware! This expression suffers from
1100 arithmetic overflows!
1102 if (cl->priority == prio) {
1103 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1104 q->quanta[prio];
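/* q->quanta[prio] is the sum of the weights of the classes in this
   priority band, so the quantum works out to
	allot * nclasses * weight/sum(weights);
   e.g. (illustrative numbers) two classes with allot 1500 and weights
   1 and 3 get quanta of 750 and 2250, keeping the band average at allot. */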
1106 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1107 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1108 cl->quantum = cl->qdisc->dev->mtu/2 + 1;
1114 static void cbq_sync_defmap(struct cbq_class *cl)
1116 struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
1117 struct cbq_class *split = cl->split;
1118 unsigned h;
1119 int i;
1121 if (split == NULL)
1122 return;
1124 for (i=0; i<=TC_PRIO_MAX; i++) {
1125 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1126 split->defaults[i] = NULL;
1129 for (i=0; i<=TC_PRIO_MAX; i++) {
1130 int level = split->level;
1132 if (split->defaults[i])
1133 continue;
1135 for (h=0; h<16; h++) {
1136 struct cbq_class *c;
1138 for (c = q->classes[h]; c; c = c->next) {
1139 if (c->split == split && c->level < level &&
1140 c->defmap&(1<<i)) {
1141 split->defaults[i] = c;
1142 level = c->level;
1149 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1151 struct cbq_class *split = NULL;
1153 if (splitid == 0) {
1154 if ((split = cl->split) == NULL)
1155 return;
1156 splitid = split->classid;
1159 if (split == NULL || split->classid != splitid) {
1160 for (split = cl->tparent; split; split = split->tparent)
1161 if (split->classid == splitid)
1162 break;
1165 if (split == NULL)
1166 return;
1168 if (cl->split != split) {
1169 cl->defmap = 0;
1170 cbq_sync_defmap(cl);
1171 cl->split = split;
1172 cl->defmap = def&mask;
1173 } else
1174 cl->defmap = (cl->defmap&~mask)|(def&mask);
1176 cbq_sync_defmap(cl);
1179 static void cbq_unlink_class(struct cbq_class *this)
1181 struct cbq_class *cl, **clp;
1182 struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
1184 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1185 if (cl == this) {
1186 *clp = cl->next;
1187 cl->next = NULL;
1188 break;
1192 if (this->tparent) {
1193 clp=&this->sibling;
1194 cl = *clp;
1195 do {
1196 if (cl == this) {
1197 *clp = cl->sibling;
1198 break;
1200 clp = &cl->sibling;
1201 } while ((cl = *clp) != this->sibling);
1203 if (this->tparent->children == this) {
1204 this->tparent->children = this->sibling;
1205 if (this->sibling == this)
1206 this->tparent->children = NULL;
1208 } else {
1209 BUG_TRAP(this->sibling == this);
1213 static void cbq_link_class(struct cbq_class *this)
1215 struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
1216 unsigned h = cbq_hash(this->classid);
1217 struct cbq_class *parent = this->tparent;
1219 this->sibling = this;
1220 this->next = q->classes[h];
1221 q->classes[h] = this;
1223 if (parent == NULL)
1224 return;
1226 if (parent->children == NULL) {
1227 parent->children = this;
1228 } else {
1229 this->sibling = parent->children->sibling;
1230 parent->children->sibling = this;
1234 static int cbq_drop(struct Qdisc* sch)
1236 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1237 struct cbq_class *cl, *cl_head;
1238 int prio;
1240 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1241 if ((cl_head = q->active[prio]) == NULL)
1242 continue;
1244 cl = cl_head;
1245 do {
1246 if (cl->q->ops->drop && cl->q->ops->drop(cl->q))
1247 return 1;
1248 } while ((cl = cl->next_alive) != cl_head);
1250 return 0;
1253 static void
1254 cbq_reset(struct Qdisc* sch)
1256 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1257 struct cbq_class *cl;
1258 int prio;
1259 unsigned h;
1261 q->activemask = 0;
1262 q->pmask = 0;
1263 q->tx_class = NULL;
1264 q->tx_borrowed = NULL;
1265 del_timer(&q->wd_timer);
1266 del_timer(&q->delay_timer);
1267 q->toplevel = TC_CBQ_MAXLEVEL;
1268 PSCHED_GET_TIME(q->now);
1269 q->now_rt = q->now;
1271 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1272 q->active[prio] = NULL;
1274 for (h = 0; h < 16; h++) {
1275 for (cl = q->classes[h]; cl; cl = cl->next) {
1276 qdisc_reset(cl->q);
1278 cl->next_alive = NULL;
1279 PSCHED_SET_PASTPERFECT(cl->undertime);
1280 cl->avgidle = cl->maxidle;
1281 cl->deficit = cl->quantum;
1282 cl->cpriority = cl->priority;
1285 sch->q.qlen = 0;
1289 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1291 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1292 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1293 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1295 if (lss->change&TCF_CBQ_LSS_EWMA)
1296 cl->ewma_log = lss->ewma_log;
1297 if (lss->change&TCF_CBQ_LSS_AVPKT)
1298 cl->avpkt = lss->avpkt;
1299 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1300 cl->minidle = -(long)lss->minidle;
1301 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1302 cl->maxidle = lss->maxidle;
1303 cl->avgidle = lss->maxidle;
1305 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1306 cl->offtime = lss->offtime;
1307 return 0;
1310 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1312 q->nclasses[cl->priority]--;
1313 q->quanta[cl->priority] -= cl->weight;
1314 cbq_normalize_quanta(q, cl->priority);
1317 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1319 q->nclasses[cl->priority]++;
1320 q->quanta[cl->priority] += cl->weight;
1321 cbq_normalize_quanta(q, cl->priority);
1324 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1326 struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
1328 if (wrr->allot)
1329 cl->allot = wrr->allot;
1330 if (wrr->weight)
1331 cl->weight = wrr->weight;
1332 if (wrr->priority) {
1333 cl->priority = wrr->priority-1;
1334 cl->cpriority = cl->priority;
1335 if (cl->priority >= cl->priority2)
1336 cl->priority2 = TC_CBQ_MAXPRIO-1;
1339 cbq_addprio(q, cl);
1340 return 0;
1343 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1345 switch (ovl->strategy) {
1346 case TC_CBQ_OVL_CLASSIC:
1347 cl->overlimit = cbq_ovl_classic;
1348 break;
1349 case TC_CBQ_OVL_DELAY:
1350 cl->overlimit = cbq_ovl_delay;
1351 break;
1352 case TC_CBQ_OVL_LOWPRIO:
1353 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1354 ovl->priority2-1 <= cl->priority)
1355 return -EINVAL;
1356 cl->priority2 = ovl->priority2-1;
1357 cl->overlimit = cbq_ovl_lowprio;
1358 break;
1359 case TC_CBQ_OVL_DROP:
1360 cl->overlimit = cbq_ovl_drop;
1361 break;
1362 case TC_CBQ_OVL_RCLASSIC:
1363 cl->overlimit = cbq_ovl_rclassic;
1364 break;
1365 default:
1366 return -EINVAL;
1368 cl->penalty = (ovl->penalty*HZ)/1000;
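/* ovl->penalty is supplied by userspace in milliseconds; the line above
   converts it to jiffies (cbq_dump_ovl below performs the inverse
   conversion when reporting it back). */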
1369 return 0;
1372 #ifdef CONFIG_NET_CLS_POLICE
1373 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1375 cl->police = p->police;
1377 if (cl->q->handle) {
1378 if (p->police == TC_POLICE_RECLASSIFY)
1379 cl->q->reshape_fail = cbq_reshape_fail;
1380 else
1381 cl->q->reshape_fail = NULL;
1383 return 0;
1385 #endif
1387 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1389 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1390 return 0;
1393 static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1395 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
1396 struct rtattr *tb[TCA_CBQ_MAX];
1397 struct tc_ratespec *r;
1399 if (rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0 ||
1400 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1401 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1402 return -EINVAL;
1404 if (tb[TCA_CBQ_LSSOPT-1] &&
1405 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1406 return -EINVAL;
1408 r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1410 MOD_INC_USE_COUNT;
1411 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) {
1412 MOD_DEC_USE_COUNT;
1413 return -EINVAL;
1416 q->link.refcnt = 1;
1417 q->link.sibling = &q->link;
1418 q->link.classid = sch->handle;
1419 q->link.qdisc = sch;
1420 if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1421 q->link.q = &noop_qdisc;
1423 q->link.priority = TC_CBQ_MAXPRIO-1;
1424 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1425 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1426 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1427 q->link.overlimit = cbq_ovl_classic;
1428 q->link.allot = psched_mtu(sch->dev);
1429 q->link.quantum = q->link.allot;
1430 q->link.weight = q->link.R_tab->rate.rate;
1432 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1433 q->link.avpkt = q->link.allot/2;
1434 q->link.minidle = -0x7FFFFFFF;
1435 q->link.stats.lock = &sch->dev->queue_lock;
1437 init_timer(&q->wd_timer);
1438 q->wd_timer.data = (unsigned long)sch;
1439 q->wd_timer.function = cbq_watchdog;
1440 init_timer(&q->delay_timer);
1441 q->delay_timer.data = (unsigned long)sch;
1442 q->delay_timer.function = cbq_undelay;
1443 q->toplevel = TC_CBQ_MAXLEVEL;
1444 PSCHED_GET_TIME(q->now);
1445 q->now_rt = q->now;
1447 cbq_link_class(&q->link);
1449 if (tb[TCA_CBQ_LSSOPT-1])
1450 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1452 cbq_addprio(q, &q->link);
1453 return 0;
1456 #ifdef CONFIG_RTNETLINK
1458 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1460 unsigned char *b = skb->tail;
1462 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1463 return skb->len;
1465 rtattr_failure:
1466 skb_trim(skb, b - skb->data);
1467 return -1;
1470 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1472 unsigned char *b = skb->tail;
1473 struct tc_cbq_lssopt opt;
1475 opt.flags = 0;
1476 if (cl->borrow == NULL)
1477 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1478 if (cl->share == NULL)
1479 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1480 opt.ewma_log = cl->ewma_log;
1481 opt.level = cl->level;
1482 opt.avpkt = cl->avpkt;
1483 opt.maxidle = cl->maxidle;
1484 opt.minidle = (u32)(-cl->minidle);
1485 opt.offtime = cl->offtime;
1486 opt.change = ~0;
1487 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1488 return skb->len;
1490 rtattr_failure:
1491 skb_trim(skb, b - skb->data);
1492 return -1;
1495 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1497 unsigned char *b = skb->tail;
1498 struct tc_cbq_wrropt opt;
1500 opt.flags = 0;
1501 opt.allot = cl->allot;
1502 opt.priority = cl->priority+1;
1503 opt.cpriority = cl->cpriority+1;
1504 opt.weight = cl->weight;
1505 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1506 return skb->len;
1508 rtattr_failure:
1509 skb_trim(skb, b - skb->data);
1510 return -1;
1513 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1515 unsigned char *b = skb->tail;
1516 struct tc_cbq_ovl opt;
1518 opt.strategy = cl->ovl_strategy;
1519 opt.priority2 = cl->priority2+1;
1520 opt.penalty = (cl->penalty*1000)/HZ;
1521 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1522 return skb->len;
1524 rtattr_failure:
1525 skb_trim(skb, b - skb->data);
1526 return -1;
1529 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1531 unsigned char *b = skb->tail;
1532 struct tc_cbq_fopt opt;
1534 if (cl->split || cl->defmap) {
1535 opt.split = cl->split ? cl->split->classid : 0;
1536 opt.defmap = cl->defmap;
1537 opt.defchange = ~0;
1538 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1540 return skb->len;
1542 rtattr_failure:
1543 skb_trim(skb, b - skb->data);
1544 return -1;
1547 #ifdef CONFIG_NET_CLS_POLICE
1548 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1550 unsigned char *b = skb->tail;
1551 struct tc_cbq_police opt;
1553 if (cl->police) {
1554 opt.police = cl->police;
1555 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1557 return skb->len;
1559 rtattr_failure:
1560 skb_trim(skb, b - skb->data);
1561 return -1;
1563 #endif
1565 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1567 if (cbq_dump_lss(skb, cl) < 0 ||
1568 cbq_dump_rate(skb, cl) < 0 ||
1569 cbq_dump_wrr(skb, cl) < 0 ||
1570 cbq_dump_ovl(skb, cl) < 0 ||
1571 #ifdef CONFIG_NET_CLS_POLICE
1572 cbq_dump_police(skb, cl) < 0 ||
1573 #endif
1574 cbq_dump_fopt(skb, cl) < 0)
1575 return -1;
1576 return 0;
1579 int cbq_copy_xstats(struct sk_buff *skb, struct tc_cbq_xstats *st)
1581 RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st);
1582 return 0;
1584 rtattr_failure:
1585 return -1;
1589 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1591 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
1592 unsigned char *b = skb->tail;
1593 struct rtattr *rta;
1595 rta = (struct rtattr*)b;
1596 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1597 if (cbq_dump_attr(skb, &q->link) < 0)
1598 goto rtattr_failure;
1599 rta->rta_len = skb->tail - b;
1600 spin_lock_bh(&sch->dev->queue_lock);
1601 q->link.xstats.avgidle = q->link.avgidle;
1602 if (cbq_copy_xstats(skb, &q->link.xstats)) {
1603 spin_unlock_bh(&sch->dev->queue_lock);
1604 goto rtattr_failure;
1606 spin_unlock_bh(&sch->dev->queue_lock);
1607 return skb->len;
1609 rtattr_failure:
1610 skb_trim(skb, b - skb->data);
1611 return -1;
1614 static int
1615 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1616 struct sk_buff *skb, struct tcmsg *tcm)
1618 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
1619 struct cbq_class *cl = (struct cbq_class*)arg;
1620 unsigned char *b = skb->tail;
1621 struct rtattr *rta;
1623 if (cl->tparent)
1624 tcm->tcm_parent = cl->tparent->classid;
1625 else
1626 tcm->tcm_parent = TC_H_ROOT;
1627 tcm->tcm_handle = cl->classid;
1628 tcm->tcm_info = cl->q->handle;
1630 rta = (struct rtattr*)b;
1631 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1632 if (cbq_dump_attr(skb, cl) < 0)
1633 goto rtattr_failure;
1634 rta->rta_len = skb->tail - b;
1635 cl->stats.qlen = cl->q->q.qlen;
1636 if (qdisc_copy_stats(skb, &cl->stats))
1637 goto rtattr_failure;
1638 spin_lock_bh(&sch->dev->queue_lock);
1639 cl->xstats.avgidle = cl->avgidle;
1640 cl->xstats.undertime = 0;
1641 if (!PSCHED_IS_PASTPERFECT(cl->undertime))
1642 cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
1643 q->link.xstats.avgidle = q->link.avgidle;
1644 if (cbq_copy_xstats(skb, &cl->xstats)) {
1645 spin_unlock_bh(&sch->dev->queue_lock);
1646 goto rtattr_failure;
1648 spin_unlock_bh(&sch->dev->queue_lock);
1650 return skb->len;
1652 rtattr_failure:
1653 skb_trim(skb, b - skb->data);
1654 return -1;
1657 #endif
1659 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1660 struct Qdisc **old)
1662 struct cbq_class *cl = (struct cbq_class*)arg;
1664 if (cl) {
1665 if (new == NULL) {
1666 if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL)
1667 return -ENOBUFS;
1668 } else {
1669 #ifdef CONFIG_NET_CLS_POLICE
1670 if (cl->police == TC_POLICE_RECLASSIFY)
1671 new->reshape_fail = cbq_reshape_fail;
1672 #endif
1674 sch_tree_lock(sch);
1675 *old = cl->q;
1676 cl->q = new;
1677 qdisc_reset(*old);
1678 sch_tree_unlock(sch);
1680 return 0;
1682 return -ENOENT;
1685 static struct Qdisc *
1686 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1688 struct cbq_class *cl = (struct cbq_class*)arg;
1690 return cl ? cl->q : NULL;
1693 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1695 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1696 struct cbq_class *cl = cbq_class_lookup(q, classid);
1698 if (cl) {
1699 cl->refcnt++;
1700 return (unsigned long)cl;
1702 return 0;
1705 static void cbq_destroy_filters(struct cbq_class *cl)
1707 struct tcf_proto *tp;
1709 while ((tp = cl->filter_list) != NULL) {
1710 cl->filter_list = tp->next;
1711 tp->ops->destroy(tp);
1715 static void cbq_destroy_class(struct cbq_class *cl)
1717 cbq_destroy_filters(cl);
1718 qdisc_destroy(cl->q);
1719 qdisc_put_rtab(cl->R_tab);
1720 #ifdef CONFIG_NET_ESTIMATOR
1721 qdisc_kill_estimator(&cl->stats);
1722 #endif
1723 kfree(cl);
1726 static void
1727 cbq_destroy(struct Qdisc* sch)
1729 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1730 struct cbq_class *cl;
1731 unsigned h;
1733 #ifdef CONFIG_NET_CLS_POLICE
1734 q->rx_class = NULL;
1735 #endif
1736 for (h = 0; h < 16; h++) {
1737 for (cl = q->classes[h]; cl; cl = cl->next)
1738 cbq_destroy_filters(cl);
1741 for (h = 0; h < 16; h++) {
1742 for (cl = q->classes[h]; cl; cl = cl->next)
1743 if (cl != &q->link)
1744 cbq_destroy_class(cl);
1747 qdisc_put_rtab(q->link.R_tab);
1748 MOD_DEC_USE_COUNT;
1751 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1753 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1754 struct cbq_class *cl = (struct cbq_class*)arg;
1756 if (--cl->refcnt == 0) {
1757 #ifdef CONFIG_NET_CLS_POLICE
1758 spin_lock_bh(&sch->dev->queue_lock);
1759 if (q->rx_class == cl)
1760 q->rx_class = NULL;
1761 spin_unlock_bh(&sch->dev->queue_lock);
1762 #endif
1764 cbq_destroy_class(cl);
1768 static int
1769 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1770 unsigned long *arg)
1772 int err;
1773 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1774 struct cbq_class *cl = (struct cbq_class*)*arg;
1775 struct rtattr *opt = tca[TCA_OPTIONS-1];
1776 struct rtattr *tb[TCA_CBQ_MAX];
1777 struct cbq_class *parent;
1778 struct qdisc_rate_table *rtab = NULL;
1780 if (opt==NULL ||
1781 rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)))
1782 return -EINVAL;
1784 if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1785 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1786 return -EINVAL;
1788 if (tb[TCA_CBQ_FOPT-1] &&
1789 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1790 return -EINVAL;
1792 if (tb[TCA_CBQ_RATE-1] &&
1793 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1794 return -EINVAL;
1796 if (tb[TCA_CBQ_LSSOPT-1] &&
1797 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1798 return -EINVAL;
1800 if (tb[TCA_CBQ_WRROPT-1] &&
1801 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1802 return -EINVAL;
1804 #ifdef CONFIG_NET_CLS_POLICE
1805 if (tb[TCA_CBQ_POLICE-1] &&
1806 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1807 return -EINVAL;
1808 #endif
1810 if (cl) {
1811 /* Check parent */
1812 if (parentid) {
1813 if (cl->tparent && cl->tparent->classid != parentid)
1814 return -EINVAL;
1815 if (!cl->tparent && parentid != TC_H_ROOT)
1816 return -EINVAL;
1819 if (tb[TCA_CBQ_RATE-1]) {
1820 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1821 if (rtab == NULL)
1822 return -EINVAL;
1825 /* Change class parameters */
1826 sch_tree_lock(sch);
1828 if (cl->next_alive != NULL)
1829 cbq_deactivate_class(cl);
1831 if (rtab) {
1832 rtab = xchg(&cl->R_tab, rtab);
1833 qdisc_put_rtab(rtab);
1836 if (tb[TCA_CBQ_LSSOPT-1])
1837 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1839 if (tb[TCA_CBQ_WRROPT-1]) {
1840 cbq_rmprio(q, cl);
1841 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1844 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1845 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1847 #ifdef CONFIG_NET_CLS_POLICE
1848 if (tb[TCA_CBQ_POLICE-1])
1849 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1850 #endif
1852 if (tb[TCA_CBQ_FOPT-1])
1853 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1855 if (cl->q->q.qlen)
1856 cbq_activate_class(cl);
1858 sch_tree_unlock(sch);
1860 #ifdef CONFIG_NET_ESTIMATOR
1861 if (tca[TCA_RATE-1]) {
1862 qdisc_kill_estimator(&cl->stats);
1863 qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]);
1865 #endif
1866 return 0;
1869 if (parentid == TC_H_ROOT)
1870 return -EINVAL;
1872 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1873 tb[TCA_CBQ_LSSOPT-1] == NULL)
1874 return -EINVAL;
1876 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1877 if (rtab == NULL)
1878 return -EINVAL;
1880 if (classid) {
1881 err = -EINVAL;
1882 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1883 goto failure;
1884 } else {
1885 int i;
1886 classid = TC_H_MAKE(sch->handle,0x8000);
1888 for (i=0; i<0x8000; i++) {
1889 if (++q->hgenerator >= 0x8000)
1890 q->hgenerator = 1;
1891 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1892 break;
1894 err = -ENOSR;
1895 if (i >= 0x8000)
1896 goto failure;
1897 classid = classid|q->hgenerator;
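/* Automatically generated class ids thus use minor numbers
   0x8000|hgenerator with hgenerator cycling through 1..0x7FFF,
   i.e. they fall in the 0x8001..0xFFFF range. */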
1900 parent = &q->link;
1901 if (parentid) {
1902 parent = cbq_class_lookup(q, parentid);
1903 err = -EINVAL;
1904 if (parent == NULL)
1905 goto failure;
1908 err = -ENOBUFS;
1909 cl = kmalloc(sizeof(*cl), GFP_KERNEL);
1910 if (cl == NULL)
1911 goto failure;
1912 memset(cl, 0, sizeof(*cl));
1913 cl->R_tab = rtab;
1914 rtab = NULL;
1915 cl->refcnt = 1;
1916 if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1917 cl->q = &noop_qdisc;
1918 cl->classid = classid;
1919 cl->tparent = parent;
1920 cl->qdisc = sch;
1921 cl->allot = parent->allot;
1922 cl->quantum = cl->allot;
1923 cl->weight = cl->R_tab->rate.rate;
1924 cl->stats.lock = &sch->dev->queue_lock;
1926 sch_tree_lock(sch);
1927 cbq_link_class(cl);
1928 cl->borrow = cl->tparent;
1929 if (cl->tparent != &q->link)
1930 cl->share = cl->tparent;
1931 cbq_adjust_levels(parent);
1932 cl->minidle = -0x7FFFFFFF;
1933 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1934 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1935 if (cl->ewma_log==0)
1936 cl->ewma_log = q->link.ewma_log;
1937 if (cl->maxidle==0)
1938 cl->maxidle = q->link.maxidle;
1939 if (cl->avpkt==0)
1940 cl->avpkt = q->link.avpkt;
1941 cl->overlimit = cbq_ovl_classic;
1942 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1943 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1944 #ifdef CONFIG_NET_CLS_POLICE
1945 if (tb[TCA_CBQ_POLICE-1])
1946 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1947 #endif
1948 if (tb[TCA_CBQ_FOPT-1])
1949 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1950 sch_tree_unlock(sch);
1952 #ifdef CONFIG_NET_ESTIMATOR
1953 if (tca[TCA_RATE-1])
1954 qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]);
1955 #endif
1957 *arg = (unsigned long)cl;
1958 return 0;
1960 failure:
1961 qdisc_put_rtab(rtab);
1962 return err;
1965 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1967 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1968 struct cbq_class *cl = (struct cbq_class*)arg;
1970 if (cl->filters || cl->children || cl == &q->link)
1971 return -EBUSY;
1973 sch_tree_lock(sch);
1975 if (cl->next_alive)
1976 cbq_deactivate_class(cl);
1978 if (q->tx_borrowed == cl)
1979 q->tx_borrowed = q->tx_class;
1980 if (q->tx_class == cl) {
1981 q->tx_class = NULL;
1982 q->tx_borrowed = NULL;
1984 #ifdef CONFIG_NET_CLS_POLICE
1985 if (q->rx_class == cl)
1986 q->rx_class = NULL;
1987 #endif
1989 cbq_unlink_class(cl);
1990 cbq_adjust_levels(cl->tparent);
1991 cl->defmap = 0;
1992 cbq_sync_defmap(cl);
1994 cbq_rmprio(q, cl);
1995 sch_tree_unlock(sch);
1997 if (--cl->refcnt == 0)
1998 cbq_destroy_class(cl);
2000 return 0;
2003 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2005 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
2006 struct cbq_class *cl = (struct cbq_class *)arg;
2008 if (cl == NULL)
2009 cl = &q->link;
2011 return &cl->filter_list;
2014 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2015 u32 classid)
2017 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
2018 struct cbq_class *p = (struct cbq_class*)parent;
2019 struct cbq_class *cl = cbq_class_lookup(q, classid);
2021 if (cl) {
2022 if (p && p->level <= cl->level)
2023 return 0;
2024 cl->filters++;
2025 return (unsigned long)cl;
2027 return 0;
2030 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2032 struct cbq_class *cl = (struct cbq_class*)arg;
2034 cl->filters--;
2037 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2039 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
2040 unsigned h;
2042 if (arg->stop)
2043 return;
2045 for (h = 0; h < 16; h++) {
2046 struct cbq_class *cl;
2048 for (cl = q->classes[h]; cl; cl = cl->next) {
2049 if (arg->count < arg->skip) {
2050 arg->count++;
2051 continue;
2053 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2054 arg->stop = 1;
2055 return;
2057 arg->count++;
2062 static struct Qdisc_class_ops cbq_class_ops =
2064 cbq_graft,
2065 cbq_leaf,
2066 cbq_get,
2067 cbq_put,
2068 cbq_change_class,
2069 cbq_delete,
2070 cbq_walk,
2072 cbq_find_tcf,
2073 cbq_bind_filter,
2074 cbq_unbind_filter,
2076 #ifdef CONFIG_RTNETLINK
2077 cbq_dump_class,
2078 #endif
2081 struct Qdisc_ops cbq_qdisc_ops =
2083 NULL,
2084 &cbq_class_ops,
2085 "cbq",
2086 sizeof(struct cbq_sched_data),
2088 cbq_enqueue,
2089 cbq_dequeue,
2090 cbq_requeue,
2091 cbq_drop,
2093 cbq_init,
2094 cbq_reset,
2095 cbq_destroy,
2096 NULL /* cbq_change */,
2098 #ifdef CONFIG_RTNETLINK
2099 cbq_dump,
2100 #endif
2103 #ifdef MODULE
2104 int init_module(void)
2106 return register_qdisc(&cbq_qdisc_ops);
2109 void cleanup_module(void)
2111 unregister_qdisc(&cbq_qdisc_ops);
2113 #endif