/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	The algorithm skeleton was taken from the NS simulator cbq.cc.
	Anyone who wants to check this code against the LBL version
	should take into account that ONLY the skeleton was borrowed;
	the implementation is different. Particularly:

	--- The WRR algorithm is different. Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than MTU, which is always the case when real-time classes
	have small rates. Note that the statement of [3] is
	incomplete: delay may actually be estimated even if the class
	per-round allotment is less than MTU. Namely, if the per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1, then

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we have an IntServ estimate with D = W*r+k*MTU
	and C = MTU*r. The proof (if correct at all) is trivial.

	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
	interpret some places, which look like wrong translations
	from NS. Anyone is advised to find these differences
	and explain to me why I am wrong 8).

	--- Linux has no EOI event, so we cannot estimate true class
	idle time. The workaround is to consider the next dequeue event
	as a sign that the previous packet is finished. This is wrong because of
	internal device queueing, but on a permanently loaded link it is true.
	Moreover, combined with the clock integrator, this scheme looks
	very close to an ideal solution. */
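/* A worked instance of the bound above (illustrative figures only, not from
   the original analysis): take MTU = 1500 bytes, k = 2 real-time classes with
   rate shares r_1 = r_2 = 0.1 (so r = 0.2), and a scaling factor W chosen so
   that the per-round allotment W*r_i is 500 bytes, i.e. W = 5000 bytes.
   Then [MTU/(W*r_i)] = 3 and W*r = 1000 bytes, so

	delay_i <= (3*1000 + 1000 + 2*1500)/B = 7000 bytes / B

   The bound stays finite even though each round grants the class only a third
   of an MTU; the class simply needs several rounds to accumulate enough
   credit for a full-size packet. */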
struct cbq_sched_data;


struct cbq_class
{
	struct cbq_class	*next;		/* hash table link */
	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	u32			classid;
	unsigned char		priority;	/* class priority */
	unsigned char		priority2;	/* priority to be used after overlimit */
	unsigned char		ewma_log;	/* time constant for idle time calculation */
	unsigned char		ovl_strategy;
#ifdef CONFIG_NET_CLS_POLICE
	unsigned char		police;
#endif

	u32			defmap;

	/* Link-sharing scheduler parameters */
	long			maxidle;	/* Class parameters: see below. */
	long			offtime;
	long			minidle;
	u32			avpkt;
	struct qdisc_rate_table	*R_tab;

	/* Overlimit strategy parameters */
	void			(*overlimit)(struct cbq_class *cl);
	long			penalty;

	/* General scheduler (WRR) parameters */
	long			allot;
	long			quantum;	/* Allotment per WRR round */
	long			weight;		/* Relative allotment: see below */

	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class	*split;		/* Ptr to split node */
	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
						   parent otherwise */
	struct cbq_class	*sibling;	/* Sibling chain */
	struct cbq_class	*children;	/* Pointer to children chain */

	struct Qdisc		*q;		/* Elementary queueing discipline */


/* Variables */
	unsigned char		cpriority;	/* Effective priority */
	unsigned char		delayed;
	unsigned char		level;		/* level of the class in hierarchy:
						   0 for leaf classes, and maximal
						   level of children + 1 for nodes.
						 */

	psched_time_t		last;		/* Last end of service */
	psched_time_t		undertime;
	long			avgidle;
	long			deficit;	/* Saved deficit for WRR */
	unsigned long		penalized;
	struct tc_stats		stats;
	spinlock_t		*stats_lock;
	struct tc_cbq_xstats	xstats;

	struct tcf_proto	*filter_list;

	int			refcnt;
	int			filters;

	struct cbq_class	*defaults[TC_PRIO_MAX+1];
};

struct cbq_sched_data
{
	struct cbq_class	*classes[16];		/* Hash table of all classes */
	int			nclasses[TC_CBQ_MAXPRIO+1];
	unsigned		quanta[TC_CBQ_MAXPRIO+1];

	struct cbq_class	link;

	unsigned		activemask;
	struct cbq_class	*active[TC_CBQ_MAXPRIO+1];	/* List of all classes
								   with backlog */

#ifdef CONFIG_NET_CLS_POLICE
	struct cbq_class	*rx_class;
#endif
	struct cbq_class	*tx_class;
	struct cbq_class	*tx_borrowed;
	int			tx_len;
	psched_time_t		now;			/* Cached timestamp */
	psched_time_t		now_rt;			/* Cached real time */
	unsigned		pmask;

	struct timer_list	delay_timer;
	struct timer_list	wd_timer;		/* Watchdog timer,
							   started when CBQ has
							   backlog, but cannot
							   transmit just now */
	long			wd_expires;
	int			toplevel;
	u32			hgenerator;
};
#define L2T(cl,len)	((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])


static __inline__ unsigned cbq_hash(u32 h)
{
	h ^= h>>8;
	h ^= h>>4;
	return h&0xF;
}

static __inline__ struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
	struct cbq_class *cl;

	for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
		if (cl->classid == classid)
			return cl;
	return NULL;
}
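/* A worked example of L2T() (illustrative values, not from the original
   source): the rate table prepared by userspace tc maps packet sizes, in
   units of 2^cell_log bytes, to transmission times in scheduler ticks.
   With cell_log = 3, a 1000-byte packet reads R_tab->data[1000 >> 3], i.e.
   slot 125, which holds the precomputed time this class needs to put
   1000 bytes on the wire at its configured rate. cbq_hash() simply folds a
   32-bit classid down to one of the 16 buckets used by cbq_class_lookup(). */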
215 #ifdef CONFIG_NET_CLS_POLICE
217 static struct cbq_class *
218 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
220 struct cbq_class *cl, *new;
222 for (cl = this->tparent; cl; cl = cl->tparent)
223 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
224 return new;
226 return NULL;
229 #endif
/* Classify packet. The procedure is pretty complicated, but
   it allows us to combine link sharing and priority scheduling
   transparently.

   Namely, you can put link-sharing rules (e.g. route based) at the root of CBQ,
   so that it resolves to split nodes. Then packets are classified
   by logical priority, or a more specific classifier may be attached
   to the split node.
 */
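/* A sketch of how the defmap fallback is meant to resolve (illustrative
   class layout, not from the original source): suppose a split class has
   child A with the TC_PRIO_INTERACTIVE bit set in its defmap and child B
   with the TC_PRIO_BESTEFFORT bit set. A packet whose classifier result does
   not name a concrete class, but maps to logical priority
   TC_PRIO_INTERACTIVE, is taken from defaults[TC_PRIO_INTERACTIVE] (child A);
   if that slot is empty the code falls back to defaults[TC_PRIO_BESTEFFORT]
   (child B). cbq_sync_defmap() further down keeps the defaults[] table
   consistent with each class's defmap bits. */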
241 static struct cbq_class *
242 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qres)
244 struct cbq_sched_data *q = qdisc_priv(sch);
245 struct cbq_class *head = &q->link;
246 struct cbq_class **defmap;
247 struct cbq_class *cl = NULL;
248 u32 prio = skb->priority;
249 struct tcf_result res;
252 * Step 1. If skb->priority points to one of our classes, use it.
254 if (TC_H_MAJ(prio^sch->handle) == 0 &&
255 (cl = cbq_class_lookup(q, prio)) != NULL)
256 return cl;
258 for (;;) {
259 int result = 0;
260 #ifdef CONFIG_NET_CLS_ACT
261 int terminal = 0;
262 #endif
263 defmap = head->defaults;
266 * Step 2+n. Apply classifier.
268 if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
269 goto fallback;
271 if ((cl = (void*)res.class) == NULL) {
272 if (TC_H_MAJ(res.classid))
273 cl = cbq_class_lookup(q, res.classid);
274 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
275 cl = defmap[TC_PRIO_BESTEFFORT];
277 if (cl == NULL || cl->level >= head->level)
278 goto fallback;
281 #ifdef CONFIG_NET_CLS_ACT
282 switch (result) {
283 case TC_ACT_SHOT: /* Stop and kfree */
284 *qres = NET_XMIT_DROP;
285 terminal = 1;
286 break;
287 case TC_ACT_QUEUED:
288 case TC_ACT_STOLEN:
289 terminal = 1;
290 break;
291 case TC_ACT_RECLASSIFY: /* Things look good */
292 case TC_ACT_OK:
293 case TC_ACT_UNSPEC:
294 default:
295 break;
298 if (terminal) {
299 kfree_skb(skb);
300 return NULL;
302 #else
303 #ifdef CONFIG_NET_CLS_POLICE
304 switch (result) {
305 case TC_POLICE_RECLASSIFY:
306 return cbq_reclassify(skb, cl);
307 case TC_POLICE_SHOT:
308 return NULL;
309 default:
310 break;
312 #endif
313 #endif
314 if (cl->level == 0)
315 return cl;
		/*
		 * Step 3+n. If classifier selected a link sharing class,
		 *	     apply agency specific classifier.
		 *	     Repeat this procedure until we hit a leaf node.
		 */
322 head = cl;
325 fallback:
326 cl = head;
329 * Step 4. No success...
331 if (TC_H_MAJ(prio) == 0 &&
332 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
333 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
334 return head;
336 return cl;
/*
   A packet has just been enqueued on an empty class.
   cbq_activate_class adds it to the tail of the active class list
   of its priority band.
 */
345 static __inline__ void cbq_activate_class(struct cbq_class *cl)
347 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
348 int prio = cl->cpriority;
349 struct cbq_class *cl_tail;
351 cl_tail = q->active[prio];
352 q->active[prio] = cl;
354 if (cl_tail != NULL) {
355 cl->next_alive = cl_tail->next_alive;
356 cl_tail->next_alive = cl;
357 } else {
358 cl->next_alive = cl;
359 q->activemask |= (1<<prio);
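/* The per-priority active list is a circular singly linked list threaded
   through next_alive, with q->active[prio] pointing at the *tail* (the most
   recently activated class); tail->next_alive is the head served next by the
   round-robin. A small sketch (hypothetical classes A, B, C activated in
   that order):

	q->active[prio] --> C --next_alive--> A --> B --> C (back to tail)

   cbq_deactivate_class() below and the inline unlinking in cbq_dequeue_prio()
   both rely on this shape. */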
/*
   Unlink class from active chain.
   Note that this same procedure is done directly in cbq_dequeue*
   during round-robin procedure.
 */
369 static void cbq_deactivate_class(struct cbq_class *this)
371 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
372 int prio = this->cpriority;
373 struct cbq_class *cl;
374 struct cbq_class *cl_prev = q->active[prio];
376 do {
377 cl = cl_prev->next_alive;
378 if (cl == this) {
379 cl_prev->next_alive = cl->next_alive;
380 cl->next_alive = NULL;
382 if (cl == q->active[prio]) {
383 q->active[prio] = cl_prev;
384 if (cl == q->active[prio]) {
385 q->active[prio] = NULL;
386 q->activemask &= ~(1<<prio);
387 return;
391 cl = cl_prev->next_alive;
392 return;
394 } while ((cl_prev = cl) != q->active[prio]);
397 static void
398 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
400 int toplevel = q->toplevel;
402 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
403 psched_time_t now;
404 psched_tdiff_t incr;
406 PSCHED_GET_TIME(now);
407 incr = PSCHED_TDIFF(now, q->now_rt);
408 PSCHED_TADD2(q->now, incr, now);
410 do {
411 if (PSCHED_TLESS(cl->undertime, now)) {
412 q->toplevel = cl->level;
413 return;
415 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
419 static int
420 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
422 struct cbq_sched_data *q = qdisc_priv(sch);
423 int len = skb->len;
424 int ret = NET_XMIT_SUCCESS;
425 struct cbq_class *cl = cbq_classify(skb, sch,&ret);
427 #ifdef CONFIG_NET_CLS_POLICE
428 q->rx_class = cl;
429 #endif
430 if (cl) {
431 #ifdef CONFIG_NET_CLS_POLICE
432 cl->q->__parent = sch;
433 #endif
434 if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
435 sch->q.qlen++;
436 sch->stats.packets++;
437 sch->stats.bytes+=len;
438 cbq_mark_toplevel(q, cl);
439 if (!cl->next_alive)
440 cbq_activate_class(cl);
441 return ret;
445 #ifndef CONFIG_NET_CLS_ACT
446 sch->stats.drops++;
447 if (cl == NULL)
448 kfree_skb(skb);
449 else {
450 cbq_mark_toplevel(q, cl);
451 cl->stats.drops++;
453 #else
454 if ( NET_XMIT_DROP == ret) {
455 sch->stats.drops++;
458 if (cl != NULL) {
459 cbq_mark_toplevel(q, cl);
460 cl->stats.drops++;
462 #endif
463 return ret;
466 static int
467 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
469 struct cbq_sched_data *q = qdisc_priv(sch);
470 struct cbq_class *cl;
471 int ret;
473 if ((cl = q->tx_class) == NULL) {
474 kfree_skb(skb);
475 sch->stats.drops++;
476 return NET_XMIT_CN;
478 q->tx_class = NULL;
480 cbq_mark_toplevel(q, cl);
482 #ifdef CONFIG_NET_CLS_POLICE
483 q->rx_class = cl;
484 cl->q->__parent = sch;
485 #endif
486 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
487 sch->q.qlen++;
488 if (!cl->next_alive)
489 cbq_activate_class(cl);
490 return 0;
492 sch->stats.drops++;
493 cl->stats.drops++;
494 return ret;
497 /* Overlimit actions */
499 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
501 static void cbq_ovl_classic(struct cbq_class *cl)
503 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
504 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
506 if (!cl->delayed) {
507 delay += cl->offtime;
		/*
		   Class goes to sleep, so that it will have no
		   chance to work avgidle. Let's forgive it 8)

		   BTW cbq-2.0 is buggy here; apparently they forgot
		   to shift it by cl->ewma_log.
		 */
516 if (cl->avgidle < 0)
517 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
518 if (cl->avgidle < cl->minidle)
519 cl->avgidle = cl->minidle;
520 if (delay <= 0)
521 delay = 1;
522 PSCHED_TADD2(q->now, delay, cl->undertime);
524 cl->xstats.overactions++;
525 cl->delayed = 1;
527 if (q->wd_expires == 0 || q->wd_expires > delay)
528 q->wd_expires = delay;
530 /* Dirty work! We must schedule wakeups based on
531 real available rate, rather than leaf rate,
532 which may be tiny (even zero).
534 if (q->toplevel == TC_CBQ_MAXLEVEL) {
535 struct cbq_class *b;
536 psched_tdiff_t base_delay = q->wd_expires;
538 for (b = cl->borrow; b; b = b->borrow) {
539 delay = PSCHED_TDIFF(b->undertime, q->now);
540 if (delay < base_delay) {
541 if (delay <= 0)
542 delay = 1;
543 base_delay = delay;
547 q->wd_expires = base_delay;
551 /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
552 they go overlimit
555 static void cbq_ovl_rclassic(struct cbq_class *cl)
557 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
558 struct cbq_class *this = cl;
560 do {
561 if (cl->level > q->toplevel) {
562 cl = NULL;
563 break;
565 } while ((cl = cl->borrow) != NULL);
567 if (cl == NULL)
568 cl = this;
569 cbq_ovl_classic(cl);
572 /* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
574 static void cbq_ovl_delay(struct cbq_class *cl)
576 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
577 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
579 if (!cl->delayed) {
580 unsigned long sched = jiffies;
582 delay += cl->offtime;
583 if (cl->avgidle < 0)
584 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
585 if (cl->avgidle < cl->minidle)
586 cl->avgidle = cl->minidle;
587 PSCHED_TADD2(q->now, delay, cl->undertime);
589 if (delay > 0) {
590 sched += PSCHED_US2JIFFIE(delay) + cl->penalty;
591 cl->penalized = sched;
592 cl->cpriority = TC_CBQ_MAXPRIO;
593 q->pmask |= (1<<TC_CBQ_MAXPRIO);
594 if (del_timer(&q->delay_timer) &&
595 (long)(q->delay_timer.expires - sched) > 0)
596 q->delay_timer.expires = sched;
597 add_timer(&q->delay_timer);
598 cl->delayed = 1;
599 cl->xstats.overactions++;
600 return;
602 delay = 1;
604 if (q->wd_expires == 0 || q->wd_expires > delay)
605 q->wd_expires = delay;
608 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
610 static void cbq_ovl_lowprio(struct cbq_class *cl)
612 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
614 cl->penalized = jiffies + cl->penalty;
616 if (cl->cpriority != cl->priority2) {
617 cl->cpriority = cl->priority2;
618 q->pmask |= (1<<cl->cpriority);
619 cl->xstats.overactions++;
621 cbq_ovl_classic(cl);
624 /* TC_CBQ_OVL_DROP: penalize class by dropping */
626 static void cbq_ovl_drop(struct cbq_class *cl)
628 if (cl->q->ops->drop)
629 if (cl->q->ops->drop(cl->q))
630 cl->qdisc->q.qlen--;
631 cl->xstats.overactions++;
632 cbq_ovl_classic(cl);
635 static void cbq_watchdog(unsigned long arg)
637 struct Qdisc *sch = (struct Qdisc*)arg;
639 sch->flags &= ~TCQ_F_THROTTLED;
640 netif_schedule(sch->dev);
643 static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio)
645 struct cbq_class *cl;
646 struct cbq_class *cl_prev = q->active[prio];
647 unsigned long now = jiffies;
648 unsigned long sched = now;
650 if (cl_prev == NULL)
651 return now;
653 do {
654 cl = cl_prev->next_alive;
655 if ((long)(now - cl->penalized) > 0) {
656 cl_prev->next_alive = cl->next_alive;
657 cl->next_alive = NULL;
658 cl->cpriority = cl->priority;
659 cl->delayed = 0;
660 cbq_activate_class(cl);
662 if (cl == q->active[prio]) {
663 q->active[prio] = cl_prev;
664 if (cl == q->active[prio]) {
665 q->active[prio] = NULL;
666 return 0;
670 cl = cl_prev->next_alive;
671 } else if ((long)(sched - cl->penalized) > 0)
672 sched = cl->penalized;
673 } while ((cl_prev = cl) != q->active[prio]);
675 return (long)(sched - now);
678 static void cbq_undelay(unsigned long arg)
680 struct Qdisc *sch = (struct Qdisc*)arg;
681 struct cbq_sched_data *q = qdisc_priv(sch);
682 long delay = 0;
683 unsigned pmask;
685 pmask = q->pmask;
686 q->pmask = 0;
688 while (pmask) {
689 int prio = ffz(~pmask);
690 long tmp;
692 pmask &= ~(1<<prio);
694 tmp = cbq_undelay_prio(q, prio);
695 if (tmp > 0) {
696 q->pmask |= 1<<prio;
697 if (tmp < delay || delay == 0)
698 delay = tmp;
702 if (delay) {
703 q->delay_timer.expires = jiffies + delay;
704 add_timer(&q->delay_timer);
707 sch->flags &= ~TCQ_F_THROTTLED;
708 netif_schedule(sch->dev);
712 #ifdef CONFIG_NET_CLS_POLICE
714 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
716 int len = skb->len;
717 struct Qdisc *sch = child->__parent;
718 struct cbq_sched_data *q = qdisc_priv(sch);
719 struct cbq_class *cl = q->rx_class;
721 q->rx_class = NULL;
723 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
725 cbq_mark_toplevel(q, cl);
727 q->rx_class = cl;
728 cl->q->__parent = sch;
730 if (cl->q->enqueue(skb, cl->q) == 0) {
731 sch->q.qlen++;
732 sch->stats.packets++;
733 sch->stats.bytes+=len;
734 if (!cl->next_alive)
735 cbq_activate_class(cl);
736 return 0;
738 sch->stats.drops++;
739 return 0;
742 sch->stats.drops++;
743 return -1;
745 #endif
/*
   This is a mission-critical procedure.

   We "regenerate" the toplevel cutoff if the transmitting class
   has backlog and is not regulated. It is not part of the
   original CBQ description, but looks more reasonable.
   Probably it is wrong. This question needs further investigation.
*/
756 static __inline__ void
757 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
758 struct cbq_class *borrowed)
760 if (cl && q->toplevel >= borrowed->level) {
761 if (cl->q->q.qlen > 1) {
762 do {
763 if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
764 q->toplevel = borrowed->level;
765 return;
767 } while ((borrowed=borrowed->borrow) != NULL);
769 #if 0
770 /* It is not necessary now. Uncommenting it
771 will save CPU cycles, but decrease fairness.
773 q->toplevel = TC_CBQ_MAXLEVEL;
774 #endif
778 static void
779 cbq_update(struct cbq_sched_data *q)
781 struct cbq_class *this = q->tx_class;
782 struct cbq_class *cl = this;
783 int len = q->tx_len;
785 q->tx_class = NULL;
787 for ( ; cl; cl = cl->share) {
788 long avgidle = cl->avgidle;
789 long idle;
791 cl->stats.packets++;
792 cl->stats.bytes += len;
		/*
		   (now - last) is total time between packet right edges.
		   (last_pktlen/rate) is "virtual" busy time, so that

			 idle = (now - last) - last_pktlen/rate
		 */
801 idle = PSCHED_TDIFF(q->now, cl->last);
802 if ((unsigned long)idle > 128*1024*1024) {
803 avgidle = cl->maxidle;
804 } else {
805 idle -= L2T(cl, len);
807 /* true_avgidle := (1-W)*true_avgidle + W*idle,
808 where W=2^{-ewma_log}. But cl->avgidle is scaled:
809 cl->avgidle == true_avgidle/W,
810 hence:
812 avgidle += idle - (avgidle>>cl->ewma_log);
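			/* A worked example of the scaled update (illustrative
			   numbers only): with ewma_log = 5 (W = 1/32),
			   avgidle = 3200 and a fully busy interval (idle = 0),
			   the new value is 3200 + 0 - (3200 >> 5) = 3100,
			   which is exactly (1-W)*true_avgidle rescaled by 1/W.
			   A positive idle sample pulls avgidle back up in the
			   same way. */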
815 if (avgidle <= 0) {
816 /* Overlimit or at-limit */
818 if (avgidle < cl->minidle)
819 avgidle = cl->minidle;
821 cl->avgidle = avgidle;
823 /* Calculate expected time, when this class
824 will be allowed to send.
825 It will occur, when:
826 (1-W)*true_avgidle + W*delay = 0, i.e.
827 idle = (1/W - 1)*(-true_avgidle)
829 idle = (1 - W)*(-cl->avgidle);
831 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
834 That is not all.
835 To maintain the rate allocated to the class,
836 we add to undertime virtual clock,
837 necessary to complete transmitted packet.
838 (len/phys_bandwidth has been already passed
839 to the moment of cbq_update)
842 idle -= L2T(&q->link, len);
843 idle += L2T(cl, len);
845 PSCHED_AUDIT_TDIFF(idle);
847 PSCHED_TADD2(q->now, idle, cl->undertime);
848 } else {
849 /* Underlimit */
851 PSCHED_SET_PASTPERFECT(cl->undertime);
852 if (avgidle > cl->maxidle)
853 cl->avgidle = cl->maxidle;
854 else
855 cl->avgidle = avgidle;
857 cl->last = q->now;
860 cbq_update_toplevel(q, this, q->tx_borrowed);
863 static __inline__ struct cbq_class *
864 cbq_under_limit(struct cbq_class *cl)
866 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
867 struct cbq_class *this_cl = cl;
869 if (cl->tparent == NULL)
870 return cl;
872 if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
873 !PSCHED_TLESS(q->now, cl->undertime)) {
874 cl->delayed = 0;
875 return cl;
878 do {
		/* This is a very suspicious place. Now the overlimit
		   action is generated for non-bounded classes
		   only if the link is completely congested.
		   Though this agrees with the ancestor-only paradigm,
		   it looks very stupid. In particular,
		   it means that this chunk of code will either
		   never be called or result in strong amplification
		   of burstiness. Dangerous, silly, and, however,
		   no other solution exists.
		 */
889 if ((cl = cl->borrow) == NULL) {
890 this_cl->stats.overlimits++;
891 this_cl->overlimit(this_cl);
892 return NULL;
894 if (cl->level > q->toplevel)
895 return NULL;
896 } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
897 PSCHED_TLESS(q->now, cl->undertime));
899 cl->delayed = 0;
900 return cl;
903 static __inline__ struct sk_buff *
904 cbq_dequeue_prio(struct Qdisc *sch, int prio)
906 struct cbq_sched_data *q = qdisc_priv(sch);
907 struct cbq_class *cl_tail, *cl_prev, *cl;
908 struct sk_buff *skb;
909 int deficit;
911 cl_tail = cl_prev = q->active[prio];
912 cl = cl_prev->next_alive;
914 do {
915 deficit = 0;
917 /* Start round */
918 do {
919 struct cbq_class *borrow = cl;
921 if (cl->q->q.qlen &&
922 (borrow = cbq_under_limit(cl)) == NULL)
923 goto skip_class;
925 if (cl->deficit <= 0) {
926 /* Class exhausted its allotment per
927 this round. Switch to the next one.
929 deficit = 1;
930 cl->deficit += cl->quantum;
931 goto next_class;
934 skb = cl->q->dequeue(cl->q);
936 /* Class did not give us any skb :-(
937 It could occur even if cl->q->q.qlen != 0
938 f.e. if cl->q == "tbf"
940 if (skb == NULL)
941 goto skip_class;
943 cl->deficit -= skb->len;
944 q->tx_class = cl;
945 q->tx_borrowed = borrow;
946 if (borrow != cl) {
947 #ifndef CBQ_XSTATS_BORROWS_BYTES
948 borrow->xstats.borrows++;
949 cl->xstats.borrows++;
950 #else
951 borrow->xstats.borrows += skb->len;
952 cl->xstats.borrows += skb->len;
953 #endif
955 q->tx_len = skb->len;
957 if (cl->deficit <= 0) {
958 q->active[prio] = cl;
959 cl = cl->next_alive;
960 cl->deficit += cl->quantum;
962 return skb;
964 skip_class:
965 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
966 /* Class is empty or penalized.
967 Unlink it from active chain.
969 cl_prev->next_alive = cl->next_alive;
970 cl->next_alive = NULL;
972 /* Did cl_tail point to it? */
973 if (cl == cl_tail) {
974 /* Repair it! */
975 cl_tail = cl_prev;
977 /* Was it the last class in this band? */
978 if (cl == cl_tail) {
979 /* Kill the band! */
980 q->active[prio] = NULL;
981 q->activemask &= ~(1<<prio);
982 if (cl->q->q.qlen)
983 cbq_activate_class(cl);
984 return NULL;
987 q->active[prio] = cl_tail;
989 if (cl->q->q.qlen)
990 cbq_activate_class(cl);
992 cl = cl_prev;
995 next_class:
996 cl_prev = cl;
997 cl = cl->next_alive;
998 } while (cl_prev != cl_tail);
999 } while (deficit);
1001 q->active[prio] = cl_prev;
1003 return NULL;
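/* A sketch of the per-band WRR accounting above (illustrative numbers, not
   from the original source): a class with quantum = 1514 starts a round with
   deficit = 1514. Dequeuing two 1000-byte packets drives the deficit to 514
   and then to -486; at that point the round moves on, and the class must wait
   until its deficit is topped up by another quantum before it may send again.
   Classes that turn out to be empty or penalized are unlinked from the active
   ring on the skip_class path. */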
1006 static __inline__ struct sk_buff *
1007 cbq_dequeue_1(struct Qdisc *sch)
1009 struct cbq_sched_data *q = qdisc_priv(sch);
1010 struct sk_buff *skb;
1011 unsigned activemask;
1013 activemask = q->activemask&0xFF;
1014 while (activemask) {
1015 int prio = ffz(~activemask);
1016 activemask &= ~(1<<prio);
1017 skb = cbq_dequeue_prio(sch, prio);
1018 if (skb)
1019 return skb;
1021 return NULL;
1024 static struct sk_buff *
1025 cbq_dequeue(struct Qdisc *sch)
1027 struct sk_buff *skb;
1028 struct cbq_sched_data *q = qdisc_priv(sch);
1029 psched_time_t now;
1030 psched_tdiff_t incr;
1032 PSCHED_GET_TIME(now);
1033 incr = PSCHED_TDIFF(now, q->now_rt);
1035 if (q->tx_class) {
1036 psched_tdiff_t incr2;
1037 /* Time integrator. We calculate EOS time
1038 by adding expected packet transmission time.
1039 If real time is greater, we warp artificial clock,
1040 so that:
1042 cbq_time = max(real_time, work);
1044 incr2 = L2T(&q->link, q->tx_len);
1045 PSCHED_TADD(q->now, incr2);
1046 cbq_update(q);
1047 if ((incr -= incr2) < 0)
1048 incr = 0;
1050 PSCHED_TADD(q->now, incr);
1051 q->now_rt = now;
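	/* A numeric illustration of the integrator above (made-up figures):
	   suppose the previously transmitted packet needed incr2 = 8 ms of
	   link time according to L2T(&q->link, tx_len). If only incr = 5 ms
	   of real time elapsed, q->now is advanced by the full 8 ms and the
	   remaining real-time increment is clamped to 0, so the virtual clock
	   runs on "work done". If instead 12 ms elapsed, q->now ends up
	   advanced by 8 + (12 - 8) = 12 ms, snapping back to real time.
	   That is the cbq_time = max(real_time, work) rule stated above. */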
1053 for (;;) {
1054 q->wd_expires = 0;
1056 skb = cbq_dequeue_1(sch);
1057 if (skb) {
1058 sch->q.qlen--;
1059 sch->flags &= ~TCQ_F_THROTTLED;
1060 return skb;
		/* All the classes are overlimit.

		   This is possible if:

		   1. The scheduler is empty.
		   2. The toplevel cutoff inhibited borrowing.
		   3. The root class is overlimit.

		   Reset the 2nd and 3rd conditions and retry.

		   Note that NS and cbq-2.0 are buggy here: picking
		   an arbitrary class is appropriate for ancestor-only
		   link sharing, but not for the toplevel algorithm.

		   Our version is better, but slower, because it requires
		   two passes; this is unavoidable with top-level sharing.
		*/
1081 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1082 PSCHED_IS_PASTPERFECT(q->link.undertime))
1083 break;
1085 q->toplevel = TC_CBQ_MAXLEVEL;
1086 PSCHED_SET_PASTPERFECT(q->link.undertime);
1089 /* No packets in scheduler or nobody wants to give them to us :-(
1090 Sigh... start watchdog timer in the last case. */
1092 if (sch->q.qlen) {
1093 sch->stats.overlimits++;
1094 if (q->wd_expires) {
1095 long delay = PSCHED_US2JIFFIE(q->wd_expires);
1096 if (delay <= 0)
1097 delay = 1;
1098 mod_timer(&q->wd_timer, jiffies + delay);
1099 sch->flags |= TCQ_F_THROTTLED;
1102 return NULL;
/* CBQ class maintenance routines */
1107 static void cbq_adjust_levels(struct cbq_class *this)
1109 if (this == NULL)
1110 return;
1112 do {
1113 int level = 0;
1114 struct cbq_class *cl;
1116 if ((cl = this->children) != NULL) {
1117 do {
1118 if (cl->level > level)
1119 level = cl->level;
1120 } while ((cl = cl->sibling) != this->children);
1122 this->level = level+1;
1123 } while ((this = this->tparent) != NULL);
1126 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1128 struct cbq_class *cl;
1129 unsigned h;
1131 if (q->quanta[prio] == 0)
1132 return;
1134 for (h=0; h<16; h++) {
1135 for (cl = q->classes[h]; cl; cl = cl->next) {
			/* BUGGGG... Beware! This expression suffers from
			   arithmetic overflow!
			 */
1139 if (cl->priority == prio) {
1140 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1141 q->quanta[prio];
1143 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1144 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1145 cl->quantum = cl->qdisc->dev->mtu/2 + 1;
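/* A worked example of the normalization above (illustrative numbers): three
   classes share one priority band with weights 1, 2 and 3, each with
   allot = 1514, so q->quanta[prio] = 6 and q->nclasses[prio] = 3. The
   resulting quanta are 1*1514*3/6 = 757, 2*1514*3/6 = 1514 and
   3*1514*3/6 = 2271 bytes per WRR round: bandwidth within the band is split
   1:2:3 while the average quantum stays at one allot. */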
1151 static void cbq_sync_defmap(struct cbq_class *cl)
1153 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1154 struct cbq_class *split = cl->split;
1155 unsigned h;
1156 int i;
1158 if (split == NULL)
1159 return;
1161 for (i=0; i<=TC_PRIO_MAX; i++) {
1162 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1163 split->defaults[i] = NULL;
1166 for (i=0; i<=TC_PRIO_MAX; i++) {
1167 int level = split->level;
1169 if (split->defaults[i])
1170 continue;
1172 for (h=0; h<16; h++) {
1173 struct cbq_class *c;
1175 for (c = q->classes[h]; c; c = c->next) {
1176 if (c->split == split && c->level < level &&
1177 c->defmap&(1<<i)) {
1178 split->defaults[i] = c;
1179 level = c->level;
1186 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1188 struct cbq_class *split = NULL;
1190 if (splitid == 0) {
1191 if ((split = cl->split) == NULL)
1192 return;
1193 splitid = split->classid;
1196 if (split == NULL || split->classid != splitid) {
1197 for (split = cl->tparent; split; split = split->tparent)
1198 if (split->classid == splitid)
1199 break;
1202 if (split == NULL)
1203 return;
1205 if (cl->split != split) {
1206 cl->defmap = 0;
1207 cbq_sync_defmap(cl);
1208 cl->split = split;
1209 cl->defmap = def&mask;
1210 } else
1211 cl->defmap = (cl->defmap&~mask)|(def&mask);
1213 cbq_sync_defmap(cl);
1216 static void cbq_unlink_class(struct cbq_class *this)
1218 struct cbq_class *cl, **clp;
1219 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1221 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1222 if (cl == this) {
1223 *clp = cl->next;
1224 cl->next = NULL;
1225 break;
1229 if (this->tparent) {
1230 clp=&this->sibling;
1231 cl = *clp;
1232 do {
1233 if (cl == this) {
1234 *clp = cl->sibling;
1235 break;
1237 clp = &cl->sibling;
1238 } while ((cl = *clp) != this->sibling);
1240 if (this->tparent->children == this) {
1241 this->tparent->children = this->sibling;
1242 if (this->sibling == this)
1243 this->tparent->children = NULL;
1245 } else {
1246 BUG_TRAP(this->sibling == this);
1250 static void cbq_link_class(struct cbq_class *this)
1252 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1253 unsigned h = cbq_hash(this->classid);
1254 struct cbq_class *parent = this->tparent;
1256 this->sibling = this;
1257 this->next = q->classes[h];
1258 q->classes[h] = this;
1260 if (parent == NULL)
1261 return;
1263 if (parent->children == NULL) {
1264 parent->children = this;
1265 } else {
1266 this->sibling = parent->children->sibling;
1267 parent->children->sibling = this;
1271 static unsigned int cbq_drop(struct Qdisc* sch)
1273 struct cbq_sched_data *q = qdisc_priv(sch);
1274 struct cbq_class *cl, *cl_head;
1275 int prio;
1276 unsigned int len;
1278 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1279 if ((cl_head = q->active[prio]) == NULL)
1280 continue;
1282 cl = cl_head;
1283 do {
1284 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1285 sch->q.qlen--;
1286 return len;
1288 } while ((cl = cl->next_alive) != cl_head);
1290 return 0;
1293 static void
1294 cbq_reset(struct Qdisc* sch)
1296 struct cbq_sched_data *q = qdisc_priv(sch);
1297 struct cbq_class *cl;
1298 int prio;
1299 unsigned h;
1301 q->activemask = 0;
1302 q->pmask = 0;
1303 q->tx_class = NULL;
1304 q->tx_borrowed = NULL;
1305 del_timer(&q->wd_timer);
1306 del_timer(&q->delay_timer);
1307 q->toplevel = TC_CBQ_MAXLEVEL;
1308 PSCHED_GET_TIME(q->now);
1309 q->now_rt = q->now;
1311 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1312 q->active[prio] = NULL;
1314 for (h = 0; h < 16; h++) {
1315 for (cl = q->classes[h]; cl; cl = cl->next) {
1316 qdisc_reset(cl->q);
1318 cl->next_alive = NULL;
1319 PSCHED_SET_PASTPERFECT(cl->undertime);
1320 cl->avgidle = cl->maxidle;
1321 cl->deficit = cl->quantum;
1322 cl->cpriority = cl->priority;
1325 sch->q.qlen = 0;
1329 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1331 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1332 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1333 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1335 if (lss->change&TCF_CBQ_LSS_EWMA)
1336 cl->ewma_log = lss->ewma_log;
1337 if (lss->change&TCF_CBQ_LSS_AVPKT)
1338 cl->avpkt = lss->avpkt;
1339 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1340 cl->minidle = -(long)lss->minidle;
1341 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1342 cl->maxidle = lss->maxidle;
1343 cl->avgidle = lss->maxidle;
1345 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1346 cl->offtime = lss->offtime;
1347 return 0;
1350 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1352 q->nclasses[cl->priority]--;
1353 q->quanta[cl->priority] -= cl->weight;
1354 cbq_normalize_quanta(q, cl->priority);
1357 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1359 q->nclasses[cl->priority]++;
1360 q->quanta[cl->priority] += cl->weight;
1361 cbq_normalize_quanta(q, cl->priority);
1364 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1366 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1368 if (wrr->allot)
1369 cl->allot = wrr->allot;
1370 if (wrr->weight)
1371 cl->weight = wrr->weight;
1372 if (wrr->priority) {
1373 cl->priority = wrr->priority-1;
1374 cl->cpriority = cl->priority;
1375 if (cl->priority >= cl->priority2)
1376 cl->priority2 = TC_CBQ_MAXPRIO-1;
1379 cbq_addprio(q, cl);
1380 return 0;
1383 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1385 switch (ovl->strategy) {
1386 case TC_CBQ_OVL_CLASSIC:
1387 cl->overlimit = cbq_ovl_classic;
1388 break;
1389 case TC_CBQ_OVL_DELAY:
1390 cl->overlimit = cbq_ovl_delay;
1391 break;
1392 case TC_CBQ_OVL_LOWPRIO:
1393 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1394 ovl->priority2-1 <= cl->priority)
1395 return -EINVAL;
1396 cl->priority2 = ovl->priority2-1;
1397 cl->overlimit = cbq_ovl_lowprio;
1398 break;
1399 case TC_CBQ_OVL_DROP:
1400 cl->overlimit = cbq_ovl_drop;
1401 break;
1402 case TC_CBQ_OVL_RCLASSIC:
1403 cl->overlimit = cbq_ovl_rclassic;
1404 break;
1405 default:
1406 return -EINVAL;
1408 cl->penalty = (ovl->penalty*HZ)/1000;
1409 return 0;
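/* Note on the conversion above: the penalty is carried in milliseconds, so
   with HZ = 100 a configured penalty of 30 ms becomes (30*100)/1000 = 3
   jiffies, which is what cbq_ovl_lowprio() and cbq_ovl_delay() later add to
   "penalized". (Illustrative figures only.) */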
1412 #ifdef CONFIG_NET_CLS_POLICE
1413 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1415 cl->police = p->police;
1417 if (cl->q->handle) {
1418 if (p->police == TC_POLICE_RECLASSIFY)
1419 cl->q->reshape_fail = cbq_reshape_fail;
1420 else
1421 cl->q->reshape_fail = NULL;
1423 return 0;
1425 #endif
1427 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1429 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1430 return 0;
1433 static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1435 struct cbq_sched_data *q = qdisc_priv(sch);
1436 struct rtattr *tb[TCA_CBQ_MAX];
1437 struct tc_ratespec *r;
1439 if (rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0 ||
1440 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1441 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1442 return -EINVAL;
1444 if (tb[TCA_CBQ_LSSOPT-1] &&
1445 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1446 return -EINVAL;
1448 r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1450 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL)
1451 return -EINVAL;
1453 q->link.refcnt = 1;
1454 q->link.sibling = &q->link;
1455 q->link.classid = sch->handle;
1456 q->link.qdisc = sch;
1457 if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1458 q->link.q = &noop_qdisc;
1460 q->link.priority = TC_CBQ_MAXPRIO-1;
1461 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1462 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1463 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1464 q->link.overlimit = cbq_ovl_classic;
1465 q->link.allot = psched_mtu(sch->dev);
1466 q->link.quantum = q->link.allot;
1467 q->link.weight = q->link.R_tab->rate.rate;
1469 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1470 q->link.avpkt = q->link.allot/2;
1471 q->link.minidle = -0x7FFFFFFF;
1472 q->link.stats_lock = &sch->dev->queue_lock;
1474 init_timer(&q->wd_timer);
1475 q->wd_timer.data = (unsigned long)sch;
1476 q->wd_timer.function = cbq_watchdog;
1477 init_timer(&q->delay_timer);
1478 q->delay_timer.data = (unsigned long)sch;
1479 q->delay_timer.function = cbq_undelay;
1480 q->toplevel = TC_CBQ_MAXLEVEL;
1481 PSCHED_GET_TIME(q->now);
1482 q->now_rt = q->now;
1484 cbq_link_class(&q->link);
1486 if (tb[TCA_CBQ_LSSOPT-1])
1487 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1489 cbq_addprio(q, &q->link);
1490 return 0;
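/* For reference, a typical userspace invocation that exercises cbq_init()
   (illustrative device name and figures, standard iproute2 tc syntax):

	tc qdisc add dev eth0 root handle 1: cbq \
		bandwidth 100Mbit avpkt 1000 cell 8

   tc supplies TCA_CBQ_RATE/TCA_CBQ_RTAB (the rate table derived from
   "bandwidth" and "cell") and TCA_CBQ_LSSOPT (avpkt etc.), which is what the
   attribute checks at the top of cbq_init() expect. */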
1493 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1495 unsigned char *b = skb->tail;
1497 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1498 return skb->len;
1500 rtattr_failure:
1501 skb_trim(skb, b - skb->data);
1502 return -1;
1505 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1507 unsigned char *b = skb->tail;
1508 struct tc_cbq_lssopt opt;
1510 opt.flags = 0;
1511 if (cl->borrow == NULL)
1512 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1513 if (cl->share == NULL)
1514 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1515 opt.ewma_log = cl->ewma_log;
1516 opt.level = cl->level;
1517 opt.avpkt = cl->avpkt;
1518 opt.maxidle = cl->maxidle;
1519 opt.minidle = (u32)(-cl->minidle);
1520 opt.offtime = cl->offtime;
1521 opt.change = ~0;
1522 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1523 return skb->len;
1525 rtattr_failure:
1526 skb_trim(skb, b - skb->data);
1527 return -1;
1530 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1532 unsigned char *b = skb->tail;
1533 struct tc_cbq_wrropt opt;
1535 opt.flags = 0;
1536 opt.allot = cl->allot;
1537 opt.priority = cl->priority+1;
1538 opt.cpriority = cl->cpriority+1;
1539 opt.weight = cl->weight;
1540 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1541 return skb->len;
1543 rtattr_failure:
1544 skb_trim(skb, b - skb->data);
1545 return -1;
1548 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1550 unsigned char *b = skb->tail;
1551 struct tc_cbq_ovl opt;
1553 opt.strategy = cl->ovl_strategy;
1554 opt.priority2 = cl->priority2+1;
1555 opt.penalty = (cl->penalty*1000)/HZ;
1556 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1557 return skb->len;
1559 rtattr_failure:
1560 skb_trim(skb, b - skb->data);
1561 return -1;
1564 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1566 unsigned char *b = skb->tail;
1567 struct tc_cbq_fopt opt;
1569 if (cl->split || cl->defmap) {
1570 opt.split = cl->split ? cl->split->classid : 0;
1571 opt.defmap = cl->defmap;
1572 opt.defchange = ~0;
1573 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1575 return skb->len;
1577 rtattr_failure:
1578 skb_trim(skb, b - skb->data);
1579 return -1;
1582 #ifdef CONFIG_NET_CLS_POLICE
1583 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1585 unsigned char *b = skb->tail;
1586 struct tc_cbq_police opt;
1588 if (cl->police) {
1589 opt.police = cl->police;
1590 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1592 return skb->len;
1594 rtattr_failure:
1595 skb_trim(skb, b - skb->data);
1596 return -1;
1598 #endif
1600 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1602 if (cbq_dump_lss(skb, cl) < 0 ||
1603 cbq_dump_rate(skb, cl) < 0 ||
1604 cbq_dump_wrr(skb, cl) < 0 ||
1605 cbq_dump_ovl(skb, cl) < 0 ||
1606 #ifdef CONFIG_NET_CLS_POLICE
1607 cbq_dump_police(skb, cl) < 0 ||
1608 #endif
1609 cbq_dump_fopt(skb, cl) < 0)
1610 return -1;
1611 return 0;
1614 int cbq_copy_xstats(struct sk_buff *skb, struct tc_cbq_xstats *st)
1616 RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st);
1617 return 0;
1619 rtattr_failure:
1620 return -1;
1624 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1626 struct cbq_sched_data *q = qdisc_priv(sch);
1627 unsigned char *b = skb->tail;
1628 struct rtattr *rta;
1630 rta = (struct rtattr*)b;
1631 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1632 if (cbq_dump_attr(skb, &q->link) < 0)
1633 goto rtattr_failure;
1634 rta->rta_len = skb->tail - b;
1635 spin_lock_bh(&sch->dev->queue_lock);
1636 q->link.xstats.avgidle = q->link.avgidle;
1637 if (cbq_copy_xstats(skb, &q->link.xstats)) {
1638 spin_unlock_bh(&sch->dev->queue_lock);
1639 goto rtattr_failure;
1641 spin_unlock_bh(&sch->dev->queue_lock);
1642 return skb->len;
1644 rtattr_failure:
1645 skb_trim(skb, b - skb->data);
1646 return -1;
1649 static int
1650 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1651 struct sk_buff *skb, struct tcmsg *tcm)
1653 struct cbq_sched_data *q = qdisc_priv(sch);
1654 struct cbq_class *cl = (struct cbq_class*)arg;
1655 unsigned char *b = skb->tail;
1656 struct rtattr *rta;
1658 if (cl->tparent)
1659 tcm->tcm_parent = cl->tparent->classid;
1660 else
1661 tcm->tcm_parent = TC_H_ROOT;
1662 tcm->tcm_handle = cl->classid;
1663 tcm->tcm_info = cl->q->handle;
1665 rta = (struct rtattr*)b;
1666 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1667 if (cbq_dump_attr(skb, cl) < 0)
1668 goto rtattr_failure;
1669 rta->rta_len = skb->tail - b;
1670 cl->stats.qlen = cl->q->q.qlen;
1671 if (qdisc_copy_stats(skb, &cl->stats, cl->stats_lock))
1672 goto rtattr_failure;
1673 spin_lock_bh(&sch->dev->queue_lock);
1674 cl->xstats.avgidle = cl->avgidle;
1675 cl->xstats.undertime = 0;
1676 if (!PSCHED_IS_PASTPERFECT(cl->undertime))
1677 cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
1678 if (cbq_copy_xstats(skb, &cl->xstats)) {
1679 spin_unlock_bh(&sch->dev->queue_lock);
1680 goto rtattr_failure;
1682 spin_unlock_bh(&sch->dev->queue_lock);
1684 return skb->len;
1686 rtattr_failure:
1687 skb_trim(skb, b - skb->data);
1688 return -1;
1691 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1692 struct Qdisc **old)
1694 struct cbq_class *cl = (struct cbq_class*)arg;
1696 if (cl) {
1697 if (new == NULL) {
1698 if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL)
1699 return -ENOBUFS;
1700 } else {
1701 #ifdef CONFIG_NET_CLS_POLICE
1702 if (cl->police == TC_POLICE_RECLASSIFY)
1703 new->reshape_fail = cbq_reshape_fail;
1704 #endif
1706 sch_tree_lock(sch);
1707 *old = cl->q;
1708 cl->q = new;
1709 sch->q.qlen -= (*old)->q.qlen;
1710 qdisc_reset(*old);
1711 sch_tree_unlock(sch);
1713 return 0;
1715 return -ENOENT;
1718 static struct Qdisc *
1719 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1721 struct cbq_class *cl = (struct cbq_class*)arg;
1723 return cl ? cl->q : NULL;
1726 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1728 struct cbq_sched_data *q = qdisc_priv(sch);
1729 struct cbq_class *cl = cbq_class_lookup(q, classid);
1731 if (cl) {
1732 cl->refcnt++;
1733 return (unsigned long)cl;
1735 return 0;
1738 static void cbq_destroy_filters(struct cbq_class *cl)
1740 struct tcf_proto *tp;
1742 while ((tp = cl->filter_list) != NULL) {
1743 cl->filter_list = tp->next;
1744 tcf_destroy(tp);
1748 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1750 struct cbq_sched_data *q = qdisc_priv(sch);
1752 cbq_destroy_filters(cl);
1753 qdisc_destroy(cl->q);
1754 qdisc_put_rtab(cl->R_tab);
1755 #ifdef CONFIG_NET_ESTIMATOR
1756 qdisc_kill_estimator(&cl->stats);
1757 #endif
1758 if (cl != &q->link)
1759 kfree(cl);
1762 static void
1763 cbq_destroy(struct Qdisc* sch)
1765 struct cbq_sched_data *q = qdisc_priv(sch);
1766 struct cbq_class *cl;
1767 unsigned h;
1769 #ifdef CONFIG_NET_CLS_POLICE
1770 q->rx_class = NULL;
1771 #endif
1773 for (h = 0; h < 16; h++) {
1774 struct cbq_class *next;
1776 for (cl = q->classes[h]; cl; cl = next) {
1777 next = cl->next;
1778 cbq_destroy_class(sch, cl);
1783 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1785 struct cbq_class *cl = (struct cbq_class*)arg;
1787 if (--cl->refcnt == 0) {
1788 #ifdef CONFIG_NET_CLS_POLICE
1789 struct cbq_sched_data *q = qdisc_priv(sch);
1791 spin_lock_bh(&sch->dev->queue_lock);
1792 if (q->rx_class == cl)
1793 q->rx_class = NULL;
1794 spin_unlock_bh(&sch->dev->queue_lock);
1795 #endif
1797 cbq_destroy_class(sch, cl);
1801 static int
1802 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1803 unsigned long *arg)
1805 int err;
1806 struct cbq_sched_data *q = qdisc_priv(sch);
1807 struct cbq_class *cl = (struct cbq_class*)*arg;
1808 struct rtattr *opt = tca[TCA_OPTIONS-1];
1809 struct rtattr *tb[TCA_CBQ_MAX];
1810 struct cbq_class *parent;
1811 struct qdisc_rate_table *rtab = NULL;
1813 if (opt==NULL ||
1814 rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)))
1815 return -EINVAL;
1817 if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1818 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1819 return -EINVAL;
1821 if (tb[TCA_CBQ_FOPT-1] &&
1822 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1823 return -EINVAL;
1825 if (tb[TCA_CBQ_RATE-1] &&
1826 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1827 return -EINVAL;
1829 if (tb[TCA_CBQ_LSSOPT-1] &&
1830 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1831 return -EINVAL;
1833 if (tb[TCA_CBQ_WRROPT-1] &&
1834 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1835 return -EINVAL;
1837 #ifdef CONFIG_NET_CLS_POLICE
1838 if (tb[TCA_CBQ_POLICE-1] &&
1839 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1840 return -EINVAL;
1841 #endif
1843 if (cl) {
1844 /* Check parent */
1845 if (parentid) {
1846 if (cl->tparent && cl->tparent->classid != parentid)
1847 return -EINVAL;
1848 if (!cl->tparent && parentid != TC_H_ROOT)
1849 return -EINVAL;
1852 if (tb[TCA_CBQ_RATE-1]) {
1853 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1854 if (rtab == NULL)
1855 return -EINVAL;
1858 /* Change class parameters */
1859 sch_tree_lock(sch);
1861 if (cl->next_alive != NULL)
1862 cbq_deactivate_class(cl);
1864 if (rtab) {
1865 rtab = xchg(&cl->R_tab, rtab);
1866 qdisc_put_rtab(rtab);
1869 if (tb[TCA_CBQ_LSSOPT-1])
1870 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1872 if (tb[TCA_CBQ_WRROPT-1]) {
1873 cbq_rmprio(q, cl);
1874 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1877 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1878 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1880 #ifdef CONFIG_NET_CLS_POLICE
1881 if (tb[TCA_CBQ_POLICE-1])
1882 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1883 #endif
1885 if (tb[TCA_CBQ_FOPT-1])
1886 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1888 if (cl->q->q.qlen)
1889 cbq_activate_class(cl);
1891 sch_tree_unlock(sch);
1893 #ifdef CONFIG_NET_ESTIMATOR
1894 if (tca[TCA_RATE-1]) {
1895 qdisc_kill_estimator(&cl->stats);
1896 qdisc_new_estimator(&cl->stats, cl->stats_lock,
1897 tca[TCA_RATE-1]);
1899 #endif
1900 return 0;
1903 if (parentid == TC_H_ROOT)
1904 return -EINVAL;
1906 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1907 tb[TCA_CBQ_LSSOPT-1] == NULL)
1908 return -EINVAL;
1910 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1911 if (rtab == NULL)
1912 return -EINVAL;
1914 if (classid) {
1915 err = -EINVAL;
1916 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1917 goto failure;
1918 } else {
1919 int i;
1920 classid = TC_H_MAKE(sch->handle,0x8000);
1922 for (i=0; i<0x8000; i++) {
1923 if (++q->hgenerator >= 0x8000)
1924 q->hgenerator = 1;
1925 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1926 break;
1928 err = -ENOSR;
1929 if (i >= 0x8000)
1930 goto failure;
1931 classid = classid|q->hgenerator;
1934 parent = &q->link;
1935 if (parentid) {
1936 parent = cbq_class_lookup(q, parentid);
1937 err = -EINVAL;
1938 if (parent == NULL)
1939 goto failure;
1942 err = -ENOBUFS;
1943 cl = kmalloc(sizeof(*cl), GFP_KERNEL);
1944 if (cl == NULL)
1945 goto failure;
1946 memset(cl, 0, sizeof(*cl));
1947 cl->R_tab = rtab;
1948 rtab = NULL;
1949 cl->refcnt = 1;
1950 if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1951 cl->q = &noop_qdisc;
1952 cl->classid = classid;
1953 cl->tparent = parent;
1954 cl->qdisc = sch;
1955 cl->allot = parent->allot;
1956 cl->quantum = cl->allot;
1957 cl->weight = cl->R_tab->rate.rate;
1958 cl->stats_lock = &sch->dev->queue_lock;
1960 sch_tree_lock(sch);
1961 cbq_link_class(cl);
1962 cl->borrow = cl->tparent;
1963 if (cl->tparent != &q->link)
1964 cl->share = cl->tparent;
1965 cbq_adjust_levels(parent);
1966 cl->minidle = -0x7FFFFFFF;
1967 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1968 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1969 if (cl->ewma_log==0)
1970 cl->ewma_log = q->link.ewma_log;
1971 if (cl->maxidle==0)
1972 cl->maxidle = q->link.maxidle;
1973 if (cl->avpkt==0)
1974 cl->avpkt = q->link.avpkt;
1975 cl->overlimit = cbq_ovl_classic;
1976 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1977 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1978 #ifdef CONFIG_NET_CLS_POLICE
1979 if (tb[TCA_CBQ_POLICE-1])
1980 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1981 #endif
1982 if (tb[TCA_CBQ_FOPT-1])
1983 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1984 sch_tree_unlock(sch);
1986 #ifdef CONFIG_NET_ESTIMATOR
1987 if (tca[TCA_RATE-1])
1988 qdisc_new_estimator(&cl->stats, cl->stats_lock,
1989 tca[TCA_RATE-1]);
1990 #endif
1992 *arg = (unsigned long)cl;
1993 return 0;
1995 failure:
1996 qdisc_put_rtab(rtab);
1997 return err;
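/* Again for reference, a class-creation request that lands in the "new
   class" branch above (illustrative handles and rates, standard iproute2
   syntax):

	tc class add dev eth0 parent 1: classid 1:10 cbq \
		bandwidth 100Mbit rate 10Mbit allot 1514 prio 5 \
		avpkt 1000 maxburst 20 bounded

   Roughly, "rate" becomes TCA_CBQ_RATE/TCA_CBQ_RTAB, "allot" and "prio" go
   into TCA_CBQ_WRROPT, and "bounded"/"maxburst"/"avpkt" end up in
   TCA_CBQ_LSSOPT, matching the three mandatory attributes checked before a
   new class may be created. */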
2000 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
2002 struct cbq_sched_data *q = qdisc_priv(sch);
2003 struct cbq_class *cl = (struct cbq_class*)arg;
2005 if (cl->filters || cl->children || cl == &q->link)
2006 return -EBUSY;
2008 sch_tree_lock(sch);
2010 if (cl->next_alive)
2011 cbq_deactivate_class(cl);
2013 if (q->tx_borrowed == cl)
2014 q->tx_borrowed = q->tx_class;
2015 if (q->tx_class == cl) {
2016 q->tx_class = NULL;
2017 q->tx_borrowed = NULL;
2019 #ifdef CONFIG_NET_CLS_POLICE
2020 if (q->rx_class == cl)
2021 q->rx_class = NULL;
2022 #endif
2024 cbq_unlink_class(cl);
2025 cbq_adjust_levels(cl->tparent);
2026 cl->defmap = 0;
2027 cbq_sync_defmap(cl);
2029 cbq_rmprio(q, cl);
2030 sch_tree_unlock(sch);
2032 if (--cl->refcnt == 0)
2033 cbq_destroy_class(sch, cl);
2035 return 0;
2038 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2040 struct cbq_sched_data *q = qdisc_priv(sch);
2041 struct cbq_class *cl = (struct cbq_class *)arg;
2043 if (cl == NULL)
2044 cl = &q->link;
2046 return &cl->filter_list;
2049 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2050 u32 classid)
2052 struct cbq_sched_data *q = qdisc_priv(sch);
2053 struct cbq_class *p = (struct cbq_class*)parent;
2054 struct cbq_class *cl = cbq_class_lookup(q, classid);
2056 if (cl) {
2057 if (p && p->level <= cl->level)
2058 return 0;
2059 cl->filters++;
2060 return (unsigned long)cl;
2062 return 0;
2065 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2067 struct cbq_class *cl = (struct cbq_class*)arg;
2069 cl->filters--;
2072 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2074 struct cbq_sched_data *q = qdisc_priv(sch);
2075 unsigned h;
2077 if (arg->stop)
2078 return;
2080 for (h = 0; h < 16; h++) {
2081 struct cbq_class *cl;
2083 for (cl = q->classes[h]; cl; cl = cl->next) {
2084 if (arg->count < arg->skip) {
2085 arg->count++;
2086 continue;
2088 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2089 arg->stop = 1;
2090 return;
2092 arg->count++;
2097 static struct Qdisc_class_ops cbq_class_ops = {
2098 .graft = cbq_graft,
2099 .leaf = cbq_leaf,
2100 .get = cbq_get,
2101 .put = cbq_put,
2102 .change = cbq_change_class,
2103 .delete = cbq_delete,
2104 .walk = cbq_walk,
2105 .tcf_chain = cbq_find_tcf,
2106 .bind_tcf = cbq_bind_filter,
2107 .unbind_tcf = cbq_unbind_filter,
2108 .dump = cbq_dump_class,
2111 static struct Qdisc_ops cbq_qdisc_ops = {
2112 .next = NULL,
2113 .cl_ops = &cbq_class_ops,
2114 .id = "cbq",
2115 .priv_size = sizeof(struct cbq_sched_data),
2116 .enqueue = cbq_enqueue,
2117 .dequeue = cbq_dequeue,
2118 .requeue = cbq_requeue,
2119 .drop = cbq_drop,
2120 .init = cbq_init,
2121 .reset = cbq_reset,
2122 .destroy = cbq_destroy,
2123 .change = NULL,
2124 .dump = cbq_dump,
2125 .owner = THIS_MODULE,
2128 static int __init cbq_module_init(void)
2130 return register_qdisc(&cbq_qdisc_ops);
2132 static void __exit cbq_module_exit(void)
2134 unregister_qdisc(&cbq_qdisc_ops);
2136 module_init(cbq_module_init)
2137 module_exit(cbq_module_exit)
2138 MODULE_LICENSE("GPL");