Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/sparc-2.6
[linux/fpc-iii.git] / net / sched / sch_cbq.c
blobbac881bfe3628b24a36642c967f30543447fca39
1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/module.h>
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/sched.h>
20 #include <linux/string.h>
21 #include <linux/mm.h>
22 #include <linux/socket.h>
23 #include <linux/sockios.h>
24 #include <linux/in.h>
25 #include <linux/errno.h>
26 #include <linux/interrupt.h>
27 #include <linux/if_ether.h>
28 #include <linux/inet.h>
29 #include <linux/netdevice.h>
30 #include <linux/etherdevice.h>
31 #include <linux/notifier.h>
32 #include <net/ip.h>
33 #include <net/route.h>
34 #include <linux/skbuff.h>
35 #include <net/sock.h>
36 #include <net/pkt_sched.h>
39 /* Class-Based Queueing (CBQ) algorithm.
40 =======================================
42 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
43 Management Models for Packet Networks",
44 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
46 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
48 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
49 Parameters", 1996
51 [4] Sally Floyd and Michael Speer, "Experimental Results
52 for Class-Based Queueing", 1998, not published.
54 -----------------------------------------------------------------------
56 Algorithm skeleton was taken from NS simulator cbq.cc.
57 If someone wants to check this code against the LBL version,
58 he should take into account that ONLY the skeleton was borrowed,
59 the implementation is different. Particularly:
61 --- The WRR algorithm is different. Our version looks more
62 reasonable (I hope) and works when quanta are allowed to be
63 less than MTU, which is always the case when real time classes
64 have small rates. Note, that the statement of [3] is
65 incomplete, delay may actually be estimated even if class
66 per-round allotment is less than MTU. Namely, if per-round
67 allotment is W*r_i, and r_1+...+r_k = r < 1
69 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
71 In the worst case we have IntServ estimate with D = W*r+k*MTU
72 and C = MTU*r. The proof (if correct at all) is trivial.
75 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
76 interpret some places, which look like wrong translations
77 from NS. Anyone is advised to find these differences
78 and explain to me, why I am wrong 8).
80 --- Linux has no EOI event, so that we cannot estimate true class
81 idle time. Workaround is to consider the next dequeue event
82 as sign that previous packet is finished. This is wrong because of
83 internal device queueing, but on a permanently loaded link it is true.
84 Moreover, combined with clock integrator, this scheme looks
85 very close to an ideal solution. */
87 struct cbq_sched_data;
90 struct cbq_class
92 struct cbq_class *next; /* hash table link */
93 struct cbq_class *next_alive; /* next class with backlog in this priority band */
95 /* Parameters */
96 u32 classid;
97 unsigned char priority; /* class priority */
98 unsigned char priority2; /* priority to be used after overlimit */
99 unsigned char ewma_log; /* time constant for idle time calculation */
100 unsigned char ovl_strategy;
101 #ifdef CONFIG_NET_CLS_POLICE
102 unsigned char police;
103 #endif
105 u32 defmap;
107 /* Link-sharing scheduler parameters */
108 long maxidle; /* Class parameters: see below. */
109 long offtime;
110 long minidle;
111 u32 avpkt;
112 struct qdisc_rate_table *R_tab;
114 /* Overlimit strategy parameters */
115 void (*overlimit)(struct cbq_class *cl);
116 long penalty;
118 /* General scheduler (WRR) parameters */
119 long allot;
120 long quantum; /* Allotment per WRR round */
121 long weight; /* Relative allotment: see below */
123 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
124 struct cbq_class *split; /* Ptr to split node */
125 struct cbq_class *share; /* Ptr to LS parent in the class tree */
126 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
127 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
128 parent otherwise */
129 struct cbq_class *sibling; /* Sibling chain */
130 struct cbq_class *children; /* Pointer to children chain */
132 struct Qdisc *q; /* Elementary queueing discipline */
135 /* Variables */
136 unsigned char cpriority; /* Effective priority */
137 unsigned char delayed;
138 unsigned char level; /* level of the class in hierarchy:
139 0 for leaf classes, and maximal
140 level of children + 1 for nodes.
143 psched_time_t last; /* Last end of service */
144 psched_time_t undertime;
145 long avgidle;
146 long deficit; /* Saved deficit for WRR */
147 unsigned long penalized;
148 struct gnet_stats_basic bstats;
149 struct gnet_stats_queue qstats;
150 struct gnet_stats_rate_est rate_est;
151 spinlock_t *stats_lock;
152 struct tc_cbq_xstats xstats;
154 struct tcf_proto *filter_list;
156 int refcnt;
157 int filters;
159 struct cbq_class *defaults[TC_PRIO_MAX+1];
162 struct cbq_sched_data
164 struct cbq_class *classes[16]; /* Hash table of all classes */
165 int nclasses[TC_CBQ_MAXPRIO+1];
166 unsigned quanta[TC_CBQ_MAXPRIO+1];
168 struct cbq_class link;
170 unsigned activemask;
171 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
172 with backlog */
174 #ifdef CONFIG_NET_CLS_POLICE
175 struct cbq_class *rx_class;
176 #endif
177 struct cbq_class *tx_class;
178 struct cbq_class *tx_borrowed;
179 int tx_len;
180 psched_time_t now; /* Cached timestamp */
181 psched_time_t now_rt; /* Cached real time */
182 unsigned pmask;
184 struct timer_list delay_timer;
185 struct timer_list wd_timer; /* Watchdog timer,
186 started when CBQ has
187 backlog, but cannot
188 transmit just now */
189 long wd_expires;
190 int toplevel;
191 u32 hgenerator;
195 #define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
198 static __inline__ unsigned cbq_hash(u32 h)
200 h ^= h>>8;
201 h ^= h>>4;
202 return h&0xF;
205 static __inline__ struct cbq_class *
206 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
208 struct cbq_class *cl;
210 for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
211 if (cl->classid == classid)
212 return cl;
213 return NULL;
216 #ifdef CONFIG_NET_CLS_POLICE
218 static struct cbq_class *
219 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
221 struct cbq_class *cl, *new;
223 for (cl = this->tparent; cl; cl = cl->tparent)
224 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
225 return new;
227 return NULL;
230 #endif
232 /* Classify packet. The procedure is pretty complicated, but
233 it allows us to combine link sharing and priority scheduling
234 transparently.
236 Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
237 so that it resolves to split nodes. Then packets are classified
238 by logical priority, or a more specific classifier may be attached
239 to the split node.
242 static struct cbq_class *
243 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
245 struct cbq_sched_data *q = qdisc_priv(sch);
246 struct cbq_class *head = &q->link;
247 struct cbq_class **defmap;
248 struct cbq_class *cl = NULL;
249 u32 prio = skb->priority;
250 struct tcf_result res;
253 * Step 1. If skb->priority points to one of our classes, use it.
255 if (TC_H_MAJ(prio^sch->handle) == 0 &&
256 (cl = cbq_class_lookup(q, prio)) != NULL)
257 return cl;
259 *qerr = NET_XMIT_BYPASS;
260 for (;;) {
261 int result = 0;
262 defmap = head->defaults;
265 * Step 2+n. Apply classifier.
267 if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
268 goto fallback;
270 if ((cl = (void*)res.class) == NULL) {
271 if (TC_H_MAJ(res.classid))
272 cl = cbq_class_lookup(q, res.classid);
273 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
274 cl = defmap[TC_PRIO_BESTEFFORT];
276 if (cl == NULL || cl->level >= head->level)
277 goto fallback;
280 #ifdef CONFIG_NET_CLS_ACT
281 switch (result) {
282 case TC_ACT_QUEUED:
283 case TC_ACT_STOLEN:
284 *qerr = NET_XMIT_SUCCESS;
285 case TC_ACT_SHOT:
286 return NULL;
288 #elif defined(CONFIG_NET_CLS_POLICE)
289 switch (result) {
290 case TC_POLICE_RECLASSIFY:
291 return cbq_reclassify(skb, cl);
292 case TC_POLICE_SHOT:
293 return NULL;
294 default:
295 break;
297 #endif
298 if (cl->level == 0)
299 return cl;
302 * Step 3+n. If classifier selected a link sharing class,
303 * apply agency specific classifier.
304 * Repeat this procdure until we hit a leaf node.
306 head = cl;
309 fallback:
310 cl = head;
313 * Step 4. No success...
315 if (TC_H_MAJ(prio) == 0 &&
316 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
317 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
318 return head;
320 return cl;
324 A packet has just been enqueued on the empty class.
325 cbq_activate_class adds it to the tail of active class list
326 of its priority band.
329 static __inline__ void cbq_activate_class(struct cbq_class *cl)
331 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
332 int prio = cl->cpriority;
333 struct cbq_class *cl_tail;
335 cl_tail = q->active[prio];
336 q->active[prio] = cl;
338 if (cl_tail != NULL) {
339 cl->next_alive = cl_tail->next_alive;
340 cl_tail->next_alive = cl;
341 } else {
342 cl->next_alive = cl;
343 q->activemask |= (1<<prio);
348 Unlink class from active chain.
349 Note that this same procedure is done directly in cbq_dequeue*
350 during round-robin procedure.
353 static void cbq_deactivate_class(struct cbq_class *this)
355 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
356 int prio = this->cpriority;
357 struct cbq_class *cl;
358 struct cbq_class *cl_prev = q->active[prio];
360 do {
361 cl = cl_prev->next_alive;
362 if (cl == this) {
363 cl_prev->next_alive = cl->next_alive;
364 cl->next_alive = NULL;
366 if (cl == q->active[prio]) {
367 q->active[prio] = cl_prev;
368 if (cl == q->active[prio]) {
369 q->active[prio] = NULL;
370 q->activemask &= ~(1<<prio);
371 return;
375 cl = cl_prev->next_alive;
376 return;
378 } while ((cl_prev = cl) != q->active[prio]);
381 static void
382 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
384 int toplevel = q->toplevel;
386 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
387 psched_time_t now;
388 psched_tdiff_t incr;
390 PSCHED_GET_TIME(now);
391 incr = PSCHED_TDIFF(now, q->now_rt);
392 PSCHED_TADD2(q->now, incr, now);
394 do {
395 if (PSCHED_TLESS(cl->undertime, now)) {
396 q->toplevel = cl->level;
397 return;
399 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
403 static int
404 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
406 struct cbq_sched_data *q = qdisc_priv(sch);
407 int len = skb->len;
408 int ret;
409 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
411 #ifdef CONFIG_NET_CLS_POLICE
412 q->rx_class = cl;
413 #endif
414 if (cl == NULL) {
415 if (ret == NET_XMIT_BYPASS)
416 sch->qstats.drops++;
417 kfree_skb(skb);
418 return ret;
421 #ifdef CONFIG_NET_CLS_POLICE
422 cl->q->__parent = sch;
423 #endif
424 if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
425 sch->q.qlen++;
426 sch->bstats.packets++;
427 sch->bstats.bytes+=len;
428 cbq_mark_toplevel(q, cl);
429 if (!cl->next_alive)
430 cbq_activate_class(cl);
431 return ret;
434 sch->qstats.drops++;
435 cbq_mark_toplevel(q, cl);
436 cl->qstats.drops++;
437 return ret;
440 static int
441 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
443 struct cbq_sched_data *q = qdisc_priv(sch);
444 struct cbq_class *cl;
445 int ret;
447 if ((cl = q->tx_class) == NULL) {
448 kfree_skb(skb);
449 sch->qstats.drops++;
450 return NET_XMIT_CN;
452 q->tx_class = NULL;
454 cbq_mark_toplevel(q, cl);
456 #ifdef CONFIG_NET_CLS_POLICE
457 q->rx_class = cl;
458 cl->q->__parent = sch;
459 #endif
460 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
461 sch->q.qlen++;
462 sch->qstats.requeues++;
463 if (!cl->next_alive)
464 cbq_activate_class(cl);
465 return 0;
467 sch->qstats.drops++;
468 cl->qstats.drops++;
469 return ret;
472 /* Overlimit actions */
474 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
476 static void cbq_ovl_classic(struct cbq_class *cl)
478 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
479 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
481 if (!cl->delayed) {
482 delay += cl->offtime;
485 Class goes to sleep, so that it will have no
486 chance to work avgidle. Let's forgive it 8)
488 BTW cbq-2.0 has a crap in this
489 place, apparently they forgot to shift it by cl->ewma_log.
491 if (cl->avgidle < 0)
492 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
493 if (cl->avgidle < cl->minidle)
494 cl->avgidle = cl->minidle;
495 if (delay <= 0)
496 delay = 1;
497 PSCHED_TADD2(q->now, delay, cl->undertime);
499 cl->xstats.overactions++;
500 cl->delayed = 1;
502 if (q->wd_expires == 0 || q->wd_expires > delay)
503 q->wd_expires = delay;
505 /* Dirty work! We must schedule wakeups based on
506 real available rate, rather than leaf rate,
507 which may be tiny (even zero).
509 if (q->toplevel == TC_CBQ_MAXLEVEL) {
510 struct cbq_class *b;
511 psched_tdiff_t base_delay = q->wd_expires;
513 for (b = cl->borrow; b; b = b->borrow) {
514 delay = PSCHED_TDIFF(b->undertime, q->now);
515 if (delay < base_delay) {
516 if (delay <= 0)
517 delay = 1;
518 base_delay = delay;
522 q->wd_expires = base_delay;
526 /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
527 they go overlimit
530 static void cbq_ovl_rclassic(struct cbq_class *cl)
532 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
533 struct cbq_class *this = cl;
535 do {
536 if (cl->level > q->toplevel) {
537 cl = NULL;
538 break;
540 } while ((cl = cl->borrow) != NULL);
542 if (cl == NULL)
543 cl = this;
544 cbq_ovl_classic(cl);
547 /* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
549 static void cbq_ovl_delay(struct cbq_class *cl)
551 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
552 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
554 if (!cl->delayed) {
555 unsigned long sched = jiffies;
557 delay += cl->offtime;
558 if (cl->avgidle < 0)
559 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
560 if (cl->avgidle < cl->minidle)
561 cl->avgidle = cl->minidle;
562 PSCHED_TADD2(q->now, delay, cl->undertime);
564 if (delay > 0) {
565 sched += PSCHED_US2JIFFIE(delay) + cl->penalty;
566 cl->penalized = sched;
567 cl->cpriority = TC_CBQ_MAXPRIO;
568 q->pmask |= (1<<TC_CBQ_MAXPRIO);
569 if (del_timer(&q->delay_timer) &&
570 (long)(q->delay_timer.expires - sched) > 0)
571 q->delay_timer.expires = sched;
572 add_timer(&q->delay_timer);
573 cl->delayed = 1;
574 cl->xstats.overactions++;
575 return;
577 delay = 1;
579 if (q->wd_expires == 0 || q->wd_expires > delay)
580 q->wd_expires = delay;
583 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
585 static void cbq_ovl_lowprio(struct cbq_class *cl)
587 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
589 cl->penalized = jiffies + cl->penalty;
591 if (cl->cpriority != cl->priority2) {
592 cl->cpriority = cl->priority2;
593 q->pmask |= (1<<cl->cpriority);
594 cl->xstats.overactions++;
596 cbq_ovl_classic(cl);
599 /* TC_CBQ_OVL_DROP: penalize class by dropping */
601 static void cbq_ovl_drop(struct cbq_class *cl)
603 if (cl->q->ops->drop)
604 if (cl->q->ops->drop(cl->q))
605 cl->qdisc->q.qlen--;
606 cl->xstats.overactions++;
607 cbq_ovl_classic(cl);
610 static void cbq_watchdog(unsigned long arg)
612 struct Qdisc *sch = (struct Qdisc*)arg;
614 sch->flags &= ~TCQ_F_THROTTLED;
615 netif_schedule(sch->dev);
618 static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio)
620 struct cbq_class *cl;
621 struct cbq_class *cl_prev = q->active[prio];
622 unsigned long now = jiffies;
623 unsigned long sched = now;
625 if (cl_prev == NULL)
626 return now;
628 do {
629 cl = cl_prev->next_alive;
630 if ((long)(now - cl->penalized) > 0) {
631 cl_prev->next_alive = cl->next_alive;
632 cl->next_alive = NULL;
633 cl->cpriority = cl->priority;
634 cl->delayed = 0;
635 cbq_activate_class(cl);
637 if (cl == q->active[prio]) {
638 q->active[prio] = cl_prev;
639 if (cl == q->active[prio]) {
640 q->active[prio] = NULL;
641 return 0;
645 cl = cl_prev->next_alive;
646 } else if ((long)(sched - cl->penalized) > 0)
647 sched = cl->penalized;
648 } while ((cl_prev = cl) != q->active[prio]);
650 return (long)(sched - now);
653 static void cbq_undelay(unsigned long arg)
655 struct Qdisc *sch = (struct Qdisc*)arg;
656 struct cbq_sched_data *q = qdisc_priv(sch);
657 long delay = 0;
658 unsigned pmask;
660 pmask = q->pmask;
661 q->pmask = 0;
663 while (pmask) {
664 int prio = ffz(~pmask);
665 long tmp;
667 pmask &= ~(1<<prio);
669 tmp = cbq_undelay_prio(q, prio);
670 if (tmp > 0) {
671 q->pmask |= 1<<prio;
672 if (tmp < delay || delay == 0)
673 delay = tmp;
677 if (delay) {
678 q->delay_timer.expires = jiffies + delay;
679 add_timer(&q->delay_timer);
682 sch->flags &= ~TCQ_F_THROTTLED;
683 netif_schedule(sch->dev);
687 #ifdef CONFIG_NET_CLS_POLICE
689 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
691 int len = skb->len;
692 struct Qdisc *sch = child->__parent;
693 struct cbq_sched_data *q = qdisc_priv(sch);
694 struct cbq_class *cl = q->rx_class;
696 q->rx_class = NULL;
698 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
700 cbq_mark_toplevel(q, cl);
702 q->rx_class = cl;
703 cl->q->__parent = sch;
705 if (cl->q->enqueue(skb, cl->q) == 0) {
706 sch->q.qlen++;
707 sch->bstats.packets++;
708 sch->bstats.bytes+=len;
709 if (!cl->next_alive)
710 cbq_activate_class(cl);
711 return 0;
713 sch->qstats.drops++;
714 return 0;
717 sch->qstats.drops++;
718 return -1;
720 #endif
723 It is mission critical procedure.
725 We "regenerate" toplevel cutoff, if transmitting class
726 has backlog and it is not regulated. It is not part of
727 original CBQ description, but looks more reasonable.
728 Probably, it is wrong. This question needs further investigation.
731 static __inline__ void
732 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
733 struct cbq_class *borrowed)
735 if (cl && q->toplevel >= borrowed->level) {
736 if (cl->q->q.qlen > 1) {
737 do {
738 if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
739 q->toplevel = borrowed->level;
740 return;
742 } while ((borrowed=borrowed->borrow) != NULL);
744 #if 0
745 /* It is not necessary now. Uncommenting it
746 will save CPU cycles, but decrease fairness.
748 q->toplevel = TC_CBQ_MAXLEVEL;
749 #endif
753 static void
754 cbq_update(struct cbq_sched_data *q)
756 struct cbq_class *this = q->tx_class;
757 struct cbq_class *cl = this;
758 int len = q->tx_len;
760 q->tx_class = NULL;
762 for ( ; cl; cl = cl->share) {
763 long avgidle = cl->avgidle;
764 long idle;
766 cl->bstats.packets++;
767 cl->bstats.bytes += len;
770 (now - last) is total time between packet right edges.
771 (last_pktlen/rate) is "virtual" busy time, so that
773 idle = (now - last) - last_pktlen/rate
776 idle = PSCHED_TDIFF(q->now, cl->last);
777 if ((unsigned long)idle > 128*1024*1024) {
778 avgidle = cl->maxidle;
779 } else {
780 idle -= L2T(cl, len);
782 /* true_avgidle := (1-W)*true_avgidle + W*idle,
783 where W=2^{-ewma_log}. But cl->avgidle is scaled:
784 cl->avgidle == true_avgidle/W,
785 hence:
787 avgidle += idle - (avgidle>>cl->ewma_log);
790 if (avgidle <= 0) {
791 /* Overlimit or at-limit */
793 if (avgidle < cl->minidle)
794 avgidle = cl->minidle;
796 cl->avgidle = avgidle;
798 /* Calculate expected time, when this class
799 will be allowed to send.
800 It will occur, when:
801 (1-W)*true_avgidle + W*delay = 0, i.e.
802 idle = (1/W - 1)*(-true_avgidle)
804 idle = (1 - W)*(-cl->avgidle);
806 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
809 That is not all.
810 To maintain the rate allocated to the class,
811 we add to undertime virtual clock,
812 necessary to complete transmitted packet.
813 (len/phys_bandwidth has been already passed
814 to the moment of cbq_update)
817 idle -= L2T(&q->link, len);
818 idle += L2T(cl, len);
820 PSCHED_AUDIT_TDIFF(idle);
822 PSCHED_TADD2(q->now, idle, cl->undertime);
823 } else {
824 /* Underlimit */
826 PSCHED_SET_PASTPERFECT(cl->undertime);
827 if (avgidle > cl->maxidle)
828 cl->avgidle = cl->maxidle;
829 else
830 cl->avgidle = avgidle;
832 cl->last = q->now;
835 cbq_update_toplevel(q, this, q->tx_borrowed);
838 static __inline__ struct cbq_class *
839 cbq_under_limit(struct cbq_class *cl)
841 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
842 struct cbq_class *this_cl = cl;
844 if (cl->tparent == NULL)
845 return cl;
847 if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
848 !PSCHED_TLESS(q->now, cl->undertime)) {
849 cl->delayed = 0;
850 return cl;
853 do {
854 /* It is very suspicious place. Now overlimit
855 action is generated for not bounded classes
856 only if link is completely congested.
857 Though it is in agree with ancestor-only paradigm,
858 it looks very stupid. Particularly,
859 it means that this chunk of code will either
860 never be called or result in strong amplification
861 of burstiness. Dangerous, silly, and, however,
862 no another solution exists.
864 if ((cl = cl->borrow) == NULL) {
865 this_cl->qstats.overlimits++;
866 this_cl->overlimit(this_cl);
867 return NULL;
869 if (cl->level > q->toplevel)
870 return NULL;
871 } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
872 PSCHED_TLESS(q->now, cl->undertime));
874 cl->delayed = 0;
875 return cl;
878 static __inline__ struct sk_buff *
879 cbq_dequeue_prio(struct Qdisc *sch, int prio)
881 struct cbq_sched_data *q = qdisc_priv(sch);
882 struct cbq_class *cl_tail, *cl_prev, *cl;
883 struct sk_buff *skb;
884 int deficit;
886 cl_tail = cl_prev = q->active[prio];
887 cl = cl_prev->next_alive;
889 do {
890 deficit = 0;
892 /* Start round */
893 do {
894 struct cbq_class *borrow = cl;
896 if (cl->q->q.qlen &&
897 (borrow = cbq_under_limit(cl)) == NULL)
898 goto skip_class;
900 if (cl->deficit <= 0) {
901 /* Class exhausted its allotment per
902 this round. Switch to the next one.
904 deficit = 1;
905 cl->deficit += cl->quantum;
906 goto next_class;
909 skb = cl->q->dequeue(cl->q);
911 /* Class did not give us any skb :-(
912 It could occur even if cl->q->q.qlen != 0
913 f.e. if cl->q == "tbf"
915 if (skb == NULL)
916 goto skip_class;
918 cl->deficit -= skb->len;
919 q->tx_class = cl;
920 q->tx_borrowed = borrow;
921 if (borrow != cl) {
922 #ifndef CBQ_XSTATS_BORROWS_BYTES
923 borrow->xstats.borrows++;
924 cl->xstats.borrows++;
925 #else
926 borrow->xstats.borrows += skb->len;
927 cl->xstats.borrows += skb->len;
928 #endif
930 q->tx_len = skb->len;
932 if (cl->deficit <= 0) {
933 q->active[prio] = cl;
934 cl = cl->next_alive;
935 cl->deficit += cl->quantum;
937 return skb;
939 skip_class:
940 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
941 /* Class is empty or penalized.
942 Unlink it from active chain.
944 cl_prev->next_alive = cl->next_alive;
945 cl->next_alive = NULL;
947 /* Did cl_tail point to it? */
948 if (cl == cl_tail) {
949 /* Repair it! */
950 cl_tail = cl_prev;
952 /* Was it the last class in this band? */
953 if (cl == cl_tail) {
954 /* Kill the band! */
955 q->active[prio] = NULL;
956 q->activemask &= ~(1<<prio);
957 if (cl->q->q.qlen)
958 cbq_activate_class(cl);
959 return NULL;
962 q->active[prio] = cl_tail;
964 if (cl->q->q.qlen)
965 cbq_activate_class(cl);
967 cl = cl_prev;
970 next_class:
971 cl_prev = cl;
972 cl = cl->next_alive;
973 } while (cl_prev != cl_tail);
974 } while (deficit);
976 q->active[prio] = cl_prev;
978 return NULL;
981 static __inline__ struct sk_buff *
982 cbq_dequeue_1(struct Qdisc *sch)
984 struct cbq_sched_data *q = qdisc_priv(sch);
985 struct sk_buff *skb;
986 unsigned activemask;
988 activemask = q->activemask&0xFF;
989 while (activemask) {
990 int prio = ffz(~activemask);
991 activemask &= ~(1<<prio);
992 skb = cbq_dequeue_prio(sch, prio);
993 if (skb)
994 return skb;
996 return NULL;
999 static struct sk_buff *
1000 cbq_dequeue(struct Qdisc *sch)
1002 struct sk_buff *skb;
1003 struct cbq_sched_data *q = qdisc_priv(sch);
1004 psched_time_t now;
1005 psched_tdiff_t incr;
1007 PSCHED_GET_TIME(now);
1008 incr = PSCHED_TDIFF(now, q->now_rt);
1010 if (q->tx_class) {
1011 psched_tdiff_t incr2;
1012 /* Time integrator. We calculate EOS time
1013 by adding expected packet transmission time.
1014 If real time is greater, we warp artificial clock,
1015 so that:
1017 cbq_time = max(real_time, work);
1019 incr2 = L2T(&q->link, q->tx_len);
1020 PSCHED_TADD(q->now, incr2);
1021 cbq_update(q);
1022 if ((incr -= incr2) < 0)
1023 incr = 0;
1025 PSCHED_TADD(q->now, incr);
1026 q->now_rt = now;
1028 for (;;) {
1029 q->wd_expires = 0;
1031 skb = cbq_dequeue_1(sch);
1032 if (skb) {
1033 sch->q.qlen--;
1034 sch->flags &= ~TCQ_F_THROTTLED;
1035 return skb;
1038 /* All the classes are overlimit.
1040 It is possible, if:
1042 1. Scheduler is empty.
1043 2. Toplevel cutoff inhibited borrowing.
1044 3. Root class is overlimit.
1046 Reset 2d and 3d conditions and retry.
1048 Note, that NS and cbq-2.0 are buggy, peeking
1049 an arbitrary class is appropriate for ancestor-only
1050 sharing, but not for toplevel algorithm.
1052 Our version is better, but slower, because it requires
1053 two passes, but it is unavoidable with top-level sharing.
1056 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1057 PSCHED_IS_PASTPERFECT(q->link.undertime))
1058 break;
1060 q->toplevel = TC_CBQ_MAXLEVEL;
1061 PSCHED_SET_PASTPERFECT(q->link.undertime);
1064 /* No packets in scheduler or nobody wants to give them to us :-(
1065 Sigh... start watchdog timer in the last case. */
1067 if (sch->q.qlen) {
1068 sch->qstats.overlimits++;
1069 if (q->wd_expires) {
1070 long delay = PSCHED_US2JIFFIE(q->wd_expires);
1071 if (delay <= 0)
1072 delay = 1;
1073 mod_timer(&q->wd_timer, jiffies + delay);
1074 sch->flags |= TCQ_F_THROTTLED;
1077 return NULL;
1080 /* CBQ class maintanance routines */
1082 static void cbq_adjust_levels(struct cbq_class *this)
1084 if (this == NULL)
1085 return;
1087 do {
1088 int level = 0;
1089 struct cbq_class *cl;
1091 if ((cl = this->children) != NULL) {
1092 do {
1093 if (cl->level > level)
1094 level = cl->level;
1095 } while ((cl = cl->sibling) != this->children);
1097 this->level = level+1;
1098 } while ((this = this->tparent) != NULL);
1101 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1103 struct cbq_class *cl;
1104 unsigned h;
1106 if (q->quanta[prio] == 0)
1107 return;
1109 for (h=0; h<16; h++) {
1110 for (cl = q->classes[h]; cl; cl = cl->next) {
1111 /* BUGGGG... Beware! This expression suffer of
1112 arithmetic overflows!
1114 if (cl->priority == prio) {
1115 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1116 q->quanta[prio];
1118 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1119 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1120 cl->quantum = cl->qdisc->dev->mtu/2 + 1;
1126 static void cbq_sync_defmap(struct cbq_class *cl)
1128 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1129 struct cbq_class *split = cl->split;
1130 unsigned h;
1131 int i;
1133 if (split == NULL)
1134 return;
1136 for (i=0; i<=TC_PRIO_MAX; i++) {
1137 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1138 split->defaults[i] = NULL;
1141 for (i=0; i<=TC_PRIO_MAX; i++) {
1142 int level = split->level;
1144 if (split->defaults[i])
1145 continue;
1147 for (h=0; h<16; h++) {
1148 struct cbq_class *c;
1150 for (c = q->classes[h]; c; c = c->next) {
1151 if (c->split == split && c->level < level &&
1152 c->defmap&(1<<i)) {
1153 split->defaults[i] = c;
1154 level = c->level;
1161 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1163 struct cbq_class *split = NULL;
1165 if (splitid == 0) {
1166 if ((split = cl->split) == NULL)
1167 return;
1168 splitid = split->classid;
1171 if (split == NULL || split->classid != splitid) {
1172 for (split = cl->tparent; split; split = split->tparent)
1173 if (split->classid == splitid)
1174 break;
1177 if (split == NULL)
1178 return;
1180 if (cl->split != split) {
1181 cl->defmap = 0;
1182 cbq_sync_defmap(cl);
1183 cl->split = split;
1184 cl->defmap = def&mask;
1185 } else
1186 cl->defmap = (cl->defmap&~mask)|(def&mask);
1188 cbq_sync_defmap(cl);
1191 static void cbq_unlink_class(struct cbq_class *this)
1193 struct cbq_class *cl, **clp;
1194 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1196 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1197 if (cl == this) {
1198 *clp = cl->next;
1199 cl->next = NULL;
1200 break;
1204 if (this->tparent) {
1205 clp=&this->sibling;
1206 cl = *clp;
1207 do {
1208 if (cl == this) {
1209 *clp = cl->sibling;
1210 break;
1212 clp = &cl->sibling;
1213 } while ((cl = *clp) != this->sibling);
1215 if (this->tparent->children == this) {
1216 this->tparent->children = this->sibling;
1217 if (this->sibling == this)
1218 this->tparent->children = NULL;
1220 } else {
1221 BUG_TRAP(this->sibling == this);
1225 static void cbq_link_class(struct cbq_class *this)
1227 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1228 unsigned h = cbq_hash(this->classid);
1229 struct cbq_class *parent = this->tparent;
1231 this->sibling = this;
1232 this->next = q->classes[h];
1233 q->classes[h] = this;
1235 if (parent == NULL)
1236 return;
1238 if (parent->children == NULL) {
1239 parent->children = this;
1240 } else {
1241 this->sibling = parent->children->sibling;
1242 parent->children->sibling = this;
1246 static unsigned int cbq_drop(struct Qdisc* sch)
1248 struct cbq_sched_data *q = qdisc_priv(sch);
1249 struct cbq_class *cl, *cl_head;
1250 int prio;
1251 unsigned int len;
1253 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1254 if ((cl_head = q->active[prio]) == NULL)
1255 continue;
1257 cl = cl_head;
1258 do {
1259 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1260 sch->q.qlen--;
1261 return len;
1263 } while ((cl = cl->next_alive) != cl_head);
1265 return 0;
1268 static void
1269 cbq_reset(struct Qdisc* sch)
1271 struct cbq_sched_data *q = qdisc_priv(sch);
1272 struct cbq_class *cl;
1273 int prio;
1274 unsigned h;
1276 q->activemask = 0;
1277 q->pmask = 0;
1278 q->tx_class = NULL;
1279 q->tx_borrowed = NULL;
1280 del_timer(&q->wd_timer);
1281 del_timer(&q->delay_timer);
1282 q->toplevel = TC_CBQ_MAXLEVEL;
1283 PSCHED_GET_TIME(q->now);
1284 q->now_rt = q->now;
1286 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1287 q->active[prio] = NULL;
1289 for (h = 0; h < 16; h++) {
1290 for (cl = q->classes[h]; cl; cl = cl->next) {
1291 qdisc_reset(cl->q);
1293 cl->next_alive = NULL;
1294 PSCHED_SET_PASTPERFECT(cl->undertime);
1295 cl->avgidle = cl->maxidle;
1296 cl->deficit = cl->quantum;
1297 cl->cpriority = cl->priority;
1300 sch->q.qlen = 0;
1304 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1306 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1307 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1308 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1310 if (lss->change&TCF_CBQ_LSS_EWMA)
1311 cl->ewma_log = lss->ewma_log;
1312 if (lss->change&TCF_CBQ_LSS_AVPKT)
1313 cl->avpkt = lss->avpkt;
1314 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1315 cl->minidle = -(long)lss->minidle;
1316 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1317 cl->maxidle = lss->maxidle;
1318 cl->avgidle = lss->maxidle;
1320 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1321 cl->offtime = lss->offtime;
1322 return 0;
1325 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1327 q->nclasses[cl->priority]--;
1328 q->quanta[cl->priority] -= cl->weight;
1329 cbq_normalize_quanta(q, cl->priority);
1332 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1334 q->nclasses[cl->priority]++;
1335 q->quanta[cl->priority] += cl->weight;
1336 cbq_normalize_quanta(q, cl->priority);
1339 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1341 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1343 if (wrr->allot)
1344 cl->allot = wrr->allot;
1345 if (wrr->weight)
1346 cl->weight = wrr->weight;
1347 if (wrr->priority) {
1348 cl->priority = wrr->priority-1;
1349 cl->cpriority = cl->priority;
1350 if (cl->priority >= cl->priority2)
1351 cl->priority2 = TC_CBQ_MAXPRIO-1;
1354 cbq_addprio(q, cl);
1355 return 0;
1358 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1360 switch (ovl->strategy) {
1361 case TC_CBQ_OVL_CLASSIC:
1362 cl->overlimit = cbq_ovl_classic;
1363 break;
1364 case TC_CBQ_OVL_DELAY:
1365 cl->overlimit = cbq_ovl_delay;
1366 break;
1367 case TC_CBQ_OVL_LOWPRIO:
1368 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1369 ovl->priority2-1 <= cl->priority)
1370 return -EINVAL;
1371 cl->priority2 = ovl->priority2-1;
1372 cl->overlimit = cbq_ovl_lowprio;
1373 break;
1374 case TC_CBQ_OVL_DROP:
1375 cl->overlimit = cbq_ovl_drop;
1376 break;
1377 case TC_CBQ_OVL_RCLASSIC:
1378 cl->overlimit = cbq_ovl_rclassic;
1379 break;
1380 default:
1381 return -EINVAL;
1383 cl->penalty = (ovl->penalty*HZ)/1000;
1384 return 0;
1387 #ifdef CONFIG_NET_CLS_POLICE
1388 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1390 cl->police = p->police;
1392 if (cl->q->handle) {
1393 if (p->police == TC_POLICE_RECLASSIFY)
1394 cl->q->reshape_fail = cbq_reshape_fail;
1395 else
1396 cl->q->reshape_fail = NULL;
1398 return 0;
1400 #endif
1402 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1404 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1405 return 0;
1408 static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1410 struct cbq_sched_data *q = qdisc_priv(sch);
1411 struct rtattr *tb[TCA_CBQ_MAX];
1412 struct tc_ratespec *r;
1414 if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 ||
1415 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1416 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1417 return -EINVAL;
1419 if (tb[TCA_CBQ_LSSOPT-1] &&
1420 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1421 return -EINVAL;
1423 r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1425 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL)
1426 return -EINVAL;
1428 q->link.refcnt = 1;
1429 q->link.sibling = &q->link;
1430 q->link.classid = sch->handle;
1431 q->link.qdisc = sch;
1432 if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1433 q->link.q = &noop_qdisc;
1435 q->link.priority = TC_CBQ_MAXPRIO-1;
1436 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1437 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1438 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1439 q->link.overlimit = cbq_ovl_classic;
1440 q->link.allot = psched_mtu(sch->dev);
1441 q->link.quantum = q->link.allot;
1442 q->link.weight = q->link.R_tab->rate.rate;
1444 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1445 q->link.avpkt = q->link.allot/2;
1446 q->link.minidle = -0x7FFFFFFF;
1447 q->link.stats_lock = &sch->dev->queue_lock;
1449 init_timer(&q->wd_timer);
1450 q->wd_timer.data = (unsigned long)sch;
1451 q->wd_timer.function = cbq_watchdog;
1452 init_timer(&q->delay_timer);
1453 q->delay_timer.data = (unsigned long)sch;
1454 q->delay_timer.function = cbq_undelay;
1455 q->toplevel = TC_CBQ_MAXLEVEL;
1456 PSCHED_GET_TIME(q->now);
1457 q->now_rt = q->now;
1459 cbq_link_class(&q->link);
1461 if (tb[TCA_CBQ_LSSOPT-1])
1462 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1464 cbq_addprio(q, &q->link);
1465 return 0;
1468 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1470 unsigned char *b = skb->tail;
1472 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1473 return skb->len;
1475 rtattr_failure:
1476 skb_trim(skb, b - skb->data);
1477 return -1;
1480 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1482 unsigned char *b = skb->tail;
1483 struct tc_cbq_lssopt opt;
1485 opt.flags = 0;
1486 if (cl->borrow == NULL)
1487 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1488 if (cl->share == NULL)
1489 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1490 opt.ewma_log = cl->ewma_log;
1491 opt.level = cl->level;
1492 opt.avpkt = cl->avpkt;
1493 opt.maxidle = cl->maxidle;
1494 opt.minidle = (u32)(-cl->minidle);
1495 opt.offtime = cl->offtime;
1496 opt.change = ~0;
1497 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1498 return skb->len;
1500 rtattr_failure:
1501 skb_trim(skb, b - skb->data);
1502 return -1;
1505 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1507 unsigned char *b = skb->tail;
1508 struct tc_cbq_wrropt opt;
1510 opt.flags = 0;
1511 opt.allot = cl->allot;
1512 opt.priority = cl->priority+1;
1513 opt.cpriority = cl->cpriority+1;
1514 opt.weight = cl->weight;
1515 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1516 return skb->len;
1518 rtattr_failure:
1519 skb_trim(skb, b - skb->data);
1520 return -1;
1523 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1525 unsigned char *b = skb->tail;
1526 struct tc_cbq_ovl opt;
1528 opt.strategy = cl->ovl_strategy;
1529 opt.priority2 = cl->priority2+1;
1530 opt.pad = 0;
1531 opt.penalty = (cl->penalty*1000)/HZ;
1532 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1533 return skb->len;
1535 rtattr_failure:
1536 skb_trim(skb, b - skb->data);
1537 return -1;
1540 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1542 unsigned char *b = skb->tail;
1543 struct tc_cbq_fopt opt;
1545 if (cl->split || cl->defmap) {
1546 opt.split = cl->split ? cl->split->classid : 0;
1547 opt.defmap = cl->defmap;
1548 opt.defchange = ~0;
1549 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1551 return skb->len;
1553 rtattr_failure:
1554 skb_trim(skb, b - skb->data);
1555 return -1;
1558 #ifdef CONFIG_NET_CLS_POLICE
1559 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1561 unsigned char *b = skb->tail;
1562 struct tc_cbq_police opt;
1564 if (cl->police) {
1565 opt.police = cl->police;
1566 opt.__res1 = 0;
1567 opt.__res2 = 0;
1568 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1570 return skb->len;
1572 rtattr_failure:
1573 skb_trim(skb, b - skb->data);
1574 return -1;
1576 #endif
1578 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1580 if (cbq_dump_lss(skb, cl) < 0 ||
1581 cbq_dump_rate(skb, cl) < 0 ||
1582 cbq_dump_wrr(skb, cl) < 0 ||
1583 cbq_dump_ovl(skb, cl) < 0 ||
1584 #ifdef CONFIG_NET_CLS_POLICE
1585 cbq_dump_police(skb, cl) < 0 ||
1586 #endif
1587 cbq_dump_fopt(skb, cl) < 0)
1588 return -1;
1589 return 0;
1592 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1594 struct cbq_sched_data *q = qdisc_priv(sch);
1595 unsigned char *b = skb->tail;
1596 struct rtattr *rta;
1598 rta = (struct rtattr*)b;
1599 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1600 if (cbq_dump_attr(skb, &q->link) < 0)
1601 goto rtattr_failure;
1602 rta->rta_len = skb->tail - b;
1603 return skb->len;
1605 rtattr_failure:
1606 skb_trim(skb, b - skb->data);
1607 return -1;
1610 static int
1611 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1613 struct cbq_sched_data *q = qdisc_priv(sch);
1615 q->link.xstats.avgidle = q->link.avgidle;
1616 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1619 static int
1620 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1621 struct sk_buff *skb, struct tcmsg *tcm)
1623 struct cbq_class *cl = (struct cbq_class*)arg;
1624 unsigned char *b = skb->tail;
1625 struct rtattr *rta;
1627 if (cl->tparent)
1628 tcm->tcm_parent = cl->tparent->classid;
1629 else
1630 tcm->tcm_parent = TC_H_ROOT;
1631 tcm->tcm_handle = cl->classid;
1632 tcm->tcm_info = cl->q->handle;
1634 rta = (struct rtattr*)b;
1635 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1636 if (cbq_dump_attr(skb, cl) < 0)
1637 goto rtattr_failure;
1638 rta->rta_len = skb->tail - b;
1639 return skb->len;
1641 rtattr_failure:
1642 skb_trim(skb, b - skb->data);
1643 return -1;
1646 static int
1647 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1648 struct gnet_dump *d)
1650 struct cbq_sched_data *q = qdisc_priv(sch);
1651 struct cbq_class *cl = (struct cbq_class*)arg;
1653 cl->qstats.qlen = cl->q->q.qlen;
1654 cl->xstats.avgidle = cl->avgidle;
1655 cl->xstats.undertime = 0;
1657 if (!PSCHED_IS_PASTPERFECT(cl->undertime))
1658 cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
1660 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1661 #ifdef CONFIG_NET_ESTIMATOR
1662 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1663 #endif
1664 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1665 return -1;
1667 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1670 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1671 struct Qdisc **old)
1673 struct cbq_class *cl = (struct cbq_class*)arg;
1675 if (cl) {
1676 if (new == NULL) {
1677 if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL)
1678 return -ENOBUFS;
1679 } else {
1680 #ifdef CONFIG_NET_CLS_POLICE
1681 if (cl->police == TC_POLICE_RECLASSIFY)
1682 new->reshape_fail = cbq_reshape_fail;
1683 #endif
1685 sch_tree_lock(sch);
1686 *old = cl->q;
1687 cl->q = new;
1688 sch->q.qlen -= (*old)->q.qlen;
1689 qdisc_reset(*old);
1690 sch_tree_unlock(sch);
1692 return 0;
1694 return -ENOENT;
1697 static struct Qdisc *
1698 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1700 struct cbq_class *cl = (struct cbq_class*)arg;
1702 return cl ? cl->q : NULL;
1705 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1707 struct cbq_sched_data *q = qdisc_priv(sch);
1708 struct cbq_class *cl = cbq_class_lookup(q, classid);
1710 if (cl) {
1711 cl->refcnt++;
1712 return (unsigned long)cl;
1714 return 0;
1717 static void cbq_destroy_filters(struct cbq_class *cl)
1719 struct tcf_proto *tp;
1721 while ((tp = cl->filter_list) != NULL) {
1722 cl->filter_list = tp->next;
1723 tcf_destroy(tp);
1727 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1729 struct cbq_sched_data *q = qdisc_priv(sch);
1731 BUG_TRAP(!cl->filters);
1733 cbq_destroy_filters(cl);
1734 qdisc_destroy(cl->q);
1735 qdisc_put_rtab(cl->R_tab);
1736 #ifdef CONFIG_NET_ESTIMATOR
1737 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1738 #endif
1739 if (cl != &q->link)
1740 kfree(cl);
1743 static void
1744 cbq_destroy(struct Qdisc* sch)
1746 struct cbq_sched_data *q = qdisc_priv(sch);
1747 struct cbq_class *cl;
1748 unsigned h;
1750 #ifdef CONFIG_NET_CLS_POLICE
1751 q->rx_class = NULL;
1752 #endif
1754 * Filters must be destroyed first because we don't destroy the
1755 * classes from root to leafs which means that filters can still
1756 * be bound to classes which have been destroyed already. --TGR '04
1758 for (h = 0; h < 16; h++)
1759 for (cl = q->classes[h]; cl; cl = cl->next)
1760 cbq_destroy_filters(cl);
1762 for (h = 0; h < 16; h++) {
1763 struct cbq_class *next;
1765 for (cl = q->classes[h]; cl; cl = next) {
1766 next = cl->next;
1767 cbq_destroy_class(sch, cl);
1772 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1774 struct cbq_class *cl = (struct cbq_class*)arg;
1776 if (--cl->refcnt == 0) {
1777 #ifdef CONFIG_NET_CLS_POLICE
1778 struct cbq_sched_data *q = qdisc_priv(sch);
1780 spin_lock_bh(&sch->dev->queue_lock);
1781 if (q->rx_class == cl)
1782 q->rx_class = NULL;
1783 spin_unlock_bh(&sch->dev->queue_lock);
1784 #endif
1786 cbq_destroy_class(sch, cl);
1790 static int
1791 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1792 unsigned long *arg)
1794 int err;
1795 struct cbq_sched_data *q = qdisc_priv(sch);
1796 struct cbq_class *cl = (struct cbq_class*)*arg;
1797 struct rtattr *opt = tca[TCA_OPTIONS-1];
1798 struct rtattr *tb[TCA_CBQ_MAX];
1799 struct cbq_class *parent;
1800 struct qdisc_rate_table *rtab = NULL;
1802 if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt))
1803 return -EINVAL;
1805 if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1806 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1807 return -EINVAL;
1809 if (tb[TCA_CBQ_FOPT-1] &&
1810 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1811 return -EINVAL;
1813 if (tb[TCA_CBQ_RATE-1] &&
1814 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1815 return -EINVAL;
1817 if (tb[TCA_CBQ_LSSOPT-1] &&
1818 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1819 return -EINVAL;
1821 if (tb[TCA_CBQ_WRROPT-1] &&
1822 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1823 return -EINVAL;
1825 #ifdef CONFIG_NET_CLS_POLICE
1826 if (tb[TCA_CBQ_POLICE-1] &&
1827 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1828 return -EINVAL;
1829 #endif
1831 if (cl) {
1832 /* Check parent */
1833 if (parentid) {
1834 if (cl->tparent && cl->tparent->classid != parentid)
1835 return -EINVAL;
1836 if (!cl->tparent && parentid != TC_H_ROOT)
1837 return -EINVAL;
1840 if (tb[TCA_CBQ_RATE-1]) {
1841 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1842 if (rtab == NULL)
1843 return -EINVAL;
1846 /* Change class parameters */
1847 sch_tree_lock(sch);
1849 if (cl->next_alive != NULL)
1850 cbq_deactivate_class(cl);
1852 if (rtab) {
1853 rtab = xchg(&cl->R_tab, rtab);
1854 qdisc_put_rtab(rtab);
1857 if (tb[TCA_CBQ_LSSOPT-1])
1858 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1860 if (tb[TCA_CBQ_WRROPT-1]) {
1861 cbq_rmprio(q, cl);
1862 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1865 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1866 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1868 #ifdef CONFIG_NET_CLS_POLICE
1869 if (tb[TCA_CBQ_POLICE-1])
1870 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1871 #endif
1873 if (tb[TCA_CBQ_FOPT-1])
1874 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1876 if (cl->q->q.qlen)
1877 cbq_activate_class(cl);
1879 sch_tree_unlock(sch);
1881 #ifdef CONFIG_NET_ESTIMATOR
1882 if (tca[TCA_RATE-1])
1883 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1884 cl->stats_lock, tca[TCA_RATE-1]);
1885 #endif
1886 return 0;
1889 if (parentid == TC_H_ROOT)
1890 return -EINVAL;
1892 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1893 tb[TCA_CBQ_LSSOPT-1] == NULL)
1894 return -EINVAL;
1896 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1897 if (rtab == NULL)
1898 return -EINVAL;
1900 if (classid) {
1901 err = -EINVAL;
1902 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1903 goto failure;
1904 } else {
1905 int i;
1906 classid = TC_H_MAKE(sch->handle,0x8000);
1908 for (i=0; i<0x8000; i++) {
1909 if (++q->hgenerator >= 0x8000)
1910 q->hgenerator = 1;
1911 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1912 break;
1914 err = -ENOSR;
1915 if (i >= 0x8000)
1916 goto failure;
1917 classid = classid|q->hgenerator;
1920 parent = &q->link;
1921 if (parentid) {
1922 parent = cbq_class_lookup(q, parentid);
1923 err = -EINVAL;
1924 if (parent == NULL)
1925 goto failure;
1928 err = -ENOBUFS;
1929 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1930 if (cl == NULL)
1931 goto failure;
1932 cl->R_tab = rtab;
1933 rtab = NULL;
1934 cl->refcnt = 1;
1935 if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1936 cl->q = &noop_qdisc;
1937 cl->classid = classid;
1938 cl->tparent = parent;
1939 cl->qdisc = sch;
1940 cl->allot = parent->allot;
1941 cl->quantum = cl->allot;
1942 cl->weight = cl->R_tab->rate.rate;
1943 cl->stats_lock = &sch->dev->queue_lock;
1945 sch_tree_lock(sch);
1946 cbq_link_class(cl);
1947 cl->borrow = cl->tparent;
1948 if (cl->tparent != &q->link)
1949 cl->share = cl->tparent;
1950 cbq_adjust_levels(parent);
1951 cl->minidle = -0x7FFFFFFF;
1952 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1953 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1954 if (cl->ewma_log==0)
1955 cl->ewma_log = q->link.ewma_log;
1956 if (cl->maxidle==0)
1957 cl->maxidle = q->link.maxidle;
1958 if (cl->avpkt==0)
1959 cl->avpkt = q->link.avpkt;
1960 cl->overlimit = cbq_ovl_classic;
1961 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1962 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1963 #ifdef CONFIG_NET_CLS_POLICE
1964 if (tb[TCA_CBQ_POLICE-1])
1965 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1966 #endif
1967 if (tb[TCA_CBQ_FOPT-1])
1968 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1969 sch_tree_unlock(sch);
1971 #ifdef CONFIG_NET_ESTIMATOR
1972 if (tca[TCA_RATE-1])
1973 gen_new_estimator(&cl->bstats, &cl->rate_est,
1974 cl->stats_lock, tca[TCA_RATE-1]);
1975 #endif
1977 *arg = (unsigned long)cl;
1978 return 0;
1980 failure:
1981 qdisc_put_rtab(rtab);
1982 return err;
1985 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1987 struct cbq_sched_data *q = qdisc_priv(sch);
1988 struct cbq_class *cl = (struct cbq_class*)arg;
1990 if (cl->filters || cl->children || cl == &q->link)
1991 return -EBUSY;
1993 sch_tree_lock(sch);
1995 if (cl->next_alive)
1996 cbq_deactivate_class(cl);
1998 if (q->tx_borrowed == cl)
1999 q->tx_borrowed = q->tx_class;
2000 if (q->tx_class == cl) {
2001 q->tx_class = NULL;
2002 q->tx_borrowed = NULL;
2004 #ifdef CONFIG_NET_CLS_POLICE
2005 if (q->rx_class == cl)
2006 q->rx_class = NULL;
2007 #endif
2009 cbq_unlink_class(cl);
2010 cbq_adjust_levels(cl->tparent);
2011 cl->defmap = 0;
2012 cbq_sync_defmap(cl);
2014 cbq_rmprio(q, cl);
2015 sch_tree_unlock(sch);
2017 if (--cl->refcnt == 0)
2018 cbq_destroy_class(sch, cl);
2020 return 0;
2023 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2025 struct cbq_sched_data *q = qdisc_priv(sch);
2026 struct cbq_class *cl = (struct cbq_class *)arg;
2028 if (cl == NULL)
2029 cl = &q->link;
2031 return &cl->filter_list;
2034 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2035 u32 classid)
2037 struct cbq_sched_data *q = qdisc_priv(sch);
2038 struct cbq_class *p = (struct cbq_class*)parent;
2039 struct cbq_class *cl = cbq_class_lookup(q, classid);
2041 if (cl) {
2042 if (p && p->level <= cl->level)
2043 return 0;
2044 cl->filters++;
2045 return (unsigned long)cl;
2047 return 0;
2050 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2052 struct cbq_class *cl = (struct cbq_class*)arg;
2054 cl->filters--;
2057 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2059 struct cbq_sched_data *q = qdisc_priv(sch);
2060 unsigned h;
2062 if (arg->stop)
2063 return;
2065 for (h = 0; h < 16; h++) {
2066 struct cbq_class *cl;
2068 for (cl = q->classes[h]; cl; cl = cl->next) {
2069 if (arg->count < arg->skip) {
2070 arg->count++;
2071 continue;
2073 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2074 arg->stop = 1;
2075 return;
2077 arg->count++;
2082 static struct Qdisc_class_ops cbq_class_ops = {
2083 .graft = cbq_graft,
2084 .leaf = cbq_leaf,
2085 .get = cbq_get,
2086 .put = cbq_put,
2087 .change = cbq_change_class,
2088 .delete = cbq_delete,
2089 .walk = cbq_walk,
2090 .tcf_chain = cbq_find_tcf,
2091 .bind_tcf = cbq_bind_filter,
2092 .unbind_tcf = cbq_unbind_filter,
2093 .dump = cbq_dump_class,
2094 .dump_stats = cbq_dump_class_stats,
2097 static struct Qdisc_ops cbq_qdisc_ops = {
2098 .next = NULL,
2099 .cl_ops = &cbq_class_ops,
2100 .id = "cbq",
2101 .priv_size = sizeof(struct cbq_sched_data),
2102 .enqueue = cbq_enqueue,
2103 .dequeue = cbq_dequeue,
2104 .requeue = cbq_requeue,
2105 .drop = cbq_drop,
2106 .init = cbq_init,
2107 .reset = cbq_reset,
2108 .destroy = cbq_destroy,
2109 .change = NULL,
2110 .dump = cbq_dump,
2111 .dump_stats = cbq_dump_stats,
2112 .owner = THIS_MODULE,
2115 static int __init cbq_module_init(void)
2117 return register_qdisc(&cbq_qdisc_ops);
2119 static void __exit cbq_module_exit(void)
2121 unregister_qdisc(&cbq_qdisc_ops);
2123 module_init(cbq_module_init)
2124 module_exit(cbq_module_exit)
2125 MODULE_LICENSE("GPL");