/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)
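
	For example (illustrative numbers, not from the original text):
	with R = 1 Mbyte/sec and B = 10 Kbytes, a burst of ten back-to-back
	1 Kbyte packets conforms (10 Kbytes <= B), while any one-second
	interval may carry at most B + R*1 sec = 10 Kbytes + 1 Mbyte.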

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.
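
	As a rough illustration only (a sketch in continuous time, not the
	integer token arithmetic used in tbf_dequeue() below; N and t_prev
	are caller-kept state, with N starting at B/R):

		int tbf_conforms(double t, double S, double B, double R,
				 double *N, double *t_prev)
		{
			*N += t - *t_prev;
			if (*N > B / R)
				*N = B / R;
			*t_prev = t;
			if (S / R > *N)
				return 0;
			*N -= S / R;
			return 1;
		}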

	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, the other one,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts at a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)
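
	For example (numbers chosen only for illustration): with
	R = 125 Kbytes/sec, B = 10 Kbytes, P = 1.25 Mbytes/sec,
	M = 1500 bytes and a backlog of L = 60 Kbytes,

	lat = max((60000-10000)/125000, (60000-1500)/1250000)
	    = max(0.4 sec, ~0.047 sec) = 0.4 sec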

	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not woken up by the EOI for some previous packet,
	TBF can stop its activity for 1/HZ.

	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use an Alpha with HZ=1000 :-)
*/
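
/* A quick check of the numbers above (illustrative arithmetic only):
 * 10 Mbit/sec is 1.25 Mbytes/sec, so with HZ=100 the bucket must hold
 * at least 1.25M/100 = 12500 bytes, i.e. on the order of 10 Kbytes.
 * For the peak bucket the depth is the MTU, so the critical peak rate
 * is MTU*HZ = 1500*100 = 150000 bytes/sec; with HZ=1000 it would be
 * ten times higher.
 */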

struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;
	u32		max_size;
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long		tokens;		/* Current number of B tokens */
	long		ptokens;	/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct timer_list wd_timer;	/* Watchdog timer */
};

#define L2T(q,L)   ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
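
/* The L2T ("length to time") macros look up how long it takes to send an
 * L-byte packet at the configured rate resp. peak rate: the rate tables
 * supplied by userspace are indexed by the packet length shifted down by
 * cell_log.  (Illustrative example, not from the original source: with
 * cell_log = 3, a 1500-byte packet uses table slot 1500>>3 = 187.)
 */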

static int
tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	if (skb->len > q->max_size)
		goto drop;
	__skb_queue_tail(&sch->q, skb);
	if ((sch->stats.backlog += skb->len) <= q->limit) {
		sch->stats.bytes += skb->len;
		sch->stats.packets++;
		return 0;
	}

	/* Drop action: undo the things that we just did,
	   i.e. make tail drop */

	__skb_unlink(skb, &sch->q);
	sch->stats.backlog -= skb->len;

drop:
	sch->stats.drops++;
#ifdef CONFIG_NET_CLS_POLICE
	if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
#endif
		kfree_skb(skb);
	return NET_XMIT_DROP;
}

static int
tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	__skb_queue_head(&sch->q, skb);
	sch->stats.backlog += skb->len;
	return 0;
}

static int
tbf_drop(struct Qdisc* sch)
{
	struct sk_buff *skb;

	skb = __skb_dequeue_tail(&sch->q);
	if (skb) {
		sch->stats.backlog -= skb->len;
		sch->stats.drops++;
		kfree_skb(skb);
		return 1;
	}
	return 0;
}

static void tbf_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc*)arg;

	sch->flags &= ~TCQ_F_THROTTLED;
	qdisc_wakeup(sch->dev);
}

static struct sk_buff *
tbf_dequeue(struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	struct sk_buff *skb;

	skb = __skb_dequeue(&sch->q);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;

		PSCHED_GET_TIME(now);

		toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer, 0);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, skb->len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, skb->len);

		/* (toks|ptoks) >= 0 iff both token counts are non-negative,
		   i.e. both buckets can pay for this packet. */
		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->stats.backlog -= skb->len;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		if (!sch->dev->tbusy) {
			/* Sleep until the larger of the two token deficits is repaid. */
			long delay = PSCHED_US2JIFFIE(max(-toks, -ptoks));

			if (delay == 0)
				delay = 1;

			del_timer(&q->wd_timer);
			q->wd_timer.expires = jiffies + delay;
			add_timer(&q->wd_timer);
		}

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but, however, this is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC) */
		__skb_queue_head(&sch->q, skb);

		sch->flags |= TCQ_F_THROTTLED;
		sch->stats.overlimits++;
	}
	return NULL;
}

static void
tbf_reset(struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	skb_queue_purge(&sch->q);
	sch->stats.backlog = 0;
	PSCHED_GET_TIME(q->t_c);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	sch->flags &= ~TCQ_F_THROTTLED;
	del_timer(&q->wd_timer);
}

static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
{
	int err = -EINVAL;
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	struct rtattr *tb[TCA_TBF_PTAB];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	int max_size;

	if (rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
	    tb[TCA_TBF_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
		goto done;

	qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
		if (ptab == NULL)
			goto done;
	}

	max_size = psched_mtu(sch->dev);
	if (ptab) {
		int n = max_size>>qopt->peakrate.cell_log;
		while (n>0 && ptab->data[n-1] > qopt->mtu) {
			max_size -= (1<<qopt->peakrate.cell_log);
			n--;
		}
	}
	if (rtab->data[max_size>>qopt->rate.cell_log] > qopt->buffer)
		goto done;

	sch_tree_lock(sch);
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}
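
/* For reference, a typical userspace command that ends up in tbf_change()
 * (example values only; exact option names depend on the tc/iproute2
 * version in use):
 *
 *	tc qdisc add dev eth0 root tbf rate 220kbit burst 10kb latency 50ms
 *
 * tc translates rate/burst/latency into the tc_tbf_qopt structure and the
 * rate tables parsed above.
 */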

static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
{
	int err;
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	if (opt == NULL)
		return -EINVAL;

	MOD_INC_USE_COUNT;

	PSCHED_GET_TIME(q->t_c);
	init_timer(&q->wd_timer);
	q->wd_timer.function = tbf_watchdog;
	q->wd_timer.data = (unsigned long)sch;

	if ((err = tbf_change(sch, opt)) != 0) {
		MOD_DEC_USE_COUNT;
	}
	return err;
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	del_timer(&q->wd_timer);

	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);

	MOD_DEC_USE_COUNT;
}

#ifdef CONFIG_RTNETLINK
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	unsigned char *b = skb->tail;
	struct rtattr *rta;
	struct tc_tbf_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
#endif

struct Qdisc_ops tbf_qdisc_ops =
{
	NULL,
	NULL,
	"tbf",
	sizeof(struct tbf_sched_data),

	tbf_enqueue,
	tbf_dequeue,
	tbf_requeue,
	tbf_drop,

	tbf_init,
	tbf_reset,
	tbf_destroy,
	tbf_change,

#ifdef CONFIG_RTNETLINK
	tbf_dump,
#endif
};

#ifdef MODULE
int init_module(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

void cleanup_module(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
#endif