/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *		Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
/*	Simple Token Bucket Filter.
	=======================================

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)
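
	F.e. with R = 1Mbit/sec and B = 16kbit, any 100msec window
	may carry at most 16k + 100k = 116kbit.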

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.

	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, while the other,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P>R and B>M. If P is infinite, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)
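
	F.e. with limit L = 15Kbytes, B = 10Kbytes, R = 125Kbytes/sec,
	M = 1.5Kbytes and P = 1250Kbytes/sec:

	lat = max (5K/125K, 13.5K/1250K) = max (40msec, 10.8msec) = 40msec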

	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by an EOI for some previous packet,
	TBF can stop its activity for 1/HZ.

	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use Alpha with HZ=1000 :-)
*/
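
/* From userspace this qdisc is normally attached with the tc utility,
   f.e. (illustrative values):

	tc qdisc add dev eth0 root tbf rate 220kbit latency 50ms burst 1540

   The helper below is a minimal standalone sketch of the dual-bucket
   conformance test described above. The name and the tick-based units
   are illustrative assumptions only; tbf_dequeue() further down is the
   real implementation.
 */
static inline int tbf_example_conforms(long tokens, long ptokens,
				       long buffer, long mtu,
				       long elapsed, long pkt_time,
				       long pkt_ptime)
{
	/* Both buckets earn one token per elapsed tick, capped at depth. */
	tokens += elapsed;
	if (tokens > buffer)
		tokens = buffer;
	ptokens += elapsed;
	if (ptokens > mtu)
		ptokens = mtu;
	/* The head packet conforms only if both buckets can pay for it. */
	return tokens >= pkt_time && ptokens >= pkt_ptime;
}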
struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;		/* Peak-rate bucket depth */
	u32		max_size;	/* Largest packet that conforms to the peak rate */
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long		tokens;		/* Current number of B tokens */
	long		ptokens;	/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct timer_list wd_timer;	/* Watchdog timer */
};
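/* L2T ("length to time"): map a packet length L to the time needed to
 * send it at the configured rate. The rate table caches precomputed
 * per-cell transmission times, indexed by L >> cell_log.
 */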
#define L2T(q,L)   ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
static int
tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	/* Packets longer than max_size can never conform; drop them. */
	if (skb->len > q->max_size)
		goto drop;

	__skb_queue_tail(&sch->q, skb);
	if ((sch->stats.backlog += skb->len) <= q->limit) {
		sch->stats.bytes += skb->len;
		sch->stats.packets++;
		return 0;
	}

	/* Drop action: undo the things that we just did,
	 * i.e. make tail drop
	 */
	__skb_unlink(skb, &sch->q);
	sch->stats.backlog -= skb->len;

drop:
	sch->stats.drops++;
#ifdef CONFIG_NET_CLS_POLICE
	if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
#endif
		kfree_skb(skb);
	return NET_XMIT_DROP;
}
static int
tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	__skb_queue_head(&sch->q, skb);
	sch->stats.backlog += skb->len;
	return 0;
}
static int
tbf_drop(struct Qdisc* sch)
{
	struct sk_buff *skb;

	skb = __skb_dequeue_tail(&sch->q);
	if (skb) {
		sch->stats.backlog -= skb->len;
		sch->stats.drops++;
		kfree_skb(skb);
		return 1;
	}
	return 0;
}
static void tbf_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;

	sch->flags &= ~TCQ_F_THROTTLED;
	qdisc_wakeup(sch->dev);
}
static struct sk_buff *
tbf_dequeue(struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	struct sk_buff *skb;

	skb = __skb_dequeue(&sch->q);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;

		PSCHED_GET_TIME(now);

		/* Tokens earned since the last check-point, capped at
		 * the bucket depth. */
		toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer, 0);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, skb->len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, skb->len);

		/* Both token counts are >= 0: the packet conforms. */
		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->stats.backlog -= skb->len;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		if (!sch->dev->tbusy) {
			long delay = PSCHED_US2JIFFIE(max(-toks, -ptoks));

			if (delay == 0)
				delay = 1;

			del_timer(&q->wd_timer);
			q->wd_timer.expires = jiffies + delay;
			add_timer(&q->wd_timer);
		}

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but, however, this is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */
		__skb_queue_head(&sch->q, skb);

		sch->flags |= TCQ_F_THROTTLED;
		sch->stats.overlimits++;
	}
	return NULL;
}
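/* F.e. on the throttle path above with HZ=100 (10msec jiffies): if the
 * head packet still lacks ~25msec worth of tokens, the watchdog is set
 * roughly 2 jiffies ahead, and a sub-jiffy deficit is rounded up to one
 * jiffy so the timer always moves forward. (Illustrative numbers only.)
 */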
static void
tbf_reset(struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	skb_queue_purge(&sch->q);
	sch->stats.backlog = 0;
	PSCHED_GET_TIME(q->t_c);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	sch->flags &= ~TCQ_F_THROTTLED;
	del_timer(&q->wd_timer);
}
static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
{
	int err = -EINVAL;
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	struct rtattr *tb[TCA_TBF_PTAB];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	int max_size;

	if (rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
	    tb[TCA_TBF_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
		goto done;

	qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
		if (ptab == NULL)
			goto done;
	}

	/* Find the largest packet size that still conforms to the
	 * peak-rate bucket of depth qopt->mtu. */
	max_size = psched_mtu(sch->dev);
	if (ptab) {
		int n = max_size>>qopt->peakrate.cell_log;
		while (n>0 && ptab->data[n-1] > qopt->mtu) {
			max_size -= (1<<qopt->peakrate.cell_log);
			n--;
		}
	}
	if (rtab->data[max_size>>qopt->rate.cell_log] > qopt->buffer)
		goto done;

	sch_tree_lock(sch);
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;

done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}
static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
{
	int err;
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	if (opt == NULL)
		return -EINVAL;

	MOD_INC_USE_COUNT;

	PSCHED_GET_TIME(q->t_c);
	init_timer(&q->wd_timer);
	q->wd_timer.function = tbf_watchdog;
	q->wd_timer.data = (unsigned long)sch;

	if ((err = tbf_change(sch, opt)) != 0) {
		MOD_DEC_USE_COUNT;
	}
	return err;
}
static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	del_timer(&q->wd_timer);

	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);

	MOD_DEC_USE_COUNT;
}
#ifdef CONFIG_RTNETLINK
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	unsigned char *b = skb->tail;
	struct rtattr *rta;
	struct tc_tbf_qopt opt;

	rta = (struct rtattr *)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
#endif
struct Qdisc_ops tbf_qdisc_ops =
{
	NULL,
	NULL,
	"tbf",
	sizeof(struct tbf_sched_data),

	tbf_enqueue,
	tbf_dequeue,
	tbf_requeue,
	tbf_drop,

	tbf_init,
	tbf_reset,
	tbf_destroy,
	tbf_change,

#ifdef CONFIG_RTNETLINK
	tbf_dump,
#endif
};

#ifdef MODULE
int init_module(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

void cleanup_module(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
#endif