/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)
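
	For example (illustrative numbers, not from the original text):
	with R = 1 Mbyte/sec and B = 10 Kbytes, a burst of ten back-to-back
	1 Kbyte packets conforms (10 Kbytes <= B), while any one-second
	interval may carry at most B + R*1 sec = 10 Kbytes + 1 Mbyte.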

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.
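
	As a rough illustration only (a sketch in continuous time, not the
	integer token arithmetic used in tbf_dequeue() below; N and t_prev
	are caller-kept state, with N starting at B/R):

		int tbf_conforms(double t, double S, double B, double R,
				 double *N, double *t_prev)
		{
			*N += t - *t_prev;
			if (*N > B / R)
				*N = B / R;
			*t_prev = t;
			if (S / R > *N)
				return 0;
			*N -= S / R;
			return 1;
		}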

	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, the other one,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts at a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)
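
	For example (numbers chosen only for illustration): with
	R = 125 Kbytes/sec, B = 10 Kbytes, P = 1.25 Mbytes/sec,
	M = 1500 bytes and a backlog of L = 60 Kbytes,

	lat = max((60000-10000)/125000, (60000-1500)/1250000)
	    = max(0.4 sec, ~0.047 sec) = 0.4 sec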

	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not woken up by the EOI for some previous packet,
	TBF can stop its activity for 1/HZ.

	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use an Alpha with HZ=1000 :-)
*/
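
/* A quick check of the numbers above (illustrative arithmetic only):
 * 10 Mbit/sec is 1.25 Mbytes/sec, so with HZ=100 the bucket must hold
 * at least 1.25M/100 = 12500 bytes, i.e. on the order of 10 Kbytes.
 * For the peak bucket the depth is the MTU, so the critical peak rate
 * is MTU*HZ = 1500*100 = 150000 bytes/sec; with HZ=1000 it would be
 * ten times higher.
 */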

struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;
	u32		max_size;
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long		tokens;		/* Current number of B tokens */
	long		ptokens;	/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct timer_list wd_timer;	/* Watchdog timer */
};

#define L2T(q,L)   ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
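
/* The L2T ("length to time") macros look up how long it takes to send an
 * L-byte packet at the configured rate resp. peak rate: the rate tables
 * supplied by userspace are indexed by the packet length shifted down by
 * cell_log.  (Illustrative example, not from the original source: with
 * cell_log = 3, a 1500-byte packet uses table slot 1500>>3 = 187.)
 */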

static int
tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	if (skb->len > q->max_size)
		goto drop;
	__skb_queue_tail(&sch->q, skb);
	if ((sch->stats.backlog += skb->len) <= q->limit) {
		sch->stats.bytes += skb->len;
		sch->stats.packets++;
		return 0;
	}

	/* Drop action: undo the things that we just did,
	   i.e. make tail drop */

	__skb_unlink(skb, &sch->q);
	sch->stats.backlog -= skb->len;

drop:
	sch->stats.drops++;
#ifdef CONFIG_NET_CLS_POLICE
	if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
#endif
		kfree_skb(skb);
	return NET_XMIT_DROP;
}

static int
tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	__skb_queue_head(&sch->q, skb);
	sch->stats.backlog += skb->len;
	return 0;
}

static int
tbf_drop(struct Qdisc* sch)
{
	struct sk_buff *skb;

	skb = __skb_dequeue_tail(&sch->q);
	if (skb) {
		sch->stats.backlog -= skb->len;
		sch->stats.drops++;
		kfree_skb(skb);
		return 1;
	}
	return 0;
}

static void tbf_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc*)arg;

	sch->flags &= ~TCQ_F_THROTTLED;
	qdisc_wakeup(sch->dev);
}

static struct sk_buff *
tbf_dequeue(struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	struct sk_buff *skb;

	skb = __skb_dequeue(&sch->q);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;

		PSCHED_GET_TIME(now);

		toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer, 0);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, skb->len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, skb->len);

		/* (toks|ptoks) >= 0 iff both token counts are non-negative,
		   i.e. both buckets can pay for this packet. */
		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->stats.backlog -= skb->len;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		if (!sch->dev->tbusy) {
			/* Sleep until the larger of the two token deficits is repaid. */
			long delay = PSCHED_US2JIFFIE(max(-toks, -ptoks));

			if (delay == 0)
				delay = 1;

			del_timer(&q->wd_timer);
			q->wd_timer.expires = jiffies + delay;
			add_timer(&q->wd_timer);
		}

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but, however, this is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC) */
		__skb_queue_head(&sch->q, skb);

		sch->flags |= TCQ_F_THROTTLED;
		sch->stats.overlimits++;
	}
	return NULL;
}

static void
tbf_reset(struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	skb_queue_purge(&sch->q);
	sch->stats.backlog = 0;
	PSCHED_GET_TIME(q->t_c);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	sch->flags &= ~TCQ_F_THROTTLED;
	del_timer(&q->wd_timer);
}

static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
{
	int err = -EINVAL;
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	struct rtattr *tb[TCA_TBF_PTAB];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	int max_size;

	if (rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
	    tb[TCA_TBF_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
		goto done;

	qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
		if (ptab == NULL)
			goto done;
	}

	max_size = psched_mtu(sch->dev);
	if (ptab) {
		int n = max_size>>qopt->peakrate.cell_log;
		while (n>0 && ptab->data[n-1] > qopt->mtu) {
			max_size -= (1<<qopt->peakrate.cell_log);
			n--;
		}
	}
	if (rtab->data[max_size>>qopt->rate.cell_log] > qopt->buffer)
		goto done;

	sch_tree_lock(sch);
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}
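
/* For reference, a typical userspace command that ends up in tbf_change()
 * (example values only; exact option names depend on the tc/iproute2
 * version in use):
 *
 *	tc qdisc add dev eth0 root tbf rate 220kbit burst 10kb latency 50ms
 *
 * tc translates rate/burst/latency into the tc_tbf_qopt structure and the
 * rate tables parsed above.
 */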

static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
{
	int err;
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	if (opt == NULL)
		return -EINVAL;

	MOD_INC_USE_COUNT;

	PSCHED_GET_TIME(q->t_c);
	init_timer(&q->wd_timer);
	q->wd_timer.function = tbf_watchdog;
	q->wd_timer.data = (unsigned long)sch;

	if ((err = tbf_change(sch, opt)) != 0) {
		MOD_DEC_USE_COUNT;
	}
	return err;
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	del_timer(&q->wd_timer);

	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);

	MOD_DEC_USE_COUNT;
}

#ifdef CONFIG_RTNETLINK
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	unsigned char *b = skb->tail;
	struct rtattr *rta;
	struct tc_tbf_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
#endif

struct Qdisc_ops tbf_qdisc_ops =
{
	NULL,
	NULL,
	"tbf",
	sizeof(struct tbf_sched_data),

	tbf_enqueue,
	tbf_dequeue,
	tbf_requeue,
	tbf_drop,

	tbf_init,
	tbf_reset,
	tbf_destroy,
	tbf_change,

#ifdef CONFIG_RTNETLINK
	tbf_dump,
#endif
};

#ifdef MODULE
int init_module(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

void cleanup_module(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
#endif