/* net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
/*	Simple Token Bucket Filter.
	=======================================

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in the queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.

	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady state burst size, the other one,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P>R and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.

	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use alpha with HZ=1000 :-)

	With classful TBF, limit is kept only for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed, the limit is no longer effective.
*/
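/* For illustration only (figures chosen arbitrarily): with L = 10 Kbytes of
 * backlog ahead of a packet, bucket depth B = 2 Kbytes, rate R = 1 Mbyte/sec,
 * M = 1.5 Kbytes and peak rate P = 5 Mbytes/sec, the estimated latency is
 * max((10-2)K/1M, (10-1.5)K/5M) = max(8ms, 1.7ms) = 8ms.
 */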
struct tbf_sched_data {
	u32			limit;		/* Maximal length of backlog: bytes */
	u32			max_size;
	s64			buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64			mtu;
	struct psched_ratecfg	rate;
	struct psched_ratecfg	peak;
	s64			tokens;		/* Current number of B tokens */
	s64			ptokens;	/* Current number of P tokens */
	s64			t_c;		/* Time check-point */
	struct Qdisc		*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog	watchdog;	/* Watchdog timer */
};
/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is :
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}
/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_drop(skb, sch, to_free);

	nb = 0;
	while (segs) {
		nskb = segs->next;
		segs->next = NULL;
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		len += segs->len;
		ret = qdisc_enqueue(segs, q->qdisc, to_free);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			nb++;
		}
		segs = nskb;
	}
	sch->q.qlen += nb;
	if (nb > 1)
		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
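/* Enqueue a packet on the inner qdisc.  Packets larger than max_size can
 * never earn enough tokens to be sent whole, so they are segmented if they
 * are GSO packets with small enough segments, and dropped otherwise.
 */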
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
			return tbf_segment(skb, sch, to_free);
		return qdisc_drop(skb, sch, to_free);
	}
	ret = qdisc_enqueue(skb, q->qdisc, to_free);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return ret;
	}

	qdisc_qstats_backlog_inc(sch, skb);
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}
static bool tbf_peak_present(const struct tbf_sched_data *q)
{
	return q->peak.rate_bytes_ps;
}
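/* Dequeue: peek at the head of the inner qdisc and release it only if
 * enough tokens (and peak tokens, when a peak rate is configured) have
 * accumulated; otherwise arm the watchdog for the moment the deficit
 * will have been repaid and count an overlimit.
 */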
static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_get_ns();
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (tbf_peak_present(q)) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			qdisc_qstats_backlog_dec(sch, skb);
			sch->q.qlen--;
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but it is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		qdisc_qstats_overlimit(sch);
	}
	return NULL;
}
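/* Reset to the initial state: flush the inner qdisc and refill both buckets. */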
static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->qstats.backlog = 0;
	sch->q.qlen = 0;
	q->t_c = ktime_get_ns();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}
static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]		= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST]		= { .type = NLA_U32 },
	[TCA_TBF_PBURST]	= { .type = NLA_U32 },
};
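/* Parse the netlink parameters and (re)configure the qdisc under the tree
 * lock.  When no inner qdisc has been attached yet, a default bfifo child
 * sized to the requested limit is created; otherwise the limit is passed
 * to the existing child via fifo_set_limit().
 */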
static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy, NULL);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB],
					      NULL));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB],
					      NULL));

	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
					    peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	} else {
		memset(&peak, 0, sizeof(peak));
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
					 extack);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
					  q->qdisc->qstats.backlog);
		qdisc_destroy(q->qdisc);
		q->qdisc = child;
		if (child != &noop_qdisc)
			qdisc_hash_add(child, true);
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

	sch_tree_unlock(sch);
	err = 0;
done:
	return err;
}
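/* Initialize the qdisc: set up the watchdog, start with the noop inner
 * qdisc and apply the initial parameters through tbf_change().
 */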
static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	q->t_c = ktime_get_ns();

	return tbf_change(sch, opt, extack);
}
static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
}
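/* Dump the current configuration into a netlink message.  Rates that do
 * not fit in 32 bits are additionally exported through the 64-bit
 * attributes TCA_TBF_RATE64/TCA_TBF_PRATE64.
 */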
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (tbf_peak_present(q))
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;
	if (tbf_peak_present(q) &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}
static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}
static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}
static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}
static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}
static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.find		=	tbf_find,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};
static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};
static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}

module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");