net/sched/sch_tbf.c

   1 /*
   2  * net/sched/sch_tbf.c  Token Bucket Filter queue.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  *              Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
  11  *                                               original idea by Martin Devera
  12  *
  13  */
  14
  15 #include <linux/module.h>
  16 #include <linux/types.h>
  17 #include <linux/kernel.h>
  18 #include <linux/string.h>
  19 #include <linux/errno.h>
  20 #include <linux/skbuff.h>
  21 #include <net/netlink.h>
  22 #include <net/sch_generic.h>
  23 #include <net/pkt_sched.h>
  24
  25
  26 /*      Simple Token Bucket Filter.
  27         =======================================
  28
  29         SOURCE.
  30         -------
  31
  32         None.
  33
  34         Description.
  35         ------------
  36
  37         A data flow obeys TBF with rate R and depth B, if for any
  38         time interval t_i...t_f the number of transmitted bits
  39         does not exceed B + R*(t_f-t_i).
  40
  41         Packetized version of this definition:
  42         The sequence of packets of sizes s_i served at moments t_i
  43         obeys TBF, if for any i<=k:
  44
  45         s_i+....+s_k <= B + R*(t_k - t_i)
  46
  47         Algorithm.
  48         ----------
  49
  50         Let N(t_i) be B/R initially and N(t) grow continuously with time as:
  51
  52         N(t+delta) = min{B/R, N(t) + delta}
  53
  54         If the first packet in queue has length S, it may be
  55         transmitted only at the time t_* when S/R <= N(t_*),
  56         and in this case N(t) jumps:
  57
  58         N(t_* + 0) = N(t_* - 0) - S/R.
  59
  60
  61
  62         Actually, QoS requires two TBF to be applied to a data stream.
  63         One of them controls steady state burst size, another
  64         one with rate P (peak rate) and depth M (equal to link MTU)
  65         limits bursts at a smaller time scale.
  66
  67         It is easy to see that P>R, and B>M. If P is infinity, this double
  68         TBF is equivalent to a single one.
  69
  70         When TBF works in reshaping mode, latency is estimated as:
  71
  72         lat = max ((L-B)/R, (L-M)/P)
  73
  74
  75         NOTES.
  76         ------
  77
  78         If TBF throttles, it starts a watchdog timer, which will wake it up
  79         when it is ready to transmit.
  80         Note that the minimal timer resolution is 1/HZ.
  81         If no new packets arrive during this period,
  82         or if the device is not awaken by EOI for some previous packet,
  83         TBF can stop its activity for 1/HZ.
  84
  85
  86         This means, that with depth B, the maximal rate is
  87
  88         R_crit = B*HZ
  89
  90         F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
  91
  92         Note that the peak rate TBF is much more tough: with MTU 1500
  93         P_crit = 150Kbytes/sec. So, if you need greater peak
  94         rates, use alpha with HZ=1000 :-)
  95
  96         With classful TBF, limit is just kept for backwards compatibility.
  97         It is passed to the default bfifo qdisc - if the inner qdisc is
  98         changed the limit is not effective anymore.
  99 */
 100
 101 struct tbf_sched_data {
 102 /* Parameters */
 103         u32             limit;          /* Maximal length of backlog: bytes */
 104         u32             max_size;
 105         s64             buffer;         /* Token bucket depth/rate: MUST BE >= MTU/B */
 106         s64             mtu;
 107         struct psched_ratecfg rate;
 108         struct psched_ratecfg peak;
 109
 110 /* Variables */
 111         s64     tokens;                 /* Current number of B tokens */
 112         s64     ptokens;                /* Current number of P tokens */
 113         s64     t_c;                    /* Time check-point */
 114         struct Qdisc    *qdisc;         /* Inner qdisc, default - bfifo queue */
 115         struct qdisc_watchdog watchdog; /* Watchdog timer */
 116 };
 117
 118
 119 /* Time to Length, convert time in ns to length in bytes
 120  * to determinate how many bytes can be sent in given time.
 121  */
 122 static u64 psched_ns_t2l(const struct psched_ratecfg *r,
 123                          u64 time_in_ns)
 124 {
 125         /* The formula is :
 126          * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
 127          */
 128         u64 len = time_in_ns * r->rate_bytes_ps;
 129
 130         do_div(len, NSEC_PER_SEC);
 131
 132         if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
 133                 do_div(len, 53);
 134                 len = len * 48;
 135         }
 136
 137         if (len > r->overhead)
 138                 len -= r->overhead;
 139         else
 140                 len = 0;
 141
 142         return len;
 143 }
 144
 145 /* GSO packet is too big, segment it so that tbf can transmit
 146  * each segment in time
 147  */
 148 static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
 149                        struct sk_buff **to_free)
 150 {
 151         struct tbf_sched_data *q = qdisc_priv(sch);
 152         struct sk_buff *segs, *nskb;
 153         netdev_features_t features = netif_skb_features(skb);
 154         unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
 155         int ret, nb;
 156
 157         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 158
 159         if (IS_ERR_OR_NULL(segs))
 160                 return qdisc_drop(skb, sch, to_free);
 161
 162         nb = 0;
 163         while (segs) {
 164                 nskb = segs->next;
 165                 segs->next = NULL;
 166                 qdisc_skb_cb(segs)->pkt_len = segs->len;
 167                 len += segs->len;
 168                 ret = qdisc_enqueue(segs, q->qdisc, to_free);
 169                 if (ret != NET_XMIT_SUCCESS) {
 170                         if (net_xmit_drop_count(ret))
 171                                 qdisc_qstats_drop(sch);
 172                 } else {
 173                         nb++;
 174                 }
 175                 segs = nskb;
 176         }
 177         sch->q.qlen += nb;
 178         if (nb > 1)
 179                 qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
 180         consume_skb(skb);
 181         return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
 182 }
 183
 184 static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 185                        struct sk_buff **to_free)
 186 {
 187         struct tbf_sched_data *q = qdisc_priv(sch);
 188         int ret;
 189
 190         if (qdisc_pkt_len(skb) > q->max_size) {
 191                 if (skb_is_gso(skb) &&
 192                     skb_gso_validate_mac_len(skb, q->max_size))
 193                         return tbf_segment(skb, sch, to_free);
 194                 return qdisc_drop(skb, sch, to_free);
 195         }
 196         ret = qdisc_enqueue(skb, q->qdisc, to_free);
 197         if (ret != NET_XMIT_SUCCESS) {
 198                 if (net_xmit_drop_count(ret))
 199                         qdisc_qstats_drop(sch);
 200                 return ret;
 201         }
 202
 203         qdisc_qstats_backlog_inc(sch, skb);
 204         sch->q.qlen++;
 205         return NET_XMIT_SUCCESS;
 206 }
 207
 208 static bool tbf_peak_present(const struct tbf_sched_data *q)
 209 {
 210         return q->peak.rate_bytes_ps;
 211 }
 212
 213 static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
 214 {
 215         struct tbf_sched_data *q = qdisc_priv(sch);
 216         struct sk_buff *skb;
 217
 218         skb = q->qdisc->ops->peek(q->qdisc);
 219
 220         if (skb) {
 221                 s64 now;
 222                 s64 toks;
 223                 s64 ptoks = 0;
 224                 unsigned int len = qdisc_pkt_len(skb);
 225
 226                 now = ktime_get_ns();
 227                 toks = min_t(s64, now - q->t_c, q->buffer);
 228
 229                 if (tbf_peak_present(q)) {
 230                         ptoks = toks + q->ptokens;
 231                         if (ptoks > q->mtu)
 232                                 ptoks = q->mtu;
 233                         ptoks -= (s64) psched_l2t_ns(&q->peak, len);
 234                 }
 235                 toks += q->tokens;
 236                 if (toks > q->buffer)
 237                         toks = q->buffer;
 238                 toks -= (s64) psched_l2t_ns(&q->rate, len);
 239
 240                 if ((toks|ptoks) >= 0) {
 241                         skb = qdisc_dequeue_peeked(q->qdisc);
 242                         if (unlikely(!skb))
 243                                 return NULL;
 244
 245                         q->t_c = now;
 246                         q->tokens = toks;
 247                         q->ptokens = ptoks;
 248                         qdisc_qstats_backlog_dec(sch, skb);
 249                         sch->q.qlen--;
 250                         qdisc_bstats_update(sch, skb);
 251                         return skb;
 252                 }
 253
 254                 qdisc_watchdog_schedule_ns(&q->watchdog,
 255                                            now + max_t(long, -toks, -ptoks));
 256
 257                 /* Maybe we have a shorter packet in the queue,
 258                    which can be sent now. It sounds cool,
 259                    but, however, this is wrong in principle.
 260                    We MUST NOT reorder packets under these circumstances.
 261
 262                    Really, if we split the flow into independent
 263                    subflows, it would be a very good solution.
 264                    This is the main idea of all FQ algorithms
 265                    (cf. CSZ, HPFQ, HFSC)
 266                  */
 267
 268                 qdisc_qstats_overlimit(sch);
 269         }
 270         return NULL;
 271 }
 272
 273 static void tbf_reset(struct Qdisc *sch)
 274 {
 275         struct tbf_sched_data *q = qdisc_priv(sch);
 276
 277         qdisc_reset(q->qdisc);
 278         sch->qstats.backlog = 0;
 279         sch->q.qlen = 0;
 280         q->t_c = ktime_get_ns();
 281         q->tokens = q->buffer;
 282         q->ptokens = q->mtu;
 283         qdisc_watchdog_cancel(&q->watchdog);
 284 }
 285
 286 static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
 287         [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
 288         [TCA_TBF_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 289         [TCA_TBF_PTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 290         [TCA_TBF_RATE64]        = { .type = NLA_U64 },
 291         [TCA_TBF_PRATE64]       = { .type = NLA_U64 },
 292         [TCA_TBF_BURST] = { .type = NLA_U32 },
 293         [TCA_TBF_PBURST] = { .type = NLA_U32 },
 294 };
 295
 296 static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 297                       struct netlink_ext_ack *extack)
 298 {
 299         int err;
 300         struct tbf_sched_data *q = qdisc_priv(sch);
 301         struct nlattr *tb[TCA_TBF_MAX + 1];
 302         struct tc_tbf_qopt *qopt;
 303         struct Qdisc *child = NULL;
 304         struct psched_ratecfg rate;
 305         struct psched_ratecfg peak;
 306         u64 max_size;
 307         s64 buffer, mtu;
 308         u64 rate64 = 0, prate64 = 0;
 309
 310         err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy, NULL);
 311         if (err < 0)
 312                 return err;
 313
 314         err = -EINVAL;
 315         if (tb[TCA_TBF_PARMS] == NULL)
 316                 goto done;
 317
 318         qopt = nla_data(tb[TCA_TBF_PARMS]);
 319         if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
 320                 qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
 321                                               tb[TCA_TBF_RTAB],
 322                                               NULL));
 323
 324         if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
 325                         qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
 326                                                       tb[TCA_TBF_PTAB],
 327                                                       NULL));
 328
 329         buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
 330         mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
 331
 332         if (tb[TCA_TBF_RATE64])
 333                 rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
 334         psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
 335
 336         if (tb[TCA_TBF_BURST]) {
 337                 max_size = nla_get_u32(tb[TCA_TBF_BURST]);
 338                 buffer = psched_l2t_ns(&rate, max_size);
 339         } else {
 340                 max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
 341         }
 342
 343         if (qopt->peakrate.rate) {
 344                 if (tb[TCA_TBF_PRATE64])
 345                         prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
 346                 psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
 347                 if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
 348                         pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
 349                                         peak.rate_bytes_ps, rate.rate_bytes_ps);
 350                         err = -EINVAL;
 351                         goto done;
 352                 }
 353
 354                 if (tb[TCA_TBF_PBURST]) {
 355                         u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
 356                         max_size = min_t(u32, max_size, pburst);
 357                         mtu = psched_l2t_ns(&peak, pburst);
 358                 } else {
 359                         max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
 360                 }
 361         } else {
 362                 memset(&peak, 0, sizeof(peak));
 363         }
 364
 365         if (max_size < psched_mtu(qdisc_dev(sch)))
 366                 pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
 367                                     max_size, qdisc_dev(sch)->name,
 368                                     psched_mtu(qdisc_dev(sch)));
 369
 370         if (!max_size) {
 371                 err = -EINVAL;
 372                 goto done;
 373         }
 374
 375         if (q->qdisc != &noop_qdisc) {
 376                 err = fifo_set_limit(q->qdisc, qopt->limit);
 377                 if (err)
 378                         goto done;
 379         } else if (qopt->limit > 0) {
 380                 child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
 381                                          extack);
 382                 if (IS_ERR(child)) {
 383                         err = PTR_ERR(child);
 384                         goto done;
 385                 }
 386         }
 387
 388         sch_tree_lock(sch);
 389         if (child) {
 390                 qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
 391                                           q->qdisc->qstats.backlog);
 392                 qdisc_destroy(q->qdisc);
 393                 q->qdisc = child;
 394                 if (child != &noop_qdisc)
 395                         qdisc_hash_add(child, true);
 396         }
 397         q->limit = qopt->limit;
 398         if (tb[TCA_TBF_PBURST])
 399                 q->mtu = mtu;
 400         else
 401                 q->mtu = PSCHED_TICKS2NS(qopt->mtu);
 402         q->max_size = max_size;
 403         if (tb[TCA_TBF_BURST])
 404                 q->buffer = buffer;
 405         else
 406                 q->buffer = PSCHED_TICKS2NS(qopt->buffer);
 407         q->tokens = q->buffer;
 408         q->ptokens = q->mtu;
 409
 410         memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
 411         memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
 412
 413         sch_tree_unlock(sch);
 414         err = 0;
 415 done:
 416         return err;
 417 }
 418
 419 static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
 420                     struct netlink_ext_ack *extack)
 421 {
 422         struct tbf_sched_data *q = qdisc_priv(sch);
 423
 424         qdisc_watchdog_init(&q->watchdog, sch);
 425         q->qdisc = &noop_qdisc;
 426
 427         if (!opt)
 428                 return -EINVAL;
 429
 430         q->t_c = ktime_get_ns();
 431
 432         return tbf_change(sch, opt, extack);
 433 }
 434
 435 static void tbf_destroy(struct Qdisc *sch)
 436 {
 437         struct tbf_sched_data *q = qdisc_priv(sch);
 438
 439         qdisc_watchdog_cancel(&q->watchdog);
 440         qdisc_destroy(q->qdisc);
 441 }
 442
 443 static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
 444 {
 445         struct tbf_sched_data *q = qdisc_priv(sch);
 446         struct nlattr *nest;
 447         struct tc_tbf_qopt opt;
 448
 449         sch->qstats.backlog = q->qdisc->qstats.backlog;
 450         nest = nla_nest_start(skb, TCA_OPTIONS);
 451         if (nest == NULL)
 452                 goto nla_put_failure;
 453
 454         opt.limit = q->limit;
 455         psched_ratecfg_getrate(&opt.rate, &q->rate);
 456         if (tbf_peak_present(q))
 457                 psched_ratecfg_getrate(&opt.peakrate, &q->peak);
 458         else
 459                 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
 460         opt.mtu = PSCHED_NS2TICKS(q->mtu);
 461         opt.buffer = PSCHED_NS2TICKS(q->buffer);
 462         if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
 463                 goto nla_put_failure;
 464         if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
 465             nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
 466                               TCA_TBF_PAD))
 467                 goto nla_put_failure;
 468         if (tbf_peak_present(q) &&
 469             q->peak.rate_bytes_ps >= (1ULL << 32) &&
 470             nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
 471                               TCA_TBF_PAD))
 472                 goto nla_put_failure;
 473
 474         return nla_nest_end(skb, nest);
 475
 476 nla_put_failure:
 477         nla_nest_cancel(skb, nest);
 478         return -1;
 479 }
 480
 481 static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
 482                           struct sk_buff *skb, struct tcmsg *tcm)
 483 {
 484         struct tbf_sched_data *q = qdisc_priv(sch);
 485
 486         tcm->tcm_handle |= TC_H_MIN(1);
 487         tcm->tcm_info = q->qdisc->handle;
 488
 489         return 0;
 490 }
 491
 492 static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 493                      struct Qdisc **old, struct netlink_ext_ack *extack)
 494 {
 495         struct tbf_sched_data *q = qdisc_priv(sch);
 496
 497         if (new == NULL)
 498                 new = &noop_qdisc;
 499
 500         *old = qdisc_replace(sch, new, &q->qdisc);
 501         return 0;
 502 }
 503
 504 static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
 505 {
 506         struct tbf_sched_data *q = qdisc_priv(sch);
 507         return q->qdisc;
 508 }
 509
 510 static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
 511 {
 512         return 1;
 513 }
 514
 515 static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 516 {
 517         if (!walker->stop) {
 518                 if (walker->count >= walker->skip)
 519                         if (walker->fn(sch, 1, walker) < 0) {
 520                                 walker->stop = 1;
 521                                 return;
 522                         }
 523                 walker->count++;
 524         }
 525 }
 526
 527 static const struct Qdisc_class_ops tbf_class_ops = {
 528         .graft          =       tbf_graft,
 529         .leaf           =       tbf_leaf,
 530         .find           =       tbf_find,
 531         .walk           =       tbf_walk,
 532         .dump           =       tbf_dump_class,
 533 };
 534
 535 static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
 536         .next           =       NULL,
 537         .cl_ops         =       &tbf_class_ops,
 538         .id             =       "tbf",
 539         .priv_size      =       sizeof(struct tbf_sched_data),
 540         .enqueue        =       tbf_enqueue,
 541         .dequeue        =       tbf_dequeue,
 542         .peek           =       qdisc_peek_dequeued,
 543         .init           =       tbf_init,
 544         .reset          =       tbf_reset,
 545         .destroy        =       tbf_destroy,
 546         .change         =       tbf_change,
 547         .dump           =       tbf_dump,
 548         .owner          =       THIS_MODULE,
 549 };
 550
 551 static int __init tbf_module_init(void)
 552 {
 553         return register_qdisc(&tbf_qdisc_ops);
 554 }
 555
 556 static void __exit tbf_module_exit(void)
 557 {
 558         unregister_qdisc(&tbf_qdisc_ops);
 559 }
 560 module_init(tbf_module_init)
 561 module_exit(tbf_module_exit)
 562 MODULE_LICENSE("GPL");