net/sched/sch_tbf.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * net/sched/sch_tbf.c  Token Bucket Filter queue.
   4  *
   5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   6  *              Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
   7  *                                               original idea by Martin Devera
   8  */
   9
  10 #include <linux/module.h>
  11 #include <linux/types.h>
  12 #include <linux/kernel.h>
  13 #include <linux/string.h>
  14 #include <linux/errno.h>
  15 #include <linux/skbuff.h>
  16 #include <net/netlink.h>
  17 #include <net/sch_generic.h>
  18 #include <net/pkt_cls.h>
  19 #include <net/pkt_sched.h>
  20
  21
  22 /*      Simple Token Bucket Filter.
  23         =======================================
  24
  25         SOURCE.
  26         -------
  27
  28         None.
  29
  30         Description.
  31         ------------
  32
  33         A data flow obeys TBF with rate R and depth B, if for any
  34         time interval t_i...t_f the number of transmitted bits
  35         does not exceed B + R*(t_f-t_i).
  36
  37         Packetized version of this definition:
  38         The sequence of packets of sizes s_i served at moments t_i
  39         obeys TBF, if for any i<=k:
  40
  41         s_i+....+s_k <= B + R*(t_k - t_i)
  42
  43         Algorithm.
  44         ----------
  45
  46         Let N(t_i) be B/R initially and N(t) grow continuously with time as:
  47
  48         N(t+delta) = min{B/R, N(t) + delta}
  49
  50         If the first packet in queue has length S, it may be
  51         transmitted only at the time t_* when S/R <= N(t_*),
  52         and in this case N(t) jumps:
  53
  54         N(t_* + 0) = N(t_* - 0) - S/R.
  55
  56
  57
  58         Actually, QoS requires two TBF to be applied to a data stream.
  59         One of them controls steady state burst size, another
  60         one with rate P (peak rate) and depth M (equal to link MTU)
  61         limits bursts at a smaller time scale.
  62
  63         It is easy to see that P>R, and B>M. If P is infinity, this double
  64         TBF is equivalent to a single one.
  65
  66         When TBF works in reshaping mode, latency is estimated as:
  67
  68         lat = max ((L-B)/R, (L-M)/P)
  69
  70
  71         NOTES.
  72         ------
  73
  74         If TBF throttles, it starts a watchdog timer, which will wake it up
  75         when it is ready to transmit.
  76         Note that the minimal timer resolution is 1/HZ.
  77         If no new packets arrive during this period,
  78         or if the device is not awaken by EOI for some previous packet,
  79         TBF can stop its activity for 1/HZ.
  80
  81
  82         This means, that with depth B, the maximal rate is
  83
  84         R_crit = B*HZ
  85
  86         F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
  87
  88         Note that the peak rate TBF is much more tough: with MTU 1500
  89         P_crit = 150Kbytes/sec. So, if you need greater peak
  90         rates, use alpha with HZ=1000 :-)
  91
  92         With classful TBF, limit is just kept for backwards compatibility.
  93         It is passed to the default bfifo qdisc - if the inner qdisc is
  94         changed the limit is not effective anymore.
  95 */
  96
  97 struct tbf_sched_data {
  98 /* Parameters */
  99         u32             limit;          /* Maximal length of backlog: bytes */
 100         u32             max_size;
 101         s64             buffer;         /* Token bucket depth/rate: MUST BE >= MTU/B */
 102         s64             mtu;
 103         struct psched_ratecfg rate;
 104         struct psched_ratecfg peak;
 105
 106 /* Variables */
 107         s64     tokens;                 /* Current number of B tokens */
 108         s64     ptokens;                /* Current number of P tokens */
 109         s64     t_c;                    /* Time check-point */
 110         struct Qdisc    *qdisc;         /* Inner qdisc, default - bfifo queue */
 111         struct qdisc_watchdog watchdog; /* Watchdog timer */
 112 };
 113
 114
 115 /* Time to Length, convert time in ns to length in bytes
 116  * to determinate how many bytes can be sent in given time.
 117  */
 118 static u64 psched_ns_t2l(const struct psched_ratecfg *r,
 119                          u64 time_in_ns)
 120 {
 121         /* The formula is :
 122          * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
 123          */
 124         u64 len = time_in_ns * r->rate_bytes_ps;
 125
 126         do_div(len, NSEC_PER_SEC);
 127
 128         if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
 129                 do_div(len, 53);
 130                 len = len * 48;
 131         }
 132
 133         if (len > r->overhead)
 134                 len -= r->overhead;
 135         else
 136                 len = 0;
 137
 138         return len;
 139 }
 140
 141 static void tbf_offload_change(struct Qdisc *sch)
 142 {
 143         struct tbf_sched_data *q = qdisc_priv(sch);
 144         struct net_device *dev = qdisc_dev(sch);
 145         struct tc_tbf_qopt_offload qopt;
 146
 147         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 148                 return;
 149
 150         qopt.command = TC_TBF_REPLACE;
 151         qopt.handle = sch->handle;
 152         qopt.parent = sch->parent;
 153         qopt.replace_params.rate = q->rate;
 154         qopt.replace_params.max_size = q->max_size;
 155         qopt.replace_params.qstats = &sch->qstats;
 156
 157         dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
 158 }
 159
 160 static void tbf_offload_destroy(struct Qdisc *sch)
 161 {
 162         struct net_device *dev = qdisc_dev(sch);
 163         struct tc_tbf_qopt_offload qopt;
 164
 165         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 166                 return;
 167
 168         qopt.command = TC_TBF_DESTROY;
 169         qopt.handle = sch->handle;
 170         qopt.parent = sch->parent;
 171         dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
 172 }
 173
 174 static int tbf_offload_dump(struct Qdisc *sch)
 175 {
 176         struct tc_tbf_qopt_offload qopt;
 177
 178         qopt.command = TC_TBF_STATS;
 179         qopt.handle = sch->handle;
 180         qopt.parent = sch->parent;
 181         qopt.stats.bstats = &sch->bstats;
 182         qopt.stats.qstats = &sch->qstats;
 183
 184         return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
 185 }
 186
 187 /* GSO packet is too big, segment it so that tbf can transmit
 188  * each segment in time
 189  */
 190 static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
 191                        struct sk_buff **to_free)
 192 {
 193         struct tbf_sched_data *q = qdisc_priv(sch);
 194         struct sk_buff *segs, *nskb;
 195         netdev_features_t features = netif_skb_features(skb);
 196         unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
 197         int ret, nb;
 198
 199         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 200
 201         if (IS_ERR_OR_NULL(segs))
 202                 return qdisc_drop(skb, sch, to_free);
 203
 204         nb = 0;
 205         skb_list_walk_safe(segs, segs, nskb) {
 206                 skb_mark_not_on_list(segs);
 207                 qdisc_skb_cb(segs)->pkt_len = segs->len;
 208                 len += segs->len;
 209                 ret = qdisc_enqueue(segs, q->qdisc, to_free);
 210                 if (ret != NET_XMIT_SUCCESS) {
 211                         if (net_xmit_drop_count(ret))
 212                                 qdisc_qstats_drop(sch);
 213                 } else {
 214                         nb++;
 215                 }
 216         }
 217         sch->q.qlen += nb;
 218         if (nb > 1)
 219                 qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
 220         consume_skb(skb);
 221         return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
 222 }
 223
 224 static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 225                        struct sk_buff **to_free)
 226 {
 227         struct tbf_sched_data *q = qdisc_priv(sch);
 228         unsigned int len = qdisc_pkt_len(skb);
 229         int ret;
 230
 231         if (qdisc_pkt_len(skb) > q->max_size) {
 232                 if (skb_is_gso(skb) &&
 233                     skb_gso_validate_mac_len(skb, q->max_size))
 234                         return tbf_segment(skb, sch, to_free);
 235                 return qdisc_drop(skb, sch, to_free);
 236         }
 237         ret = qdisc_enqueue(skb, q->qdisc, to_free);
 238         if (ret != NET_XMIT_SUCCESS) {
 239                 if (net_xmit_drop_count(ret))
 240                         qdisc_qstats_drop(sch);
 241                 return ret;
 242         }
 243
 244         sch->qstats.backlog += len;
 245         sch->q.qlen++;
 246         return NET_XMIT_SUCCESS;
 247 }
 248
 249 static bool tbf_peak_present(const struct tbf_sched_data *q)
 250 {
 251         return q->peak.rate_bytes_ps;
 252 }
 253
 254 static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
 255 {
 256         struct tbf_sched_data *q = qdisc_priv(sch);
 257         struct sk_buff *skb;
 258
 259         skb = q->qdisc->ops->peek(q->qdisc);
 260
 261         if (skb) {
 262                 s64 now;
 263                 s64 toks;
 264                 s64 ptoks = 0;
 265                 unsigned int len = qdisc_pkt_len(skb);
 266
 267                 now = ktime_get_ns();
 268                 toks = min_t(s64, now - q->t_c, q->buffer);
 269
 270                 if (tbf_peak_present(q)) {
 271                         ptoks = toks + q->ptokens;
 272                         if (ptoks > q->mtu)
 273                                 ptoks = q->mtu;
 274                         ptoks -= (s64) psched_l2t_ns(&q->peak, len);
 275                 }
 276                 toks += q->tokens;
 277                 if (toks > q->buffer)
 278                         toks = q->buffer;
 279                 toks -= (s64) psched_l2t_ns(&q->rate, len);
 280
 281                 if ((toks|ptoks) >= 0) {
 282                         skb = qdisc_dequeue_peeked(q->qdisc);
 283                         if (unlikely(!skb))
 284                                 return NULL;
 285
 286                         q->t_c = now;
 287                         q->tokens = toks;
 288                         q->ptokens = ptoks;
 289                         qdisc_qstats_backlog_dec(sch, skb);
 290                         sch->q.qlen--;
 291                         qdisc_bstats_update(sch, skb);
 292                         return skb;
 293                 }
 294
 295                 qdisc_watchdog_schedule_ns(&q->watchdog,
 296                                            now + max_t(long, -toks, -ptoks));
 297
 298                 /* Maybe we have a shorter packet in the queue,
 299                    which can be sent now. It sounds cool,
 300                    but, however, this is wrong in principle.
 301                    We MUST NOT reorder packets under these circumstances.
 302
 303                    Really, if we split the flow into independent
 304                    subflows, it would be a very good solution.
 305                    This is the main idea of all FQ algorithms
 306                    (cf. CSZ, HPFQ, HFSC)
 307                  */
 308
 309                 qdisc_qstats_overlimit(sch);
 310         }
 311         return NULL;
 312 }
 313
 314 static void tbf_reset(struct Qdisc *sch)
 315 {
 316         struct tbf_sched_data *q = qdisc_priv(sch);
 317
 318         qdisc_reset(q->qdisc);
 319         sch->qstats.backlog = 0;
 320         sch->q.qlen = 0;
 321         q->t_c = ktime_get_ns();
 322         q->tokens = q->buffer;
 323         q->ptokens = q->mtu;
 324         qdisc_watchdog_cancel(&q->watchdog);
 325 }
 326
 327 static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
 328         [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
 329         [TCA_TBF_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 330         [TCA_TBF_PTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 331         [TCA_TBF_RATE64]        = { .type = NLA_U64 },
 332         [TCA_TBF_PRATE64]       = { .type = NLA_U64 },
 333         [TCA_TBF_BURST] = { .type = NLA_U32 },
 334         [TCA_TBF_PBURST] = { .type = NLA_U32 },
 335 };
 336
 337 static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 338                       struct netlink_ext_ack *extack)
 339 {
 340         int err;
 341         struct tbf_sched_data *q = qdisc_priv(sch);
 342         struct nlattr *tb[TCA_TBF_MAX + 1];
 343         struct tc_tbf_qopt *qopt;
 344         struct Qdisc *child = NULL;
 345         struct psched_ratecfg rate;
 346         struct psched_ratecfg peak;
 347         u64 max_size;
 348         s64 buffer, mtu;
 349         u64 rate64 = 0, prate64 = 0;
 350
 351         err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
 352                                           NULL);
 353         if (err < 0)
 354                 return err;
 355
 356         err = -EINVAL;
 357         if (tb[TCA_TBF_PARMS] == NULL)
 358                 goto done;
 359
 360         qopt = nla_data(tb[TCA_TBF_PARMS]);
 361         if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
 362                 qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
 363                                               tb[TCA_TBF_RTAB],
 364                                               NULL));
 365
 366         if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
 367                         qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
 368                                                       tb[TCA_TBF_PTAB],
 369                                                       NULL));
 370
 371         buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
 372         mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
 373
 374         if (tb[TCA_TBF_RATE64])
 375                 rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
 376         psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
 377
 378         if (tb[TCA_TBF_BURST]) {
 379                 max_size = nla_get_u32(tb[TCA_TBF_BURST]);
 380                 buffer = psched_l2t_ns(&rate, max_size);
 381         } else {
 382                 max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
 383         }
 384
 385         if (qopt->peakrate.rate) {
 386                 if (tb[TCA_TBF_PRATE64])
 387                         prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
 388                 psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
 389                 if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
 390                         pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
 391                                         peak.rate_bytes_ps, rate.rate_bytes_ps);
 392                         err = -EINVAL;
 393                         goto done;
 394                 }
 395
 396                 if (tb[TCA_TBF_PBURST]) {
 397                         u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
 398                         max_size = min_t(u32, max_size, pburst);
 399                         mtu = psched_l2t_ns(&peak, pburst);
 400                 } else {
 401                         max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
 402                 }
 403         } else {
 404                 memset(&peak, 0, sizeof(peak));
 405         }
 406
 407         if (max_size < psched_mtu(qdisc_dev(sch)))
 408                 pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
 409                                     max_size, qdisc_dev(sch)->name,
 410                                     psched_mtu(qdisc_dev(sch)));
 411
 412         if (!max_size) {
 413                 err = -EINVAL;
 414                 goto done;
 415         }
 416
 417         if (q->qdisc != &noop_qdisc) {
 418                 err = fifo_set_limit(q->qdisc, qopt->limit);
 419                 if (err)
 420                         goto done;
 421         } else if (qopt->limit > 0) {
 422                 child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
 423                                          extack);
 424                 if (IS_ERR(child)) {
 425                         err = PTR_ERR(child);
 426                         goto done;
 427                 }
 428
 429                 /* child is fifo, no need to check for noop_qdisc */
 430                 qdisc_hash_add(child, true);
 431         }
 432
 433         sch_tree_lock(sch);
 434         if (child) {
 435                 qdisc_tree_flush_backlog(q->qdisc);
 436                 qdisc_put(q->qdisc);
 437                 q->qdisc = child;
 438         }
 439         q->limit = qopt->limit;
 440         if (tb[TCA_TBF_PBURST])
 441                 q->mtu = mtu;
 442         else
 443                 q->mtu = PSCHED_TICKS2NS(qopt->mtu);
 444         q->max_size = max_size;
 445         if (tb[TCA_TBF_BURST])
 446                 q->buffer = buffer;
 447         else
 448                 q->buffer = PSCHED_TICKS2NS(qopt->buffer);
 449         q->tokens = q->buffer;
 450         q->ptokens = q->mtu;
 451
 452         memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
 453         memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
 454
 455         sch_tree_unlock(sch);
 456         err = 0;
 457
 458         tbf_offload_change(sch);
 459 done:
 460         return err;
 461 }
 462
 463 static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
 464                     struct netlink_ext_ack *extack)
 465 {
 466         struct tbf_sched_data *q = qdisc_priv(sch);
 467
 468         qdisc_watchdog_init(&q->watchdog, sch);
 469         q->qdisc = &noop_qdisc;
 470
 471         if (!opt)
 472                 return -EINVAL;
 473
 474         q->t_c = ktime_get_ns();
 475
 476         return tbf_change(sch, opt, extack);
 477 }
 478
 479 static void tbf_destroy(struct Qdisc *sch)
 480 {
 481         struct tbf_sched_data *q = qdisc_priv(sch);
 482
 483         qdisc_watchdog_cancel(&q->watchdog);
 484         tbf_offload_destroy(sch);
 485         qdisc_put(q->qdisc);
 486 }
 487
 488 static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
 489 {
 490         struct tbf_sched_data *q = qdisc_priv(sch);
 491         struct nlattr *nest;
 492         struct tc_tbf_qopt opt;
 493         int err;
 494
 495         err = tbf_offload_dump(sch);
 496         if (err)
 497                 return err;
 498
 499         nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
 500         if (nest == NULL)
 501                 goto nla_put_failure;
 502
 503         opt.limit = q->limit;
 504         psched_ratecfg_getrate(&opt.rate, &q->rate);
 505         if (tbf_peak_present(q))
 506                 psched_ratecfg_getrate(&opt.peakrate, &q->peak);
 507         else
 508                 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
 509         opt.mtu = PSCHED_NS2TICKS(q->mtu);
 510         opt.buffer = PSCHED_NS2TICKS(q->buffer);
 511         if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
 512                 goto nla_put_failure;
 513         if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
 514             nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
 515                               TCA_TBF_PAD))
 516                 goto nla_put_failure;
 517         if (tbf_peak_present(q) &&
 518             q->peak.rate_bytes_ps >= (1ULL << 32) &&
 519             nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
 520                               TCA_TBF_PAD))
 521                 goto nla_put_failure;
 522
 523         return nla_nest_end(skb, nest);
 524
 525 nla_put_failure:
 526         nla_nest_cancel(skb, nest);
 527         return -1;
 528 }
 529
 530 static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
 531                           struct sk_buff *skb, struct tcmsg *tcm)
 532 {
 533         struct tbf_sched_data *q = qdisc_priv(sch);
 534
 535         tcm->tcm_handle |= TC_H_MIN(1);
 536         tcm->tcm_info = q->qdisc->handle;
 537
 538         return 0;
 539 }
 540
 541 static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 542                      struct Qdisc **old, struct netlink_ext_ack *extack)
 543 {
 544         struct tbf_sched_data *q = qdisc_priv(sch);
 545
 546         if (new == NULL)
 547                 new = &noop_qdisc;
 548
 549         *old = qdisc_replace(sch, new, &q->qdisc);
 550         return 0;
 551 }
 552
 553 static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
 554 {
 555         struct tbf_sched_data *q = qdisc_priv(sch);
 556         return q->qdisc;
 557 }
 558
 559 static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
 560 {
 561         return 1;
 562 }
 563
 564 static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 565 {
 566         if (!walker->stop) {
 567                 if (walker->count >= walker->skip)
 568                         if (walker->fn(sch, 1, walker) < 0) {
 569                                 walker->stop = 1;
 570                                 return;
 571                         }
 572                 walker->count++;
 573         }
 574 }
 575
 576 static const struct Qdisc_class_ops tbf_class_ops = {
 577         .graft          =       tbf_graft,
 578         .leaf           =       tbf_leaf,
 579         .find           =       tbf_find,
 580         .walk           =       tbf_walk,
 581         .dump           =       tbf_dump_class,
 582 };
 583
 584 static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
 585         .next           =       NULL,
 586         .cl_ops         =       &tbf_class_ops,
 587         .id             =       "tbf",
 588         .priv_size      =       sizeof(struct tbf_sched_data),
 589         .enqueue        =       tbf_enqueue,
 590         .dequeue        =       tbf_dequeue,
 591         .peek           =       qdisc_peek_dequeued,
 592         .init           =       tbf_init,
 593         .reset          =       tbf_reset,
 594         .destroy        =       tbf_destroy,
 595         .change         =       tbf_change,
 596         .dump           =       tbf_dump,
 597         .owner          =       THIS_MODULE,
 598 };
 599
 600 static int __init tbf_module_init(void)
 601 {
 602         return register_qdisc(&tbf_qdisc_ops);
 603 }
 604
 605 static void __exit tbf_module_exit(void)
 606 {
 607         unregister_qdisc(&tbf_qdisc_ops);
 608 }
 609 module_init(tbf_module_init)
 610 module_exit(tbf_module_exit)
 611 MODULE_LICENSE("GPL");