net/sched/sch_cake.c

   1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   2
   3 /* COMMON Applications Kept Enhanced (CAKE) discipline
   4  *
   5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
   6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
   7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
   8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
   9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
  10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
  11  *
  12  * The CAKE Principles:
  13  *                 (or, how to have your cake and eat it too)
  14  *
  15  * This is a combination of several shaping, AQM and FQ techniques into one
  16  * easy-to-use package:
  17  *
  18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
  19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
  20  *   eliminating the need for any sort of burst parameter (eg. token bucket
  21  *   depth).  Burst support is limited to that necessary to overcome scheduling
  22  *   latency.
  23  *
  24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
  25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
  26  *   the priority is reduced to avoid starving other tins.
  27  *
  28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
  29  *   flows from each other.  This prevents a burst on one flow from increasing
  30  *   the delay to another.  Flows are distributed to queues using a
  31  *   set-associative hash function.
  32  *
  33  * - Each queue is actively managed by Cobalt, which is a combination of the
  34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
  35  *   congestion early via ECN (if available) and/or packet drops, to keep
  36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
  37  *   setting, as is necessary at low bandwidths.
  38  *
  39  * The configuration parameters are kept deliberately simple for ease of use.
  40  * Everything has sane defaults.  Complete generality of configuration is *not*
  41  * a goal.
  42  *
  43  * The priority queue operates according to a weighted DRR scheme, combined with
  44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
  45  * bandwidth sharing threshold the tin is operating.  This determines whether a
  46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
  47  * that tin in the current pass.
  48  *
  49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
  50  * granted us permission to leverage.
  51  */
  52
  53 #include <linux/module.h>
  54 #include <linux/types.h>
  55 #include <linux/kernel.h>
  56 #include <linux/jiffies.h>
  57 #include <linux/string.h>
  58 #include <linux/in.h>
  59 #include <linux/errno.h>
  60 #include <linux/init.h>
  61 #include <linux/skbuff.h>
  62 #include <linux/jhash.h>
  63 #include <linux/slab.h>
  64 #include <linux/vmalloc.h>
  65 #include <linux/reciprocal_div.h>
  66 #include <net/netlink.h>
  67 #include <linux/if_vlan.h>
  68 #include <net/pkt_sched.h>
  69 #include <net/pkt_cls.h>
  70 #include <net/tcp.h>
  71 #include <net/flow_dissector.h>
  72
  73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
  74 #include <net/netfilter/nf_conntrack_core.h>
  75 #endif
  76
  77 #define CAKE_SET_WAYS (8)
  78 #define CAKE_MAX_TINS (8)
  79 #define CAKE_QUEUES (1024)
  80 #define CAKE_FLOW_MASK 63
  81 #define CAKE_FLOW_NAT_FLAG 64
  82
  83 /* struct cobalt_params - contains codel and blue parameters
  84  * @interval:   codel initial drop rate
  85  * @target:     maximum persistent sojourn time & blue update rate
  86  * @mtu_time:   serialisation delay of maximum-size packet
  87  * @p_inc:      increment of blue drop probability (0.32 fxp)
  88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
  89  */
  90 struct cobalt_params {
  91         u64     interval;
  92         u64     target;
  93         u64     mtu_time;
  94         u32     p_inc;
  95         u32     p_dec;
  96 };
  97
  98 /* struct cobalt_vars - contains codel and blue variables
  99  * @count:              codel dropping frequency
 100  * @rec_inv_sqrt:       reciprocal value of sqrt(count) >> 1
 101  * @drop_next:          time to drop next packet, or when we dropped last
 102  * @blue_timer:         Blue time to next drop
 103  * @p_drop:             BLUE drop probability (0.32 fxp)
 104  * @dropping:           set if in dropping state
 105  * @ecn_marked:         set if marked
 106  */
 107 struct cobalt_vars {
 108         u32     count;
 109         u32     rec_inv_sqrt;
 110         ktime_t drop_next;
 111         ktime_t blue_timer;
 112         u32     p_drop;
 113         bool    dropping;
 114         bool    ecn_marked;
 115 };
 116
 117 enum {
 118         CAKE_SET_NONE = 0,
 119         CAKE_SET_SPARSE,
 120         CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
 121         CAKE_SET_BULK,
 122         CAKE_SET_DECAYING
 123 };
 124
 125 struct cake_flow {
 126         /* this stuff is all needed per-flow at dequeue time */
 127         struct sk_buff    *head;
 128         struct sk_buff    *tail;
 129         struct list_head  flowchain;
 130         s32               deficit;
 131         u32               dropped;
 132         struct cobalt_vars cvars;
 133         u16               srchost; /* index into cake_host table */
 134         u16               dsthost;
 135         u8                set;
 136 }; /* please try to keep this structure <= 64 bytes */
 137
 138 struct cake_host {
 139         u32 srchost_tag;
 140         u32 dsthost_tag;
 141         u16 srchost_refcnt;
 142         u16 dsthost_refcnt;
 143 };
 144
 145 struct cake_heap_entry {
 146         u16 t:3, b:10;
 147 };
 148
 149 struct cake_tin_data {
 150         struct cake_flow flows[CAKE_QUEUES];
 151         u32     backlogs[CAKE_QUEUES];
 152         u32     tags[CAKE_QUEUES]; /* for set association */
 153         u16     overflow_idx[CAKE_QUEUES];
 154         struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
 155         u16     flow_quantum;
 156
 157         struct cobalt_params cparams;
 158         u32     drop_overlimit;
 159         u16     bulk_flow_count;
 160         u16     sparse_flow_count;
 161         u16     decaying_flow_count;
 162         u16     unresponsive_flow_count;
 163
 164         u32     max_skblen;
 165
 166         struct list_head new_flows;
 167         struct list_head old_flows;
 168         struct list_head decaying_flows;
 169
 170         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
 171         ktime_t time_next_packet;
 172         u64     tin_rate_ns;
 173         u64     tin_rate_bps;
 174         u16     tin_rate_shft;
 175
 176         u16     tin_quantum_prio;
 177         u16     tin_quantum_band;
 178         s32     tin_deficit;
 179         u32     tin_backlog;
 180         u32     tin_dropped;
 181         u32     tin_ecn_mark;
 182
 183         u32     packets;
 184         u64     bytes;
 185
 186         u32     ack_drops;
 187
 188         /* moving averages */
 189         u64 avge_delay;
 190         u64 peak_delay;
 191         u64 base_delay;
 192
 193         /* hash function stats */
 194         u32     way_directs;
 195         u32     way_hits;
 196         u32     way_misses;
 197         u32     way_collisions;
 198 }; /* number of tins is small, so size of this struct doesn't matter much */
 199
 200 struct cake_sched_data {
 201         struct tcf_proto __rcu *filter_list; /* optional external classifier */
 202         struct tcf_block *block;
 203         struct cake_tin_data *tins;
 204
 205         struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
 206         u16             overflow_timeout;
 207
 208         u16             tin_cnt;
 209         u8              tin_mode;
 210         u8              flow_mode;
 211         u8              ack_filter;
 212         u8              atm_mode;
 213
 214         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
 215         u16             rate_shft;
 216         ktime_t         time_next_packet;
 217         ktime_t         failsafe_next_packet;
 218         u64             rate_ns;
 219         u64             rate_bps;
 220         u16             rate_flags;
 221         s16             rate_overhead;
 222         u16             rate_mpu;
 223         u64             interval;
 224         u64             target;
 225
 226         /* resource tracking */
 227         u32             buffer_used;
 228         u32             buffer_max_used;
 229         u32             buffer_limit;
 230         u32             buffer_config_limit;
 231
 232         /* indices for dequeue */
 233         u16             cur_tin;
 234         u16             cur_flow;
 235
 236         struct qdisc_watchdog watchdog;
 237         const u8        *tin_index;
 238         const u8        *tin_order;
 239
 240         /* bandwidth capacity estimate */
 241         ktime_t         last_packet_time;
 242         ktime_t         avg_window_begin;
 243         u64             avg_packet_interval;
 244         u64             avg_window_bytes;
 245         u64             avg_peak_bandwidth;
 246         ktime_t         last_reconfig_time;
 247
 248         /* packet length stats */
 249         u32             avg_netoff;
 250         u16             max_netlen;
 251         u16             max_adjlen;
 252         u16             min_netlen;
 253         u16             min_adjlen;
 254 };
 255
 256 enum {
 257         CAKE_FLAG_OVERHEAD         = BIT(0),
 258         CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
 259         CAKE_FLAG_INGRESS          = BIT(2),
 260         CAKE_FLAG_WASH             = BIT(3),
 261         CAKE_FLAG_SPLIT_GSO        = BIT(4)
 262 };
 263
 264 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
 265  * obtain the best features of each.  Codel is excellent on flows which
 266  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
 267  * unresponsive flows.
 268  */
 269
 270 struct cobalt_skb_cb {
 271         ktime_t enqueue_time;
 272         u32     adjusted_len;
 273 };
 274
 275 static u64 us_to_ns(u64 us)
 276 {
 277         return us * NSEC_PER_USEC;
 278 }
 279
 280 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
 281 {
 282         qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
 283         return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
 284 }
 285
 286 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
 287 {
 288         return get_cobalt_cb(skb)->enqueue_time;
 289 }
 290
 291 static void cobalt_set_enqueue_time(struct sk_buff *skb,
 292                                     ktime_t now)
 293 {
 294         get_cobalt_cb(skb)->enqueue_time = now;
 295 }
 296
 297 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
 298
 299 /* Diffserv lookup tables */
 300
 301 static const u8 precedence[] = {
 302         0, 0, 0, 0, 0, 0, 0, 0,
 303         1, 1, 1, 1, 1, 1, 1, 1,
 304         2, 2, 2, 2, 2, 2, 2, 2,
 305         3, 3, 3, 3, 3, 3, 3, 3,
 306         4, 4, 4, 4, 4, 4, 4, 4,
 307         5, 5, 5, 5, 5, 5, 5, 5,
 308         6, 6, 6, 6, 6, 6, 6, 6,
 309         7, 7, 7, 7, 7, 7, 7, 7,
 310 };
 311
 312 static const u8 diffserv8[] = {
 313         2, 5, 1, 2, 4, 2, 2, 2,
 314         0, 2, 1, 2, 1, 2, 1, 2,
 315         5, 2, 4, 2, 4, 2, 4, 2,
 316         3, 2, 3, 2, 3, 2, 3, 2,
 317         6, 2, 3, 2, 3, 2, 3, 2,
 318         6, 2, 2, 2, 6, 2, 6, 2,
 319         7, 2, 2, 2, 2, 2, 2, 2,
 320         7, 2, 2, 2, 2, 2, 2, 2,
 321 };
 322
 323 static const u8 diffserv4[] = {
 324         0, 2, 0, 0, 2, 0, 0, 0,
 325         1, 0, 0, 0, 0, 0, 0, 0,
 326         2, 0, 2, 0, 2, 0, 2, 0,
 327         2, 0, 2, 0, 2, 0, 2, 0,
 328         3, 0, 2, 0, 2, 0, 2, 0,
 329         3, 0, 0, 0, 3, 0, 3, 0,
 330         3, 0, 0, 0, 0, 0, 0, 0,
 331         3, 0, 0, 0, 0, 0, 0, 0,
 332 };
 333
 334 static const u8 diffserv3[] = {
 335         0, 0, 0, 0, 2, 0, 0, 0,
 336         1, 0, 0, 0, 0, 0, 0, 0,
 337         0, 0, 0, 0, 0, 0, 0, 0,
 338         0, 0, 0, 0, 0, 0, 0, 0,
 339         0, 0, 0, 0, 0, 0, 0, 0,
 340         0, 0, 0, 0, 2, 0, 2, 0,
 341         2, 0, 0, 0, 0, 0, 0, 0,
 342         2, 0, 0, 0, 0, 0, 0, 0,
 343 };
 344
 345 static const u8 besteffort[] = {
 346         0, 0, 0, 0, 0, 0, 0, 0,
 347         0, 0, 0, 0, 0, 0, 0, 0,
 348         0, 0, 0, 0, 0, 0, 0, 0,
 349         0, 0, 0, 0, 0, 0, 0, 0,
 350         0, 0, 0, 0, 0, 0, 0, 0,
 351         0, 0, 0, 0, 0, 0, 0, 0,
 352         0, 0, 0, 0, 0, 0, 0, 0,
 353         0, 0, 0, 0, 0, 0, 0, 0,
 354 };
 355
 356 /* tin priority order for stats dumping */
 357
 358 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
 359 static const u8 bulk_order[] = {1, 0, 2, 3};
 360
 361 #define REC_INV_SQRT_CACHE (16)
 362 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
 363
 364 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
 365  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 366  *
 367  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 368  */
 369
 370 static void cobalt_newton_step(struct cobalt_vars *vars)
 371 {
 372         u32 invsqrt, invsqrt2;
 373         u64 val;
 374
 375         invsqrt = vars->rec_inv_sqrt;
 376         invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
 377         val = (3LL << 32) - ((u64)vars->count * invsqrt2);
 378
 379         val >>= 2; /* avoid overflow in following multiply */
 380         val = (val * invsqrt) >> (32 - 2 + 1);
 381
 382         vars->rec_inv_sqrt = val;
 383 }
 384
 385 static void cobalt_invsqrt(struct cobalt_vars *vars)
 386 {
 387         if (vars->count < REC_INV_SQRT_CACHE)
 388                 vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
 389         else
 390                 cobalt_newton_step(vars);
 391 }
 392
 393 /* There is a big difference in timing between the accurate values placed in
 394  * the cache and the approximations given by a single Newton step for small
 395  * count values, particularly when stepping from count 1 to 2 or vice versa.
 396  * Above 16, a single Newton step gives sufficient accuracy in either
 397  * direction, given the precision stored.
 398  *
 399  * The magnitude of the error when stepping up to count 2 is such as to give
 400  * the value that *should* have been produced at count 4.
 401  */
 402
 403 static void cobalt_cache_init(void)
 404 {
 405         struct cobalt_vars v;
 406
 407         memset(&v, 0, sizeof(v));
 408         v.rec_inv_sqrt = ~0U;
 409         cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
 410
 411         for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
 412                 cobalt_newton_step(&v);
 413                 cobalt_newton_step(&v);
 414                 cobalt_newton_step(&v);
 415                 cobalt_newton_step(&v);
 416
 417                 cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
 418         }
 419 }
 420
 421 static void cobalt_vars_init(struct cobalt_vars *vars)
 422 {
 423         memset(vars, 0, sizeof(*vars));
 424
 425         if (!cobalt_rec_inv_sqrt_cache[0]) {
 426                 cobalt_cache_init();
 427                 cobalt_rec_inv_sqrt_cache[0] = ~0;
 428         }
 429 }
 430
 431 /* CoDel control_law is t + interval/sqrt(count)
 432  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
 433  * both sqrt() and divide operation.
 434  */
 435 static ktime_t cobalt_control(ktime_t t,
 436                               u64 interval,
 437                               u32 rec_inv_sqrt)
 438 {
 439         return ktime_add_ns(t, reciprocal_scale(interval,
 440                                                 rec_inv_sqrt));
 441 }
 442
 443 /* Call this when a packet had to be dropped due to queue overflow.  Returns
 444  * true if the BLUE state was quiescent before but active after this call.
 445  */
 446 static bool cobalt_queue_full(struct cobalt_vars *vars,
 447                               struct cobalt_params *p,
 448                               ktime_t now)
 449 {
 450         bool up = false;
 451
 452         if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
 453                 up = !vars->p_drop;
 454                 vars->p_drop += p->p_inc;
 455                 if (vars->p_drop < p->p_inc)
 456                         vars->p_drop = ~0;
 457                 vars->blue_timer = now;
 458         }
 459         vars->dropping = true;
 460         vars->drop_next = now;
 461         if (!vars->count)
 462                 vars->count = 1;
 463
 464         return up;
 465 }
 466
 467 /* Call this when the queue was serviced but turned out to be empty.  Returns
 468  * true if the BLUE state was active before but quiescent after this call.
 469  */
 470 static bool cobalt_queue_empty(struct cobalt_vars *vars,
 471                                struct cobalt_params *p,
 472                                ktime_t now)
 473 {
 474         bool down = false;
 475
 476         if (vars->p_drop &&
 477             ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
 478                 if (vars->p_drop < p->p_dec)
 479                         vars->p_drop = 0;
 480                 else
 481                         vars->p_drop -= p->p_dec;
 482                 vars->blue_timer = now;
 483                 down = !vars->p_drop;
 484         }
 485         vars->dropping = false;
 486
 487         if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
 488                 vars->count--;
 489                 cobalt_invsqrt(vars);
 490                 vars->drop_next = cobalt_control(vars->drop_next,
 491                                                  p->interval,
 492                                                  vars->rec_inv_sqrt);
 493         }
 494
 495         return down;
 496 }
 497
 498 /* Call this with a freshly dequeued packet for possible congestion marking.
 499  * Returns true as an instruction to drop the packet, false for delivery.
 500  */
 501 static bool cobalt_should_drop(struct cobalt_vars *vars,
 502                                struct cobalt_params *p,
 503                                ktime_t now,
 504                                struct sk_buff *skb,
 505                                u32 bulk_flows)
 506 {
 507         bool next_due, over_target, drop = false;
 508         ktime_t schedule;
 509         u64 sojourn;
 510
 511 /* The 'schedule' variable records, in its sign, whether 'now' is before or
 512  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
 513  * scheduling decision is actually branched, without destroying that
 514  * information.  Similarly, the first 'schedule' value calculated is preserved
 515  * in the boolean 'next_due'.
 516  *
 517  * As for 'drop_next', we take advantage of the fact that 'interval' is both
 518  * the delay between first exceeding 'target' and the first signalling event,
 519  * *and* the scaling factor for the signalling frequency.  It's therefore very
 520  * natural to use a single mechanism for both purposes, and eliminates a
 521  * significant amount of reference Codel's spaghetti code.  To help with this,
 522  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
 523  * as possible to 1.0 in fixed-point.
 524  */
 525
 526         sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
 527         schedule = ktime_sub(now, vars->drop_next);
 528         over_target = sojourn > p->target &&
 529                       sojourn > p->mtu_time * bulk_flows * 2 &&
 530                       sojourn > p->mtu_time * 4;
 531         next_due = vars->count && ktime_to_ns(schedule) >= 0;
 532
 533         vars->ecn_marked = false;
 534
 535         if (over_target) {
 536                 if (!vars->dropping) {
 537                         vars->dropping = true;
 538                         vars->drop_next = cobalt_control(now,
 539                                                          p->interval,
 540                                                          vars->rec_inv_sqrt);
 541                 }
 542                 if (!vars->count)
 543                         vars->count = 1;
 544         } else if (vars->dropping) {
 545                 vars->dropping = false;
 546         }
 547
 548         if (next_due && vars->dropping) {
 549                 /* Use ECN mark if possible, otherwise drop */
 550                 drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
 551
 552                 vars->count++;
 553                 if (!vars->count)
 554                         vars->count--;
 555                 cobalt_invsqrt(vars);
 556                 vars->drop_next = cobalt_control(vars->drop_next,
 557                                                  p->interval,
 558                                                  vars->rec_inv_sqrt);
 559                 schedule = ktime_sub(now, vars->drop_next);
 560         } else {
 561                 while (next_due) {
 562                         vars->count--;
 563                         cobalt_invsqrt(vars);
 564                         vars->drop_next = cobalt_control(vars->drop_next,
 565                                                          p->interval,
 566                                                          vars->rec_inv_sqrt);
 567                         schedule = ktime_sub(now, vars->drop_next);
 568                         next_due = vars->count && ktime_to_ns(schedule) >= 0;
 569                 }
 570         }
 571
 572         /* Simple BLUE implementation.  Lack of ECN is deliberate. */
 573         if (vars->p_drop)
 574                 drop |= (prandom_u32() < vars->p_drop);
 575
 576         /* Overload the drop_next field as an activity timeout */
 577         if (!vars->count)
 578                 vars->drop_next = ktime_add_ns(now, p->interval);
 579         else if (ktime_to_ns(schedule) > 0 && !drop)
 580                 vars->drop_next = now;
 581
 582         return drop;
 583 }
 584
 585 static void cake_update_flowkeys(struct flow_keys *keys,
 586                                  const struct sk_buff *skb)
 587 {
 588 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 589         struct nf_conntrack_tuple tuple = {};
 590         bool rev = !skb->_nfct;
 591
 592         if (skb_protocol(skb, true) != htons(ETH_P_IP))
 593                 return;
 594
 595         if (!nf_ct_get_tuple_skb(&tuple, skb))
 596                 return;
 597
 598         keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
 599         keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
 600
 601         if (keys->ports.ports) {
 602                 keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
 603                 keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
 604         }
 605 #endif
 606 }
 607
 608 /* Cake has several subtle multiple bit settings. In these cases you
 609  *  would be matching triple isolate mode as well.
 610  */
 611
 612 static bool cake_dsrc(int flow_mode)
 613 {
 614         return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
 615 }
 616
 617 static bool cake_ddst(int flow_mode)
 618 {
 619         return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
 620 }
 621
 622 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
 623                      int flow_mode, u16 flow_override, u16 host_override)
 624 {
 625         u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
 626         u16 reduced_hash, srchost_idx, dsthost_idx;
 627         struct flow_keys keys, host_keys;
 628
 629         if (unlikely(flow_mode == CAKE_FLOW_NONE))
 630                 return 0;
 631
 632         /* If both overrides are set we can skip packet dissection entirely */
 633         if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
 634             (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
 635                 goto skip_hash;
 636
 637         skb_flow_dissect_flow_keys(skb, &keys,
 638                                    FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
 639
 640         if (flow_mode & CAKE_FLOW_NAT_FLAG)
 641                 cake_update_flowkeys(&keys, skb);
 642
 643         /* flow_hash_from_keys() sorts the addresses by value, so we have
 644          * to preserve their order in a separate data structure to treat
 645          * src and dst host addresses as independently selectable.
 646          */
 647         host_keys = keys;
 648         host_keys.ports.ports     = 0;
 649         host_keys.basic.ip_proto  = 0;
 650         host_keys.keyid.keyid     = 0;
 651         host_keys.tags.flow_label = 0;
 652
 653         switch (host_keys.control.addr_type) {
 654         case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
 655                 host_keys.addrs.v4addrs.src = 0;
 656                 dsthost_hash = flow_hash_from_keys(&host_keys);
 657                 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
 658                 host_keys.addrs.v4addrs.dst = 0;
 659                 srchost_hash = flow_hash_from_keys(&host_keys);
 660                 break;
 661
 662         case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
 663                 memset(&host_keys.addrs.v6addrs.src, 0,
 664                        sizeof(host_keys.addrs.v6addrs.src));
 665                 dsthost_hash = flow_hash_from_keys(&host_keys);
 666                 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
 667                 memset(&host_keys.addrs.v6addrs.dst, 0,
 668                        sizeof(host_keys.addrs.v6addrs.dst));
 669                 srchost_hash = flow_hash_from_keys(&host_keys);
 670                 break;
 671
 672         default:
 673                 dsthost_hash = 0;
 674                 srchost_hash = 0;
 675         }
 676
 677         /* This *must* be after the above switch, since as a
 678          * side-effect it sorts the src and dst addresses.
 679          */
 680         if (flow_mode & CAKE_FLOW_FLOWS)
 681                 flow_hash = flow_hash_from_keys(&keys);
 682
 683 skip_hash:
 684         if (flow_override)
 685                 flow_hash = flow_override - 1;
 686         if (host_override) {
 687                 dsthost_hash = host_override - 1;
 688                 srchost_hash = host_override - 1;
 689         }
 690
 691         if (!(flow_mode & CAKE_FLOW_FLOWS)) {
 692                 if (flow_mode & CAKE_FLOW_SRC_IP)
 693                         flow_hash ^= srchost_hash;
 694
 695                 if (flow_mode & CAKE_FLOW_DST_IP)
 696                         flow_hash ^= dsthost_hash;
 697         }
 698
 699         reduced_hash = flow_hash % CAKE_QUEUES;
 700
 701         /* set-associative hashing */
 702         /* fast path if no hash collision (direct lookup succeeds) */
 703         if (likely(q->tags[reduced_hash] == flow_hash &&
 704                    q->flows[reduced_hash].set)) {
 705                 q->way_directs++;
 706         } else {
 707                 u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
 708                 u32 outer_hash = reduced_hash - inner_hash;
 709                 bool allocate_src = false;
 710                 bool allocate_dst = false;
 711                 u32 i, k;
 712
 713                 /* check if any active queue in the set is reserved for
 714                  * this flow.
 715                  */
 716                 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 717                      i++, k = (k + 1) % CAKE_SET_WAYS) {
 718                         if (q->tags[outer_hash + k] == flow_hash) {
 719                                 if (i)
 720                                         q->way_hits++;
 721
 722                                 if (!q->flows[outer_hash + k].set) {
 723                                         /* need to increment host refcnts */
 724                                         allocate_src = cake_dsrc(flow_mode);
 725                                         allocate_dst = cake_ddst(flow_mode);
 726                                 }
 727
 728                                 goto found;
 729                         }
 730                 }
 731
 732                 /* no queue is reserved for this flow, look for an
 733                  * empty one.
 734                  */
 735                 for (i = 0; i < CAKE_SET_WAYS;
 736                          i++, k = (k + 1) % CAKE_SET_WAYS) {
 737                         if (!q->flows[outer_hash + k].set) {
 738                                 q->way_misses++;
 739                                 allocate_src = cake_dsrc(flow_mode);
 740                                 allocate_dst = cake_ddst(flow_mode);
 741                                 goto found;
 742                         }
 743                 }
 744
 745                 /* With no empty queues, default to the original
 746                  * queue, accept the collision, update the host tags.
 747                  */
 748                 q->way_collisions++;
 749                 q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
 750                 q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
 751                 allocate_src = cake_dsrc(flow_mode);
 752                 allocate_dst = cake_ddst(flow_mode);
 753 found:
 754                 /* reserve queue for future packets in same flow */
 755                 reduced_hash = outer_hash + k;
 756                 q->tags[reduced_hash] = flow_hash;
 757
 758                 if (allocate_src) {
 759                         srchost_idx = srchost_hash % CAKE_QUEUES;
 760                         inner_hash = srchost_idx % CAKE_SET_WAYS;
 761                         outer_hash = srchost_idx - inner_hash;
 762                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 763                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
 764                                 if (q->hosts[outer_hash + k].srchost_tag ==
 765                                     srchost_hash)
 766                                         goto found_src;
 767                         }
 768                         for (i = 0; i < CAKE_SET_WAYS;
 769                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
 770                                 if (!q->hosts[outer_hash + k].srchost_refcnt)
 771                                         break;
 772                         }
 773                         q->hosts[outer_hash + k].srchost_tag = srchost_hash;
 774 found_src:
 775                         srchost_idx = outer_hash + k;
 776                         q->hosts[srchost_idx].srchost_refcnt++;
 777                         q->flows[reduced_hash].srchost = srchost_idx;
 778                 }
 779
 780                 if (allocate_dst) {
 781                         dsthost_idx = dsthost_hash % CAKE_QUEUES;
 782                         inner_hash = dsthost_idx % CAKE_SET_WAYS;
 783                         outer_hash = dsthost_idx - inner_hash;
 784                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 785                              i++, k = (k + 1) % CAKE_SET_WAYS) {
 786                                 if (q->hosts[outer_hash + k].dsthost_tag ==
 787                                     dsthost_hash)
 788                                         goto found_dst;
 789                         }
 790                         for (i = 0; i < CAKE_SET_WAYS;
 791                              i++, k = (k + 1) % CAKE_SET_WAYS) {
 792                                 if (!q->hosts[outer_hash + k].dsthost_refcnt)
 793                                         break;
 794                         }
 795                         q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
 796 found_dst:
 797                         dsthost_idx = outer_hash + k;
 798                         q->hosts[dsthost_idx].dsthost_refcnt++;
 799                         q->flows[reduced_hash].dsthost = dsthost_idx;
 800                 }
 801         }
 802
 803         return reduced_hash;
 804 }
 805
 806 /* helper functions : might be changed when/if skb use a standard list_head */
 807 /* remove one skb from head of slot queue */
 808
 809 static struct sk_buff *dequeue_head(struct cake_flow *flow)
 810 {
 811         struct sk_buff *skb = flow->head;
 812
 813         if (skb) {
 814                 flow->head = skb->next;
 815                 skb->next = NULL;
 816         }
 817
 818         return skb;
 819 }
 820
 821 /* add skb to flow queue (tail add) */
 822
 823 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
 824 {
 825         if (!flow->head)
 826                 flow->head = skb;
 827         else
 828                 flow->tail->next = skb;
 829         flow->tail = skb;
 830         skb->next = NULL;
 831 }
 832
 833 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
 834                                     struct ipv6hdr *buf)
 835 {
 836         unsigned int offset = skb_network_offset(skb);
 837         struct iphdr *iph;
 838
 839         iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
 840
 841         if (!iph)
 842                 return NULL;
 843
 844         if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
 845                 return skb_header_pointer(skb, offset + iph->ihl * 4,
 846                                           sizeof(struct ipv6hdr), buf);
 847
 848         else if (iph->version == 4)
 849                 return iph;
 850
 851         else if (iph->version == 6)
 852                 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
 853                                           buf);
 854
 855         return NULL;
 856 }
 857
 858 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
 859                                       void *buf, unsigned int bufsize)
 860 {
 861         unsigned int offset = skb_network_offset(skb);
 862         const struct ipv6hdr *ipv6h;
 863         const struct tcphdr *tcph;
 864         const struct iphdr *iph;
 865         struct ipv6hdr _ipv6h;
 866         struct tcphdr _tcph;
 867
 868         ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
 869
 870         if (!ipv6h)
 871                 return NULL;
 872
 873         if (ipv6h->version == 4) {
 874                 iph = (struct iphdr *)ipv6h;
 875                 offset += iph->ihl * 4;
 876
 877                 /* special-case 6in4 tunnelling, as that is a common way to get
 878                  * v6 connectivity in the home
 879                  */
 880                 if (iph->protocol == IPPROTO_IPV6) {
 881                         ipv6h = skb_header_pointer(skb, offset,
 882                                                    sizeof(_ipv6h), &_ipv6h);
 883
 884                         if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
 885                                 return NULL;
 886
 887                         offset += sizeof(struct ipv6hdr);
 888
 889                 } else if (iph->protocol != IPPROTO_TCP) {
 890                         return NULL;
 891                 }
 892
 893         } else if (ipv6h->version == 6) {
 894                 if (ipv6h->nexthdr != IPPROTO_TCP)
 895                         return NULL;
 896
 897                 offset += sizeof(struct ipv6hdr);
 898         } else {
 899                 return NULL;
 900         }
 901
 902         tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
 903         if (!tcph)
 904                 return NULL;
 905
 906         return skb_header_pointer(skb, offset,
 907                                   min(__tcp_hdrlen(tcph), bufsize), buf);
 908 }
 909
 910 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
 911                                    int code, int *oplen)
 912 {
 913         /* inspired by tcp_parse_options in tcp_input.c */
 914         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
 915         const u8 *ptr = (const u8 *)(tcph + 1);
 916
 917         while (length > 0) {
 918                 int opcode = *ptr++;
 919                 int opsize;
 920
 921                 if (opcode == TCPOPT_EOL)
 922                         break;
 923                 if (opcode == TCPOPT_NOP) {
 924                         length--;
 925                         continue;
 926                 }
 927                 opsize = *ptr++;
 928                 if (opsize < 2 || opsize > length)
 929                         break;
 930
 931                 if (opcode == code) {
 932                         *oplen = opsize;
 933                         return ptr;
 934                 }
 935
 936                 ptr += opsize - 2;
 937                 length -= opsize;
 938         }
 939
 940         return NULL;
 941 }
 942
 943 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
 944  * bytes than the other. In the case where both sequences ACKs bytes that the
 945  * other doesn't, A is considered greater. DSACKs in A also makes A be
 946  * considered greater.
 947  *
 948  * @return -1, 0 or 1 as normal compare functions
 949  */
 950 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
 951                                   const struct tcphdr *tcph_b)
 952 {
 953         const struct tcp_sack_block_wire *sack_a, *sack_b;
 954         u32 ack_seq_a = ntohl(tcph_a->ack_seq);
 955         u32 bytes_a = 0, bytes_b = 0;
 956         int oplen_a, oplen_b;
 957         bool first = true;
 958
 959         sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
 960         sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
 961
 962         /* pointers point to option contents */
 963         oplen_a -= TCPOLEN_SACK_BASE;
 964         oplen_b -= TCPOLEN_SACK_BASE;
 965
 966         if (sack_a && oplen_a >= sizeof(*sack_a) &&
 967             (!sack_b || oplen_b < sizeof(*sack_b)))
 968                 return -1;
 969         else if (sack_b && oplen_b >= sizeof(*sack_b) &&
 970                  (!sack_a || oplen_a < sizeof(*sack_a)))
 971                 return 1;
 972         else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
 973                  (!sack_b || oplen_b < sizeof(*sack_b)))
 974                 return 0;
 975
 976         while (oplen_a >= sizeof(*sack_a)) {
 977                 const struct tcp_sack_block_wire *sack_tmp = sack_b;
 978                 u32 start_a = get_unaligned_be32(&sack_a->start_seq);
 979                 u32 end_a = get_unaligned_be32(&sack_a->end_seq);
 980                 int oplen_tmp = oplen_b;
 981                 bool found = false;
 982
 983                 /* DSACK; always considered greater to prevent dropping */
 984                 if (before(start_a, ack_seq_a))
 985                         return -1;
 986
 987                 bytes_a += end_a - start_a;
 988
 989                 while (oplen_tmp >= sizeof(*sack_tmp)) {
 990                         u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
 991                         u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
 992
 993                         /* first time through we count the total size */
 994                         if (first)
 995                                 bytes_b += end_b - start_b;
 996
 997                         if (!after(start_b, start_a) && !before(end_b, end_a)) {
 998                                 found = true;
 999                                 if (!first)
1000                                         break;
1001                         }
1002                         oplen_tmp -= sizeof(*sack_tmp);
1003                         sack_tmp++;
1004                 }
1005
1006                 if (!found)
1007                         return -1;
1008
1009                 oplen_a -= sizeof(*sack_a);
1010                 sack_a++;
1011                 first = false;
1012         }
1013
1014         /* If we made it this far, all ranges SACKed by A are covered by B, so
1015          * either the SACKs are equal, or B SACKs more bytes.
1016          */
1017         return bytes_b > bytes_a ? 1 : 0;
1018 }
1019
1020 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1021                                  u32 *tsval, u32 *tsecr)
1022 {
1023         const u8 *ptr;
1024         int opsize;
1025
1026         ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1027
1028         if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1029                 *tsval = get_unaligned_be32(ptr);
1030                 *tsecr = get_unaligned_be32(ptr + 4);
1031         }
1032 }
1033
1034 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1035                                u32 tstamp_new, u32 tsecr_new)
1036 {
1037         /* inspired by tcp_parse_options in tcp_input.c */
1038         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1039         const u8 *ptr = (const u8 *)(tcph + 1);
1040         u32 tstamp, tsecr;
1041
1042         /* 3 reserved flags must be unset to avoid future breakage
1043          * ACK must be set
1044          * ECE/CWR are handled separately
1045          * All other flags URG/PSH/RST/SYN/FIN must be unset
1046          * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1047          * 0x00C00000 = CWR/ECE (handled separately)
1048          * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1049          */
1050         if (((tcp_flag_word(tcph) &
1051               cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1052                 return false;
1053
1054         while (length > 0) {
1055                 int opcode = *ptr++;
1056                 int opsize;
1057
1058                 if (opcode == TCPOPT_EOL)
1059                         break;
1060                 if (opcode == TCPOPT_NOP) {
1061                         length--;
1062                         continue;
1063                 }
1064                 opsize = *ptr++;
1065                 if (opsize < 2 || opsize > length)
1066                         break;
1067
1068                 switch (opcode) {
1069                 case TCPOPT_MD5SIG: /* doesn't influence state */
1070                         break;
1071
1072                 case TCPOPT_SACK: /* stricter checking performed later */
1073                         if (opsize % 8 != 2)
1074                                 return false;
1075                         break;
1076
1077                 case TCPOPT_TIMESTAMP:
1078                         /* only drop timestamps lower than new */
1079                         if (opsize != TCPOLEN_TIMESTAMP)
1080                                 return false;
1081                         tstamp = get_unaligned_be32(ptr);
1082                         tsecr = get_unaligned_be32(ptr + 4);
1083                         if (after(tstamp, tstamp_new) ||
1084                             after(tsecr, tsecr_new))
1085                                 return false;
1086                         break;
1087
1088                 case TCPOPT_MSS:  /* these should only be set on SYN */
1089                 case TCPOPT_WINDOW:
1090                 case TCPOPT_SACK_PERM:
1091                 case TCPOPT_FASTOPEN:
1092                 case TCPOPT_EXP:
1093                 default: /* don't drop if any unknown options are present */
1094                         return false;
1095                 }
1096
1097                 ptr += opsize - 2;
1098                 length -= opsize;
1099         }
1100
1101         return true;
1102 }
1103
1104 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1105                                        struct cake_flow *flow)
1106 {
1107         bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1108         struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1109         struct sk_buff *skb_check, *skb_prev = NULL;
1110         const struct ipv6hdr *ipv6h, *ipv6h_check;
1111         unsigned char _tcph[64], _tcph_check[64];
1112         const struct tcphdr *tcph, *tcph_check;
1113         const struct iphdr *iph, *iph_check;
1114         struct ipv6hdr _iph, _iph_check;
1115         const struct sk_buff *skb;
1116         int seglen, num_found = 0;
1117         u32 tstamp = 0, tsecr = 0;
1118         __be32 elig_flags = 0;
1119         int sack_comp;
1120
1121         /* no other possible ACKs to filter */
1122         if (flow->head == flow->tail)
1123                 return NULL;
1124
1125         skb = flow->tail;
1126         tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1127         iph = cake_get_iphdr(skb, &_iph);
1128         if (!tcph)
1129                 return NULL;
1130
1131         cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1132
1133         /* the 'triggering' packet need only have the ACK flag set.
1134          * also check that SYN is not set, as there won't be any previous ACKs.
1135          */
1136         if ((tcp_flag_word(tcph) &
1137              (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1138                 return NULL;
1139
1140         /* the 'triggering' ACK is at the tail of the queue, we have already
1141          * returned if it is the only packet in the flow. loop through the rest
1142          * of the queue looking for pure ACKs with the same 5-tuple as the
1143          * triggering one.
1144          */
1145         for (skb_check = flow->head;
1146              skb_check && skb_check != skb;
1147              skb_prev = skb_check, skb_check = skb_check->next) {
1148                 iph_check = cake_get_iphdr(skb_check, &_iph_check);
1149                 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1150                                              sizeof(_tcph_check));
1151
1152                 /* only TCP packets with matching 5-tuple are eligible, and only
1153                  * drop safe headers
1154                  */
1155                 if (!tcph_check || iph->version != iph_check->version ||
1156                     tcph_check->source != tcph->source ||
1157                     tcph_check->dest != tcph->dest)
1158                         continue;
1159
1160                 if (iph_check->version == 4) {
1161                         if (iph_check->saddr != iph->saddr ||
1162                             iph_check->daddr != iph->daddr)
1163                                 continue;
1164
1165                         seglen = ntohs(iph_check->tot_len) -
1166                                        (4 * iph_check->ihl);
1167                 } else if (iph_check->version == 6) {
1168                         ipv6h = (struct ipv6hdr *)iph;
1169                         ipv6h_check = (struct ipv6hdr *)iph_check;
1170
1171                         if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1172                             ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1173                                 continue;
1174
1175                         seglen = ntohs(ipv6h_check->payload_len);
1176                 } else {
1177                         WARN_ON(1);  /* shouldn't happen */
1178                         continue;
1179                 }
1180
1181                 /* If the ECE/CWR flags changed from the previous eligible
1182                  * packet in the same flow, we should no longer be dropping that
1183                  * previous packet as this would lose information.
1184                  */
1185                 if (elig_ack && (tcp_flag_word(tcph_check) &
1186                                  (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1187                         elig_ack = NULL;
1188                         elig_ack_prev = NULL;
1189                         num_found--;
1190                 }
1191
1192                 /* Check TCP options and flags, don't drop ACKs with segment
1193                  * data, and don't drop ACKs with a higher cumulative ACK
1194                  * counter than the triggering packet. Check ACK seqno here to
1195                  * avoid parsing SACK options of packets we are going to exclude
1196                  * anyway.
1197                  */
1198                 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1199                     (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1200                     after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1201                         continue;
1202
1203                 /* Check SACK options. The triggering packet must SACK more data
1204                  * than the ACK under consideration, or SACK the same range but
1205                  * have a larger cumulative ACK counter. The latter is a
1206                  * pathological case, but is contained in the following check
1207                  * anyway, just to be safe.
1208                  */
1209                 sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1210
1211                 if (sack_comp < 0 ||
1212                     (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1213                      sack_comp == 0))
1214                         continue;
1215
1216                 /* At this point we have found an eligible pure ACK to drop; if
1217                  * we are in aggressive mode, we are done. Otherwise, keep
1218                  * searching unless this is the second eligible ACK we
1219                  * found.
1220                  *
1221                  * Since we want to drop ACK closest to the head of the queue,
1222                  * save the first eligible ACK we find, even if we need to loop
1223                  * again.
1224                  */
1225                 if (!elig_ack) {
1226                         elig_ack = skb_check;
1227                         elig_ack_prev = skb_prev;
1228                         elig_flags = (tcp_flag_word(tcph_check)
1229                                       & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1230                 }
1231
1232                 if (num_found++ > 0)
1233                         goto found;
1234         }
1235
1236         /* We made it through the queue without finding two eligible ACKs . If
1237          * we found a single eligible ACK we can drop it in aggressive mode if
1238          * we can guarantee that this does not interfere with ECN flag
1239          * information. We ensure this by dropping it only if the enqueued
1240          * packet is consecutive with the eligible ACK, and their flags match.
1241          */
1242         if (elig_ack && aggressive && elig_ack->next == skb &&
1243             (elig_flags == (tcp_flag_word(tcph) &
1244                             (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1245                 goto found;
1246
1247         return NULL;
1248
1249 found:
1250         if (elig_ack_prev)
1251                 elig_ack_prev->next = elig_ack->next;
1252         else
1253                 flow->head = elig_ack->next;
1254
1255         elig_ack->next = NULL;
1256
1257         return elig_ack;
1258 }
1259
1260 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1261 {
1262         avg -= avg >> shift;
1263         avg += sample >> shift;
1264         return avg;
1265 }
1266
1267 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1268 {
1269         if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1270                 len -= off;
1271
1272         if (q->max_netlen < len)
1273                 q->max_netlen = len;
1274         if (q->min_netlen > len)
1275                 q->min_netlen = len;
1276
1277         len += q->rate_overhead;
1278
1279         if (len < q->rate_mpu)
1280                 len = q->rate_mpu;
1281
1282         if (q->atm_mode == CAKE_ATM_ATM) {
1283                 len += 47;
1284                 len /= 48;
1285                 len *= 53;
1286         } else if (q->atm_mode == CAKE_ATM_PTM) {
1287                 /* Add one byte per 64 bytes or part thereof.
1288                  * This is conservative and easier to calculate than the
1289                  * precise value.
1290                  */
1291                 len += (len + 63) / 64;
1292         }
1293
1294         if (q->max_adjlen < len)
1295                 q->max_adjlen = len;
1296         if (q->min_adjlen > len)
1297                 q->min_adjlen = len;
1298
1299         return len;
1300 }
1301
1302 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1303 {
1304         const struct skb_shared_info *shinfo = skb_shinfo(skb);
1305         unsigned int hdr_len, last_len = 0;
1306         u32 off = skb_network_offset(skb);
1307         u32 len = qdisc_pkt_len(skb);
1308         u16 segs = 1;
1309
1310         q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1311
1312         if (!shinfo->gso_size)
1313                 return cake_calc_overhead(q, len, off);
1314
1315         /* borrowed from qdisc_pkt_len_init() */
1316         hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1317
1318         /* + transport layer */
1319         if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1320                                                 SKB_GSO_TCPV6))) {
1321                 const struct tcphdr *th;
1322                 struct tcphdr _tcphdr;
1323
1324                 th = skb_header_pointer(skb, skb_transport_offset(skb),
1325                                         sizeof(_tcphdr), &_tcphdr);
1326                 if (likely(th))
1327                         hdr_len += __tcp_hdrlen(th);
1328         } else {
1329                 struct udphdr _udphdr;
1330
1331                 if (skb_header_pointer(skb, skb_transport_offset(skb),
1332                                        sizeof(_udphdr), &_udphdr))
1333                         hdr_len += sizeof(struct udphdr);
1334         }
1335
1336         if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1337                 segs = DIV_ROUND_UP(skb->len - hdr_len,
1338                                     shinfo->gso_size);
1339         else
1340                 segs = shinfo->gso_segs;
1341
1342         len = shinfo->gso_size + hdr_len;
1343         last_len = skb->len - shinfo->gso_size * (segs - 1);
1344
1345         return (cake_calc_overhead(q, len, off) * (segs - 1) +
1346                 cake_calc_overhead(q, last_len, off));
1347 }
1348
1349 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1350 {
1351         struct cake_heap_entry ii = q->overflow_heap[i];
1352         struct cake_heap_entry jj = q->overflow_heap[j];
1353
1354         q->overflow_heap[i] = jj;
1355         q->overflow_heap[j] = ii;
1356
1357         q->tins[ii.t].overflow_idx[ii.b] = j;
1358         q->tins[jj.t].overflow_idx[jj.b] = i;
1359 }
1360
1361 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1362 {
1363         struct cake_heap_entry ii = q->overflow_heap[i];
1364
1365         return q->tins[ii.t].backlogs[ii.b];
1366 }
1367
1368 static void cake_heapify(struct cake_sched_data *q, u16 i)
1369 {
1370         static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1371         u32 mb = cake_heap_get_backlog(q, i);
1372         u32 m = i;
1373
1374         while (m < a) {
1375                 u32 l = m + m + 1;
1376                 u32 r = l + 1;
1377
1378                 if (l < a) {
1379                         u32 lb = cake_heap_get_backlog(q, l);
1380
1381                         if (lb > mb) {
1382                                 m  = l;
1383                                 mb = lb;
1384                         }
1385                 }
1386
1387                 if (r < a) {
1388                         u32 rb = cake_heap_get_backlog(q, r);
1389
1390                         if (rb > mb) {
1391                                 m  = r;
1392                                 mb = rb;
1393                         }
1394                 }
1395
1396                 if (m != i) {
1397                         cake_heap_swap(q, i, m);
1398                         i = m;
1399                 } else {
1400                         break;
1401                 }
1402         }
1403 }
1404
1405 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1406 {
1407         while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1408                 u16 p = (i - 1) >> 1;
1409                 u32 ib = cake_heap_get_backlog(q, i);
1410                 u32 pb = cake_heap_get_backlog(q, p);
1411
1412                 if (ib > pb) {
1413                         cake_heap_swap(q, i, p);
1414                         i = p;
1415                 } else {
1416                         break;
1417                 }
1418         }
1419 }
1420
1421 static int cake_advance_shaper(struct cake_sched_data *q,
1422                                struct cake_tin_data *b,
1423                                struct sk_buff *skb,
1424                                ktime_t now, bool drop)
1425 {
1426         u32 len = get_cobalt_cb(skb)->adjusted_len;
1427
1428         /* charge packet bandwidth to this tin
1429          * and to the global shaper.
1430          */
1431         if (q->rate_ns) {
1432                 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1433                 u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1434                 u64 failsafe_dur = global_dur + (global_dur >> 1);
1435
1436                 if (ktime_before(b->time_next_packet, now))
1437                         b->time_next_packet = ktime_add_ns(b->time_next_packet,
1438                                                            tin_dur);
1439
1440                 else if (ktime_before(b->time_next_packet,
1441                                       ktime_add_ns(now, tin_dur)))
1442                         b->time_next_packet = ktime_add_ns(now, tin_dur);
1443
1444                 q->time_next_packet = ktime_add_ns(q->time_next_packet,
1445                                                    global_dur);
1446                 if (!drop)
1447                         q->failsafe_next_packet = \
1448                                 ktime_add_ns(q->failsafe_next_packet,
1449                                              failsafe_dur);
1450         }
1451         return len;
1452 }
1453
1454 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1455 {
1456         struct cake_sched_data *q = qdisc_priv(sch);
1457         ktime_t now = ktime_get();
1458         u32 idx = 0, tin = 0, len;
1459         struct cake_heap_entry qq;
1460         struct cake_tin_data *b;
1461         struct cake_flow *flow;
1462         struct sk_buff *skb;
1463
1464         if (!q->overflow_timeout) {
1465                 int i;
1466                 /* Build fresh max-heap */
1467                 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1468                         cake_heapify(q, i);
1469         }
1470         q->overflow_timeout = 65535;
1471
1472         /* select longest queue for pruning */
1473         qq  = q->overflow_heap[0];
1474         tin = qq.t;
1475         idx = qq.b;
1476
1477         b = &q->tins[tin];
1478         flow = &b->flows[idx];
1479         skb = dequeue_head(flow);
1480         if (unlikely(!skb)) {
1481                 /* heap has gone wrong, rebuild it next time */
1482                 q->overflow_timeout = 0;
1483                 return idx + (tin << 16);
1484         }
1485
1486         if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1487                 b->unresponsive_flow_count++;
1488
1489         len = qdisc_pkt_len(skb);
1490         q->buffer_used      -= skb->truesize;
1491         b->backlogs[idx]    -= len;
1492         b->tin_backlog      -= len;
1493         sch->qstats.backlog -= len;
1494         qdisc_tree_reduce_backlog(sch, 1, len);
1495
1496         flow->dropped++;
1497         b->tin_dropped++;
1498         sch->qstats.drops++;
1499
1500         if (q->rate_flags & CAKE_FLAG_INGRESS)
1501                 cake_advance_shaper(q, b, skb, now, true);
1502
1503         __qdisc_drop(skb, to_free);
1504         sch->q.qlen--;
1505
1506         cake_heapify(q, 0);
1507
1508         return idx + (tin << 16);
1509 }
1510
1511 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1512 {
1513         const int offset = skb_network_offset(skb);
1514         u16 *buf, buf_;
1515         u8 dscp;
1516
1517         switch (skb_protocol(skb, true)) {
1518         case htons(ETH_P_IP):
1519                 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1520                 if (unlikely(!buf))
1521                         return 0;
1522
1523                 /* ToS is in the second byte of iphdr */
1524                 dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1525
1526                 if (wash && dscp) {
1527                         const int wlen = offset + sizeof(struct iphdr);
1528
1529                         if (!pskb_may_pull(skb, wlen) ||
1530                             skb_try_make_writable(skb, wlen))
1531                                 return 0;
1532
1533                         ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1534                 }
1535
1536                 return dscp;
1537
1538         case htons(ETH_P_IPV6):
1539                 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1540                 if (unlikely(!buf))
1541                         return 0;
1542
1543                 /* Traffic class is in the first and second bytes of ipv6hdr */
1544                 dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1545
1546                 if (wash && dscp) {
1547                         const int wlen = offset + sizeof(struct ipv6hdr);
1548
1549                         if (!pskb_may_pull(skb, wlen) ||
1550                             skb_try_make_writable(skb, wlen))
1551                                 return 0;
1552
1553                         ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1554                 }
1555
1556                 return dscp;
1557
1558         case htons(ETH_P_ARP):
1559                 return 0x38;  /* CS7 - Net Control */
1560
1561         default:
1562                 /* If there is no Diffserv field, treat as best-effort */
1563                 return 0;
1564         }
1565 }
1566
1567 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1568                                              struct sk_buff *skb)
1569 {
1570         struct cake_sched_data *q = qdisc_priv(sch);
1571         u32 tin;
1572         bool wash;
1573         u8 dscp;
1574
1575         /* Tin selection: Default to diffserv-based selection, allow overriding
1576          * using firewall marks or skb->priority. Call DSCP parsing early if
1577          * wash is enabled, otherwise defer to below to skip unneeded parsing.
1578          */
1579         wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1580         if (wash)
1581                 dscp = cake_handle_diffserv(skb, wash);
1582
1583         if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1584                 tin = 0;
1585
1586         else if (TC_H_MAJ(skb->priority) == sch->handle &&
1587                  TC_H_MIN(skb->priority) > 0 &&
1588                  TC_H_MIN(skb->priority) <= q->tin_cnt)
1589                 tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1590
1591         else {
1592                 if (!wash)
1593                         dscp = cake_handle_diffserv(skb, wash);
1594                 tin = q->tin_index[dscp];
1595
1596                 if (unlikely(tin >= q->tin_cnt))
1597                         tin = 0;
1598         }
1599
1600         return &q->tins[tin];
1601 }
1602
1603 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1604                          struct sk_buff *skb, int flow_mode, int *qerr)
1605 {
1606         struct cake_sched_data *q = qdisc_priv(sch);
1607         struct tcf_proto *filter;
1608         struct tcf_result res;
1609         u16 flow = 0, host = 0;
1610         int result;
1611
1612         filter = rcu_dereference_bh(q->filter_list);
1613         if (!filter)
1614                 goto hash;
1615
1616         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1617         result = tcf_classify(skb, filter, &res, false);
1618
1619         if (result >= 0) {
1620 #ifdef CONFIG_NET_CLS_ACT
1621                 switch (result) {
1622                 case TC_ACT_STOLEN:
1623                 case TC_ACT_QUEUED:
1624                 case TC_ACT_TRAP:
1625                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1626                         /* fall through */
1627                 case TC_ACT_SHOT:
1628                         return 0;
1629                 }
1630 #endif
1631                 if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1632                         flow = TC_H_MIN(res.classid);
1633                 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1634                         host = TC_H_MAJ(res.classid) >> 16;
1635         }
1636 hash:
1637         *t = cake_select_tin(sch, skb);
1638         return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1639 }
1640
1641 static void cake_reconfigure(struct Qdisc *sch);
1642
1643 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1644                         struct sk_buff **to_free)
1645 {
1646         struct cake_sched_data *q = qdisc_priv(sch);
1647         int len = qdisc_pkt_len(skb);
1648         int uninitialized_var(ret);
1649         struct sk_buff *ack = NULL;
1650         ktime_t now = ktime_get();
1651         struct cake_tin_data *b;
1652         struct cake_flow *flow;
1653         u32 idx;
1654
1655         /* choose flow to insert into */
1656         idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1657         if (idx == 0) {
1658                 if (ret & __NET_XMIT_BYPASS)
1659                         qdisc_qstats_drop(sch);
1660                 __qdisc_drop(skb, to_free);
1661                 return ret;
1662         }
1663         idx--;
1664         flow = &b->flows[idx];
1665
1666         /* ensure shaper state isn't stale */
1667         if (!b->tin_backlog) {
1668                 if (ktime_before(b->time_next_packet, now))
1669                         b->time_next_packet = now;
1670
1671                 if (!sch->q.qlen) {
1672                         if (ktime_before(q->time_next_packet, now)) {
1673                                 q->failsafe_next_packet = now;
1674                                 q->time_next_packet = now;
1675                         } else if (ktime_after(q->time_next_packet, now) &&
1676                                    ktime_after(q->failsafe_next_packet, now)) {
1677                                 u64 next = \
1678                                         min(ktime_to_ns(q->time_next_packet),
1679                                             ktime_to_ns(
1680                                                    q->failsafe_next_packet));
1681                                 sch->qstats.overlimits++;
1682                                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1683                         }
1684                 }
1685         }
1686
1687         if (unlikely(len > b->max_skblen))
1688                 b->max_skblen = len;
1689
1690         if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1691                 struct sk_buff *segs, *nskb;
1692                 netdev_features_t features = netif_skb_features(skb);
1693                 unsigned int slen = 0, numsegs = 0;
1694
1695                 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1696                 if (IS_ERR_OR_NULL(segs))
1697                         return qdisc_drop(skb, sch, to_free);
1698
1699                 while (segs) {
1700                         nskb = segs->next;
1701                         segs->next = NULL;
1702                         qdisc_skb_cb(segs)->pkt_len = segs->len;
1703                         cobalt_set_enqueue_time(segs, now);
1704                         get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1705                                                                           segs);
1706                         flow_queue_add(flow, segs);
1707
1708                         sch->q.qlen++;
1709                         numsegs++;
1710                         slen += segs->len;
1711                         q->buffer_used += segs->truesize;
1712                         b->packets++;
1713                         segs = nskb;
1714                 }
1715
1716                 /* stats */
1717                 b->bytes            += slen;
1718                 b->backlogs[idx]    += slen;
1719                 b->tin_backlog      += slen;
1720                 sch->qstats.backlog += slen;
1721                 q->avg_window_bytes += slen;
1722
1723                 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1724                 consume_skb(skb);
1725         } else {
1726                 /* not splitting */
1727                 cobalt_set_enqueue_time(skb, now);
1728                 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1729                 flow_queue_add(flow, skb);
1730
1731                 if (q->ack_filter)
1732                         ack = cake_ack_filter(q, flow);
1733
1734                 if (ack) {
1735                         b->ack_drops++;
1736                         sch->qstats.drops++;
1737                         b->bytes += qdisc_pkt_len(ack);
1738                         len -= qdisc_pkt_len(ack);
1739                         q->buffer_used += skb->truesize - ack->truesize;
1740                         if (q->rate_flags & CAKE_FLAG_INGRESS)
1741                                 cake_advance_shaper(q, b, ack, now, true);
1742
1743                         qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1744                         consume_skb(ack);
1745                 } else {
1746                         sch->q.qlen++;
1747                         q->buffer_used      += skb->truesize;
1748                 }
1749
1750                 /* stats */
1751                 b->packets++;
1752                 b->bytes            += len;
1753                 b->backlogs[idx]    += len;
1754                 b->tin_backlog      += len;
1755                 sch->qstats.backlog += len;
1756                 q->avg_window_bytes += len;
1757         }
1758
1759         if (q->overflow_timeout)
1760                 cake_heapify_up(q, b->overflow_idx[idx]);
1761
1762         /* incoming bandwidth capacity estimate */
1763         if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1764                 u64 packet_interval = \
1765                         ktime_to_ns(ktime_sub(now, q->last_packet_time));
1766
1767                 if (packet_interval > NSEC_PER_SEC)
1768                         packet_interval = NSEC_PER_SEC;
1769
1770                 /* filter out short-term bursts, eg. wifi aggregation */
1771                 q->avg_packet_interval = \
1772                         cake_ewma(q->avg_packet_interval,
1773                                   packet_interval,
1774                                   (packet_interval > q->avg_packet_interval ?
1775                                           2 : 8));
1776
1777                 q->last_packet_time = now;
1778
1779                 if (packet_interval > q->avg_packet_interval) {
1780                         u64 window_interval = \
1781                                 ktime_to_ns(ktime_sub(now,
1782                                                       q->avg_window_begin));
1783                         u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1784
1785                         b = div64_u64(b, window_interval);
1786                         q->avg_peak_bandwidth =
1787                                 cake_ewma(q->avg_peak_bandwidth, b,
1788                                           b > q->avg_peak_bandwidth ? 2 : 8);
1789                         q->avg_window_bytes = 0;
1790                         q->avg_window_begin = now;
1791
1792                         if (ktime_after(now,
1793                                         ktime_add_ms(q->last_reconfig_time,
1794                                                      250))) {
1795                                 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1796                                 cake_reconfigure(sch);
1797                         }
1798                 }
1799         } else {
1800                 q->avg_window_bytes = 0;
1801                 q->last_packet_time = now;
1802         }
1803
1804         /* flowchain */
1805         if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1806                 struct cake_host *srchost = &b->hosts[flow->srchost];
1807                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1808                 u16 host_load = 1;
1809
1810                 if (!flow->set) {
1811                         list_add_tail(&flow->flowchain, &b->new_flows);
1812                 } else {
1813                         b->decaying_flow_count--;
1814                         list_move_tail(&flow->flowchain, &b->new_flows);
1815                 }
1816                 flow->set = CAKE_SET_SPARSE;
1817                 b->sparse_flow_count++;
1818
1819                 if (cake_dsrc(q->flow_mode))
1820                         host_load = max(host_load, srchost->srchost_refcnt);
1821
1822                 if (cake_ddst(q->flow_mode))
1823                         host_load = max(host_load, dsthost->dsthost_refcnt);
1824
1825                 flow->deficit = (b->flow_quantum *
1826                                  quantum_div[host_load]) >> 16;
1827         } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1828                 /* this flow was empty, accounted as a sparse flow, but actually
1829                  * in the bulk rotation.
1830                  */
1831                 flow->set = CAKE_SET_BULK;
1832                 b->sparse_flow_count--;
1833                 b->bulk_flow_count++;
1834         }
1835
1836         if (q->buffer_used > q->buffer_max_used)
1837                 q->buffer_max_used = q->buffer_used;
1838
1839         if (q->buffer_used > q->buffer_limit) {
1840                 u32 dropped = 0;
1841
1842                 while (q->buffer_used > q->buffer_limit) {
1843                         dropped++;
1844                         cake_drop(sch, to_free);
1845                 }
1846                 b->drop_overlimit += dropped;
1847         }
1848         return NET_XMIT_SUCCESS;
1849 }
1850
1851 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1852 {
1853         struct cake_sched_data *q = qdisc_priv(sch);
1854         struct cake_tin_data *b = &q->tins[q->cur_tin];
1855         struct cake_flow *flow = &b->flows[q->cur_flow];
1856         struct sk_buff *skb = NULL;
1857         u32 len;
1858
1859         if (flow->head) {
1860                 skb = dequeue_head(flow);
1861                 len = qdisc_pkt_len(skb);
1862                 b->backlogs[q->cur_flow] -= len;
1863                 b->tin_backlog           -= len;
1864                 sch->qstats.backlog      -= len;
1865                 q->buffer_used           -= skb->truesize;
1866                 sch->q.qlen--;
1867
1868                 if (q->overflow_timeout)
1869                         cake_heapify(q, b->overflow_idx[q->cur_flow]);
1870         }
1871         return skb;
1872 }
1873
1874 /* Discard leftover packets from a tin no longer in use. */
1875 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1876 {
1877         struct cake_sched_data *q = qdisc_priv(sch);
1878         struct sk_buff *skb;
1879
1880         q->cur_tin = tin;
1881         for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1882                 while (!!(skb = cake_dequeue_one(sch)))
1883                         kfree_skb(skb);
1884 }
1885
1886 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1887 {
1888         struct cake_sched_data *q = qdisc_priv(sch);
1889         struct cake_tin_data *b = &q->tins[q->cur_tin];
1890         struct cake_host *srchost, *dsthost;
1891         ktime_t now = ktime_get();
1892         struct cake_flow *flow;
1893         struct list_head *head;
1894         bool first_flow = true;
1895         struct sk_buff *skb;
1896         u16 host_load;
1897         u64 delay;
1898         u32 len;
1899
1900 begin:
1901         if (!sch->q.qlen)
1902                 return NULL;
1903
1904         /* global hard shaper */
1905         if (ktime_after(q->time_next_packet, now) &&
1906             ktime_after(q->failsafe_next_packet, now)) {
1907                 u64 next = min(ktime_to_ns(q->time_next_packet),
1908                                ktime_to_ns(q->failsafe_next_packet));
1909
1910                 sch->qstats.overlimits++;
1911                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1912                 return NULL;
1913         }
1914
1915         /* Choose a class to work on. */
1916         if (!q->rate_ns) {
1917                 /* In unlimited mode, can't rely on shaper timings, just balance
1918                  * with DRR
1919                  */
1920                 bool wrapped = false, empty = true;
1921
1922                 while (b->tin_deficit < 0 ||
1923                        !(b->sparse_flow_count + b->bulk_flow_count)) {
1924                         if (b->tin_deficit <= 0)
1925                                 b->tin_deficit += b->tin_quantum_band;
1926                         if (b->sparse_flow_count + b->bulk_flow_count)
1927                                 empty = false;
1928
1929                         q->cur_tin++;
1930                         b++;
1931                         if (q->cur_tin >= q->tin_cnt) {
1932                                 q->cur_tin = 0;
1933                                 b = q->tins;
1934
1935                                 if (wrapped) {
1936                                         /* It's possible for q->qlen to be
1937                                          * nonzero when we actually have no
1938                                          * packets anywhere.
1939                                          */
1940                                         if (empty)
1941                                                 return NULL;
1942                                 } else {
1943                                         wrapped = true;
1944                                 }
1945                         }
1946                 }
1947         } else {
1948                 /* In shaped mode, choose:
1949                  * - Highest-priority tin with queue and meeting schedule, or
1950                  * - The earliest-scheduled tin with queue.
1951                  */
1952                 ktime_t best_time = KTIME_MAX;
1953                 int tin, best_tin = 0;
1954
1955                 for (tin = 0; tin < q->tin_cnt; tin++) {
1956                         b = q->tins + tin;
1957                         if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1958                                 ktime_t time_to_pkt = \
1959                                         ktime_sub(b->time_next_packet, now);
1960
1961                                 if (ktime_to_ns(time_to_pkt) <= 0 ||
1962                                     ktime_compare(time_to_pkt,
1963                                                   best_time) <= 0) {
1964                                         best_time = time_to_pkt;
1965                                         best_tin = tin;
1966                                 }
1967                         }
1968                 }
1969
1970                 q->cur_tin = best_tin;
1971                 b = q->tins + best_tin;
1972
1973                 /* No point in going further if no packets to deliver. */
1974                 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1975                         return NULL;
1976         }
1977
1978 retry:
1979         /* service this class */
1980         head = &b->decaying_flows;
1981         if (!first_flow || list_empty(head)) {
1982                 head = &b->new_flows;
1983                 if (list_empty(head)) {
1984                         head = &b->old_flows;
1985                         if (unlikely(list_empty(head))) {
1986                                 head = &b->decaying_flows;
1987                                 if (unlikely(list_empty(head)))
1988                                         goto begin;
1989                         }
1990                 }
1991         }
1992         flow = list_first_entry(head, struct cake_flow, flowchain);
1993         q->cur_flow = flow - b->flows;
1994         first_flow = false;
1995
1996         /* triple isolation (modified DRR++) */
1997         srchost = &b->hosts[flow->srchost];
1998         dsthost = &b->hosts[flow->dsthost];
1999         host_load = 1;
2000
2001         if (cake_dsrc(q->flow_mode))
2002                 host_load = max(host_load, srchost->srchost_refcnt);
2003
2004         if (cake_ddst(q->flow_mode))
2005                 host_load = max(host_load, dsthost->dsthost_refcnt);
2006
2007         WARN_ON(host_load > CAKE_QUEUES);
2008
2009         /* flow isolation (DRR++) */
2010         if (flow->deficit <= 0) {
2011                 /* The shifted prandom_u32() is a way to apply dithering to
2012                  * avoid accumulating roundoff errors
2013                  */
2014                 flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2015                                   (prandom_u32() >> 16)) >> 16;
2016                 list_move_tail(&flow->flowchain, &b->old_flows);
2017
2018                 /* Keep all flows with deficits out of the sparse and decaying
2019                  * rotations.  No non-empty flow can go into the decaying
2020                  * rotation, so they can't get deficits
2021                  */
2022                 if (flow->set == CAKE_SET_SPARSE) {
2023                         if (flow->head) {
2024                                 b->sparse_flow_count--;
2025                                 b->bulk_flow_count++;
2026                                 flow->set = CAKE_SET_BULK;
2027                         } else {
2028                                 /* we've moved it to the bulk rotation for
2029                                  * correct deficit accounting but we still want
2030                                  * to count it as a sparse flow, not a bulk one.
2031                                  */
2032                                 flow->set = CAKE_SET_SPARSE_WAIT;
2033                         }
2034                 }
2035                 goto retry;
2036         }
2037
2038         /* Retrieve a packet via the AQM */
2039         while (1) {
2040                 skb = cake_dequeue_one(sch);
2041                 if (!skb) {
2042                         /* this queue was actually empty */
2043                         if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2044                                 b->unresponsive_flow_count--;
2045
2046                         if (flow->cvars.p_drop || flow->cvars.count ||
2047                             ktime_before(now, flow->cvars.drop_next)) {
2048                                 /* keep in the flowchain until the state has
2049                                  * decayed to rest
2050                                  */
2051                                 list_move_tail(&flow->flowchain,
2052                                                &b->decaying_flows);
2053                                 if (flow->set == CAKE_SET_BULK) {
2054                                         b->bulk_flow_count--;
2055                                         b->decaying_flow_count++;
2056                                 } else if (flow->set == CAKE_SET_SPARSE ||
2057                                            flow->set == CAKE_SET_SPARSE_WAIT) {
2058                                         b->sparse_flow_count--;
2059                                         b->decaying_flow_count++;
2060                                 }
2061                                 flow->set = CAKE_SET_DECAYING;
2062                         } else {
2063                                 /* remove empty queue from the flowchain */
2064                                 list_del_init(&flow->flowchain);
2065                                 if (flow->set == CAKE_SET_SPARSE ||
2066                                     flow->set == CAKE_SET_SPARSE_WAIT)
2067                                         b->sparse_flow_count--;
2068                                 else if (flow->set == CAKE_SET_BULK)
2069                                         b->bulk_flow_count--;
2070                                 else
2071                                         b->decaying_flow_count--;
2072
2073                                 flow->set = CAKE_SET_NONE;
2074                                 srchost->srchost_refcnt--;
2075                                 dsthost->dsthost_refcnt--;
2076                         }
2077                         goto begin;
2078                 }
2079
2080                 /* Last packet in queue may be marked, shouldn't be dropped */
2081                 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2082                                         (b->bulk_flow_count *
2083                                          !!(q->rate_flags &
2084                                             CAKE_FLAG_INGRESS))) ||
2085                     !flow->head)
2086                         break;
2087
2088                 /* drop this packet, get another one */
2089                 if (q->rate_flags & CAKE_FLAG_INGRESS) {
2090                         len = cake_advance_shaper(q, b, skb,
2091                                                   now, true);
2092                         flow->deficit -= len;
2093                         b->tin_deficit -= len;
2094                 }
2095                 flow->dropped++;
2096                 b->tin_dropped++;
2097                 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2098                 qdisc_qstats_drop(sch);
2099                 kfree_skb(skb);
2100                 if (q->rate_flags & CAKE_FLAG_INGRESS)
2101                         goto retry;
2102         }
2103
2104         b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2105         qdisc_bstats_update(sch, skb);
2106
2107         /* collect delay stats */
2108         delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2109         b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2110         b->peak_delay = cake_ewma(b->peak_delay, delay,
2111                                   delay > b->peak_delay ? 2 : 8);
2112         b->base_delay = cake_ewma(b->base_delay, delay,
2113                                   delay < b->base_delay ? 2 : 8);
2114
2115         len = cake_advance_shaper(q, b, skb, now, false);
2116         flow->deficit -= len;
2117         b->tin_deficit -= len;
2118
2119         if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2120                 u64 next = min(ktime_to_ns(q->time_next_packet),
2121                                ktime_to_ns(q->failsafe_next_packet));
2122
2123                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2124         } else if (!sch->q.qlen) {
2125                 int i;
2126
2127                 for (i = 0; i < q->tin_cnt; i++) {
2128                         if (q->tins[i].decaying_flow_count) {
2129                                 ktime_t next = \
2130                                         ktime_add_ns(now,
2131                                                      q->tins[i].cparams.target);
2132
2133                                 qdisc_watchdog_schedule_ns(&q->watchdog,
2134                                                            ktime_to_ns(next));
2135                                 break;
2136                         }
2137                 }
2138         }
2139
2140         if (q->overflow_timeout)
2141                 q->overflow_timeout--;
2142
2143         return skb;
2144 }
2145
2146 static void cake_reset(struct Qdisc *sch)
2147 {
2148         u32 c;
2149
2150         for (c = 0; c < CAKE_MAX_TINS; c++)
2151                 cake_clear_tin(sch, c);
2152 }
2153
2154 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2155         [TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2156         [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2157         [TCA_CAKE_ATM]           = { .type = NLA_U32 },
2158         [TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2159         [TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2160         [TCA_CAKE_RTT]           = { .type = NLA_U32 },
2161         [TCA_CAKE_TARGET]        = { .type = NLA_U32 },
2162         [TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2163         [TCA_CAKE_MEMORY]        = { .type = NLA_U32 },
2164         [TCA_CAKE_NAT]           = { .type = NLA_U32 },
2165         [TCA_CAKE_RAW]           = { .type = NLA_U32 },
2166         [TCA_CAKE_WASH]          = { .type = NLA_U32 },
2167         [TCA_CAKE_MPU]           = { .type = NLA_U32 },
2168         [TCA_CAKE_INGRESS]       = { .type = NLA_U32 },
2169         [TCA_CAKE_ACK_FILTER]    = { .type = NLA_U32 },
2170 };
2171
2172 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2173                           u64 target_ns, u64 rtt_est_ns)
2174 {
2175         /* convert byte-rate into time-per-byte
2176          * so it will always unwedge in reasonable time.
2177          */
2178         static const u64 MIN_RATE = 64;
2179         u32 byte_target = mtu;
2180         u64 byte_target_ns;
2181         u8  rate_shft = 0;
2182         u64 rate_ns = 0;
2183
2184         b->flow_quantum = 1514;
2185         if (rate) {
2186                 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2187                 rate_shft = 34;
2188                 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2189                 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2190                 while (!!(rate_ns >> 34)) {
2191                         rate_ns >>= 1;
2192                         rate_shft--;
2193                 }
2194         } /* else unlimited, ie. zero delay */
2195
2196         b->tin_rate_bps  = rate;
2197         b->tin_rate_ns   = rate_ns;
2198         b->tin_rate_shft = rate_shft;
2199
2200         byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2201
2202         b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2203         b->cparams.interval = max(rtt_est_ns +
2204                                      b->cparams.target - target_ns,
2205                                      b->cparams.target * 2);
2206         b->cparams.mtu_time = byte_target_ns;
2207         b->cparams.p_inc = 1 << 24; /* 1/256 */
2208         b->cparams.p_dec = 1 << 20; /* 1/4096 */
2209 }
2210
2211 static int cake_config_besteffort(struct Qdisc *sch)
2212 {
2213         struct cake_sched_data *q = qdisc_priv(sch);
2214         struct cake_tin_data *b = &q->tins[0];
2215         u32 mtu = psched_mtu(qdisc_dev(sch));
2216         u64 rate = q->rate_bps;
2217
2218         q->tin_cnt = 1;
2219
2220         q->tin_index = besteffort;
2221         q->tin_order = normal_order;
2222
2223         cake_set_rate(b, rate, mtu,
2224                       us_to_ns(q->target), us_to_ns(q->interval));
2225         b->tin_quantum_band = 65535;
2226         b->tin_quantum_prio = 65535;
2227
2228         return 0;
2229 }
2230
2231 static int cake_config_precedence(struct Qdisc *sch)
2232 {
2233         /* convert high-level (user visible) parameters into internal format */
2234         struct cake_sched_data *q = qdisc_priv(sch);
2235         u32 mtu = psched_mtu(qdisc_dev(sch));
2236         u64 rate = q->rate_bps;
2237         u32 quantum1 = 256;
2238         u32 quantum2 = 256;
2239         u32 i;
2240
2241         q->tin_cnt = 8;
2242         q->tin_index = precedence;
2243         q->tin_order = normal_order;
2244
2245         for (i = 0; i < q->tin_cnt; i++) {
2246                 struct cake_tin_data *b = &q->tins[i];
2247
2248                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2249                               us_to_ns(q->interval));
2250
2251                 b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2252                 b->tin_quantum_band = max_t(u16, 1U, quantum2);
2253
2254                 /* calculate next class's parameters */
2255                 rate  *= 7;
2256                 rate >>= 3;
2257
2258                 quantum1  *= 3;
2259                 quantum1 >>= 1;
2260
2261                 quantum2  *= 7;
2262                 quantum2 >>= 3;
2263         }
2264
2265         return 0;
2266 }
2267
2268 /*      List of known Diffserv codepoints:
2269  *
2270  *      Least Effort (CS1)
2271  *      Best Effort (CS0)
2272  *      Max Reliability & LLT "Lo" (TOS1)
2273  *      Max Throughput (TOS2)
2274  *      Min Delay (TOS4)
2275  *      LLT "La" (TOS5)
2276  *      Assured Forwarding 1 (AF1x) - x3
2277  *      Assured Forwarding 2 (AF2x) - x3
2278  *      Assured Forwarding 3 (AF3x) - x3
2279  *      Assured Forwarding 4 (AF4x) - x3
2280  *      Precedence Class 2 (CS2)
2281  *      Precedence Class 3 (CS3)
2282  *      Precedence Class 4 (CS4)
2283  *      Precedence Class 5 (CS5)
2284  *      Precedence Class 6 (CS6)
2285  *      Precedence Class 7 (CS7)
2286  *      Voice Admit (VA)
2287  *      Expedited Forwarding (EF)
2288
2289  *      Total 25 codepoints.
2290  */
2291
2292 /*      List of traffic classes in RFC 4594:
2293  *              (roughly descending order of contended priority)
2294  *              (roughly ascending order of uncontended throughput)
2295  *
2296  *      Network Control (CS6,CS7)      - routing traffic
2297  *      Telephony (EF,VA)         - aka. VoIP streams
2298  *      Signalling (CS5)               - VoIP setup
2299  *      Multimedia Conferencing (AF4x) - aka. video calls
2300  *      Realtime Interactive (CS4)     - eg. games
2301  *      Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2302  *      Broadcast Video (CS3)
2303  *      Low Latency Data (AF2x,TOS4)      - eg. database
2304  *      Ops, Admin, Management (CS2,TOS1) - eg. ssh
2305  *      Standard Service (CS0 & unrecognised codepoints)
2306  *      High Throughput Data (AF1x,TOS2)  - eg. web traffic
2307  *      Low Priority Data (CS1)           - eg. BitTorrent
2308
2309  *      Total 12 traffic classes.
2310  */
2311
2312 static int cake_config_diffserv8(struct Qdisc *sch)
2313 {
2314 /*      Pruned list of traffic classes for typical applications:
2315  *
2316  *              Network Control          (CS6, CS7)
2317  *              Minimum Latency          (EF, VA, CS5, CS4)
2318  *              Interactive Shell        (CS2, TOS1)
2319  *              Low Latency Transactions (AF2x, TOS4)
2320  *              Video Streaming          (AF4x, AF3x, CS3)
2321  *              Bog Standard             (CS0 etc.)
2322  *              High Throughput          (AF1x, TOS2)
2323  *              Background Traffic       (CS1)
2324  *
2325  *              Total 8 traffic classes.
2326  */
2327
2328         struct cake_sched_data *q = qdisc_priv(sch);
2329         u32 mtu = psched_mtu(qdisc_dev(sch));
2330         u64 rate = q->rate_bps;
2331         u32 quantum1 = 256;
2332         u32 quantum2 = 256;
2333         u32 i;
2334
2335         q->tin_cnt = 8;
2336
2337         /* codepoint to class mapping */
2338         q->tin_index = diffserv8;
2339         q->tin_order = normal_order;
2340
2341         /* class characteristics */
2342         for (i = 0; i < q->tin_cnt; i++) {
2343                 struct cake_tin_data *b = &q->tins[i];
2344
2345                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2346                               us_to_ns(q->interval));
2347
2348                 b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2349                 b->tin_quantum_band = max_t(u16, 1U, quantum2);
2350
2351                 /* calculate next class's parameters */
2352                 rate  *= 7;
2353                 rate >>= 3;
2354
2355                 quantum1  *= 3;
2356                 quantum1 >>= 1;
2357
2358                 quantum2  *= 7;
2359                 quantum2 >>= 3;
2360         }
2361
2362         return 0;
2363 }
2364
2365 static int cake_config_diffserv4(struct Qdisc *sch)
2366 {
2367 /*  Further pruned list of traffic classes for four-class system:
2368  *
2369  *          Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2370  *          Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2371  *          Best Effort        (CS0, AF1x, TOS2, and those not specified)
2372  *          Background Traffic (CS1)
2373  *
2374  *              Total 4 traffic classes.
2375  */
2376
2377         struct cake_sched_data *q = qdisc_priv(sch);
2378         u32 mtu = psched_mtu(qdisc_dev(sch));
2379         u64 rate = q->rate_bps;
2380         u32 quantum = 1024;
2381
2382         q->tin_cnt = 4;
2383
2384         /* codepoint to class mapping */
2385         q->tin_index = diffserv4;
2386         q->tin_order = bulk_order;
2387
2388         /* class characteristics */
2389         cake_set_rate(&q->tins[0], rate, mtu,
2390                       us_to_ns(q->target), us_to_ns(q->interval));
2391         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2392                       us_to_ns(q->target), us_to_ns(q->interval));
2393         cake_set_rate(&q->tins[2], rate >> 1, mtu,
2394                       us_to_ns(q->target), us_to_ns(q->interval));
2395         cake_set_rate(&q->tins[3], rate >> 2, mtu,
2396                       us_to_ns(q->target), us_to_ns(q->interval));
2397
2398         /* priority weights */
2399         q->tins[0].tin_quantum_prio = quantum;
2400         q->tins[1].tin_quantum_prio = quantum >> 4;
2401         q->tins[2].tin_quantum_prio = quantum << 2;
2402         q->tins[3].tin_quantum_prio = quantum << 4;
2403
2404         /* bandwidth-sharing weights */
2405         q->tins[0].tin_quantum_band = quantum;
2406         q->tins[1].tin_quantum_band = quantum >> 4;
2407         q->tins[2].tin_quantum_band = quantum >> 1;
2408         q->tins[3].tin_quantum_band = quantum >> 2;
2409
2410         return 0;
2411 }
2412
2413 static int cake_config_diffserv3(struct Qdisc *sch)
2414 {
2415 /*  Simplified Diffserv structure with 3 tins.
2416  *              Low Priority            (CS1)
2417  *              Best Effort
2418  *              Latency Sensitive       (TOS4, VA, EF, CS6, CS7)
2419  */
2420         struct cake_sched_data *q = qdisc_priv(sch);
2421         u32 mtu = psched_mtu(qdisc_dev(sch));
2422         u64 rate = q->rate_bps;
2423         u32 quantum = 1024;
2424
2425         q->tin_cnt = 3;
2426
2427         /* codepoint to class mapping */
2428         q->tin_index = diffserv3;
2429         q->tin_order = bulk_order;
2430
2431         /* class characteristics */
2432         cake_set_rate(&q->tins[0], rate, mtu,
2433                       us_to_ns(q->target), us_to_ns(q->interval));
2434         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2435                       us_to_ns(q->target), us_to_ns(q->interval));
2436         cake_set_rate(&q->tins[2], rate >> 2, mtu,
2437                       us_to_ns(q->target), us_to_ns(q->interval));
2438
2439         /* priority weights */
2440         q->tins[0].tin_quantum_prio = quantum;
2441         q->tins[1].tin_quantum_prio = quantum >> 4;
2442         q->tins[2].tin_quantum_prio = quantum << 4;
2443
2444         /* bandwidth-sharing weights */
2445         q->tins[0].tin_quantum_band = quantum;
2446         q->tins[1].tin_quantum_band = quantum >> 4;
2447         q->tins[2].tin_quantum_band = quantum >> 2;
2448
2449         return 0;
2450 }
2451
2452 static void cake_reconfigure(struct Qdisc *sch)
2453 {
2454         struct cake_sched_data *q = qdisc_priv(sch);
2455         int c, ft;
2456
2457         switch (q->tin_mode) {
2458         case CAKE_DIFFSERV_BESTEFFORT:
2459                 ft = cake_config_besteffort(sch);
2460                 break;
2461
2462         case CAKE_DIFFSERV_PRECEDENCE:
2463                 ft = cake_config_precedence(sch);
2464                 break;
2465
2466         case CAKE_DIFFSERV_DIFFSERV8:
2467                 ft = cake_config_diffserv8(sch);
2468                 break;
2469
2470         case CAKE_DIFFSERV_DIFFSERV4:
2471                 ft = cake_config_diffserv4(sch);
2472                 break;
2473
2474         case CAKE_DIFFSERV_DIFFSERV3:
2475         default:
2476                 ft = cake_config_diffserv3(sch);
2477                 break;
2478         }
2479
2480         for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2481                 cake_clear_tin(sch, c);
2482                 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2483         }
2484
2485         q->rate_ns   = q->tins[ft].tin_rate_ns;
2486         q->rate_shft = q->tins[ft].tin_rate_shft;
2487
2488         if (q->buffer_config_limit) {
2489                 q->buffer_limit = q->buffer_config_limit;
2490         } else if (q->rate_bps) {
2491                 u64 t = q->rate_bps * q->interval;
2492
2493                 do_div(t, USEC_PER_SEC / 4);
2494                 q->buffer_limit = max_t(u32, t, 4U << 20);
2495         } else {
2496                 q->buffer_limit = ~0;
2497         }
2498
2499         sch->flags &= ~TCQ_F_CAN_BYPASS;
2500
2501         q->buffer_limit = min(q->buffer_limit,
2502                               max(sch->limit * psched_mtu(qdisc_dev(sch)),
2503                                   q->buffer_config_limit));
2504 }
2505
2506 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2507                        struct netlink_ext_ack *extack)
2508 {
2509         struct cake_sched_data *q = qdisc_priv(sch);
2510         struct nlattr *tb[TCA_CAKE_MAX + 1];
2511         int err;
2512
2513         if (!opt)
2514                 return -EINVAL;
2515
2516         err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
2517         if (err < 0)
2518                 return err;
2519
2520         if (tb[TCA_CAKE_NAT]) {
2521 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2522                 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2523                 q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2524                         !!nla_get_u32(tb[TCA_CAKE_NAT]);
2525 #else
2526                 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2527                                     "No conntrack support in kernel");
2528                 return -EOPNOTSUPP;
2529 #endif
2530         }
2531
2532         if (tb[TCA_CAKE_BASE_RATE64])
2533                 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2534
2535         if (tb[TCA_CAKE_DIFFSERV_MODE])
2536                 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2537
2538         if (tb[TCA_CAKE_WASH]) {
2539                 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2540                         q->rate_flags |= CAKE_FLAG_WASH;
2541                 else
2542                         q->rate_flags &= ~CAKE_FLAG_WASH;
2543         }
2544
2545         if (tb[TCA_CAKE_FLOW_MODE])
2546                 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2547                                 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2548                                         CAKE_FLOW_MASK));
2549
2550         if (tb[TCA_CAKE_ATM])
2551                 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2552
2553         if (tb[TCA_CAKE_OVERHEAD]) {
2554                 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2555                 q->rate_flags |= CAKE_FLAG_OVERHEAD;
2556
2557                 q->max_netlen = 0;
2558                 q->max_adjlen = 0;
2559                 q->min_netlen = ~0;
2560                 q->min_adjlen = ~0;
2561         }
2562
2563         if (tb[TCA_CAKE_RAW]) {
2564                 q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2565
2566                 q->max_netlen = 0;
2567                 q->max_adjlen = 0;
2568                 q->min_netlen = ~0;
2569                 q->min_adjlen = ~0;
2570         }
2571
2572         if (tb[TCA_CAKE_MPU])
2573                 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2574
2575         if (tb[TCA_CAKE_RTT]) {
2576                 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2577
2578                 if (!q->interval)
2579                         q->interval = 1;
2580         }
2581
2582         if (tb[TCA_CAKE_TARGET]) {
2583                 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2584
2585                 if (!q->target)
2586                         q->target = 1;
2587         }
2588
2589         if (tb[TCA_CAKE_AUTORATE]) {
2590                 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2591                         q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2592                 else
2593                         q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2594         }
2595
2596         if (tb[TCA_CAKE_INGRESS]) {
2597                 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2598                         q->rate_flags |= CAKE_FLAG_INGRESS;
2599                 else
2600                         q->rate_flags &= ~CAKE_FLAG_INGRESS;
2601         }
2602
2603         if (tb[TCA_CAKE_ACK_FILTER])
2604                 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2605
2606         if (tb[TCA_CAKE_MEMORY])
2607                 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2608
2609         if (tb[TCA_CAKE_SPLIT_GSO]) {
2610                 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2611                         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2612                 else
2613                         q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2614         }
2615
2616         if (q->tins) {
2617                 sch_tree_lock(sch);
2618                 cake_reconfigure(sch);
2619                 sch_tree_unlock(sch);
2620         }
2621
2622         return 0;
2623 }
2624
2625 static void cake_destroy(struct Qdisc *sch)
2626 {
2627         struct cake_sched_data *q = qdisc_priv(sch);
2628
2629         qdisc_watchdog_cancel(&q->watchdog);
2630         tcf_block_put(q->block);
2631         kvfree(q->tins);
2632 }
2633
2634 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2635                      struct netlink_ext_ack *extack)
2636 {
2637         struct cake_sched_data *q = qdisc_priv(sch);
2638         int i, j, err;
2639
2640         sch->limit = 10240;
2641         q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2642         q->flow_mode  = CAKE_FLOW_TRIPLE;
2643
2644         q->rate_bps = 0; /* unlimited by default */
2645
2646         q->interval = 100000; /* 100ms default */
2647         q->target   =   5000; /* 5ms: codel RFC argues
2648                                * for 5 to 10% of interval
2649                                */
2650         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2651         q->cur_tin = 0;
2652         q->cur_flow  = 0;
2653
2654         qdisc_watchdog_init(&q->watchdog, sch);
2655
2656         if (opt) {
2657                 err = cake_change(sch, opt, extack);
2658
2659                 if (err)
2660                         return err;
2661         }
2662
2663         err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2664         if (err)
2665                 return err;
2666
2667         quantum_div[0] = ~0;
2668         for (i = 1; i <= CAKE_QUEUES; i++)
2669                 quantum_div[i] = 65535 / i;
2670
2671         q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2672                            GFP_KERNEL);
2673         if (!q->tins)
2674                 goto nomem;
2675
2676         for (i = 0; i < CAKE_MAX_TINS; i++) {
2677                 struct cake_tin_data *b = q->tins + i;
2678
2679                 INIT_LIST_HEAD(&b->new_flows);
2680                 INIT_LIST_HEAD(&b->old_flows);
2681                 INIT_LIST_HEAD(&b->decaying_flows);
2682                 b->sparse_flow_count = 0;
2683                 b->bulk_flow_count = 0;
2684                 b->decaying_flow_count = 0;
2685
2686                 for (j = 0; j < CAKE_QUEUES; j++) {
2687                         struct cake_flow *flow = b->flows + j;
2688                         u32 k = j * CAKE_MAX_TINS + i;
2689
2690                         INIT_LIST_HEAD(&flow->flowchain);
2691                         cobalt_vars_init(&flow->cvars);
2692
2693                         q->overflow_heap[k].t = i;
2694                         q->overflow_heap[k].b = j;
2695                         b->overflow_idx[j] = k;
2696                 }
2697         }
2698
2699         cake_reconfigure(sch);
2700         q->avg_peak_bandwidth = q->rate_bps;
2701         q->min_netlen = ~0;
2702         q->min_adjlen = ~0;
2703         return 0;
2704
2705 nomem:
2706         cake_destroy(sch);
2707         return -ENOMEM;
2708 }
2709
2710 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2711 {
2712         struct cake_sched_data *q = qdisc_priv(sch);
2713         struct nlattr *opts;
2714
2715         opts = nla_nest_start(skb, TCA_OPTIONS);
2716         if (!opts)
2717                 goto nla_put_failure;
2718
2719         if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2720                               TCA_CAKE_PAD))
2721                 goto nla_put_failure;
2722
2723         if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2724                         q->flow_mode & CAKE_FLOW_MASK))
2725                 goto nla_put_failure;
2726
2727         if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2728                 goto nla_put_failure;
2729
2730         if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2731                 goto nla_put_failure;
2732
2733         if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2734                 goto nla_put_failure;
2735
2736         if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2737                         !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2738                 goto nla_put_failure;
2739
2740         if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2741                         !!(q->rate_flags & CAKE_FLAG_INGRESS)))
2742                 goto nla_put_failure;
2743
2744         if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2745                 goto nla_put_failure;
2746
2747         if (nla_put_u32(skb, TCA_CAKE_NAT,
2748                         !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2749                 goto nla_put_failure;
2750
2751         if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2752                 goto nla_put_failure;
2753
2754         if (nla_put_u32(skb, TCA_CAKE_WASH,
2755                         !!(q->rate_flags & CAKE_FLAG_WASH)))
2756                 goto nla_put_failure;
2757
2758         if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2759                 goto nla_put_failure;
2760
2761         if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2762                 if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2763                         goto nla_put_failure;
2764
2765         if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2766                 goto nla_put_failure;
2767
2768         if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2769                 goto nla_put_failure;
2770
2771         if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2772                         !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2773                 goto nla_put_failure;
2774
2775         return nla_nest_end(skb, opts);
2776
2777 nla_put_failure:
2778         return -1;
2779 }
2780
2781 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2782 {
2783         struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
2784         struct cake_sched_data *q = qdisc_priv(sch);
2785         struct nlattr *tstats, *ts;
2786         int i;
2787
2788         if (!stats)
2789                 return -1;
2790
2791 #define PUT_STAT_U32(attr, data) do {                                  \
2792                 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2793                         goto nla_put_failure;                          \
2794         } while (0)
2795 #define PUT_STAT_U64(attr, data) do {                                  \
2796                 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2797                                         data, TCA_CAKE_STATS_PAD)) \
2798                         goto nla_put_failure;                          \
2799         } while (0)
2800
2801         PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2802         PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2803         PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2804         PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2805         PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2806         PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2807         PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2808         PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2809
2810 #undef PUT_STAT_U32
2811 #undef PUT_STAT_U64
2812
2813         tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
2814         if (!tstats)
2815                 goto nla_put_failure;
2816
2817 #define PUT_TSTAT_U32(attr, data) do {                                  \
2818                 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2819                         goto nla_put_failure;                           \
2820         } while (0)
2821 #define PUT_TSTAT_U64(attr, data) do {                                  \
2822                 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2823                                         data, TCA_CAKE_TIN_STATS_PAD))  \
2824                         goto nla_put_failure;                           \
2825         } while (0)
2826
2827         for (i = 0; i < q->tin_cnt; i++) {
2828                 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2829
2830                 ts = nla_nest_start(d->skb, i + 1);
2831                 if (!ts)
2832                         goto nla_put_failure;
2833
2834                 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2835                 PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2836                 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2837
2838                 PUT_TSTAT_U32(TARGET_US,
2839                               ktime_to_us(ns_to_ktime(b->cparams.target)));
2840                 PUT_TSTAT_U32(INTERVAL_US,
2841                               ktime_to_us(ns_to_ktime(b->cparams.interval)));
2842
2843                 PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2844                 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2845                 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2846                 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2847
2848                 PUT_TSTAT_U32(PEAK_DELAY_US,
2849                               ktime_to_us(ns_to_ktime(b->peak_delay)));
2850                 PUT_TSTAT_U32(AVG_DELAY_US,
2851                               ktime_to_us(ns_to_ktime(b->avge_delay)));
2852                 PUT_TSTAT_U32(BASE_DELAY_US,
2853                               ktime_to_us(ns_to_ktime(b->base_delay)));
2854
2855                 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2856                 PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2857                 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2858
2859                 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2860                                             b->decaying_flow_count);
2861                 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2862                 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2863                 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2864
2865                 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2866                 nla_nest_end(d->skb, ts);
2867         }
2868
2869 #undef PUT_TSTAT_U32
2870 #undef PUT_TSTAT_U64
2871
2872         nla_nest_end(d->skb, tstats);
2873         return nla_nest_end(d->skb, stats);
2874
2875 nla_put_failure:
2876         nla_nest_cancel(d->skb, stats);
2877         return -1;
2878 }
2879
2880 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2881 {
2882         return NULL;
2883 }
2884
2885 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2886 {
2887         return 0;
2888 }
2889
2890 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2891                                u32 classid)
2892 {
2893         return 0;
2894 }
2895
2896 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2897 {
2898 }
2899
2900 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2901                                         struct netlink_ext_ack *extack)
2902 {
2903         struct cake_sched_data *q = qdisc_priv(sch);
2904
2905         if (cl)
2906                 return NULL;
2907         return q->block;
2908 }
2909
2910 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2911                            struct sk_buff *skb, struct tcmsg *tcm)
2912 {
2913         tcm->tcm_handle |= TC_H_MIN(cl);
2914         return 0;
2915 }
2916
2917 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2918                                  struct gnet_dump *d)
2919 {
2920         struct cake_sched_data *q = qdisc_priv(sch);
2921         const struct cake_flow *flow = NULL;
2922         struct gnet_stats_queue qs = { 0 };
2923         struct nlattr *stats;
2924         u32 idx = cl - 1;
2925
2926         if (idx < CAKE_QUEUES * q->tin_cnt) {
2927                 const struct cake_tin_data *b = \
2928                         &q->tins[q->tin_order[idx / CAKE_QUEUES]];
2929                 const struct sk_buff *skb;
2930
2931                 flow = &b->flows[idx % CAKE_QUEUES];
2932
2933                 if (flow->head) {
2934                         sch_tree_lock(sch);
2935                         skb = flow->head;
2936                         while (skb) {
2937                                 qs.qlen++;
2938                                 skb = skb->next;
2939                         }
2940                         sch_tree_unlock(sch);
2941                 }
2942                 qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2943                 qs.drops = flow->dropped;
2944         }
2945         if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2946                 return -1;
2947         if (flow) {
2948                 ktime_t now = ktime_get();
2949
2950                 stats = nla_nest_start(d->skb, TCA_STATS_APP);
2951                 if (!stats)
2952                         return -1;
2953
2954 #define PUT_STAT_U32(attr, data) do {                                  \
2955                 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2956                         goto nla_put_failure;                          \
2957         } while (0)
2958 #define PUT_STAT_S32(attr, data) do {                                  \
2959                 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2960                         goto nla_put_failure;                          \
2961         } while (0)
2962
2963                 PUT_STAT_S32(DEFICIT, flow->deficit);
2964                 PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2965                 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2966                 PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2967                 if (flow->cvars.p_drop) {
2968                         PUT_STAT_S32(BLUE_TIMER_US,
2969                                      ktime_to_us(
2970                                              ktime_sub(now,
2971                                                        flow->cvars.blue_timer)));
2972                 }
2973                 if (flow->cvars.dropping) {
2974                         PUT_STAT_S32(DROP_NEXT_US,
2975                                      ktime_to_us(
2976                                              ktime_sub(now,
2977                                                        flow->cvars.drop_next)));
2978                 }
2979
2980                 if (nla_nest_end(d->skb, stats) < 0)
2981                         return -1;
2982         }
2983
2984         return 0;
2985
2986 nla_put_failure:
2987         nla_nest_cancel(d->skb, stats);
2988         return -1;
2989 }
2990
2991 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2992 {
2993         struct cake_sched_data *q = qdisc_priv(sch);
2994         unsigned int i, j;
2995
2996         if (arg->stop)
2997                 return;
2998
2999         for (i = 0; i < q->tin_cnt; i++) {
3000                 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3001
3002                 for (j = 0; j < CAKE_QUEUES; j++) {
3003                         if (list_empty(&b->flows[j].flowchain) ||
3004                             arg->count < arg->skip) {
3005                                 arg->count++;
3006                                 continue;
3007                         }
3008                         if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
3009                                 arg->stop = 1;
3010                                 break;
3011                         }
3012                         arg->count++;
3013                 }
3014         }
3015 }
3016
3017 static const struct Qdisc_class_ops cake_class_ops = {
3018         .leaf           =       cake_leaf,
3019         .find           =       cake_find,
3020         .tcf_block      =       cake_tcf_block,
3021         .bind_tcf       =       cake_bind,
3022         .unbind_tcf     =       cake_unbind,
3023         .dump           =       cake_dump_class,
3024         .dump_stats     =       cake_dump_class_stats,
3025         .walk           =       cake_walk,
3026 };
3027
3028 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3029         .cl_ops         =       &cake_class_ops,
3030         .id             =       "cake",
3031         .priv_size      =       sizeof(struct cake_sched_data),
3032         .enqueue        =       cake_enqueue,
3033         .dequeue        =       cake_dequeue,
3034         .peek           =       qdisc_peek_dequeued,
3035         .init           =       cake_init,
3036         .reset          =       cake_reset,
3037         .destroy        =       cake_destroy,
3038         .change         =       cake_change,
3039         .dump           =       cake_dump,
3040         .dump_stats     =       cake_dump_stats,
3041         .owner          =       THIS_MODULE,
3042 };
3043
3044 static int __init cake_module_init(void)
3045 {
3046         return register_qdisc(&cake_qdisc_ops);
3047 }
3048
3049 static void __exit cake_module_exit(void)
3050 {
3051         unregister_qdisc(&cake_qdisc_ops);
3052 }
3053
3054 module_init(cake_module_init)
3055 module_exit(cake_module_exit)
3056 MODULE_AUTHOR("Jonathan Morton");
3057 MODULE_LICENSE("Dual BSD/GPL");
3058 MODULE_DESCRIPTION("The CAKE shaper.");