net/sched/sch_cake.c

   1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   2
   3 /* COMMON Applications Kept Enhanced (CAKE) discipline
   4  *
   5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
   6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
   7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
   8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
   9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
  10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
  11  *
  12  * The CAKE Principles:
  13  *                 (or, how to have your cake and eat it too)
  14  *
  15  * This is a combination of several shaping, AQM and FQ techniques into one
  16  * easy-to-use package:
  17  *
  18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
  19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
  20  *   eliminating the need for any sort of burst parameter (eg. token bucket
  21  *   depth).  Burst support is limited to that necessary to overcome scheduling
  22  *   latency.
  23  *
  24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
  25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
  26  *   the priority is reduced to avoid starving other tins.
  27  *
  28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
  29  *   flows from each other.  This prevents a burst on one flow from increasing
  30  *   the delay to another.  Flows are distributed to queues using a
  31  *   set-associative hash function.
  32  *
  33  * - Each queue is actively managed by Cobalt, which is a combination of the
  34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
  35  *   congestion early via ECN (if available) and/or packet drops, to keep
  36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
  37  *   setting, as is necessary at low bandwidths.
  38  *
  39  * The configuration parameters are kept deliberately simple for ease of use.
  40  * Everything has sane defaults.  Complete generality of configuration is *not*
  41  * a goal.
  42  *
  43  * The priority queue operates according to a weighted DRR scheme, combined with
  44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
  45  * bandwidth sharing threshold the tin is operating.  This determines whether a
  46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
  47  * that tin in the current pass.
  48  *
  49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
  50  * granted us permission to leverage.
  51  */
  52
  53 #include <linux/module.h>
  54 #include <linux/types.h>
  55 #include <linux/kernel.h>
  56 #include <linux/jiffies.h>
  57 #include <linux/string.h>
  58 #include <linux/in.h>
  59 #include <linux/errno.h>
  60 #include <linux/init.h>
  61 #include <linux/skbuff.h>
  62 #include <linux/jhash.h>
  63 #include <linux/slab.h>
  64 #include <linux/vmalloc.h>
  65 #include <linux/reciprocal_div.h>
  66 #include <net/netlink.h>
  67 #include <linux/if_vlan.h>
  68 #include <net/gso.h>
  69 #include <net/pkt_sched.h>
  70 #include <net/pkt_cls.h>
  71 #include <net/tcp.h>
  72 #include <net/flow_dissector.h>
  73
  74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
  75 #include <net/netfilter/nf_conntrack_core.h>
  76 #endif
  77
  78 #define CAKE_SET_WAYS (8)
  79 #define CAKE_MAX_TINS (8)
  80 #define CAKE_QUEUES (1024)
  81 #define CAKE_FLOW_MASK 63
  82 #define CAKE_FLOW_NAT_FLAG 64
  83
  84 /* struct cobalt_params - contains codel and blue parameters
  85  * @interval:   codel initial drop rate
  86  * @target:     maximum persistent sojourn time & blue update rate
  87  * @mtu_time:   serialisation delay of maximum-size packet
  88  * @p_inc:      increment of blue drop probability (0.32 fxp)
  89  * @p_dec:      decrement of blue drop probability (0.32 fxp)
  90  */
  91 struct cobalt_params {
  92         u64     interval;
  93         u64     target;
  94         u64     mtu_time;
  95         u32     p_inc;
  96         u32     p_dec;
  97 };
  98
  99 /* struct cobalt_vars - contains codel and blue variables
 100  * @count:              codel dropping frequency
 101  * @rec_inv_sqrt:       reciprocal value of sqrt(count) >> 1
 102  * @drop_next:          time to drop next packet, or when we dropped last
 103  * @blue_timer:         Blue time to next drop
 104  * @p_drop:             BLUE drop probability (0.32 fxp)
 105  * @dropping:           set if in dropping state
 106  * @ecn_marked:         set if marked
 107  */
 108 struct cobalt_vars {
 109         u32     count;
 110         u32     rec_inv_sqrt;
 111         ktime_t drop_next;
 112         ktime_t blue_timer;
 113         u32     p_drop;
 114         bool    dropping;
 115         bool    ecn_marked;
 116 };
 117
 118 enum {
 119         CAKE_SET_NONE = 0,
 120         CAKE_SET_SPARSE,
 121         CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
 122         CAKE_SET_BULK,
 123         CAKE_SET_DECAYING
 124 };
 125
 126 struct cake_flow {
 127         /* this stuff is all needed per-flow at dequeue time */
 128         struct sk_buff    *head;
 129         struct sk_buff    *tail;
 130         struct list_head  flowchain;
 131         s32               deficit;
 132         u32               dropped;
 133         struct cobalt_vars cvars;
 134         u16               srchost; /* index into cake_host table */
 135         u16               dsthost;
 136         u8                set;
 137 }; /* please try to keep this structure <= 64 bytes */
 138
 139 struct cake_host {
 140         u32 srchost_tag;
 141         u32 dsthost_tag;
 142         u16 srchost_bulk_flow_count;
 143         u16 dsthost_bulk_flow_count;
 144 };
 145
 146 struct cake_heap_entry {
 147         u16 t:3, b:10;
 148 };
 149
 150 struct cake_tin_data {
 151         struct cake_flow flows[CAKE_QUEUES];
 152         u32     backlogs[CAKE_QUEUES];
 153         u32     tags[CAKE_QUEUES]; /* for set association */
 154         u16     overflow_idx[CAKE_QUEUES];
 155         struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
 156         u16     flow_quantum;
 157
 158         struct cobalt_params cparams;
 159         u32     drop_overlimit;
 160         u16     bulk_flow_count;
 161         u16     sparse_flow_count;
 162         u16     decaying_flow_count;
 163         u16     unresponsive_flow_count;
 164
 165         u32     max_skblen;
 166
 167         struct list_head new_flows;
 168         struct list_head old_flows;
 169         struct list_head decaying_flows;
 170
 171         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
 172         ktime_t time_next_packet;
 173         u64     tin_rate_ns;
 174         u64     tin_rate_bps;
 175         u16     tin_rate_shft;
 176
 177         u16     tin_quantum;
 178         s32     tin_deficit;
 179         u32     tin_backlog;
 180         u32     tin_dropped;
 181         u32     tin_ecn_mark;
 182
 183         u32     packets;
 184         u64     bytes;
 185
 186         u32     ack_drops;
 187
 188         /* moving averages */
 189         u64 avge_delay;
 190         u64 peak_delay;
 191         u64 base_delay;
 192
 193         /* hash function stats */
 194         u32     way_directs;
 195         u32     way_hits;
 196         u32     way_misses;
 197         u32     way_collisions;
 198 }; /* number of tins is small, so size of this struct doesn't matter much */
 199
 200 struct cake_sched_data {
 201         struct tcf_proto __rcu *filter_list; /* optional external classifier */
 202         struct tcf_block *block;
 203         struct cake_tin_data *tins;
 204
 205         struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
 206         u16             overflow_timeout;
 207
 208         u16             tin_cnt;
 209         u8              tin_mode;
 210         u8              flow_mode;
 211         u8              ack_filter;
 212         u8              atm_mode;
 213
 214         u32             fwmark_mask;
 215         u16             fwmark_shft;
 216
 217         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
 218         u16             rate_shft;
 219         ktime_t         time_next_packet;
 220         ktime_t         failsafe_next_packet;
 221         u64             rate_ns;
 222         u64             rate_bps;
 223         u16             rate_flags;
 224         s16             rate_overhead;
 225         u16             rate_mpu;
 226         u64             interval;
 227         u64             target;
 228
 229         /* resource tracking */
 230         u32             buffer_used;
 231         u32             buffer_max_used;
 232         u32             buffer_limit;
 233         u32             buffer_config_limit;
 234
 235         /* indices for dequeue */
 236         u16             cur_tin;
 237         u16             cur_flow;
 238
 239         struct qdisc_watchdog watchdog;
 240         const u8        *tin_index;
 241         const u8        *tin_order;
 242
 243         /* bandwidth capacity estimate */
 244         ktime_t         last_packet_time;
 245         ktime_t         avg_window_begin;
 246         u64             avg_packet_interval;
 247         u64             avg_window_bytes;
 248         u64             avg_peak_bandwidth;
 249         ktime_t         last_reconfig_time;
 250
 251         /* packet length stats */
 252         u32             avg_netoff;
 253         u16             max_netlen;
 254         u16             max_adjlen;
 255         u16             min_netlen;
 256         u16             min_adjlen;
 257 };
 258
 259 enum {
 260         CAKE_FLAG_OVERHEAD         = BIT(0),
 261         CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
 262         CAKE_FLAG_INGRESS          = BIT(2),
 263         CAKE_FLAG_WASH             = BIT(3),
 264         CAKE_FLAG_SPLIT_GSO        = BIT(4)
 265 };
 266
 267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
 268  * obtain the best features of each.  Codel is excellent on flows which
 269  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
 270  * unresponsive flows.
 271  */
 272
 273 struct cobalt_skb_cb {
 274         ktime_t enqueue_time;
 275         u32     adjusted_len;
 276 };
 277
 278 static u64 us_to_ns(u64 us)
 279 {
 280         return us * NSEC_PER_USEC;
 281 }
 282
 283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
 284 {
 285         qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
 286         return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
 287 }
 288
 289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
 290 {
 291         return get_cobalt_cb(skb)->enqueue_time;
 292 }
 293
 294 static void cobalt_set_enqueue_time(struct sk_buff *skb,
 295                                     ktime_t now)
 296 {
 297         get_cobalt_cb(skb)->enqueue_time = now;
 298 }
 299
 300 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
 301
 302 /* Diffserv lookup tables */
 303
 304 static const u8 precedence[] = {
 305         0, 0, 0, 0, 0, 0, 0, 0,
 306         1, 1, 1, 1, 1, 1, 1, 1,
 307         2, 2, 2, 2, 2, 2, 2, 2,
 308         3, 3, 3, 3, 3, 3, 3, 3,
 309         4, 4, 4, 4, 4, 4, 4, 4,
 310         5, 5, 5, 5, 5, 5, 5, 5,
 311         6, 6, 6, 6, 6, 6, 6, 6,
 312         7, 7, 7, 7, 7, 7, 7, 7,
 313 };
 314
 315 static const u8 diffserv8[] = {
 316         2, 0, 1, 2, 4, 2, 2, 2,
 317         1, 2, 1, 2, 1, 2, 1, 2,
 318         5, 2, 4, 2, 4, 2, 4, 2,
 319         3, 2, 3, 2, 3, 2, 3, 2,
 320         6, 2, 3, 2, 3, 2, 3, 2,
 321         6, 2, 2, 2, 6, 2, 6, 2,
 322         7, 2, 2, 2, 2, 2, 2, 2,
 323         7, 2, 2, 2, 2, 2, 2, 2,
 324 };
 325
 326 static const u8 diffserv4[] = {
 327         0, 1, 0, 0, 2, 0, 0, 0,
 328         1, 0, 0, 0, 0, 0, 0, 0,
 329         2, 0, 2, 0, 2, 0, 2, 0,
 330         2, 0, 2, 0, 2, 0, 2, 0,
 331         3, 0, 2, 0, 2, 0, 2, 0,
 332         3, 0, 0, 0, 3, 0, 3, 0,
 333         3, 0, 0, 0, 0, 0, 0, 0,
 334         3, 0, 0, 0, 0, 0, 0, 0,
 335 };
 336
 337 static const u8 diffserv3[] = {
 338         0, 1, 0, 0, 2, 0, 0, 0,
 339         1, 0, 0, 0, 0, 0, 0, 0,
 340         0, 0, 0, 0, 0, 0, 0, 0,
 341         0, 0, 0, 0, 0, 0, 0, 0,
 342         0, 0, 0, 0, 0, 0, 0, 0,
 343         0, 0, 0, 0, 2, 0, 2, 0,
 344         2, 0, 0, 0, 0, 0, 0, 0,
 345         2, 0, 0, 0, 0, 0, 0, 0,
 346 };
 347
 348 static const u8 besteffort[] = {
 349         0, 0, 0, 0, 0, 0, 0, 0,
 350         0, 0, 0, 0, 0, 0, 0, 0,
 351         0, 0, 0, 0, 0, 0, 0, 0,
 352         0, 0, 0, 0, 0, 0, 0, 0,
 353         0, 0, 0, 0, 0, 0, 0, 0,
 354         0, 0, 0, 0, 0, 0, 0, 0,
 355         0, 0, 0, 0, 0, 0, 0, 0,
 356         0, 0, 0, 0, 0, 0, 0, 0,
 357 };
 358
 359 /* tin priority order for stats dumping */
 360
 361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
 362 static const u8 bulk_order[] = {1, 0, 2, 3};
 363
 364 /* There is a big difference in timing between the accurate values placed in the
 365  * cache and the approximations given by a single Newton step for small count
 366  * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
 367  * these values are calculated using eight Newton steps, using the
 368  * implementation below. Above 16, a single Newton step gives sufficient
 369  * accuracy in either direction, given the precision stored.
 370  *
 371  * The magnitude of the error when stepping up to count 2 is such as to give the
 372  * value that *should* have been produced at count 4.
 373  */
 374
 375 #define REC_INV_SQRT_CACHE (16)
 376 static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
 377                 ~0,         ~0, 3037000500, 2479700525,
 378         2147483647, 1920767767, 1753413056, 1623345051,
 379         1518500250, 1431655765, 1358187914, 1294981364,
 380         1239850263, 1191209601, 1147878294, 1108955788
 381 };
 382
 383 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
 384  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 385  *
 386  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 387  */
 388
 389 static void cobalt_newton_step(struct cobalt_vars *vars)
 390 {
 391         u32 invsqrt, invsqrt2;
 392         u64 val;
 393
 394         invsqrt = vars->rec_inv_sqrt;
 395         invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
 396         val = (3LL << 32) - ((u64)vars->count * invsqrt2);
 397
 398         val >>= 2; /* avoid overflow in following multiply */
 399         val = (val * invsqrt) >> (32 - 2 + 1);
 400
 401         vars->rec_inv_sqrt = val;
 402 }
 403
 404 static void cobalt_invsqrt(struct cobalt_vars *vars)
 405 {
 406         if (vars->count < REC_INV_SQRT_CACHE)
 407                 vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
 408         else
 409                 cobalt_newton_step(vars);
 410 }
 411
 412 static void cobalt_vars_init(struct cobalt_vars *vars)
 413 {
 414         memset(vars, 0, sizeof(*vars));
 415 }
 416
 417 /* CoDel control_law is t + interval/sqrt(count)
 418  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
 419  * both sqrt() and divide operation.
 420  */
 421 static ktime_t cobalt_control(ktime_t t,
 422                               u64 interval,
 423                               u32 rec_inv_sqrt)
 424 {
 425         return ktime_add_ns(t, reciprocal_scale(interval,
 426                                                 rec_inv_sqrt));
 427 }
 428
 429 /* Call this when a packet had to be dropped due to queue overflow.  Returns
 430  * true if the BLUE state was quiescent before but active after this call.
 431  */
 432 static bool cobalt_queue_full(struct cobalt_vars *vars,
 433                               struct cobalt_params *p,
 434                               ktime_t now)
 435 {
 436         bool up = false;
 437
 438         if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
 439                 up = !vars->p_drop;
 440                 vars->p_drop += p->p_inc;
 441                 if (vars->p_drop < p->p_inc)
 442                         vars->p_drop = ~0;
 443                 vars->blue_timer = now;
 444         }
 445         vars->dropping = true;
 446         vars->drop_next = now;
 447         if (!vars->count)
 448                 vars->count = 1;
 449
 450         return up;
 451 }
 452
 453 /* Call this when the queue was serviced but turned out to be empty.  Returns
 454  * true if the BLUE state was active before but quiescent after this call.
 455  */
 456 static bool cobalt_queue_empty(struct cobalt_vars *vars,
 457                                struct cobalt_params *p,
 458                                ktime_t now)
 459 {
 460         bool down = false;
 461
 462         if (vars->p_drop &&
 463             ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
 464                 if (vars->p_drop < p->p_dec)
 465                         vars->p_drop = 0;
 466                 else
 467                         vars->p_drop -= p->p_dec;
 468                 vars->blue_timer = now;
 469                 down = !vars->p_drop;
 470         }
 471         vars->dropping = false;
 472
 473         if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
 474                 vars->count--;
 475                 cobalt_invsqrt(vars);
 476                 vars->drop_next = cobalt_control(vars->drop_next,
 477                                                  p->interval,
 478                                                  vars->rec_inv_sqrt);
 479         }
 480
 481         return down;
 482 }
 483
 484 /* Call this with a freshly dequeued packet for possible congestion marking.
 485  * Returns true as an instruction to drop the packet, false for delivery.
 486  */
 487 static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
 488                                                struct cobalt_params *p,
 489                                                ktime_t now,
 490                                                struct sk_buff *skb,
 491                                                u32 bulk_flows)
 492 {
 493         enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
 494         bool next_due, over_target;
 495         ktime_t schedule;
 496         u64 sojourn;
 497
 498 /* The 'schedule' variable records, in its sign, whether 'now' is before or
 499  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
 500  * scheduling decision is actually branched, without destroying that
 501  * information.  Similarly, the first 'schedule' value calculated is preserved
 502  * in the boolean 'next_due'.
 503  *
 504  * As for 'drop_next', we take advantage of the fact that 'interval' is both
 505  * the delay between first exceeding 'target' and the first signalling event,
 506  * *and* the scaling factor for the signalling frequency.  It's therefore very
 507  * natural to use a single mechanism for both purposes, and eliminates a
 508  * significant amount of reference Codel's spaghetti code.  To help with this,
 509  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
 510  * as possible to 1.0 in fixed-point.
 511  */
 512
 513         sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
 514         schedule = ktime_sub(now, vars->drop_next);
 515         over_target = sojourn > p->target &&
 516                       sojourn > p->mtu_time * bulk_flows * 2 &&
 517                       sojourn > p->mtu_time * 4;
 518         next_due = vars->count && ktime_to_ns(schedule) >= 0;
 519
 520         vars->ecn_marked = false;
 521
 522         if (over_target) {
 523                 if (!vars->dropping) {
 524                         vars->dropping = true;
 525                         vars->drop_next = cobalt_control(now,
 526                                                          p->interval,
 527                                                          vars->rec_inv_sqrt);
 528                 }
 529                 if (!vars->count)
 530                         vars->count = 1;
 531         } else if (vars->dropping) {
 532                 vars->dropping = false;
 533         }
 534
 535         if (next_due && vars->dropping) {
 536                 /* Use ECN mark if possible, otherwise drop */
 537                 if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
 538                         reason = SKB_DROP_REASON_QDISC_CONGESTED;
 539
 540                 vars->count++;
 541                 if (!vars->count)
 542                         vars->count--;
 543                 cobalt_invsqrt(vars);
 544                 vars->drop_next = cobalt_control(vars->drop_next,
 545                                                  p->interval,
 546                                                  vars->rec_inv_sqrt);
 547                 schedule = ktime_sub(now, vars->drop_next);
 548         } else {
 549                 while (next_due) {
 550                         vars->count--;
 551                         cobalt_invsqrt(vars);
 552                         vars->drop_next = cobalt_control(vars->drop_next,
 553                                                          p->interval,
 554                                                          vars->rec_inv_sqrt);
 555                         schedule = ktime_sub(now, vars->drop_next);
 556                         next_due = vars->count && ktime_to_ns(schedule) >= 0;
 557                 }
 558         }
 559
 560         /* Simple BLUE implementation.  Lack of ECN is deliberate. */
 561         if (vars->p_drop && reason == SKB_NOT_DROPPED_YET &&
 562             get_random_u32() < vars->p_drop)
 563                 reason = SKB_DROP_REASON_CAKE_FLOOD;
 564
 565         /* Overload the drop_next field as an activity timeout */
 566         if (!vars->count)
 567                 vars->drop_next = ktime_add_ns(now, p->interval);
 568         else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET)
 569                 vars->drop_next = now;
 570
 571         return reason;
 572 }
 573
 574 static bool cake_update_flowkeys(struct flow_keys *keys,
 575                                  const struct sk_buff *skb)
 576 {
 577 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 578         struct nf_conntrack_tuple tuple = {};
 579         bool rev = !skb->_nfct, upd = false;
 580         __be32 ip;
 581
 582         if (skb_protocol(skb, true) != htons(ETH_P_IP))
 583                 return false;
 584
 585         if (!nf_ct_get_tuple_skb(&tuple, skb))
 586                 return false;
 587
 588         ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
 589         if (ip != keys->addrs.v4addrs.src) {
 590                 keys->addrs.v4addrs.src = ip;
 591                 upd = true;
 592         }
 593         ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
 594         if (ip != keys->addrs.v4addrs.dst) {
 595                 keys->addrs.v4addrs.dst = ip;
 596                 upd = true;
 597         }
 598
 599         if (keys->ports.ports) {
 600                 __be16 port;
 601
 602                 port = rev ? tuple.dst.u.all : tuple.src.u.all;
 603                 if (port != keys->ports.src) {
 604                         keys->ports.src = port;
 605                         upd = true;
 606                 }
 607                 port = rev ? tuple.src.u.all : tuple.dst.u.all;
 608                 if (port != keys->ports.dst) {
 609                         port = keys->ports.dst;
 610                         upd = true;
 611                 }
 612         }
 613         return upd;
 614 #else
 615         return false;
 616 #endif
 617 }
 618
 619 /* Cake has several subtle multiple bit settings. In these cases you
 620  *  would be matching triple isolate mode as well.
 621  */
 622
 623 static bool cake_dsrc(int flow_mode)
 624 {
 625         return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
 626 }
 627
 628 static bool cake_ddst(int flow_mode)
 629 {
 630         return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
 631 }
 632
 633 static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
 634                                              struct cake_flow *flow,
 635                                              int flow_mode)
 636 {
 637         if (likely(cake_dsrc(flow_mode) &&
 638                    q->hosts[flow->srchost].srchost_bulk_flow_count))
 639                 q->hosts[flow->srchost].srchost_bulk_flow_count--;
 640 }
 641
 642 static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
 643                                              struct cake_flow *flow,
 644                                              int flow_mode)
 645 {
 646         if (likely(cake_dsrc(flow_mode) &&
 647                    q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
 648                 q->hosts[flow->srchost].srchost_bulk_flow_count++;
 649 }
 650
 651 static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
 652                                              struct cake_flow *flow,
 653                                              int flow_mode)
 654 {
 655         if (likely(cake_ddst(flow_mode) &&
 656                    q->hosts[flow->dsthost].dsthost_bulk_flow_count))
 657                 q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
 658 }
 659
 660 static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
 661                                              struct cake_flow *flow,
 662                                              int flow_mode)
 663 {
 664         if (likely(cake_ddst(flow_mode) &&
 665                    q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
 666                 q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
 667 }
 668
 669 static u16 cake_get_flow_quantum(struct cake_tin_data *q,
 670                                  struct cake_flow *flow,
 671                                  int flow_mode)
 672 {
 673         u16 host_load = 1;
 674
 675         if (cake_dsrc(flow_mode))
 676                 host_load = max(host_load,
 677                                 q->hosts[flow->srchost].srchost_bulk_flow_count);
 678
 679         if (cake_ddst(flow_mode))
 680                 host_load = max(host_load,
 681                                 q->hosts[flow->dsthost].dsthost_bulk_flow_count);
 682
 683         /* The get_random_u16() is a way to apply dithering to avoid
 684          * accumulating roundoff errors
 685          */
 686         return (q->flow_quantum * quantum_div[host_load] +
 687                 get_random_u16()) >> 16;
 688 }
 689
 690 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
 691                      int flow_mode, u16 flow_override, u16 host_override)
 692 {
 693         bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
 694         bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
 695         bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
 696         u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
 697         u16 reduced_hash, srchost_idx, dsthost_idx;
 698         struct flow_keys keys, host_keys;
 699         bool use_skbhash = skb->l4_hash;
 700
 701         if (unlikely(flow_mode == CAKE_FLOW_NONE))
 702                 return 0;
 703
 704         /* If both overrides are set, or we can use the SKB hash and nat mode is
 705          * disabled, we can skip packet dissection entirely. If nat mode is
 706          * enabled there's another check below after doing the conntrack lookup.
 707          */
 708         if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
 709                 goto skip_hash;
 710
 711         skb_flow_dissect_flow_keys(skb, &keys,
 712                                    FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
 713
 714         /* Don't use the SKB hash if we change the lookup keys from conntrack */
 715         if (nat_enabled && cake_update_flowkeys(&keys, skb))
 716                 use_skbhash = false;
 717
 718         /* If we can still use the SKB hash and don't need the host hash, we can
 719          * skip the rest of the hashing procedure
 720          */
 721         if (use_skbhash && !hash_hosts)
 722                 goto skip_hash;
 723
 724         /* flow_hash_from_keys() sorts the addresses by value, so we have
 725          * to preserve their order in a separate data structure to treat
 726          * src and dst host addresses as independently selectable.
 727          */
 728         host_keys = keys;
 729         host_keys.ports.ports     = 0;
 730         host_keys.basic.ip_proto  = 0;
 731         host_keys.keyid.keyid     = 0;
 732         host_keys.tags.flow_label = 0;
 733
 734         switch (host_keys.control.addr_type) {
 735         case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
 736                 host_keys.addrs.v4addrs.src = 0;
 737                 dsthost_hash = flow_hash_from_keys(&host_keys);
 738                 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
 739                 host_keys.addrs.v4addrs.dst = 0;
 740                 srchost_hash = flow_hash_from_keys(&host_keys);
 741                 break;
 742
 743         case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
 744                 memset(&host_keys.addrs.v6addrs.src, 0,
 745                        sizeof(host_keys.addrs.v6addrs.src));
 746                 dsthost_hash = flow_hash_from_keys(&host_keys);
 747                 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
 748                 memset(&host_keys.addrs.v6addrs.dst, 0,
 749                        sizeof(host_keys.addrs.v6addrs.dst));
 750                 srchost_hash = flow_hash_from_keys(&host_keys);
 751                 break;
 752
 753         default:
 754                 dsthost_hash = 0;
 755                 srchost_hash = 0;
 756         }
 757
 758         /* This *must* be after the above switch, since as a
 759          * side-effect it sorts the src and dst addresses.
 760          */
 761         if (hash_flows && !use_skbhash)
 762                 flow_hash = flow_hash_from_keys(&keys);
 763
 764 skip_hash:
 765         if (flow_override)
 766                 flow_hash = flow_override - 1;
 767         else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
 768                 flow_hash = skb->hash;
 769         if (host_override) {
 770                 dsthost_hash = host_override - 1;
 771                 srchost_hash = host_override - 1;
 772         }
 773
 774         if (!(flow_mode & CAKE_FLOW_FLOWS)) {
 775                 if (flow_mode & CAKE_FLOW_SRC_IP)
 776                         flow_hash ^= srchost_hash;
 777
 778                 if (flow_mode & CAKE_FLOW_DST_IP)
 779                         flow_hash ^= dsthost_hash;
 780         }
 781
 782         reduced_hash = flow_hash % CAKE_QUEUES;
 783
 784         /* set-associative hashing */
 785         /* fast path if no hash collision (direct lookup succeeds) */
 786         if (likely(q->tags[reduced_hash] == flow_hash &&
 787                    q->flows[reduced_hash].set)) {
 788                 q->way_directs++;
 789         } else {
 790                 u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
 791                 u32 outer_hash = reduced_hash - inner_hash;
 792                 bool allocate_src = false;
 793                 bool allocate_dst = false;
 794                 u32 i, k;
 795
 796                 /* check if any active queue in the set is reserved for
 797                  * this flow.
 798                  */
 799                 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 800                      i++, k = (k + 1) % CAKE_SET_WAYS) {
 801                         if (q->tags[outer_hash + k] == flow_hash) {
 802                                 if (i)
 803                                         q->way_hits++;
 804
 805                                 if (!q->flows[outer_hash + k].set) {
 806                                         /* need to increment host refcnts */
 807                                         allocate_src = cake_dsrc(flow_mode);
 808                                         allocate_dst = cake_ddst(flow_mode);
 809                                 }
 810
 811                                 goto found;
 812                         }
 813                 }
 814
 815                 /* no queue is reserved for this flow, look for an
 816                  * empty one.
 817                  */
 818                 for (i = 0; i < CAKE_SET_WAYS;
 819                          i++, k = (k + 1) % CAKE_SET_WAYS) {
 820                         if (!q->flows[outer_hash + k].set) {
 821                                 q->way_misses++;
 822                                 allocate_src = cake_dsrc(flow_mode);
 823                                 allocate_dst = cake_ddst(flow_mode);
 824                                 goto found;
 825                         }
 826                 }
 827
 828                 /* With no empty queues, default to the original
 829                  * queue, accept the collision, update the host tags.
 830                  */
 831                 q->way_collisions++;
 832                 allocate_src = cake_dsrc(flow_mode);
 833                 allocate_dst = cake_ddst(flow_mode);
 834
 835                 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
 836                         cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
 837                         cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
 838                 }
 839 found:
 840                 /* reserve queue for future packets in same flow */
 841                 reduced_hash = outer_hash + k;
 842                 q->tags[reduced_hash] = flow_hash;
 843
 844                 if (allocate_src) {
 845                         srchost_idx = srchost_hash % CAKE_QUEUES;
 846                         inner_hash = srchost_idx % CAKE_SET_WAYS;
 847                         outer_hash = srchost_idx - inner_hash;
 848                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 849                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
 850                                 if (q->hosts[outer_hash + k].srchost_tag ==
 851                                     srchost_hash)
 852                                         goto found_src;
 853                         }
 854                         for (i = 0; i < CAKE_SET_WAYS;
 855                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
 856                                 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
 857                                         break;
 858                         }
 859                         q->hosts[outer_hash + k].srchost_tag = srchost_hash;
 860 found_src:
 861                         srchost_idx = outer_hash + k;
 862                         q->flows[reduced_hash].srchost = srchost_idx;
 863
 864                         if (q->flows[reduced_hash].set == CAKE_SET_BULK)
 865                                 cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
 866                 }
 867
 868                 if (allocate_dst) {
 869                         dsthost_idx = dsthost_hash % CAKE_QUEUES;
 870                         inner_hash = dsthost_idx % CAKE_SET_WAYS;
 871                         outer_hash = dsthost_idx - inner_hash;
 872                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 873                              i++, k = (k + 1) % CAKE_SET_WAYS) {
 874                                 if (q->hosts[outer_hash + k].dsthost_tag ==
 875                                     dsthost_hash)
 876                                         goto found_dst;
 877                         }
 878                         for (i = 0; i < CAKE_SET_WAYS;
 879                              i++, k = (k + 1) % CAKE_SET_WAYS) {
 880                                 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
 881                                         break;
 882                         }
 883                         q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
 884 found_dst:
 885                         dsthost_idx = outer_hash + k;
 886                         q->flows[reduced_hash].dsthost = dsthost_idx;
 887
 888                         if (q->flows[reduced_hash].set == CAKE_SET_BULK)
 889                                 cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
 890                 }
 891         }
 892
 893         return reduced_hash;
 894 }
 895
 896 /* helper functions : might be changed when/if skb use a standard list_head */
 897 /* remove one skb from head of slot queue */
 898
 899 static struct sk_buff *dequeue_head(struct cake_flow *flow)
 900 {
 901         struct sk_buff *skb = flow->head;
 902
 903         if (skb) {
 904                 flow->head = skb->next;
 905                 skb_mark_not_on_list(skb);
 906         }
 907
 908         return skb;
 909 }
 910
 911 /* add skb to flow queue (tail add) */
 912
 913 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
 914 {
 915         if (!flow->head)
 916                 flow->head = skb;
 917         else
 918                 flow->tail->next = skb;
 919         flow->tail = skb;
 920         skb->next = NULL;
 921 }
 922
 923 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
 924                                     struct ipv6hdr *buf)
 925 {
 926         unsigned int offset = skb_network_offset(skb);
 927         struct iphdr *iph;
 928
 929         iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
 930
 931         if (!iph)
 932                 return NULL;
 933
 934         if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
 935                 return skb_header_pointer(skb, offset + iph->ihl * 4,
 936                                           sizeof(struct ipv6hdr), buf);
 937
 938         else if (iph->version == 4)
 939                 return iph;
 940
 941         else if (iph->version == 6)
 942                 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
 943                                           buf);
 944
 945         return NULL;
 946 }
 947
 948 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
 949                                       void *buf, unsigned int bufsize)
 950 {
 951         unsigned int offset = skb_network_offset(skb);
 952         const struct ipv6hdr *ipv6h;
 953         const struct tcphdr *tcph;
 954         const struct iphdr *iph;
 955         struct ipv6hdr _ipv6h;
 956         struct tcphdr _tcph;
 957
 958         ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
 959
 960         if (!ipv6h)
 961                 return NULL;
 962
 963         if (ipv6h->version == 4) {
 964                 iph = (struct iphdr *)ipv6h;
 965                 offset += iph->ihl * 4;
 966
 967                 /* special-case 6in4 tunnelling, as that is a common way to get
 968                  * v6 connectivity in the home
 969                  */
 970                 if (iph->protocol == IPPROTO_IPV6) {
 971                         ipv6h = skb_header_pointer(skb, offset,
 972                                                    sizeof(_ipv6h), &_ipv6h);
 973
 974                         if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
 975                                 return NULL;
 976
 977                         offset += sizeof(struct ipv6hdr);
 978
 979                 } else if (iph->protocol != IPPROTO_TCP) {
 980                         return NULL;
 981                 }
 982
 983         } else if (ipv6h->version == 6) {
 984                 if (ipv6h->nexthdr != IPPROTO_TCP)
 985                         return NULL;
 986
 987                 offset += sizeof(struct ipv6hdr);
 988         } else {
 989                 return NULL;
 990         }
 991
 992         tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
 993         if (!tcph || tcph->doff < 5)
 994                 return NULL;
 995
 996         return skb_header_pointer(skb, offset,
 997                                   min(__tcp_hdrlen(tcph), bufsize), buf);
 998 }
 999
1000 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
1001                                    int code, int *oplen)
1002 {
1003         /* inspired by tcp_parse_options in tcp_input.c */
1004         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1005         const u8 *ptr = (const u8 *)(tcph + 1);
1006
1007         while (length > 0) {
1008                 int opcode = *ptr++;
1009                 int opsize;
1010
1011                 if (opcode == TCPOPT_EOL)
1012                         break;
1013                 if (opcode == TCPOPT_NOP) {
1014                         length--;
1015                         continue;
1016                 }
1017                 if (length < 2)
1018                         break;
1019                 opsize = *ptr++;
1020                 if (opsize < 2 || opsize > length)
1021                         break;
1022
1023                 if (opcode == code) {
1024                         *oplen = opsize;
1025                         return ptr;
1026                 }
1027
1028                 ptr += opsize - 2;
1029                 length -= opsize;
1030         }
1031
1032         return NULL;
1033 }
1034
1035 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
1036  * bytes than the other. In the case where both sequences ACKs bytes that the
1037  * other doesn't, A is considered greater. DSACKs in A also makes A be
1038  * considered greater.
1039  *
1040  * @return -1, 0 or 1 as normal compare functions
1041  */
1042 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
1043                                   const struct tcphdr *tcph_b)
1044 {
1045         const struct tcp_sack_block_wire *sack_a, *sack_b;
1046         u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1047         u32 bytes_a = 0, bytes_b = 0;
1048         int oplen_a, oplen_b;
1049         bool first = true;
1050
1051         sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1052         sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1053
1054         /* pointers point to option contents */
1055         oplen_a -= TCPOLEN_SACK_BASE;
1056         oplen_b -= TCPOLEN_SACK_BASE;
1057
1058         if (sack_a && oplen_a >= sizeof(*sack_a) &&
1059             (!sack_b || oplen_b < sizeof(*sack_b)))
1060                 return -1;
1061         else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1062                  (!sack_a || oplen_a < sizeof(*sack_a)))
1063                 return 1;
1064         else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1065                  (!sack_b || oplen_b < sizeof(*sack_b)))
1066                 return 0;
1067
1068         while (oplen_a >= sizeof(*sack_a)) {
1069                 const struct tcp_sack_block_wire *sack_tmp = sack_b;
1070                 u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1071                 u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1072                 int oplen_tmp = oplen_b;
1073                 bool found = false;
1074
1075                 /* DSACK; always considered greater to prevent dropping */
1076                 if (before(start_a, ack_seq_a))
1077                         return -1;
1078
1079                 bytes_a += end_a - start_a;
1080
1081                 while (oplen_tmp >= sizeof(*sack_tmp)) {
1082                         u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1083                         u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1084
1085                         /* first time through we count the total size */
1086                         if (first)
1087                                 bytes_b += end_b - start_b;
1088
1089                         if (!after(start_b, start_a) && !before(end_b, end_a)) {
1090                                 found = true;
1091                                 if (!first)
1092                                         break;
1093                         }
1094                         oplen_tmp -= sizeof(*sack_tmp);
1095                         sack_tmp++;
1096                 }
1097
1098                 if (!found)
1099                         return -1;
1100
1101                 oplen_a -= sizeof(*sack_a);
1102                 sack_a++;
1103                 first = false;
1104         }
1105
1106         /* If we made it this far, all ranges SACKed by A are covered by B, so
1107          * either the SACKs are equal, or B SACKs more bytes.
1108          */
1109         return bytes_b > bytes_a ? 1 : 0;
1110 }
1111
1112 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1113                                  u32 *tsval, u32 *tsecr)
1114 {
1115         const u8 *ptr;
1116         int opsize;
1117
1118         ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1119
1120         if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1121                 *tsval = get_unaligned_be32(ptr);
1122                 *tsecr = get_unaligned_be32(ptr + 4);
1123         }
1124 }
1125
1126 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1127                                u32 tstamp_new, u32 tsecr_new)
1128 {
1129         /* inspired by tcp_parse_options in tcp_input.c */
1130         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1131         const u8 *ptr = (const u8 *)(tcph + 1);
1132         u32 tstamp, tsecr;
1133
1134         /* 3 reserved flags must be unset to avoid future breakage
1135          * ACK must be set
1136          * ECE/CWR are handled separately
1137          * All other flags URG/PSH/RST/SYN/FIN must be unset
1138          * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1139          * 0x00C00000 = CWR/ECE (handled separately)
1140          * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1141          */
1142         if (((tcp_flag_word(tcph) &
1143               cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1144                 return false;
1145
1146         while (length > 0) {
1147                 int opcode = *ptr++;
1148                 int opsize;
1149
1150                 if (opcode == TCPOPT_EOL)
1151                         break;
1152                 if (opcode == TCPOPT_NOP) {
1153                         length--;
1154                         continue;
1155                 }
1156                 if (length < 2)
1157                         break;
1158                 opsize = *ptr++;
1159                 if (opsize < 2 || opsize > length)
1160                         break;
1161
1162                 switch (opcode) {
1163                 case TCPOPT_MD5SIG: /* doesn't influence state */
1164                         break;
1165
1166                 case TCPOPT_SACK: /* stricter checking performed later */
1167                         if (opsize % 8 != 2)
1168                                 return false;
1169                         break;
1170
1171                 case TCPOPT_TIMESTAMP:
1172                         /* only drop timestamps lower than new */
1173                         if (opsize != TCPOLEN_TIMESTAMP)
1174                                 return false;
1175                         tstamp = get_unaligned_be32(ptr);
1176                         tsecr = get_unaligned_be32(ptr + 4);
1177                         if (after(tstamp, tstamp_new) ||
1178                             after(tsecr, tsecr_new))
1179                                 return false;
1180                         break;
1181
1182                 case TCPOPT_MSS:  /* these should only be set on SYN */
1183                 case TCPOPT_WINDOW:
1184                 case TCPOPT_SACK_PERM:
1185                 case TCPOPT_FASTOPEN:
1186                 case TCPOPT_EXP:
1187                 default: /* don't drop if any unknown options are present */
1188                         return false;
1189                 }
1190
1191                 ptr += opsize - 2;
1192                 length -= opsize;
1193         }
1194
1195         return true;
1196 }
1197
1198 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1199                                        struct cake_flow *flow)
1200 {
1201         bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1202         struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1203         struct sk_buff *skb_check, *skb_prev = NULL;
1204         const struct ipv6hdr *ipv6h, *ipv6h_check;
1205         unsigned char _tcph[64], _tcph_check[64];
1206         const struct tcphdr *tcph, *tcph_check;
1207         const struct iphdr *iph, *iph_check;
1208         struct ipv6hdr _iph, _iph_check;
1209         const struct sk_buff *skb;
1210         int seglen, num_found = 0;
1211         u32 tstamp = 0, tsecr = 0;
1212         __be32 elig_flags = 0;
1213         int sack_comp;
1214
1215         /* no other possible ACKs to filter */
1216         if (flow->head == flow->tail)
1217                 return NULL;
1218
1219         skb = flow->tail;
1220         tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1221         iph = cake_get_iphdr(skb, &_iph);
1222         if (!tcph)
1223                 return NULL;
1224
1225         cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1226
1227         /* the 'triggering' packet need only have the ACK flag set.
1228          * also check that SYN is not set, as there won't be any previous ACKs.
1229          */
1230         if ((tcp_flag_word(tcph) &
1231              (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1232                 return NULL;
1233
1234         /* the 'triggering' ACK is at the tail of the queue, we have already
1235          * returned if it is the only packet in the flow. loop through the rest
1236          * of the queue looking for pure ACKs with the same 5-tuple as the
1237          * triggering one.
1238          */
1239         for (skb_check = flow->head;
1240              skb_check && skb_check != skb;
1241              skb_prev = skb_check, skb_check = skb_check->next) {
1242                 iph_check = cake_get_iphdr(skb_check, &_iph_check);
1243                 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1244                                              sizeof(_tcph_check));
1245
1246                 /* only TCP packets with matching 5-tuple are eligible, and only
1247                  * drop safe headers
1248                  */
1249                 if (!tcph_check || iph->version != iph_check->version ||
1250                     tcph_check->source != tcph->source ||
1251                     tcph_check->dest != tcph->dest)
1252                         continue;
1253
1254                 if (iph_check->version == 4) {
1255                         if (iph_check->saddr != iph->saddr ||
1256                             iph_check->daddr != iph->daddr)
1257                                 continue;
1258
1259                         seglen = iph_totlen(skb, iph_check) -
1260                                        (4 * iph_check->ihl);
1261                 } else if (iph_check->version == 6) {
1262                         ipv6h = (struct ipv6hdr *)iph;
1263                         ipv6h_check = (struct ipv6hdr *)iph_check;
1264
1265                         if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1266                             ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1267                                 continue;
1268
1269                         seglen = ntohs(ipv6h_check->payload_len);
1270                 } else {
1271                         WARN_ON(1);  /* shouldn't happen */
1272                         continue;
1273                 }
1274
1275                 /* If the ECE/CWR flags changed from the previous eligible
1276                  * packet in the same flow, we should no longer be dropping that
1277                  * previous packet as this would lose information.
1278                  */
1279                 if (elig_ack && (tcp_flag_word(tcph_check) &
1280                                  (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1281                         elig_ack = NULL;
1282                         elig_ack_prev = NULL;
1283                         num_found--;
1284                 }
1285
1286                 /* Check TCP options and flags, don't drop ACKs with segment
1287                  * data, and don't drop ACKs with a higher cumulative ACK
1288                  * counter than the triggering packet. Check ACK seqno here to
1289                  * avoid parsing SACK options of packets we are going to exclude
1290                  * anyway.
1291                  */
1292                 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1293                     (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1294                     after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1295                         continue;
1296
1297                 /* Check SACK options. The triggering packet must SACK more data
1298                  * than the ACK under consideration, or SACK the same range but
1299                  * have a larger cumulative ACK counter. The latter is a
1300                  * pathological case, but is contained in the following check
1301                  * anyway, just to be safe.
1302                  */
1303                 sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1304
1305                 if (sack_comp < 0 ||
1306                     (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1307                      sack_comp == 0))
1308                         continue;
1309
1310                 /* At this point we have found an eligible pure ACK to drop; if
1311                  * we are in aggressive mode, we are done. Otherwise, keep
1312                  * searching unless this is the second eligible ACK we
1313                  * found.
1314                  *
1315                  * Since we want to drop ACK closest to the head of the queue,
1316                  * save the first eligible ACK we find, even if we need to loop
1317                  * again.
1318                  */
1319                 if (!elig_ack) {
1320                         elig_ack = skb_check;
1321                         elig_ack_prev = skb_prev;
1322                         elig_flags = (tcp_flag_word(tcph_check)
1323                                       & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1324                 }
1325
1326                 if (num_found++ > 0)
1327                         goto found;
1328         }
1329
1330         /* We made it through the queue without finding two eligible ACKs . If
1331          * we found a single eligible ACK we can drop it in aggressive mode if
1332          * we can guarantee that this does not interfere with ECN flag
1333          * information. We ensure this by dropping it only if the enqueued
1334          * packet is consecutive with the eligible ACK, and their flags match.
1335          */
1336         if (elig_ack && aggressive && elig_ack->next == skb &&
1337             (elig_flags == (tcp_flag_word(tcph) &
1338                             (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1339                 goto found;
1340
1341         return NULL;
1342
1343 found:
1344         if (elig_ack_prev)
1345                 elig_ack_prev->next = elig_ack->next;
1346         else
1347                 flow->head = elig_ack->next;
1348
1349         skb_mark_not_on_list(elig_ack);
1350
1351         return elig_ack;
1352 }
1353
1354 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1355 {
1356         avg -= avg >> shift;
1357         avg += sample >> shift;
1358         return avg;
1359 }
1360
1361 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1362 {
1363         if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1364                 len -= off;
1365
1366         if (q->max_netlen < len)
1367                 q->max_netlen = len;
1368         if (q->min_netlen > len)
1369                 q->min_netlen = len;
1370
1371         len += q->rate_overhead;
1372
1373         if (len < q->rate_mpu)
1374                 len = q->rate_mpu;
1375
1376         if (q->atm_mode == CAKE_ATM_ATM) {
1377                 len += 47;
1378                 len /= 48;
1379                 len *= 53;
1380         } else if (q->atm_mode == CAKE_ATM_PTM) {
1381                 /* Add one byte per 64 bytes or part thereof.
1382                  * This is conservative and easier to calculate than the
1383                  * precise value.
1384                  */
1385                 len += (len + 63) / 64;
1386         }
1387
1388         if (q->max_adjlen < len)
1389                 q->max_adjlen = len;
1390         if (q->min_adjlen > len)
1391                 q->min_adjlen = len;
1392
1393         return len;
1394 }
1395
1396 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1397 {
1398         const struct skb_shared_info *shinfo = skb_shinfo(skb);
1399         unsigned int hdr_len, last_len = 0;
1400         u32 off = skb_network_offset(skb);
1401         u32 len = qdisc_pkt_len(skb);
1402         u16 segs = 1;
1403
1404         q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1405
1406         if (!shinfo->gso_size)
1407                 return cake_calc_overhead(q, len, off);
1408
1409         /* borrowed from qdisc_pkt_len_init() */
1410         hdr_len = skb_transport_offset(skb);
1411
1412         /* + transport layer */
1413         if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1414                                                 SKB_GSO_TCPV6))) {
1415                 const struct tcphdr *th;
1416                 struct tcphdr _tcphdr;
1417
1418                 th = skb_header_pointer(skb, hdr_len,
1419                                         sizeof(_tcphdr), &_tcphdr);
1420                 if (likely(th))
1421                         hdr_len += __tcp_hdrlen(th);
1422         } else {
1423                 struct udphdr _udphdr;
1424
1425                 if (skb_header_pointer(skb, hdr_len,
1426                                        sizeof(_udphdr), &_udphdr))
1427                         hdr_len += sizeof(struct udphdr);
1428         }
1429
1430         if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1431                 segs = DIV_ROUND_UP(skb->len - hdr_len,
1432                                     shinfo->gso_size);
1433         else
1434                 segs = shinfo->gso_segs;
1435
1436         len = shinfo->gso_size + hdr_len;
1437         last_len = skb->len - shinfo->gso_size * (segs - 1);
1438
1439         return (cake_calc_overhead(q, len, off) * (segs - 1) +
1440                 cake_calc_overhead(q, last_len, off));
1441 }
1442
1443 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1444 {
1445         struct cake_heap_entry ii = q->overflow_heap[i];
1446         struct cake_heap_entry jj = q->overflow_heap[j];
1447
1448         q->overflow_heap[i] = jj;
1449         q->overflow_heap[j] = ii;
1450
1451         q->tins[ii.t].overflow_idx[ii.b] = j;
1452         q->tins[jj.t].overflow_idx[jj.b] = i;
1453 }
1454
1455 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1456 {
1457         struct cake_heap_entry ii = q->overflow_heap[i];
1458
1459         return q->tins[ii.t].backlogs[ii.b];
1460 }
1461
1462 static void cake_heapify(struct cake_sched_data *q, u16 i)
1463 {
1464         static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1465         u32 mb = cake_heap_get_backlog(q, i);
1466         u32 m = i;
1467
1468         while (m < a) {
1469                 u32 l = m + m + 1;
1470                 u32 r = l + 1;
1471
1472                 if (l < a) {
1473                         u32 lb = cake_heap_get_backlog(q, l);
1474
1475                         if (lb > mb) {
1476                                 m  = l;
1477                                 mb = lb;
1478                         }
1479                 }
1480
1481                 if (r < a) {
1482                         u32 rb = cake_heap_get_backlog(q, r);
1483
1484                         if (rb > mb) {
1485                                 m  = r;
1486                                 mb = rb;
1487                         }
1488                 }
1489
1490                 if (m != i) {
1491                         cake_heap_swap(q, i, m);
1492                         i = m;
1493                 } else {
1494                         break;
1495                 }
1496         }
1497 }
1498
1499 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1500 {
1501         while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1502                 u16 p = (i - 1) >> 1;
1503                 u32 ib = cake_heap_get_backlog(q, i);
1504                 u32 pb = cake_heap_get_backlog(q, p);
1505
1506                 if (ib > pb) {
1507                         cake_heap_swap(q, i, p);
1508                         i = p;
1509                 } else {
1510                         break;
1511                 }
1512         }
1513 }
1514
1515 static int cake_advance_shaper(struct cake_sched_data *q,
1516                                struct cake_tin_data *b,
1517                                struct sk_buff *skb,
1518                                ktime_t now, bool drop)
1519 {
1520         u32 len = get_cobalt_cb(skb)->adjusted_len;
1521
1522         /* charge packet bandwidth to this tin
1523          * and to the global shaper.
1524          */
1525         if (q->rate_ns) {
1526                 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1527                 u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1528                 u64 failsafe_dur = global_dur + (global_dur >> 1);
1529
1530                 if (ktime_before(b->time_next_packet, now))
1531                         b->time_next_packet = ktime_add_ns(b->time_next_packet,
1532                                                            tin_dur);
1533
1534                 else if (ktime_before(b->time_next_packet,
1535                                       ktime_add_ns(now, tin_dur)))
1536                         b->time_next_packet = ktime_add_ns(now, tin_dur);
1537
1538                 q->time_next_packet = ktime_add_ns(q->time_next_packet,
1539                                                    global_dur);
1540                 if (!drop)
1541                         q->failsafe_next_packet = \
1542                                 ktime_add_ns(q->failsafe_next_packet,
1543                                              failsafe_dur);
1544         }
1545         return len;
1546 }
1547
1548 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1549 {
1550         struct cake_sched_data *q = qdisc_priv(sch);
1551         ktime_t now = ktime_get();
1552         u32 idx = 0, tin = 0, len;
1553         struct cake_heap_entry qq;
1554         struct cake_tin_data *b;
1555         struct cake_flow *flow;
1556         struct sk_buff *skb;
1557
1558         if (!q->overflow_timeout) {
1559                 int i;
1560                 /* Build fresh max-heap */
1561                 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
1562                         cake_heapify(q, i);
1563         }
1564         q->overflow_timeout = 65535;
1565
1566         /* select longest queue for pruning */
1567         qq  = q->overflow_heap[0];
1568         tin = qq.t;
1569         idx = qq.b;
1570
1571         b = &q->tins[tin];
1572         flow = &b->flows[idx];
1573         skb = dequeue_head(flow);
1574         if (unlikely(!skb)) {
1575                 /* heap has gone wrong, rebuild it next time */
1576                 q->overflow_timeout = 0;
1577                 return idx + (tin << 16);
1578         }
1579
1580         if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1581                 b->unresponsive_flow_count++;
1582
1583         len = qdisc_pkt_len(skb);
1584         q->buffer_used      -= skb->truesize;
1585         b->backlogs[idx]    -= len;
1586         b->tin_backlog      -= len;
1587         sch->qstats.backlog -= len;
1588
1589         flow->dropped++;
1590         b->tin_dropped++;
1591
1592         if (q->rate_flags & CAKE_FLAG_INGRESS)
1593                 cake_advance_shaper(q, b, skb, now, true);
1594
1595         qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
1596         sch->q.qlen--;
1597         qdisc_tree_reduce_backlog(sch, 1, len);
1598
1599         cake_heapify(q, 0);
1600
1601         return idx + (tin << 16);
1602 }
1603
1604 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1605 {
1606         const int offset = skb_network_offset(skb);
1607         u16 *buf, buf_;
1608         u8 dscp;
1609
1610         switch (skb_protocol(skb, true)) {
1611         case htons(ETH_P_IP):
1612                 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1613                 if (unlikely(!buf))
1614                         return 0;
1615
1616                 /* ToS is in the second byte of iphdr */
1617                 dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1618
1619                 if (wash && dscp) {
1620                         const int wlen = offset + sizeof(struct iphdr);
1621
1622                         if (!pskb_may_pull(skb, wlen) ||
1623                             skb_try_make_writable(skb, wlen))
1624                                 return 0;
1625
1626                         ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1627                 }
1628
1629                 return dscp;
1630
1631         case htons(ETH_P_IPV6):
1632                 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1633                 if (unlikely(!buf))
1634                         return 0;
1635
1636                 /* Traffic class is in the first and second bytes of ipv6hdr */
1637                 dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1638
1639                 if (wash && dscp) {
1640                         const int wlen = offset + sizeof(struct ipv6hdr);
1641
1642                         if (!pskb_may_pull(skb, wlen) ||
1643                             skb_try_make_writable(skb, wlen))
1644                                 return 0;
1645
1646                         ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1647                 }
1648
1649                 return dscp;
1650
1651         case htons(ETH_P_ARP):
1652                 return 0x38;  /* CS7 - Net Control */
1653
1654         default:
1655                 /* If there is no Diffserv field, treat as best-effort */
1656                 return 0;
1657         }
1658 }
1659
1660 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1661                                              struct sk_buff *skb)
1662 {
1663         struct cake_sched_data *q = qdisc_priv(sch);
1664         u32 tin, mark;
1665         bool wash;
1666         u8 dscp;
1667
1668         /* Tin selection: Default to diffserv-based selection, allow overriding
1669          * using firewall marks or skb->priority. Call DSCP parsing early if
1670          * wash is enabled, otherwise defer to below to skip unneeded parsing.
1671          */
1672         mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1673         wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1674         if (wash)
1675                 dscp = cake_handle_diffserv(skb, wash);
1676
1677         if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1678                 tin = 0;
1679
1680         else if (mark && mark <= q->tin_cnt)
1681                 tin = q->tin_order[mark - 1];
1682
1683         else if (TC_H_MAJ(skb->priority) == sch->handle &&
1684                  TC_H_MIN(skb->priority) > 0 &&
1685                  TC_H_MIN(skb->priority) <= q->tin_cnt)
1686                 tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1687
1688         else {
1689                 if (!wash)
1690                         dscp = cake_handle_diffserv(skb, wash);
1691                 tin = q->tin_index[dscp];
1692
1693                 if (unlikely(tin >= q->tin_cnt))
1694                         tin = 0;
1695         }
1696
1697         return &q->tins[tin];
1698 }
1699
1700 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1701                          struct sk_buff *skb, int flow_mode, int *qerr)
1702 {
1703         struct cake_sched_data *q = qdisc_priv(sch);
1704         struct tcf_proto *filter;
1705         struct tcf_result res;
1706         u16 flow = 0, host = 0;
1707         int result;
1708
1709         filter = rcu_dereference_bh(q->filter_list);
1710         if (!filter)
1711                 goto hash;
1712
1713         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1714         result = tcf_classify(skb, NULL, filter, &res, false);
1715
1716         if (result >= 0) {
1717 #ifdef CONFIG_NET_CLS_ACT
1718                 switch (result) {
1719                 case TC_ACT_STOLEN:
1720                 case TC_ACT_QUEUED:
1721                 case TC_ACT_TRAP:
1722                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1723                         fallthrough;
1724                 case TC_ACT_SHOT:
1725                         return 0;
1726                 }
1727 #endif
1728                 if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1729                         flow = TC_H_MIN(res.classid);
1730                 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1731                         host = TC_H_MAJ(res.classid) >> 16;
1732         }
1733 hash:
1734         *t = cake_select_tin(sch, skb);
1735         return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1736 }
1737
1738 static void cake_reconfigure(struct Qdisc *sch);
1739
1740 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1741                         struct sk_buff **to_free)
1742 {
1743         struct cake_sched_data *q = qdisc_priv(sch);
1744         int len = qdisc_pkt_len(skb);
1745         int ret;
1746         struct sk_buff *ack = NULL;
1747         ktime_t now = ktime_get();
1748         struct cake_tin_data *b;
1749         struct cake_flow *flow;
1750         u32 idx;
1751
1752         /* choose flow to insert into */
1753         idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1754         if (idx == 0) {
1755                 if (ret & __NET_XMIT_BYPASS)
1756                         qdisc_qstats_drop(sch);
1757                 __qdisc_drop(skb, to_free);
1758                 return ret;
1759         }
1760         idx--;
1761         flow = &b->flows[idx];
1762
1763         /* ensure shaper state isn't stale */
1764         if (!b->tin_backlog) {
1765                 if (ktime_before(b->time_next_packet, now))
1766                         b->time_next_packet = now;
1767
1768                 if (!sch->q.qlen) {
1769                         if (ktime_before(q->time_next_packet, now)) {
1770                                 q->failsafe_next_packet = now;
1771                                 q->time_next_packet = now;
1772                         } else if (ktime_after(q->time_next_packet, now) &&
1773                                    ktime_after(q->failsafe_next_packet, now)) {
1774                                 u64 next = \
1775                                         min(ktime_to_ns(q->time_next_packet),
1776                                             ktime_to_ns(
1777                                                    q->failsafe_next_packet));
1778                                 sch->qstats.overlimits++;
1779                                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1780                         }
1781                 }
1782         }
1783
1784         if (unlikely(len > b->max_skblen))
1785                 b->max_skblen = len;
1786
1787         if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1788                 struct sk_buff *segs, *nskb;
1789                 netdev_features_t features = netif_skb_features(skb);
1790                 unsigned int slen = 0, numsegs = 0;
1791
1792                 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1793                 if (IS_ERR_OR_NULL(segs))
1794                         return qdisc_drop(skb, sch, to_free);
1795
1796                 skb_list_walk_safe(segs, segs, nskb) {
1797                         skb_mark_not_on_list(segs);
1798                         qdisc_skb_cb(segs)->pkt_len = segs->len;
1799                         cobalt_set_enqueue_time(segs, now);
1800                         get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1801                                                                           segs);
1802                         flow_queue_add(flow, segs);
1803
1804                         sch->q.qlen++;
1805                         numsegs++;
1806                         slen += segs->len;
1807                         q->buffer_used += segs->truesize;
1808                         b->packets++;
1809                 }
1810
1811                 /* stats */
1812                 b->bytes            += slen;
1813                 b->backlogs[idx]    += slen;
1814                 b->tin_backlog      += slen;
1815                 sch->qstats.backlog += slen;
1816                 q->avg_window_bytes += slen;
1817
1818                 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1819                 consume_skb(skb);
1820         } else {
1821                 /* not splitting */
1822                 cobalt_set_enqueue_time(skb, now);
1823                 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1824                 flow_queue_add(flow, skb);
1825
1826                 if (q->ack_filter)
1827                         ack = cake_ack_filter(q, flow);
1828
1829                 if (ack) {
1830                         b->ack_drops++;
1831                         sch->qstats.drops++;
1832                         b->bytes += qdisc_pkt_len(ack);
1833                         len -= qdisc_pkt_len(ack);
1834                         q->buffer_used += skb->truesize - ack->truesize;
1835                         if (q->rate_flags & CAKE_FLAG_INGRESS)
1836                                 cake_advance_shaper(q, b, ack, now, true);
1837
1838                         qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1839                         consume_skb(ack);
1840                 } else {
1841                         sch->q.qlen++;
1842                         q->buffer_used      += skb->truesize;
1843                 }
1844
1845                 /* stats */
1846                 b->packets++;
1847                 b->bytes            += len;
1848                 b->backlogs[idx]    += len;
1849                 b->tin_backlog      += len;
1850                 sch->qstats.backlog += len;
1851                 q->avg_window_bytes += len;
1852         }
1853
1854         if (q->overflow_timeout)
1855                 cake_heapify_up(q, b->overflow_idx[idx]);
1856
1857         /* incoming bandwidth capacity estimate */
1858         if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1859                 u64 packet_interval = \
1860                         ktime_to_ns(ktime_sub(now, q->last_packet_time));
1861
1862                 if (packet_interval > NSEC_PER_SEC)
1863                         packet_interval = NSEC_PER_SEC;
1864
1865                 /* filter out short-term bursts, eg. wifi aggregation */
1866                 q->avg_packet_interval = \
1867                         cake_ewma(q->avg_packet_interval,
1868                                   packet_interval,
1869                                   (packet_interval > q->avg_packet_interval ?
1870                                           2 : 8));
1871
1872                 q->last_packet_time = now;
1873
1874                 if (packet_interval > q->avg_packet_interval) {
1875                         u64 window_interval = \
1876                                 ktime_to_ns(ktime_sub(now,
1877                                                       q->avg_window_begin));
1878                         u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1879
1880                         b = div64_u64(b, window_interval);
1881                         q->avg_peak_bandwidth =
1882                                 cake_ewma(q->avg_peak_bandwidth, b,
1883                                           b > q->avg_peak_bandwidth ? 2 : 8);
1884                         q->avg_window_bytes = 0;
1885                         q->avg_window_begin = now;
1886
1887                         if (ktime_after(now,
1888                                         ktime_add_ms(q->last_reconfig_time,
1889                                                      250))) {
1890                                 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1891                                 cake_reconfigure(sch);
1892                         }
1893                 }
1894         } else {
1895                 q->avg_window_bytes = 0;
1896                 q->last_packet_time = now;
1897         }
1898
1899         /* flowchain */
1900         if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1901                 if (!flow->set) {
1902                         list_add_tail(&flow->flowchain, &b->new_flows);
1903                 } else {
1904                         b->decaying_flow_count--;
1905                         list_move_tail(&flow->flowchain, &b->new_flows);
1906                 }
1907                 flow->set = CAKE_SET_SPARSE;
1908                 b->sparse_flow_count++;
1909
1910                 flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode);
1911         } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1912                 /* this flow was empty, accounted as a sparse flow, but actually
1913                  * in the bulk rotation.
1914                  */
1915                 flow->set = CAKE_SET_BULK;
1916                 b->sparse_flow_count--;
1917                 b->bulk_flow_count++;
1918
1919                 cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
1920                 cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
1921         }
1922
1923         if (q->buffer_used > q->buffer_max_used)
1924                 q->buffer_max_used = q->buffer_used;
1925
1926         if (q->buffer_used > q->buffer_limit) {
1927                 u32 dropped = 0;
1928
1929                 while (q->buffer_used > q->buffer_limit) {
1930                         dropped++;
1931                         cake_drop(sch, to_free);
1932                 }
1933                 b->drop_overlimit += dropped;
1934         }
1935         return NET_XMIT_SUCCESS;
1936 }
1937
1938 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1939 {
1940         struct cake_sched_data *q = qdisc_priv(sch);
1941         struct cake_tin_data *b = &q->tins[q->cur_tin];
1942         struct cake_flow *flow = &b->flows[q->cur_flow];
1943         struct sk_buff *skb = NULL;
1944         u32 len;
1945
1946         if (flow->head) {
1947                 skb = dequeue_head(flow);
1948                 len = qdisc_pkt_len(skb);
1949                 b->backlogs[q->cur_flow] -= len;
1950                 b->tin_backlog           -= len;
1951                 sch->qstats.backlog      -= len;
1952                 q->buffer_used           -= skb->truesize;
1953                 sch->q.qlen--;
1954
1955                 if (q->overflow_timeout)
1956                         cake_heapify(q, b->overflow_idx[q->cur_flow]);
1957         }
1958         return skb;
1959 }
1960
1961 /* Discard leftover packets from a tin no longer in use. */
1962 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1963 {
1964         struct cake_sched_data *q = qdisc_priv(sch);
1965         struct sk_buff *skb;
1966
1967         q->cur_tin = tin;
1968         for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1969                 while (!!(skb = cake_dequeue_one(sch)))
1970                         kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
1971 }
1972
1973 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1974 {
1975         struct cake_sched_data *q = qdisc_priv(sch);
1976         struct cake_tin_data *b = &q->tins[q->cur_tin];
1977         enum skb_drop_reason reason;
1978         ktime_t now = ktime_get();
1979         struct cake_flow *flow;
1980         struct list_head *head;
1981         bool first_flow = true;
1982         struct sk_buff *skb;
1983         u64 delay;
1984         u32 len;
1985
1986 begin:
1987         if (!sch->q.qlen)
1988                 return NULL;
1989
1990         /* global hard shaper */
1991         if (ktime_after(q->time_next_packet, now) &&
1992             ktime_after(q->failsafe_next_packet, now)) {
1993                 u64 next = min(ktime_to_ns(q->time_next_packet),
1994                                ktime_to_ns(q->failsafe_next_packet));
1995
1996                 sch->qstats.overlimits++;
1997                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1998                 return NULL;
1999         }
2000
2001         /* Choose a class to work on. */
2002         if (!q->rate_ns) {
2003                 /* In unlimited mode, can't rely on shaper timings, just balance
2004                  * with DRR
2005                  */
2006                 bool wrapped = false, empty = true;
2007
2008                 while (b->tin_deficit < 0 ||
2009                        !(b->sparse_flow_count + b->bulk_flow_count)) {
2010                         if (b->tin_deficit <= 0)
2011                                 b->tin_deficit += b->tin_quantum;
2012                         if (b->sparse_flow_count + b->bulk_flow_count)
2013                                 empty = false;
2014
2015                         q->cur_tin++;
2016                         b++;
2017                         if (q->cur_tin >= q->tin_cnt) {
2018                                 q->cur_tin = 0;
2019                                 b = q->tins;
2020
2021                                 if (wrapped) {
2022                                         /* It's possible for q->qlen to be
2023                                          * nonzero when we actually have no
2024                                          * packets anywhere.
2025                                          */
2026                                         if (empty)
2027                                                 return NULL;
2028                                 } else {
2029                                         wrapped = true;
2030                                 }
2031                         }
2032                 }
2033         } else {
2034                 /* In shaped mode, choose:
2035                  * - Highest-priority tin with queue and meeting schedule, or
2036                  * - The earliest-scheduled tin with queue.
2037                  */
2038                 ktime_t best_time = KTIME_MAX;
2039                 int tin, best_tin = 0;
2040
2041                 for (tin = 0; tin < q->tin_cnt; tin++) {
2042                         b = q->tins + tin;
2043                         if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2044                                 ktime_t time_to_pkt = \
2045                                         ktime_sub(b->time_next_packet, now);
2046
2047                                 if (ktime_to_ns(time_to_pkt) <= 0 ||
2048                                     ktime_compare(time_to_pkt,
2049                                                   best_time) <= 0) {
2050                                         best_time = time_to_pkt;
2051                                         best_tin = tin;
2052                                 }
2053                         }
2054                 }
2055
2056                 q->cur_tin = best_tin;
2057                 b = q->tins + best_tin;
2058
2059                 /* No point in going further if no packets to deliver. */
2060                 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2061                         return NULL;
2062         }
2063
2064 retry:
2065         /* service this class */
2066         head = &b->decaying_flows;
2067         if (!first_flow || list_empty(head)) {
2068                 head = &b->new_flows;
2069                 if (list_empty(head)) {
2070                         head = &b->old_flows;
2071                         if (unlikely(list_empty(head))) {
2072                                 head = &b->decaying_flows;
2073                                 if (unlikely(list_empty(head)))
2074                                         goto begin;
2075                         }
2076                 }
2077         }
2078         flow = list_first_entry(head, struct cake_flow, flowchain);
2079         q->cur_flow = flow - b->flows;
2080         first_flow = false;
2081
2082         /* flow isolation (DRR++) */
2083         if (flow->deficit <= 0) {
2084                 /* Keep all flows with deficits out of the sparse and decaying
2085                  * rotations.  No non-empty flow can go into the decaying
2086                  * rotation, so they can't get deficits
2087                  */
2088                 if (flow->set == CAKE_SET_SPARSE) {
2089                         if (flow->head) {
2090                                 b->sparse_flow_count--;
2091                                 b->bulk_flow_count++;
2092
2093                                 cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
2094                                 cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2095
2096                                 flow->set = CAKE_SET_BULK;
2097                         } else {
2098                                 /* we've moved it to the bulk rotation for
2099                                  * correct deficit accounting but we still want
2100                                  * to count it as a sparse flow, not a bulk one.
2101                                  */
2102                                 flow->set = CAKE_SET_SPARSE_WAIT;
2103                         }
2104                 }
2105
2106                 flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode);
2107                 list_move_tail(&flow->flowchain, &b->old_flows);
2108
2109                 goto retry;
2110         }
2111
2112         /* Retrieve a packet via the AQM */
2113         while (1) {
2114                 skb = cake_dequeue_one(sch);
2115                 if (!skb) {
2116                         /* this queue was actually empty */
2117                         if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2118                                 b->unresponsive_flow_count--;
2119
2120                         if (flow->cvars.p_drop || flow->cvars.count ||
2121                             ktime_before(now, flow->cvars.drop_next)) {
2122                                 /* keep in the flowchain until the state has
2123                                  * decayed to rest
2124                                  */
2125                                 list_move_tail(&flow->flowchain,
2126                                                &b->decaying_flows);
2127                                 if (flow->set == CAKE_SET_BULK) {
2128                                         b->bulk_flow_count--;
2129
2130                                         cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2131                                         cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2132
2133                                         b->decaying_flow_count++;
2134                                 } else if (flow->set == CAKE_SET_SPARSE ||
2135                                            flow->set == CAKE_SET_SPARSE_WAIT) {
2136                                         b->sparse_flow_count--;
2137                                         b->decaying_flow_count++;
2138                                 }
2139                                 flow->set = CAKE_SET_DECAYING;
2140                         } else {
2141                                 /* remove empty queue from the flowchain */
2142                                 list_del_init(&flow->flowchain);
2143                                 if (flow->set == CAKE_SET_SPARSE ||
2144                                     flow->set == CAKE_SET_SPARSE_WAIT)
2145                                         b->sparse_flow_count--;
2146                                 else if (flow->set == CAKE_SET_BULK) {
2147                                         b->bulk_flow_count--;
2148
2149                                         cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2150                                         cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2151                                 } else
2152                                         b->decaying_flow_count--;
2153
2154                                 flow->set = CAKE_SET_NONE;
2155                         }
2156                         goto begin;
2157                 }
2158
2159                 reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2160                                             (b->bulk_flow_count *
2161                                              !!(q->rate_flags &
2162                                                 CAKE_FLAG_INGRESS)));
2163                 /* Last packet in queue may be marked, shouldn't be dropped */
2164                 if (reason == SKB_NOT_DROPPED_YET || !flow->head)
2165                         break;
2166
2167                 /* drop this packet, get another one */
2168                 if (q->rate_flags & CAKE_FLAG_INGRESS) {
2169                         len = cake_advance_shaper(q, b, skb,
2170                                                   now, true);
2171                         flow->deficit -= len;
2172                         b->tin_deficit -= len;
2173                 }
2174                 flow->dropped++;
2175                 b->tin_dropped++;
2176                 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2177                 qdisc_qstats_drop(sch);
2178                 kfree_skb_reason(skb, reason);
2179                 if (q->rate_flags & CAKE_FLAG_INGRESS)
2180                         goto retry;
2181         }
2182
2183         b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2184         qdisc_bstats_update(sch, skb);
2185
2186         /* collect delay stats */
2187         delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2188         b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2189         b->peak_delay = cake_ewma(b->peak_delay, delay,
2190                                   delay > b->peak_delay ? 2 : 8);
2191         b->base_delay = cake_ewma(b->base_delay, delay,
2192                                   delay < b->base_delay ? 2 : 8);
2193
2194         len = cake_advance_shaper(q, b, skb, now, false);
2195         flow->deficit -= len;
2196         b->tin_deficit -= len;
2197
2198         if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2199                 u64 next = min(ktime_to_ns(q->time_next_packet),
2200                                ktime_to_ns(q->failsafe_next_packet));
2201
2202                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2203         } else if (!sch->q.qlen) {
2204                 int i;
2205
2206                 for (i = 0; i < q->tin_cnt; i++) {
2207                         if (q->tins[i].decaying_flow_count) {
2208                                 ktime_t next = \
2209                                         ktime_add_ns(now,
2210                                                      q->tins[i].cparams.target);
2211
2212                                 qdisc_watchdog_schedule_ns(&q->watchdog,
2213                                                            ktime_to_ns(next));
2214                                 break;
2215                         }
2216                 }
2217         }
2218
2219         if (q->overflow_timeout)
2220                 q->overflow_timeout--;
2221
2222         return skb;
2223 }
2224
2225 static void cake_reset(struct Qdisc *sch)
2226 {
2227         struct cake_sched_data *q = qdisc_priv(sch);
2228         u32 c;
2229
2230         if (!q->tins)
2231                 return;
2232
2233         for (c = 0; c < CAKE_MAX_TINS; c++)
2234                 cake_clear_tin(sch, c);
2235 }
2236
2237 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2238         [TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2239         [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2240         [TCA_CAKE_ATM]           = { .type = NLA_U32 },
2241         [TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2242         [TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2243         [TCA_CAKE_RTT]           = { .type = NLA_U32 },
2244         [TCA_CAKE_TARGET]        = { .type = NLA_U32 },
2245         [TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2246         [TCA_CAKE_MEMORY]        = { .type = NLA_U32 },
2247         [TCA_CAKE_NAT]           = { .type = NLA_U32 },
2248         [TCA_CAKE_RAW]           = { .type = NLA_U32 },
2249         [TCA_CAKE_WASH]          = { .type = NLA_U32 },
2250         [TCA_CAKE_MPU]           = { .type = NLA_U32 },
2251         [TCA_CAKE_INGRESS]       = { .type = NLA_U32 },
2252         [TCA_CAKE_ACK_FILTER]    = { .type = NLA_U32 },
2253         [TCA_CAKE_SPLIT_GSO]     = { .type = NLA_U32 },
2254         [TCA_CAKE_FWMARK]        = { .type = NLA_U32 },
2255 };
2256
2257 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2258                           u64 target_ns, u64 rtt_est_ns)
2259 {
2260         /* convert byte-rate into time-per-byte
2261          * so it will always unwedge in reasonable time.
2262          */
2263         static const u64 MIN_RATE = 64;
2264         u32 byte_target = mtu;
2265         u64 byte_target_ns;
2266         u8  rate_shft = 0;
2267         u64 rate_ns = 0;
2268
2269         b->flow_quantum = 1514;
2270         if (rate) {
2271                 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2272                 rate_shft = 34;
2273                 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2274                 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2275                 while (!!(rate_ns >> 34)) {
2276                         rate_ns >>= 1;
2277                         rate_shft--;
2278                 }
2279         } /* else unlimited, ie. zero delay */
2280
2281         b->tin_rate_bps  = rate;
2282         b->tin_rate_ns   = rate_ns;
2283         b->tin_rate_shft = rate_shft;
2284
2285         byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2286
2287         b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2288         b->cparams.interval = max(rtt_est_ns +
2289                                      b->cparams.target - target_ns,
2290                                      b->cparams.target * 2);
2291         b->cparams.mtu_time = byte_target_ns;
2292         b->cparams.p_inc = 1 << 24; /* 1/256 */
2293         b->cparams.p_dec = 1 << 20; /* 1/4096 */
2294 }
2295
2296 static int cake_config_besteffort(struct Qdisc *sch)
2297 {
2298         struct cake_sched_data *q = qdisc_priv(sch);
2299         struct cake_tin_data *b = &q->tins[0];
2300         u32 mtu = psched_mtu(qdisc_dev(sch));
2301         u64 rate = q->rate_bps;
2302
2303         q->tin_cnt = 1;
2304
2305         q->tin_index = besteffort;
2306         q->tin_order = normal_order;
2307
2308         cake_set_rate(b, rate, mtu,
2309                       us_to_ns(q->target), us_to_ns(q->interval));
2310         b->tin_quantum = 65535;
2311
2312         return 0;
2313 }
2314
2315 static int cake_config_precedence(struct Qdisc *sch)
2316 {
2317         /* convert high-level (user visible) parameters into internal format */
2318         struct cake_sched_data *q = qdisc_priv(sch);
2319         u32 mtu = psched_mtu(qdisc_dev(sch));
2320         u64 rate = q->rate_bps;
2321         u32 quantum = 256;
2322         u32 i;
2323
2324         q->tin_cnt = 8;
2325         q->tin_index = precedence;
2326         q->tin_order = normal_order;
2327
2328         for (i = 0; i < q->tin_cnt; i++) {
2329                 struct cake_tin_data *b = &q->tins[i];
2330
2331                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2332                               us_to_ns(q->interval));
2333
2334                 b->tin_quantum = max_t(u16, 1U, quantum);
2335
2336                 /* calculate next class's parameters */
2337                 rate  *= 7;
2338                 rate >>= 3;
2339
2340                 quantum  *= 7;
2341                 quantum >>= 3;
2342         }
2343
2344         return 0;
2345 }
2346
2347 /*      List of known Diffserv codepoints:
2348  *
2349  *      Default Forwarding (DF/CS0) - Best Effort
2350  *      Max Throughput (TOS2)
2351  *      Min Delay (TOS4)
2352  *      LLT "La" (TOS5)
2353  *      Assured Forwarding 1 (AF1x) - x3
2354  *      Assured Forwarding 2 (AF2x) - x3
2355  *      Assured Forwarding 3 (AF3x) - x3
2356  *      Assured Forwarding 4 (AF4x) - x3
2357  *      Precedence Class 1 (CS1)
2358  *      Precedence Class 2 (CS2)
2359  *      Precedence Class 3 (CS3)
2360  *      Precedence Class 4 (CS4)
2361  *      Precedence Class 5 (CS5)
2362  *      Precedence Class 6 (CS6)
2363  *      Precedence Class 7 (CS7)
2364  *      Voice Admit (VA)
2365  *      Expedited Forwarding (EF)
2366  *      Lower Effort (LE)
2367  *
2368  *      Total 26 codepoints.
2369  */
2370
2371 /*      List of traffic classes in RFC 4594, updated by RFC 8622:
2372  *              (roughly descending order of contended priority)
2373  *              (roughly ascending order of uncontended throughput)
2374  *
2375  *      Network Control (CS6,CS7)      - routing traffic
2376  *      Telephony (EF,VA)         - aka. VoIP streams
2377  *      Signalling (CS5)               - VoIP setup
2378  *      Multimedia Conferencing (AF4x) - aka. video calls
2379  *      Realtime Interactive (CS4)     - eg. games
2380  *      Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2381  *      Broadcast Video (CS3)
2382  *      Low-Latency Data (AF2x,TOS4)      - eg. database
2383  *      Ops, Admin, Management (CS2)      - eg. ssh
2384  *      Standard Service (DF & unrecognised codepoints)
2385  *      High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2386  *      Low-Priority Data (LE,CS1)        - eg. BitTorrent
2387  *
2388  *      Total 12 traffic classes.
2389  */
2390
2391 static int cake_config_diffserv8(struct Qdisc *sch)
2392 {
2393 /*      Pruned list of traffic classes for typical applications:
2394  *
2395  *              Network Control          (CS6, CS7)
2396  *              Minimum Latency          (EF, VA, CS5, CS4)
2397  *              Interactive Shell        (CS2)
2398  *              Low Latency Transactions (AF2x, TOS4)
2399  *              Video Streaming          (AF4x, AF3x, CS3)
2400  *              Bog Standard             (DF etc.)
2401  *              High Throughput          (AF1x, TOS2, CS1)
2402  *              Background Traffic       (LE)
2403  *
2404  *              Total 8 traffic classes.
2405  */
2406
2407         struct cake_sched_data *q = qdisc_priv(sch);
2408         u32 mtu = psched_mtu(qdisc_dev(sch));
2409         u64 rate = q->rate_bps;
2410         u32 quantum = 256;
2411         u32 i;
2412
2413         q->tin_cnt = 8;
2414
2415         /* codepoint to class mapping */
2416         q->tin_index = diffserv8;
2417         q->tin_order = normal_order;
2418
2419         /* class characteristics */
2420         for (i = 0; i < q->tin_cnt; i++) {
2421                 struct cake_tin_data *b = &q->tins[i];
2422
2423                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2424                               us_to_ns(q->interval));
2425
2426                 b->tin_quantum = max_t(u16, 1U, quantum);
2427
2428                 /* calculate next class's parameters */
2429                 rate  *= 7;
2430                 rate >>= 3;
2431
2432                 quantum  *= 7;
2433                 quantum >>= 3;
2434         }
2435
2436         return 0;
2437 }
2438
2439 static int cake_config_diffserv4(struct Qdisc *sch)
2440 {
2441 /*  Further pruned list of traffic classes for four-class system:
2442  *
2443  *          Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2444  *          Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2445  *          Best Effort        (DF, AF1x, TOS2, and those not specified)
2446  *          Background Traffic (LE, CS1)
2447  *
2448  *              Total 4 traffic classes.
2449  */
2450
2451         struct cake_sched_data *q = qdisc_priv(sch);
2452         u32 mtu = psched_mtu(qdisc_dev(sch));
2453         u64 rate = q->rate_bps;
2454         u32 quantum = 1024;
2455
2456         q->tin_cnt = 4;
2457
2458         /* codepoint to class mapping */
2459         q->tin_index = diffserv4;
2460         q->tin_order = bulk_order;
2461
2462         /* class characteristics */
2463         cake_set_rate(&q->tins[0], rate, mtu,
2464                       us_to_ns(q->target), us_to_ns(q->interval));
2465         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2466                       us_to_ns(q->target), us_to_ns(q->interval));
2467         cake_set_rate(&q->tins[2], rate >> 1, mtu,
2468                       us_to_ns(q->target), us_to_ns(q->interval));
2469         cake_set_rate(&q->tins[3], rate >> 2, mtu,
2470                       us_to_ns(q->target), us_to_ns(q->interval));
2471
2472         /* bandwidth-sharing weights */
2473         q->tins[0].tin_quantum = quantum;
2474         q->tins[1].tin_quantum = quantum >> 4;
2475         q->tins[2].tin_quantum = quantum >> 1;
2476         q->tins[3].tin_quantum = quantum >> 2;
2477
2478         return 0;
2479 }
2480
2481 static int cake_config_diffserv3(struct Qdisc *sch)
2482 {
2483 /*  Simplified Diffserv structure with 3 tins.
2484  *              Latency Sensitive       (CS7, CS6, EF, VA, TOS4)
2485  *              Best Effort
2486  *              Low Priority            (LE, CS1)
2487  */
2488         struct cake_sched_data *q = qdisc_priv(sch);
2489         u32 mtu = psched_mtu(qdisc_dev(sch));
2490         u64 rate = q->rate_bps;
2491         u32 quantum = 1024;
2492
2493         q->tin_cnt = 3;
2494
2495         /* codepoint to class mapping */
2496         q->tin_index = diffserv3;
2497         q->tin_order = bulk_order;
2498
2499         /* class characteristics */
2500         cake_set_rate(&q->tins[0], rate, mtu,
2501                       us_to_ns(q->target), us_to_ns(q->interval));
2502         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2503                       us_to_ns(q->target), us_to_ns(q->interval));
2504         cake_set_rate(&q->tins[2], rate >> 2, mtu,
2505                       us_to_ns(q->target), us_to_ns(q->interval));
2506
2507         /* bandwidth-sharing weights */
2508         q->tins[0].tin_quantum = quantum;
2509         q->tins[1].tin_quantum = quantum >> 4;
2510         q->tins[2].tin_quantum = quantum >> 2;
2511
2512         return 0;
2513 }
2514
2515 static void cake_reconfigure(struct Qdisc *sch)
2516 {
2517         struct cake_sched_data *q = qdisc_priv(sch);
2518         int c, ft;
2519
2520         switch (q->tin_mode) {
2521         case CAKE_DIFFSERV_BESTEFFORT:
2522                 ft = cake_config_besteffort(sch);
2523                 break;
2524
2525         case CAKE_DIFFSERV_PRECEDENCE:
2526                 ft = cake_config_precedence(sch);
2527                 break;
2528
2529         case CAKE_DIFFSERV_DIFFSERV8:
2530                 ft = cake_config_diffserv8(sch);
2531                 break;
2532
2533         case CAKE_DIFFSERV_DIFFSERV4:
2534                 ft = cake_config_diffserv4(sch);
2535                 break;
2536
2537         case CAKE_DIFFSERV_DIFFSERV3:
2538         default:
2539                 ft = cake_config_diffserv3(sch);
2540                 break;
2541         }
2542
2543         for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2544                 cake_clear_tin(sch, c);
2545                 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2546         }
2547
2548         q->rate_ns   = q->tins[ft].tin_rate_ns;
2549         q->rate_shft = q->tins[ft].tin_rate_shft;
2550
2551         if (q->buffer_config_limit) {
2552                 q->buffer_limit = q->buffer_config_limit;
2553         } else if (q->rate_bps) {
2554                 u64 t = q->rate_bps * q->interval;
2555
2556                 do_div(t, USEC_PER_SEC / 4);
2557                 q->buffer_limit = max_t(u32, t, 4U << 20);
2558         } else {
2559                 q->buffer_limit = ~0;
2560         }
2561
2562         sch->flags &= ~TCQ_F_CAN_BYPASS;
2563
2564         q->buffer_limit = min(q->buffer_limit,
2565                               max(sch->limit * psched_mtu(qdisc_dev(sch)),
2566                                   q->buffer_config_limit));
2567 }
2568
2569 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2570                        struct netlink_ext_ack *extack)
2571 {
2572         struct cake_sched_data *q = qdisc_priv(sch);
2573         struct nlattr *tb[TCA_CAKE_MAX + 1];
2574         u16 rate_flags;
2575         u8 flow_mode;
2576         int err;
2577
2578         err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2579                                           extack);
2580         if (err < 0)
2581                 return err;
2582
2583         flow_mode = q->flow_mode;
2584         if (tb[TCA_CAKE_NAT]) {
2585 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2586                 flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2587                 flow_mode |= CAKE_FLOW_NAT_FLAG *
2588                         !!nla_get_u32(tb[TCA_CAKE_NAT]);
2589 #else
2590                 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2591                                     "No conntrack support in kernel");
2592                 return -EOPNOTSUPP;
2593 #endif
2594         }
2595
2596         if (tb[TCA_CAKE_BASE_RATE64])
2597                 WRITE_ONCE(q->rate_bps,
2598                            nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));
2599
2600         if (tb[TCA_CAKE_DIFFSERV_MODE])
2601                 WRITE_ONCE(q->tin_mode,
2602                            nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));
2603
2604         rate_flags = q->rate_flags;
2605         if (tb[TCA_CAKE_WASH]) {
2606                 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2607                         rate_flags |= CAKE_FLAG_WASH;
2608                 else
2609                         rate_flags &= ~CAKE_FLAG_WASH;
2610         }
2611
2612         if (tb[TCA_CAKE_FLOW_MODE])
2613                 flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) |
2614                                 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2615                                         CAKE_FLOW_MASK));
2616
2617         if (tb[TCA_CAKE_ATM])
2618                 WRITE_ONCE(q->atm_mode,
2619                            nla_get_u32(tb[TCA_CAKE_ATM]));
2620
2621         if (tb[TCA_CAKE_OVERHEAD]) {
2622                 WRITE_ONCE(q->rate_overhead,
2623                            nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
2624                 rate_flags |= CAKE_FLAG_OVERHEAD;
2625
2626                 q->max_netlen = 0;
2627                 q->max_adjlen = 0;
2628                 q->min_netlen = ~0;
2629                 q->min_adjlen = ~0;
2630         }
2631
2632         if (tb[TCA_CAKE_RAW]) {
2633                 rate_flags &= ~CAKE_FLAG_OVERHEAD;
2634
2635                 q->max_netlen = 0;
2636                 q->max_adjlen = 0;
2637                 q->min_netlen = ~0;
2638                 q->min_adjlen = ~0;
2639         }
2640
2641         if (tb[TCA_CAKE_MPU])
2642                 WRITE_ONCE(q->rate_mpu,
2643                            nla_get_u32(tb[TCA_CAKE_MPU]));
2644
2645         if (tb[TCA_CAKE_RTT]) {
2646                 u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2647
2648                 WRITE_ONCE(q->interval, max(interval, 1U));
2649         }
2650
2651         if (tb[TCA_CAKE_TARGET]) {
2652                 u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2653
2654                 WRITE_ONCE(q->target, max(target, 1U));
2655         }
2656
2657         if (tb[TCA_CAKE_AUTORATE]) {
2658                 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2659                         rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2660                 else
2661                         rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2662         }
2663
2664         if (tb[TCA_CAKE_INGRESS]) {
2665                 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2666                         rate_flags |= CAKE_FLAG_INGRESS;
2667                 else
2668                         rate_flags &= ~CAKE_FLAG_INGRESS;
2669         }
2670
2671         if (tb[TCA_CAKE_ACK_FILTER])
2672                 WRITE_ONCE(q->ack_filter,
2673                            nla_get_u32(tb[TCA_CAKE_ACK_FILTER]));
2674
2675         if (tb[TCA_CAKE_MEMORY])
2676                 WRITE_ONCE(q->buffer_config_limit,
2677                            nla_get_u32(tb[TCA_CAKE_MEMORY]));
2678
2679         if (tb[TCA_CAKE_SPLIT_GSO]) {
2680                 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2681                         rate_flags |= CAKE_FLAG_SPLIT_GSO;
2682                 else
2683                         rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2684         }
2685
2686         if (tb[TCA_CAKE_FWMARK]) {
2687                 WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK]));
2688                 WRITE_ONCE(q->fwmark_shft,
2689                            q->fwmark_mask ? __ffs(q->fwmark_mask) : 0);
2690         }
2691
2692         WRITE_ONCE(q->rate_flags, rate_flags);
2693         WRITE_ONCE(q->flow_mode, flow_mode);
2694         if (q->tins) {
2695                 sch_tree_lock(sch);
2696                 cake_reconfigure(sch);
2697                 sch_tree_unlock(sch);
2698         }
2699
2700         return 0;
2701 }
2702
2703 static void cake_destroy(struct Qdisc *sch)
2704 {
2705         struct cake_sched_data *q = qdisc_priv(sch);
2706
2707         qdisc_watchdog_cancel(&q->watchdog);
2708         tcf_block_put(q->block);
2709         kvfree(q->tins);
2710 }
2711
2712 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2713                      struct netlink_ext_ack *extack)
2714 {
2715         struct cake_sched_data *q = qdisc_priv(sch);
2716         int i, j, err;
2717
2718         sch->limit = 10240;
2719         q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2720         q->flow_mode  = CAKE_FLOW_TRIPLE;
2721
2722         q->rate_bps = 0; /* unlimited by default */
2723
2724         q->interval = 100000; /* 100ms default */
2725         q->target   =   5000; /* 5ms: codel RFC argues
2726                                * for 5 to 10% of interval
2727                                */
2728         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2729         q->cur_tin = 0;
2730         q->cur_flow  = 0;
2731
2732         qdisc_watchdog_init(&q->watchdog, sch);
2733
2734         if (opt) {
2735                 err = cake_change(sch, opt, extack);
2736
2737                 if (err)
2738                         return err;
2739         }
2740
2741         err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2742         if (err)
2743                 return err;
2744
2745         quantum_div[0] = ~0;
2746         for (i = 1; i <= CAKE_QUEUES; i++)
2747                 quantum_div[i] = 65535 / i;
2748
2749         q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2750                            GFP_KERNEL);
2751         if (!q->tins)
2752                 return -ENOMEM;
2753
2754         for (i = 0; i < CAKE_MAX_TINS; i++) {
2755                 struct cake_tin_data *b = q->tins + i;
2756
2757                 INIT_LIST_HEAD(&b->new_flows);
2758                 INIT_LIST_HEAD(&b->old_flows);
2759                 INIT_LIST_HEAD(&b->decaying_flows);
2760                 b->sparse_flow_count = 0;
2761                 b->bulk_flow_count = 0;
2762                 b->decaying_flow_count = 0;
2763
2764                 for (j = 0; j < CAKE_QUEUES; j++) {
2765                         struct cake_flow *flow = b->flows + j;
2766                         u32 k = j * CAKE_MAX_TINS + i;
2767
2768                         INIT_LIST_HEAD(&flow->flowchain);
2769                         cobalt_vars_init(&flow->cvars);
2770
2771                         q->overflow_heap[k].t = i;
2772                         q->overflow_heap[k].b = j;
2773                         b->overflow_idx[j] = k;
2774                 }
2775         }
2776
2777         cake_reconfigure(sch);
2778         q->avg_peak_bandwidth = q->rate_bps;
2779         q->min_netlen = ~0;
2780         q->min_adjlen = ~0;
2781         return 0;
2782 }
2783
2784 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2785 {
2786         struct cake_sched_data *q = qdisc_priv(sch);
2787         struct nlattr *opts;
2788         u16 rate_flags;
2789         u8 flow_mode;
2790
2791         opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2792         if (!opts)
2793                 goto nla_put_failure;
2794
2795         if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64,
2796                               READ_ONCE(q->rate_bps), TCA_CAKE_PAD))
2797                 goto nla_put_failure;
2798
2799         flow_mode = READ_ONCE(q->flow_mode);
2800         if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK))
2801                 goto nla_put_failure;
2802
2803         if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval)))
2804                 goto nla_put_failure;
2805
2806         if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target)))
2807                 goto nla_put_failure;
2808
2809         if (nla_put_u32(skb, TCA_CAKE_MEMORY,
2810                         READ_ONCE(q->buffer_config_limit)))
2811                 goto nla_put_failure;
2812
2813         rate_flags = READ_ONCE(q->rate_flags);
2814         if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2815                         !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2816                 goto nla_put_failure;
2817
2818         if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2819                         !!(rate_flags & CAKE_FLAG_INGRESS)))
2820                 goto nla_put_failure;
2821
2822         if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter)))
2823                 goto nla_put_failure;
2824
2825         if (nla_put_u32(skb, TCA_CAKE_NAT,
2826                         !!(flow_mode & CAKE_FLOW_NAT_FLAG)))
2827                 goto nla_put_failure;
2828
2829         if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode)))
2830                 goto nla_put_failure;
2831
2832         if (nla_put_u32(skb, TCA_CAKE_WASH,
2833                         !!(rate_flags & CAKE_FLAG_WASH)))
2834                 goto nla_put_failure;
2835
2836         if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead)))
2837                 goto nla_put_failure;
2838
2839         if (!(rate_flags & CAKE_FLAG_OVERHEAD))
2840                 if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2841                         goto nla_put_failure;
2842
2843         if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode)))
2844                 goto nla_put_failure;
2845
2846         if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu)))
2847                 goto nla_put_failure;
2848
2849         if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2850                         !!(rate_flags & CAKE_FLAG_SPLIT_GSO)))
2851                 goto nla_put_failure;
2852
2853         if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask)))
2854                 goto nla_put_failure;
2855
2856         return nla_nest_end(skb, opts);
2857
2858 nla_put_failure:
2859         return -1;
2860 }
2861
2862 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2863 {
2864         struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2865         struct cake_sched_data *q = qdisc_priv(sch);
2866         struct nlattr *tstats, *ts;
2867         int i;
2868
2869         if (!stats)
2870                 return -1;
2871
2872 #define PUT_STAT_U32(attr, data) do {                                  \
2873                 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2874                         goto nla_put_failure;                          \
2875         } while (0)
2876 #define PUT_STAT_U64(attr, data) do {                                  \
2877                 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2878                                         data, TCA_CAKE_STATS_PAD)) \
2879                         goto nla_put_failure;                          \
2880         } while (0)
2881
2882         PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2883         PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2884         PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2885         PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2886         PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2887         PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2888         PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2889         PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2890
2891 #undef PUT_STAT_U32
2892 #undef PUT_STAT_U64
2893
2894         tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2895         if (!tstats)
2896                 goto nla_put_failure;
2897
2898 #define PUT_TSTAT_U32(attr, data) do {                                  \
2899                 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2900                         goto nla_put_failure;                           \
2901         } while (0)
2902 #define PUT_TSTAT_U64(attr, data) do {                                  \
2903                 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2904                                         data, TCA_CAKE_TIN_STATS_PAD))  \
2905                         goto nla_put_failure;                           \
2906         } while (0)
2907
2908         for (i = 0; i < q->tin_cnt; i++) {
2909                 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2910
2911                 ts = nla_nest_start_noflag(d->skb, i + 1);
2912                 if (!ts)
2913                         goto nla_put_failure;
2914
2915                 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2916                 PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2917                 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2918
2919                 PUT_TSTAT_U32(TARGET_US,
2920                               ktime_to_us(ns_to_ktime(b->cparams.target)));
2921                 PUT_TSTAT_U32(INTERVAL_US,
2922                               ktime_to_us(ns_to_ktime(b->cparams.interval)));
2923
2924                 PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2925                 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2926                 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2927                 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2928
2929                 PUT_TSTAT_U32(PEAK_DELAY_US,
2930                               ktime_to_us(ns_to_ktime(b->peak_delay)));
2931                 PUT_TSTAT_U32(AVG_DELAY_US,
2932                               ktime_to_us(ns_to_ktime(b->avge_delay)));
2933                 PUT_TSTAT_U32(BASE_DELAY_US,
2934                               ktime_to_us(ns_to_ktime(b->base_delay)));
2935
2936                 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2937                 PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2938                 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2939
2940                 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2941                                             b->decaying_flow_count);
2942                 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2943                 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2944                 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2945
2946                 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2947                 nla_nest_end(d->skb, ts);
2948         }
2949
2950 #undef PUT_TSTAT_U32
2951 #undef PUT_TSTAT_U64
2952
2953         nla_nest_end(d->skb, tstats);
2954         return nla_nest_end(d->skb, stats);
2955
2956 nla_put_failure:
2957         nla_nest_cancel(d->skb, stats);
2958         return -1;
2959 }
2960
2961 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2962 {
2963         return NULL;
2964 }
2965
2966 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2967 {
2968         return 0;
2969 }
2970
2971 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2972                                u32 classid)
2973 {
2974         return 0;
2975 }
2976
2977 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2978 {
2979 }
2980
2981 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2982                                         struct netlink_ext_ack *extack)
2983 {
2984         struct cake_sched_data *q = qdisc_priv(sch);
2985
2986         if (cl)
2987                 return NULL;
2988         return q->block;
2989 }
2990
2991 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2992                            struct sk_buff *skb, struct tcmsg *tcm)
2993 {
2994         tcm->tcm_handle |= TC_H_MIN(cl);
2995         return 0;
2996 }
2997
2998 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2999                                  struct gnet_dump *d)
3000 {
3001         struct cake_sched_data *q = qdisc_priv(sch);
3002         const struct cake_flow *flow = NULL;
3003         struct gnet_stats_queue qs = { 0 };
3004         struct nlattr *stats;
3005         u32 idx = cl - 1;
3006
3007         if (idx < CAKE_QUEUES * q->tin_cnt) {
3008                 const struct cake_tin_data *b = \
3009                         &q->tins[q->tin_order[idx / CAKE_QUEUES]];
3010                 const struct sk_buff *skb;
3011
3012                 flow = &b->flows[idx % CAKE_QUEUES];
3013
3014                 if (flow->head) {
3015                         sch_tree_lock(sch);
3016                         skb = flow->head;
3017                         while (skb) {
3018                                 qs.qlen++;
3019                                 skb = skb->next;
3020                         }
3021                         sch_tree_unlock(sch);
3022                 }
3023                 qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3024                 qs.drops = flow->dropped;
3025         }
3026         if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3027                 return -1;
3028         if (flow) {
3029                 ktime_t now = ktime_get();
3030
3031                 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3032                 if (!stats)
3033                         return -1;
3034
3035 #define PUT_STAT_U32(attr, data) do {                                  \
3036                 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3037                         goto nla_put_failure;                          \
3038         } while (0)
3039 #define PUT_STAT_S32(attr, data) do {                                  \
3040                 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3041                         goto nla_put_failure;                          \
3042         } while (0)
3043
3044                 PUT_STAT_S32(DEFICIT, flow->deficit);
3045                 PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3046                 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3047                 PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3048                 if (flow->cvars.p_drop) {
3049                         PUT_STAT_S32(BLUE_TIMER_US,
3050                                      ktime_to_us(
3051                                              ktime_sub(now,
3052                                                        flow->cvars.blue_timer)));
3053                 }
3054                 if (flow->cvars.dropping) {
3055                         PUT_STAT_S32(DROP_NEXT_US,
3056                                      ktime_to_us(
3057                                              ktime_sub(now,
3058                                                        flow->cvars.drop_next)));
3059                 }
3060
3061                 if (nla_nest_end(d->skb, stats) < 0)
3062                         return -1;
3063         }
3064
3065         return 0;
3066
3067 nla_put_failure:
3068         nla_nest_cancel(d->skb, stats);
3069         return -1;
3070 }
3071
3072 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3073 {
3074         struct cake_sched_data *q = qdisc_priv(sch);
3075         unsigned int i, j;
3076
3077         if (arg->stop)
3078                 return;
3079
3080         for (i = 0; i < q->tin_cnt; i++) {
3081                 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3082
3083                 for (j = 0; j < CAKE_QUEUES; j++) {
3084                         if (list_empty(&b->flows[j].flowchain)) {
3085                                 arg->count++;
3086                                 continue;
3087                         }
3088                         if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3089                                                  arg))
3090                                 break;
3091                 }
3092         }
3093 }
3094
3095 static const struct Qdisc_class_ops cake_class_ops = {
3096         .leaf           =       cake_leaf,
3097         .find           =       cake_find,
3098         .tcf_block      =       cake_tcf_block,
3099         .bind_tcf       =       cake_bind,
3100         .unbind_tcf     =       cake_unbind,
3101         .dump           =       cake_dump_class,
3102         .dump_stats     =       cake_dump_class_stats,
3103         .walk           =       cake_walk,
3104 };
3105
3106 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3107         .cl_ops         =       &cake_class_ops,
3108         .id             =       "cake",
3109         .priv_size      =       sizeof(struct cake_sched_data),
3110         .enqueue        =       cake_enqueue,
3111         .dequeue        =       cake_dequeue,
3112         .peek           =       qdisc_peek_dequeued,
3113         .init           =       cake_init,
3114         .reset          =       cake_reset,
3115         .destroy        =       cake_destroy,
3116         .change         =       cake_change,
3117         .dump           =       cake_dump,
3118         .dump_stats     =       cake_dump_stats,
3119         .owner          =       THIS_MODULE,
3120 };
3121 MODULE_ALIAS_NET_SCH("cake");
3122
3123 static int __init cake_module_init(void)
3124 {
3125         return register_qdisc(&cake_qdisc_ops);
3126 }
3127
3128 static void __exit cake_module_exit(void)
3129 {
3130         unregister_qdisc(&cake_qdisc_ops);
3131 }
3132
3133 module_init(cake_module_init)
3134 module_exit(cake_module_exit)
3135 MODULE_AUTHOR("Jonathan Morton");
3136 MODULE_LICENSE("Dual BSD/GPL");
3137 MODULE_DESCRIPTION("The CAKE shaper.");