net/sched/sch_fq.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
   4  *
   5  *  Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com>
   6  *
   7  *  Meant to be mostly used for locally generated traffic :
   8  *  Fast classification depends on skb->sk being set before reaching us.
   9  *  If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
  10  *  All packets belonging to a socket are considered as a 'flow'.
  11  *
  12  *  Flows are dynamically allocated and stored in a hash table of RB trees
  13  *  They are also part of one Round Robin 'queues' (new or old flows)
  14  *
  15  *  Burst avoidance (aka pacing) capability :
  16  *
  17  *  Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
  18  *  bunch of packets, and this packet scheduler adds delay between
  19  *  packets to respect rate limitation.
  20  *
  21  *  enqueue() :
  22  *   - lookup one RB tree (out of 1024 or more) to find the flow.
  23  *     If non existent flow, create it, add it to the tree.
  24  *     Add skb to the per flow list of skb (fifo).
  25  *   - Use a special fifo for high prio packets
  26  *
  27  *  dequeue() : serves flows in Round Robin
  28  *  Note : When a flow becomes empty, we do not immediately remove it from
   29  *  rb trees, for performance reasons (it's expected to send additional packets,
  30  *  or SLAB cache will reuse socket for another flow)
  31  */
  32
  33 #include <linux/module.h>
  34 #include <linux/types.h>
  35 #include <linux/kernel.h>
  36 #include <linux/jiffies.h>
  37 #include <linux/string.h>
  38 #include <linux/in.h>
  39 #include <linux/errno.h>
  40 #include <linux/init.h>
  41 #include <linux/skbuff.h>
  42 #include <linux/slab.h>
  43 #include <linux/rbtree.h>
  44 #include <linux/hash.h>
  45 #include <linux/prefetch.h>
  46 #include <linux/vmalloc.h>
  47 #include <net/netlink.h>
  48 #include <net/pkt_sched.h>
  49 #include <net/sock.h>
  50 #include <net/tcp_states.h>
  51 #include <net/tcp.h>
  52
  53 struct fq_skb_cb {
  54         u64     time_to_send;
  55         u8      band;
  56 };
  57
  58 static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
  59 {
  60         qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
  61         return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
  62 }
  63
  64 /*
  65  * Per flow structure, dynamically allocated.
  66  * If packets have monotically increasing time_to_send, they are placed in O(1)
  67  * in linear list (head,tail), otherwise are placed in a rbtree (t_root).
  68  */
  69 struct fq_flow {
  70 /* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
  71         struct rb_root  t_root;
  72         struct sk_buff  *head;          /* list of skbs for this flow : first skb */
  73         union {
  74                 struct sk_buff *tail;   /* last skb in the list */
  75                 unsigned long  age;     /* (jiffies | 1UL) when flow was emptied, for gc */
  76         };
  77         union {
  78                 struct rb_node  fq_node;        /* anchor in fq_root[] trees */
  79                 /* Following field is only used for q->internal,
  80                  * because q->internal is not hashed in fq_root[]
  81                  */
  82                 u64             stat_fastpath_packets;
  83         };
  84         struct sock     *sk;
  85         u32             socket_hash;    /* sk_hash */
  86         int             qlen;           /* number of packets in flow queue */
  87
  88 /* Second cache line */
  89         int             credit;
  90         int             band;
  91         struct fq_flow *next;           /* next pointer in RR lists */
  92
  93         struct rb_node  rate_node;      /* anchor in q->delayed tree */
  94         u64             time_next_packet;
  95 };
  96
  97 struct fq_flow_head {
  98         struct fq_flow *first;
  99         struct fq_flow *last;
 100 };
 101
 102 struct fq_perband_flows {
 103         struct fq_flow_head new_flows;
 104         struct fq_flow_head old_flows;
 105         int                 credit;
 106         int                 quantum; /* based on band nr : 576KB, 192KB, 64KB */
 107 };
 108
 109 #define FQ_PRIO2BAND_CRUMB_SIZE ((TC_PRIO_MAX + 1) >> 2)
 110
 111 struct fq_sched_data {
 112 /* Read mostly cache line */
 113
 114         u32             quantum;
 115         u32             initial_quantum;
 116         u32             flow_refill_delay;
 117         u32             flow_plimit;    /* max packets per flow */
 118         unsigned long   flow_max_rate;  /* optional max rate per flow */
 119         u64             ce_threshold;
 120         u64             horizon;        /* horizon in ns */
 121         u32             orphan_mask;    /* mask for orphaned skb */
 122         u32             low_rate_threshold;
 123         struct rb_root  *fq_root;
 124         u8              rate_enable;
 125         u8              fq_trees_log;
 126         u8              horizon_drop;
 127         u8              prio2band[FQ_PRIO2BAND_CRUMB_SIZE];
 128         u32             timer_slack; /* hrtimer slack in ns */
 129
 130 /* Read/Write fields. */
 131
 132         unsigned int band_nr; /* band being serviced in fq_dequeue() */
 133
 134         struct fq_perband_flows band_flows[FQ_BANDS];
 135
 136         struct fq_flow  internal;       /* fastpath queue. */
 137         struct rb_root  delayed;        /* for rate limited flows */
 138         u64             time_next_delayed_flow;
 139         unsigned long   unthrottle_latency_ns;
 140
 141         u32             band_pkt_count[FQ_BANDS];
 142         u32             flows;
 143         u32             inactive_flows; /* Flows with no packet to send. */
 144         u32             throttled_flows;
 145
 146         u64             stat_throttled;
 147         struct qdisc_watchdog watchdog;
 148         u64             stat_gc_flows;
 149
 150 /* Seldom used fields. */
 151
 152         u64             stat_band_drops[FQ_BANDS];
 153         u64             stat_ce_mark;
 154         u64             stat_horizon_drops;
 155         u64             stat_horizon_caps;
 156         u64             stat_flows_plimit;
 157         u64             stat_pkts_too_long;
 158         u64             stat_allocation_errors;
 159 };
 160
 161 /* return the i-th 2-bit value ("crumb") */
 162 static u8 fq_prio2band(const u8 *prio2band, unsigned int prio)
 163 {
 164         return (READ_ONCE(prio2band[prio / 4]) >> (2 * (prio & 0x3))) & 0x3;
 165 }
 166
 167 /*
 168  * f->tail and f->age share the same location.
 169  * We can use the low order bit to differentiate if this location points
 170  * to a sk_buff or contains a jiffies value, if we force this value to be odd.
 171  * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
 172  */
 173 static void fq_flow_set_detached(struct fq_flow *f)
 174 {
 175         f->age = jiffies | 1UL;
 176 }
 177
 178 static bool fq_flow_is_detached(const struct fq_flow *f)
 179 {
 180         return !!(f->age & 1UL);
 181 }
 182
 183 /* special value to mark a throttled flow (not on old/new list) */
 184 static struct fq_flow throttled;
 185
 186 static bool fq_flow_is_throttled(const struct fq_flow *f)
 187 {
 188         return f->next == &throttled;
 189 }
 190
 191 enum new_flow {
 192         NEW_FLOW,
 193         OLD_FLOW
 194 };
 195
 196 static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow,
 197                              enum new_flow list_sel)
 198 {
 199         struct fq_perband_flows *pband = &q->band_flows[flow->band];
 200         struct fq_flow_head *head = (list_sel == NEW_FLOW) ?
 201                                         &pband->new_flows :
 202                                         &pband->old_flows;
 203
 204         if (head->first)
 205                 head->last->next = flow;
 206         else
 207                 head->first = flow;
 208         head->last = flow;
 209         flow->next = NULL;
 210 }
 211
 212 static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
 213 {
 214         rb_erase(&f->rate_node, &q->delayed);
 215         q->throttled_flows--;
 216         fq_flow_add_tail(q, f, OLD_FLOW);
 217 }
 218
 219 static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
 220 {
 221         struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
 222
 223         while (*p) {
 224                 struct fq_flow *aux;
 225
 226                 parent = *p;
 227                 aux = rb_entry(parent, struct fq_flow, rate_node);
 228                 if (f->time_next_packet >= aux->time_next_packet)
 229                         p = &parent->rb_right;
 230                 else
 231                         p = &parent->rb_left;
 232         }
 233         rb_link_node(&f->rate_node, parent, p);
 234         rb_insert_color(&f->rate_node, &q->delayed);
 235         q->throttled_flows++;
 236         q->stat_throttled++;
 237
 238         f->next = &throttled;
 239         if (q->time_next_delayed_flow > f->time_next_packet)
 240                 q->time_next_delayed_flow = f->time_next_packet;
 241 }
 242
 243
 244 static struct kmem_cache *fq_flow_cachep __read_mostly;
 245
 246
 247 /* limit number of collected flows per round */
 248 #define FQ_GC_MAX 8
 249 #define FQ_GC_AGE (3*HZ)
 250
 251 static bool fq_gc_candidate(const struct fq_flow *f)
 252 {
 253         return fq_flow_is_detached(f) &&
 254                time_after(jiffies, f->age + FQ_GC_AGE);
 255 }
 256
 257 static void fq_gc(struct fq_sched_data *q,
 258                   struct rb_root *root,
 259                   struct sock *sk)
 260 {
 261         struct rb_node **p, *parent;
 262         void *tofree[FQ_GC_MAX];
 263         struct fq_flow *f;
 264         int i, fcnt = 0;
 265
 266         p = &root->rb_node;
 267         parent = NULL;
 268         while (*p) {
 269                 parent = *p;
 270
 271                 f = rb_entry(parent, struct fq_flow, fq_node);
 272                 if (f->sk == sk)
 273                         break;
 274
 275                 if (fq_gc_candidate(f)) {
 276                         tofree[fcnt++] = f;
 277                         if (fcnt == FQ_GC_MAX)
 278                                 break;
 279                 }
 280
 281                 if (f->sk > sk)
 282                         p = &parent->rb_right;
 283                 else
 284                         p = &parent->rb_left;
 285         }
 286
 287         if (!fcnt)
 288                 return;
 289
 290         for (i = fcnt; i > 0; ) {
 291                 f = tofree[--i];
 292                 rb_erase(&f->fq_node, root);
 293         }
 294         q->flows -= fcnt;
 295         q->inactive_flows -= fcnt;
 296         q->stat_gc_flows += fcnt;
 297
 298         kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
 299 }
 300
 301 /* Fast path can be used if :
 302  * 1) Packet tstamp is in the past.
 303  * 2) FQ qlen == 0   OR
 304  *   (no flow is currently eligible for transmit,
 305  *    AND fast path queue has less than 8 packets)
 306  * 3) No SO_MAX_PACING_RATE on the socket (if any).
 307  * 4) No @maxrate attribute on this qdisc,
 308  *
 309  * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
 310  */
 311 static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb,
 312                               u64 now)
 313 {
 314         const struct fq_sched_data *q = qdisc_priv(sch);
 315         const struct sock *sk;
 316
 317         if (fq_skb_cb(skb)->time_to_send > now)
 318                 return false;
 319
 320         if (sch->q.qlen != 0) {
 321                 /* Even if some packets are stored in this qdisc,
 322                  * we can still enable fast path if all of them are
 323                  * scheduled in the future (ie no flows are eligible)
 324                  * or in the fast path queue.
 325                  */
 326                 if (q->flows != q->inactive_flows + q->throttled_flows)
 327                         return false;
 328
 329                 /* Do not allow fast path queue to explode, we want Fair Queue mode
 330                  * under pressure.
 331                  */
 332                 if (q->internal.qlen >= 8)
 333                         return false;
 334         }
 335
 336         sk = skb->sk;
 337         if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
 338             sk->sk_max_pacing_rate != ~0UL)
 339                 return false;
 340
 341         if (q->flow_max_rate != ~0UL)
 342                 return false;
 343
 344         return true;
 345 }
 346
 347 static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
 348                                    u64 now)
 349 {
 350         struct fq_sched_data *q = qdisc_priv(sch);
 351         struct rb_node **p, *parent;
 352         struct sock *sk = skb->sk;
 353         struct rb_root *root;
 354         struct fq_flow *f;
 355
 356         /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
 357          * or a listener (SYNCOOKIE mode)
 358          * 1) request sockets are not full blown,
 359          *    they do not contain sk_pacing_rate
 360          * 2) They are not part of a 'flow' yet
 361          * 3) We do not want to rate limit them (eg SYNFLOOD attack),
 362          *    especially if the listener set SO_MAX_PACING_RATE
 363          * 4) We pretend they are orphaned
 364          */
 365         if (!sk || sk_listener(sk)) {
 366                 unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
 367
 368                 /* By forcing low order bit to 1, we make sure to not
 369                  * collide with a local flow (socket pointers are word aligned)
 370                  */
 371                 sk = (struct sock *)((hash << 1) | 1UL);
 372                 skb_orphan(skb);
 373         } else if (sk->sk_state == TCP_CLOSE) {
 374                 unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
 375                 /*
 376                  * Sockets in TCP_CLOSE are non connected.
 377                  * Typical use case is UDP sockets, they can send packets
 378                  * with sendto() to many different destinations.
 379                  * We probably could use a generic bit advertising
 380                  * non connected sockets, instead of sk_state == TCP_CLOSE,
 381                  * if we care enough.
 382                  */
 383                 sk = (struct sock *)((hash << 1) | 1UL);
 384         }
 385
 386         if (fq_fastpath_check(sch, skb, now)) {
 387                 q->internal.stat_fastpath_packets++;
 388                 if (skb->sk == sk && q->rate_enable &&
 389                     READ_ONCE(sk->sk_pacing_status) != SK_PACING_FQ)
 390                         smp_store_release(&sk->sk_pacing_status,
 391                                           SK_PACING_FQ);
 392                 return &q->internal;
 393         }
 394
 395         root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
 396
 397         fq_gc(q, root, sk);
 398
 399         p = &root->rb_node;
 400         parent = NULL;
 401         while (*p) {
 402                 parent = *p;
 403
 404                 f = rb_entry(parent, struct fq_flow, fq_node);
 405                 if (f->sk == sk) {
 406                         /* socket might have been reallocated, so check
 407                          * if its sk_hash is the same.
 408                          * It not, we need to refill credit with
 409                          * initial quantum
 410                          */
 411                         if (unlikely(skb->sk == sk &&
 412                                      f->socket_hash != sk->sk_hash)) {
 413                                 f->credit = q->initial_quantum;
 414                                 f->socket_hash = sk->sk_hash;
 415                                 if (q->rate_enable)
 416                                         smp_store_release(&sk->sk_pacing_status,
 417                                                           SK_PACING_FQ);
 418                                 if (fq_flow_is_throttled(f))
 419                                         fq_flow_unset_throttled(q, f);
 420                                 f->time_next_packet = 0ULL;
 421                         }
 422                         return f;
 423                 }
 424                 if (f->sk > sk)
 425                         p = &parent->rb_right;
 426                 else
 427                         p = &parent->rb_left;
 428         }
 429
 430         f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
 431         if (unlikely(!f)) {
 432                 q->stat_allocation_errors++;
 433                 return &q->internal;
 434         }
 435         /* f->t_root is already zeroed after kmem_cache_zalloc() */
 436
 437         fq_flow_set_detached(f);
 438         f->sk = sk;
 439         if (skb->sk == sk) {
 440                 f->socket_hash = sk->sk_hash;
 441                 if (q->rate_enable)
 442                         smp_store_release(&sk->sk_pacing_status,
 443                                           SK_PACING_FQ);
 444         }
 445         f->credit = q->initial_quantum;
 446
 447         rb_link_node(&f->fq_node, parent, p);
 448         rb_insert_color(&f->fq_node, root);
 449
 450         q->flows++;
 451         q->inactive_flows++;
 452         return f;
 453 }
 454
 455 static struct sk_buff *fq_peek(struct fq_flow *flow)
 456 {
 457         struct sk_buff *skb = skb_rb_first(&flow->t_root);
 458         struct sk_buff *head = flow->head;
 459
 460         if (!skb)
 461                 return head;
 462
 463         if (!head)
 464                 return skb;
 465
 466         if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
 467                 return skb;
 468         return head;
 469 }
 470
 471 static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
 472                           struct sk_buff *skb)
 473 {
 474         if (skb == flow->head) {
 475                 flow->head = skb->next;
 476         } else {
 477                 rb_erase(&skb->rbnode, &flow->t_root);
 478                 skb->dev = qdisc_dev(sch);
 479         }
 480 }
 481
 482 /* Remove one skb from flow queue.
 483  * This skb must be the return value of prior fq_peek().
 484  */
 485 static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
 486                            struct sk_buff *skb)
 487 {
 488         fq_erase_head(sch, flow, skb);
 489         skb_mark_not_on_list(skb);
 490         qdisc_qstats_backlog_dec(sch, skb);
 491         sch->q.qlen--;
 492 }
 493
 494 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
 495 {
 496         struct rb_node **p, *parent;
 497         struct sk_buff *head, *aux;
 498
 499         head = flow->head;
 500         if (!head ||
 501             fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
 502                 if (!head)
 503                         flow->head = skb;
 504                 else
 505                         flow->tail->next = skb;
 506                 flow->tail = skb;
 507                 skb->next = NULL;
 508                 return;
 509         }
 510
 511         p = &flow->t_root.rb_node;
 512         parent = NULL;
 513
 514         while (*p) {
 515                 parent = *p;
 516                 aux = rb_to_skb(parent);
 517                 if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
 518                         p = &parent->rb_right;
 519                 else
 520                         p = &parent->rb_left;
 521         }
 522         rb_link_node(&skb->rbnode, parent, p);
 523         rb_insert_color(&skb->rbnode, &flow->t_root);
 524 }
 525
 526 static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
 527                                      const struct fq_sched_data *q, u64 now)
 528 {
 529         return unlikely((s64)skb->tstamp > (s64)(now + q->horizon));
 530 }
 531
 532 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 533                       struct sk_buff **to_free)
 534 {
 535         struct fq_sched_data *q = qdisc_priv(sch);
 536         struct fq_flow *f;
 537         u64 now;
 538         u8 band;
 539
 540         band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX);
 541         if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
 542                 q->stat_band_drops[band]++;
 543                 return qdisc_drop(skb, sch, to_free);
 544         }
 545
 546         now = ktime_get_ns();
 547         if (!skb->tstamp) {
 548                 fq_skb_cb(skb)->time_to_send = now;
 549         } else {
 550                 /* Check if packet timestamp is too far in the future. */
 551                 if (fq_packet_beyond_horizon(skb, q, now)) {
 552                         if (q->horizon_drop) {
 553                                         q->stat_horizon_drops++;
 554                                         return qdisc_drop(skb, sch, to_free);
 555                         }
 556                         q->stat_horizon_caps++;
 557                         skb->tstamp = now + q->horizon;
 558                 }
 559                 fq_skb_cb(skb)->time_to_send = skb->tstamp;
 560         }
 561
 562         f = fq_classify(sch, skb, now);
 563
 564         if (f != &q->internal) {
 565                 if (unlikely(f->qlen >= q->flow_plimit)) {
 566                         q->stat_flows_plimit++;
 567                         return qdisc_drop(skb, sch, to_free);
 568                 }
 569
 570                 if (fq_flow_is_detached(f)) {
 571                         fq_flow_add_tail(q, f, NEW_FLOW);
 572                         if (time_after(jiffies, f->age + q->flow_refill_delay))
 573                                 f->credit = max_t(u32, f->credit, q->quantum);
 574                 }
 575
 576                 f->band = band;
 577                 q->band_pkt_count[band]++;
 578                 fq_skb_cb(skb)->band = band;
 579                 if (f->qlen == 0)
 580                         q->inactive_flows--;
 581         }
 582
 583         f->qlen++;
 584         /* Note: this overwrites f->age */
 585         flow_queue_add(f, skb);
 586
 587         qdisc_qstats_backlog_inc(sch, skb);
 588         sch->q.qlen++;
 589
 590         return NET_XMIT_SUCCESS;
 591 }
 592
 593 static void fq_check_throttled(struct fq_sched_data *q, u64 now)
 594 {
 595         unsigned long sample;
 596         struct rb_node *p;
 597
 598         if (q->time_next_delayed_flow > now)
 599                 return;
 600
 601         /* Update unthrottle latency EWMA.
 602          * This is cheap and can help diagnosing timer/latency problems.
 603          */
 604         sample = (unsigned long)(now - q->time_next_delayed_flow);
 605         q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
 606         q->unthrottle_latency_ns += sample >> 3;
 607
 608         q->time_next_delayed_flow = ~0ULL;
 609         while ((p = rb_first(&q->delayed)) != NULL) {
 610                 struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
 611
 612                 if (f->time_next_packet > now) {
 613                         q->time_next_delayed_flow = f->time_next_packet;
 614                         break;
 615                 }
 616                 fq_flow_unset_throttled(q, f);
 617         }
 618 }
 619
 620 static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband)
 621 {
 622         if (pband->credit <= 0)
 623                 return NULL;
 624
 625         if (pband->new_flows.first)
 626                 return &pband->new_flows;
 627
 628         return pband->old_flows.first ? &pband->old_flows : NULL;
 629 }
 630
 631 static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 632 {
 633         struct fq_sched_data *q = qdisc_priv(sch);
 634         struct fq_perband_flows *pband;
 635         struct fq_flow_head *head;
 636         struct sk_buff *skb;
 637         struct fq_flow *f;
 638         unsigned long rate;
 639         int retry;
 640         u32 plen;
 641         u64 now;
 642
 643         if (!sch->q.qlen)
 644                 return NULL;
 645
 646         skb = fq_peek(&q->internal);
 647         if (unlikely(skb)) {
 648                 q->internal.qlen--;
 649                 fq_dequeue_skb(sch, &q->internal, skb);
 650                 goto out;
 651         }
 652
 653         now = ktime_get_ns();
 654         fq_check_throttled(q, now);
 655         retry = 0;
 656         pband = &q->band_flows[q->band_nr];
 657 begin:
 658         head = fq_pband_head_select(pband);
 659         if (!head) {
 660                 while (++retry <= FQ_BANDS) {
 661                         if (++q->band_nr == FQ_BANDS)
 662                                 q->band_nr = 0;
 663                         pband = &q->band_flows[q->band_nr];
 664                         pband->credit = min(pband->credit + pband->quantum,
 665                                             pband->quantum);
 666                         if (pband->credit > 0)
 667                                 goto begin;
 668                         retry = 0;
 669                 }
 670                 if (q->time_next_delayed_flow != ~0ULL)
 671                         qdisc_watchdog_schedule_range_ns(&q->watchdog,
 672                                                         q->time_next_delayed_flow,
 673                                                         q->timer_slack);
 674                 return NULL;
 675         }
 676         f = head->first;
 677         retry = 0;
 678         if (f->credit <= 0) {
 679                 f->credit += q->quantum;
 680                 head->first = f->next;
 681                 fq_flow_add_tail(q, f, OLD_FLOW);
 682                 goto begin;
 683         }
 684
 685         skb = fq_peek(f);
 686         if (skb) {
 687                 u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
 688                                              f->time_next_packet);
 689
 690                 if (now < time_next_packet) {
 691                         head->first = f->next;
 692                         f->time_next_packet = time_next_packet;
 693                         fq_flow_set_throttled(q, f);
 694                         goto begin;
 695                 }
 696                 prefetch(&skb->end);
 697                 if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
 698                         INET_ECN_set_ce(skb);
 699                         q->stat_ce_mark++;
 700                 }
 701                 if (--f->qlen == 0)
 702                         q->inactive_flows++;
 703                 q->band_pkt_count[fq_skb_cb(skb)->band]--;
 704                 fq_dequeue_skb(sch, f, skb);
 705         } else {
 706                 head->first = f->next;
 707                 /* force a pass through old_flows to prevent starvation */
 708                 if (head == &pband->new_flows) {
 709                         fq_flow_add_tail(q, f, OLD_FLOW);
 710                 } else {
 711                         fq_flow_set_detached(f);
 712                 }
 713                 goto begin;
 714         }
 715         plen = qdisc_pkt_len(skb);
 716         f->credit -= plen;
 717         pband->credit -= plen;
 718
 719         if (!q->rate_enable)
 720                 goto out;
 721
 722         rate = q->flow_max_rate;
 723
 724         /* If EDT time was provided for this skb, we need to
 725          * update f->time_next_packet only if this qdisc enforces
 726          * a flow max rate.
 727          */
 728         if (!skb->tstamp) {
 729                 if (skb->sk)
 730                         rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate);
 731
 732                 if (rate <= q->low_rate_threshold) {
 733                         f->credit = 0;
 734                 } else {
 735                         plen = max(plen, q->quantum);
 736                         if (f->credit > 0)
 737                                 goto out;
 738                 }
 739         }
 740         if (rate != ~0UL) {
 741                 u64 len = (u64)plen * NSEC_PER_SEC;
 742
 743                 if (likely(rate))
 744                         len = div64_ul(len, rate);
 745                 /* Since socket rate can change later,
 746                  * clamp the delay to 1 second.
 747                  * Really, providers of too big packets should be fixed !
 748                  */
 749                 if (unlikely(len > NSEC_PER_SEC)) {
 750                         len = NSEC_PER_SEC;
 751                         q->stat_pkts_too_long++;
 752                 }
 753                 /* Account for schedule/timers drifts.
 754                  * f->time_next_packet was set when prior packet was sent,
 755                  * and current time (@now) can be too late by tens of us.
 756                  */
 757                 if (f->time_next_packet)
 758                         len -= min(len/2, now - f->time_next_packet);
 759                 f->time_next_packet = now + len;
 760         }
 761 out:
 762         qdisc_bstats_update(sch, skb);
 763         return skb;
 764 }
 765
 766 static void fq_flow_purge(struct fq_flow *flow)
 767 {
 768         struct rb_node *p = rb_first(&flow->t_root);
 769
 770         while (p) {
 771                 struct sk_buff *skb = rb_to_skb(p);
 772
 773                 p = rb_next(p);
 774                 rb_erase(&skb->rbnode, &flow->t_root);
 775                 rtnl_kfree_skbs(skb, skb);
 776         }
 777         rtnl_kfree_skbs(flow->head, flow->tail);
 778         flow->head = NULL;
 779         flow->qlen = 0;
 780 }
 781
 782 static void fq_reset(struct Qdisc *sch)
 783 {
 784         struct fq_sched_data *q = qdisc_priv(sch);
 785         struct rb_root *root;
 786         struct rb_node *p;
 787         struct fq_flow *f;
 788         unsigned int idx;
 789
 790         sch->q.qlen = 0;
 791         sch->qstats.backlog = 0;
 792
 793         fq_flow_purge(&q->internal);
 794
 795         if (!q->fq_root)
 796                 return;
 797
 798         for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
 799                 root = &q->fq_root[idx];
 800                 while ((p = rb_first(root)) != NULL) {
 801                         f = rb_entry(p, struct fq_flow, fq_node);
 802                         rb_erase(p, root);
 803
 804                         fq_flow_purge(f);
 805
 806                         kmem_cache_free(fq_flow_cachep, f);
 807                 }
 808         }
 809         for (idx = 0; idx < FQ_BANDS; idx++) {
 810                 q->band_flows[idx].new_flows.first = NULL;
 811                 q->band_flows[idx].old_flows.first = NULL;
 812         }
 813         q->delayed              = RB_ROOT;
 814         q->flows                = 0;
 815         q->inactive_flows       = 0;
 816         q->throttled_flows      = 0;
 817 }
 818
 819 static void fq_rehash(struct fq_sched_data *q,
 820                       struct rb_root *old_array, u32 old_log,
 821                       struct rb_root *new_array, u32 new_log)
 822 {
 823         struct rb_node *op, **np, *parent;
 824         struct rb_root *oroot, *nroot;
 825         struct fq_flow *of, *nf;
 826         int fcnt = 0;
 827         u32 idx;
 828
 829         for (idx = 0; idx < (1U << old_log); idx++) {
 830                 oroot = &old_array[idx];
 831                 while ((op = rb_first(oroot)) != NULL) {
 832                         rb_erase(op, oroot);
 833                         of = rb_entry(op, struct fq_flow, fq_node);
 834                         if (fq_gc_candidate(of)) {
 835                                 fcnt++;
 836                                 kmem_cache_free(fq_flow_cachep, of);
 837                                 continue;
 838                         }
 839                         nroot = &new_array[hash_ptr(of->sk, new_log)];
 840
 841                         np = &nroot->rb_node;
 842                         parent = NULL;
 843                         while (*np) {
 844                                 parent = *np;
 845
 846                                 nf = rb_entry(parent, struct fq_flow, fq_node);
 847                                 BUG_ON(nf->sk == of->sk);
 848
 849                                 if (nf->sk > of->sk)
 850                                         np = &parent->rb_right;
 851                                 else
 852                                         np = &parent->rb_left;
 853                         }
 854
 855                         rb_link_node(&of->fq_node, parent, np);
 856                         rb_insert_color(&of->fq_node, nroot);
 857                 }
 858         }
 859         q->flows -= fcnt;
 860         q->inactive_flows -= fcnt;
 861         q->stat_gc_flows += fcnt;
 862 }
 863
/*
 * Release a bucket array through kvfree().  Callers in this file pass
 * q->fq_root or the previous array saved by fq_resize().
 */
static void fq_free(void *addr)
{
        kvfree(addr);
}
 868
 869 static int fq_resize(struct Qdisc *sch, u32 log)
 870 {
 871         struct fq_sched_data *q = qdisc_priv(sch);
 872         struct rb_root *array;
 873         void *old_fq_root;
 874         u32 idx;
 875
 876         if (q->fq_root && log == q->fq_trees_log)
 877                 return 0;
 878
 879         /* If XPS was setup, we can allocate memory on right NUMA node */
 880         array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
 881                               netdev_queue_numa_node_read(sch->dev_queue));
 882         if (!array)
 883                 return -ENOMEM;
 884
 885         for (idx = 0; idx < (1U << log); idx++)
 886                 array[idx] = RB_ROOT;
 887
 888         sch_tree_lock(sch);
 889
 890         old_fq_root = q->fq_root;
 891         if (old_fq_root)
 892                 fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
 893
 894         q->fq_root = array;
 895         WRITE_ONCE(q->fq_trees_log, log);
 896
 897         sch_tree_unlock(sch);
 898
 899         fq_free(old_fq_root);
 900
 901         return 0;
 902 }
 903
 904 static const struct netlink_range_validation iq_range = {
 905         .max = INT_MAX,
 906 };
 907
/*
 * Netlink attribute policy handed to the parser in fq_change().
 * Most attributes are plain u32 values; the exceptions are commented below.
 */
static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
        /*
         * NOTE(review): per the nla_policy convention, strict_start_type marks
         * the first attribute that gets strict validation - confirm against
         * include/net/netlink.h.
         */
        [TCA_FQ_UNSPEC]                 = { .strict_start_type = TCA_FQ_TIMER_SLACK },

        [TCA_FQ_PLIMIT]                 = { .type = NLA_U32 },
        [TCA_FQ_FLOW_PLIMIT]            = { .type = NLA_U32 },
        [TCA_FQ_QUANTUM]                = { .type = NLA_U32 },
        /* u32 additionally checked against iq_range (max INT_MAX) */
        [TCA_FQ_INITIAL_QUANTUM]        = NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range),
        [TCA_FQ_RATE_ENABLE]            = { .type = NLA_U32 },
        [TCA_FQ_FLOW_DEFAULT_RATE]      = { .type = NLA_U32 },
        [TCA_FQ_FLOW_MAX_RATE]          = { .type = NLA_U32 },
        [TCA_FQ_BUCKETS_LOG]            = { .type = NLA_U32 },
        [TCA_FQ_FLOW_REFILL_DELAY]      = { .type = NLA_U32 },
        [TCA_FQ_ORPHAN_MASK]            = { .type = NLA_U32 },
        [TCA_FQ_LOW_RATE_THRESHOLD]     = { .type = NLA_U32 },
        [TCA_FQ_CE_THRESHOLD]           = { .type = NLA_U32 },
        [TCA_FQ_TIMER_SLACK]            = { .type = NLA_U32 },
        [TCA_FQ_HORIZON]                = { .type = NLA_U32 },
        [TCA_FQ_HORIZON_DROP]           = { .type = NLA_U8 },
        /* payload must be exactly one struct tc_prio_qopt */
        [TCA_FQ_PRIOMAP]                = NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)),
        /* payload must be exactly FQ_BANDS s32 weights */
        [TCA_FQ_WEIGHTS]                = NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)),
};
 929
 930 /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
 931 static void fq_prio2band_compress_crumb(const u8 *in, u8 *out)
 932 {
 933         const int num_elems = TC_PRIO_MAX + 1;
 934         u8 tmp[FQ_PRIO2BAND_CRUMB_SIZE];
 935         int i;
 936
 937         memset(tmp, 0, sizeof(tmp));
 938         for (i = 0; i < num_elems; i++)
 939                 tmp[i / 4] |= in[i] << (2 * (i & 0x3));
 940
 941         for (i = 0; i < FQ_PRIO2BAND_CRUMB_SIZE; i++)
 942                 WRITE_ONCE(out[i], tmp[i]);
 943 }
 944
 945 static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
 946 {
 947         const int num_elems = TC_PRIO_MAX + 1;
 948         int i;
 949
 950         for (i = 0; i < num_elems; i++)
 951                 out[i] = fq_prio2band(in, i);
 952 }
 953
 954 static int fq_load_weights(struct fq_sched_data *q,
 955                            const struct nlattr *attr,
 956                            struct netlink_ext_ack *extack)
 957 {
 958         s32 *weights = nla_data(attr);
 959         int i;
 960
 961         for (i = 0; i < FQ_BANDS; i++) {
 962                 if (weights[i] < FQ_MIN_WEIGHT) {
 963                         NL_SET_ERR_MSG_FMT_MOD(extack, "Weight %d less that minimum allowed %d",
 964                                                weights[i], FQ_MIN_WEIGHT);
 965                         return -EINVAL;
 966                 }
 967         }
 968         for (i = 0; i < FQ_BANDS; i++)
 969                 WRITE_ONCE(q->band_flows[i].quantum, weights[i]);
 970         return 0;
 971 }
 972
 973 static int fq_load_priomap(struct fq_sched_data *q,
 974                            const struct nlattr *attr,
 975                            struct netlink_ext_ack *extack)
 976 {
 977         const struct tc_prio_qopt *map = nla_data(attr);
 978         int i;
 979
 980         if (map->bands != FQ_BANDS) {
 981                 NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
 982                 return -EINVAL;
 983         }
 984         for (i = 0; i < TC_PRIO_MAX + 1; i++) {
 985                 if (map->priomap[i] >= FQ_BANDS) {
 986                         NL_SET_ERR_MSG_FMT_MOD(extack, "FQ priomap field %d maps to a too high band %d",
 987                                                i, map->priomap[i]);
 988                         return -EINVAL;
 989                 }
 990         }
 991         fq_prio2band_compress_crumb(map->priomap, q->prio2band);
 992         return 0;
 993 }
 994
 995 static int fq_change(struct Qdisc *sch, struct nlattr *opt,
 996                      struct netlink_ext_ack *extack)
 997 {
 998         struct fq_sched_data *q = qdisc_priv(sch);
 999         struct nlattr *tb[TCA_FQ_MAX + 1];
1000         int err, drop_count = 0;
1001         unsigned drop_len = 0;
1002         u32 fq_log;
1003
1004         err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
1005                                           NULL);
1006         if (err < 0)
1007                 return err;
1008
1009         sch_tree_lock(sch);
1010
1011         fq_log = q->fq_trees_log;
1012
1013         if (tb[TCA_FQ_BUCKETS_LOG]) {
1014                 u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
1015
1016                 if (nval >= 1 && nval <= ilog2(256*1024))
1017                         fq_log = nval;
1018                 else
1019                         err = -EINVAL;
1020         }
1021         if (tb[TCA_FQ_PLIMIT])
1022                 WRITE_ONCE(sch->limit,
1023                            nla_get_u32(tb[TCA_FQ_PLIMIT]));
1024
1025         if (tb[TCA_FQ_FLOW_PLIMIT])
1026                 WRITE_ONCE(q->flow_plimit,
1027                            nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]));
1028
1029         if (tb[TCA_FQ_QUANTUM]) {
1030                 u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
1031
1032                 if (quantum > 0 && quantum <= (1 << 20)) {
1033                         WRITE_ONCE(q->quantum, quantum);
1034                 } else {
1035                         NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
1036                         err = -EINVAL;
1037                 }
1038         }
1039
1040         if (tb[TCA_FQ_INITIAL_QUANTUM])
1041                 WRITE_ONCE(q->initial_quantum,
1042                            nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]));
1043
1044         if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
1045                 pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
1046                                     nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
1047
1048         if (tb[TCA_FQ_FLOW_MAX_RATE]) {
1049                 u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
1050
1051                 WRITE_ONCE(q->flow_max_rate,
1052                            (rate == ~0U) ? ~0UL : rate);
1053         }
1054         if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
1055                 WRITE_ONCE(q->low_rate_threshold,
1056                            nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]));
1057
1058         if (tb[TCA_FQ_RATE_ENABLE]) {
1059                 u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
1060
1061                 if (enable <= 1)
1062                         WRITE_ONCE(q->rate_enable,
1063                                    enable);
1064                 else
1065                         err = -EINVAL;
1066         }
1067
1068         if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
1069                 u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
1070
1071                 WRITE_ONCE(q->flow_refill_delay,
1072                            usecs_to_jiffies(usecs_delay));
1073         }
1074
1075         if (!err && tb[TCA_FQ_PRIOMAP])
1076                 err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);
1077
1078         if (!err && tb[TCA_FQ_WEIGHTS])
1079                 err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack);
1080
1081         if (tb[TCA_FQ_ORPHAN_MASK])
1082                 WRITE_ONCE(q->orphan_mask,
1083                            nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]));
1084
1085         if (tb[TCA_FQ_CE_THRESHOLD])
1086                 WRITE_ONCE(q->ce_threshold,
1087                            (u64)NSEC_PER_USEC *
1088                            nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]));
1089
1090         if (tb[TCA_FQ_TIMER_SLACK])
1091                 WRITE_ONCE(q->timer_slack,
1092                            nla_get_u32(tb[TCA_FQ_TIMER_SLACK]));
1093
1094         if (tb[TCA_FQ_HORIZON])
1095                 WRITE_ONCE(q->horizon,
1096                            (u64)NSEC_PER_USEC *
1097                            nla_get_u32(tb[TCA_FQ_HORIZON]));
1098
1099         if (tb[TCA_FQ_HORIZON_DROP])
1100                 WRITE_ONCE(q->horizon_drop,
1101                            nla_get_u8(tb[TCA_FQ_HORIZON_DROP]));
1102
1103         if (!err) {
1104
1105                 sch_tree_unlock(sch);
1106                 err = fq_resize(sch, fq_log);
1107                 sch_tree_lock(sch);
1108         }
1109         while (sch->q.qlen > sch->limit) {
1110                 struct sk_buff *skb = fq_dequeue(sch);
1111
1112                 if (!skb)
1113                         break;
1114                 drop_len += qdisc_pkt_len(skb);
1115                 rtnl_kfree_skbs(skb, skb);
1116                 drop_count++;
1117         }
1118         qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
1119
1120         sch_tree_unlock(sch);
1121         return err;
1122 }
1123
/*
 * fq_destroy - release everything owned by the qdisc
 *
 * Frees all flows via fq_reset(), then the bucket array, and finally
 * cancels the watchdog.
 */
static void fq_destroy(struct Qdisc *sch)
{
        struct fq_sched_data *q = qdisc_priv(sch);

        fq_reset(sch);
        fq_free(q->fq_root);
        qdisc_watchdog_cancel(&q->watchdog);
}
1132
1133 static int fq_init(struct Qdisc *sch, struct nlattr *opt,
1134                    struct netlink_ext_ack *extack)
1135 {
1136         struct fq_sched_data *q = qdisc_priv(sch);
1137         int i, err;
1138
1139         sch->limit              = 10000;
1140         q->flow_plimit          = 100;
1141         q->quantum              = 2 * psched_mtu(qdisc_dev(sch));
1142         q->initial_quantum      = 10 * psched_mtu(qdisc_dev(sch));
1143         q->flow_refill_delay    = msecs_to_jiffies(40);
1144         q->flow_max_rate        = ~0UL;
1145         q->time_next_delayed_flow = ~0ULL;
1146         q->rate_enable          = 1;
1147         for (i = 0; i < FQ_BANDS; i++) {
1148                 q->band_flows[i].new_flows.first = NULL;
1149                 q->band_flows[i].old_flows.first = NULL;
1150         }
1151         q->band_flows[0].quantum = 9 << 16;
1152         q->band_flows[1].quantum = 3 << 16;
1153         q->band_flows[2].quantum = 1 << 16;
1154         q->delayed              = RB_ROOT;
1155         q->fq_root              = NULL;
1156         q->fq_trees_log         = ilog2(1024);
1157         q->orphan_mask          = 1024 - 1;
1158         q->low_rate_threshold   = 550000 / 8;
1159
1160         q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
1161
1162         q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
1163         q->horizon_drop = 1; /* by default, drop packets beyond horizon */
1164
1165         /* Default ce_threshold of 4294 seconds */
1166         q->ce_threshold         = (u64)NSEC_PER_USEC * ~0U;
1167
1168         fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band);
1169         qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
1170
1171         if (opt)
1172                 err = fq_change(sch, opt, extack);
1173         else
1174                 err = fq_resize(sch, q->fq_trees_log);
1175
1176         return err;
1177 }
1178
1179 static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
1180 {
1181         struct fq_sched_data *q = qdisc_priv(sch);
1182         struct tc_prio_qopt prio = {
1183                 .bands = FQ_BANDS,
1184         };
1185         struct nlattr *opts;
1186         u64 ce_threshold;
1187         s32 weights[3];
1188         u64 horizon;
1189
1190         opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
1191         if (opts == NULL)
1192                 goto nla_put_failure;
1193
1194         /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
1195
1196         ce_threshold = READ_ONCE(q->ce_threshold);
1197         do_div(ce_threshold, NSEC_PER_USEC);
1198
1199         horizon = READ_ONCE(q->horizon);
1200         do_div(horizon, NSEC_PER_USEC);
1201
1202         if (nla_put_u32(skb, TCA_FQ_PLIMIT,
1203                         READ_ONCE(sch->limit)) ||
1204             nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT,
1205                         READ_ONCE(q->flow_plimit)) ||
1206             nla_put_u32(skb, TCA_FQ_QUANTUM,
1207                         READ_ONCE(q->quantum)) ||
1208             nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM,
1209                         READ_ONCE(q->initial_quantum)) ||
1210             nla_put_u32(skb, TCA_FQ_RATE_ENABLE,
1211                         READ_ONCE(q->rate_enable)) ||
1212             nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
1213                         min_t(unsigned long,
1214                               READ_ONCE(q->flow_max_rate), ~0U)) ||
1215             nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
1216                         jiffies_to_usecs(READ_ONCE(q->flow_refill_delay))) ||
1217             nla_put_u32(skb, TCA_FQ_ORPHAN_MASK,
1218                         READ_ONCE(q->orphan_mask)) ||
1219             nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
1220                         READ_ONCE(q->low_rate_threshold)) ||
1221             nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
1222             nla_put_u32(skb, TCA_FQ_BUCKETS_LOG,
1223                         READ_ONCE(q->fq_trees_log)) ||
1224             nla_put_u32(skb, TCA_FQ_TIMER_SLACK,
1225                         READ_ONCE(q->timer_slack)) ||
1226             nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
1227             nla_put_u8(skb, TCA_FQ_HORIZON_DROP,
1228                        READ_ONCE(q->horizon_drop)))
1229                 goto nla_put_failure;
1230
1231         fq_prio2band_decompress_crumb(q->prio2band, prio.priomap);
1232         if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
1233                 goto nla_put_failure;
1234
1235         weights[0] = READ_ONCE(q->band_flows[0].quantum);
1236         weights[1] = READ_ONCE(q->band_flows[1].quantum);
1237         weights[2] = READ_ONCE(q->band_flows[2].quantum);
1238         if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights))
1239                 goto nla_put_failure;
1240
1241         return nla_nest_end(skb, opts);
1242
1243 nla_put_failure:
1244         return -1;
1245 }
1246
1247 static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1248 {
1249         struct fq_sched_data *q = qdisc_priv(sch);
1250         struct tc_fq_qd_stats st;
1251         int i;
1252
1253         st.pad = 0;
1254
1255         sch_tree_lock(sch);
1256
1257         st.gc_flows               = q->stat_gc_flows;
1258         st.highprio_packets       = 0;
1259         st.fastpath_packets       = q->internal.stat_fastpath_packets;
1260         st.tcp_retrans            = 0;
1261         st.throttled              = q->stat_throttled;
1262         st.flows_plimit           = q->stat_flows_plimit;
1263         st.pkts_too_long          = q->stat_pkts_too_long;
1264         st.allocation_errors      = q->stat_allocation_errors;
1265         st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
1266                                     ktime_get_ns();
1267         st.flows                  = q->flows;
1268         st.inactive_flows         = q->inactive_flows;
1269         st.throttled_flows        = q->throttled_flows;
1270         st.unthrottle_latency_ns  = min_t(unsigned long,
1271                                           q->unthrottle_latency_ns, ~0U);
1272         st.ce_mark                = q->stat_ce_mark;
1273         st.horizon_drops          = q->stat_horizon_drops;
1274         st.horizon_caps           = q->stat_horizon_caps;
1275         for (i = 0; i < FQ_BANDS; i++) {
1276                 st.band_drops[i]  = q->stat_band_drops[i];
1277                 st.band_pkt_count[i] = q->band_pkt_count[i];
1278         }
1279         sch_tree_unlock(sch);
1280
1281         return gnet_stats_copy_app(d, &st, sizeof(st));
1282 }
1283
/*
 * Operations table for the "fq" qdisc; registered by fq_module_init() and
 * unregistered by fq_module_exit().
 */
static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
        .id             =       "fq",
        .priv_size      =       sizeof(struct fq_sched_data),

        .enqueue        =       fq_enqueue,
        .dequeue        =       fq_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .init           =       fq_init,
        .reset          =       fq_reset,
        .destroy        =       fq_destroy,
        .change         =       fq_change,
        .dump           =       fq_dump,
        .dump_stats     =       fq_dump_stats,
        .owner          =       THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("fq");
1300
1301 static int __init fq_module_init(void)
1302 {
1303         int ret;
1304
1305         fq_flow_cachep = kmem_cache_create("fq_flow_cache",
1306                                            sizeof(struct fq_flow),
1307                                            0, SLAB_HWCACHE_ALIGN, NULL);
1308         if (!fq_flow_cachep)
1309                 return -ENOMEM;
1310
1311         ret = register_qdisc(&fq_qdisc_ops);
1312         if (ret)
1313                 kmem_cache_destroy(fq_flow_cachep);
1314         return ret;
1315 }
1316
/*
 * Module unload: unregister the qdisc first, then destroy the flow slab
 * cache created in fq_module_init().
 */
static void __exit fq_module_exit(void)
{
        unregister_qdisc(&fq_qdisc_ops);
        kmem_cache_destroy(fq_flow_cachep);
}
1322
/* Module entry/exit hooks and module metadata. */
module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Fair Queue Packet Scheduler");