// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/siphash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>
/* Stochastic Fairness Queuing algorithm.
   =======================================

   Source:
   Paul E. McKenney "Stochastic Fairness Queuing",
   IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.

   Paul E. McKenney "Stochastic Fairness Queuing",
   "Interworking: Research and Experience", v.2, 1991, p.113-131.

   See also:
   M. Shreedhar and George Varghese "Efficient Fair
   Queuing using Deficit Round Robin", Proc. SIGCOMM 95.

   This is not the thing that is usually called (W)FQ nowadays.
   It does not use any timestamp mechanism, but instead
   processes queues in round-robin order.

   ADVANTAGE:

   - It is very cheap. Both CPU and memory requirements are minimal.

   DRAWBACKS:

   - "Stochastic" -> It is not 100% fair.
   When hash collisions occur, several flows are considered as one.

   - "Round-robin" -> It introduces larger delays than virtual clock
   based schemes, and should not be used for isolating interactive
   traffic from non-interactive. This means that this scheduler
   should be used as a leaf of CBQ or P3, which put interactive traffic
   to a higher priority band.

   We still need true WFQ for top level CSZ, but using WFQ
   for the best effort traffic is absolutely pointless:
   SFQ is superior for this purpose.

   IMPLEMENTATION:
   This implementation limits:
   - maximal queue length per flow to 127 packets.
   - max mtu to 2^18-1;
   - max 65408 flows,
   - number of hash buckets to 65536.

   It is easy to increase these values, but not in flight.  */
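/* Illustrative usage sketch (editorial note, not part of the original
 * source): assuming the usual iproute2 tc-sfq parameter names, a typical
 * configuration is
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * which installs SFQ as the root qdisc of eth0 and re-keys the flow hash
 * every 10 seconds (see sfq_perturbation() below).
 */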
#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024
/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;
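/* A quick check of that bound (editorial note): SFQ_MAX_DEPTH + 1 +
 * SFQ_MAX_FLOWS = 127 + 1 + (0x10000 - 127 - 1) = 0x10000 = 2^16,
 * so the full index space fits a u16 exactly.  SFQ_EMPTY_SLOT (0xffff)
 * only ever appears in ht[], which otherwise stores slot indexes
 * below SFQ_MAX_FLOWS, so it cannot be confused with a valid entry there.
 */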
/*
 * We don't use pointers to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
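/* For instance (editorial example): val == 3 names &slots[3].dep, while
 * val == SFQ_MAX_FLOWS + 3 names &dep[3], the chain of all flows that
 * currently hold exactly three packets; see sfq_dep_head() below.
 */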
struct sfq_head {
        sfq_index	next;
        sfq_index	prev;
};
struct sfq_slot {
        struct sk_buff	*skblist_next;
        struct sk_buff	*skblist_prev;
        sfq_index	qlen;	/* number of skbs in skblist */
        sfq_index	next;	/* next slot in sfq RR chain */
        struct sfq_head	dep;	/* anchor in dep[] chains */
        unsigned short	hash;	/* hash value (index in ht[]) */
        int		allot;	/* credit for this slot */

        unsigned int	backlog;
        struct red_vars	vars;
};
struct sfq_sched_data {
/* frequently used fields */
        int		limit;		/* limit of total number of packets in this qdisc */
        unsigned int	divisor;	/* number of slots in hash table */
        u8		headdrop;
        u8		maxdepth;	/* limit of packets per flow */

        siphash_key_t	perturbation;
        u8		cur_depth;	/* depth of longest slot */
        u8		flags;
        struct tcf_proto __rcu *filter_list;
        struct tcf_block *block;
        sfq_index	*ht;		/* Hash table ('divisor' slots) */
        struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

        struct red_parms *red_parms;
        struct tc_sfqred_stats stats;
        struct sfq_slot	*tail;		/* current slot in round */

        struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
                                        /* Linked lists of slots, indexed by depth
                                         * dep[0] : list of unused flows
                                         * dep[1] : list of flows with 1 packet
                                         * dep[X] : list of flows with X packets
                                         */

        unsigned int	maxflows;	/* number of flows in flows array */
        int		perturb_period;
        unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
        struct timer_list perturb_timer;
        struct Qdisc	*sch;
};
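/* Illustrative dep[] walk (editorial note): when a flow's qlen goes from
 * 0 to 1, sfq_inc() unlinks it from dep[0] (the free list) and sfq_link()
 * relinks it at the head of dep[1]; q->cur_depth tracks the largest
 * occupied depth, so sfq_drop() can find a longest flow in O(1) via
 * q->dep[q->cur_depth].next.
 */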
/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
        if (val < SFQ_MAX_FLOWS)
                return &q->slots[val].dep;
        return &q->dep[val - SFQ_MAX_FLOWS];
}
static unsigned int sfq_hash(const struct sfq_sched_data *q,
                             const struct sk_buff *skb)
{
        return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}
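/* Example (editorial note): with the default divisor of 1024, the
 * perturbed flow hash is masked down to buckets 0..1023.  sfq_change()
 * rejects divisors that are not powers of two, so the mask is exact.
 */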
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
                                 int *qerr)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        struct tcf_proto *fl;
        int result;

        if (TC_H_MAJ(skb->priority) == sch->handle &&
            TC_H_MIN(skb->priority) > 0 &&
            TC_H_MIN(skb->priority) <= q->divisor)
                return TC_H_MIN(skb->priority);

        fl = rcu_dereference_bh(q->filter_list);
        if (!fl)
                return sfq_hash(q, skb) + 1;

        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        result = tcf_classify(skb, NULL, fl, &res, false);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                case TC_ACT_TRAP:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
                        fallthrough;
                case TC_ACT_SHOT:
                        return 0;
                }
#endif
                if (TC_H_MIN(res.classid) <= q->divisor)
                        return TC_H_MIN(res.classid);
        }
        return 0;
}
/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        struct sfq_slot *slot = &q->slots[x];
        int qlen = slot->qlen;

        p = qlen + SFQ_MAX_FLOWS;
        n = q->dep[qlen].next;

        slot->dep.next = n;
        slot->dep.prev = p;

        q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
        sfq_dep_head(q, n)->prev = x;
}
#define sfq_unlink(q, x, n, p)			\
        do {					\
                n = q->slots[x].dep.next;	\
                p = q->slots[x].dep.prev;	\
                sfq_dep_head(q, p)->next = n;	\
                sfq_dep_head(q, n)->prev = p;	\
        } while (0)
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        sfq_unlink(q, x, n, p);

        d = q->slots[x].qlen--;
        if (n == p && q->cur_depth == d)
                q->cur_depth--;
        sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        sfq_unlink(q, x, n, p);

        d = ++q->slots[x].qlen;
        if (q->cur_depth < d)
                q->cur_depth = d;
        sfq_link(q, x);
}
/* helper functions : might be changed when/if skbs use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
        struct sk_buff *skb = slot->skblist_prev;

        slot->skblist_prev = skb->prev;
        skb->prev->next = (struct sk_buff *)slot;
        skb->next = skb->prev = NULL;
        return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
        struct sk_buff *skb = slot->skblist_next;

        slot->skblist_next = skb->next;
        skb->next->prev = (struct sk_buff *)slot;
        skb->next = skb->prev = NULL;
        return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
        memset(slot, 0, sizeof(*slot));
        slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
        skb->prev = slot->skblist_prev;
        skb->next = (struct sk_buff *)slot;
        slot->skblist_prev->next = skb;
        slot->skblist_prev = skb;
}
static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index x, d = q->cur_depth;
        struct sk_buff *skb;
        unsigned int len;
        struct sfq_slot *slot;

        /* Queue is full! Find the longest slot and drop tail packet from it */
        if (d > 1) {
                x = q->dep[d].next;
                slot = &q->slots[x];
drop:
                skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
                len = qdisc_pkt_len(skb);
                slot->backlog -= len;
                sfq_dec(q, x);
                sch->q.qlen--;
                qdisc_qstats_backlog_dec(sch, skb);
                qdisc_drop(skb, sch, to_free);
                return len;
        }

        if (d == 1) {
                /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
                x = q->tail->next;
                slot = &q->slots[x];
                q->tail->next = slot->next;
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
                goto drop;
        }

        return 0;
}
/* Is ECN parameter configured */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
        return q->flags & TC_RED_ECN;
}

/* Should packets over max threshold just be marked */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
        return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
        return q->headdrop;
}
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int hash, dropped;
        sfq_index x, qlen;
        struct sfq_slot *slot;
        int ret;
        struct sk_buff *head;
        int delta;

        hash = sfq_classify(skb, sch, &ret);
        if (hash == 0) {
                if (ret & __NET_XMIT_BYPASS)
                        qdisc_qstats_drop(sch);
                __qdisc_drop(skb, to_free);
                return ret;
        }
        hash--;

        x = q->ht[hash];
        slot = &q->slots[x];
        if (x == SFQ_EMPTY_SLOT) {
                x = q->dep[0].next; /* get a free slot */
                if (x >= SFQ_MAX_FLOWS)
                        return qdisc_drop(skb, sch, to_free);
                q->ht[hash] = x;
                slot = &q->slots[x];
                slot->hash = hash;
                slot->backlog = 0; /* should already be 0 anyway... */
                red_set_vars(&slot->vars);
                goto enqueue;
        }
        if (q->red_parms) {
                slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
                                                             &slot->vars,
                                                             slot->backlog);
                switch (red_action(q->red_parms,
                                   &slot->vars,
                                   slot->vars.qavg)) {
                case RED_DONT_MARK:
                        break;

                case RED_PROB_MARK:
                        qdisc_qstats_overlimit(sch);
                        if (sfq_prob_mark(q)) {
                                /* We know we have at least one packet in queue */
                                if (sfq_headdrop(q) &&
                                    INET_ECN_set_ce(slot->skblist_next)) {
                                        q->stats.prob_mark_head++;
                                        break;
                                }
                                if (INET_ECN_set_ce(skb)) {
                                        q->stats.prob_mark++;
                                        break;
                                }
                        }
                        q->stats.prob_drop++;
                        goto congestion_drop;

                case RED_HARD_MARK:
                        qdisc_qstats_overlimit(sch);
                        if (sfq_hard_mark(q)) {
                                /* We know we have at least one packet in queue */
                                if (sfq_headdrop(q) &&
                                    INET_ECN_set_ce(slot->skblist_next)) {
                                        q->stats.forced_mark_head++;
                                        break;
                                }
                                if (INET_ECN_set_ce(skb)) {
                                        q->stats.forced_mark++;
                                        break;
                                }
                        }
                        q->stats.forced_drop++;
                        goto congestion_drop;
                }
        }

        if (slot->qlen >= q->maxdepth) {
congestion_drop:
                if (!sfq_headdrop(q))
                        return qdisc_drop(skb, sch, to_free);

                /* We know we have at least one packet in queue */
                head = slot_dequeue_head(slot);
                delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
                sch->qstats.backlog -= delta;
                slot->backlog -= delta;
                qdisc_drop(head, sch, to_free);

                slot_queue_add(slot, skb);
                qdisc_tree_reduce_backlog(sch, 0, delta);
                return NET_XMIT_CN;
        }

enqueue:
        qdisc_qstats_backlog_inc(sch, skb);
        slot->backlog += qdisc_pkt_len(skb);
        slot_queue_add(slot, skb);
        sfq_inc(q, x);
        if (slot->qlen == 1) {		/* The flow is new */
                if (q->tail == NULL) {	/* It is the first flow */
                        slot->next = x;
                } else {
                        slot->next = q->tail->next;
                        q->tail->next = x;
                }
                /* We put this flow at the end of our flow list.
                 * This might sound unfair for a new flow to wait after old ones,
                 * but we could end up servicing new flows only, and freeze old ones.
                 */
                q->tail = slot;
                /* We could use a bigger initial quantum for new flows */
                slot->allot = q->quantum;
        }
        if (++sch->q.qlen <= q->limit)
                return NET_XMIT_SUCCESS;

        qlen = slot->qlen;
        dropped = sfq_drop(sch, to_free);
        /* Return Congestion Notification only if we dropped a packet
         * from this flow.
         */
        if (qlen != slot->qlen) {
                qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
                return NET_XMIT_CN;
        }

        /* As we dropped a packet, better let upper stack know this */
        qdisc_tree_reduce_backlog(sch, 1, dropped);
        return NET_XMIT_SUCCESS;
}
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        sfq_index a, next_a;
        struct sfq_slot *slot;

        /* No active slots */
        if (q->tail == NULL)
                return NULL;

next_slot:
        a = q->tail->next;
        slot = &q->slots[a];
        if (slot->allot <= 0) {
                q->tail = slot;
                slot->allot += q->quantum;
                goto next_slot;
        }
        skb = slot_dequeue_head(slot);
        sfq_dec(q, a);
        qdisc_bstats_update(sch, skb);
        sch->q.qlen--;
        qdisc_qstats_backlog_dec(sch, skb);
        slot->backlog -= qdisc_pkt_len(skb);
        /* Is the slot empty? */
        if (slot->qlen == 0) {
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
                next_a = slot->next;
                if (a == next_a) {
                        q->tail = NULL; /* no more active slots */
                        return skb;
                }
                q->tail->next = next_a;
        } else {
                slot->allot -= qdisc_pkt_len(skb);
        }
        return skb;
}
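/* Worked allot example (editorial note): with quantum == 1514, a flow
 * dequeuing 1000-byte packets goes 1514 -> 514 -> -486 over successive
 * calls; once its allot is <= 0 the round moves past it and the allot
 * is replenished by one quantum (here to 1028), so each active flow
 * averages one quantum's worth of bytes per round, as in DRR.
 */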
static void
sfq_reset(struct Qdisc *sch)
{
        struct sk_buff *skb;

        while ((skb = sfq_dequeue(sch)) != NULL)
                rtnl_kfree_skbs(skb, skb);
}
/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        int i;
        struct sfq_slot *slot;
        struct sk_buff_head list;
        int dropped = 0;
        unsigned int drop_len = 0;

        __skb_queue_head_init(&list);

        for (i = 0; i < q->maxflows; i++) {
                slot = &q->slots[i];
                if (!slot->qlen)
                        continue;
                while (slot->qlen) {
                        skb = slot_dequeue_head(slot);
                        sfq_dec(q, i);
                        __skb_queue_tail(&list, skb);
                }
                slot->backlog = 0;
                red_set_vars(&slot->vars);
                q->ht[slot->hash] = SFQ_EMPTY_SLOT;
        }
        q->tail = NULL;

        while ((skb = __skb_dequeue(&list)) != NULL) {
                unsigned int hash = sfq_hash(q, skb);
                sfq_index x = q->ht[hash];

                slot = &q->slots[x];
                if (x == SFQ_EMPTY_SLOT) {
                        x = q->dep[0].next; /* get a free slot */
                        if (x >= SFQ_MAX_FLOWS) {
drop:
                                qdisc_qstats_backlog_dec(sch, skb);
                                drop_len += qdisc_pkt_len(skb);
                                kfree_skb(skb);
                                dropped++;
                                continue;
                        }
                        q->ht[hash] = x;
                        slot = &q->slots[x];
                        slot->hash = hash;
                }
                if (slot->qlen >= q->maxdepth)
                        goto drop;
                slot_queue_add(slot, skb);
                if (q->red_parms)
                        slot->vars.qavg = red_calc_qavg(q->red_parms,
                                                        &slot->vars,
                                                        slot->backlog);
                slot->backlog += qdisc_pkt_len(skb);
                sfq_inc(q, x);
                if (slot->qlen == 1) {		/* The flow is new */
                        if (q->tail == NULL) {	/* It is the first flow */
                                slot->next = x;
                        } else {
                                slot->next = q->tail->next;
                                q->tail->next = x;
                        }
                        q->tail = slot;
                        slot->allot = q->quantum;
                }
        }
        sch->q.qlen -= dropped;
        qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}
static void sfq_perturbation(struct timer_list *t)
{
        struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
        struct Qdisc *sch = q->sch;
        spinlock_t *root_lock;
        siphash_key_t nkey;
        int period;

        get_random_bytes(&nkey, sizeof(nkey));
        rcu_read_lock();
        root_lock = qdisc_lock(qdisc_root_sleeping(sch));
        spin_lock(root_lock);
        q->perturbation = nkey;
        if (!q->filter_list && q->tail)
                sfq_rehash(sch);
        spin_unlock(root_lock);

        /* q->perturb_period can change under us from
         * sfq_change() and sfq_destroy().
         */
        period = READ_ONCE(q->perturb_period);
        if (period)
                mod_timer(&q->perturb_timer, jiffies + period);
        rcu_read_unlock();
}
static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
                      struct netlink_ext_ack *extack)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tc_sfq_qopt *ctl = nla_data(opt);
        struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
        unsigned int qlen, dropped = 0;
        struct red_parms *p = NULL;
        struct sk_buff *to_free = NULL;
        struct sk_buff *tail = NULL;

        if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
                return -EINVAL;
        if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
                ctl_v1 = nla_data(opt);
        if (ctl->divisor &&
            (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
                return -EINVAL;

        if ((int)ctl->quantum < 0) {
                NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
                return -EINVAL;
        }
        /* Validate the limit before allocating 'p', so the error
         * path below cannot leak the allocation.
         */
        if (ctl->limit == 1) {
                NL_SET_ERR_MSG_MOD(extack, "invalid limit");
                return -EINVAL;
        }
        if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
                                        ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
                return -EINVAL;
        if (ctl_v1 && ctl_v1->qth_min) {
                p = kmalloc(sizeof(*p), GFP_KERNEL);
                if (!p)
                        return -ENOMEM;
        }
        sch_tree_lock(sch);
        if (ctl->quantum)
                q->quantum = ctl->quantum;
        WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ);
        if (ctl->flows)
                q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
        if (ctl->divisor) {
                q->divisor = ctl->divisor;
                q->maxflows = min_t(u32, q->maxflows, q->divisor);
        }
        if (ctl_v1) {
                if (ctl_v1->depth)
                        q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
                if (p) {
                        swap(q->red_parms, p);
                        red_set_parms(q->red_parms,
                                      ctl_v1->qth_min, ctl_v1->qth_max,
                                      ctl_v1->Wlog,
                                      ctl_v1->Plog, ctl_v1->Scell_log,
                                      NULL,
                                      ctl_v1->max_P);
                }
                q->flags = ctl_v1->flags;
                q->headdrop = ctl_v1->headdrop;
        }
        if (ctl->limit) {
                q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
                q->maxflows = min_t(u32, q->maxflows, q->limit);
        }

        qlen = sch->q.qlen;
        while (sch->q.qlen > q->limit) {
                dropped += sfq_drop(sch, &to_free);
                if (!tail)
                        tail = to_free;
        }

        rtnl_kfree_skbs(to_free, tail);
        qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

        del_timer(&q->perturb_timer);
        if (q->perturb_period) {
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
                get_random_bytes(&q->perturbation, sizeof(q->perturbation));
        }
        sch_tree_unlock(sch);
        kfree(p);
        return 0;
}
static void *sfq_alloc(size_t sz)
{
        return kvmalloc(sz, GFP_KERNEL);
}

static void sfq_free(void *addr)
{
        kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        tcf_block_put(q->block);
        WRITE_ONCE(q->perturb_period, 0);
        del_timer_sync(&q->perturb_timer);
        sfq_free(q->ht);
        sfq_free(q->slots);
        kfree(q->red_parms);
}
static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
                    struct netlink_ext_ack *extack)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        int i;
        int err;

        q->sch = sch;
        timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);

        err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
        if (err)
                return err;

        for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
                q->dep[i].next = i + SFQ_MAX_FLOWS;
                q->dep[i].prev = i + SFQ_MAX_FLOWS;
        }

        q->limit = SFQ_MAX_DEPTH;
        q->maxdepth = SFQ_MAX_DEPTH;
        q->cur_depth = 0;
        q->tail = NULL;
        q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
        q->maxflows = SFQ_DEFAULT_FLOWS;
        q->quantum = psched_mtu(qdisc_dev(sch));
        q->perturb_period = 0;
        get_random_bytes(&q->perturbation, sizeof(q->perturbation));

        if (opt) {
                int err = sfq_change(sch, opt, extack);

                if (err)
                        return err;
        }

        q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
        q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
        if (!q->ht || !q->slots) {
                /* Note: sfq_destroy() will be called by our caller */
                return -ENOMEM;
        }

        for (i = 0; i < q->divisor; i++)
                q->ht[i] = SFQ_EMPTY_SLOT;

        for (i = 0; i < q->maxflows; i++) {
                slot_queue_init(&q->slots[i]);
                sfq_link(q, i);
        }
        if (q->limit >= 1)
                sch->flags |= TCQ_F_CAN_BYPASS;
        else
                sch->flags &= ~TCQ_F_CAN_BYPASS;
        return 0;
}
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_sfq_qopt_v1 opt;
        struct red_parms *p = q->red_parms;

        memset(&opt, 0, sizeof(opt));
        opt.v0.quantum	= q->quantum;
        opt.v0.perturb_period = q->perturb_period / HZ;
        opt.v0.limit	= q->limit;
        opt.v0.divisor	= q->divisor;
        opt.v0.flows	= q->maxflows;
        opt.depth	= q->maxdepth;
        opt.headdrop	= q->headdrop;

        if (p) {
                opt.qth_min	= p->qth_min >> p->Wlog;
                opt.qth_max	= p->qth_max >> p->Wlog;
                opt.Wlog	= p->Wlog;
                opt.Plog	= p->Plog;
                opt.Scell_log	= p->Scell_log;
                opt.max_P	= p->max_P;
        }
        memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
        opt.flags	= q->flags;

        if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
                goto nla_put_failure;

        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
                              u32 classid)
{
        return 0;
}

static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
                                       struct netlink_ext_ack *extack)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return q->block;
}
static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
                                struct gnet_dump *d)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index idx = q->ht[cl - 1];
        struct gnet_stats_queue qs = { 0 };
        struct tc_sfq_xstats xstats = { 0 };

        if (idx != SFQ_EMPTY_SLOT) {
                const struct sfq_slot *slot = &q->slots[idx];

                xstats.allot = slot->allot;
                qs.qlen = slot->qlen;
                qs.backlog = slot->backlog;
        }
        if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
                return -1;
        return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}
static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int i;

        if (arg->stop)
                return;

        for (i = 0; i < q->divisor; i++) {
                if (q->ht[i] == SFQ_EMPTY_SLOT) {
                        arg->count++;
                        continue;
                }
                if (!tc_qdisc_stats_dump(sch, i + 1, arg))
                        break;
        }
}
static const struct Qdisc_class_ops sfq_class_ops = {
        .leaf		=	sfq_leaf,
        .find		=	sfq_find,
        .tcf_block	=	sfq_tcf_block,
        .bind_tcf	=	sfq_bind,
        .unbind_tcf	=	sfq_unbind,
        .dump		=	sfq_dump_class,
        .dump_stats	=	sfq_dump_class_stats,
        .walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
        .cl_ops		=	&sfq_class_ops,
        .id		=	"sfq",
        .priv_size	=	sizeof(struct sfq_sched_data),
        .enqueue	=	sfq_enqueue,
        .dequeue	=	sfq_dequeue,
        .peek		=	qdisc_peek_dequeued,
        .init		=	sfq_init,
        .reset		=	sfq_reset,
        .destroy	=	sfq_destroy,
        .change		=	NULL,
        .dump		=	sfq_dump,
        .owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("sfq");
static int __init sfq_module_init(void)
{
        return register_qdisc(&sfq_qdisc_ops);
}

static void __exit sfq_module_exit(void)
{
        unregister_qdisc(&sfq_qdisc_ops);
}

module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Stochastic Fairness qdisc");