net/sched/sch_sfq.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
   4  *
   5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   6  */
   7
   8 #include <linux/module.h>
   9 #include <linux/types.h>
  10 #include <linux/kernel.h>
  11 #include <linux/jiffies.h>
  12 #include <linux/string.h>
  13 #include <linux/in.h>
  14 #include <linux/errno.h>
  15 #include <linux/init.h>
  16 #include <linux/skbuff.h>
  17 #include <linux/siphash.h>
  18 #include <linux/slab.h>
  19 #include <linux/vmalloc.h>
  20 #include <net/netlink.h>
  21 #include <net/pkt_sched.h>
  22 #include <net/pkt_cls.h>
  23 #include <net/red.h>
  24
  25
  26 /*      Stochastic Fairness Queuing algorithm.
  27         =======================================
  28
  29         Source:
  30         Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.
  32
  33         Paul E. McKenney "Stochastic Fairness Queuing",
  34         "Interworking: Research and Experience", v.2, 1991, p.113-131.
  35
  36
  37         See also:
  38         M. Shreedhar and George Varghese "Efficient Fair
  39         Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
  40
  41
  42         This is not the thing that is usually called (W)FQ nowadays.
  43         It does not use any timestamp mechanism, but instead
  44         processes queues in round-robin order.
  45
  46         ADVANTAGE:
  47
  48         - It is very cheap. Both CPU and memory requirements are minimal.
  49
  50         DRAWBACKS:
  51
  52         - "Stochastic" -> It is not 100% fair.
  53         When hash collisions occur, several flows are considered as one.
  54
  55         - "Round-robin" -> It introduces larger delays than virtual clock
  56         based schemes, and should not be used for isolating interactive
  57         traffic from non-interactive. It means, that this scheduler
  58         should be used as leaf of CBQ or P3, which put interactive traffic
  59         to higher priority band.
  60
  61         We still need true WFQ for top level CSZ, but using WFQ
  62         for the best effort traffic is absolutely pointless:
  63         SFQ is superior for this purpose.
  64
  65         IMPLEMENTATION:
  66         This implementation limits :
  67         - maximal queue length per flow to 127 packets.
  68         - max mtu to 2^18-1;
  69         - max 65408 flows,
  70         - number of hash buckets to 65536.
  71
  72         It is easy to increase these values, but not in flight.  */
  73
/* Upper bound on packets queued in a single flow; also sizes dep[] (SFQ_MAX_DEPTH + 1 entries). */
#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
/* Default number of flow slots, used by sfq_init() when no option overrides it. */
#define SFQ_DEFAULT_FLOWS	128
/*
 * 0x10000 - 127 - 1 = 65408: slot indexes and the SFQ_MAX_DEPTH + 1 dep[]
 * anchors must together fit in the 16-bit sfq_index space.
 */
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
/* Value stored in ht[] for a hash bucket that has no slot attached. */
#define SFQ_EMPTY_SLOT		0xffff
/* Default number of hash buckets, used by sfq_init() when no option overrides it. */
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;
  82
/*
 * We don't use pointers, to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to the slots[] array,
 * while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to the dep[] array.
 * sfq_dep_head() translates such an index back into a struct sfq_head pointer.
 */
struct sfq_head {
	sfq_index	next;	/* index of the next element in the depth list */
	sfq_index	prev;	/* index of the previous element in the depth list */
};
  93
/*
 * Per-flow state.
 *
 * The slot itself is cast to (struct sk_buff *) and used as the sentinel of
 * its packet list (see slot_queue_init() and the slot_dequeue_*() helpers),
 * so skblist_next/skblist_prev must remain the first two members.
 * NOTE(review): this presumably relies on struct sk_buff starting with its
 * next/prev pointers -- confirm against the sk_buff definition.
 */
struct sfq_slot {
	struct sk_buff	*skblist_next;	/* first skb, or the slot itself when empty */
	struct sk_buff	*skblist_prev;	/* last skb, or the slot itself when empty */
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq RR chain */
	struct sfq_head dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	int		allot; /* credit for this slot */

	unsigned int    backlog;	/* sum of qdisc_pkt_len() of the queued skbs */
	struct red_vars vars;		/* per-flow RED state, used when red_parms is set */
};
 106
/* Private data of one SFQ qdisc instance (obtained through qdisc_priv()). */
struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;	/* non-zero: drop from the head of a flow instead of the tail */
	u8		maxdepth;	/* limit of packets per flow */

	siphash_key_t	perturbation;	/* key passed to skb_get_hash_perturb() */
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;		/* TC_RED_* flags copied from the v1 options */
	struct tcf_proto __rcu *filter_list;	/* optional classifiers, consulted by sfq_classify() */
	struct tcf_block *block;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;	/* NULL unless RED was configured (qth_min != 0) */
	struct tc_sfqred_stats stats;	/* mark/drop counters reported by sfq_dump() */
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;	/* rekey interval in jiffies, 0 disables the timer */
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;	/* runs sfq_perturbation() */
	struct Qdisc	*sch;		/* back pointer, needed by the timer callback */
};
 139
 140 /*
 141  * sfq_head are either in a sfq_slot or in dep[] array
 142  */
 143 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
 144 {
 145         if (val < SFQ_MAX_FLOWS)
 146                 return &q->slots[val].dep;
 147         return &q->dep[val - SFQ_MAX_FLOWS];
 148 }
 149
 150 static unsigned int sfq_hash(const struct sfq_sched_data *q,
 151                              const struct sk_buff *skb)
 152 {
 153         return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
 154 }
 155
 156 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 157                                  int *qerr)
 158 {
 159         struct sfq_sched_data *q = qdisc_priv(sch);
 160         struct tcf_result res;
 161         struct tcf_proto *fl;
 162         int result;
 163
 164         if (TC_H_MAJ(skb->priority) == sch->handle &&
 165             TC_H_MIN(skb->priority) > 0 &&
 166             TC_H_MIN(skb->priority) <= q->divisor)
 167                 return TC_H_MIN(skb->priority);
 168
 169         fl = rcu_dereference_bh(q->filter_list);
 170         if (!fl)
 171                 return sfq_hash(q, skb) + 1;
 172
 173         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 174         result = tcf_classify(skb, NULL, fl, &res, false);
 175         if (result >= 0) {
 176 #ifdef CONFIG_NET_CLS_ACT
 177                 switch (result) {
 178                 case TC_ACT_STOLEN:
 179                 case TC_ACT_QUEUED:
 180                 case TC_ACT_TRAP:
 181                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 182                         fallthrough;
 183                 case TC_ACT_SHOT:
 184                         return 0;
 185                 }
 186 #endif
 187                 if (TC_H_MIN(res.classid) <= q->divisor)
 188                         return TC_H_MIN(res.classid);
 189         }
 190         return 0;
 191 }
 192
 193 /*
 194  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 195  */
 196 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 197 {
 198         sfq_index p, n;
 199         struct sfq_slot *slot = &q->slots[x];
 200         int qlen = slot->qlen;
 201
 202         p = qlen + SFQ_MAX_FLOWS;
 203         n = q->dep[qlen].next;
 204
 205         slot->dep.next = n;
 206         slot->dep.prev = p;
 207
 208         q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
 209         sfq_dep_head(q, n)->prev = x;
 210 }
 211
 212 #define sfq_unlink(q, x, n, p)                  \
 213         do {                                    \
 214                 n = q->slots[x].dep.next;       \
 215                 p = q->slots[x].dep.prev;       \
 216                 sfq_dep_head(q, p)->next = n;   \
 217                 sfq_dep_head(q, n)->prev = p;   \
 218         } while (0)
 219
 220
 221 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 222 {
 223         sfq_index p, n;
 224         int d;
 225
 226         sfq_unlink(q, x, n, p);
 227
 228         d = q->slots[x].qlen--;
 229         if (n == p && q->cur_depth == d)
 230                 q->cur_depth--;
 231         sfq_link(q, x);
 232 }
 233
 234 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 235 {
 236         sfq_index p, n;
 237         int d;
 238
 239         sfq_unlink(q, x, n, p);
 240
 241         d = ++q->slots[x].qlen;
 242         if (q->cur_depth < d)
 243                 q->cur_depth = d;
 244         sfq_link(q, x);
 245 }
 246
 247 /* helper functions : might be changed when/if skb use a standard list_head */
 248
 249 /* remove one skb from tail of slot queue */
 250 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
 251 {
 252         struct sk_buff *skb = slot->skblist_prev;
 253
 254         slot->skblist_prev = skb->prev;
 255         skb->prev->next = (struct sk_buff *)slot;
 256         skb->next = skb->prev = NULL;
 257         return skb;
 258 }
 259
 260 /* remove one skb from head of slot queue */
 261 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
 262 {
 263         struct sk_buff *skb = slot->skblist_next;
 264
 265         slot->skblist_next = skb->next;
 266         skb->next->prev = (struct sk_buff *)slot;
 267         skb->next = skb->prev = NULL;
 268         return skb;
 269 }
 270
 271 static inline void slot_queue_init(struct sfq_slot *slot)
 272 {
 273         memset(slot, 0, sizeof(*slot));
 274         slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
 275 }
 276
 277 /* add skb to slot queue (tail add) */
 278 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
 279 {
 280         skb->prev = slot->skblist_prev;
 281         skb->next = (struct sk_buff *)slot;
 282         slot->skblist_prev->next = skb;
 283         slot->skblist_prev = skb;
 284 }
 285
 286 static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
 287 {
 288         struct sfq_sched_data *q = qdisc_priv(sch);
 289         sfq_index x, d = q->cur_depth;
 290         struct sk_buff *skb;
 291         unsigned int len;
 292         struct sfq_slot *slot;
 293
 294         /* Queue is full! Find the longest slot and drop tail packet from it */
 295         if (d > 1) {
 296                 x = q->dep[d].next;
 297                 slot = &q->slots[x];
 298 drop:
 299                 skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
 300                 len = qdisc_pkt_len(skb);
 301                 slot->backlog -= len;
 302                 sfq_dec(q, x);
 303                 sch->q.qlen--;
 304                 qdisc_qstats_backlog_dec(sch, skb);
 305                 qdisc_drop(skb, sch, to_free);
 306                 return len;
 307         }
 308
 309         if (d == 1) {
 310                 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
 311                 x = q->tail->next;
 312                 slot = &q->slots[x];
 313                 q->tail->next = slot->next;
 314                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 315                 goto drop;
 316         }
 317
 318         return 0;
 319 }
 320
 321 /* Is ECN parameter configured */
 322 static int sfq_prob_mark(const struct sfq_sched_data *q)
 323 {
 324         return q->flags & TC_RED_ECN;
 325 }
 326
 327 /* Should packets over max threshold just be marked */
 328 static int sfq_hard_mark(const struct sfq_sched_data *q)
 329 {
 330         return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
 331 }
 332
 333 static int sfq_headdrop(const struct sfq_sched_data *q)
 334 {
 335         return q->headdrop;
 336 }
 337
/*
 * Enqueue one skb.
 *
 * Steps: classify the packet into a bucket, attach a free slot to the bucket
 * if it has none, apply the optional per-flow RED logic, enforce the
 * per-flow depth limit (q->maxdepth), queue the packet, and finally enforce
 * the global limit (q->limit) by dropping one packet through sfq_drop().
 *
 * Returns NET_XMIT_SUCCESS, NET_XMIT_CN when a packet of this same flow was
 * dropped, the value produced by qdisc_drop(), or the code that
 * sfq_classify() stored in ret when it refused the packet.
 */
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int ret;
	struct sk_buff *head;
	int delta;

	/* sfq_classify() returns a 1-based bucket; 0 means "do not queue" */
	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	/* only the address is formed here; it is recomputed below if x is SFQ_EMPTY_SLOT */
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		/* an index >= SFQ_MAX_FLOWS designates a dep[] anchor: no free slot left */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch, to_free);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		/* a brand new flow skips the RED and depth checks */
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			/* marking not enabled or not possible: drop instead */
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			/* marking not enabled or not possible: drop instead */
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		/* tail-drop mode: the new packet is the one that is refused */
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch, to_free);

		/* We know we have at least one packet in queue */
		/* head-drop mode: replace the oldest packet of the flow with the new one */
		head = slot_dequeue_head(slot);
		/* the packet count is unchanged; only the byte backlog moves by delta */
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch, to_free);

		slot_queue_add(slot, skb);
		qdisc_tree_reduce_backlog(sch, 0, delta);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * This might sound unfair for a new flow to wait after old ones,
		 * but we could endup servicing new flows only, and freeze old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	/* over the global limit: remember this flow's length to see who got hit */
	qlen = slot->qlen;
	dropped = sfq_drop(sch, to_free);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen) {
		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
		return NET_XMIT_CN;
	}

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}
 471
 472 static struct sk_buff *
 473 sfq_dequeue(struct Qdisc *sch)
 474 {
 475         struct sfq_sched_data *q = qdisc_priv(sch);
 476         struct sk_buff *skb;
 477         sfq_index a, next_a;
 478         struct sfq_slot *slot;
 479
 480         /* No active slots */
 481         if (q->tail == NULL)
 482                 return NULL;
 483
 484 next_slot:
 485         a = q->tail->next;
 486         slot = &q->slots[a];
 487         if (slot->allot <= 0) {
 488                 q->tail = slot;
 489                 slot->allot += q->quantum;
 490                 goto next_slot;
 491         }
 492         skb = slot_dequeue_head(slot);
 493         sfq_dec(q, a);
 494         qdisc_bstats_update(sch, skb);
 495         sch->q.qlen--;
 496         qdisc_qstats_backlog_dec(sch, skb);
 497         slot->backlog -= qdisc_pkt_len(skb);
 498         /* Is the slot empty? */
 499         if (slot->qlen == 0) {
 500                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 501                 next_a = slot->next;
 502                 if (a == next_a) {
 503                         q->tail = NULL; /* no more active slots */
 504                         return skb;
 505                 }
 506                 q->tail->next = next_a;
 507         } else {
 508                 slot->allot -= qdisc_pkt_len(skb);
 509         }
 510         return skb;
 511 }
 512
 513 static void
 514 sfq_reset(struct Qdisc *sch)
 515 {
 516         struct sk_buff *skb;
 517
 518         while ((skb = sfq_dequeue(sch)) != NULL)
 519                 rtnl_kfree_skbs(skb, skb);
 520 }
 521
 522 /*
 523  * When q->perturbation is changed, we rehash all queued skbs
 524  * to avoid OOO (Out Of Order) effects.
 525  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
 526  * counters.
 527  */
 528 static void sfq_rehash(struct Qdisc *sch)
 529 {
 530         struct sfq_sched_data *q = qdisc_priv(sch);
 531         struct sk_buff *skb;
 532         int i;
 533         struct sfq_slot *slot;
 534         struct sk_buff_head list;
 535         int dropped = 0;
 536         unsigned int drop_len = 0;
 537
 538         __skb_queue_head_init(&list);
 539
 540         for (i = 0; i < q->maxflows; i++) {
 541                 slot = &q->slots[i];
 542                 if (!slot->qlen)
 543                         continue;
 544                 while (slot->qlen) {
 545                         skb = slot_dequeue_head(slot);
 546                         sfq_dec(q, i);
 547                         __skb_queue_tail(&list, skb);
 548                 }
 549                 slot->backlog = 0;
 550                 red_set_vars(&slot->vars);
 551                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 552         }
 553         q->tail = NULL;
 554
 555         while ((skb = __skb_dequeue(&list)) != NULL) {
 556                 unsigned int hash = sfq_hash(q, skb);
 557                 sfq_index x = q->ht[hash];
 558
 559                 slot = &q->slots[x];
 560                 if (x == SFQ_EMPTY_SLOT) {
 561                         x = q->dep[0].next; /* get a free slot */
 562                         if (x >= SFQ_MAX_FLOWS) {
 563 drop:
 564                                 qdisc_qstats_backlog_dec(sch, skb);
 565                                 drop_len += qdisc_pkt_len(skb);
 566                                 kfree_skb(skb);
 567                                 dropped++;
 568                                 continue;
 569                         }
 570                         q->ht[hash] = x;
 571                         slot = &q->slots[x];
 572                         slot->hash = hash;
 573                 }
 574                 if (slot->qlen >= q->maxdepth)
 575                         goto drop;
 576                 slot_queue_add(slot, skb);
 577                 if (q->red_parms)
 578                         slot->vars.qavg = red_calc_qavg(q->red_parms,
 579                                                         &slot->vars,
 580                                                         slot->backlog);
 581                 slot->backlog += qdisc_pkt_len(skb);
 582                 sfq_inc(q, x);
 583                 if (slot->qlen == 1) {          /* The flow is new */
 584                         if (q->tail == NULL) {  /* It is the first flow */
 585                                 slot->next = x;
 586                         } else {
 587                                 slot->next = q->tail->next;
 588                                 q->tail->next = x;
 589                         }
 590                         q->tail = slot;
 591                         slot->allot = q->quantum;
 592                 }
 593         }
 594         sch->q.qlen -= dropped;
 595         qdisc_tree_reduce_backlog(sch, dropped, drop_len);
 596 }
 597
 598 static void sfq_perturbation(struct timer_list *t)
 599 {
 600         struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
 601         struct Qdisc *sch = q->sch;
 602         spinlock_t *root_lock;
 603         siphash_key_t nkey;
 604         int period;
 605
 606         get_random_bytes(&nkey, sizeof(nkey));
 607         rcu_read_lock();
 608         root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 609         spin_lock(root_lock);
 610         q->perturbation = nkey;
 611         if (!q->filter_list && q->tail)
 612                 sfq_rehash(sch);
 613         spin_unlock(root_lock);
 614
 615         /* q->perturb_period can change under us from
 616          * sfq_change() and sfq_destroy().
 617          */
 618         period = READ_ONCE(q->perturb_period);
 619         if (period)
 620                 mod_timer(&q->perturb_timer, jiffies + period);
 621         rcu_read_unlock();
 622 }
 623
 624 static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
 625                       struct netlink_ext_ack *extack)
 626 {
 627         struct sfq_sched_data *q = qdisc_priv(sch);
 628         struct tc_sfq_qopt *ctl = nla_data(opt);
 629         struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
 630         unsigned int qlen, dropped = 0;
 631         struct red_parms *p = NULL;
 632         struct sk_buff *to_free = NULL;
 633         struct sk_buff *tail = NULL;
 634
 635         if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 636                 return -EINVAL;
 637         if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
 638                 ctl_v1 = nla_data(opt);
 639         if (ctl->divisor &&
 640             (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
 641                 return -EINVAL;
 642
 643         if ((int)ctl->quantum < 0) {
 644                 NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
 645                 return -EINVAL;
 646         }
 647         if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
 648                                         ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
 649                 return -EINVAL;
 650         if (ctl_v1 && ctl_v1->qth_min) {
 651                 p = kmalloc(sizeof(*p), GFP_KERNEL);
 652                 if (!p)
 653                         return -ENOMEM;
 654         }
 655         sch_tree_lock(sch);
 656         if (ctl->quantum)
 657                 q->quantum = ctl->quantum;
 658         WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ);
 659         if (ctl->flows)
 660                 q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
 661         if (ctl->divisor) {
 662                 q->divisor = ctl->divisor;
 663                 q->maxflows = min_t(u32, q->maxflows, q->divisor);
 664         }
 665         if (ctl_v1) {
 666                 if (ctl_v1->depth)
 667                         q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
 668                 if (p) {
 669                         swap(q->red_parms, p);
 670                         red_set_parms(q->red_parms,
 671                                       ctl_v1->qth_min, ctl_v1->qth_max,
 672                                       ctl_v1->Wlog,
 673                                       ctl_v1->Plog, ctl_v1->Scell_log,
 674                                       NULL,
 675                                       ctl_v1->max_P);
 676                 }
 677                 q->flags = ctl_v1->flags;
 678                 q->headdrop = ctl_v1->headdrop;
 679         }
 680         if (ctl->limit) {
 681                 q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
 682                 q->maxflows = min_t(u32, q->maxflows, q->limit);
 683         }
 684
 685         qlen = sch->q.qlen;
 686         while (sch->q.qlen > q->limit) {
 687                 dropped += sfq_drop(sch, &to_free);
 688                 if (!tail)
 689                         tail = to_free;
 690         }
 691
 692         rtnl_kfree_skbs(to_free, tail);
 693         qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
 694
 695         del_timer(&q->perturb_timer);
 696         if (q->perturb_period) {
 697                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 698                 get_random_bytes(&q->perturbation, sizeof(q->perturbation));
 699         }
 700         sch_tree_unlock(sch);
 701         kfree(p);
 702         return 0;
 703 }
 704
 705 static void *sfq_alloc(size_t sz)
 706 {
 707         return  kvmalloc(sz, GFP_KERNEL);
 708 }
 709
 710 static void sfq_free(void *addr)
 711 {
 712         kvfree(addr);
 713 }
 714
/*
 * Tear down the qdisc's private state: release the filter block, stop the
 * perturbation timer and free the hash table, the slots and the RED
 * parameters. sfq_init() relies on this being called by its caller when
 * initialisation fails part-way.
 */
static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	/* a zero period stops sfq_perturbation() from re-arming the timer */
	WRITE_ONCE(q->perturb_period, 0);
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}
 726
 727 static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
 728                     struct netlink_ext_ack *extack)
 729 {
 730         struct sfq_sched_data *q = qdisc_priv(sch);
 731         int i;
 732         int err;
 733
 734         q->sch = sch;
 735         timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
 736
 737         err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
 738         if (err)
 739                 return err;
 740
 741         for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
 742                 q->dep[i].next = i + SFQ_MAX_FLOWS;
 743                 q->dep[i].prev = i + SFQ_MAX_FLOWS;
 744         }
 745
 746         q->limit = SFQ_MAX_DEPTH;
 747         q->maxdepth = SFQ_MAX_DEPTH;
 748         q->cur_depth = 0;
 749         q->tail = NULL;
 750         q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
 751         q->maxflows = SFQ_DEFAULT_FLOWS;
 752         q->quantum = psched_mtu(qdisc_dev(sch));
 753         q->perturb_period = 0;
 754         get_random_bytes(&q->perturbation, sizeof(q->perturbation));
 755
 756         if (opt) {
 757                 int err = sfq_change(sch, opt, extack);
 758                 if (err)
 759                         return err;
 760         }
 761
 762         q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
 763         q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
 764         if (!q->ht || !q->slots) {
 765                 /* Note: sfq_destroy() will be called by our caller */
 766                 return -ENOMEM;
 767         }
 768
 769         for (i = 0; i < q->divisor; i++)
 770                 q->ht[i] = SFQ_EMPTY_SLOT;
 771
 772         for (i = 0; i < q->maxflows; i++) {
 773                 slot_queue_init(&q->slots[i]);
 774                 sfq_link(q, i);
 775         }
 776         if (q->limit >= 1)
 777                 sch->flags |= TCQ_F_CAN_BYPASS;
 778         else
 779                 sch->flags &= ~TCQ_F_CAN_BYPASS;
 780         return 0;
 781 }
 782
 783 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 784 {
 785         struct sfq_sched_data *q = qdisc_priv(sch);
 786         unsigned char *b = skb_tail_pointer(skb);
 787         struct tc_sfq_qopt_v1 opt;
 788         struct red_parms *p = q->red_parms;
 789
 790         memset(&opt, 0, sizeof(opt));
 791         opt.v0.quantum  = q->quantum;
 792         opt.v0.perturb_period = q->perturb_period / HZ;
 793         opt.v0.limit    = q->limit;
 794         opt.v0.divisor  = q->divisor;
 795         opt.v0.flows    = q->maxflows;
 796         opt.depth       = q->maxdepth;
 797         opt.headdrop    = q->headdrop;
 798
 799         if (p) {
 800                 opt.qth_min     = p->qth_min >> p->Wlog;
 801                 opt.qth_max     = p->qth_max >> p->Wlog;
 802                 opt.Wlog        = p->Wlog;
 803                 opt.Plog        = p->Plog;
 804                 opt.Scell_log   = p->Scell_log;
 805                 opt.max_P       = p->max_P;
 806         }
 807         memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
 808         opt.flags       = q->flags;
 809
 810         if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
 811                 goto nla_put_failure;
 812
 813         return skb->len;
 814
 815 nla_put_failure:
 816         nlmsg_trim(skb, b);
 817         return -1;
 818 }
 819
/* Class op stub: always returns NULL, no child qdisc is ever reported. */
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}
 824
/* Class op stub: always returns 0, whatever classid is asked for. */
static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}
 829
/* Class op stub for filter binding: does nothing and always returns 0. */
static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}
 835
/* Class op stub for filter unbinding: intentionally empty. */
static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}
 839
 840 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
 841                                        struct netlink_ext_ack *extack)
 842 {
 843         struct sfq_sched_data *q = qdisc_priv(sch);
 844
 845         if (cl)
 846                 return NULL;
 847         return q->block;
 848 }
 849
/*
 * Fill the class part of a dump message: the minor number of cl is OR-ed
 * into tcm->tcm_handle. Always returns 0.
 */
static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}
 856
 857 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 858                                 struct gnet_dump *d)
 859 {
 860         struct sfq_sched_data *q = qdisc_priv(sch);
 861         sfq_index idx = q->ht[cl - 1];
 862         struct gnet_stats_queue qs = { 0 };
 863         struct tc_sfq_xstats xstats = { 0 };
 864
 865         if (idx != SFQ_EMPTY_SLOT) {
 866                 const struct sfq_slot *slot = &q->slots[idx];
 867
 868                 xstats.allot = slot->allot;
 869                 qs.qlen = slot->qlen;
 870                 qs.backlog = slot->backlog;
 871         }
 872         if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
 873                 return -1;
 874         return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
 875 }
 876
 877 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 878 {
 879         struct sfq_sched_data *q = qdisc_priv(sch);
 880         unsigned int i;
 881
 882         if (arg->stop)
 883                 return;
 884
 885         for (i = 0; i < q->divisor; i++) {
 886                 if (q->ht[i] == SFQ_EMPTY_SLOT) {
 887                         arg->count++;
 888                         continue;
 889                 }
 890                 if (!tc_qdisc_stats_dump(sch, i + 1, arg))
 891                         break;
 892         }
 893 }
 894
/*
 * Class operations. leaf/find/bind/unbind are stubs; the hash buckets are
 * exposed as classes through walk, dump and dump_stats, and tcf_block lets
 * filters be attached to the qdisc.
 */
static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.find		=	sfq_find,
	.tcf_block	=	sfq_tcf_block,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_unbind,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};
 905
/*
 * Qdisc operations registered under the id "sfq".
 * No change callback is installed: in this file sfq_change() is reached
 * only from sfq_init().
 */
static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("sfq");
 921
/* Module entry point: register the "sfq" qdisc; returns register_qdisc()'s result. */
static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
/* Module exit point: unregister the "sfq" qdisc. */
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
/* Hook the init/exit functions into the module loader and declare module metadata. */
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Stochastic Fairness qdisc");