net/sched/sch_sfq.c

   1 /*
   2  * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  */
  11
  12 #include <linux/module.h>
  13 #include <linux/types.h>
  14 #include <linux/kernel.h>
  15 #include <linux/jiffies.h>
  16 #include <linux/string.h>
  17 #include <linux/in.h>
  18 #include <linux/errno.h>
  19 #include <linux/init.h>
  20 #include <linux/skbuff.h>
  21 #include <linux/jhash.h>
  22 #include <linux/slab.h>
  23 #include <linux/vmalloc.h>
  24 #include <net/netlink.h>
  25 #include <net/pkt_sched.h>
  26 #include <net/flow_keys.h>
  27 #include <net/red.h>
  28
  29
  30 /*      Stochastic Fairness Queuing algorithm.
  31         =======================================
  32
  33         Source:
  34         Paul E. McKenney "Stochastic Fairness Queuing",
  35         IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
  36
  37         Paul E. McKenney "Stochastic Fairness Queuing",
  38         "Interworking: Research and Experience", v.2, 1991, p.113-131.
  39
  40
  41         See also:
  42         M. Shreedhar and George Varghese "Efficient Fair
  43         Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
  44
  45
  46         This is not the thing that is usually called (W)FQ nowadays.
  47         It does not use any timestamp mechanism, but instead
  48         processes queues in round-robin order.
  49
  50         ADVANTAGE:
  51
  52         - It is very cheap. Both CPU and memory requirements are minimal.
  53
  54         DRAWBACKS:
  55
  56         - "Stochastic" -> It is not 100% fair.
  57         When hash collisions occur, several flows are considered as one.
  58
  59         - "Round-robin" -> It introduces larger delays than virtual clock
  60         based schemes, and should not be used for isolating interactive
  61         traffic from non-interactive. It means, that this scheduler
  62         should be used as leaf of CBQ or P3, which put interactive traffic
  63         to higher priority band.
  64
  65         We still need true WFQ for top level CSZ, but using WFQ
  66         for the best effort traffic is absolutely pointless:
  67         SFQ is superior for this purpose.
  68
  69         IMPLEMENTATION:
  70         This implementation limits :
  71         - maximal queue length per flow to 127 packets.
  72         - max mtu to 2^18-1;
  73         - max 65408 flows,
  74         - number of hash buckets to 65536.
  75
  76         It is easy to increase these values, but not in flight.  */
  77
  78 #define SFQ_MAX_DEPTH           127 /* max number of packets per flow */
  79 #define SFQ_DEFAULT_FLOWS       128
  80 #define SFQ_MAX_FLOWS           (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
  81 #define SFQ_EMPTY_SLOT          0xffff
  82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
  83
  84 /* We use 16 bits to store allot, and want to handle packets up to 64K
  85  * Scale allot by 8 (1<<3) so that no overflow occurs.
  86  */
  87 #define SFQ_ALLOT_SHIFT         3
  88 #define SFQ_ALLOT_SIZE(X)       DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
  89
  90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
  91 typedef u16 sfq_index;
  92
  93 /*
  94  * We dont use pointers to save space.
  95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
  96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
  97  * are 'pointers' to dep[] array
  98  */
  99 struct sfq_head {
 100         sfq_index       next;
 101         sfq_index       prev;
 102 };
 103
 104 struct sfq_slot {
 105         struct sk_buff  *skblist_next;
 106         struct sk_buff  *skblist_prev;
 107         sfq_index       qlen; /* number of skbs in skblist */
 108         sfq_index       next; /* next slot in sfq RR chain */
 109         struct sfq_head dep; /* anchor in dep[] chains */
 110         unsigned short  hash; /* hash value (index in ht[]) */
 111         short           allot; /* credit for this slot */
 112
 113         unsigned int    backlog;
 114         struct red_vars vars;
 115 };
 116
 117 struct sfq_sched_data {
 118 /* frequently used fields */
 119         int             limit;          /* limit of total number of packets in this qdisc */
 120         unsigned int    divisor;        /* number of slots in hash table */
 121         u8              headdrop;
 122         u8              maxdepth;       /* limit of packets per flow */
 123
 124         u32             perturbation;
 125         u8              cur_depth;      /* depth of longest slot */
 126         u8              flags;
 127         unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
 128         struct tcf_proto __rcu *filter_list;
 129         sfq_index       *ht;            /* Hash table ('divisor' slots) */
 130         struct sfq_slot *slots;         /* Flows table ('maxflows' entries) */
 131
 132         struct red_parms *red_parms;
 133         struct tc_sfqred_stats stats;
 134         struct sfq_slot *tail;          /* current slot in round */
 135
 136         struct sfq_head dep[SFQ_MAX_DEPTH + 1];
 137                                         /* Linked lists of slots, indexed by depth
 138                                          * dep[0] : list of unused flows
 139                                          * dep[1] : list of flows with 1 packet
 140                                          * dep[X] : list of flows with X packets
 141                                          */
 142
 143         unsigned int    maxflows;       /* number of flows in flows array */
 144         int             perturb_period;
 145         unsigned int    quantum;        /* Allotment per round: MUST BE >= MTU */
 146         struct timer_list perturb_timer;
 147 };
 148
 149 /*
 150  * sfq_head are either in a sfq_slot or in dep[] array
 151  */
 152 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
 153 {
 154         if (val < SFQ_MAX_FLOWS)
 155                 return &q->slots[val].dep;
 156         return &q->dep[val - SFQ_MAX_FLOWS];
 157 }
 158
 159 /*
 160  * In order to be able to quickly rehash our queue when timer changes
 161  * q->perturbation, we store flow_keys in skb->cb[]
 162  */
 163 struct sfq_skb_cb {
 164        struct flow_keys        keys;
 165 };
 166
 167 static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
 168 {
 169         qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
 170         return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
 171 }
 172
 173 static unsigned int sfq_hash(const struct sfq_sched_data *q,
 174                              const struct sk_buff *skb)
 175 {
 176         const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
 177         unsigned int hash;
 178
 179         hash = jhash_3words((__force u32)keys->dst,
 180                             (__force u32)keys->src ^ keys->ip_proto,
 181                             (__force u32)keys->ports, q->perturbation);
 182         return hash & (q->divisor - 1);
 183 }
 184
 185 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 186                                  int *qerr)
 187 {
 188         struct sfq_sched_data *q = qdisc_priv(sch);
 189         struct tcf_result res;
 190         struct tcf_proto *fl;
 191         int result;
 192
 193         if (TC_H_MAJ(skb->priority) == sch->handle &&
 194             TC_H_MIN(skb->priority) > 0 &&
 195             TC_H_MIN(skb->priority) <= q->divisor)
 196                 return TC_H_MIN(skb->priority);
 197
 198         fl = rcu_dereference_bh(q->filter_list);
 199         if (!fl) {
 200                 skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
 201                 return sfq_hash(q, skb) + 1;
 202         }
 203
 204         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 205         result = tc_classify(skb, fl, &res);
 206         if (result >= 0) {
 207 #ifdef CONFIG_NET_CLS_ACT
 208                 switch (result) {
 209                 case TC_ACT_STOLEN:
 210                 case TC_ACT_QUEUED:
 211                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 212                 case TC_ACT_SHOT:
 213                         return 0;
 214                 }
 215 #endif
 216                 if (TC_H_MIN(res.classid) <= q->divisor)
 217                         return TC_H_MIN(res.classid);
 218         }
 219         return 0;
 220 }
 221
 222 /*
 223  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 224  */
 225 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 226 {
 227         sfq_index p, n;
 228         struct sfq_slot *slot = &q->slots[x];
 229         int qlen = slot->qlen;
 230
 231         p = qlen + SFQ_MAX_FLOWS;
 232         n = q->dep[qlen].next;
 233
 234         slot->dep.next = n;
 235         slot->dep.prev = p;
 236
 237         q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
 238         sfq_dep_head(q, n)->prev = x;
 239 }
 240
 241 #define sfq_unlink(q, x, n, p)                  \
 242         do {                                    \
 243                 n = q->slots[x].dep.next;       \
 244                 p = q->slots[x].dep.prev;       \
 245                 sfq_dep_head(q, p)->next = n;   \
 246                 sfq_dep_head(q, n)->prev = p;   \
 247         } while (0)
 248
 249
 250 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 251 {
 252         sfq_index p, n;
 253         int d;
 254
 255         sfq_unlink(q, x, n, p);
 256
 257         d = q->slots[x].qlen--;
 258         if (n == p && q->cur_depth == d)
 259                 q->cur_depth--;
 260         sfq_link(q, x);
 261 }
 262
 263 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 264 {
 265         sfq_index p, n;
 266         int d;
 267
 268         sfq_unlink(q, x, n, p);
 269
 270         d = ++q->slots[x].qlen;
 271         if (q->cur_depth < d)
 272                 q->cur_depth = d;
 273         sfq_link(q, x);
 274 }
 275
 276 /* helper functions : might be changed when/if skb use a standard list_head */
 277
 278 /* remove one skb from tail of slot queue */
 279 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
 280 {
 281         struct sk_buff *skb = slot->skblist_prev;
 282
 283         slot->skblist_prev = skb->prev;
 284         skb->prev->next = (struct sk_buff *)slot;
 285         skb->next = skb->prev = NULL;
 286         return skb;
 287 }
 288
 289 /* remove one skb from head of slot queue */
 290 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
 291 {
 292         struct sk_buff *skb = slot->skblist_next;
 293
 294         slot->skblist_next = skb->next;
 295         skb->next->prev = (struct sk_buff *)slot;
 296         skb->next = skb->prev = NULL;
 297         return skb;
 298 }
 299
 300 static inline void slot_queue_init(struct sfq_slot *slot)
 301 {
 302         memset(slot, 0, sizeof(*slot));
 303         slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
 304 }
 305
 306 /* add skb to slot queue (tail add) */
 307 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
 308 {
 309         skb->prev = slot->skblist_prev;
 310         skb->next = (struct sk_buff *)slot;
 311         slot->skblist_prev->next = skb;
 312         slot->skblist_prev = skb;
 313 }
 314
 315 static unsigned int sfq_drop(struct Qdisc *sch)
 316 {
 317         struct sfq_sched_data *q = qdisc_priv(sch);
 318         sfq_index x, d = q->cur_depth;
 319         struct sk_buff *skb;
 320         unsigned int len;
 321         struct sfq_slot *slot;
 322
 323         /* Queue is full! Find the longest slot and drop tail packet from it */
 324         if (d > 1) {
 325                 x = q->dep[d].next;
 326                 slot = &q->slots[x];
 327 drop:
 328                 skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
 329                 len = qdisc_pkt_len(skb);
 330                 slot->backlog -= len;
 331                 sfq_dec(q, x);
 332                 kfree_skb(skb);
 333                 sch->q.qlen--;
 334                 qdisc_qstats_drop(sch);
 335                 qdisc_qstats_backlog_dec(sch, skb);
 336                 return len;
 337         }
 338
 339         if (d == 1) {
 340                 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
 341                 x = q->tail->next;
 342                 slot = &q->slots[x];
 343                 q->tail->next = slot->next;
 344                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 345                 goto drop;
 346         }
 347
 348         return 0;
 349 }
 350
 351 /* Is ECN parameter configured */
 352 static int sfq_prob_mark(const struct sfq_sched_data *q)
 353 {
 354         return q->flags & TC_RED_ECN;
 355 }
 356
 357 /* Should packets over max threshold just be marked */
 358 static int sfq_hard_mark(const struct sfq_sched_data *q)
 359 {
 360         return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
 361 }
 362
 363 static int sfq_headdrop(const struct sfq_sched_data *q)
 364 {
 365         return q->headdrop;
 366 }
 367
 368 static int
 369 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 370 {
 371         struct sfq_sched_data *q = qdisc_priv(sch);
 372         unsigned int hash;
 373         sfq_index x, qlen;
 374         struct sfq_slot *slot;
 375         int uninitialized_var(ret);
 376         struct sk_buff *head;
 377         int delta;
 378
 379         hash = sfq_classify(skb, sch, &ret);
 380         if (hash == 0) {
 381                 if (ret & __NET_XMIT_BYPASS)
 382                         qdisc_qstats_drop(sch);
 383                 kfree_skb(skb);
 384                 return ret;
 385         }
 386         hash--;
 387
 388         x = q->ht[hash];
 389         slot = &q->slots[x];
 390         if (x == SFQ_EMPTY_SLOT) {
 391                 x = q->dep[0].next; /* get a free slot */
 392                 if (x >= SFQ_MAX_FLOWS)
 393                         return qdisc_drop(skb, sch);
 394                 q->ht[hash] = x;
 395                 slot = &q->slots[x];
 396                 slot->hash = hash;
 397                 slot->backlog = 0; /* should already be 0 anyway... */
 398                 red_set_vars(&slot->vars);
 399                 goto enqueue;
 400         }
 401         if (q->red_parms) {
 402                 slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
 403                                                         &slot->vars,
 404                                                         slot->backlog);
 405                 switch (red_action(q->red_parms,
 406                                    &slot->vars,
 407                                    slot->vars.qavg)) {
 408                 case RED_DONT_MARK:
 409                         break;
 410
 411                 case RED_PROB_MARK:
 412                         qdisc_qstats_overlimit(sch);
 413                         if (sfq_prob_mark(q)) {
 414                                 /* We know we have at least one packet in queue */
 415                                 if (sfq_headdrop(q) &&
 416                                     INET_ECN_set_ce(slot->skblist_next)) {
 417                                         q->stats.prob_mark_head++;
 418                                         break;
 419                                 }
 420                                 if (INET_ECN_set_ce(skb)) {
 421                                         q->stats.prob_mark++;
 422                                         break;
 423                                 }
 424                         }
 425                         q->stats.prob_drop++;
 426                         goto congestion_drop;
 427
 428                 case RED_HARD_MARK:
 429                         qdisc_qstats_overlimit(sch);
 430                         if (sfq_hard_mark(q)) {
 431                                 /* We know we have at least one packet in queue */
 432                                 if (sfq_headdrop(q) &&
 433                                     INET_ECN_set_ce(slot->skblist_next)) {
 434                                         q->stats.forced_mark_head++;
 435                                         break;
 436                                 }
 437                                 if (INET_ECN_set_ce(skb)) {
 438                                         q->stats.forced_mark++;
 439                                         break;
 440                                 }
 441                         }
 442                         q->stats.forced_drop++;
 443                         goto congestion_drop;
 444                 }
 445         }
 446
 447         if (slot->qlen >= q->maxdepth) {
 448 congestion_drop:
 449                 if (!sfq_headdrop(q))
 450                         return qdisc_drop(skb, sch);
 451
 452                 /* We know we have at least one packet in queue */
 453                 head = slot_dequeue_head(slot);
 454                 delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
 455                 sch->qstats.backlog -= delta;
 456                 slot->backlog -= delta;
 457                 qdisc_drop(head, sch);
 458
 459                 slot_queue_add(slot, skb);
 460                 return NET_XMIT_CN;
 461         }
 462
 463 enqueue:
 464         qdisc_qstats_backlog_inc(sch, skb);
 465         slot->backlog += qdisc_pkt_len(skb);
 466         slot_queue_add(slot, skb);
 467         sfq_inc(q, x);
 468         if (slot->qlen == 1) {          /* The flow is new */
 469                 if (q->tail == NULL) {  /* It is the first flow */
 470                         slot->next = x;
 471                 } else {
 472                         slot->next = q->tail->next;
 473                         q->tail->next = x;
 474                 }
 475                 /* We put this flow at the end of our flow list.
 476                  * This might sound unfair for a new flow to wait after old ones,
 477                  * but we could endup servicing new flows only, and freeze old ones.
 478                  */
 479                 q->tail = slot;
 480                 /* We could use a bigger initial quantum for new flows */
 481                 slot->allot = q->scaled_quantum;
 482         }
 483         if (++sch->q.qlen <= q->limit)
 484                 return NET_XMIT_SUCCESS;
 485
 486         qlen = slot->qlen;
 487         sfq_drop(sch);
 488         /* Return Congestion Notification only if we dropped a packet
 489          * from this flow.
 490          */
 491         if (qlen != slot->qlen)
 492                 return NET_XMIT_CN;
 493
 494         /* As we dropped a packet, better let upper stack know this */
 495         qdisc_tree_decrease_qlen(sch, 1);
 496         return NET_XMIT_SUCCESS;
 497 }
 498
 499 static struct sk_buff *
 500 sfq_dequeue(struct Qdisc *sch)
 501 {
 502         struct sfq_sched_data *q = qdisc_priv(sch);
 503         struct sk_buff *skb;
 504         sfq_index a, next_a;
 505         struct sfq_slot *slot;
 506
 507         /* No active slots */
 508         if (q->tail == NULL)
 509                 return NULL;
 510
 511 next_slot:
 512         a = q->tail->next;
 513         slot = &q->slots[a];
 514         if (slot->allot <= 0) {
 515                 q->tail = slot;
 516                 slot->allot += q->scaled_quantum;
 517                 goto next_slot;
 518         }
 519         skb = slot_dequeue_head(slot);
 520         sfq_dec(q, a);
 521         qdisc_bstats_update(sch, skb);
 522         sch->q.qlen--;
 523         qdisc_qstats_backlog_dec(sch, skb);
 524         slot->backlog -= qdisc_pkt_len(skb);
 525         /* Is the slot empty? */
 526         if (slot->qlen == 0) {
 527                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 528                 next_a = slot->next;
 529                 if (a == next_a) {
 530                         q->tail = NULL; /* no more active slots */
 531                         return skb;
 532                 }
 533                 q->tail->next = next_a;
 534         } else {
 535                 slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
 536         }
 537         return skb;
 538 }
 539
 540 static void
 541 sfq_reset(struct Qdisc *sch)
 542 {
 543         struct sk_buff *skb;
 544
 545         while ((skb = sfq_dequeue(sch)) != NULL)
 546                 kfree_skb(skb);
 547 }
 548
 549 /*
 550  * When q->perturbation is changed, we rehash all queued skbs
 551  * to avoid OOO (Out Of Order) effects.
 552  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
 553  * counters.
 554  */
 555 static void sfq_rehash(struct Qdisc *sch)
 556 {
 557         struct sfq_sched_data *q = qdisc_priv(sch);
 558         struct sk_buff *skb;
 559         int i;
 560         struct sfq_slot *slot;
 561         struct sk_buff_head list;
 562         int dropped = 0;
 563
 564         __skb_queue_head_init(&list);
 565
 566         for (i = 0; i < q->maxflows; i++) {
 567                 slot = &q->slots[i];
 568                 if (!slot->qlen)
 569                         continue;
 570                 while (slot->qlen) {
 571                         skb = slot_dequeue_head(slot);
 572                         sfq_dec(q, i);
 573                         __skb_queue_tail(&list, skb);
 574                 }
 575                 slot->backlog = 0;
 576                 red_set_vars(&slot->vars);
 577                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 578         }
 579         q->tail = NULL;
 580
 581         while ((skb = __skb_dequeue(&list)) != NULL) {
 582                 unsigned int hash = sfq_hash(q, skb);
 583                 sfq_index x = q->ht[hash];
 584
 585                 slot = &q->slots[x];
 586                 if (x == SFQ_EMPTY_SLOT) {
 587                         x = q->dep[0].next; /* get a free slot */
 588                         if (x >= SFQ_MAX_FLOWS) {
 589 drop:
 590                                 qdisc_qstats_backlog_dec(sch, skb);
 591                                 kfree_skb(skb);
 592                                 dropped++;
 593                                 continue;
 594                         }
 595                         q->ht[hash] = x;
 596                         slot = &q->slots[x];
 597                         slot->hash = hash;
 598                 }
 599                 if (slot->qlen >= q->maxdepth)
 600                         goto drop;
 601                 slot_queue_add(slot, skb);
 602                 if (q->red_parms)
 603                         slot->vars.qavg = red_calc_qavg(q->red_parms,
 604                                                         &slot->vars,
 605                                                         slot->backlog);
 606                 slot->backlog += qdisc_pkt_len(skb);
 607                 sfq_inc(q, x);
 608                 if (slot->qlen == 1) {          /* The flow is new */
 609                         if (q->tail == NULL) {  /* It is the first flow */
 610                                 slot->next = x;
 611                         } else {
 612                                 slot->next = q->tail->next;
 613                                 q->tail->next = x;
 614                         }
 615                         q->tail = slot;
 616                         slot->allot = q->scaled_quantum;
 617                 }
 618         }
 619         sch->q.qlen -= dropped;
 620         qdisc_tree_decrease_qlen(sch, dropped);
 621 }
 622
 623 static void sfq_perturbation(unsigned long arg)
 624 {
 625         struct Qdisc *sch = (struct Qdisc *)arg;
 626         struct sfq_sched_data *q = qdisc_priv(sch);
 627         spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 628
 629         spin_lock(root_lock);
 630         q->perturbation = prandom_u32();
 631         if (!q->filter_list && q->tail)
 632                 sfq_rehash(sch);
 633         spin_unlock(root_lock);
 634
 635         if (q->perturb_period)
 636                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 637 }
 638
 639 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 640 {
 641         struct sfq_sched_data *q = qdisc_priv(sch);
 642         struct tc_sfq_qopt *ctl = nla_data(opt);
 643         struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
 644         unsigned int qlen;
 645         struct red_parms *p = NULL;
 646
 647         if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 648                 return -EINVAL;
 649         if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
 650                 ctl_v1 = nla_data(opt);
 651         if (ctl->divisor &&
 652             (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
 653                 return -EINVAL;
 654         if (ctl_v1 && ctl_v1->qth_min) {
 655                 p = kmalloc(sizeof(*p), GFP_KERNEL);
 656                 if (!p)
 657                         return -ENOMEM;
 658         }
 659         sch_tree_lock(sch);
 660         if (ctl->quantum) {
 661                 q->quantum = ctl->quantum;
 662                 q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 663         }
 664         q->perturb_period = ctl->perturb_period * HZ;
 665         if (ctl->flows)
 666                 q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
 667         if (ctl->divisor) {
 668                 q->divisor = ctl->divisor;
 669                 q->maxflows = min_t(u32, q->maxflows, q->divisor);
 670         }
 671         if (ctl_v1) {
 672                 if (ctl_v1->depth)
 673                         q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
 674                 if (p) {
 675                         swap(q->red_parms, p);
 676                         red_set_parms(q->red_parms,
 677                                       ctl_v1->qth_min, ctl_v1->qth_max,
 678                                       ctl_v1->Wlog,
 679                                       ctl_v1->Plog, ctl_v1->Scell_log,
 680                                       NULL,
 681                                       ctl_v1->max_P);
 682                 }
 683                 q->flags = ctl_v1->flags;
 684                 q->headdrop = ctl_v1->headdrop;
 685         }
 686         if (ctl->limit) {
 687                 q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
 688                 q->maxflows = min_t(u32, q->maxflows, q->limit);
 689         }
 690
 691         qlen = sch->q.qlen;
 692         while (sch->q.qlen > q->limit)
 693                 sfq_drop(sch);
 694         qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
 695
 696         del_timer(&q->perturb_timer);
 697         if (q->perturb_period) {
 698                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 699                 q->perturbation = prandom_u32();
 700         }
 701         sch_tree_unlock(sch);
 702         kfree(p);
 703         return 0;
 704 }
 705
 706 static void *sfq_alloc(size_t sz)
 707 {
 708         void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
 709
 710         if (!ptr)
 711                 ptr = vmalloc(sz);
 712         return ptr;
 713 }
 714
 715 static void sfq_free(void *addr)
 716 {
 717         kvfree(addr);
 718 }
 719
 720 static void sfq_destroy(struct Qdisc *sch)
 721 {
 722         struct sfq_sched_data *q = qdisc_priv(sch);
 723
 724         tcf_destroy_chain(&q->filter_list);
 725         q->perturb_period = 0;
 726         del_timer_sync(&q->perturb_timer);
 727         sfq_free(q->ht);
 728         sfq_free(q->slots);
 729         kfree(q->red_parms);
 730 }
 731
 732 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 733 {
 734         struct sfq_sched_data *q = qdisc_priv(sch);
 735         int i;
 736
 737         q->perturb_timer.function = sfq_perturbation;
 738         q->perturb_timer.data = (unsigned long)sch;
 739         init_timer_deferrable(&q->perturb_timer);
 740
 741         for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
 742                 q->dep[i].next = i + SFQ_MAX_FLOWS;
 743                 q->dep[i].prev = i + SFQ_MAX_FLOWS;
 744         }
 745
 746         q->limit = SFQ_MAX_DEPTH;
 747         q->maxdepth = SFQ_MAX_DEPTH;
 748         q->cur_depth = 0;
 749         q->tail = NULL;
 750         q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
 751         q->maxflows = SFQ_DEFAULT_FLOWS;
 752         q->quantum = psched_mtu(qdisc_dev(sch));
 753         q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 754         q->perturb_period = 0;
 755         q->perturbation = prandom_u32();
 756
 757         if (opt) {
 758                 int err = sfq_change(sch, opt);
 759                 if (err)
 760                         return err;
 761         }
 762
 763         q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
 764         q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
 765         if (!q->ht || !q->slots) {
 766                 sfq_destroy(sch);
 767                 return -ENOMEM;
 768         }
 769         for (i = 0; i < q->divisor; i++)
 770                 q->ht[i] = SFQ_EMPTY_SLOT;
 771
 772         for (i = 0; i < q->maxflows; i++) {
 773                 slot_queue_init(&q->slots[i]);
 774                 sfq_link(q, i);
 775         }
 776         if (q->limit >= 1)
 777                 sch->flags |= TCQ_F_CAN_BYPASS;
 778         else
 779                 sch->flags &= ~TCQ_F_CAN_BYPASS;
 780         return 0;
 781 }
 782
 783 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 784 {
 785         struct sfq_sched_data *q = qdisc_priv(sch);
 786         unsigned char *b = skb_tail_pointer(skb);
 787         struct tc_sfq_qopt_v1 opt;
 788         struct red_parms *p = q->red_parms;
 789
 790         memset(&opt, 0, sizeof(opt));
 791         opt.v0.quantum  = q->quantum;
 792         opt.v0.perturb_period = q->perturb_period / HZ;
 793         opt.v0.limit    = q->limit;
 794         opt.v0.divisor  = q->divisor;
 795         opt.v0.flows    = q->maxflows;
 796         opt.depth       = q->maxdepth;
 797         opt.headdrop    = q->headdrop;
 798
 799         if (p) {
 800                 opt.qth_min     = p->qth_min >> p->Wlog;
 801                 opt.qth_max     = p->qth_max >> p->Wlog;
 802                 opt.Wlog        = p->Wlog;
 803                 opt.Plog        = p->Plog;
 804                 opt.Scell_log   = p->Scell_log;
 805                 opt.max_P       = p->max_P;
 806         }
 807         memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
 808         opt.flags       = q->flags;
 809
 810         if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
 811                 goto nla_put_failure;
 812
 813         return skb->len;
 814
 815 nla_put_failure:
 816         nlmsg_trim(skb, b);
 817         return -1;
 818 }
 819
 820 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
 821 {
 822         return NULL;
 823 }
 824
 825 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 826 {
 827         return 0;
 828 }
 829
 830 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 831                               u32 classid)
 832 {
 833         /* we cannot bypass queue discipline anymore */
 834         sch->flags &= ~TCQ_F_CAN_BYPASS;
 835         return 0;
 836 }
 837
 838 static void sfq_put(struct Qdisc *q, unsigned long cl)
 839 {
 840 }
 841
 842 static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
 843                                              unsigned long cl)
 844 {
 845         struct sfq_sched_data *q = qdisc_priv(sch);
 846
 847         if (cl)
 848                 return NULL;
 849         return &q->filter_list;
 850 }
 851
 852 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
 853                           struct sk_buff *skb, struct tcmsg *tcm)
 854 {
 855         tcm->tcm_handle |= TC_H_MIN(cl);
 856         return 0;
 857 }
 858
 859 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 860                                 struct gnet_dump *d)
 861 {
 862         struct sfq_sched_data *q = qdisc_priv(sch);
 863         sfq_index idx = q->ht[cl - 1];
 864         struct gnet_stats_queue qs = { 0 };
 865         struct tc_sfq_xstats xstats = { 0 };
 866
 867         if (idx != SFQ_EMPTY_SLOT) {
 868                 const struct sfq_slot *slot = &q->slots[idx];
 869
 870                 xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
 871                 qs.qlen = slot->qlen;
 872                 qs.backlog = slot->backlog;
 873         }
 874         if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
 875                 return -1;
 876         return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
 877 }
 878
 879 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 880 {
 881         struct sfq_sched_data *q = qdisc_priv(sch);
 882         unsigned int i;
 883
 884         if (arg->stop)
 885                 return;
 886
 887         for (i = 0; i < q->divisor; i++) {
 888                 if (q->ht[i] == SFQ_EMPTY_SLOT ||
 889                     arg->count < arg->skip) {
 890                         arg->count++;
 891                         continue;
 892                 }
 893                 if (arg->fn(sch, i + 1, arg) < 0) {
 894                         arg->stop = 1;
 895                         break;
 896                 }
 897                 arg->count++;
 898         }
 899 }
 900
 901 static const struct Qdisc_class_ops sfq_class_ops = {
 902         .leaf           =       sfq_leaf,
 903         .get            =       sfq_get,
 904         .put            =       sfq_put,
 905         .tcf_chain      =       sfq_find_tcf,
 906         .bind_tcf       =       sfq_bind,
 907         .unbind_tcf     =       sfq_put,
 908         .dump           =       sfq_dump_class,
 909         .dump_stats     =       sfq_dump_class_stats,
 910         .walk           =       sfq_walk,
 911 };
 912
 913 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
 914         .cl_ops         =       &sfq_class_ops,
 915         .id             =       "sfq",
 916         .priv_size      =       sizeof(struct sfq_sched_data),
 917         .enqueue        =       sfq_enqueue,
 918         .dequeue        =       sfq_dequeue,
 919         .peek           =       qdisc_peek_dequeued,
 920         .drop           =       sfq_drop,
 921         .init           =       sfq_init,
 922         .reset          =       sfq_reset,
 923         .destroy        =       sfq_destroy,
 924         .change         =       NULL,
 925         .dump           =       sfq_dump,
 926         .owner          =       THIS_MODULE,
 927 };
 928
 929 static int __init sfq_module_init(void)
 930 {
 931         return register_qdisc(&sfq_qdisc_ops);
 932 }
 933 static void __exit sfq_module_exit(void)
 934 {
 935         unregister_qdisc(&sfq_qdisc_ops);
 936 }
 937 module_init(sfq_module_init)
 938 module_exit(sfq_module_exit)
 939 MODULE_LICENSE("GPL");