/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>
#include <net/red.h>

/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.

	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.

	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive. This means that this scheduler
	should be used as a leaf of CBQ or P3, which puts interactive
	traffic in a higher priority band.

	We still need true WFQ for the top level CSZ, but using WFQ
	for best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits:
	- maximal queue length per flow to 127 packets;
	- max mtu to 2^18-1;
	- max 65408 flows;
	- number of hash buckets to 65536.

	It is easy to increase these values, but not in flight.  */

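/*
 * Example configuration (illustrative only; the device name is an
 * arbitrary assumption): SFQ is typically attached with the tc utility,
 * e.g. as root qdisc with the hash re-seeded every 10 seconds:
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 */
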
#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K.
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)

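/*
 * Worked example of the scaling above: on a standard Ethernet device the
 * default quantum is one full frame, 1514 bytes (see sfq_init() below),
 * so scaled_quantum = SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8) = 190.
 * Dequeuing a 1514 byte packet then costs the slot 190 allot units,
 * i.e. each flow sends roughly one quantum of bytes per round.
 */
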
/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq RR chain */
	struct sfq_head	dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	short		allot; /* credit for this slot */

	unsigned int	backlog;
	struct red_vars	vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	u32		perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	unsigned short	scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
	struct tcf_proto *filter_list;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
};

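/*
 * Example of the dep[] bookkeeping above: a slot holding 3 packets sits
 * on the dep[3] list, an empty slot sits on dep[0] (which is where
 * sfq_enqueue() finds free slots via q->dep[0].next), and
 * q->dep[q->cur_depth].next always names a longest slot, which
 * sfq_drop() uses to pick its victim.
 */
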
/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}

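/*
 * For instance, sfq_dep_head(q, 5) yields &q->slots[5].dep, while
 * sfq_dep_head(q, SFQ_MAX_FLOWS + 2) yields &q->dep[2], the head of the
 * list of slots currently holding exactly two packets.
 */
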
/*
 * In order to be able to quickly rehash our queue when timer changes
 * q->perturbation, we store flow_keys in skb->cb[]
 */
struct sfq_skb_cb {
	struct flow_keys	keys;
};

static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(skb->cb) <
		     sizeof(struct qdisc_skb_cb) + sizeof(struct sfq_skb_cb));
	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
}

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
	unsigned int hash;

	hash = jhash_3words((__force u32)keys->dst,
			    (__force u32)keys->src ^ keys->ip_proto,
			    (__force u32)keys->ports, q->perturbation);
	return hash & (q->divisor - 1);
}

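/*
 * Note that 'hash & (q->divisor - 1)' only works as a modulo because
 * sfq_change() rejects divisors that are not powers of two. E.g. with
 * the default divisor of 1024 the mask is 0x3ff, keeping the low 10
 * bits of the jhash result.
 */
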
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list) {
		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
		return sfq_hash(q, skb) + 1;
	}

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}

/* helper functions : might be changed when/if skb use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}

#define slot_queue_walk(slot, skb)		\
	for (skb = slot->skblist_next;		\
	     skb != (struct sk_buff *)slot;	\
	     skb = skb->next)

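/*
 * Illustrative use of slot_queue_walk() (not called anywhere in this
 * file): summing the bytes queued in one slot.
 *
 *	struct sk_buff *skb;
 *	unsigned int bytes = 0;
 *
 *	slot_queue_walk(slot, skb)
 *		bytes += qdisc_pkt_len(skb);
 */
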
static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		kfree_skb(skb);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is ECN parameter configured */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max threshold just be marked */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}

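/*
 * Resulting behaviour of the flag combinations tested above:
 *
 *	q->flags			prob_mark	hard_mark
 *	0				no		no
 *	TC_RED_ECN			yes		yes
 *	TC_RED_ECN | TC_RED_HARDDROP	yes		no
 *	TC_RED_HARDDROP			no		no
 */
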
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int uninitialized_var(ret);
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							     &slot->vars,
							     slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			sch->qstats.overlimits++;
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			sch->qstats.overlimits++;
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch);

		slot_queue_add(slot, skb);
		return NET_XMIT_CN;
	}

enqueue:
	sch->qstats.backlog += qdisc_pkt_len(skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
			q->tail = slot;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	sfq_drop(sch);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen)
		return NET_XMIT_CN;

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_decrease_qlen(sch, 1);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:				sch->qstats.backlog -= qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->scaled_quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_decrease_qlen(sch, dropped);
}

static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

	spin_lock(root_lock);
	q->perturbation = net_random();
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen;
	struct red_parms *p = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum) {
		q->quantum = ctl->quantum;
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	}
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}

static void *sfq_alloc(size_t sz)
{
	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);

	if (!ptr)
		ptr = vmalloc(sz);
	return ptr;
}

static void sfq_free(void *addr)
{
	if (addr) {
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = 0;
	q->perturbation = net_random();

	if (opt) {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		sfq_destroy(sch);
		return -ENOMEM;
	}
	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}

static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}

module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");