net/sched/sch_hhf.c

   1 /* net/sched/sch_hhf.c          Heavy-Hitter Filter (HHF)
   2  *
   3  * Copyright (C) 2013 Terry Lam <vtlam@google.com>
   4  * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
   5  */
   6
   7 #include <linux/jhash.h>
   8 #include <linux/jiffies.h>
   9 #include <linux/module.h>
  10 #include <linux/skbuff.h>
  11 #include <linux/vmalloc.h>
  12 #include <net/flow_keys.h>
  13 #include <net/pkt_sched.h>
  14 #include <net/sock.h>
  15
  16 /*      Heavy-Hitter Filter (HHF)
  17  *
  18  * Principles :
  19  * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter
  20  * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified
  21  * as heavy-hitter, it is immediately switched to the heavy-hitter bucket.
  22  * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler,
  23  * in which the heavy-hitter bucket is served with less weight.
  24  * In other words, non-heavy-hitters (e.g., short bursts of critical traffic)
  25  * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have
  26  * higher share of bandwidth.
  27  *
  28  * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the
  29  * following paper:
  30  * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and
  31  * Accounting", in ACM SIGCOMM, 2002.
  32  *
  33  * Conceptually, a multi-stage filter comprises k independent hash functions
  34  * and k counter arrays. Packets are indexed into k counter arrays by k hash
  35  * functions, respectively. The counters are then increased by the packet sizes.
  36  * Therefore,
  37  *    - For a heavy-hitter flow: *all* of its k array counters must be large.
  38  *    - For a non-heavy-hitter flow: some of its k array counters can be large
  39  *      due to hash collision with other small flows; however, with high
  40  *      probability, not *all* k counters are large.
  41  *
  42  * By the design of the multi-stage filter algorithm, the false negative rate
  43  * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is
  44  * susceptible to false positives (non-heavy-hitters mistakenly classified as
  45  * heavy-hitters).
  46  * Therefore, we also implement the following optimizations to reduce false
  47  * positives by avoiding unnecessary increment of the counter values:
  48  *    - Optimization O1: once a heavy-hitter is identified, its bytes are not
  49  *        accounted in the array counters. This technique is called "shielding"
  50  *        in Section 3.3.1 of [EV02].
  51  *    - Optimization O2: conservative update of counters
  52  *                       (Section 3.3.2 of [EV02]),
  53  *        New counter value = max {old counter value,
  54  *                                 smallest counter value + packet bytes}
  55  *
  56  * Finally, we refresh the counters periodically since otherwise the counter
  57  * values will keep accumulating.
  58  *
  59  * Once a flow is classified as heavy-hitter, we also save its per-flow state
  60  * in an exact-matching flow table so that its subsequent packets can be
  61  * dispatched to the heavy-hitter bucket accordingly.
  62  *
  63  *
  64  * At a high level, this qdisc works as follows:
  65  * Given a packet p:
  66  *   - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching
  67  *     heavy-hitter flow table, denoted table T, then send p to the heavy-hitter
  68  *     bucket.
  69  *   - Otherwise, forward p to the multi-stage filter, denoted filter F
  70  *        + If F decides that p belongs to a non-heavy-hitter flow, then send p
  71  *          to the non-heavy-hitter bucket.
  72  *        + Otherwise, if F decides that p belongs to a new heavy-hitter flow,
  73  *          then set up a new flow entry for the flow-id of p in the table T and
  74  *          send p to the heavy-hitter bucket.
  75  *
  76  * In this implementation:
  77  *   - T is a fixed-size hash-table with 1024 entries. Hash collision is
  78  *     resolved by linked-list chaining.
  79  *   - F has four counter arrays, each array containing 1024 32-bit counters.
  80  *     That means 4 * 1024 * 32 bits = 16KB of memory.
  81  *   - Since each array in F contains 1024 counters, 10 bits are sufficient to
  82  *     index into each array.
  83  *     Hence, instead of having four hash functions, we chop the 32-bit
  84  *     skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is
  85  *     computed as XOR sum of those three chunks.
  86  *   - We need to clear the counter arrays periodically; however, directly
  87  *     memsetting 16KB of memory can lead to cache eviction and unwanted delay.
  88  *     So by representing each counter by a valid bit, we only need to reset
  89  *     4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory.
  90  *   - The Deficit Round Robin engine is taken from fq_codel implementation
  91  *     (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to
  92  *     fq_codel_flow in fq_codel implementation.
  93  *
  94  */
  95
  96 /* Non-configurable parameters */
  97 #define HH_FLOWS_CNT     1024  /* number of entries in exact-matching table T */
  98 #define HHF_ARRAYS_CNT   4     /* number of arrays in multi-stage filter F */
  99 #define HHF_ARRAYS_LEN   1024  /* number of counters in each array of F */
 100 #define HHF_BIT_MASK_LEN 10    /* masking 10 bits */
 101 #define HHF_BIT_MASK     0x3FF /* bitmask of 10 bits */
 102
 103 #define WDRR_BUCKET_CNT  2     /* two buckets for Weighted DRR */
 104 enum wdrr_bucket_idx {
 105         WDRR_BUCKET_FOR_HH      = 0, /* bucket id for heavy-hitters */
 106         WDRR_BUCKET_FOR_NON_HH  = 1  /* bucket id for non-heavy-hitters */
 107 };
 108
 109 #define hhf_time_before(a, b)   \
 110         (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0))
 111
 112 /* Heavy-hitter per-flow state */
 113 struct hh_flow_state {
 114         u32              hash_id;       /* hash of flow-id (e.g. TCP 5-tuple) */
 115         u32              hit_timestamp; /* last time heavy-hitter was seen */
 116         struct list_head flowchain;     /* chaining under hash collision */
 117 };
 118
 119 /* Weighted Deficit Round Robin (WDRR) scheduler */
 120 struct wdrr_bucket {
 121         struct sk_buff    *head;
 122         struct sk_buff    *tail;
 123         struct list_head  bucketchain;
 124         int               deficit;
 125 };
 126
 127 struct hhf_sched_data {
 128         struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
 129         u32                perturbation;   /* hash perturbation */
 130         u32                quantum;        /* psched_mtu(qdisc_dev(sch)); */
 131         u32                drop_overlimit; /* number of times max qdisc packet
 132                                             * limit was hit
 133                                             */
 134         struct list_head   *hh_flows;       /* table T (currently active HHs) */
 135         u32                hh_flows_limit;            /* max active HH allocs */
 136         u32                hh_flows_overlimit; /* num of disallowed HH allocs */
 137         u32                hh_flows_total_cnt;          /* total admitted HHs */
 138         u32                hh_flows_current_cnt;        /* total current HHs  */
 139         u32                *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */
 140         u32                hhf_arrays_reset_timestamp;  /* last time hhf_arrays
 141                                                          * was reset
 142                                                          */
 143         unsigned long      *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits
 144                                                              * of hhf_arrays
 145                                                              */
 146         /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */
 147         struct list_head   new_buckets; /* list of new buckets */
 148         struct list_head   old_buckets; /* list of old buckets */
 149
 150         /* Configurable HHF parameters */
 151         u32                hhf_reset_timeout; /* interval to reset counter
 152                                                * arrays in filter F
 153                                                * (default 40ms)
 154                                                */
 155         u32                hhf_admit_bytes;   /* counter thresh to classify as
 156                                                * HH (default 128KB).
 157                                                * With these default values,
 158                                                * 128KB / 40ms = 25 Mbps
 159                                                * i.e., we expect to capture HHs
 160                                                * sending > 25 Mbps.
 161                                                */
 162         u32                hhf_evict_timeout; /* aging threshold to evict idle
 163                                                * HHs out of table T. This should
 164                                                * be large enough to avoid
 165                                                * reordering during HH eviction.
 166                                                * (default 1s)
 167                                                */
 168         u32                hhf_non_hh_weight; /* WDRR weight for non-HHs
 169                                                * (default 2,
 170                                                *  i.e., non-HH : HH = 2 : 1)
 171                                                */
 172 };
 173
 174 static u32 hhf_time_stamp(void)
 175 {
 176         return jiffies;
 177 }
 178
 179 static unsigned int skb_hash(const struct hhf_sched_data *q,
 180                              const struct sk_buff *skb)
 181 {
 182         struct flow_keys keys;
 183         unsigned int hash;
 184
 185         if (skb->sk && skb->sk->sk_hash)
 186                 return skb->sk->sk_hash;
 187
 188         skb_flow_dissect(skb, &keys);
 189         hash = jhash_3words((__force u32)keys.dst,
 190                             (__force u32)keys.src ^ keys.ip_proto,
 191                             (__force u32)keys.ports, q->perturbation);
 192         return hash;
 193 }
 194
 195 /* Looks up a heavy-hitter flow in a chaining list of table T. */
 196 static struct hh_flow_state *seek_list(const u32 hash,
 197                                        struct list_head *head,
 198                                        struct hhf_sched_data *q)
 199 {
 200         struct hh_flow_state *flow, *next;
 201         u32 now = hhf_time_stamp();
 202
 203         if (list_empty(head))
 204                 return NULL;
 205
 206         list_for_each_entry_safe(flow, next, head, flowchain) {
 207                 u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
 208
 209                 if (hhf_time_before(prev, now)) {
 210                         /* Delete expired heavy-hitters, but preserve one entry
 211                          * to avoid kzalloc() when next time this slot is hit.
 212                          */
 213                         if (list_is_last(&flow->flowchain, head))
 214                                 return NULL;
 215                         list_del(&flow->flowchain);
 216                         kfree(flow);
 217                         q->hh_flows_current_cnt--;
 218                 } else if (flow->hash_id == hash) {
 219                         return flow;
 220                 }
 221         }
 222         return NULL;
 223 }
 224
 225 /* Returns a flow state entry for a new heavy-hitter.  Either reuses an expired
 226  * entry or dynamically alloc a new entry.
 227  */
 228 static struct hh_flow_state *alloc_new_hh(struct list_head *head,
 229                                           struct hhf_sched_data *q)
 230 {
 231         struct hh_flow_state *flow;
 232         u32 now = hhf_time_stamp();
 233
 234         if (!list_empty(head)) {
 235                 /* Find an expired heavy-hitter flow entry. */
 236                 list_for_each_entry(flow, head, flowchain) {
 237                         u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
 238
 239                         if (hhf_time_before(prev, now))
 240                                 return flow;
 241                 }
 242         }
 243
 244         if (q->hh_flows_current_cnt >= q->hh_flows_limit) {
 245                 q->hh_flows_overlimit++;
 246                 return NULL;
 247         }
 248         /* Create new entry. */
 249         flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC);
 250         if (!flow)
 251                 return NULL;
 252
 253         q->hh_flows_current_cnt++;
 254         INIT_LIST_HEAD(&flow->flowchain);
 255         list_add_tail(&flow->flowchain, head);
 256
 257         return flow;
 258 }
 259
 260 /* Assigns packets to WDRR buckets.  Implements a multi-stage filter to
 261  * classify heavy-hitters.
 262  */
 263 static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
 264 {
 265         struct hhf_sched_data *q = qdisc_priv(sch);
 266         u32 tmp_hash, hash;
 267         u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos;
 268         struct hh_flow_state *flow;
 269         u32 pkt_len, min_hhf_val;
 270         int i;
 271         u32 prev;
 272         u32 now = hhf_time_stamp();
 273
 274         /* Reset the HHF counter arrays if this is the right time. */
 275         prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout;
 276         if (hhf_time_before(prev, now)) {
 277                 for (i = 0; i < HHF_ARRAYS_CNT; i++)
 278                         bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN);
 279                 q->hhf_arrays_reset_timestamp = now;
 280         }
 281
 282         /* Get hashed flow-id of the skb. */
 283         hash = skb_hash(q, skb);
 284
 285         /* Check if this packet belongs to an already established HH flow. */
 286         flow_pos = hash & HHF_BIT_MASK;
 287         flow = seek_list(hash, &q->hh_flows[flow_pos], q);
 288         if (flow) { /* found its HH flow */
 289                 flow->hit_timestamp = now;
 290                 return WDRR_BUCKET_FOR_HH;
 291         }
 292
 293         /* Now pass the packet through the multi-stage filter. */
 294         tmp_hash = hash;
 295         xorsum = 0;
 296         for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) {
 297                 /* Split the skb_hash into three 10-bit chunks. */
 298                 filter_pos[i] = tmp_hash & HHF_BIT_MASK;
 299                 xorsum ^= filter_pos[i];
 300                 tmp_hash >>= HHF_BIT_MASK_LEN;
 301         }
 302         /* The last chunk is computed as XOR sum of other chunks. */
 303         filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash;
 304
 305         pkt_len = qdisc_pkt_len(skb);
 306         min_hhf_val = ~0U;
 307         for (i = 0; i < HHF_ARRAYS_CNT; i++) {
 308                 u32 val;
 309
 310                 if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) {
 311                         q->hhf_arrays[i][filter_pos[i]] = 0;
 312                         __set_bit(filter_pos[i], q->hhf_valid_bits[i]);
 313                 }
 314
 315                 val = q->hhf_arrays[i][filter_pos[i]] + pkt_len;
 316                 if (min_hhf_val > val)
 317                         min_hhf_val = val;
 318         }
 319
 320         /* Found a new HH iff all counter values > HH admit threshold. */
 321         if (min_hhf_val > q->hhf_admit_bytes) {
 322                 /* Just captured a new heavy-hitter. */
 323                 flow = alloc_new_hh(&q->hh_flows[flow_pos], q);
 324                 if (!flow) /* memory alloc problem */
 325                         return WDRR_BUCKET_FOR_NON_HH;
 326                 flow->hash_id = hash;
 327                 flow->hit_timestamp = now;
 328                 q->hh_flows_total_cnt++;
 329
 330                 /* By returning without updating counters in q->hhf_arrays,
 331                  * we implicitly implement "shielding" (see Optimization O1).
 332                  */
 333                 return WDRR_BUCKET_FOR_HH;
 334         }
 335
 336         /* Conservative update of HHF arrays (see Optimization O2). */
 337         for (i = 0; i < HHF_ARRAYS_CNT; i++) {
 338                 if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val)
 339                         q->hhf_arrays[i][filter_pos[i]] = min_hhf_val;
 340         }
 341         return WDRR_BUCKET_FOR_NON_HH;
 342 }
 343
 344 /* Removes one skb from head of bucket. */
 345 static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
 346 {
 347         struct sk_buff *skb = bucket->head;
 348
 349         bucket->head = skb->next;
 350         skb->next = NULL;
 351         return skb;
 352 }
 353
 354 /* Tail-adds skb to bucket. */
 355 static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
 356 {
 357         if (bucket->head == NULL)
 358                 bucket->head = skb;
 359         else
 360                 bucket->tail->next = skb;
 361         bucket->tail = skb;
 362         skb->next = NULL;
 363 }
 364
 365 static unsigned int hhf_drop(struct Qdisc *sch)
 366 {
 367         struct hhf_sched_data *q = qdisc_priv(sch);
 368         struct wdrr_bucket *bucket;
 369
 370         /* Always try to drop from heavy-hitters first. */
 371         bucket = &q->buckets[WDRR_BUCKET_FOR_HH];
 372         if (!bucket->head)
 373                 bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH];
 374
 375         if (bucket->head) {
 376                 struct sk_buff *skb = dequeue_head(bucket);
 377
 378                 sch->q.qlen--;
 379                 sch->qstats.drops++;
 380                 sch->qstats.backlog -= qdisc_pkt_len(skb);
 381                 kfree_skb(skb);
 382         }
 383
 384         /* Return id of the bucket from which the packet was dropped. */
 385         return bucket - q->buckets;
 386 }
 387
 388 static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 389 {
 390         struct hhf_sched_data *q = qdisc_priv(sch);
 391         enum wdrr_bucket_idx idx;
 392         struct wdrr_bucket *bucket;
 393
 394         idx = hhf_classify(skb, sch);
 395
 396         bucket = &q->buckets[idx];
 397         bucket_add(bucket, skb);
 398         sch->qstats.backlog += qdisc_pkt_len(skb);
 399
 400         if (list_empty(&bucket->bucketchain)) {
 401                 unsigned int weight;
 402
 403                 /* The logic of new_buckets vs. old_buckets is the same as
 404                  * new_flows vs. old_flows in the implementation of fq_codel,
 405                  * i.e., short bursts of non-HHs should have strict priority.
 406                  */
 407                 if (idx == WDRR_BUCKET_FOR_HH) {
 408                         /* Always move heavy-hitters to old bucket. */
 409                         weight = 1;
 410                         list_add_tail(&bucket->bucketchain, &q->old_buckets);
 411                 } else {
 412                         weight = q->hhf_non_hh_weight;
 413                         list_add_tail(&bucket->bucketchain, &q->new_buckets);
 414                 }
 415                 bucket->deficit = weight * q->quantum;
 416         }
 417         if (++sch->q.qlen < sch->limit)
 418                 return NET_XMIT_SUCCESS;
 419
 420         q->drop_overlimit++;
 421         /* Return Congestion Notification only if we dropped a packet from this
 422          * bucket.
 423          */
 424         if (hhf_drop(sch) == idx)
 425                 return NET_XMIT_CN;
 426
 427         /* As we dropped a packet, better let upper stack know this. */
 428         qdisc_tree_decrease_qlen(sch, 1);
 429         return NET_XMIT_SUCCESS;
 430 }
 431
 432 static struct sk_buff *hhf_dequeue(struct Qdisc *sch)
 433 {
 434         struct hhf_sched_data *q = qdisc_priv(sch);
 435         struct sk_buff *skb = NULL;
 436         struct wdrr_bucket *bucket;
 437         struct list_head *head;
 438
 439 begin:
 440         head = &q->new_buckets;
 441         if (list_empty(head)) {
 442                 head = &q->old_buckets;
 443                 if (list_empty(head))
 444                         return NULL;
 445         }
 446         bucket = list_first_entry(head, struct wdrr_bucket, bucketchain);
 447
 448         if (bucket->deficit <= 0) {
 449                 int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ?
 450                               1 : q->hhf_non_hh_weight;
 451
 452                 bucket->deficit += weight * q->quantum;
 453                 list_move_tail(&bucket->bucketchain, &q->old_buckets);
 454                 goto begin;
 455         }
 456
 457         if (bucket->head) {
 458                 skb = dequeue_head(bucket);
 459                 sch->q.qlen--;
 460                 sch->qstats.backlog -= qdisc_pkt_len(skb);
 461         }
 462
 463         if (!skb) {
 464                 /* Force a pass through old_buckets to prevent starvation. */
 465                 if ((head == &q->new_buckets) && !list_empty(&q->old_buckets))
 466                         list_move_tail(&bucket->bucketchain, &q->old_buckets);
 467                 else
 468                         list_del_init(&bucket->bucketchain);
 469                 goto begin;
 470         }
 471         qdisc_bstats_update(sch, skb);
 472         bucket->deficit -= qdisc_pkt_len(skb);
 473
 474         return skb;
 475 }
 476
 477 static void hhf_reset(struct Qdisc *sch)
 478 {
 479         struct sk_buff *skb;
 480
 481         while ((skb = hhf_dequeue(sch)) != NULL)
 482                 kfree_skb(skb);
 483 }
 484
 485 static void *hhf_zalloc(size_t sz)
 486 {
 487         void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
 488
 489         if (!ptr)
 490                 ptr = vzalloc(sz);
 491
 492         return ptr;
 493 }
 494
 495 static void hhf_free(void *addr)
 496 {
 497         if (addr) {
 498                 if (is_vmalloc_addr(addr))
 499                         vfree(addr);
 500                 else
 501                         kfree(addr);
 502         }
 503 }
 504
 505 static void hhf_destroy(struct Qdisc *sch)
 506 {
 507         int i;
 508         struct hhf_sched_data *q = qdisc_priv(sch);
 509
 510         for (i = 0; i < HHF_ARRAYS_CNT; i++) {
 511                 hhf_free(q->hhf_arrays[i]);
 512                 hhf_free(q->hhf_valid_bits[i]);
 513         }
 514
 515         for (i = 0; i < HH_FLOWS_CNT; i++) {
 516                 struct hh_flow_state *flow, *next;
 517                 struct list_head *head = &q->hh_flows[i];
 518
 519                 if (list_empty(head))
 520                         continue;
 521                 list_for_each_entry_safe(flow, next, head, flowchain) {
 522                         list_del(&flow->flowchain);
 523                         kfree(flow);
 524                 }
 525         }
 526         hhf_free(q->hh_flows);
 527 }
 528
 529 static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
 530         [TCA_HHF_BACKLOG_LIMIT]  = { .type = NLA_U32 },
 531         [TCA_HHF_QUANTUM]        = { .type = NLA_U32 },
 532         [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 },
 533         [TCA_HHF_RESET_TIMEOUT]  = { .type = NLA_U32 },
 534         [TCA_HHF_ADMIT_BYTES]    = { .type = NLA_U32 },
 535         [TCA_HHF_EVICT_TIMEOUT]  = { .type = NLA_U32 },
 536         [TCA_HHF_NON_HH_WEIGHT]  = { .type = NLA_U32 },
 537 };
 538
 539 static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
 540 {
 541         struct hhf_sched_data *q = qdisc_priv(sch);
 542         struct nlattr *tb[TCA_HHF_MAX + 1];
 543         unsigned int qlen;
 544         int err;
 545         u64 non_hh_quantum;
 546         u32 new_quantum = q->quantum;
 547         u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight;
 548
 549         if (!opt)
 550                 return -EINVAL;
 551
 552         err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy);
 553         if (err < 0)
 554                 return err;
 555
 556         sch_tree_lock(sch);
 557
 558         if (tb[TCA_HHF_BACKLOG_LIMIT])
 559                 sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]);
 560
 561         if (tb[TCA_HHF_QUANTUM])
 562                 new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]);
 563
 564         if (tb[TCA_HHF_NON_HH_WEIGHT])
 565                 new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
 566
 567         non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
 568         if (non_hh_quantum > INT_MAX)
 569                 return -EINVAL;
 570         q->quantum = new_quantum;
 571         q->hhf_non_hh_weight = new_hhf_non_hh_weight;
 572
 573         if (tb[TCA_HHF_HH_FLOWS_LIMIT])
 574                 q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]);
 575
 576         if (tb[TCA_HHF_RESET_TIMEOUT]) {
 577                 u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]);
 578
 579                 q->hhf_reset_timeout = usecs_to_jiffies(us);
 580         }
 581
 582         if (tb[TCA_HHF_ADMIT_BYTES])
 583                 q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]);
 584
 585         if (tb[TCA_HHF_EVICT_TIMEOUT]) {
 586                 u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]);
 587
 588                 q->hhf_evict_timeout = usecs_to_jiffies(us);
 589         }
 590
 591         qlen = sch->q.qlen;
 592         while (sch->q.qlen > sch->limit) {
 593                 struct sk_buff *skb = hhf_dequeue(sch);
 594
 595                 kfree_skb(skb);
 596         }
 597         qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
 598
 599         sch_tree_unlock(sch);
 600         return 0;
 601 }
 602
 603 static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
 604 {
 605         struct hhf_sched_data *q = qdisc_priv(sch);
 606         int i;
 607
 608         sch->limit = 1000;
 609         q->quantum = psched_mtu(qdisc_dev(sch));
 610         q->perturbation = prandom_u32();
 611         INIT_LIST_HEAD(&q->new_buckets);
 612         INIT_LIST_HEAD(&q->old_buckets);
 613
 614         /* Configurable HHF parameters */
 615         q->hhf_reset_timeout = HZ / 25; /* 40  ms */
 616         q->hhf_admit_bytes = 131072;    /* 128 KB */
 617         q->hhf_evict_timeout = HZ;      /* 1  sec */
 618         q->hhf_non_hh_weight = 2;
 619
 620         if (opt) {
 621                 int err = hhf_change(sch, opt);
 622
 623                 if (err)
 624                         return err;
 625         }
 626
 627         if (!q->hh_flows) {
 628                 /* Initialize heavy-hitter flow table. */
 629                 q->hh_flows = hhf_zalloc(HH_FLOWS_CNT *
 630                                          sizeof(struct list_head));
 631                 if (!q->hh_flows)
 632                         return -ENOMEM;
 633                 for (i = 0; i < HH_FLOWS_CNT; i++)
 634                         INIT_LIST_HEAD(&q->hh_flows[i]);
 635
 636                 /* Cap max active HHs at twice len of hh_flows table. */
 637                 q->hh_flows_limit = 2 * HH_FLOWS_CNT;
 638                 q->hh_flows_overlimit = 0;
 639                 q->hh_flows_total_cnt = 0;
 640                 q->hh_flows_current_cnt = 0;
 641
 642                 /* Initialize heavy-hitter filter arrays. */
 643                 for (i = 0; i < HHF_ARRAYS_CNT; i++) {
 644                         q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
 645                                                       sizeof(u32));
 646                         if (!q->hhf_arrays[i]) {
 647                                 hhf_destroy(sch);
 648                                 return -ENOMEM;
 649                         }
 650                 }
 651                 q->hhf_arrays_reset_timestamp = hhf_time_stamp();
 652
 653                 /* Initialize valid bits of heavy-hitter filter arrays. */
 654                 for (i = 0; i < HHF_ARRAYS_CNT; i++) {
 655                         q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
 656                                                           BITS_PER_BYTE);
 657                         if (!q->hhf_valid_bits[i]) {
 658                                 hhf_destroy(sch);
 659                                 return -ENOMEM;
 660                         }
 661                 }
 662
 663                 /* Initialize Weighted DRR buckets. */
 664                 for (i = 0; i < WDRR_BUCKET_CNT; i++) {
 665                         struct wdrr_bucket *bucket = q->buckets + i;
 666
 667                         INIT_LIST_HEAD(&bucket->bucketchain);
 668                 }
 669         }
 670
 671         return 0;
 672 }
 673
 674 static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb)
 675 {
 676         struct hhf_sched_data *q = qdisc_priv(sch);
 677         struct nlattr *opts;
 678
 679         opts = nla_nest_start(skb, TCA_OPTIONS);
 680         if (opts == NULL)
 681                 goto nla_put_failure;
 682
 683         if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) ||
 684             nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) ||
 685             nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) ||
 686             nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT,
 687                         jiffies_to_usecs(q->hhf_reset_timeout)) ||
 688             nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) ||
 689             nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT,
 690                         jiffies_to_usecs(q->hhf_evict_timeout)) ||
 691             nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight))
 692                 goto nla_put_failure;
 693
 694         nla_nest_end(skb, opts);
 695         return skb->len;
 696
 697 nla_put_failure:
 698         return -1;
 699 }
 700
 701 static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 702 {
 703         struct hhf_sched_data *q = qdisc_priv(sch);
 704         struct tc_hhf_xstats st = {
 705                 .drop_overlimit = q->drop_overlimit,
 706                 .hh_overlimit   = q->hh_flows_overlimit,
 707                 .hh_tot_count   = q->hh_flows_total_cnt,
 708                 .hh_cur_count   = q->hh_flows_current_cnt,
 709         };
 710
 711         return gnet_stats_copy_app(d, &st, sizeof(st));
 712 }
 713
 714 static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
 715         .id             =       "hhf",
 716         .priv_size      =       sizeof(struct hhf_sched_data),
 717
 718         .enqueue        =       hhf_enqueue,
 719         .dequeue        =       hhf_dequeue,
 720         .peek           =       qdisc_peek_dequeued,
 721         .drop           =       hhf_drop,
 722         .init           =       hhf_init,
 723         .reset          =       hhf_reset,
 724         .destroy        =       hhf_destroy,
 725         .change         =       hhf_change,
 726         .dump           =       hhf_dump,
 727         .dump_stats     =       hhf_dump_stats,
 728         .owner          =       THIS_MODULE,
 729 };
 730
 731 static int __init hhf_module_init(void)
 732 {
 733         return register_qdisc(&hhf_qdisc_ops);
 734 }
 735
 736 static void __exit hhf_module_exit(void)
 737 {
 738         unregister_qdisc(&hhf_qdisc_ops);
 739 }
 740
 741 module_init(hhf_module_init)
 742 module_exit(hhf_module_exit)
 743 MODULE_AUTHOR("Terry Lam");
 744 MODULE_AUTHOR("Nandita Dukkipati");
 745 MODULE_LICENSE("GPL");