/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>

#define VERSION "1.3"
/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		     Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	----------------------------------------------------------------

	This started out as a simple way to delay outgoing packets to
	test TCP but has grown to include most of the functionality
	of a full blown network emulator like NISTnet.  It can delay
	packets and add random jitter (and correlation).  The random
	distribution can also be loaded from a table to provide
	normal, Pareto, or experimental curves.  Packet loss,
	duplication, and reordering can also be emulated.

	This qdisc does not do classification; that can be handled by
	layering other disciplines on top of it.  It does not need to do
	bandwidth control either, since that can be handled by using a
	token bucket or other rate control.

	Correlated Loss Generator models

	Added generation of correlated loss according to the
	"Gilbert-Elliot" model, a 4-state Markov model.

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	    and intuitive loss model for packet networks and its
	    implementation in the Netem module in the Linux kernel",
	    available in [1]

	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		 Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
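/*
 * Typical configuration from user space is done with iproute2's tc; the
 * commands below are illustrative only (exact option names and ranges
 * depend on the iproute2 version):
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *	tc qdisc change dev eth0 root netem loss 0.3% 25%
 *	tc qdisc change dev eth0 root netem delay 100ms 20ms distribution normal
 *	tc qdisc change dev eth0 root netem loss gemodel 1% 10% 70% 0.1%
 */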
struct netem_sched_data {
	struct Qdisc *qdisc;
	struct qdisc_watchdog watchdog;

	psched_tdiff_t latency;
	psched_tdiff_t jitter;

	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable {
		u32 size;
		s16 table[0];
	} *delay_dist;

	enum {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */
	} clg;
};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}
/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
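/*
 * The mixing step in get_crandom() is a fixed-point convex combination:
 *
 *	answer ~= value * (1 - rho/2^32) + last * (rho/2^32)
 *
 * so, for example, a correlation of roughly 50% (rho around 0x80000000)
 * weighs the fresh random draw and the previous output about equally.
 * The numbers here are illustrative only.
 */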
/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = net_random();

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and if the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   1 => successfully transmitted packets within a gap period
	 *   4 => isolated losses within a gap period
	 *   3 => lost packets within a burst period
	 *   2 => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case 1:
		if (rnd < clg->a4) {
			clg->state = 4;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1) {
			clg->state = 3;
			return true;
		} else if (clg->a1 < rnd)
			clg->state = 1;

		break;
	case 2:
		if (rnd < clg->a5) {
			clg->state = 3;
			return true;
		} else
			clg->state = 2;

		break;
	case 3:
		if (rnd < clg->a3)
			clg->state = 2;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = 1;
			return true;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = 3;
			return true;
		}
		break;
	case 4:
		clg->state = 1;
		break;
	}

	return false;
}

/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
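/*
 * Per the clgstate field comments above, for this generator a1 = p
 * (good-to-bad transition probability), a2 = r (bad-to-good), a4 = 1-k
 * (the loss check associated with the good state) and a3 = h (the loss
 * check associated with the bad state).
 */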
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;

	switch (clg->state) {
	case 1:
		if (net_random() < clg->a1)
			clg->state = 2;
		if (net_random() < clg->a4)
			return true;
	case 2:
		if (net_random() < clg->a2)
			clg->state = 1;
		if (clg->a3 > net_random())
			return true;
	}

	return false;
}

static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor);

	case CLG_4_STATES:
		/* 4state loss model algorithm (used also for GI model)
		 * Extracts a value from the markov 4 state loss generator,
		 * if it is 1 drops a packet and if needed writes the event in
		 * the kernel logs
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm
		 * Extracts a value from the Gilbert-Elliot loss generator,
		 * if it is 1 drops a packet and if needed writes the event in
		 * the kernel logs
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				struct crndstate *state,
				const struct disttable *dist)
{
	psched_tdiff_t x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
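/*
 * Worked example for tabledist() (illustrative values only): with mu and
 * sigma corresponding to 100ms and 10ms and a table entry t = 2048, the
 * result is roughly mu + sigma * t / NETEM_DIST_SCALE, i.e. about 102.5ms
 * given the NETEM_DIST_SCALE of 8192 defined in linux/pkt_sched.h.
 */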
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	int ret;
	int count = 1;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q))
		--count;

	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	skb_orphan(skb);

	/*
	 * If we need to duplicate the packet, then re-insert it at the top
	 * of the qdisc tree, since the parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		qdisc_enqueue_root(skb2, rootq);
		q->duplicate = dupsave;
	}

	/*
	 * Randomized packet corruption.
	 * Make a copy if needed since we are modifying the data.
	 * If the packet is going to be hardware checksummed, then
	 * do the checksum now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
		    (skb->ip_summed == CHECKSUM_PARTIAL &&
		     skb_checksum_help(skb)))
			return qdisc_drop(skb, sch);

		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
	}

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = psched_get_time();
		cb->time_to_send = now + delay;
		++q->counter;
		ret = qdisc_enqueue(skb, q->qdisc);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = psched_get_time();
		q->counter = 0;

		__skb_queue_head(&q->qdisc->q, skb);
		sch->qstats.backlog += qdisc_pkt_len(skb);
		sch->qstats.requeues++;
		ret = NET_XMIT_SUCCESS;
	}

	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret)) {
			sch->qstats.drops++;
			return ret;
		}
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}
static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (qdisc_is_throttled(sch))
		return NULL;

	skb = q->qdisc->ops->peek(q->qdisc);
	if (skb) {
		const struct netem_skb_cb *cb = netem_skb_cb(skb);
		psched_time_t now = psched_get_time();

		/* has the packet's send time arrived? */
		if (cb->time_to_send <= now) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

#ifdef CONFIG_NET_CLS_ACT
			/*
			 * If it's at ingress let's pretend the delay is
			 * from the network (tstamp will be updated).
			 */
			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
				skb->tstamp.tv64 = 0;
#endif

			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
	}

	return NULL;
}
static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	if (d) {
		if (is_vmalloc_addr(d))
			vfree(d);
		else
			kfree(d);
	}
}
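/*
 * The delay distribution table delivered in TCA_NETEM_DELAY_DIST is built
 * in user space; iproute2 typically ships precomputed tables (normal,
 * pareto, paretonormal, experimental) that tc loads for the
 * "delay ... distribution" option.
 */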
/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;
	size_t s;

	if (n > NETEM_DIST_MAX)
		return -EINVAL;

	s = sizeof(struct disttable) + n * sizeof(s16);
	d = kmalloc(s, GFP_KERNEL);
	if (!d)
		d = vmalloc(s);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	dist_free(q->delay_dist);
	q->delay_dist = d;
	spin_unlock_bh(root_lock);
	return 0;
}

static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}
static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) != sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = 1;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) != sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = 1;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}
static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
};
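/*
 * netem's options are a struct tc_netem_qopt immediately followed by
 * optional netlink attributes, so parsing has to skip sizeof(*qopt) bytes
 * before handing the remainder to nla_parse(); parse_attr() below does
 * exactly that.
 */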
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL)
		return -EINVAL;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	ret = fifo_set_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_info("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(sch, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			return ret;
	}

	if (tb[TCA_NETEM_REORDER])
		get_reorder(sch, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);

	q->loss_model = CLG_RANDOM;
	if (tb[TCA_NETEM_LOSS])
		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);

	return ret;
}
/*
 * Special case version of FIFO queue for use by netem.
 * It queues packets in order based on the timestamps in their skbs.
 */
struct fifo_sched_data {
	u32 limit;
	psched_time_t oldest;
};
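/*
 * tfifo_enqueue() below keeps the internal queue sorted by time_to_send.
 * The reverse walk starts from the tail because netem normally hands
 * packets over in roughly increasing send-time order, so the common case
 * is a cheap tail insert.
 */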
static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct sk_buff_head *list = &sch->q;
	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
	struct sk_buff *skb;

	if (likely(skb_queue_len(list) < q->limit)) {
		/* Optimize for add at tail */
		if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
			q->oldest = tnext;
			return qdisc_enqueue_tail(nskb, sch);
		}

		skb_queue_reverse_walk(list, skb) {
			const struct netem_skb_cb *cb = netem_skb_cb(skb);

			if (tnext >= cb->time_to_send)
				break;
		}

		__skb_queue_after(list, skb, nskb);

		sch->qstats.backlog += qdisc_pkt_len(nskb);

		return NET_XMIT_SUCCESS;
	}

	return qdisc_reshape_fail(nskb, sch);
}
static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct fifo_sched_data *q = qdisc_priv(sch);

	if (opt) {
		struct tc_fifo_qopt *ctl = nla_data(opt);
		if (nla_len(opt) < sizeof(*ctl))
			return -EINVAL;

		q->limit = ctl->limit;
	} else
		q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);

	q->oldest = PSCHED_PASTPERFECT;
	return 0;
}
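/*
 * NLA_PUT(), used by the dump functions below, jumps to the local
 * nla_put_failure label when the attribute does not fit in the skb, which
 * is why each dump function defines that label.
 */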
static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct tc_fifo_qopt opt = { .limit = q->limit };

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	return -1;
}

static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
	.id		=	"tfifo",
	.priv_size	=	sizeof(struct fifo_sched_data),
	.enqueue	=	tfifo_enqueue,
	.dequeue	=	qdisc_dequeue_head,
	.peek		=	qdisc_peek_head,
	.drop		=	qdisc_queue_drop,
	.init		=	tfifo_init,
	.reset		=	qdisc_reset_queue,
	.change		=	tfifo_init,
	.dump		=	tfifo_dump,
};
static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	qdisc_watchdog_init(&q->watchdog, sch);

	q->loss_model = CLG_RANDOM;
	q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
				     TC_H_MAKE(sch->handle, 1));
	if (!q->qdisc) {
		pr_notice("netem: qdisc create tfifo qdisc failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_info("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}
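/*
 * Note: netem_init() always creates the tfifo child above, and
 * netem_change() then resizes it through fifo_set_limit(), so the child's
 * queue bound follows the configured netem "limit" parameter.
 */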
static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
	dist_free(q->delay_dist);
}

static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}
static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
			struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}
static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};
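/*
 * netem exposes exactly one class (minor 1) whose leaf is the internal
 * child qdisc, so a different child can be grafted under it, e.g. with
 * "tc qdisc add dev eth0 parent <handle>:1 ..." (illustrative command).
 */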
static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");