net/sched/sch_netem.c

   1 /*
   2  * net/sched/sch_netem.c        Network emulator
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License.
   8  *
   9  *              Many of the algorithms and ideas for this came from
  10  *              NIST Net which is not copyrighted.
  11  *
  12  * Authors:     Stephen Hemminger <shemminger@osdl.org>
  13  *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/types.h>
  20 #include <linux/kernel.h>
  21 #include <linux/errno.h>
  22 #include <linux/skbuff.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/rtnetlink.h>
  25 #include <linux/reciprocal_div.h>
  26 #include <linux/rbtree.h>
  27
  28 #include <net/netlink.h>
  29 #include <net/pkt_sched.h>
  30 #include <net/inet_ecn.h>
  31
  32 #define VERSION "1.3"
  33
  34 /*      Network Emulation Queuing algorithm.
  35         ====================================
  36
  37         Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
  38                  Network Emulation Tool"
  39                  [2] Luigi Rizzo, DummyNet for FreeBSD
  40
  41          ----------------------------------------------------------------
  42
  43          This started out as a simple way to delay outgoing packets to
  44          test TCP but has grown to include most of the functionality
  45          of a full blown network emulator like NISTnet. It can delay
  46          packets and add random jitter (and correlation). The random
  47          distribution can be loaded from a table as well to provide
  48          normal, Pareto, or experimental curves. Packet loss,
  49          duplication, and reordering can also be emulated.
  50
  51          This qdisc does not do classification that can be handled in
  52          layering other disciplines.  It does not need to do bandwidth
  53          control either since that can be handled by using token
  54          bucket or other rate control.
  55
  56      Correlated Loss Generator models
  57
  58         Added generation of correlated loss according to the
  59         "Gilbert-Elliot" model, a 4-state markov model.
  60
  61         References:
  62         [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
  63         [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
  64         and intuitive loss model for packet networks and its implementation
  65         in the Netem module in the Linux kernel", available in [1]
  66
  67         Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
  68                  Fabio Ludovici <fabio.ludovici at yahoo.it>
  69 */
  70
  71 struct netem_sched_data {
  72         /* internal t(ime)fifo qdisc uses t_root and sch->limit */
  73         struct rb_root t_root;
  74
  75         /* optional qdisc for classful handling (NULL at netem init) */
  76         struct Qdisc    *qdisc;
  77
  78         struct qdisc_watchdog watchdog;
  79
  80         psched_tdiff_t latency;
  81         psched_tdiff_t jitter;
  82
  83         u32 loss;
  84         u32 ecn;
  85         u32 limit;
  86         u32 counter;
  87         u32 gap;
  88         u32 duplicate;
  89         u32 reorder;
  90         u32 corrupt;
  91         u32 rate;
  92         s32 packet_overhead;
  93         u32 cell_size;
  94         u32 cell_size_reciprocal;
  95         s32 cell_overhead;
  96
  97         struct crndstate {
  98                 u32 last;
  99                 u32 rho;
 100         } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
 101
 102         struct disttable {
 103                 u32  size;
 104                 s16 table[0];
 105         } *delay_dist;
 106
 107         enum  {
 108                 CLG_RANDOM,
 109                 CLG_4_STATES,
 110                 CLG_GILB_ELL,
 111         } loss_model;
 112
 113         /* Correlated Loss Generation models */
 114         struct clgstate {
 115                 /* state of the Markov chain */
 116                 u8 state;
 117
 118                 /* 4-states and Gilbert-Elliot models */
 119                 u32 a1; /* p13 for 4-states or p for GE */
 120                 u32 a2; /* p31 for 4-states or r for GE */
 121                 u32 a3; /* p32 for 4-states or h for GE */
 122                 u32 a4; /* p14 for 4-states or 1-k for GE */
 123                 u32 a5; /* p23 used only in 4-states */
 124         } clg;
 125
 126 };
 127
 128 /* Time stamp put into socket buffer control block
 129  * Only valid when skbs are in our internal t(ime)fifo queue.
 130  */
 131 struct netem_skb_cb {
 132         psched_time_t   time_to_send;
 133         ktime_t         tstamp_save;
 134 };
 135
 136 /* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
 137  * to hold a rb_node structure.
 138  *
 139  * If struct sk_buff layout is changed, the following checks will complain.
 140  */
 141 static struct rb_node *netem_rb_node(struct sk_buff *skb)
 142 {
 143         BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
 144         BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
 145                      offsetof(struct sk_buff, next) + sizeof(skb->next));
 146         BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
 147                      offsetof(struct sk_buff, prev) + sizeof(skb->prev));
 148         BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
 149                                               sizeof(skb->prev) +
 150                                               sizeof(skb->tstamp));
 151         return (struct rb_node *)&skb->next;
 152 }
 153
 154 static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
 155 {
 156         return (struct sk_buff *)rb;
 157 }
 158
 159 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 160 {
 161         /* we assume we can use skb next/prev/tstamp as storage for rb_node */
 162         qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 163         return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 164 }
 165
 166 /* init_crandom - initialize correlated random number generator
 167  * Use entropy source for initial seed.
 168  */
 169 static void init_crandom(struct crndstate *state, unsigned long rho)
 170 {
 171         state->rho = rho;
 172         state->last = net_random();
 173 }
 174
 175 /* get_crandom - correlated random number generator
 176  * Next number depends on last value.
 177  * rho is scaled to avoid floating point.
 178  */
 179 static u32 get_crandom(struct crndstate *state)
 180 {
 181         u64 value, rho;
 182         unsigned long answer;
 183
 184         if (state->rho == 0)    /* no correlation */
 185                 return net_random();
 186
 187         value = net_random();
 188         rho = (u64)state->rho + 1;
 189         answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 190         state->last = answer;
 191         return answer;
 192 }
 193
 194 /* loss_4state - 4-state model loss generator
 195  * Generates losses according to the 4-state Markov chain adopted in
 196  * the GI (General and Intuitive) loss model.
 197  */
 198 static bool loss_4state(struct netem_sched_data *q)
 199 {
 200         struct clgstate *clg = &q->clg;
 201         u32 rnd = net_random();
 202
 203         /*
 204          * Makes a comparison between rnd and the transition
 205          * probabilities outgoing from the current state, then decides the
 206          * next state and if the next packet has to be transmitted or lost.
 207          * The four states correspond to:
 208          *   1 => successfully transmitted packets within a gap period
 209          *   4 => isolated losses within a gap period
 210          *   3 => lost packets within a burst period
 211          *   2 => successfully transmitted packets within a burst period
 212          */
 213         switch (clg->state) {
 214         case 1:
 215                 if (rnd < clg->a4) {
 216                         clg->state = 4;
 217                         return true;
 218                 } else if (clg->a4 < rnd && rnd < clg->a1) {
 219                         clg->state = 3;
 220                         return true;
 221                 } else if (clg->a1 < rnd)
 222                         clg->state = 1;
 223
 224                 break;
 225         case 2:
 226                 if (rnd < clg->a5) {
 227                         clg->state = 3;
 228                         return true;
 229                 } else
 230                         clg->state = 2;
 231
 232                 break;
 233         case 3:
 234                 if (rnd < clg->a3)
 235                         clg->state = 2;
 236                 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
 237                         clg->state = 1;
 238                         return true;
 239                 } else if (clg->a2 + clg->a3 < rnd) {
 240                         clg->state = 3;
 241                         return true;
 242                 }
 243                 break;
 244         case 4:
 245                 clg->state = 1;
 246                 break;
 247         }
 248
 249         return false;
 250 }
 251
 252 /* loss_gilb_ell - Gilbert-Elliot model loss generator
 253  * Generates losses according to the Gilbert-Elliot loss model or
 254  * its special cases  (Gilbert or Simple Gilbert)
 255  *
 256  * Makes a comparison between random number and the transition
 257  * probabilities outgoing from the current state, then decides the
 258  * next state. A second random number is extracted and the comparison
 259  * with the loss probability of the current state decides if the next
 260  * packet will be transmitted or lost.
 261  */
 262 static bool loss_gilb_ell(struct netem_sched_data *q)
 263 {
 264         struct clgstate *clg = &q->clg;
 265
 266         switch (clg->state) {
 267         case 1:
 268                 if (net_random() < clg->a1)
 269                         clg->state = 2;
 270                 if (net_random() < clg->a4)
 271                         return true;
 272         case 2:
 273                 if (net_random() < clg->a2)
 274                         clg->state = 1;
 275                 if (clg->a3 > net_random())
 276                         return true;
 277         }
 278
 279         return false;
 280 }
 281
 282 static bool loss_event(struct netem_sched_data *q)
 283 {
 284         switch (q->loss_model) {
 285         case CLG_RANDOM:
 286                 /* Random packet drop 0 => none, ~0 => all */
 287                 return q->loss && q->loss >= get_crandom(&q->loss_cor);
 288
 289         case CLG_4_STATES:
 290                 /* 4state loss model algorithm (used also for GI model)
 291                 * Extracts a value from the markov 4 state loss generator,
 292                 * if it is 1 drops a packet and if needed writes the event in
 293                 * the kernel logs
 294                 */
 295                 return loss_4state(q);
 296
 297         case CLG_GILB_ELL:
 298                 /* Gilbert-Elliot loss model algorithm
 299                 * Extracts a value from the Gilbert-Elliot loss generator,
 300                 * if it is 1 drops a packet and if needed writes the event in
 301                 * the kernel logs
 302                 */
 303                 return loss_gilb_ell(q);
 304         }
 305
 306         return false;   /* not reached */
 307 }
 308
 309
 310 /* tabledist - return a pseudo-randomly distributed value with mean mu and
 311  * std deviation sigma.  Uses table lookup to approximate the desired
 312  * distribution, and a uniformly-distributed pseudo-random source.
 313  */
 314 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
 315                                 struct crndstate *state,
 316                                 const struct disttable *dist)
 317 {
 318         psched_tdiff_t x;
 319         long t;
 320         u32 rnd;
 321
 322         if (sigma == 0)
 323                 return mu;
 324
 325         rnd = get_crandom(state);
 326
 327         /* default uniform distribution */
 328         if (dist == NULL)
 329                 return (rnd % (2*sigma)) - sigma + mu;
 330
 331         t = dist->table[rnd % dist->size];
 332         x = (sigma % NETEM_DIST_SCALE) * t;
 333         if (x >= 0)
 334                 x += NETEM_DIST_SCALE/2;
 335         else
 336                 x -= NETEM_DIST_SCALE/2;
 337
 338         return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 339 }
 340
 341 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
 342 {
 343         u64 ticks;
 344
 345         len += q->packet_overhead;
 346
 347         if (q->cell_size) {
 348                 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
 349
 350                 if (len > cells * q->cell_size) /* extra cell needed for remainder */
 351                         cells++;
 352                 len = cells * (q->cell_size + q->cell_overhead);
 353         }
 354
 355         ticks = (u64)len * NSEC_PER_SEC;
 356
 357         do_div(ticks, q->rate);
 358         return PSCHED_NS2TICKS(ticks);
 359 }
 360
 361 static void tfifo_reset(struct Qdisc *sch)
 362 {
 363         struct netem_sched_data *q = qdisc_priv(sch);
 364         struct rb_node *p;
 365
 366         while ((p = rb_first(&q->t_root))) {
 367                 struct sk_buff *skb = netem_rb_to_skb(p);
 368
 369                 rb_erase(p, &q->t_root);
 370                 skb->next = NULL;
 371                 skb->prev = NULL;
 372                 kfree_skb(skb);
 373         }
 374 }
 375
 376 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 377 {
 378         struct netem_sched_data *q = qdisc_priv(sch);
 379         psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
 380         struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 381
 382         while (*p) {
 383                 struct sk_buff *skb;
 384
 385                 parent = *p;
 386                 skb = netem_rb_to_skb(parent);
 387                 if (tnext >= netem_skb_cb(skb)->time_to_send)
 388                         p = &parent->rb_right;
 389                 else
 390                         p = &parent->rb_left;
 391         }
 392         rb_link_node(netem_rb_node(nskb), parent, p);
 393         rb_insert_color(netem_rb_node(nskb), &q->t_root);
 394         sch->q.qlen++;
 395 }
 396
 397 /*
 398  * Insert one skb into qdisc.
 399  * Note: parent depends on return value to account for queue length.
 400  *      NET_XMIT_DROP: queue length didn't change.
 401  *      NET_XMIT_SUCCESS: one skb was queued.
 402  */
 403 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 404 {
 405         struct netem_sched_data *q = qdisc_priv(sch);
 406         /* We don't fill cb now as skb_unshare() may invalidate it */
 407         struct netem_skb_cb *cb;
 408         struct sk_buff *skb2;
 409         int count = 1;
 410
 411         /* Random duplication */
 412         if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
 413                 ++count;
 414
 415         /* Drop packet? */
 416         if (loss_event(q)) {
 417                 if (q->ecn && INET_ECN_set_ce(skb))
 418                         sch->qstats.drops++; /* mark packet */
 419                 else
 420                         --count;
 421         }
 422         if (count == 0) {
 423                 sch->qstats.drops++;
 424                 kfree_skb(skb);
 425                 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 426         }
 427
 428         /* If a delay is expected, orphan the skb. (orphaning usually takes
 429          * place at TX completion time, so _before_ the link transit delay)
 430          */
 431         if (q->latency || q->jitter)
 432                 skb_orphan_partial(skb);
 433
 434         /*
 435          * If we need to duplicate packet, then re-insert at top of the
 436          * qdisc tree, since parent queuer expects that only one
 437          * skb will be queued.
 438          */
 439         if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
 440                 struct Qdisc *rootq = qdisc_root(sch);
 441                 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
 442                 q->duplicate = 0;
 443
 444                 qdisc_enqueue_root(skb2, rootq);
 445                 q->duplicate = dupsave;
 446         }
 447
 448         /*
 449          * Randomized packet corruption.
 450          * Make copy if needed since we are modifying
 451          * If packet is going to be hardware checksummed, then
 452          * do it now in software before we mangle it.
 453          */
 454         if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 455                 if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
 456                     (skb->ip_summed == CHECKSUM_PARTIAL &&
 457                      skb_checksum_help(skb)))
 458                         return qdisc_drop(skb, sch);
 459
 460                 skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
 461         }
 462
 463         if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
 464                 return qdisc_reshape_fail(skb, sch);
 465
 466         sch->qstats.backlog += qdisc_pkt_len(skb);
 467
 468         cb = netem_skb_cb(skb);
 469         if (q->gap == 0 ||              /* not doing reordering */
 470             q->counter < q->gap - 1 ||  /* inside last reordering gap */
 471             q->reorder < get_crandom(&q->reorder_cor)) {
 472                 psched_time_t now;
 473                 psched_tdiff_t delay;
 474
 475                 delay = tabledist(q->latency, q->jitter,
 476                                   &q->delay_cor, q->delay_dist);
 477
 478                 now = psched_get_time();
 479
 480                 if (q->rate) {
 481                         struct sk_buff *last;
 482
 483                         if (!skb_queue_empty(&sch->q))
 484                                 last = skb_peek_tail(&sch->q);
 485                         else
 486                                 last = netem_rb_to_skb(rb_last(&q->t_root));
 487                         if (last) {
 488                                 /*
 489                                  * Last packet in queue is reference point (now),
 490                                  * calculate this time bonus and subtract
 491                                  * from delay.
 492                                  */
 493                                 delay -= netem_skb_cb(last)->time_to_send - now;
 494                                 delay = max_t(psched_tdiff_t, 0, delay);
 495                                 now = netem_skb_cb(last)->time_to_send;
 496                         }
 497
 498                         delay += packet_len_2_sched_time(skb->len, q);
 499                 }
 500
 501                 cb->time_to_send = now + delay;
 502                 cb->tstamp_save = skb->tstamp;
 503                 ++q->counter;
 504                 tfifo_enqueue(skb, sch);
 505         } else {
 506                 /*
 507                  * Do re-ordering by putting one out of N packets at the front
 508                  * of the queue.
 509                  */
 510                 cb->time_to_send = psched_get_time();
 511                 q->counter = 0;
 512
 513                 __skb_queue_head(&sch->q, skb);
 514                 sch->qstats.requeues++;
 515         }
 516
 517         return NET_XMIT_SUCCESS;
 518 }
 519
 520 static unsigned int netem_drop(struct Qdisc *sch)
 521 {
 522         struct netem_sched_data *q = qdisc_priv(sch);
 523         unsigned int len;
 524
 525         len = qdisc_queue_drop(sch);
 526
 527         if (!len) {
 528                 struct rb_node *p = rb_first(&q->t_root);
 529
 530                 if (p) {
 531                         struct sk_buff *skb = netem_rb_to_skb(p);
 532
 533                         rb_erase(p, &q->t_root);
 534                         sch->q.qlen--;
 535                         skb->next = NULL;
 536                         skb->prev = NULL;
 537                         len = qdisc_pkt_len(skb);
 538                         sch->qstats.backlog -= len;
 539                         kfree_skb(skb);
 540                 }
 541         }
 542         if (!len && q->qdisc && q->qdisc->ops->drop)
 543             len = q->qdisc->ops->drop(q->qdisc);
 544         if (len)
 545                 sch->qstats.drops++;
 546
 547         return len;
 548 }
 549
 550 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 551 {
 552         struct netem_sched_data *q = qdisc_priv(sch);
 553         struct sk_buff *skb;
 554         struct rb_node *p;
 555
 556         if (qdisc_is_throttled(sch))
 557                 return NULL;
 558
 559 tfifo_dequeue:
 560         skb = __skb_dequeue(&sch->q);
 561         if (skb) {
 562 deliver:
 563                 sch->qstats.backlog -= qdisc_pkt_len(skb);
 564                 qdisc_unthrottled(sch);
 565                 qdisc_bstats_update(sch, skb);
 566                 return skb;
 567         }
 568         p = rb_first(&q->t_root);
 569         if (p) {
 570                 psched_time_t time_to_send;
 571
 572                 skb = netem_rb_to_skb(p);
 573
 574                 /* if more time remaining? */
 575                 time_to_send = netem_skb_cb(skb)->time_to_send;
 576                 if (time_to_send <= psched_get_time()) {
 577                         rb_erase(p, &q->t_root);
 578
 579                         sch->q.qlen--;
 580                         skb->next = NULL;
 581                         skb->prev = NULL;
 582                         skb->tstamp = netem_skb_cb(skb)->tstamp_save;
 583
 584 #ifdef CONFIG_NET_CLS_ACT
 585                         /*
 586                          * If it's at ingress let's pretend the delay is
 587                          * from the network (tstamp will be updated).
 588                          */
 589                         if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
 590                                 skb->tstamp.tv64 = 0;
 591 #endif
 592
 593                         if (q->qdisc) {
 594                                 int err = qdisc_enqueue(skb, q->qdisc);
 595
 596                                 if (unlikely(err != NET_XMIT_SUCCESS)) {
 597                                         if (net_xmit_drop_count(err)) {
 598                                                 sch->qstats.drops++;
 599                                                 qdisc_tree_decrease_qlen(sch, 1);
 600                                         }
 601                                 }
 602                                 goto tfifo_dequeue;
 603                         }
 604                         goto deliver;
 605                 }
 606
 607                 if (q->qdisc) {
 608                         skb = q->qdisc->ops->dequeue(q->qdisc);
 609                         if (skb)
 610                                 goto deliver;
 611                 }
 612                 qdisc_watchdog_schedule(&q->watchdog, time_to_send);
 613         }
 614
 615         if (q->qdisc) {
 616                 skb = q->qdisc->ops->dequeue(q->qdisc);
 617                 if (skb)
 618                         goto deliver;
 619         }
 620         return NULL;
 621 }
 622
 623 static void netem_reset(struct Qdisc *sch)
 624 {
 625         struct netem_sched_data *q = qdisc_priv(sch);
 626
 627         qdisc_reset_queue(sch);
 628         tfifo_reset(sch);
 629         if (q->qdisc)
 630                 qdisc_reset(q->qdisc);
 631         qdisc_watchdog_cancel(&q->watchdog);
 632 }
 633
 634 static void dist_free(struct disttable *d)
 635 {
 636         if (d) {
 637                 if (is_vmalloc_addr(d))
 638                         vfree(d);
 639                 else
 640                         kfree(d);
 641         }
 642 }
 643
 644 /*
 645  * Distribution data is a variable size payload containing
 646  * signed 16 bit values.
 647  */
 648 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 649 {
 650         struct netem_sched_data *q = qdisc_priv(sch);
 651         size_t n = nla_len(attr)/sizeof(__s16);
 652         const __s16 *data = nla_data(attr);
 653         spinlock_t *root_lock;
 654         struct disttable *d;
 655         int i;
 656         size_t s;
 657
 658         if (n > NETEM_DIST_MAX)
 659                 return -EINVAL;
 660
 661         s = sizeof(struct disttable) + n * sizeof(s16);
 662         d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
 663         if (!d)
 664                 d = vmalloc(s);
 665         if (!d)
 666                 return -ENOMEM;
 667
 668         d->size = n;
 669         for (i = 0; i < n; i++)
 670                 d->table[i] = data[i];
 671
 672         root_lock = qdisc_root_sleeping_lock(sch);
 673
 674         spin_lock_bh(root_lock);
 675         swap(q->delay_dist, d);
 676         spin_unlock_bh(root_lock);
 677
 678         dist_free(d);
 679         return 0;
 680 }
 681
 682 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
 683 {
 684         struct netem_sched_data *q = qdisc_priv(sch);
 685         const struct tc_netem_corr *c = nla_data(attr);
 686
 687         init_crandom(&q->delay_cor, c->delay_corr);
 688         init_crandom(&q->loss_cor, c->loss_corr);
 689         init_crandom(&q->dup_cor, c->dup_corr);
 690 }
 691
 692 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
 693 {
 694         struct netem_sched_data *q = qdisc_priv(sch);
 695         const struct tc_netem_reorder *r = nla_data(attr);
 696
 697         q->reorder = r->probability;
 698         init_crandom(&q->reorder_cor, r->correlation);
 699 }
 700
 701 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
 702 {
 703         struct netem_sched_data *q = qdisc_priv(sch);
 704         const struct tc_netem_corrupt *r = nla_data(attr);
 705
 706         q->corrupt = r->probability;
 707         init_crandom(&q->corrupt_cor, r->correlation);
 708 }
 709
 710 static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
 711 {
 712         struct netem_sched_data *q = qdisc_priv(sch);
 713         const struct tc_netem_rate *r = nla_data(attr);
 714
 715         q->rate = r->rate;
 716         q->packet_overhead = r->packet_overhead;
 717         q->cell_size = r->cell_size;
 718         if (q->cell_size)
 719                 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
 720         q->cell_overhead = r->cell_overhead;
 721 }
 722
 723 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
 724 {
 725         struct netem_sched_data *q = qdisc_priv(sch);
 726         const struct nlattr *la;
 727         int rem;
 728
 729         nla_for_each_nested(la, attr, rem) {
 730                 u16 type = nla_type(la);
 731
 732                 switch(type) {
 733                 case NETEM_LOSS_GI: {
 734                         const struct tc_netem_gimodel *gi = nla_data(la);
 735
 736                         if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
 737                                 pr_info("netem: incorrect gi model size\n");
 738                                 return -EINVAL;
 739                         }
 740
 741                         q->loss_model = CLG_4_STATES;
 742
 743                         q->clg.state = 1;
 744                         q->clg.a1 = gi->p13;
 745                         q->clg.a2 = gi->p31;
 746                         q->clg.a3 = gi->p32;
 747                         q->clg.a4 = gi->p14;
 748                         q->clg.a5 = gi->p23;
 749                         break;
 750                 }
 751
 752                 case NETEM_LOSS_GE: {
 753                         const struct tc_netem_gemodel *ge = nla_data(la);
 754
 755                         if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
 756                                 pr_info("netem: incorrect ge model size\n");
 757                                 return -EINVAL;
 758                         }
 759
 760                         q->loss_model = CLG_GILB_ELL;
 761                         q->clg.state = 1;
 762                         q->clg.a1 = ge->p;
 763                         q->clg.a2 = ge->r;
 764                         q->clg.a3 = ge->h;
 765                         q->clg.a4 = ge->k1;
 766                         break;
 767                 }
 768
 769                 default:
 770                         pr_info("netem: unknown loss type %u\n", type);
 771                         return -EINVAL;
 772                 }
 773         }
 774
 775         return 0;
 776 }
 777
 778 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 779         [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
 780         [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
 781         [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
 782         [TCA_NETEM_RATE]        = { .len = sizeof(struct tc_netem_rate) },
 783         [TCA_NETEM_LOSS]        = { .type = NLA_NESTED },
 784         [TCA_NETEM_ECN]         = { .type = NLA_U32 },
 785 };
 786
 787 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 788                       const struct nla_policy *policy, int len)
 789 {
 790         int nested_len = nla_len(nla) - NLA_ALIGN(len);
 791
 792         if (nested_len < 0) {
 793                 pr_info("netem: invalid attributes len %d\n", nested_len);
 794                 return -EINVAL;
 795         }
 796
 797         if (nested_len >= nla_attr_size(0))
 798                 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
 799                                  nested_len, policy);
 800
 801         memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 802         return 0;
 803 }
 804
 805 /* Parse netlink message to set options */
 806 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 807 {
 808         struct netem_sched_data *q = qdisc_priv(sch);
 809         struct nlattr *tb[TCA_NETEM_MAX + 1];
 810         struct tc_netem_qopt *qopt;
 811         int ret;
 812
 813         if (opt == NULL)
 814                 return -EINVAL;
 815
 816         qopt = nla_data(opt);
 817         ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 818         if (ret < 0)
 819                 return ret;
 820
 821         sch->limit = qopt->limit;
 822
 823         q->latency = qopt->latency;
 824         q->jitter = qopt->jitter;
 825         q->limit = qopt->limit;
 826         q->gap = qopt->gap;
 827         q->counter = 0;
 828         q->loss = qopt->loss;
 829         q->duplicate = qopt->duplicate;
 830
 831         /* for compatibility with earlier versions.
 832          * if gap is set, need to assume 100% probability
 833          */
 834         if (q->gap)
 835                 q->reorder = ~0;
 836
 837         if (tb[TCA_NETEM_CORR])
 838                 get_correlation(sch, tb[TCA_NETEM_CORR]);
 839
 840         if (tb[TCA_NETEM_DELAY_DIST]) {
 841                 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
 842                 if (ret)
 843                         return ret;
 844         }
 845
 846         if (tb[TCA_NETEM_REORDER])
 847                 get_reorder(sch, tb[TCA_NETEM_REORDER]);
 848
 849         if (tb[TCA_NETEM_CORRUPT])
 850                 get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
 851
 852         if (tb[TCA_NETEM_RATE])
 853                 get_rate(sch, tb[TCA_NETEM_RATE]);
 854
 855         if (tb[TCA_NETEM_ECN])
 856                 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 857
 858         q->loss_model = CLG_RANDOM;
 859         if (tb[TCA_NETEM_LOSS])
 860                 ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
 861
 862         return ret;
 863 }
 864
 865 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
 866 {
 867         struct netem_sched_data *q = qdisc_priv(sch);
 868         int ret;
 869
 870         if (!opt)
 871                 return -EINVAL;
 872
 873         qdisc_watchdog_init(&q->watchdog, sch);
 874
 875         q->loss_model = CLG_RANDOM;
 876         ret = netem_change(sch, opt);
 877         if (ret)
 878                 pr_info("netem: change failed\n");
 879         return ret;
 880 }
 881
 882 static void netem_destroy(struct Qdisc *sch)
 883 {
 884         struct netem_sched_data *q = qdisc_priv(sch);
 885
 886         qdisc_watchdog_cancel(&q->watchdog);
 887         if (q->qdisc)
 888                 qdisc_destroy(q->qdisc);
 889         dist_free(q->delay_dist);
 890 }
 891
 892 static int dump_loss_model(const struct netem_sched_data *q,
 893                            struct sk_buff *skb)
 894 {
 895         struct nlattr *nest;
 896
 897         nest = nla_nest_start(skb, TCA_NETEM_LOSS);
 898         if (nest == NULL)
 899                 goto nla_put_failure;
 900
 901         switch (q->loss_model) {
 902         case CLG_RANDOM:
 903                 /* legacy loss model */
 904                 nla_nest_cancel(skb, nest);
 905                 return 0;       /* no data */
 906
 907         case CLG_4_STATES: {
 908                 struct tc_netem_gimodel gi = {
 909                         .p13 = q->clg.a1,
 910                         .p31 = q->clg.a2,
 911                         .p32 = q->clg.a3,
 912                         .p14 = q->clg.a4,
 913                         .p23 = q->clg.a5,
 914                 };
 915
 916                 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
 917                         goto nla_put_failure;
 918                 break;
 919         }
 920         case CLG_GILB_ELL: {
 921                 struct tc_netem_gemodel ge = {
 922                         .p = q->clg.a1,
 923                         .r = q->clg.a2,
 924                         .h = q->clg.a3,
 925                         .k1 = q->clg.a4,
 926                 };
 927
 928                 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
 929                         goto nla_put_failure;
 930                 break;
 931         }
 932         }
 933
 934         nla_nest_end(skb, nest);
 935         return 0;
 936
 937 nla_put_failure:
 938         nla_nest_cancel(skb, nest);
 939         return -1;
 940 }
 941
 942 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 943 {
 944         const struct netem_sched_data *q = qdisc_priv(sch);
 945         struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
 946         struct tc_netem_qopt qopt;
 947         struct tc_netem_corr cor;
 948         struct tc_netem_reorder reorder;
 949         struct tc_netem_corrupt corrupt;
 950         struct tc_netem_rate rate;
 951
 952         qopt.latency = q->latency;
 953         qopt.jitter = q->jitter;
 954         qopt.limit = q->limit;
 955         qopt.loss = q->loss;
 956         qopt.gap = q->gap;
 957         qopt.duplicate = q->duplicate;
 958         if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
 959                 goto nla_put_failure;
 960
 961         cor.delay_corr = q->delay_cor.rho;
 962         cor.loss_corr = q->loss_cor.rho;
 963         cor.dup_corr = q->dup_cor.rho;
 964         if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
 965                 goto nla_put_failure;
 966
 967         reorder.probability = q->reorder;
 968         reorder.correlation = q->reorder_cor.rho;
 969         if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
 970                 goto nla_put_failure;
 971
 972         corrupt.probability = q->corrupt;
 973         corrupt.correlation = q->corrupt_cor.rho;
 974         if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
 975                 goto nla_put_failure;
 976
 977         rate.rate = q->rate;
 978         rate.packet_overhead = q->packet_overhead;
 979         rate.cell_size = q->cell_size;
 980         rate.cell_overhead = q->cell_overhead;
 981         if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
 982                 goto nla_put_failure;
 983
 984         if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
 985                 goto nla_put_failure;
 986
 987         if (dump_loss_model(q, skb) != 0)
 988                 goto nla_put_failure;
 989
 990         return nla_nest_end(skb, nla);
 991
 992 nla_put_failure:
 993         nlmsg_trim(skb, nla);
 994         return -1;
 995 }
 996
 997 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
 998                           struct sk_buff *skb, struct tcmsg *tcm)
 999 {
1000         struct netem_sched_data *q = qdisc_priv(sch);
1001
1002         if (cl != 1 || !q->qdisc)       /* only one class */
1003                 return -ENOENT;
1004
1005         tcm->tcm_handle |= TC_H_MIN(1);
1006         tcm->tcm_info = q->qdisc->handle;
1007
1008         return 0;
1009 }
1010
1011 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1012                      struct Qdisc **old)
1013 {
1014         struct netem_sched_data *q = qdisc_priv(sch);
1015
1016         sch_tree_lock(sch);
1017         *old = q->qdisc;
1018         q->qdisc = new;
1019         if (*old) {
1020                 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1021                 qdisc_reset(*old);
1022         }
1023         sch_tree_unlock(sch);
1024
1025         return 0;
1026 }
1027
1028 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1029 {
1030         struct netem_sched_data *q = qdisc_priv(sch);
1031         return q->qdisc;
1032 }
1033
1034 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
1035 {
1036         return 1;
1037 }
1038
/* Counterpart of netem_get(); intentionally does nothing. */
static void netem_put(struct Qdisc *sch, unsigned long arg)
{
	/* nothing to release */
}
1042
1043 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1044 {
1045         if (!walker->stop) {
1046                 if (walker->count >= walker->skip)
1047                         if (walker->fn(sch, 1, walker) < 0) {
1048                                 walker->stop = 1;
1049                                 return;
1050                         }
1051                 walker->count++;
1052         }
1053 }
1054
1055 static const struct Qdisc_class_ops netem_class_ops = {
1056         .graft          =       netem_graft,
1057         .leaf           =       netem_leaf,
1058         .get            =       netem_get,
1059         .put            =       netem_put,
1060         .walk           =       netem_walk,
1061         .dump           =       netem_dump_class,
1062 };
1063
1064 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1065         .id             =       "netem",
1066         .cl_ops         =       &netem_class_ops,
1067         .priv_size      =       sizeof(struct netem_sched_data),
1068         .enqueue        =       netem_enqueue,
1069         .dequeue        =       netem_dequeue,
1070         .peek           =       qdisc_peek_dequeued,
1071         .drop           =       netem_drop,
1072         .init           =       netem_init,
1073         .reset          =       netem_reset,
1074         .destroy        =       netem_destroy,
1075         .change         =       netem_change,
1076         .dump           =       netem_dump,
1077         .owner          =       THIS_MODULE,
1078 };
1079
1080
1081 static int __init netem_module_init(void)
1082 {
1083         pr_info("netem: version " VERSION "\n");
1084         return register_qdisc(&netem_qdisc_ops);
1085 }
1086 static void __exit netem_module_exit(void)
1087 {
1088         unregister_qdisc(&netem_qdisc_ops);
1089 }
1090 module_init(netem_module_init)
1091 module_exit(netem_module_exit)
1092 MODULE_LICENSE("GPL");