/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>

#include <net/pkt_sched.h>

#define VERSION "1.2"
/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	----------------------------------------------------------------

	This started out as a simple way to delay outgoing packets to
	test TCP but has grown to include most of the functionality
	of a full blown network emulator like NISTnet. It can delay
	packets and add random jitter (and correlation). The random
	distribution can be loaded from a table as well to provide
	normal, Pareto, or experimental curves. Packet loss,
	duplication, and reordering can also be emulated.

	This qdisc does not do classification; that can be handled by
	layering other disciplines. It does not need to do bandwidth
	control either, since that can be handled by using token
	bucket or other rate control.

	The simulator is limited by the Linux timer resolution
	and will create packet bursts on the HZ boundary (1ms).
*/
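
/* Illustrative use from user space (flag syntax per iproute2's
 * tc-netem(8) man page, not defined in this file):
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25% \
 *		loss 0.5% duplicate 1% reorder 25% 50% corrupt 0.1%
 *
 * This adds 100ms of delay with +/-10ms uniform jitter (25% correlated
 * with the previous delay), plus random loss, duplication, reordering,
 * and single-bit corruption.
 */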
struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct timer_list timer;

	psched_tdiff_t latency, jitter;
	u32 loss, limit, counter, gap;
	u32 duplicate, reorder, corrupt;

	struct crndstate {
		unsigned long last;
		unsigned long rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable {
		u32 size;
		s16 table[0];
	} *delay_dist;
};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};
/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}
/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static unsigned long get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
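
/* The update above is a fixed-point convex blend with r = rho / 2^32:
 *
 *	answer = (1 - r) * value + r * last
 *
 * computed entirely in 64-bit integer arithmetic, so a larger rho makes
 * each sample lean more heavily on the previous one.
 */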
/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static long tabledist(unsigned long mu, long sigma,
		      struct crndstate *state, const struct disttable *dist)
{
	long t, x;
	unsigned long rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
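
/* The split arithmetic above computes mu + t * sigma / NETEM_DIST_SCALE
 * without overflowing a long: the remainder and quotient of sigma are
 * scaled by t separately, and the +/- NETEM_DIST_SCALE/2 term rounds
 * the remainder part to the nearest integer instead of truncating.
 */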
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	int ret;
	int count = 1;

	pr_debug("netem_enqueue skb=%p\n", skb);

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Random packet drop 0 => none, ~0 => all */
	if (q->loss && q->loss >= get_crandom(&q->loss_cor))
		--count;

	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_BYPASS;
	}

	skb_orphan(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = sch->dev->qdisc;
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		rootq->enqueue(skb2, rootq);
		q->duplicate = dupsave;
	}
	/*
	 * Randomized packet corruption.
	 * Make a copy if needed since we are modifying the data;
	 * if the packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
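	/* The corruption below flips one random bit in a random byte of
	 * the linear skb data; skb_unshare() guarantees we are writing
	 * to a private copy.
	 */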
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (!(skb = skb_unshare(skb, GFP_ATOMIC))
		    || (skb->ip_summed == CHECKSUM_PARTIAL
			&& skb_checksum_help(skb))) {
			sch->qstats.drops++;
			return NET_XMIT_DROP;
		}

		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
	}

	cb = (struct netem_skb_cb *)skb->cb;
	if (q->gap == 0			/* not doing reordering */
	    || q->counter < q->gap	/* inside last reordering gap */
	    || q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		PSCHED_GET_TIME(now);
		PSCHED_TADD2(now, delay, cb->time_to_send);
		++q->counter;
		ret = q->qdisc->enqueue(skb, q->qdisc);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		PSCHED_GET_TIME(cb->time_to_send);
		q->counter = 0;
		ret = q->qdisc->ops->requeue(skb, q->qdisc);
	}

	if (likely(ret == NET_XMIT_SUCCESS)) {
		sch->q.qlen++;
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
	} else
		sch->qstats.drops++;

	pr_debug("netem: enqueue ret %d\n", ret);
	return ret;
}
/* Requeue packets but don't change time stamp */
static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}
static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);
	if (skb) {
		const struct netem_skb_cb *cb
			= (const struct netem_skb_cb *)skb->cb;
		psched_time_t now;

		/* is there more time remaining? */
		PSCHED_GET_TIME(now);

		if (PSCHED_TLESS(cb->time_to_send, now)) {
			pr_debug("netem_dequeue: return skb=%p\n", skb);
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		} else {
			psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now);

			if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
				sch->qstats.drops++;

				/* After this, qlen is confused */
				printk(KERN_ERR "netem: queue discipline %s could not requeue\n",
				       q->qdisc->ops->id);

				sch->q.qlen--;
			}

			mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay));
			sch->flags |= TCQ_F_THROTTLED;
		}
	}

	return NULL;
}
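
/* Because the inner tfifo keeps packets sorted by time_to_send, the
 * head of the queue is (barring requeues) the earliest deadline; if
 * even that packet is not yet due, the whole qdisc can sleep until
 * the watchdog timer fires.
 */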
static void netem_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;

	pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen);
	sch->flags &= ~TCQ_F_THROTTLED;
	netif_schedule(sch->dev);
}
static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	sch->flags &= ~TCQ_F_THROTTLED;
	del_timer_sync(&q->timer);
}
/* Pass size change message down to embedded FIFO */
static int set_fifo_limit(struct Qdisc *q, int limit)
{
	struct rtattr *rta;
	int ret = -ENOMEM;

	/* Hack to avoid sending change message to non-FIFO */
	if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
		return 0;

	rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
	if (rta) {
		rta->rta_type = RTM_NEWQDISC;
		rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
		((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

		ret = q->ops->change(q, rta);
		kfree(rta);
	}
	return ret;
}
/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
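/* In practice the table is generated off-line (iproute2 ships sample
 * distributions and a maketable utility) and handed down via the
 * TCA_NETEM_DELAY_DIST attribute; each entry is a scaled deviate t
 * consumed by tabledist() above.
 */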
static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
	const __s16 *data = RTA_DATA(attr);
	struct disttable *d;
	int i;

	if (n > 65536)
		return -EINVAL;

	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	spin_lock_bh(&sch->dev->queue_lock);
	d = xchg(&q->delay_dist, d);
	spin_unlock_bh(&sch->dev->queue_lock);

	kfree(d);
	return 0;
}
static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*c))
		return -EINVAL;

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
	return 0;
}
static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_reorder *r = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*r))
		return -EINVAL;

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
	return 0;
}
static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corrupt *r = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*r))
		return -EINVAL;

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
	return 0;
}
/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
		return -EINVAL;

	qopt = RTA_DATA(opt);
	ret = set_fifo_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_debug("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* For compatibility with earlier versions:
	 * if gap is set, need to assume 100% probability.
	 */
	if (q->gap)
		q->reorder = ~0;

	/* Handle nested options after initial queue options.
	 * Should have put all options in nested format but too late now.
	 */
	if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
		struct rtattr *tb[TCA_NETEM_MAX];
		if (rtattr_parse(tb, TCA_NETEM_MAX,
				 RTA_DATA(opt) + sizeof(*qopt),
				 RTA_PAYLOAD(opt) - sizeof(*qopt)))
			return -EINVAL;

		if (tb[TCA_NETEM_CORR-1]) {
			ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_DELAY_DIST-1]) {
			ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_REORDER-1]) {
			ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_CORRUPT-1]) {
			ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]);
			if (ret)
				return ret;
		}
	}

	return 0;
}
/*
 * Special case version of FIFO queue for use by netem.
 * It queues in order based on timestamps in skb's.
 */
struct fifo_sched_data {
	u32 limit;
};
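
/* New packets are inserted by walking the queue from the tail: send
 * times are usually non-decreasing, so the common case terminates
 * after a single comparison.
 */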
static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct sk_buff_head *list = &sch->q;
	const struct netem_skb_cb *ncb
		= (const struct netem_skb_cb *)nskb->cb;
	struct sk_buff *skb;

	if (likely(skb_queue_len(list) < q->limit)) {
		skb_queue_reverse_walk(list, skb) {
			const struct netem_skb_cb *cb
				= (const struct netem_skb_cb *)skb->cb;

			if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send))
				break;
		}

		__skb_queue_after(list, skb, nskb);

		sch->qstats.backlog += nskb->len;
		sch->bstats.bytes += nskb->len;
		sch->bstats.packets++;

		return NET_XMIT_SUCCESS;
	}

	return qdisc_drop(nskb, sch);
}
static int tfifo_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct fifo_sched_data *q = qdisc_priv(sch);

	if (opt) {
		struct tc_fifo_qopt *ctl = RTA_DATA(opt);
		if (RTA_PAYLOAD(opt) < sizeof(*ctl))
			return -EINVAL;

		q->limit = ctl->limit;
	} else
		q->limit = max_t(u32, sch->dev->tx_queue_len, 1);

	return 0;
}
static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct tc_fifo_qopt opt = { .limit = q->limit };

	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}
static struct Qdisc_ops tfifo_qdisc_ops = {
	.id		=	"tfifo",
	.priv_size	=	sizeof(struct fifo_sched_data),
	.enqueue	=	tfifo_enqueue,
	.dequeue	=	qdisc_dequeue_head,
	.requeue	=	qdisc_requeue,
	.drop		=	qdisc_queue_drop,
	.init		=	tfifo_init,
	.reset		=	qdisc_reset_queue,
	.change		=	tfifo_init,
	.dump		=	tfifo_dump,
};
static int netem_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	init_timer(&q->timer);
	q->timer.function = netem_watchdog;
	q->timer.data = (unsigned long) sch;

	q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops);
	if (!q->qdisc) {
		pr_debug("netem: qdisc create failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_debug("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}
static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	del_timer_sync(&q->timer);
	qdisc_destroy(q->qdisc);
	kfree(q->delay_dist);
}
static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb->tail;
	struct rtattr *rta = (struct rtattr *) b;
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);

	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}
static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_reset(*old);
	sch->q.qlen = 0;
	sch_tree_unlock(sch);

	return 0;
}
static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}
static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			      struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int netem_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}
static struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.change		=	netem_change_class,
	.delete		=	netem_delete,
	.walk		=	netem_walk,
	.tcf_chain	=	netem_find_tcf,
	.dump		=	netem_dump_class,
};
static struct Qdisc_ops netem_qdisc_ops = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.requeue	=	netem_requeue,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};
static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}

static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}

module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");