// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_netem.c	Network emulator
 *
 * Many of the algorithms and ideas for this came from
 * NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/prandom.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>

#define VERSION "1.3"
/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	 ----------------------------------------------------------------

	 This started out as a simple way to delay outgoing packets to
	 test TCP but has grown to include most of the functionality
	 of a full blown network emulator like NISTnet. It can delay
	 packets and add random jitter (and correlation). The random
	 distribution can be loaded from a table as well to provide
	 normal, Pareto, or experimental curves. Packet loss,
	 duplication, and reordering can also be emulated.

	 This qdisc does not do classification; that can be handled by
	 layering other disciplines. It does not need to do bandwidth
	 control either, since that can be handled by using a token
	 bucket or other rate control.

	 Correlated Loss Generator models

	 Added generation of correlated loss according to the
	 "Gilbert-Elliot" model, a 4-state Markov model.

	 References:
	 [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	 [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	 and intuitive loss model for packet networks and its implementation
	 in the Netem module in the Linux kernel", available in [1]

	 Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		  Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
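/* Example usage (added illustration; these are iproute2 "tc" commands,
 * not part of the original sources):
 *
 *	# 100ms delay, 10ms jitter, 25% correlation between successive samples
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *
 *	# additionally drop 0.3% and duplicate 1% of packets
 *	tc qdisc change dev eth0 root netem delay 100ms 10ms 25% \
 *		loss 0.3% duplicate 1%
 */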
struct disttable {
	u32  size;
	s16 table[] __counted_by(size);
};
struct netem_sched_data {
	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
	struct rb_root t_root;

	/* a linear queue; reduces rbtree rebalancing when jitter is low */
	struct sk_buff	*t_head;
	struct sk_buff	*t_tail;
	u32 t_len;

	/* optional qdisc for classful handling (NULL at netem init) */
	struct Qdisc	*qdisc;

	struct qdisc_watchdog watchdog;

	s64 latency;
	s64 jitter;
	u32 loss, ecn, limit, counter, gap;
	u32 duplicate, reorder, corrupt;
	u64 rate;
	s32 packet_overhead;
	u32 cell_size;
	struct reciprocal_value cell_size_reciprocal;
	s32 cell_overhead;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct prng {
		u64 seed;
		struct rnd_state prng_state;
	} prng;

	struct disttable *delay_dist;

	enum { CLG_RANDOM, CLG_4_STATES, CLG_GILB_ELL } loss_model;

	enum {
		TX_IN_GAP_PERIOD = 1,
		TX_IN_BURST_PERIOD,
		LOST_IN_GAP_PERIOD,
		LOST_IN_BURST_PERIOD,
	} _4_state_model;

	enum { GOOD_STATE = 1, BAD_STATE } GE_state_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5;	/* p23 used only in 4-states */
	} clg;

	struct tc_netem_slot slot_config;
	struct slotstate {
		u64 slot_next;
		s32 packets_left;
		s32 bytes_left;
	} slot;

	struct disttable *slot_dist;
};
/* Time stamp put into socket buffer control block
 * Only valid when skbs are in our internal t(ime)fifo queue.
 *
 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
 * and skb->next & skb->prev are scratch space for a qdisc,
 * we save skb->tstamp value in skb->cb[] before destroying it.
 */
struct netem_skb_cb {
	u64	        time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}
/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = get_random_u32();
}
/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state, struct prng *p)
{
	u64 value, rho;
	unsigned long answer;
	struct rnd_state *s = &p->prng_state;

	if (!state || state->rho == 0)	/* no correlation */
		return prandom_u32_state(s);

	value = prandom_u32_state(s);
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
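/* Worked example (added note, not original kernel text): with
 * rho == 0x80000000 (roughly 50% correlation) the update above becomes
 *	answer = (value * (2^32 - rho) + last * rho) >> 32 ~= value/2 + last/2
 * so each output is an exponentially weighted blend of a fresh
 * pseudo-random draw and the previous output, giving correlated samples.
 */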
/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = prandom_u32_state(&q->prng.prng_state);

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and if the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
	 *   LOST_IN_GAP_PERIOD => isolated losses within a gap period
	 *   LOST_IN_BURST_PERIOD => lost packets within a burst period
	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case TX_IN_GAP_PERIOD:
		if (rnd < clg->a4) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else if (clg->a1 + clg->a4 < rnd) {
			clg->state = TX_IN_GAP_PERIOD;
		}

		break;
	case TX_IN_BURST_PERIOD:
		if (rnd < clg->a5) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else {
			clg->state = TX_IN_BURST_PERIOD;
		}

		break;
	case LOST_IN_BURST_PERIOD:
		if (rnd < clg->a3)
			clg->state = TX_IN_BURST_PERIOD;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = TX_IN_GAP_PERIOD;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		}
		break;
	case LOST_IN_GAP_PERIOD:
		clg->state = TX_IN_GAP_PERIOD;
		break;
	}

	return false;
}
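/* Added note: a1..a5 hold the transition probabilities p13, p31, p32,
 * p14 and p23 scaled to the u32 range, with state 1 = TX_IN_GAP_PERIOD,
 * 2 = TX_IN_BURST_PERIOD, 3 = LOST_IN_BURST_PERIOD, 4 = LOST_IN_GAP_PERIOD.
 * For example, small p13/p14 combined with a large p31 keeps the chain
 * almost always in the gap period, producing rare and short loss bursts.
 */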
/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	struct rnd_state *s = &q->prng.prng_state;

	switch (clg->state) {
	case GOOD_STATE:
		if (prandom_u32_state(s) < clg->a1)
			clg->state = BAD_STATE;
		if (prandom_u32_state(s) < clg->a4)
			return true;
		break;
	case BAD_STATE:
		if (prandom_u32_state(s) < clg->a2)
			clg->state = GOOD_STATE;
		if (prandom_u32_state(s) > clg->a3)
			return true;
	}

	return false;
}
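/* Added note: here a1 = p (good->bad transition), a2 = r (bad->good),
 * a3 = h (transmit probability while in the bad state) and a4 = 1-k
 * (loss probability while in the good state).  The classical "Simple
 * Gilbert" special case corresponds to k = 1 (no loss in the good
 * state) and h = 0 (every packet lost in the bad state).
 */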
static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng);

	case CLG_4_STATES:
		/* 4-state loss model algorithm (used also for the GI model):
		 * extracts a value from the 4-state Markov loss generator;
		 * if it is 1, a packet is dropped.
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm:
		 * extracts a value from the Gilbert-Elliot loss generator;
		 * if it is 1, a packet is dropped.
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}
/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static s64 tabledist(s64 mu, s32 sigma,
		     struct crndstate *state,
		     struct prng *prng,
		     const struct disttable *dist)
{
	s64 x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state, prng);

	/* default uniform distribution */
	if (dist == NULL)
		return ((rnd % (2 * (u32)sigma)) + mu) - sigma;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
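/* Worked example (added note): with no distribution table the result is
 * uniform in [mu - sigma, mu + sigma).  With a table, t is drawn from
 * dist->table[] (values pre-scaled by NETEM_DIST_SCALE) and the result
 * is approximately
 *	mu + (sigma * t) / NETEM_DIST_SCALE
 * computed in two parts above to avoid overflow and keep the rounding
 * symmetric around zero.
 */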
/* rate is in units of bytes per second */
static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
{
	len += q->packet_overhead;

	if (q->cell_size) {
		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);

		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
			cells++;
		len = cells * (q->cell_size + q->cell_overhead);
	}

	return div64_u64(len * NSEC_PER_SEC, q->rate);
}
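/* Numeric example (added note): rate = 125000 bytes/s (1 Mbit/s),
 * len = 1500, packet_overhead = 0 and no cell framing gives
 *	packet_time_ns = 1500 * 1e9 / 125000 = 12,000,000 ns = 12 ms,
 * i.e. the serialization delay added on top of any configured
 * latency/jitter.
 */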
static void tfifo_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct rb_node *p = rb_first(&q->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &q->t_root);
		rtnl_kfree_skbs(skb, skb);
	}

	rtnl_kfree_skbs(q->t_head, q->t_tail);
	q->t_head = NULL;
	q->t_tail = NULL;
	q->t_len = 0;
}
static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	u64 tnext = netem_skb_cb(nskb)->time_to_send;

	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
		if (q->t_tail)
			q->t_tail->next = nskb;
		else
			q->t_head = nskb;
		q->t_tail = nskb;
	} else {
		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;

		while (*p) {
			struct sk_buff *skb;

			parent = *p;
			skb = rb_to_skb(parent);
			if (tnext >= netem_skb_cb(skb)->time_to_send)
				p = &parent->rb_right;
			else
				p = &parent->rb_left;
		}
		rb_link_node(&nskb->rbnode, parent, p);
		rb_insert_color(&nskb->rbnode, &q->t_root);
	}
	q->t_len++;
	sch->q.qlen++;
}
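/* Added note: appending via t_head/t_tail is O(1) and covers the common
 * case of zero or low jitter, where packets already arrive in
 * non-decreasing time_to_send order; only packets that would violate
 * that order fall back to the O(log n) rbtree insertion above.
 */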
/* netem can't properly corrupt a megapacket (like we get from GSO), so when
 * we statistically choose to corrupt one, we instead segment it, returning
 * the first packet to be corrupted, and re-enqueue the remaining frames
 */
static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
				     struct sk_buff **to_free)
{
	struct sk_buff *segs;
	netdev_features_t features = netif_skb_features(skb);

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs)) {
		qdisc_drop(skb, sch, to_free);
		return NULL;
	}
	consume_skb(skb);
	return segs;
}
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2 = NULL;
	struct sk_buff *segs = NULL;
	unsigned int prev_len = qdisc_pkt_len(skb);
	int count = 1;

	/* Do not fool qdisc_drop_all() */
	skb->prev = NULL;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng))
		++count;

	/* Drop packet? */
	if (loss_event(q)) {
		if (q->ecn && INET_ECN_set_ce(skb))
			qdisc_qstats_drop(sch); /* mark packet */
		else
			--count;
	}
	if (count == 0) {
		qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 */
	if (q->latency || q->jitter || q->rate)
		skb_orphan_partial(skb);

	/*
	 * If we need to duplicate packet, then clone it before
	 * original is modified.
	 */
	if (count > 1)
		skb2 = skb_clone(skb, GFP_ATOMIC);

	/*
	 * Randomized packet corruption.
	 * Make copy if needed since we are modifying.
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) {
		if (skb_is_gso(skb)) {
			skb = netem_segment(skb, sch, to_free);
			if (!skb)
				goto finish_segs;

			segs = skb->next;
			skb_mark_not_on_list(skb);
			qdisc_skb_cb(skb)->pkt_len = skb->len;
		}

		skb = skb_unshare(skb, GFP_ATOMIC);
		if (unlikely(!skb)) {
			qdisc_qstats_drop(sch);
			goto finish_segs;
		}
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    skb_checksum_help(skb)) {
			qdisc_drop(skb, sch, to_free);
			skb = NULL;
			goto finish_segs;
		}

		skb->data[get_random_u32_below(skb_headlen(skb))] ^=
			1<<get_random_u32_below(8);
	}

	if (unlikely(q->t_len >= sch->limit)) {
		/* re-link segs, so that qdisc_drop_all() frees them all */
		skb->next = segs;
		qdisc_drop_all(skb, sch, to_free);
		if (skb2)
			__qdisc_drop(skb2, to_free);
		return NET_XMIT_DROP;
	}

	/*
	 * If doing duplication then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (skb2) {
		struct Qdisc *rootq = qdisc_root_bh(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */

		q->duplicate = 0;
		rootq->enqueue(skb2, rootq, to_free);
		q->duplicate = dupsave;
		skb2 = NULL;
	}

	qdisc_qstats_backlog_inc(sch, skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor, &q->prng)) {
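		/* Reordering example (added note): with "reorder 25% gap 5",
		 * at least q->gap - 1 packets after a reordered one always
		 * take this delayed path; each later packet is instead sent
		 * immediately (see the else branch below) with probability
		 * ~25%, letting it overtake the delayed packets, after which
		 * the counter restarts.
		 */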
		u64 now;
		s64 delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, &q->prng, q->delay_dist);

		now = ktime_get_ns();

		if (q->rate) {
			struct netem_skb_cb *last = NULL;

			if (sch->q.tail)
				last = netem_skb_cb(sch->q.tail);
			if (q->t_root.rb_node) {
				struct sk_buff *t_skb;
				struct netem_skb_cb *t_last;

				t_skb = skb_rb_last(&q->t_root);
				t_last = netem_skb_cb(t_skb);
				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}
			if (q->t_tail) {
				struct netem_skb_cb *t_last =
					netem_skb_cb(q->t_tail);

				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}

			if (last) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= last->time_to_send - now;
				delay = max_t(s64, 0, delay);
				now = last->time_to_send;
			}

			delay += packet_time_ns(qdisc_pkt_len(skb), q);
		}

		cb->time_to_send = now + delay;
		++q->counter;
		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = ktime_get_ns();
		q->counter = 0;

		__qdisc_enqueue_head(skb, &sch->q);
		sch->qstats.requeues++;
	}

finish_segs:
	if (skb2)
		__qdisc_drop(skb2, to_free);

	if (segs) {
		unsigned int len, last_len;
		int rc, nb;

		len = skb ? skb->len : 0;
		nb = skb ? 1 : 0;

		while (segs) {
			struct sk_buff *next = segs->next;

			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			last_len = segs->len;
			rc = qdisc_enqueue(segs, sch, to_free);
			if (rc != NET_XMIT_SUCCESS) {
				if (net_xmit_drop_count(rc))
					qdisc_qstats_drop(sch);
			} else {
				nb++;
				len += last_len;
			}
			segs = next;
		}
		/* Parent qdiscs accounted for 1 skb of size @prev_len */
		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
	} else if (!skb) {
		return NET_XMIT_DROP;
	}

	return NET_XMIT_SUCCESS;
}
/* Delay the next round with a new future slot with a
 * correct number of bytes and packets.
 */
static void get_slot_next(struct netem_sched_data *q, u64 now)
{
	s64 next_delay;

	if (!q->slot_dist)
		next_delay = q->slot_config.min_delay +
				(get_random_u32() *
				 (q->slot_config.max_delay -
				  q->slot_config.min_delay) >> 32);
	else
		next_delay = tabledist(q->slot_config.dist_delay,
				       (s32)(q->slot_config.dist_jitter),
				       NULL, &q->prng, q->slot_dist);

	q->slot.slot_next = now + next_delay;
	q->slot.packets_left = q->slot_config.max_packets;
	q->slot.bytes_left = q->slot_config.max_bytes;
}
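/* Added note (interpretation, not original kernel text): slotting models
 * links that deliver packets in bursts, e.g. Wi-Fi frame aggregation or
 * request/grant style MACs: nothing is released before slot_next, then
 * up to max_packets and max_bytes are dequeued before the next slot is
 * scheduled by get_slot_next().
 */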
static struct sk_buff *netem_peek(struct netem_sched_data *q)
{
	struct sk_buff *skb = skb_rb_first(&q->t_root);
	u64 t1, t2;

	if (!skb)
		return q->t_head;
	if (!q->t_head)
		return skb;

	t1 = netem_skb_cb(skb)->time_to_send;
	t2 = netem_skb_cb(q->t_head)->time_to_send;
	if (t1 < t2)
		return skb;
	return q->t_head;
}
static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
{
	if (skb == q->t_head) {
		q->t_head = skb->next;
		if (!q->t_head)
			q->t_tail = NULL;
	} else {
		rb_erase(&skb->rbnode, &q->t_root);
	}
}
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

tfifo_dequeue:
	skb = __qdisc_dequeue_head(&sch->q);
	if (skb) {
deliver:
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_bstats_update(sch, skb);
		return skb;
	}
	skb = netem_peek(q);
	if (skb) {
		u64 time_to_send;
		u64 now = ktime_get_ns();

		/* if more time remaining? */
		time_to_send = netem_skb_cb(skb)->time_to_send;
		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
			get_slot_next(q, now);

		if (time_to_send <= now && q->slot.slot_next <= now) {
			netem_erase_head(q, skb);
			q->t_len--;
			skb->next = NULL;
			skb->prev = NULL;
			/* skb->dev shares skb->rbnode area,
			 * we need to restore its value.
			 */
			skb->dev = qdisc_dev(sch);

			if (q->slot.slot_next) {
				q->slot.packets_left--;
				q->slot.bytes_left -= qdisc_pkt_len(skb);
				if (q->slot.packets_left <= 0 ||
				    q->slot.bytes_left <= 0)
					get_slot_next(q, now);
			}

			if (q->qdisc) {
				unsigned int pkt_len = qdisc_pkt_len(skb);
				struct sk_buff *to_free = NULL;
				int err;

				err = qdisc_enqueue(skb, q->qdisc, &to_free);
				kfree_skb_list(to_free);
				if (err != NET_XMIT_SUCCESS) {
					if (net_xmit_drop_count(err))
						qdisc_qstats_drop(sch);
					qdisc_tree_reduce_backlog(sch, 1, pkt_len);
					sch->qstats.backlog -= pkt_len;
					sch->q.qlen--;
				}
				goto tfifo_dequeue;
			}
			sch->q.qlen--;
			goto deliver;
		}

		if (q->qdisc) {
			skb = q->qdisc->ops->dequeue(q->qdisc);
			if (skb) {
				sch->q.qlen--;
				goto deliver;
			}
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   max(time_to_send,
					       q->slot.slot_next));
	}

	if (q->qdisc) {
		skb = q->qdisc->ops->dequeue(q->qdisc);
		if (skb) {
			sch->q.qlen--;
			goto deliver;
		}
	}
	return NULL;
}
static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	tfifo_reset(sch);
	if (q->qdisc)
		qdisc_reset(q->qdisc);
	qdisc_watchdog_cancel(&q->watchdog);
}
static void dist_free(struct disttable *d)
{
	kvfree(d);
}
/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */

static int get_dist_table(struct disttable **tbl, const struct nlattr *attr)
{
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	struct disttable *d;
	size_t i;

	if (!n || n > NETEM_DIST_MAX)
		return -EINVAL;

	d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	*tbl = d;
	return 0;
}
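/* Added note (assumption about the userspace side): the table is
 * normally generated by iproute2 from measured delay samples; each
 * entry is effectively an inverse-CDF sample scaled by NETEM_DIST_SCALE,
 * which is how tabledist() maps a uniform random index onto the
 * configured (e.g. normal or pareto) delay distribution.
 */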
static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_slot *c = nla_data(attr);

	q->slot_config = *c;
	if (q->slot_config.max_packets == 0)
		q->slot_config.max_packets = INT_MAX;
	if (q->slot_config.max_bytes == 0)
		q->slot_config.max_bytes = INT_MAX;

	/* capping dist_jitter to the range acceptable by tabledist() */
	q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));

	q->slot.packets_left = q->slot_config.max_packets;
	q->slot.bytes_left = q->slot_config.max_bytes;
	if (q->slot_config.min_delay | q->slot_config.max_delay |
	    q->slot_config.dist_jitter)
		q->slot.slot_next = ktime_get_ns();
	else
		q->slot.slot_next = 0;
}
static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}
static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}
static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}
static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_rate *r = nla_data(attr);

	q->rate = r->rate;
	q->packet_overhead = r->packet_overhead;
	q->cell_size = r->cell_size;
	q->cell_overhead = r->cell_overhead;
	if (q->cell_size)
		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
	else
		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
}
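/* Added note (illustrative assumption): cell_size/cell_overhead model
 * links that carry data in fixed-size cells, e.g. ATM-like framing with
 * cell_size 48 and cell_overhead 5: a 100 byte packet then occupies
 * DIV_ROUND_UP(100, 48) = 3 cells and is charged 3 * (48 + 5) = 159
 * bytes against the configured rate in packet_time_ns().
 */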
static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = TX_IN_GAP_PERIOD;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = GOOD_STATE;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}
static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
	[TCA_NETEM_PRNG_SEED]	= { .type = NLA_U64 },
};
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse_deprecated(tb, maxtype,
					    nla_data(nla) + NLA_ALIGN(len),
					    nested_len, policy, NULL);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}
/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct disttable *delay_dist = NULL;
	struct disttable *slot_dist = NULL;
	struct tc_netem_qopt *qopt;
	struct clgstate old_clg;
	int old_loss_model = CLG_RANDOM;
	int ret;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			goto table_free;
	}
	if (tb[TCA_NETEM_SLOT_DIST]) {
		ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]);
		if (ret)
			goto table_free;
	}

	sch_tree_lock(sch);
	/* backup q->clg and q->loss_model */
	old_clg = q->clg;
	old_loss_model = q->loss_model;

	if (tb[TCA_NETEM_LOSS]) {
		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
		if (ret) {
			q->loss_model = old_loss_model;
			q->clg = old_clg;
			goto unlock;
		}
	} else {
		q->loss_model = CLG_RANDOM;
		memset(&q->clg, 0, sizeof(q->clg));
	}

	if (delay_dist)
		swap(q->delay_dist, delay_dist);
	if (slot_dist)
		swap(q->slot_dist, slot_dist);
	sch->limit = qopt->limit;

	q->latency = PSCHED_TICKS2NS(qopt->latency);
	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(q, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_REORDER])
		get_reorder(q, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);

	if (tb[TCA_NETEM_RATE])
		get_rate(q, tb[TCA_NETEM_RATE]);

	if (tb[TCA_NETEM_RATE64])
		q->rate = max_t(u64, q->rate,
				nla_get_u64(tb[TCA_NETEM_RATE64]));

	if (tb[TCA_NETEM_LATENCY64])
		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);

	if (tb[TCA_NETEM_JITTER64])
		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);

	if (tb[TCA_NETEM_ECN])
		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);

	if (tb[TCA_NETEM_SLOT])
		get_slot(q, tb[TCA_NETEM_SLOT]);

	/* capping jitter to the range acceptable by tabledist() */
	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);

	if (tb[TCA_NETEM_PRNG_SEED])
		q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]);
	else
		q->prng.seed = get_random_u64();
	prandom_seed_state(&q->prng.prng_state, q->prng.seed);

unlock:
	sch_tree_unlock(sch);

table_free:
	dist_free(delay_dist);
	dist_free(slot_dist);
	return ret;
}
static int netem_init(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	qdisc_watchdog_init(&q->watchdog, sch);

	if (!opt)
		return -EINVAL;

	q->loss_model = CLG_RANDOM;
	ret = netem_change(sch, opt, extack);
	if (ret)
		pr_info("netem: change failed\n");
	return ret;
}
static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	if (q->qdisc)
		qdisc_put(q->qdisc);
	dist_free(q->delay_dist);
	dist_free(q->slot_dist);
}
static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
			goto nla_put_failure;
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
			goto nla_put_failure;
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;
	struct tc_netem_rate rate;
	struct tc_netem_slot slot;

	qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
			     UINT_MAX);
	qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
			    UINT_MAX);
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
		goto nla_put_failure;

	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
		goto nla_put_failure;

	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
		goto nla_put_failure;

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
		goto nla_put_failure;

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
		goto nla_put_failure;

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
		goto nla_put_failure;

	if (q->rate >= (1ULL << 32)) {
		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
				      TCA_NETEM_PAD))
			goto nla_put_failure;
		rate.rate = ~0U;
	} else {
		rate.rate = q->rate;
	}
	rate.packet_overhead = q->packet_overhead;
	rate.cell_size = q->cell_size;
	rate.cell_overhead = q->cell_overhead;
	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
		goto nla_put_failure;

	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
		goto nla_put_failure;

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	if (q->slot_config.min_delay | q->slot_config.max_delay |
	    q->slot_config.dist_jitter) {
		slot = q->slot_config;
		if (slot.max_packets == INT_MAX)
			slot.max_packets = 0;
		if (slot.max_bytes == INT_MAX)
			slot.max_bytes = 0;
		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
			goto nla_put_failure;
	}

	if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed,
			      TCA_NETEM_PAD))
		goto nla_put_failure;

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}
static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1 || !q->qdisc) 	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}
static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}
static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}
static unsigned long netem_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}
static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (!tc_qdisc_stats_dump(sch, 1, walker))
			return;
	}
}
static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.find		=	netem_find,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};
static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("netem");
static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Network characteristics emulator qdisc");