// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
 *
 *  Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
 *
 *  Meant to be mostly used for locally generated traffic :
 *  Fast classification depends on skb->sk being set before reaching us.
 *  If not (router workload), we use rxhash as fallback, with 32 bits wide hash.
 *  All packets belonging to a socket are considered as a 'flow'.
 *
 *  Flows are dynamically allocated and stored in a hash table of RB trees.
 *  They are also part of one Round Robin 'queues' (new or old flows).
 *
 *  Burst avoidance (aka pacing) capability :
 *
 *  Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 *  bunch of packets, and this packet scheduler adds delay between
 *  packets to respect rate limitation.
 *
 *  enqueue() :
 *   - lookup one RB tree (out of 1024 or more) to find the flow.
 *     If non existent flow, create it, add it to the tree.
 *     Add skb to the per flow list of skb (fifo).
 *   - Use a special fifo for high prio packets
 *
 *  dequeue() : serves flows in Round Robin
 *  Note : When a flow becomes empty, we do not immediately remove it from
 *  rb trees, for performance reasons (it's expected to send additional packets,
 *  or the SLAB cache will reuse the socket for another flow).
 */
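/* Illustrative usage sketch (editor's addition, not part of the original file;
 * the device name and parameter values are examples only):
 *
 *	tc qdisc replace dev eth0 root fq limit 10000 flow_limit 100 \
 *		quantum 3028 initial_quantum 15140 maxrate 1gbit
 *
 * With pacing enabled (the default), each flow is released at
 * sk->sk_pacing_rate, optionally capped by "maxrate".
 */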
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/tcp.h>
struct fq_skb_cb {
	u64 time_to_send;
};

static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
}
/*
 * Per flow structure, dynamically allocated.
 * If packets have monotonically increasing time_to_send, they are placed in O(1)
 * in a linear list (head, tail), otherwise they are placed in a rbtree (t_root).
 */
struct fq_flow {
/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
	struct rb_root	t_root;
	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
	};
	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
	struct sock	*sk;
	u32		socket_hash;	/* sk_hash */
	int		qlen;		/* number of packets in flow queue */

/* Second cache line, used in fq_dequeue() */
	int		credit;
	/* 32bit hole on 64bit arches */

	struct fq_flow	*next;		/* next pointer in RR lists */

	struct rb_node	rate_node;	/* anchor in q->delayed tree */
	u64		time_next_packet;
} ____cacheline_aligned_in_smp;
struct fq_flow_head {
	struct fq_flow *first;
	struct fq_flow *last;
};
struct fq_sched_data {
	struct fq_flow_head new_flows;

	struct fq_flow_head old_flows;

	struct rb_root	delayed;	/* for rate limited flows */
	u64		time_next_delayed_flow;
	u64		ktime_cache;	/* copy of last ktime_get_ns() */
	unsigned long	unthrottle_latency_ns;

	struct fq_flow	internal;	/* for non classified or high prio packets */
	u32		quantum;
	u32		initial_quantum;
	u32		flow_refill_delay;
	u32		flow_plimit;	/* max packets per flow */
	unsigned long	flow_max_rate;	/* optional max rate per flow */
	u64		ce_threshold;
	u64		horizon;	/* horizon in ns */
	u32		orphan_mask;	/* mask for orphaned skb */
	u32		low_rate_threshold;
	struct rb_root	*fq_root;
	u8		rate_enable;
	u8		fq_trees_log;
	u8		horizon_drop;

	u32		flows;
	u32		inactive_flows;
	u32		throttled_flows;

	u64		stat_gc_flows;
	u64		stat_internal_packets;
	u64		stat_throttled;
	u64		stat_ce_mark;
	u64		stat_horizon_drops;
	u64		stat_horizon_caps;
	u64		stat_flows_plimit;
	u64		stat_pkts_too_long;
	u64		stat_allocation_errors;

	u32		timer_slack;	/* hrtimer slack in ns */
	struct qdisc_watchdog watchdog;
};
/*
 * f->tail and f->age share the same location.
 * We can use the low order bit to differentiate if this location points
 * to a sk_buff or contains a jiffies value, if we force this value to be odd.
 * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
 */
static void fq_flow_set_detached(struct fq_flow *f)
{
	f->age = jiffies | 1UL;
}

static bool fq_flow_is_detached(const struct fq_flow *f)
{
	return !!(f->age & 1UL);
}
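/* Editor's note (illustrative, not from the original source): if a flow is
 * emptied when jiffies == 0x1000, the shared word holds 0x1001. A real
 * f->tail pointer always has bit 0 clear because sk_buff is at least
 * 2-byte aligned, so testing that bit is enough to tell the two apart.
 */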
/* special value to mark a throttled flow (not on old/new list) */
static struct fq_flow throttled;

static bool fq_flow_is_throttled(const struct fq_flow *f)
{
	return f->next == &throttled;
}
static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
{
	if (head->first)
		head->last->next = flow;
	else
		head->first = flow;
	head->last = flow;
	flow->next = NULL;
}
static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	rb_erase(&f->rate_node, &q->delayed);
	q->throttled_flows--;
	fq_flow_add_tail(&q->old_flows, f);
}
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;

	while (*p) {
		struct fq_flow *aux;

		parent = *p;
		aux = rb_entry(parent, struct fq_flow, rate_node);
		if (f->time_next_packet >= aux->time_next_packet)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&f->rate_node, parent, p);
	rb_insert_color(&f->rate_node, &q->delayed);
	q->throttled_flows++;
	q->stat_throttled++;

	f->next = &throttled;
	if (q->time_next_delayed_flow > f->time_next_packet)
		q->time_next_delayed_flow = f->time_next_packet;
}
static struct kmem_cache *fq_flow_cachep __read_mostly;
/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)

static bool fq_gc_candidate(const struct fq_flow *f)
{
	return fq_flow_is_detached(f) &&
	       time_after(jiffies, f->age + FQ_GC_AGE);
}
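/* Editor's note (illustrative): with HZ=1000, FQ_GC_AGE is 3000 jiffies, so a
 * detached flow only becomes a gc candidate after roughly 3 seconds of
 * inactivity, and at most FQ_GC_MAX flows are reclaimed per fq_gc() call.
 */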
static void fq_gc(struct fq_sched_data *q,
		  struct rb_root *root,
		  struct sock *sk)
{
	struct rb_node **p, *parent;
	void *tofree[FQ_GC_MAX];
	struct fq_flow *f;
	int i, fcnt = 0;

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk)
			break;

		if (fq_gc_candidate(f)) {
			tofree[fcnt++] = f;
			if (fcnt == FQ_GC_MAX)
				break;
		}

		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	if (!fcnt)
		return;

	for (i = fcnt; i > 0; ) {
		f = tofree[--i];
		rb_erase(&f->fq_node, root);
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;

	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}
static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
{
	struct rb_node **p, *parent;
	struct sock *sk = skb->sk;
	struct rb_root *root;
	struct fq_flow *f;

	/* warning: no starvation prevention... */
	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
		return &q->internal;

	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
	 * or a listener (SYNCOOKIE mode)
	 * 1) request sockets are not full blown,
	 *    they do not contain sk_pacing_rate
	 * 2) They are not part of a 'flow' yet
	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
	 *    especially if the listener set SO_MAX_PACING_RATE
	 * 4) We pretend they are orphaned
	 */
	if (!sk || sk_listener(sk)) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;

		/* By forcing low order bit to 1, we make sure to not
		 * collide with a local flow (socket pointers are word aligned)
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
		skb_orphan(skb);
	} else if (sk->sk_state == TCP_CLOSE) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
		/*
		 * Sockets in TCP_CLOSE are non connected.
		 * Typical use case is UDP sockets, they can send packets
		 * with sendto() to many different destinations.
		 * We probably could use a generic bit advertising
		 * non connected sockets, instead of sk_state == TCP_CLOSE,
		 * if we care enough.
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
	}

	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];

	if (q->flows >= (2U << q->fq_trees_log) &&
	    q->inactive_flows > q->flows/2)
		fq_gc(q, root, sk);

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk) {
			/* socket might have been reallocated, so check
			 * if its sk_hash is the same.
			 * If not, we need to refill credit with
			 * initial quantum
			 */
			if (unlikely(skb->sk == sk &&
				     f->socket_hash != sk->sk_hash)) {
				f->credit = q->initial_quantum;
				f->socket_hash = sk->sk_hash;
				if (q->rate_enable)
					smp_store_release(&sk->sk_pacing_status,
							  SK_PACING_FQ);
				if (fq_flow_is_throttled(f))
					fq_flow_unset_throttled(q, f);
				f->time_next_packet = 0ULL;
			}
			return f;
		}
		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!f)) {
		q->stat_allocation_errors++;
		return &q->internal;
	}
	/* f->t_root is already zeroed after kmem_cache_zalloc() */

	fq_flow_set_detached(f);
	f->sk = sk;
	if (skb->sk == sk) {
		f->socket_hash = sk->sk_hash;
		if (q->rate_enable)
			smp_store_release(&sk->sk_pacing_status,
					  SK_PACING_FQ);
	}
	f->credit = q->initial_quantum;

	rb_link_node(&f->fq_node, parent, p);
	rb_insert_color(&f->fq_node, root);

	q->flows++;
	q->inactive_flows++;
	return f;
}
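/* Editor's worked example (illustrative values, not from the original code):
 * with the default orphan_mask of 1023, an unattached skb whose
 * skb_get_hash() returns 0x12345678 gets hash = 0x12345678 & 0x3ff = 0x278,
 * and the fake socket key becomes (0x278 << 1) | 1 = 0x4f1. The forced odd
 * value cannot collide with a real, word-aligned struct sock pointer.
 */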
static struct sk_buff *fq_peek(struct fq_flow *flow)
{
	struct sk_buff *skb = skb_rb_first(&flow->t_root);
	struct sk_buff *head = flow->head;

	if (!skb)
		return head;

	if (!head)
		return skb;

	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
		return skb;
	return head;
}
static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
			  struct sk_buff *skb)
{
	if (skb == flow->head) {
		flow->head = skb->next;
	} else {
		rb_erase(&skb->rbnode, &flow->t_root);
		skb->dev = qdisc_dev(sch);
	}
}
/* Remove one skb from flow queue.
 * This skb must be the return value of prior fq_peek().
 */
static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
			   struct sk_buff *skb)
{
	fq_erase_head(sch, flow, skb);
	skb_mark_not_on_list(skb);
	flow->qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;
}
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
	struct rb_node **p, *parent;
	struct sk_buff *head, *aux;

	head = flow->head;
	if (!head ||
	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
		if (!head)
			flow->head = skb;
		else
			flow->tail->next = skb;
		flow->tail = skb;
		skb->next = NULL;
		return;
	}

	p = &flow->t_root.rb_node;
	parent = NULL;

	while (*p) {
		parent = *p;
		aux = rb_to_skb(parent);
		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &flow->t_root);
}
static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
				     const struct fq_sched_data *q)
{
	return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
}
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		      struct sk_buff **to_free)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow *f;

	if (unlikely(sch->q.qlen >= sch->limit))
		return qdisc_drop(skb, sch, to_free);

	if (!skb->tstamp) {
		fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
	} else {
		/* Check if packet timestamp is too far in the future.
		 * Try first our cached value, to avoid ktime_get_ns()
		 * cost in most cases.
		 */
		if (fq_packet_beyond_horizon(skb, q)) {
			/* Refresh our cache and check another time */
			q->ktime_cache = ktime_get_ns();
			if (fq_packet_beyond_horizon(skb, q)) {
				if (q->horizon_drop) {
					q->stat_horizon_drops++;
					return qdisc_drop(skb, sch, to_free);
				}
				q->stat_horizon_caps++;
				skb->tstamp = q->ktime_cache + q->horizon;
			}
		}
		fq_skb_cb(skb)->time_to_send = skb->tstamp;
	}

	f = fq_classify(skb, q);
	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
		q->stat_flows_plimit++;
		return qdisc_drop(skb, sch, to_free);
	}

	f->qlen++;
	qdisc_qstats_backlog_inc(sch, skb);
	if (fq_flow_is_detached(f)) {
		fq_flow_add_tail(&q->new_flows, f);
		if (time_after(jiffies, f->age + q->flow_refill_delay))
			f->credit = max_t(u32, f->credit, q->quantum);
		q->inactive_flows--;
	}

	/* Note: this overwrites f->age */
	flow_queue_add(f, skb);

	if (unlikely(f == &q->internal)) {
		q->stat_internal_packets++;
	}
	sch->q.qlen++;

	return NET_XMIT_SUCCESS;
}
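/* Editor's note (illustrative): with the default 10 second horizon and
 * horizon_drop enabled, an skb carrying an EDT timestamp 30 seconds in the
 * future is dropped at enqueue time; with horizon_drop disabled it is instead
 * capped to now + 10 seconds and stat_horizon_caps is incremented.
 */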
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
	unsigned long sample;
	struct rb_node *p;

	if (q->time_next_delayed_flow > now)
		return;

	/* Update unthrottle latency EWMA.
	 * This is cheap and can help diagnosing timer/latency problems.
	 */
	sample = (unsigned long)(now - q->time_next_delayed_flow);
	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
	q->unthrottle_latency_ns += sample >> 3;

	q->time_next_delayed_flow = ~0ULL;
	while ((p = rb_first(&q->delayed)) != NULL) {
		struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);

		if (f->time_next_packet > now) {
			q->time_next_delayed_flow = f->time_next_packet;
			break;
		}
		fq_flow_unset_throttled(q, f);
	}
}
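/* Editor's worked example (illustrative numbers): unthrottle_latency_ns is an
 * EWMA with gain 1/8, new = old - old/8 + sample/8. If the estimate is 40 us
 * and the watchdog fires 80 us late, it becomes 40000 - 5000 + 10000 = 45000 ns.
 */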
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow_head *head;
	struct sk_buff *skb;
	struct fq_flow *f;
	unsigned long rate;
	u32 plen;
	u64 now;

	if (!sch->q.qlen)
		return NULL;

	skb = fq_peek(&q->internal);
	if (unlikely(skb)) {
		fq_dequeue_skb(sch, &q->internal, skb);
		goto out;
	}

	q->ktime_cache = now = ktime_get_ns();
	fq_check_throttled(q, now);
begin:
	head = &q->new_flows;
	if (!head->first) {
		head = &q->old_flows;
		if (!head->first) {
			if (q->time_next_delayed_flow != ~0ULL)
				qdisc_watchdog_schedule_range_ns(&q->watchdog,
							q->time_next_delayed_flow,
							q->timer_slack);
			return NULL;
		}
	}
	f = head->first;

	if (f->credit <= 0) {
		f->credit += q->quantum;
		head->first = f->next;
		fq_flow_add_tail(&q->old_flows, f);
		goto begin;
	}

	skb = fq_peek(f);
	if (skb) {
		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
					     f->time_next_packet);

		if (now < time_next_packet) {
			head->first = f->next;
			f->time_next_packet = time_next_packet;
			fq_flow_set_throttled(q, f);
			goto begin;
		}
		prefetch(&skb->end);
		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
			INET_ECN_set_ce(skb);
			q->stat_ce_mark++;
		}
		fq_dequeue_skb(sch, f, skb);
	} else {
		head->first = f->next;
		/* force a pass through old_flows to prevent starvation */
		if ((head == &q->new_flows) && q->old_flows.first) {
			fq_flow_add_tail(&q->old_flows, f);
		} else {
			fq_flow_set_detached(f);
			q->inactive_flows++;
		}
		goto begin;
	}
	plen = qdisc_pkt_len(skb);
	f->credit -= plen;

	if (!q->rate_enable)
		goto out;

	rate = q->flow_max_rate;

	/* If EDT time was provided for this skb, we need to
	 * update f->time_next_packet only if this qdisc enforces
	 * a flow max rate.
	 */
	if (!skb->tstamp) {
		if (skb->sk)
			rate = min(skb->sk->sk_pacing_rate, rate);

		if (rate <= q->low_rate_threshold) {
			f->credit = 0;
		} else {
			plen = max(plen, q->quantum);
			if (f->credit > 0)
				goto out;
		}
	}
	if (rate != ~0UL) {
		u64 len = (u64)plen * NSEC_PER_SEC;

		if (likely(rate))
			len = div64_ul(len, rate);
		/* Since socket rate can change later,
		 * clamp the delay to 1 second.
		 * Really, providers of too big packets should be fixed !
		 */
		if (unlikely(len > NSEC_PER_SEC)) {
			len = NSEC_PER_SEC;
			q->stat_pkts_too_long++;
		}
		/* Account for schedule/timers drifts.
		 * f->time_next_packet was set when prior packet was sent,
		 * and current time (@now) can be too late by tens of us.
		 */
		if (f->time_next_packet)
			len -= min(len/2, now - f->time_next_packet);
		f->time_next_packet = now + len;
	}
out:
	qdisc_bstats_update(sch, skb);
	return skb;
}
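/* Editor's worked example (illustrative numbers): for a 1514 byte packet on a
 * flow paced at 1 Gbit/s (sk_pacing_rate = 125,000,000 bytes/sec),
 * len = 1514 * NSEC_PER_SEC / 125000000 is about 12112 ns, so the next packet
 * of this flow is not released until roughly 12 us after this one.
 */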
static void fq_flow_purge(struct fq_flow *flow)
{
	struct rb_node *p = rb_first(&flow->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &flow->t_root);
		rtnl_kfree_skbs(skb, skb);
	}
	rtnl_kfree_skbs(flow->head, flow->tail);
	flow->head = NULL;
	flow->qlen = 0;
}
static void fq_reset(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *root;
	struct rb_node *p;
	struct fq_flow *f;
	unsigned int idx;

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;

	fq_flow_purge(&q->internal);

	if (!q->fq_root)
		return;

	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
		root = &q->fq_root[idx];
		while ((p = rb_first(root)) != NULL) {
			f = rb_entry(p, struct fq_flow, fq_node);
			rb_erase(p, root);

			fq_flow_purge(f);

			kmem_cache_free(fq_flow_cachep, f);
		}
	}
	q->new_flows.first	= NULL;
	q->old_flows.first	= NULL;
	q->delayed		= RB_ROOT;
	q->flows		= 0;
	q->inactive_flows	= 0;
	q->throttled_flows	= 0;
}
static void fq_rehash(struct fq_sched_data *q,
		      struct rb_root *old_array, u32 old_log,
		      struct rb_root *new_array, u32 new_log)
{
	struct rb_node *op, **np, *parent;
	struct rb_root *oroot, *nroot;
	struct fq_flow *of, *nf;
	int fcnt = 0;
	u32 idx;

	for (idx = 0; idx < (1U << old_log); idx++) {
		oroot = &old_array[idx];
		while ((op = rb_first(oroot)) != NULL) {
			rb_erase(op, oroot);
			of = rb_entry(op, struct fq_flow, fq_node);
			if (fq_gc_candidate(of)) {
				fcnt++;
				kmem_cache_free(fq_flow_cachep, of);
				continue;
			}
			nroot = &new_array[hash_ptr(of->sk, new_log)];

			np = &nroot->rb_node;
			parent = NULL;
			while (*np) {
				parent = *np;

				nf = rb_entry(parent, struct fq_flow, fq_node);
				BUG_ON(nf->sk == of->sk);

				if (nf->sk > of->sk)
					np = &parent->rb_right;
				else
					np = &parent->rb_left;
			}

			rb_link_node(&of->fq_node, parent, np);
			rb_insert_color(&of->fq_node, nroot);
		}
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
}
static void fq_free(void *addr)
{
	kvfree(addr);
}
static int fq_resize(struct Qdisc *sch, u32 log)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *array;
	void *old_fq_root;
	u32 idx;

	if (q->fq_root && log == q->fq_trees_log)
		return 0;

	/* If XPS was setup, we can allocate memory on right NUMA node */
	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
			      netdev_queue_numa_node_read(sch->dev_queue));
	if (!array)
		return -ENOMEM;

	for (idx = 0; idx < (1U << log); idx++)
		array[idx] = RB_ROOT;

	sch_tree_lock(sch);

	old_fq_root = q->fq_root;
	if (old_fq_root)
		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);

	q->fq_root = array;
	q->fq_trees_log = log;

	sch_tree_unlock(sch);

	fq_free(old_fq_root);

	return 0;
}
static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
	[TCA_FQ_UNSPEC]			= { .strict_start_type = TCA_FQ_TIMER_SLACK },

	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
	[TCA_FQ_INITIAL_QUANTUM]	= { .type = NLA_U32 },
	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
	[TCA_FQ_ORPHAN_MASK]		= { .type = NLA_U32 },
	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
	[TCA_FQ_HORIZON]		= { .type = NLA_U32 },
	[TCA_FQ_HORIZON_DROP]		= { .type = NLA_U8 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
		     struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_MAX + 1];
	int err, drop_count = 0;
	unsigned drop_len = 0;
	u32 fq_log;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
					  NULL);
	if (err < 0)
		return err;

	sch_tree_lock(sch);

	fq_log = q->fq_trees_log;

	if (tb[TCA_FQ_BUCKETS_LOG]) {
		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);

		if (nval >= 1 && nval <= ilog2(256*1024))
			fq_log = nval;
		else
			err = -EINVAL;
	}
	if (tb[TCA_FQ_PLIMIT])
		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);

	if (tb[TCA_FQ_FLOW_PLIMIT])
		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);

	if (tb[TCA_FQ_QUANTUM]) {
		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);

		if (quantum > 0 && quantum <= (1 << 20)) {
			q->quantum = quantum;
		} else {
			NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
			err = -EINVAL;
		}
	}

	if (tb[TCA_FQ_INITIAL_QUANTUM])
		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);

	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));

	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);

		q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
	}
	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
		q->low_rate_threshold =
			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);

	if (tb[TCA_FQ_RATE_ENABLE]) {
		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);

		if (enable <= 1)
			q->rate_enable = enable;
		else
			err = -EINVAL;
	}

	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]);

		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
	}

	if (tb[TCA_FQ_ORPHAN_MASK])
		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);

	if (tb[TCA_FQ_CE_THRESHOLD])
		q->ce_threshold = (u64)NSEC_PER_USEC *
				  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);

	if (tb[TCA_FQ_TIMER_SLACK])
		q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);

	if (tb[TCA_FQ_HORIZON])
		q->horizon = (u64)NSEC_PER_USEC *
			     nla_get_u32(tb[TCA_FQ_HORIZON]);

	if (tb[TCA_FQ_HORIZON_DROP])
		q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);

	if (!err) {
		sch_tree_unlock(sch);
		err = fq_resize(sch, fq_log);
		sch_tree_lock(sch);
	}
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_dequeue(sch);

		if (!skb)
			break;
		drop_len += qdisc_pkt_len(skb);
		rtnl_kfree_skbs(skb, skb);
		drop_count++;
	}
	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);

	sch_tree_unlock(sch);
	return err;
}
static void fq_destroy(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);

	fq_reset(sch);
	fq_free(q->fq_root);
	qdisc_watchdog_cancel(&q->watchdog);
}
static int fq_init(struct Qdisc *sch, struct nlattr *opt,
		   struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	int err;

	sch->limit		= 10000;
	q->flow_plimit		= 100;
	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
	q->flow_refill_delay	= msecs_to_jiffies(40);
	q->flow_max_rate	= ~0UL;
	q->time_next_delayed_flow = ~0ULL;
	q->rate_enable		= 1;
	q->new_flows.first	= NULL;
	q->old_flows.first	= NULL;
	q->delayed		= RB_ROOT;
	q->fq_root		= NULL;
	q->fq_trees_log		= ilog2(1024);
	q->orphan_mask		= 1024 - 1;
	q->low_rate_threshold	= 550000 / 8;

	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */

	q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
	q->horizon_drop = 1; /* by default, drop packets beyond horizon */

	/* Default ce_threshold of 4294 seconds */
	q->ce_threshold		= (u64)NSEC_PER_USEC * ~0U;

	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);

	if (opt)
		err = fq_change(sch, opt, extack);
	else
		err = fq_resize(sch, q->fq_trees_log);

	return err;
}
static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	u64 ce_threshold = q->ce_threshold;
	u64 horizon = q->horizon;
	struct nlattr *opts;

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */

	do_div(ce_threshold, NSEC_PER_USEC);
	do_div(horizon, NSEC_PER_USEC);

	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
			min_t(unsigned long, q->flow_max_rate, ~0U)) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
			jiffies_to_usecs(q->flow_refill_delay)) ||
	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
			q->low_rate_threshold) ||
	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
	    nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	return -1;
}
static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct tc_fq_qd_stats st;

	sch_tree_lock(sch);

	st.gc_flows		  = q->stat_gc_flows;
	st.highprio_packets	  = q->stat_internal_packets;
	st.tcp_retrans		  = 0;
	st.throttled		  = q->stat_throttled;
	st.flows_plimit		  = q->stat_flows_plimit;
	st.pkts_too_long	  = q->stat_pkts_too_long;
	st.allocation_errors	  = q->stat_allocation_errors;
	st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
				    ktime_get_ns();
	st.flows		  = q->flows;
	st.inactive_flows	  = q->inactive_flows;
	st.throttled_flows	  = q->throttled_flows;
	st.unthrottle_latency_ns  = min_t(unsigned long,
					  q->unthrottle_latency_ns, ~0U);
	st.ce_mark		  = q->stat_ce_mark;
	st.horizon_drops	  = q->stat_horizon_drops;
	st.horizon_caps		  = q->stat_horizon_caps;
	sch_tree_unlock(sch);

	return gnet_stats_copy_app(d, &st, sizeof(st));
}
static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
	.id		=	"fq",
	.priv_size	=	sizeof(struct fq_sched_data),

	.enqueue	=	fq_enqueue,
	.dequeue	=	fq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	fq_init,
	.reset		=	fq_reset,
	.destroy	=	fq_destroy,
	.change		=	fq_change,
	.dump		=	fq_dump,
	.dump_stats	=	fq_dump_stats,
	.owner		=	THIS_MODULE,
};
static int __init fq_module_init(void)
{
	int ret;

	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
					   sizeof(struct fq_flow),
					   0, 0, NULL);
	if (!fq_flow_cachep)
		return -ENOMEM;

	ret = register_qdisc(&fq_qdisc_ops);
	if (ret)
		kmem_cache_destroy(fq_flow_cachep);
	return ret;
}
static void __exit fq_module_exit(void)
{
	unregister_qdisc(&fq_qdisc_ops);
	kmem_cache_destroy(fq_flow_cachep);
}

module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Fair Queue Packet Scheduler");