// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
 *
 * Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com>
 *
 * Meant to be mostly used for locally generated traffic :
 * Fast classification depends on skb->sk being set before reaching us.
 * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
 * All packets belonging to a socket are considered as a 'flow'.
 *
 * Flows are dynamically allocated and stored in a hash table of RB trees.
 * They are also part of one Round Robin 'queue' (new or old flows).
 *
 * Burst avoidance (aka pacing) capability :
 *
 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 * bunch of packets, and this packet scheduler adds delay between
 * packets to respect rate limitation.
 *
 * enqueue() :
 *   - lookup one RB tree (out of 1024 or more) to find the flow.
 *     If the flow does not exist, create it and add it to the tree.
 *     Add skb to the per flow list of skb (fifo).
 *   - Use a special fifo for high prio packets
 *
 * dequeue() : serves flows in Round Robin
 * Note : When a flow becomes empty, we do not immediately remove it from
 * rb trees, for performance reasons (it is expected to send additional
 * packets, or the SLAB cache will reuse the socket for another flow).
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/tcp.h>
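/* fq_skb_cb() below exposes the per packet state (time_to_send, band) that
 * fq keeps in the generic qdisc control block of each skb;
 * qdisc_cb_private_validate() checks at build time that it fits there.
 */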
struct fq_skb_cb {
	u64	time_to_send;
	u8	band;
};

static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
}
/*
 * Per flow structure, dynamically allocated.
 * If packets have monotonically increasing time_to_send, they are placed in O(1)
 * in linear list (head,tail), otherwise are placed in a rbtree (t_root).
 */
struct fq_flow {
/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
	struct rb_root	t_root;
	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
	};
	union {
		struct rb_node	fq_node;	/* anchor in fq_root[] trees */
		/* Following field is only used for q->internal,
		 * because q->internal is not hashed in fq_root[]
		 */
		u64		stat_fastpath_packets;
	};
	struct sock	*sk;
	u32		socket_hash;	/* sk_hash */
	int		qlen;		/* number of packets in flow queue */

/* Second cache line */
	int		credit;
	int		band;
	struct fq_flow	*next;		/* next pointer in RR lists */

	struct rb_node	rate_node;	/* anchor in q->delayed tree */
	u64		time_next_packet;
};
struct fq_flow_head {
	struct fq_flow *first;
	struct fq_flow *last;
};

struct fq_perband_flows {
	struct fq_flow_head new_flows;
	struct fq_flow_head old_flows;
	int		    credit;
	int		    quantum; /* based on band nr : 576KB, 192KB, 64KB */
};
#define FQ_BANDS 3
#define FQ_PRIO2BAND_CRUMB_SIZE ((TC_PRIO_MAX + 1) >> 2)
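/* TC_PRIO_MAX is 15, so FQ_PRIO2BAND_CRUMB_SIZE evaluates to (15 + 1) >> 2 == 4:
 * the 16 possible priorities are stored as 2-bit crumbs, four per byte.
 */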
struct fq_sched_data {
/* Read mostly cache line */

	u64		offload_horizon;
	u32		quantum;
	u32		initial_quantum;
	u32		flow_refill_delay;
	u32		flow_plimit;	/* max packets per flow */
	unsigned long	flow_max_rate;	/* optional max rate per flow */
	u64		ce_threshold;
	u64		horizon;	/* horizon in ns */
	u32		orphan_mask;	/* mask for orphaned skb */
	u32		low_rate_threshold;
	struct rb_root	*fq_root;
	u8		rate_enable;
	u8		fq_trees_log;
	u8		horizon_drop;
	u8		prio2band[FQ_PRIO2BAND_CRUMB_SIZE];
	u32		timer_slack;	/* hrtimer slack in ns */

/* Read/Write fields. */

	unsigned int	band_nr;	/* band being serviced in fq_dequeue() */

	struct fq_perband_flows band_flows[FQ_BANDS];

	struct fq_flow	internal;	/* fastpath queue. */
	struct rb_root	delayed;	/* for rate limited flows */
	u64		time_next_delayed_flow;
	unsigned long	unthrottle_latency_ns;

	u32		band_pkt_count[FQ_BANDS];
	u32		flows;
	u32		inactive_flows;	/* Flows with no packet to send. */
	u32		throttled_flows;

	u64		stat_throttled;
	struct qdisc_watchdog watchdog;
	u64		stat_gc_flows;

/* Seldom used fields. */

	u64		stat_band_drops[FQ_BANDS];
	u64		stat_ce_mark;
	u64		stat_horizon_drops;
	u64		stat_horizon_caps;
	u64		stat_flows_plimit;
	u64		stat_pkts_too_long;
	u64		stat_allocation_errors;
};
/* return the i-th 2-bit value ("crumb") */
static u8 fq_prio2band(const u8 *prio2band, unsigned int prio)
{
	return (READ_ONCE(prio2band[prio / 4]) >> (2 * (prio & 0x3))) & 0x3;
}
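/* Example: for prio == 6, fq_prio2band() reads prio2band[1] (6 / 4) and
 * extracts bits 4-5 (2 * (6 & 0x3)), the crumb holding the band of prio 6.
 */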
/*
 * f->tail and f->age share the same location.
 * We can use the low order bit to differentiate if this location points
 * to a sk_buff or contains a jiffies value, if we force this value to be odd.
 * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
 */
static void fq_flow_set_detached(struct fq_flow *f)
{
	f->age = jiffies | 1UL;
}

static bool fq_flow_is_detached(const struct fq_flow *f)
{
	return !!(f->age & 1UL);
}
/* special value to mark a throttled flow (not on old/new list) */
static struct fq_flow throttled;

static bool fq_flow_is_throttled(const struct fq_flow *f)
{
	return f->next == &throttled;
}
enum new_flow {
	NEW_FLOW,
	OLD_FLOW
};

static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow,
			     enum new_flow list_sel)
{
	struct fq_perband_flows *pband = &q->band_flows[flow->band];
	struct fq_flow_head *head = (list_sel == NEW_FLOW) ?
					&pband->new_flows :
					&pband->old_flows;

	if (head->first)
		head->last->next = flow;
	else
		head->first = flow;
	head->last = flow;
	flow->next = NULL;
}
static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	rb_erase(&f->rate_node, &q->delayed);
	q->throttled_flows--;
	fq_flow_add_tail(q, f, OLD_FLOW);
}
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;

	while (*p) {
		struct fq_flow *aux;

		parent = *p;
		aux = rb_entry(parent, struct fq_flow, rate_node);
		if (f->time_next_packet >= aux->time_next_packet)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&f->rate_node, parent, p);
	rb_insert_color(&f->rate_node, &q->delayed);
	q->throttled_flows++;
	q->stat_throttled++;

	f->next = &throttled;
	if (q->time_next_delayed_flow > f->time_next_packet)
		q->time_next_delayed_flow = f->time_next_packet;
}
static struct kmem_cache *fq_flow_cachep __read_mostly;


/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)

static bool fq_gc_candidate(const struct fq_flow *f)
{
	return fq_flow_is_detached(f) &&
	       time_after(jiffies, f->age + FQ_GC_AGE);
}
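/* With FQ_GC_AGE equal to 3*HZ, a detached flow becomes a garbage
 * collection candidate once it has been empty for more than 3 seconds.
 */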
static void fq_gc(struct fq_sched_data *q,
		  struct rb_root *root,
		  struct sock *sk)
{
	struct rb_node **p, *parent;
	void *tofree[FQ_GC_MAX];
	struct fq_flow *f;
	int i, fcnt = 0;

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk)
			break;

		if (fq_gc_candidate(f)) {
			tofree[fcnt++] = f;
			if (fcnt == FQ_GC_MAX)
				break;
		}

		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	if (!fcnt)
		return;

	for (i = fcnt; i > 0; ) {
		f = tofree[--i];
		rb_erase(&f->fq_node, root);
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;

	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}
/* Fast path can be used if :
 * 1) Packet tstamp is in the past, or within the pacing offload horizon.
 * 2) FQ qlen == 0   OR
 *   (no flow is currently eligible for transmit,
 *    AND fast path queue has less than 8 packets)
 * 3) No SO_MAX_PACING_RATE on the socket (if any).
 * 4) No @maxrate attribute on this qdisc,
 *
 * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
 */
static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb,
			      u64 now)
{
	const struct fq_sched_data *q = qdisc_priv(sch);
	const struct sock *sk;

	if (fq_skb_cb(skb)->time_to_send > now + q->offload_horizon)
		return false;

	if (sch->q.qlen != 0) {
		/* Even if some packets are stored in this qdisc,
		 * we can still enable fast path if all of them are
		 * scheduled in the future (ie no flows are eligible)
		 * or in the fast path queue.
		 */
		if (q->flows != q->inactive_flows + q->throttled_flows)
			return false;

		/* Do not allow fast path queue to explode, we want Fair Queue mode
		 * under pressure.
		 */
		if (q->internal.qlen >= 8)
			return false;

		/* Ordering invariants fall apart if some delayed flows
		 * are ready but we haven't serviced them, yet.
		 */
		if (q->time_next_delayed_flow <= now + q->offload_horizon)
			return false;
	}

	sk = skb->sk;
	if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
	    sk->sk_max_pacing_rate != ~0UL)
		return false;

	if (q->flow_max_rate != ~0UL)
		return false;

	return true;
}
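/* Note that ~0UL is the "unlimited" value for both sk->sk_max_pacing_rate
 * and q->flow_max_rate, so fq_fastpath_check() only refuses the fast path
 * when an explicit pacing cap has been configured.
 */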
static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
				   u64 now)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_node **p, *parent;
	struct sock *sk = skb->sk;
	struct rb_root *root;
	struct fq_flow *f;

	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
	 * or a listener (SYNCOOKIE mode)
	 * 1) request sockets are not full blown,
	 *    they do not contain sk_pacing_rate
	 * 2) They are not part of a 'flow' yet
	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
	 *    especially if the listener set SO_MAX_PACING_RATE
	 * 4) We pretend they are orphaned
	 * TCP can also associate TIME_WAIT sockets with RST or ACK packets.
	 */
	if (!sk || sk_listener_or_tw(sk)) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;

		/* By forcing low order bit to 1, we make sure to not
		 * collide with a local flow (socket pointers are word aligned)
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
		skb_orphan(skb);
	} else if (sk->sk_state == TCP_CLOSE) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
		/*
		 * Sockets in TCP_CLOSE are non connected.
		 * Typical use case is UDP sockets, they can send packets
		 * with sendto() to many different destinations.
		 * We probably could use a generic bit advertising
		 * non connected sockets, instead of sk_state == TCP_CLOSE,
		 * if we care enough.
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
	}

	if (fq_fastpath_check(sch, skb, now)) {
		q->internal.stat_fastpath_packets++;
		if (skb->sk == sk && q->rate_enable &&
		    READ_ONCE(sk->sk_pacing_status) != SK_PACING_FQ)
			smp_store_release(&sk->sk_pacing_status,
					  SK_PACING_FQ);
		return &q->internal;
	}

	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];

	fq_gc(q, root, sk);

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk) {
			/* socket might have been reallocated, so check
			 * if its sk_hash is the same.
			 * If not, we need to refill credit with
			 * initial quantum
			 */
			if (unlikely(skb->sk == sk &&
				     f->socket_hash != sk->sk_hash)) {
				f->credit = q->initial_quantum;
				f->socket_hash = sk->sk_hash;
				if (q->rate_enable)
					smp_store_release(&sk->sk_pacing_status,
							  SK_PACING_FQ);
				if (fq_flow_is_throttled(f))
					fq_flow_unset_throttled(q, f);
				f->time_next_packet = 0ULL;
			}
			return f;
		}
		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!f)) {
		q->stat_allocation_errors++;
		return &q->internal;
	}
	/* f->t_root is already zeroed after kmem_cache_zalloc() */

	fq_flow_set_detached(f);
	f->sk = sk;
	if (skb->sk == sk) {
		f->socket_hash = sk->sk_hash;
		if (q->rate_enable)
			smp_store_release(&sk->sk_pacing_status,
					  SK_PACING_FQ);
	}
	f->credit = q->initial_quantum;

	rb_link_node(&f->fq_node, parent, p);
	rb_insert_color(&f->fq_node, root);

	q->flows++;
	q->inactive_flows++;
	return f;
}
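/* For orphaned or non connected packets, fq_classify() uses
 * (hash << 1) | 1UL as an odd pseudo socket pointer: real struct sock
 * pointers are at least word aligned (even), so the two key spaces
 * cannot collide in fq_root[].
 */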
static struct sk_buff *fq_peek(struct fq_flow *flow)
{
	struct sk_buff *skb = skb_rb_first(&flow->t_root);
	struct sk_buff *head = flow->head;

	if (!skb)
		return head;

	if (!head)
		return skb;

	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
		return skb;
	return head;
}
static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
			  struct sk_buff *skb)
{
	if (skb == flow->head) {
		flow->head = skb->next;
	} else {
		rb_erase(&skb->rbnode, &flow->t_root);
		skb->dev = qdisc_dev(sch);
	}
}
/* Remove one skb from flow queue.
 * This skb must be the return value of prior fq_peek().
 */
static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
			   struct sk_buff *skb)
{
	fq_erase_head(sch, flow, skb);
	skb_mark_not_on_list(skb);
	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;
}
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
	struct rb_node **p, *parent;
	struct sk_buff *head, *aux;

	head = flow->head;
	if (!head ||
	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
		if (!head)
			flow->head = skb;
		else
			flow->tail->next = skb;
		flow->tail = skb;
		skb->next = NULL;
		return;
	}

	p = &flow->t_root.rb_node;
	parent = NULL;

	while (*p) {
		parent = *p;
		aux = rb_to_skb(parent);
		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &flow->t_root);
}
static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
				     const struct fq_sched_data *q, u64 now)
{
	return unlikely((s64)skb->tstamp > (s64)(now + q->horizon));
}
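/* With the default 10 second horizon set in fq_init(), an EDT timestamp
 * more than 10 seconds ahead of now is beyond the horizon; fq_enqueue()
 * then either caps it or drops the packet, depending on q->horizon_drop.
 */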
#define FQDR(reason) SKB_DROP_REASON_FQ_##reason

static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		      struct sk_buff **to_free)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow *f;
	u64 now;
	u8 band;

	band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX);
	if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
		q->stat_band_drops[band]++;
		return qdisc_drop_reason(skb, sch, to_free,
					 FQDR(BAND_LIMIT));
	}

	now = ktime_get_ns();
	if (!skb->tstamp) {
		fq_skb_cb(skb)->time_to_send = now;
	} else {
		/* Check if packet timestamp is too far in the future. */
		if (fq_packet_beyond_horizon(skb, q, now)) {
			if (q->horizon_drop) {
				q->stat_horizon_drops++;
				return qdisc_drop_reason(skb, sch, to_free,
							 FQDR(HORIZON_LIMIT));
			}
			q->stat_horizon_caps++;
			skb->tstamp = now + q->horizon;
		}
		fq_skb_cb(skb)->time_to_send = skb->tstamp;
	}

	f = fq_classify(sch, skb, now);

	if (f != &q->internal) {
		if (unlikely(f->qlen >= q->flow_plimit)) {
			q->stat_flows_plimit++;
			return qdisc_drop_reason(skb, sch, to_free,
						 FQDR(FLOW_LIMIT));
		}

		if (fq_flow_is_detached(f)) {
			fq_flow_add_tail(q, f, NEW_FLOW);
			if (time_after(jiffies, f->age + q->flow_refill_delay))
				f->credit = max_t(u32, f->credit, q->quantum);
		}

		f->band = band;
		q->band_pkt_count[band]++;
		fq_skb_cb(skb)->band = band;
		if (f->qlen == 0)
			q->inactive_flows--;
	}

	f->qlen++;
	/* Note: this overwrites f->age */
	flow_queue_add(f, skb);

	qdisc_qstats_backlog_inc(sch, skb);
	sch->q.qlen++;

	return NET_XMIT_SUCCESS;
}
#undef FQDR
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
	unsigned long sample;
	struct rb_node *p;

	if (q->time_next_delayed_flow > now + q->offload_horizon)
		return;

	/* Update unthrottle latency EWMA.
	 * This is cheap and can help diagnosing timer/latency problems.
	 */
	sample = (unsigned long)(now - q->time_next_delayed_flow);
	if ((long)sample > 0) {
		q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
		q->unthrottle_latency_ns += sample >> 3;
	}
	now += q->offload_horizon;

	q->time_next_delayed_flow = ~0ULL;
	while ((p = rb_first(&q->delayed)) != NULL) {
		struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);

		if (f->time_next_packet > now) {
			q->time_next_delayed_flow = f->time_next_packet;
			break;
		}
		fq_flow_unset_throttled(q, f);
	}
}
static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband)
{
	if (pband->credit <= 0)
		return NULL;

	if (pband->new_flows.first)
		return &pband->new_flows;

	return pband->old_flows.first ? &pband->old_flows : NULL;
}
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_perband_flows *pband;
	struct fq_flow_head *head;
	struct sk_buff *skb;
	struct fq_flow *f;
	unsigned long rate;
	int retry;
	u32 plen;
	u64 now;

	if (!sch->q.qlen)
		return NULL;

	skb = fq_peek(&q->internal);
	if (unlikely(skb)) {
		q->internal.qlen--;
		fq_dequeue_skb(sch, &q->internal, skb);
		goto out;
	}

	now = ktime_get_ns();
	fq_check_throttled(q, now);
	retry = 0;
	pband = &q->band_flows[q->band_nr];
begin:
	head = fq_pband_head_select(pband);
	if (!head) {
		while (++retry <= FQ_BANDS) {
			if (++q->band_nr == FQ_BANDS)
				q->band_nr = 0;
			pband = &q->band_flows[q->band_nr];
			pband->credit = min(pband->credit + pband->quantum,
					    pband->quantum);
			if (pband->credit > 0)
				goto begin;
		}
		if (q->time_next_delayed_flow != ~0ULL)
			qdisc_watchdog_schedule_range_ns(&q->watchdog,
							 q->time_next_delayed_flow,
							 q->timer_slack);
		return NULL;
	}
	f = head->first;
	retry = 0;
	if (f->credit <= 0) {
		f->credit += q->quantum;
		head->first = f->next;
		fq_flow_add_tail(q, f, OLD_FLOW);
		goto begin;
	}

	skb = fq_peek(f);
	if (skb) {
		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
					     f->time_next_packet);

		if (now + q->offload_horizon < time_next_packet) {
			head->first = f->next;
			f->time_next_packet = time_next_packet;
			fq_flow_set_throttled(q, f);
			goto begin;
		}
		prefetch(&skb->end);
		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
			INET_ECN_set_ce(skb);
			q->stat_ce_mark++;
		}
		if (--f->qlen == 0)
			q->inactive_flows++;
		q->band_pkt_count[fq_skb_cb(skb)->band]--;
		fq_dequeue_skb(sch, f, skb);
	} else {
		head->first = f->next;
		/* force a pass through old_flows to prevent starvation */
		if (head == &pband->new_flows) {
			fq_flow_add_tail(q, f, OLD_FLOW);
		} else {
			fq_flow_set_detached(f);
		}
		goto begin;
	}
	plen = qdisc_pkt_len(skb);
	f->credit -= plen;
	pband->credit -= plen;

	if (!q->rate_enable)
		goto out;

	rate = q->flow_max_rate;

	/* If EDT time was provided for this skb, we need to
	 * update f->time_next_packet only if this qdisc enforces
	 * a flow max rate.
	 */
	if (!skb->tstamp) {
		if (skb->sk)
			rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate);

		if (rate <= q->low_rate_threshold) {
			f->credit = 0;
		} else {
			plen = max(plen, q->quantum);
			if (f->credit > 0)
				goto out;
		}
	}
	if (rate != ~0UL) {
		u64 len = (u64)plen * NSEC_PER_SEC;

		if (likely(rate))
			len = div64_ul(len, rate);
		/* Since socket rate can change later,
		 * clamp the delay to 1 second.
		 * Really, providers of too big packets should be fixed !
		 */
		if (unlikely(len > NSEC_PER_SEC)) {
			len = NSEC_PER_SEC;
			q->stat_pkts_too_long++;
		}
		/* Account for schedule/timers drifts.
		 * f->time_next_packet was set when prior packet was sent,
		 * and current time (@now) can be too late by tens of us.
		 */
		if (f->time_next_packet)
			len -= min(len/2, now - f->time_next_packet);
		f->time_next_packet = now + len;
	}
out:
	qdisc_bstats_update(sch, skb);
	return skb;
}
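/* Pacing arithmetic example: for a 1500 byte packet and a pacing rate of
 * 1,500,000 bytes per second, len = 1500 * NSEC_PER_SEC / rate is
 * 1,000,000 ns, so the flow's next packet is scheduled about 1 ms later.
 */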
static void fq_flow_purge(struct fq_flow *flow)
{
	struct rb_node *p = rb_first(&flow->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &flow->t_root);
		rtnl_kfree_skbs(skb, skb);
	}
	rtnl_kfree_skbs(flow->head, flow->tail);
	flow->head = NULL;
	flow->qlen = 0;
}
static void fq_reset(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *root;
	struct rb_node *p;
	struct fq_flow *f;
	unsigned int idx;

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;

	fq_flow_purge(&q->internal);

	if (!q->fq_root)
		return;

	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
		root = &q->fq_root[idx];
		while ((p = rb_first(root)) != NULL) {
			f = rb_entry(p, struct fq_flow, fq_node);
			rb_erase(p, root);

			fq_flow_purge(f);

			kmem_cache_free(fq_flow_cachep, f);
		}
	}
	for (idx = 0; idx < FQ_BANDS; idx++) {
		q->band_flows[idx].new_flows.first = NULL;
		q->band_flows[idx].old_flows.first = NULL;
	}
	q->delayed		= RB_ROOT;
	q->flows		= 0;
	q->inactive_flows	= 0;
	q->throttled_flows	= 0;
}
static void fq_rehash(struct fq_sched_data *q,
		      struct rb_root *old_array, u32 old_log,
		      struct rb_root *new_array, u32 new_log)
{
	struct rb_node *op, **np, *parent;
	struct rb_root *oroot, *nroot;
	struct fq_flow *of, *nf;
	int fcnt = 0;
	u32 idx;

	for (idx = 0; idx < (1U << old_log); idx++) {
		oroot = &old_array[idx];
		while ((op = rb_first(oroot)) != NULL) {
			rb_erase(op, oroot);
			of = rb_entry(op, struct fq_flow, fq_node);
			if (fq_gc_candidate(of)) {
				fcnt++;
				kmem_cache_free(fq_flow_cachep, of);
				continue;
			}
			nroot = &new_array[hash_ptr(of->sk, new_log)];

			np = &nroot->rb_node;
			parent = NULL;
			while (*np) {
				parent = *np;

				nf = rb_entry(parent, struct fq_flow, fq_node);
				BUG_ON(nf->sk == of->sk);

				if (nf->sk > of->sk)
					np = &parent->rb_right;
				else
					np = &parent->rb_left;
			}

			rb_link_node(&of->fq_node, parent, np);
			rb_insert_color(&of->fq_node, nroot);
		}
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
}
static void fq_free(void *addr)
{
	kvfree(addr);
}

static int fq_resize(struct Qdisc *sch, u32 log)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *array;
	void *old_fq_root;
	u32 idx;

	if (q->fq_root && log == q->fq_trees_log)
		return 0;

	/* If XPS was setup, we can allocate memory on right NUMA node */
	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
			      netdev_queue_numa_node_read(sch->dev_queue));
	if (!array)
		return -ENOMEM;

	for (idx = 0; idx < (1U << log); idx++)
		array[idx] = RB_ROOT;

	sch_tree_lock(sch);

	old_fq_root = q->fq_root;
	if (old_fq_root)
		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);

	q->fq_root = array;
	WRITE_ONCE(q->fq_trees_log, log);

	sch_tree_unlock(sch);

	fq_free(old_fq_root);

	return 0;
}
static const struct netlink_range_validation iq_range = {
	.max = INT_MAX,
};

static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
	[TCA_FQ_UNSPEC]			= { .strict_start_type = TCA_FQ_TIMER_SLACK },

	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
	[TCA_FQ_INITIAL_QUANTUM]	= NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range),
	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
	[TCA_FQ_ORPHAN_MASK]		= { .type = NLA_U32 },
	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
	[TCA_FQ_HORIZON]		= { .type = NLA_U32 },
	[TCA_FQ_HORIZON_DROP]		= { .type = NLA_U8 },
	[TCA_FQ_PRIOMAP]		= NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)),
	[TCA_FQ_WEIGHTS]		= NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)),
	[TCA_FQ_OFFLOAD_HORIZON]	= { .type = NLA_U32 },
};
/* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
static void fq_prio2band_compress_crumb(const u8 *in, u8 *out)
{
	const int num_elems = TC_PRIO_MAX + 1;
	u8 tmp[FQ_PRIO2BAND_CRUMB_SIZE];
	int i;

	memset(tmp, 0, sizeof(tmp));
	for (i = 0; i < num_elems; i++)
		tmp[i / 4] |= in[i] << (2 * (i & 0x3));

	for (i = 0; i < FQ_PRIO2BAND_CRUMB_SIZE; i++)
		WRITE_ONCE(out[i], tmp[i]);
}

static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
{
	const int num_elems = TC_PRIO_MAX + 1;
	int i;

	for (i = 0; i < num_elems; i++)
		out[i] = fq_prio2band(in, i);
}
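/* Packing example: in[0..3] = {1, 2, 2, 2} is compressed to the single
 * byte 1 | 2 << 2 | 2 << 4 | 2 << 6 == 0xa9, i.e. four crumbs per byte.
 */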
static int fq_load_weights(struct fq_sched_data *q,
			   const struct nlattr *attr,
			   struct netlink_ext_ack *extack)
{
	s32 *weights = nla_data(attr);
	int i;

	for (i = 0; i < FQ_BANDS; i++) {
		if (weights[i] < FQ_MIN_WEIGHT) {
			NL_SET_ERR_MSG_FMT_MOD(extack, "Weight %d less than minimum allowed %d",
					       weights[i], FQ_MIN_WEIGHT);
			return -EINVAL;
		}
	}
	for (i = 0; i < FQ_BANDS; i++)
		WRITE_ONCE(q->band_flows[i].quantum, weights[i]);
	return 0;
}
static int fq_load_priomap(struct fq_sched_data *q,
			   const struct nlattr *attr,
			   struct netlink_ext_ack *extack)
{
	const struct tc_prio_qopt *map = nla_data(attr);
	int i;

	if (map->bands != FQ_BANDS) {
		NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
		return -EINVAL;
	}
	for (i = 0; i < TC_PRIO_MAX + 1; i++) {
		if (map->priomap[i] >= FQ_BANDS) {
			NL_SET_ERR_MSG_FMT_MOD(extack, "FQ priomap field %d maps to a too high band %d",
					       i, map->priomap[i]);
			return -EINVAL;
		}
	}
	fq_prio2band_compress_crumb(map->priomap, q->prio2band);
	return 0;
}
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
		     struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_MAX + 1];
	int err, drop_count = 0;
	unsigned drop_len = 0;
	u32 fq_log;

	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
					  NULL);
	if (err < 0)
		return err;

	sch_tree_lock(sch);

	fq_log = q->fq_trees_log;

	if (tb[TCA_FQ_BUCKETS_LOG]) {
		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);

		if (nval >= 1 && nval <= ilog2(256*1024))
			fq_log = nval;
		else
			err = -EINVAL;
	}
	if (tb[TCA_FQ_PLIMIT])
		WRITE_ONCE(sch->limit,
			   nla_get_u32(tb[TCA_FQ_PLIMIT]));

	if (tb[TCA_FQ_FLOW_PLIMIT])
		WRITE_ONCE(q->flow_plimit,
			   nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]));

	if (tb[TCA_FQ_QUANTUM]) {
		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);

		if (quantum > 0 && quantum <= (1 << 20)) {
			WRITE_ONCE(q->quantum, quantum);
		} else {
			NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
			err = -EINVAL;
		}
	}

	if (tb[TCA_FQ_INITIAL_QUANTUM])
		WRITE_ONCE(q->initial_quantum,
			   nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]));

	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));

	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);

		WRITE_ONCE(q->flow_max_rate,
			   (rate == ~0U) ? ~0UL : rate);
	}
	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
		WRITE_ONCE(q->low_rate_threshold,
			   nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]));

	if (tb[TCA_FQ_RATE_ENABLE]) {
		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);

		if (enable <= 1)
			WRITE_ONCE(q->rate_enable,
				   enable);
		else
			err = -EINVAL;
	}

	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]);

		WRITE_ONCE(q->flow_refill_delay,
			   usecs_to_jiffies(usecs_delay));
	}

	if (!err && tb[TCA_FQ_PRIOMAP])
		err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);

	if (!err && tb[TCA_FQ_WEIGHTS])
		err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack);

	if (tb[TCA_FQ_ORPHAN_MASK])
		WRITE_ONCE(q->orphan_mask,
			   nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]));

	if (tb[TCA_FQ_CE_THRESHOLD])
		WRITE_ONCE(q->ce_threshold,
			   (u64)NSEC_PER_USEC *
			   nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]));

	if (tb[TCA_FQ_TIMER_SLACK])
		WRITE_ONCE(q->timer_slack,
			   nla_get_u32(tb[TCA_FQ_TIMER_SLACK]));

	if (tb[TCA_FQ_HORIZON])
		WRITE_ONCE(q->horizon,
			   (u64)NSEC_PER_USEC *
			   nla_get_u32(tb[TCA_FQ_HORIZON]));

	if (tb[TCA_FQ_HORIZON_DROP])
		WRITE_ONCE(q->horizon_drop,
			   nla_get_u8(tb[TCA_FQ_HORIZON_DROP]));

	if (tb[TCA_FQ_OFFLOAD_HORIZON]) {
		u64 offload_horizon = (u64)NSEC_PER_USEC *
				      nla_get_u32(tb[TCA_FQ_OFFLOAD_HORIZON]);

		if (offload_horizon <= qdisc_dev(sch)->max_pacing_offload_horizon) {
			WRITE_ONCE(q->offload_horizon, offload_horizon);
		} else {
			NL_SET_ERR_MSG_MOD(extack, "invalid offload_horizon");
			err = -EINVAL;
		}
	}
	if (!err) {
		sch_tree_unlock(sch);
		err = fq_resize(sch, fq_log);
		sch_tree_lock(sch);
	}
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_dequeue(sch);

		if (!skb)
			break;
		drop_len += qdisc_pkt_len(skb);
		rtnl_kfree_skbs(skb, skb);
		drop_count++;
	}
	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);

	sch_tree_unlock(sch);
	return err;
}
static void fq_destroy(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);

	fq_reset(sch);
	fq_free(q->fq_root);
	qdisc_watchdog_cancel(&q->watchdog);
}
static int fq_init(struct Qdisc *sch, struct nlattr *opt,
		   struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	int i, err;

	sch->limit		= 10000;
	q->flow_plimit		= 100;
	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
	q->flow_refill_delay	= msecs_to_jiffies(40);
	q->flow_max_rate	= ~0UL;
	q->time_next_delayed_flow = ~0ULL;
	q->rate_enable		= 1;
	for (i = 0; i < FQ_BANDS; i++) {
		q->band_flows[i].new_flows.first = NULL;
		q->band_flows[i].old_flows.first = NULL;
	}
	q->band_flows[0].quantum = 9 << 16;
	q->band_flows[1].quantum = 3 << 16;
	q->band_flows[2].quantum = 1 << 16;
	q->delayed		= RB_ROOT;
	q->fq_root		= NULL;
	q->fq_trees_log		= ilog2(1024);
	q->orphan_mask		= 1024 - 1;
	q->low_rate_threshold	= 550000 / 8;

	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */

	q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
	q->horizon_drop = 1; /* by default, drop packets beyond horizon */

	/* Default ce_threshold of 4294 seconds */
	q->ce_threshold		= (u64)NSEC_PER_USEC * ~0U;

	fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band);
	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);

	if (opt)
		err = fq_change(sch, opt, extack);
	else
		err = fq_resize(sch, q->fq_trees_log);

	return err;
}
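/* The default band weights match the quanta documented in
 * struct fq_perband_flows: 9 << 16 == 589824 (576 KB), 3 << 16 == 196608
 * (192 KB) and 1 << 16 == 65536 (64 KB) for bands 0, 1 and 2.
 */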
static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct tc_prio_qopt prio = {
		.bands = FQ_BANDS,
	};
	struct nlattr *opts;
	u64 offload_horizon;
	u64 ce_threshold;
	s32 weights[3];
	u64 horizon;

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */

	ce_threshold = READ_ONCE(q->ce_threshold);
	do_div(ce_threshold, NSEC_PER_USEC);

	horizon = READ_ONCE(q->horizon);
	do_div(horizon, NSEC_PER_USEC);

	offload_horizon = READ_ONCE(q->offload_horizon);
	do_div(offload_horizon, NSEC_PER_USEC);

	if (nla_put_u32(skb, TCA_FQ_PLIMIT,
			READ_ONCE(sch->limit)) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT,
			READ_ONCE(q->flow_plimit)) ||
	    nla_put_u32(skb, TCA_FQ_QUANTUM,
			READ_ONCE(q->quantum)) ||
	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM,
			READ_ONCE(q->initial_quantum)) ||
	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE,
			READ_ONCE(q->rate_enable)) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
			min_t(unsigned long,
			      READ_ONCE(q->flow_max_rate), ~0U)) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
			jiffies_to_usecs(READ_ONCE(q->flow_refill_delay))) ||
	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK,
			READ_ONCE(q->orphan_mask)) ||
	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
			READ_ONCE(q->low_rate_threshold)) ||
	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG,
			READ_ONCE(q->fq_trees_log)) ||
	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK,
			READ_ONCE(q->timer_slack)) ||
	    nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
	    nla_put_u32(skb, TCA_FQ_OFFLOAD_HORIZON, (u32)offload_horizon) ||
	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP,
		       READ_ONCE(q->horizon_drop)))
		goto nla_put_failure;

	fq_prio2band_decompress_crumb(q->prio2band, prio.priomap);
	if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
		goto nla_put_failure;

	weights[0] = READ_ONCE(q->band_flows[0].quantum);
	weights[1] = READ_ONCE(q->band_flows[1].quantum);
	weights[2] = READ_ONCE(q->band_flows[2].quantum);
	if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	return -1;
}
static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct tc_fq_qd_stats st;
	int i;

	st.pad = 0;

	sch_tree_lock(sch);

	st.gc_flows		  = q->stat_gc_flows;
	st.highprio_packets	  = 0;
	st.fastpath_packets	  = q->internal.stat_fastpath_packets;
	st.tcp_retrans		  = 0;
	st.throttled		  = q->stat_throttled;
	st.flows_plimit		  = q->stat_flows_plimit;
	st.pkts_too_long	  = q->stat_pkts_too_long;
	st.allocation_errors	  = q->stat_allocation_errors;
	st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
				    ktime_get_ns();
	st.flows		  = q->flows;
	st.inactive_flows	  = q->inactive_flows;
	st.throttled_flows	  = q->throttled_flows;
	st.unthrottle_latency_ns  = min_t(unsigned long,
					  q->unthrottle_latency_ns, ~0U);
	st.ce_mark		  = q->stat_ce_mark;
	st.horizon_drops	  = q->stat_horizon_drops;
	st.horizon_caps		  = q->stat_horizon_caps;
	for (i = 0; i < FQ_BANDS; i++) {
		st.band_drops[i]     = q->stat_band_drops[i];
		st.band_pkt_count[i] = q->band_pkt_count[i];
	}
	sch_tree_unlock(sch);

	return gnet_stats_copy_app(d, &st, sizeof(st));
}
static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
	.id		=	"fq",
	.priv_size	=	sizeof(struct fq_sched_data),

	.enqueue	=	fq_enqueue,
	.dequeue	=	fq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	fq_init,
	.reset		=	fq_reset,
	.destroy	=	fq_destroy,
	.change		=	fq_change,
	.dump		=	fq_dump,
	.dump_stats	=	fq_dump_stats,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("fq");
static int __init fq_module_init(void)
{
	int ret;

	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
					   sizeof(struct fq_flow),
					   0, SLAB_HWCACHE_ALIGN, NULL);
	if (!fq_flow_cachep)
		return -ENOMEM;

	ret = register_qdisc(&fq_qdisc_ops);
	if (ret)
		kmem_cache_destroy(fq_flow_cachep);
	return ret;
}

static void __exit fq_module_exit(void)
{
	unregister_qdisc(&fq_qdisc_ops);
	kmem_cache_destroy(fq_flow_cachep);
}

module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Fair Queue Packet Scheduler");