// SPDX-License-Identifier: GPL-2.0-only
/* Flow Queue PIE discipline
 *
 * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in>
 * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com>
 * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com>
 * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com>
 * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com>
 * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com>
 */

#include <linux/jhash.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <linux/vmalloc.h>
#include <net/pkt_cls.h>
#include <net/pie.h>

/* Flow Queue PIE
 *
 * Principles:
 *   - Packets are classified on flows.
 *   - This is a Stochastic model (as we use a hash, several flows might
 *     be hashed to the same slot)
 *   - Each flow has a PIE managed queue.
 *   - Flows are linked onto two (Round Robin) lists,
 *     so that new flows have priority on old ones.
 *   - For a given flow, packets are not reordered.
 *   - Drops during enqueue only.
 *   - ECN capability is off by default.
 *   - ECN threshold (if ECN is enabled) is at 10% by default.
 *   - Uses timestamps to calculate queue delay by default.
 */
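
/* An illustrative setup from userspace (syntax as in iproute2's
 * tc-fq_pie(8); the interface and values below are an example, not a
 * requirement of this module):
 *
 *   tc qdisc add dev eth0 root fq_pie flows 1024 target 15ms ecn
 */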

/**
 * struct fq_pie_flow - contains data for each flow
 * @vars: pie vars associated with the flow
 * @deficit: number of remaining byte credits
 * @backlog: size of data in the flow
 * @qlen: number of packets in the flow
 * @flowchain: flowchain for the flow
 * @head: first packet in the flow
 * @tail: last packet in the flow
 */
struct fq_pie_flow {
	struct pie_vars vars;
	s32 deficit;
	u32 backlog;
	u32 qlen;
	struct list_head flowchain;
	struct sk_buff *head;
	struct sk_buff *tail;
};

struct fq_pie_sched_data {
	struct tcf_proto __rcu *filter_list; /* optional external classifier */
	struct tcf_block *block;
	struct fq_pie_flow *flows;
	struct Qdisc *sch;
	struct list_head old_flows;
	struct list_head new_flows;
	struct pie_params p_params;
	u32 ecn_prob;
	u32 flows_cnt;
	u32 flows_cursor;
	u32 quantum;
	u32 memory_limit;
	u32 new_flow_count;
	u32 memory_usage;
	u32 overmemory;
	struct pie_stats stats;
	struct timer_list adapt_timer;
};

static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q,
				struct sk_buff *skb)
{
	return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
}
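
/* Classification tries, in order: a direct 1-based band taken from
 * skb->priority when its major number matches this qdisc's handle, then
 * the optional tcf classifier chain, and finally the stochastic hash
 * above. A return value of 0 tells the caller to drop the packet.
 */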
static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
				    int *qerr)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct tcf_proto *filter;
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->flows_cnt)
		return TC_H_MIN(skb->priority);

	filter = rcu_dereference_bh(q->filter_list);
	if (!filter)
		return fq_pie_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, NULL, filter, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->flows_cnt)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/* add skb to flow queue (tail add) */
static inline void flow_queue_add(struct fq_pie_flow *flow,
				  struct sk_buff *skb)
{
	if (!flow->head)
		flow->head = skb;
	else
		flow->tail->next = skb;
	flow->tail = skb;
	skb->next = NULL;
}
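
/* Enqueue decides, in order: classify to a flow, note skb->truesize for
 * memory accounting, drop when the qdisc is over its packet limit, then
 * let pie_drop_early() choose; an ECN-capable packet is marked rather
 * than dropped while the flow's drop probability is at or below
 * ecn_prob percent (with the default ecn_prob of 10, a probability of
 * at most 10%).
 */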
static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
				struct sk_buff **to_free)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct fq_pie_flow *sel_flow;
	int ret;
	u8 memory_limited = false;
	u8 enqueue = false;
	u32 pkt_len;
	u32 idx;

	/* Classifies packet into corresponding flow */
	idx = fq_pie_classify(skb, sch, &ret);
	if (idx == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	idx--;

	sel_flow = &q->flows[idx];
	/* Checks whether adding a new packet would exceed memory limit */
	get_pie_cb(skb)->mem_usage = skb->truesize;
	memory_limited = q->memory_usage > q->memory_limit + skb->truesize;

	/* Checks if the qdisc is full */
	if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
		q->stats.overlimit++;
		goto out;
	} else if (unlikely(memory_limited)) {
		q->overmemory++;
	}

	if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars,
			    sel_flow->backlog, skb->len)) {
		enqueue = true;
	} else if (q->p_params.ecn &&
		   sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob &&
		   INET_ECN_set_ce(skb)) {
		/* If packet is ecn capable, mark it if drop probability
		 * is lower than the parameter ecn_prob, else drop it.
		 */
		q->stats.ecn_mark++;
		enqueue = true;
	}
	if (enqueue) {
		/* Set enqueue time only when dq_rate_estimator is disabled. */
		if (!q->p_params.dq_rate_estimator)
			pie_set_enqueue_time(skb);

		pkt_len = qdisc_pkt_len(skb);
		q->stats.packets_in++;
		q->memory_usage += skb->truesize;
		sch->qstats.backlog += pkt_len;
		sch->q.qlen++;
		flow_queue_add(sel_flow, skb);
		if (list_empty(&sel_flow->flowchain)) {
			list_add_tail(&sel_flow->flowchain, &q->new_flows);
			q->new_flow_count++;
			sel_flow->deficit = q->quantum;
			sel_flow->qlen = 0;
			sel_flow->backlog = 0;
		}
		sel_flow->qlen++;
		sel_flow->backlog += pkt_len;
		return NET_XMIT_SUCCESS;
	}
out:
	q->stats.dropped++;
	sel_flow->vars.accu_prob = 0;
	__qdisc_drop(skb, to_free);
	qdisc_qstats_drop(sch);
	return NET_XMIT_CN;
}

static const struct netlink_range_validation fq_pie_q_range = {
	.min = 1,
	.max = 1 << 20,
};

static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
	[TCA_FQ_PIE_LIMIT]		= {.type = NLA_U32},
	[TCA_FQ_PIE_FLOWS]		= {.type = NLA_U32},
	[TCA_FQ_PIE_TARGET]		= {.type = NLA_U32},
	[TCA_FQ_PIE_TUPDATE]		= {.type = NLA_U32},
	[TCA_FQ_PIE_ALPHA]		= {.type = NLA_U32},
	[TCA_FQ_PIE_BETA]		= {.type = NLA_U32},
	[TCA_FQ_PIE_QUANTUM]		=
			NLA_POLICY_FULL_RANGE(NLA_U32, &fq_pie_q_range),
	[TCA_FQ_PIE_MEMORY_LIMIT]	= {.type = NLA_U32},
	[TCA_FQ_PIE_ECN_PROB]		= {.type = NLA_U32},
	[TCA_FQ_PIE_ECN]		= {.type = NLA_U32},
	[TCA_FQ_PIE_BYTEMODE]		= {.type = NLA_U32},
	[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]	= {.type = NLA_U32},
};

static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
{
	struct sk_buff *skb = flow->head;

	flow->head = skb->next;
	skb->next = NULL;
	return skb;
}
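
/* Dequeue implements deficit round robin: new flows are served before
 * old ones, a flow that has used up its credits is topped up with one
 * quantum and rotated to the old list, and an emptied flow either moves
 * to old_flows (so old flows are not starved) or drops out of the
 * rotation entirely.
 */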
static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = NULL;
	struct fq_pie_flow *flow;
	struct list_head *head;
	u32 pkt_len;

begin:
	head = &q->new_flows;
	if (list_empty(head)) {
		head = &q->old_flows;
		if (list_empty(head))
			return NULL;
	}

	flow = list_first_entry(head, struct fq_pie_flow, flowchain);
	/* Flow has exhausted all its credits */
	if (flow->deficit <= 0) {
		flow->deficit += q->quantum;
		list_move_tail(&flow->flowchain, &q->old_flows);
		goto begin;
	}

	if (flow->head) {
		skb = dequeue_head(flow);
		pkt_len = qdisc_pkt_len(skb);
		sch->qstats.backlog -= pkt_len;
		sch->q.qlen--;
		qdisc_bstats_update(sch, skb);
	}

	if (!skb) {
		/* force a pass through old_flows to prevent starvation */
		if (head == &q->new_flows && !list_empty(&q->old_flows))
			list_move_tail(&flow->flowchain, &q->old_flows);
		else
			list_del_init(&flow->flowchain);
		goto begin;
	}

	flow->qlen--;
	flow->deficit -= pkt_len;
	flow->backlog -= pkt_len;
	q->memory_usage -= get_pie_cb(skb)->mem_usage;
	pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog);
	return skb;
}
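
/* Runtime parameter updates run under sch_tree_lock(); each WRITE_ONCE()
 * below pairs with a READ_ONCE() in fq_pie_dump() so dumps can read the
 * values locklessly. The flow count itself can only be set before the
 * flow array has been allocated.
 */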
static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
			 struct netlink_ext_ack *extack)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
	unsigned int len_dropped = 0;
	unsigned int num_dropped = 0;
	int err;

	err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack);
	if (err < 0)
		return err;

	sch_tree_lock(sch);
	if (tb[TCA_FQ_PIE_LIMIT]) {
		u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);

		WRITE_ONCE(q->p_params.limit, limit);
		WRITE_ONCE(sch->limit, limit);
	}
	if (tb[TCA_FQ_PIE_FLOWS]) {
		if (q->flows) {
			NL_SET_ERR_MSG_MOD(extack,
					   "Number of flows cannot be changed");
			goto flow_error;
		}
		q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
		if (!q->flows_cnt || q->flows_cnt > 65536) {
			NL_SET_ERR_MSG_MOD(extack,
					   "Number of flows must range in [1..65536]");
			goto flow_error;
		}
	}

	/* convert from microseconds to pschedtime */
	if (tb[TCA_FQ_PIE_TARGET]) {
		/* target is in us */
		u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);

		/* convert to pschedtime */
		WRITE_ONCE(q->p_params.target,
			   PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC));
	}

	/* tupdate is in jiffies */
	if (tb[TCA_FQ_PIE_TUPDATE])
		WRITE_ONCE(q->p_params.tupdate,
			   usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])));

	if (tb[TCA_FQ_PIE_ALPHA])
		WRITE_ONCE(q->p_params.alpha,
			   nla_get_u32(tb[TCA_FQ_PIE_ALPHA]));

	if (tb[TCA_FQ_PIE_BETA])
		WRITE_ONCE(q->p_params.beta,
			   nla_get_u32(tb[TCA_FQ_PIE_BETA]));

	if (tb[TCA_FQ_PIE_QUANTUM])
		WRITE_ONCE(q->quantum, nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]));

	if (tb[TCA_FQ_PIE_MEMORY_LIMIT])
		WRITE_ONCE(q->memory_limit,
			   nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]));

	if (tb[TCA_FQ_PIE_ECN_PROB])
		WRITE_ONCE(q->ecn_prob,
			   nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]));

	if (tb[TCA_FQ_PIE_ECN])
		WRITE_ONCE(q->p_params.ecn,
			   nla_get_u32(tb[TCA_FQ_PIE_ECN]));

	if (tb[TCA_FQ_PIE_BYTEMODE])
		WRITE_ONCE(q->p_params.bytemode,
			   nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]));

	if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])
		WRITE_ONCE(q->p_params.dq_rate_estimator,
			   nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]));

	/* Drop excess packets if new limit is lower */
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);

		len_dropped += qdisc_pkt_len(skb);
		num_dropped += 1;
		rtnl_kfree_skbs(skb, skb);
	}
	qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);

	sch_tree_unlock(sch);
	return 0;

flow_error:
	sch_tree_unlock(sch);
	return -EINVAL;
}
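
/* The adaptation timer recalculates every flow's drop probability, but
 * caps the work at 2048 flows per firing; with the maximum of 65536
 * flows a full sweep can take up to 32 back-to-back rounds before the
 * timer waits a full tupdate interval again.
 */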
static void fq_pie_timer(struct timer_list *t)
{
	struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
	unsigned long next, tupdate;
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock; /* to lock qdisc for probability calculations */
	int max_cnt, i;

	rcu_read_lock();
	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
	spin_lock(root_lock);

	/* Limit this expensive loop to 2048 flows per round. */
	max_cnt = min_t(int, q->flows_cnt - q->flows_cursor, 2048);
	for (i = 0; i < max_cnt; i++) {
		pie_calculate_probability(&q->p_params,
					  &q->flows[q->flows_cursor].vars,
					  q->flows[q->flows_cursor].backlog);
		q->flows_cursor++;
	}

	tupdate = q->p_params.tupdate;
	next = 0;
	if (q->flows_cursor >= q->flows_cnt) {
		q->flows_cursor = 0;
		next = tupdate;
	}
	if (tupdate)
		mod_timer(&q->adapt_timer, jiffies + next);
	spin_unlock(root_lock);
	rcu_read_unlock();
}
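
/* Defaults set up at init: a limit of 10240 packets, 1024 flow queues,
 * one device MTU of quantum, a 32 MB memory cap and a 10% ECN marking
 * threshold; each can be overridden through the netlink attributes
 * handled by fq_pie_change().
 */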
static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
		       struct netlink_ext_ack *extack)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	int err;
	u32 idx;

	pie_params_init(&q->p_params);
	sch->limit = 10 * 1024;
	q->p_params.limit = sch->limit;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->sch = sch;
	q->ecn_prob = 10;
	q->flows_cnt = 1024;
	q->memory_limit = SZ_32M;

	INIT_LIST_HEAD(&q->new_flows);
	INIT_LIST_HEAD(&q->old_flows);
	timer_setup(&q->adapt_timer, fq_pie_timer, 0);

	if (opt) {
		err = fq_pie_change(sch, opt, extack);

		if (err)
			return err;
	}

	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
	if (err)
		goto init_failure;

	q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow),
			    GFP_KERNEL);
	if (!q->flows) {
		err = -ENOMEM;
		goto init_failure;
	}
	for (idx = 0; idx < q->flows_cnt; idx++) {
		struct fq_pie_flow *flow = q->flows + idx;

		INIT_LIST_HEAD(&flow->flowchain);
		pie_vars_init(&flow->vars);
	}

	mod_timer(&q->adapt_timer, jiffies + HZ / 2);

	return 0;

init_failure:
	q->flows_cnt = 0;

	return err;
}
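
/* Dump converts stored values back to the units userspace supplied:
 * target from psched ticks to microseconds and tupdate from jiffies to
 * microseconds, mirroring the conversions done in fq_pie_change().
 */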
static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts;

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (!opts)
		return -EMSGSIZE;

	/* convert target from pschedtime to us */
	if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, READ_ONCE(sch->limit)) ||
	    nla_put_u32(skb, TCA_FQ_PIE_FLOWS, READ_ONCE(q->flows_cnt)) ||
	    nla_put_u32(skb, TCA_FQ_PIE_TARGET,
			((u32)PSCHED_TICKS2NS(READ_ONCE(q->p_params.target))) /
			NSEC_PER_USEC) ||
	    nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
			jiffies_to_usecs(READ_ONCE(q->p_params.tupdate))) ||
	    nla_put_u32(skb, TCA_FQ_PIE_ALPHA, READ_ONCE(q->p_params.alpha)) ||
	    nla_put_u32(skb, TCA_FQ_PIE_BETA, READ_ONCE(q->p_params.beta)) ||
	    nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, READ_ONCE(q->quantum)) ||
	    nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT,
			READ_ONCE(q->memory_limit)) ||
	    nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, READ_ONCE(q->ecn_prob)) ||
	    nla_put_u32(skb, TCA_FQ_PIE_ECN, READ_ONCE(q->p_params.ecn)) ||
	    nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, READ_ONCE(q->p_params.bytemode)) ||
	    nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
			READ_ONCE(q->p_params.dq_rate_estimator)))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct tc_fq_pie_xstats st = {
		.packets_in	= q->stats.packets_in,
		.overlimit	= q->stats.overlimit,
		.overmemory	= q->overmemory,
		.dropped	= q->stats.dropped,
		.ecn_mark	= q->stats.ecn_mark,
		.new_flow_count = q->new_flow_count,
		.memory_usage	= q->memory_usage,
	};
	struct list_head *pos;

	sch_tree_lock(sch);
	list_for_each(pos, &q->new_flows)
		st.new_flows_len++;

	list_for_each(pos, &q->old_flows)
		st.old_flows_len++;
	sch_tree_unlock(sch);

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void fq_pie_reset(struct Qdisc *sch)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	u32 idx;

	INIT_LIST_HEAD(&q->new_flows);
	INIT_LIST_HEAD(&q->old_flows);
	for (idx = 0; idx < q->flows_cnt; idx++) {
		struct fq_pie_flow *flow = q->flows + idx;

		/* Removes all packets from flow */
		rtnl_kfree_skbs(flow->head, flow->tail);
		flow->head = NULL;

		INIT_LIST_HEAD(&flow->flowchain);
		pie_vars_init(&flow->vars);
	}
}

static void fq_pie_destroy(struct Qdisc *sch)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	q->p_params.tupdate = 0;
	del_timer_sync(&q->adapt_timer);
	kvfree(q->flows);
}

static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
	.id		= "fq_pie",
	.priv_size	= sizeof(struct fq_pie_sched_data),
	.enqueue	= fq_pie_qdisc_enqueue,
	.dequeue	= fq_pie_qdisc_dequeue,
	.peek		= qdisc_peek_dequeued,
	.init		= fq_pie_init,
	.destroy	= fq_pie_destroy,
	.reset		= fq_pie_reset,
	.change		= fq_pie_change,
	.dump		= fq_pie_dump,
	.dump_stats	= fq_pie_dump_stats,
	.owner		= THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("fq_pie");

static int __init fq_pie_module_init(void)
{
	return register_qdisc(&fq_pie_qdisc_ops);
}

static void __exit fq_pie_module_exit(void)
{
	unregister_qdisc(&fq_pie_qdisc_ops);
}

module_init(fq_pie_module_init);
module_exit(fq_pie_module_exit);

MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)");
MODULE_AUTHOR("Mohit P. Tahiliani");
MODULE_LICENSE("GPL");