// SPDX-License-Identifier: GPL-2.0-only
/* Flow Queue PIE discipline
 *
 * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in>
 * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com>
 * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com>
 * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com>
 * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com>
 * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com>
 */

#include <linux/jhash.h>
#include <linux/sizes.h>
#include <linux/vmalloc.h>
#include <net/pkt_cls.h>
#include <net/pie.h>

/* Flow Queue PIE
 *
 * Principles:
 *   - Packets are classified on flows.
 *   - This is a Stochastic model (as we use a hash, several flows might
 *     be hashed to the same slot)
 *   - Each flow has a PIE managed queue.
 *   - Flows are linked onto two (Round Robin) lists,
 *     so that new flows have priority on old ones.
 *   - For a given flow, packets are not reordered.
 *   - Drops during enqueue only.
 *   - ECN capability is off by default.
 *   - ECN threshold (if ECN is enabled) is at 10% by default.
 *   - Uses timestamps to calculate queue delay by default.
 */

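/* Overview of the implementation below: fq_pie_classify() hashes each packet
 * into one of flows_cnt queues (or honors skb->priority / an attached
 * classifier). Each queue runs PIE independently: pie_drop_early() decides at
 * enqueue time whether to drop (or ECN-mark) the packet, and a periodic timer
 * recomputes every flow's drop probability. Dequeue walks the new_flows and
 * old_flows lists with a byte-based deficit round robin, so recently active
 * flows are served first without starving older ones.
 */
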
/**
 * struct fq_pie_flow - contains data for each flow
 * @vars:	pie vars associated with the flow
 * @deficit:	number of remaining byte credits
 * @backlog:	size of data in the flow
 * @qlen:	number of packets in the flow
 * @flowchain:	flowchain for the flow
 * @head:	first packet in the flow
 * @tail:	last packet in the flow
 */
struct fq_pie_flow {
	struct pie_vars vars;
	s32 deficit;
	u32 backlog;
	u32 qlen;
	struct list_head flowchain;
	struct sk_buff *head;
	struct sk_buff *tail;
};

struct fq_pie_sched_data {
	struct tcf_proto __rcu *filter_list; /* optional external classifier */
	struct tcf_block *block;
	struct fq_pie_flow *flows;
	struct Qdisc *sch;
	struct list_head old_flows;
	struct list_head new_flows;
	struct pie_params p_params;
	u32 ecn_prob;
	u32 flows_cnt;
	u32 quantum;
	u32 memory_limit;
	u32 new_flow_count;
	u32 memory_usage;
	u32 overmemory;
	struct pie_stats stats;
	struct timer_list adapt_timer;
};

static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q,
				struct sk_buff *skb)
{
	return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
}

static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
				    int *qerr)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct tcf_proto *filter;
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->flows_cnt)
		return TC_H_MIN(skb->priority);

	filter = rcu_dereference_bh(q->filter_list);
	if (!filter)
		return fq_pie_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, filter, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->flows_cnt)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/* add skb to flow queue (tail add) */
static inline void flow_queue_add(struct fq_pie_flow *flow,
				  struct sk_buff *skb)
{
	if (!flow->head)
		flow->head = skb;
	else
		flow->tail->next = skb;
	flow->tail = skb;
	skb->next = NULL;
}

static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
				struct sk_buff **to_free)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct fq_pie_flow *sel_flow;
	int uninitialized_var(ret);
	u8 memory_limited = false;
	u8 enqueue = false;
	u32 pkt_len;
	u32 idx;

	/* Classifies packet into corresponding flow */
	idx = fq_pie_classify(skb, sch, &ret);
	sel_flow = &q->flows[idx];

	/* Checks whether adding a new packet would exceed memory limit */
	get_pie_cb(skb)->mem_usage = skb->truesize;
	memory_limited = q->memory_usage > q->memory_limit + skb->truesize;

	/* Checks if the qdisc is full */
	if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
		q->stats.overlimit++;
		goto out;
	} else if (unlikely(memory_limited)) {
		q->overmemory++;
	}

	if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars,
			    sel_flow->backlog, skb->len)) {
		enqueue = true;
	} else if (q->p_params.ecn &&
		   sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob &&
		   INET_ECN_set_ce(skb)) {
		/* If packet is ecn capable, mark it if drop probability
		 * is lower than the parameter ecn_prob, else drop it.
		 */
		q->stats.ecn_mark++;
		enqueue = true;
	}

	if (enqueue) {
		/* Set enqueue time only when dq_rate_estimator is disabled. */
		if (!q->p_params.dq_rate_estimator)
			pie_set_enqueue_time(skb);

		pkt_len = qdisc_pkt_len(skb);
		q->stats.packets_in++;
		q->memory_usage += skb->truesize;
		sch->qstats.backlog += pkt_len;
		sch->q.qlen++;
		flow_queue_add(sel_flow, skb);
		if (list_empty(&sel_flow->flowchain)) {
			list_add_tail(&sel_flow->flowchain, &q->new_flows);
			q->new_flow_count++;
			sel_flow->deficit = q->quantum;
			sel_flow->qlen = 0;
			sel_flow->backlog = 0;
		}
		sel_flow->qlen++;
		sel_flow->backlog += pkt_len;
		return NET_XMIT_SUCCESS;
	}
out:
	q->stats.dropped++;
	sel_flow->vars.accu_prob = 0;
	__qdisc_drop(skb, to_free);
	qdisc_qstats_drop(sch);
	return NET_XMIT_CN;
}

static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
	[TCA_FQ_PIE_LIMIT]		= {.type = NLA_U32},
	[TCA_FQ_PIE_FLOWS]		= {.type = NLA_U32},
	[TCA_FQ_PIE_TARGET]		= {.type = NLA_U32},
	[TCA_FQ_PIE_TUPDATE]		= {.type = NLA_U32},
	[TCA_FQ_PIE_ALPHA]		= {.type = NLA_U32},
	[TCA_FQ_PIE_BETA]		= {.type = NLA_U32},
	[TCA_FQ_PIE_QUANTUM]		= {.type = NLA_U32},
	[TCA_FQ_PIE_MEMORY_LIMIT]	= {.type = NLA_U32},
	[TCA_FQ_PIE_ECN_PROB]		= {.type = NLA_U32},
	[TCA_FQ_PIE_ECN]		= {.type = NLA_U32},
	[TCA_FQ_PIE_BYTEMODE]		= {.type = NLA_U32},
	[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]	= {.type = NLA_U32},
};

static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
{
	struct sk_buff *skb = flow->head;

	flow->head = skb->next;
	skb->next = NULL;
	return skb;
}

static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = NULL;
	struct fq_pie_flow *flow;
	struct list_head *head;
	u32 pkt_len;

begin:
	head = &q->new_flows;
	if (list_empty(head)) {
		head = &q->old_flows;
		if (list_empty(head))
			return NULL;
	}

	flow = list_first_entry(head, struct fq_pie_flow, flowchain);
	/* Flow has exhausted all its credits */
	if (flow->deficit <= 0) {
		flow->deficit += q->quantum;
		list_move_tail(&flow->flowchain, &q->old_flows);
		goto begin;
	}

	if (flow->head) {
		skb = dequeue_head(flow);
		pkt_len = qdisc_pkt_len(skb);
		sch->qstats.backlog -= pkt_len;
		sch->q.qlen--;
		qdisc_bstats_update(sch, skb);
	}

	if (!skb) {
		/* force a pass through old_flows to prevent starvation */
		if (head == &q->new_flows && !list_empty(&q->old_flows))
			list_move_tail(&flow->flowchain, &q->old_flows);
		else
			list_del_init(&flow->flowchain);
		goto begin;
	}

	flow->qlen--;
	flow->deficit -= pkt_len;
	flow->backlog -= pkt_len;
	q->memory_usage -= get_pie_cb(skb)->mem_usage;
	pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog);
	return skb;
}

static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
			 struct netlink_ext_ack *extack)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
	unsigned int len_dropped = 0;
	unsigned int num_dropped = 0;
	int err;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack);
	if (err)
		return err;

	sch_tree_lock(sch);
	if (tb[TCA_FQ_PIE_LIMIT]) {
		u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);

		q->p_params.limit = limit;
		sch->limit = limit;
	}
	if (tb[TCA_FQ_PIE_FLOWS]) {
		if (q->flows) {
			NL_SET_ERR_MSG_MOD(extack,
					   "Number of flows cannot be changed");
			goto flows_error;
		}
		q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
		if (!q->flows_cnt || q->flows_cnt > 65536) {
			NL_SET_ERR_MSG_MOD(extack,
					   "Number of flows must range in [1..65536]");
			goto flows_error;
		}
	}

	/* convert from microseconds to pschedtime */
	if (tb[TCA_FQ_PIE_TARGET]) {
		/* target is in us */
		u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);

		/* convert to pschedtime */
		q->p_params.target =
			PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
	}

	/* tupdate is in jiffies */
	if (tb[TCA_FQ_PIE_TUPDATE])
		q->p_params.tupdate =
			usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]));

	if (tb[TCA_FQ_PIE_ALPHA])
		q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]);

	if (tb[TCA_FQ_PIE_BETA])
		q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]);

	if (tb[TCA_FQ_PIE_QUANTUM])
		q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]);

	if (tb[TCA_FQ_PIE_MEMORY_LIMIT])
		q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]);

	if (tb[TCA_FQ_PIE_ECN_PROB])
		q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]);

	if (tb[TCA_FQ_PIE_ECN])
		q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]);

	if (tb[TCA_FQ_PIE_BYTEMODE])
		q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]);

	if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])
		q->p_params.dq_rate_estimator =
			nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]);

	/* Drop excess packets if new limit is lower */
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);

		len_dropped += qdisc_pkt_len(skb);
		num_dropped += 1;
		rtnl_kfree_skbs(skb, skb);
	}
	qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);

	sch_tree_unlock(sch);
	return 0;

flows_error:
	sch_tree_unlock(sch);
	return -EINVAL;
}

static void fq_pie_timer(struct timer_list *t)
{
	struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock; /* to lock qdisc for probability calculations */
	u32 idx;

	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
	spin_lock(root_lock);

	for (idx = 0; idx < q->flows_cnt; idx++)
		pie_calculate_probability(&q->p_params, &q->flows[idx].vars,
					  q->flows[idx].backlog);

	/* reset the timer to fire after 'tupdate' jiffies. */
	if (q->p_params.tupdate)
		mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate);

	spin_unlock(root_lock);
}

static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
		       struct netlink_ext_ack *extack)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	int err;
	u32 idx;

	pie_params_init(&q->p_params);
	sch->limit = 10 * 1024;
	q->p_params.limit = sch->limit;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->sch = sch;
	q->ecn_prob = 10;
	q->flows_cnt = 1024;
	q->memory_limit = SZ_32M;

	INIT_LIST_HEAD(&q->new_flows);
	INIT_LIST_HEAD(&q->old_flows);

	if (opt) {
		err = fq_pie_change(sch, opt, extack);
		if (err)
			return err;
	}

	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
	if (err)
		goto init_failure;

	q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow),
			    GFP_KERNEL);
	if (!q->flows) {
		err = -ENOMEM;
		goto init_failure;
	}
	for (idx = 0; idx < q->flows_cnt; idx++) {
		struct fq_pie_flow *flow = q->flows + idx;

		INIT_LIST_HEAD(&flow->flowchain);
		pie_vars_init(&flow->vars);
	}

	timer_setup(&q->adapt_timer, fq_pie_timer, 0);
	mod_timer(&q->adapt_timer, jiffies + HZ / 2);

	return 0;

init_failure:
	q->flows_cnt = 0;

	return err;
}

static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts;

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (!opts)
		return -EMSGSIZE;

	/* convert target from pschedtime to us */
	if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) ||
	    nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) ||
	    nla_put_u32(skb, TCA_FQ_PIE_TARGET,
			((u32)PSCHED_TICKS2NS(q->p_params.target)) /
			NSEC_PER_USEC) ||
	    nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
			jiffies_to_usecs(q->p_params.tupdate)) ||
	    nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) ||
	    nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) ||
	    nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) ||
	    nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) ||
	    nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) ||
	    nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) ||
	    nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) ||
	    nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
			q->p_params.dq_rate_estimator))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct tc_fq_pie_xstats st = {
		.packets_in	= q->stats.packets_in,
		.overlimit	= q->stats.overlimit,
		.overmemory	= q->overmemory,
		.dropped	= q->stats.dropped,
		.ecn_mark	= q->stats.ecn_mark,
		.new_flow_count = q->new_flow_count,
		.memory_usage	= q->memory_usage,
	};
	struct list_head *pos;

	sch_tree_lock(sch);
	list_for_each(pos, &q->new_flows)
		st.new_flows_len++;

	list_for_each(pos, &q->old_flows)
		st.old_flows_len++;
	sch_tree_unlock(sch);

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void fq_pie_reset(struct Qdisc *sch)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	u32 idx;

	INIT_LIST_HEAD(&q->new_flows);
	INIT_LIST_HEAD(&q->old_flows);
	for (idx = 0; idx < q->flows_cnt; idx++) {
		struct fq_pie_flow *flow = q->flows + idx;

		/* Removes all packets from flow */
		rtnl_kfree_skbs(flow->head, flow->tail);
		flow->head = NULL;

		INIT_LIST_HEAD(&flow->flowchain);
		pie_vars_init(&flow->vars);
	}

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;
}

static void fq_pie_destroy(struct Qdisc *sch)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	del_timer_sync(&q->adapt_timer);
	kvfree(q->flows);
}

static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
	.id		= "fq_pie",
	.priv_size	= sizeof(struct fq_pie_sched_data),
	.enqueue	= fq_pie_qdisc_enqueue,
	.dequeue	= fq_pie_qdisc_dequeue,
	.peek		= qdisc_peek_dequeued,
	.init		= fq_pie_init,
	.destroy	= fq_pie_destroy,
	.reset		= fq_pie_reset,
	.change		= fq_pie_change,
	.dump		= fq_pie_dump,
	.dump_stats	= fq_pie_dump_stats,
	.owner		= THIS_MODULE,
};

static int __init fq_pie_module_init(void)
{
	return register_qdisc(&fq_pie_qdisc_ops);
}

static void __exit fq_pie_module_exit(void)
{
	unregister_qdisc(&fq_pie_qdisc_ops);
}

module_init(fq_pie_module_init);
module_exit(fq_pie_module_exit);

MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)");
MODULE_AUTHOR("Mohit P. Tahiliani");
MODULE_LICENSE("GPL");
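
/* Example configuration (a sketch only; it assumes an iproute2 build with
 * fq_pie support, and the device name is illustrative). The parameter names
 * mirror the TCA_FQ_PIE_* attributes handled in fq_pie_change():
 *
 *	tc qdisc add dev eth0 root fq_pie flows 1024 target 15ms ecn
 */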