3 * SPDX-License-Identifier: BSD-3-Clause
5 * Copyright (c) 2018-2020
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * Author: Randall Stewart <rrs@netflix.com>
34 #include <sys/cdefs.h>
36 #include "opt_inet6.h"
37 #include "opt_ipsec.h"
38 #include "opt_ratelimit.h"
39 #include <sys/param.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/sysctl.h>
46 #include <sys/eventhandler.h>
47 #include <sys/mutex.h>
50 #include <net/if_var.h>
51 #include <net/if_private.h>
52 #include <netinet/in.h>
53 #include <netinet/in_pcb.h>
54 #define TCPSTATES /* for logging */
55 #include <netinet/tcp_var.h>
56 #include <netinet/tcp_hpts.h>
57 #include <netinet/tcp_log_buf.h>
58 #include <netinet/tcp_ratelimit.h>
59 #ifndef USECS_IN_SECOND
60 #define USECS_IN_SECOND 1000000
63 * For the purposes of each send, what is the size
64 * of an ethernet frame.
66 MALLOC_DEFINE(M_TCPPACE
, "tcp_hwpace", "TCP Hardware pacing memory");
70 * The following preferred table will seem weird to
71 * the casual viewer. Why do we not have any rates below
72 * 1Mbps? Why do we have a rate at 1.44Mbps called common?
73 * Why do the rates cluster in the 1-100Mbps range more
74 * than others? Why does the table jump around at the beginning
75 * and then rise more consistently?
77 * Let me try to answer those questions. A lot of
78 * this is dependent on the hardware. We have three basic
79 * supporters of rate limiting:
81 * Chelsio - Supporting 16 configurable rates.
82 * Mlx - c4 supporting 13 fixed rates.
83 * Mlx - c5 & c6 supporting 127 configurable rates.
85 * The c4 is why we have a common rate that is available
86 * in all rate tables. This is a selected rate from the
87 * c4 table and we ensure it is available in all ratelimit
88 * tables. This way the tcp_ratelimit code has an assured
89 * rate it should always be able to get. This answers a
90 * couple of the questions above.
92 * So what about the rest? Well, the table is built to
93 * try to get the most out of a joint hardware/software
94 * pacing system. The software pacer will always pick
95 * a rate higher than the b/w that it is estimating
97 * on the path. This is done for two reasons.
98 * a) So we can discover more b/w
100 * b) So we can send a block of MSS's down and then
101 * have the software timer go off after the previous
102 * send is completely out of the hardware.
104 * But when we do <b> we don't want to have the delay
105 * between the last packet sent by the hardware be
106 * excessively long (to reach our desired rate).
108 * So let me give an example for clarity.
110 * Let's assume that the tcp stack sees that 29,110,000 bps is
111 * what the bw of the path is. The stack would select the
112 * rate 31Mbps. 31Mbps means that each send that is done
113 * by the hardware will cause a 390 micro-second gap between
114 * the packets sent at that rate. For 29,110,000 bps we
115 * would need 416 micro-seconds gap between each send.
117 * Note that we are calculating a complete time for pacing
118 * which includes the ethernet, IP and TCP overhead. So
119 * a full 1514 bytes is used for the above calculations.
120 * My testing has shown that both cards are also using this
121 * as their basis i.e. full payload size of the ethernet frame.
122 * The TCP stack caller needs to be aware of this and make the
123 * appropriate overhead calculations be included in its choices.
125 * Now, continuing our example, we pick a MSS size based on the
126 * delta between the two rates (416 - 390) divided into the rate
127 * we really wish to send at rounded up. That results in a MSS
128 * send of 17 mss's at once. The hardware then will
129 * run out of data in a single 17MSS send in 6,630 micro-seconds.
131 * On the other hand the software pacer will send more data
132 * in 7,072 micro-seconds. This means that we will refill
133 * the hardware 52 microseconds after it would have sent
134 * next if it had not run out of data. This is a win since we are
135 * only sending every 7ms or so and yet all the packets are spaced on
136 * the wire with 94% of what they should be and only
137 * the last packet is delayed extra to make up for the
140 * Note that the above formula has two important caveats.
141 * If we are over 100Mbps (b/w wise) we double the result
142 * of the MSS calculation. The second caveat is if we are 500Mbps
143 * or more we just send the maximum MSS at once i.e. 45MSS. At
144 * the higher b/w's even the cards have limits to what times (timer granularity)
145 * they can insert between packets and start to send more than one
146 * packet at a time on the wire.
149 #define COMMON_RATE 180500
/*
 * Preferred hardware pacing rates, expressed in bytes per second; the
 * comment on each entry gives its nominal bit rate. The values form two
 * ascending runs: the first 16 entries (1Mbps - 800Mbps), followed by a
 * second run that restarts at 45Mbps and climbs to 20Gbps.
 */
150 const uint64_t desired_rates
[] = {
151 122500, /* 1Mbps - rate 1 */
152 180500, /* 1.44Mbps - rate 2 common rate */
153 375000, /* 3Mbps - rate 3 */
154 625000, /* 5Mbps - rate 4 */
155 1250000, /* 10Mbps - rate 5 */
156 1875000, /* 15Mbps - rate 6 */
157 2500000, /* 20Mbps - rate 7 */
158 3125000, /* 25Mbps - rate 8 */
159 3750000, /* 30Mbps - rate 9 */
160 4375000, /* 35Mbps - rate 10 */
161 5000000, /* 40Mbps - rate 11 */
162 6250000, /* 50Mbps - rate 12 */
163 12500000, /* 100Mbps - rate 13 */
164 25000000, /* 200Mbps - rate 14 */
165 50000000, /* 400Mbps - rate 15 */
166 100000000, /* 800Mbps - rate 16 */
167 5625000, /* 45Mbps - rate 17 */
168 6875000, /* 55Mbps - rate 19 */
169 7500000, /* 60Mbps - rate 20 */
170 8125000, /* 65Mbps - rate 21 */
171 8750000, /* 70Mbps - rate 22 */
172 9375000, /* 75Mbps - rate 23 */
173 10000000, /* 80Mbps - rate 24 */
174 10625000, /* 85Mbps - rate 25 */
175 11250000, /* 90Mbps - rate 26 */
176 11875000, /* 95Mbps - rate 27 */
177 12500000, /* 100Mbps - rate 28 */
178 13750000, /* 110Mbps - rate 29 */
179 15000000, /* 120Mbps - rate 30 */
180 16250000, /* 130Mbps - rate 31 */
181 17500000, /* 140Mbps - rate 32 */
182 18750000, /* 150Mbps - rate 33 */
183 20000000, /* 160Mbps - rate 34 */
184 21250000, /* 170Mbps - rate 35 */
185 22500000, /* 180Mbps - rate 36 */
186 23750000, /* 190Mbps - rate 37 */
187 26250000, /* 210Mbps - rate 38 */
188 27500000, /* 220Mbps - rate 39 */
189 28750000, /* 230Mbps - rate 40 */
190 30000000, /* 240Mbps - rate 41 */
191 31250000, /* 250Mbps - rate 42 */
192 34375000, /* 275Mbps - rate 43 */
193 37500000, /* 300Mbps - rate 44 */
194 40625000, /* 325Mbps - rate 45 */
195 43750000, /* 350Mbps - rate 46 */
196 46875000, /* 375Mbps - rate 47 */
197 53125000, /* 425Mbps - rate 48 */
198 56250000, /* 450Mbps - rate 49 */
199 59375000, /* 475Mbps - rate 50 */
200 62500000, /* 500Mbps - rate 51 */
201 68750000, /* 550Mbps - rate 52 */
202 75000000, /* 600Mbps - rate 53 */
203 81250000, /* 650Mbps - rate 54 */
204 87500000, /* 700Mbps - rate 55 */
205 93750000, /* 750Mbps - rate 56 */
206 106250000, /* 850Mbps - rate 57 */
207 112500000, /* 900Mbps - rate 58 */
208 125000000, /* 1Gbps - rate 59 */
209 156250000, /* 1.25Gbps - rate 60 */
210 187500000, /* 1.5Gbps - rate 61 */
211 218750000, /* 1.75Gbps - rate 62 */
212 250000000, /* 2Gbps - rate 63 */
213 281250000, /* 2.25Gbps - rate 64 */
214 312500000, /* 2.5Gbps - rate 65 */
215 343750000, /* 2.75Gbps - rate 66 */
216 375000000, /* 3Gbps - rate 67 */
217 500000000, /* 4Gbps - rate 68 */
218 625000000, /* 5Gbps - rate 69 */
219 750000000, /* 6Gbps - rate 70 */
220 875000000, /* 7Gbps - rate 71 */
221 1000000000, /* 8Gbps - rate 72 */
222 1125000000, /* 9Gbps - rate 73 */
223 1250000000, /* 10Gbps - rate 74 */
224 1875000000, /* 15Gbps - rate 75 */
225 2500000000 /* 20Gbps - rate 76 */
228 #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
229 #define RS_ORDERED_COUNT 16 /*
230 * Number that are in order
231 * at the beginning of the table,
232 * over this a sort is required.
234 #define RS_NEXT_ORDER_GROUP 16 /*
235 * The point in our table where
236 * we come fill in a second ordered
237 * group (index wise means -1).
239 #define ALL_HARDWARE_RATES 1004 /*
240 * 1Meg - 1Gig in 1 Meg steps
241 * plus 100, 200k and 500k and
245 #define RS_ONE_MEGABIT_PERSEC 1000000
246 #define RS_ONE_GIGABIT_PERSEC 1000000000
247 #define RS_TEN_GIGABIT_PERSEC 10000000000
249 static struct head_tcp_rate_set int_rs
;
250 static struct mtx rs_mtx
;
251 uint32_t rs_number_alive
;
252 uint32_t rs_number_dead
;
253 static uint32_t rs_floor_mss
= 0;
254 static uint32_t wait_time_floor
= 8000; /* 8 ms */
255 static uint32_t rs_hw_floor_mss
= 16;
256 static uint32_t num_of_waits_allowed
= 1; /* How many time blocks are we willing to wait */
258 static uint32_t mss_divisor
= RL_DEFAULT_DIVISOR
;
259 static uint32_t even_num_segs
= 1;
260 static uint32_t even_threshold
= 4;
262 SYSCTL_NODE(_net_inet_tcp
, OID_AUTO
, rl
, CTLFLAG_RW
| CTLFLAG_MPSAFE
, 0,
263 "TCP Ratelimit stats");
264 SYSCTL_UINT(_net_inet_tcp_rl
, OID_AUTO
, alive
, CTLFLAG_RW
,
266 "Number of interfaces initialized for ratelimiting");
267 SYSCTL_UINT(_net_inet_tcp_rl
, OID_AUTO
, dead
, CTLFLAG_RW
,
269 "Number of interfaces departing from ratelimiting");
270 SYSCTL_UINT(_net_inet_tcp_rl
, OID_AUTO
, floor_mss
, CTLFLAG_RW
,
272 "Number of MSS that will override the normal minimums (0 means don't enforce)");
273 SYSCTL_UINT(_net_inet_tcp_rl
, OID_AUTO
, wait_floor
, CTLFLAG_RW
,
274 &wait_time_floor
, 2000,
275 "Has b/w increases what is the wait floor we are willing to wait at the end?");
276 SYSCTL_UINT(_net_inet_tcp_rl
, OID_AUTO
, time_blocks
, CTLFLAG_RW
,
277 &num_of_waits_allowed
, 1,
278 "How many time blocks on the end should software pacing be willing to wait?");
280 SYSCTL_UINT(_net_inet_tcp_rl
, OID_AUTO
, hw_floor_mss
, CTLFLAG_RW
,
281 &rs_hw_floor_mss
, 16,
282 "Number of mss that are a minum for hardware pacing?");
284 SYSCTL_INT(_net_inet_tcp_rl
, OID_AUTO
, divisor
, CTLFLAG_RW
,
285 &mss_divisor
, RL_DEFAULT_DIVISOR
,
286 "The value divided into bytes per second to help establish mss size");
287 SYSCTL_INT(_net_inet_tcp_rl
, OID_AUTO
, even
, CTLFLAG_RW
,
289 "Do we round mss size up to an even number of segments for delayed ack");
290 SYSCTL_INT(_net_inet_tcp_rl
, OID_AUTO
, eventhresh
, CTLFLAG_RW
,
292 "At what number of mss do we start rounding up to an even number of mss?");
295 rl_add_syctl_entries(struct sysctl_oid
*rl_sysctl_root
, struct tcp_rate_set
*rs
)
298 * Add sysctl entries for this interface.
300 if (rs
->rs_flags
& RS_INTF_NO_SUP
) {
301 SYSCTL_ADD_S32(&rs
->sysctl_ctx
,
302 SYSCTL_CHILDREN(rl_sysctl_root
),
303 OID_AUTO
, "disable", CTLFLAG_RD
,
305 "Disable this interface from new hdwr limiting?");
307 SYSCTL_ADD_S32(&rs
->sysctl_ctx
,
308 SYSCTL_CHILDREN(rl_sysctl_root
),
309 OID_AUTO
, "disable", CTLFLAG_RW
,
311 "Disable this interface from new hdwr limiting?");
313 SYSCTL_ADD_S32(&rs
->sysctl_ctx
,
314 SYSCTL_CHILDREN(rl_sysctl_root
),
315 OID_AUTO
, "minseg", CTLFLAG_RW
,
317 "What is the minimum we need to send on this interface?");
318 SYSCTL_ADD_U64(&rs
->sysctl_ctx
,
319 SYSCTL_CHILDREN(rl_sysctl_root
),
320 OID_AUTO
, "flow_limit", CTLFLAG_RW
,
321 &rs
->rs_flow_limit
, 0,
322 "What is the limit for number of flows (0=unlimited)?");
323 SYSCTL_ADD_S32(&rs
->sysctl_ctx
,
324 SYSCTL_CHILDREN(rl_sysctl_root
),
325 OID_AUTO
, "highest", CTLFLAG_RD
,
326 &rs
->rs_highest_valid
, 0,
327 "Highest valid rate");
328 SYSCTL_ADD_S32(&rs
->sysctl_ctx
,
329 SYSCTL_CHILDREN(rl_sysctl_root
),
330 OID_AUTO
, "lowest", CTLFLAG_RD
,
331 &rs
->rs_lowest_valid
, 0,
332 "Lowest valid rate");
333 SYSCTL_ADD_S32(&rs
->sysctl_ctx
,
334 SYSCTL_CHILDREN(rl_sysctl_root
),
335 OID_AUTO
, "flags", CTLFLAG_RD
,
337 "What lags are on the entry?");
338 SYSCTL_ADD_S32(&rs
->sysctl_ctx
,
339 SYSCTL_CHILDREN(rl_sysctl_root
),
340 OID_AUTO
, "numrates", CTLFLAG_RD
,
342 "How many rates re there?");
343 SYSCTL_ADD_U64(&rs
->sysctl_ctx
,
344 SYSCTL_CHILDREN(rl_sysctl_root
),
345 OID_AUTO
, "flows_using", CTLFLAG_RD
,
346 &rs
->rs_flows_using
, 0,
347 "How many flows are using this interface now?");
348 #ifdef DETAILED_RATELIMIT_SYSCTL
349 if (rs
->rs_rlt
&& rs
->rs_rate_cnt
> 0) {
350 /* Lets display the rates */
352 struct sysctl_oid
*rl_rates
;
353 struct sysctl_oid
*rl_rate_num
;
355 rl_rates
= SYSCTL_ADD_NODE(&rs
->sysctl_ctx
,
356 SYSCTL_CHILDREN(rl_sysctl_root
),
359 CTLFLAG_RW
| CTLFLAG_MPSAFE
, 0,
361 for( i
= 0; i
< rs
->rs_rate_cnt
; i
++) {
362 sprintf(rate_num
, "%d", i
);
363 rl_rate_num
= SYSCTL_ADD_NODE(&rs
->sysctl_ctx
,
364 SYSCTL_CHILDREN(rl_rates
),
367 CTLFLAG_RW
| CTLFLAG_MPSAFE
, 0,
369 SYSCTL_ADD_U32(&rs
->sysctl_ctx
,
370 SYSCTL_CHILDREN(rl_rate_num
),
371 OID_AUTO
, "flags", CTLFLAG_RD
,
372 &rs
->rs_rlt
[i
].flags
, 0,
373 "Flags on this rate");
374 SYSCTL_ADD_U32(&rs
->sysctl_ctx
,
375 SYSCTL_CHILDREN(rl_rate_num
),
376 OID_AUTO
, "pacetime", CTLFLAG_RD
,
377 &rs
->rs_rlt
[i
].time_between
, 0,
378 "Time hardware inserts between 1500 byte sends");
379 SYSCTL_ADD_LONG(&rs
->sysctl_ctx
,
380 SYSCTL_CHILDREN(rl_rate_num
),
381 OID_AUTO
, "rate", CTLFLAG_RD
,
383 "Rate in bytes per second");
384 SYSCTL_ADD_LONG(&rs
->sysctl_ctx
,
385 SYSCTL_CHILDREN(rl_rate_num
),
386 OID_AUTO
, "using", CTLFLAG_RD
,
387 &rs
->rs_rlt
[i
].using,
388 "Number of flows using");
389 SYSCTL_ADD_LONG(&rs
->sysctl_ctx
,
390 SYSCTL_CHILDREN(rl_rate_num
),
391 OID_AUTO
, "enobufs", CTLFLAG_RD
,
392 &rs
->rs_rlt
[i
].rs_num_enobufs
,
393 "Number of enobufs logged on this rate");
401 rs_destroy(epoch_context_t ctx
)
403 struct tcp_rate_set
*rs
;
406 rs
= __containerof(ctx
, struct tcp_rate_set
, rs_epoch_ctx
);
409 rs
->rs_flags
&= ~RS_FUNERAL_SCHD
;
411 * In theory it's possible (but unlikely)
412 * that while the delete was occurring
413 * and we were applying the DEAD flag
414 * someone slipped in and found the
415 * interface in a lookup. While we
416 * decided rs_flows_using was 0 and were
417 * scheduling the epoch_call, the other
418 * thread incremented rs_flows_using. This
419 * is because users have a pointer and
420 * we only use the rs_flows_using in an
421 * atomic fashion, i.e. the other entities
422 * are not protected. To assure this did
423 * not occur, we check rs_flows_using here
426 do_free_rs
= (rs
->rs_flows_using
== 0);
431 sysctl_ctx_free(&rs
->sysctl_ctx
);
432 free(rs
->rs_rlt
, M_TCPPACE
);
438 rs_defer_destroy(struct tcp_rate_set
*rs
)
441 mtx_assert(&rs_mtx
, MA_OWNED
);
443 /* Check if already pending. */
444 if (rs
->rs_flags
& RS_FUNERAL_SCHD
)
449 /* Set flag to only defer once. */
450 rs
->rs_flags
|= RS_FUNERAL_SCHD
;
451 NET_EPOCH_CALL(rs_destroy
, &rs
->rs_epoch_ctx
);
455 extern counter_u64_t rate_limit_new
;
456 extern counter_u64_t rate_limit_chg
;
457 extern counter_u64_t rate_limit_set_ok
;
458 extern counter_u64_t rate_limit_active
;
459 extern counter_u64_t rate_limit_alloc_fail
;
463 rl_attach_txrtlmt(struct ifnet
*ifp
,
467 struct m_snd_tag
**tag
)
470 union if_snd_tag_alloc_params params
= {
471 .rate_limit
.hdr
.type
= IF_SND_TAG_TYPE_RATE_LIMIT
,
472 .rate_limit
.hdr
.flowid
= flowid
,
473 .rate_limit
.hdr
.flowtype
= flowtype
,
474 .rate_limit
.max_rate
= cfg_rate
,
475 .rate_limit
.flags
= M_NOWAIT
,
478 error
= m_snd_tag_alloc(ifp
, ¶ms
, tag
);
481 counter_u64_add(rate_limit_set_ok
, 1);
482 counter_u64_add(rate_limit_active
, 1);
483 } else if (error
!= EOPNOTSUPP
)
484 counter_u64_add(rate_limit_alloc_fail
, 1);
490 populate_canned_table(struct tcp_rate_set
*rs
, const uint64_t *rate_table_act
)
493 * The internal table is "special", it
494 * is two separate ordered tables that
495 * must be merged. We get here when the
496 * adapter specifies a number of rates that
497 * covers both ranges in the table in some
500 int i
, at_low
, at_high
;
501 uint8_t low_disabled
= 0, high_disabled
= 0;
503 for(i
= 0, at_low
= 0, at_high
= RS_NEXT_ORDER_GROUP
; i
< rs
->rs_rate_cnt
; i
++) {
504 rs
->rs_rlt
[i
].flags
= 0;
505 rs
->rs_rlt
[i
].time_between
= 0;
506 if ((low_disabled
== 0) &&
508 (rate_table_act
[at_low
] < rate_table_act
[at_high
]))) {
509 rs
->rs_rlt
[i
].rate
= rate_table_act
[at_low
];
511 if (at_low
== RS_NEXT_ORDER_GROUP
)
513 } else if (high_disabled
== 0) {
514 rs
->rs_rlt
[i
].rate
= rate_table_act
[at_high
];
516 if (at_high
== MAX_HDWR_RATES
)
522 static struct tcp_rate_set
*
523 rt_setup_new_rs(struct ifnet
*ifp
, int *error
)
525 struct tcp_rate_set
*rs
;
526 const uint64_t *rate_table_act
;
527 uint64_t lentim
, res
;
531 struct if_ratelimit_query_results rl
;
532 struct sysctl_oid
*rl_sysctl_root
;
533 struct epoch_tracker et
;
535 * We expect to enter with the
539 if (ifp
->if_ratelimit_query
== NULL
) {
541 * We can do nothing if we cannot
542 * get a query back from the driver.
544 printf("Warning:No query functions for %s:%d-- failed\n",
545 ifp
->if_dname
, ifp
->if_dunit
);
548 rs
= malloc(sizeof(struct tcp_rate_set
), M_TCPPACE
, M_NOWAIT
| M_ZERO
);
552 printf("Warning:No memory for malloc of tcp_rate_set\n");
555 memset(&rl
, 0, sizeof(rl
));
556 rl
.flags
= RT_NOSUPPORT
;
557 ifp
->if_ratelimit_query(ifp
, &rl
);
558 if (rl
.flags
& RT_IS_UNUSABLE
) {
560 * The interface does not really support
563 memset(rs
, 0, sizeof(struct tcp_rate_set
));
565 rs
->rs_if_dunit
= ifp
->if_dunit
;
566 rs
->rs_flags
= RS_INTF_NO_SUP
;
569 sysctl_ctx_init(&rs
->sysctl_ctx
);
570 rl_sysctl_root
= SYSCTL_ADD_NODE(&rs
->sysctl_ctx
,
571 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl
),
573 rs
->rs_ifp
->if_xname
,
574 CTLFLAG_RW
| CTLFLAG_MPSAFE
, 0,
576 rl_add_syctl_entries(rl_sysctl_root
, rs
);
579 CK_LIST_INSERT_HEAD(&int_rs
, rs
, next
);
583 } else if ((rl
.flags
& RT_IS_INDIRECT
) == RT_IS_INDIRECT
) {
584 memset(rs
, 0, sizeof(struct tcp_rate_set
));
586 rs
->rs_if_dunit
= ifp
->if_dunit
;
587 rs
->rs_flags
= RS_IS_DEFF
;
589 sysctl_ctx_init(&rs
->sysctl_ctx
);
590 rl_sysctl_root
= SYSCTL_ADD_NODE(&rs
->sysctl_ctx
,
591 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl
),
593 rs
->rs_ifp
->if_xname
,
594 CTLFLAG_RW
| CTLFLAG_MPSAFE
, 0,
596 rl_add_syctl_entries(rl_sysctl_root
, rs
);
599 CK_LIST_INSERT_HEAD(&int_rs
, rs
, next
);
603 } else if ((rl
.flags
& RT_IS_FIXED_TABLE
) == RT_IS_FIXED_TABLE
) {
604 /* Mellanox C4 likely */
606 rs
->rs_if_dunit
= ifp
->if_dunit
;
607 rs
->rs_rate_cnt
= rl
.number_of_rates
;
608 rs
->rs_min_seg
= rl
.min_segment_burst
;
609 rs
->rs_highest_valid
= 0;
610 rs
->rs_flow_limit
= rl
.max_flows
;
611 rs
->rs_flags
= RS_IS_INTF
| RS_NO_PRE
;
613 rate_table_act
= rl
.rate_table
;
614 } else if ((rl
.flags
& RT_IS_SELECTABLE
) == RT_IS_SELECTABLE
) {
615 /* Chelsio, C5 and C6 of Mellanox? */
617 rs
->rs_if_dunit
= ifp
->if_dunit
;
618 rs
->rs_rate_cnt
= rl
.number_of_rates
;
619 rs
->rs_min_seg
= rl
.min_segment_burst
;
621 rs
->rs_flow_limit
= rl
.max_flows
;
622 rate_table_act
= desired_rates
;
623 if ((rs
->rs_rate_cnt
> MAX_HDWR_RATES
) &&
624 (rs
->rs_rate_cnt
< ALL_HARDWARE_RATES
)) {
626 * Our desired table is not big
627 * enough, do what we can.
629 rs
->rs_rate_cnt
= MAX_HDWR_RATES
;
631 if (rs
->rs_rate_cnt
<= RS_ORDERED_COUNT
)
632 rs
->rs_flags
= RS_IS_INTF
;
634 rs
->rs_flags
= RS_IS_INTF
| RS_INT_TBL
;
635 if (rs
->rs_rate_cnt
>= ALL_HARDWARE_RATES
)
636 rs
->rs_rate_cnt
= ALL_HARDWARE_RATES
;
641 sz
= sizeof(struct tcp_hwrate_limit_table
) * rs
->rs_rate_cnt
;
642 rs
->rs_rlt
= malloc(sz
, M_TCPPACE
, M_NOWAIT
);
643 if (rs
->rs_rlt
== NULL
) {
650 if (rs
->rs_rate_cnt
>= ALL_HARDWARE_RATES
) {
652 * The interface supports all
653 * the rates we could possibly want.
657 rs
->rs_rlt
[0].rate
= 12500; /* 100k */
658 rs
->rs_rlt
[1].rate
= 25000; /* 200k */
659 rs
->rs_rlt
[2].rate
= 62500; /* 500k */
660 /* Note 125000 == 1Megabit
661 * populate 1Meg - 1000meg.
663 for(i
= 3, rat
= 125000; i
< (ALL_HARDWARE_RATES
-1); i
++) {
664 rs
->rs_rlt
[i
].rate
= rat
;
667 rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)].rate
= 1250000000;
668 } else if (rs
->rs_flags
& RS_INT_TBL
) {
669 /* We populate this in a special way */
670 populate_canned_table(rs
, rate_table_act
);
673 * Just copy in the rates from
674 * the table, it is in order.
676 for (i
=0; i
<rs
->rs_rate_cnt
; i
++) {
677 rs
->rs_rlt
[i
].rate
= rate_table_act
[i
];
678 rs
->rs_rlt
[i
].time_between
= 0;
679 rs
->rs_rlt
[i
].flags
= 0;
682 for (i
= (rs
->rs_rate_cnt
- 1); i
>= 0; i
--) {
684 * We go backwards through the list so that if we can't get
685 * a rate and fail to init one, we have at least a chance of
686 * getting the highest one.
688 rs
->rs_rlt
[i
].ptbl
= rs
;
689 rs
->rs_rlt
[i
].tag
= NULL
;
690 rs
->rs_rlt
[i
].using = 0;
691 rs
->rs_rlt
[i
].rs_num_enobufs
= 0;
693 * Calculate the time between.
695 lentim
= ETHERNET_SEGMENT_SIZE
* USECS_IN_SECOND
;
696 res
= lentim
/ rs
->rs_rlt
[i
].rate
;
698 rs
->rs_rlt
[i
].time_between
= res
;
700 rs
->rs_rlt
[i
].time_between
= 1;
701 if (rs
->rs_flags
& RS_NO_PRE
) {
702 rs
->rs_rlt
[i
].flags
= HDWRPACE_INITED
;
703 rs
->rs_lowest_valid
= i
;
707 if ((rl
.flags
& RT_IS_SETUP_REQ
) &&
708 (ifp
->if_ratelimit_query
)) {
709 err
= ifp
->if_ratelimit_setup(ifp
,
710 rs
->rs_rlt
[i
].rate
, i
);
715 hash_type
= M_HASHTYPE_RSS_TCP_IPV4
;
717 hash_type
= M_HASHTYPE_OPAQUE_HASH
;
719 err
= rl_attach_txrtlmt(ifp
,
726 if (i
== (rs
->rs_rate_cnt
- 1)) {
728 * Huh - first rate and we can't get
731 free(rs
->rs_rlt
, M_TCPPACE
);
741 rs
->rs_rlt
[i
].flags
= HDWRPACE_INITED
| HDWRPACE_TAGPRESENT
;
742 rs
->rs_lowest_valid
= i
;
746 /* Did we get at least 1 rate? */
747 if (rs
->rs_rlt
[(rs
->rs_rate_cnt
- 1)].flags
& HDWRPACE_INITED
)
748 rs
->rs_highest_valid
= rs
->rs_rate_cnt
- 1;
750 free(rs
->rs_rlt
, M_TCPPACE
);
754 sysctl_ctx_init(&rs
->sysctl_ctx
);
755 rl_sysctl_root
= SYSCTL_ADD_NODE(&rs
->sysctl_ctx
,
756 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl
),
758 rs
->rs_ifp
->if_xname
,
759 CTLFLAG_RW
| CTLFLAG_MPSAFE
, 0,
761 rl_add_syctl_entries(rl_sysctl_root
, rs
);
764 CK_LIST_INSERT_HEAD(&int_rs
, rs
, next
);
771 * For an explanation of why the argument is volatile please
772 * look at the comments around rt_setup_rate().
774 static const struct tcp_hwrate_limit_table
*
775 tcp_int_find_suitable_rate(const volatile struct tcp_rate_set
*rs
,
776 uint64_t bytes_per_sec
, uint32_t flags
, uint64_t *lower_rate
)
778 struct tcp_hwrate_limit_table
*arte
= NULL
, *rte
= NULL
;
779 uint64_t mbits_per_sec
, ind_calc
, previous_rate
= 0;
782 mbits_per_sec
= (bytes_per_sec
* 8);
783 if (flags
& RS_PACING_LT
) {
784 if ((mbits_per_sec
< RS_ONE_MEGABIT_PERSEC
) &&
785 (rs
->rs_lowest_valid
<= 2)){
787 * Smaller than 1Meg, only
788 * 3 entries can match it.
791 for(i
= rs
->rs_lowest_valid
; i
< 3; i
++) {
792 if (bytes_per_sec
<= rs
->rs_rlt
[i
].rate
) {
793 rte
= &rs
->rs_rlt
[i
];
795 } else if (rs
->rs_rlt
[i
].flags
& HDWRPACE_INITED
) {
796 arte
= &rs
->rs_rlt
[i
];
798 previous_rate
= rs
->rs_rlt
[i
].rate
;
801 } else if ((mbits_per_sec
> RS_ONE_GIGABIT_PERSEC
) &&
802 (rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)].flags
& HDWRPACE_INITED
)){
804 * Larger than 1G (the majority of
807 if (mbits_per_sec
< RS_TEN_GIGABIT_PERSEC
)
808 rte
= &rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)];
810 arte
= &rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)];
811 previous_rate
= rs
->rs_rlt
[(ALL_HARDWARE_RATES
-2)].rate
;
815 * If we reach here its in our table (between 1Meg - 1000Meg),
816 * just take the rounded down mbits per second, and add
817 * 1Megabit to it, from this we can calculate
818 * the index in the table.
820 ind_calc
= mbits_per_sec
/RS_ONE_MEGABIT_PERSEC
;
821 if ((ind_calc
* RS_ONE_MEGABIT_PERSEC
) != mbits_per_sec
)
823 /* our table is offset by 3, we add 2 */
825 if (ind_calc
> (ALL_HARDWARE_RATES
-1)) {
826 /* This should not happen */
827 ind_calc
= ALL_HARDWARE_RATES
-1;
829 if ((ind_calc
>= rs
->rs_lowest_valid
) &&
830 (ind_calc
<= rs
->rs_highest_valid
)) {
831 rte
= &rs
->rs_rlt
[ind_calc
];
833 previous_rate
= rs
->rs_rlt
[(ind_calc
-1)].rate
;
835 } else if (flags
& RS_PACING_EXACT_MATCH
) {
836 if ((mbits_per_sec
< RS_ONE_MEGABIT_PERSEC
) &&
837 (rs
->rs_lowest_valid
<= 2)){
838 for(i
= rs
->rs_lowest_valid
; i
< 3; i
++) {
839 if (bytes_per_sec
== rs
->rs_rlt
[i
].rate
) {
840 rte
= &rs
->rs_rlt
[i
];
844 } else if ((mbits_per_sec
> RS_ONE_GIGABIT_PERSEC
) &&
845 (rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)].flags
& HDWRPACE_INITED
)) {
846 /* > 1Gbps only one rate */
847 if (bytes_per_sec
== rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)].rate
) {
849 rte
= &rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)];
852 /* Ok it must be a exact meg (its between 1G and 1Meg) */
853 ind_calc
= mbits_per_sec
/RS_ONE_MEGABIT_PERSEC
;
854 if ((ind_calc
* RS_ONE_MEGABIT_PERSEC
) == mbits_per_sec
) {
855 /* its an exact Mbps */
857 if (ind_calc
> (ALL_HARDWARE_RATES
-1)) {
858 /* This should not happen */
859 ind_calc
= ALL_HARDWARE_RATES
-1;
861 if (rs
->rs_rlt
[ind_calc
].flags
& HDWRPACE_INITED
)
862 rte
= &rs
->rs_rlt
[ind_calc
];
866 /* we want greater than the requested rate */
867 if ((mbits_per_sec
< RS_ONE_MEGABIT_PERSEC
) &&
868 (rs
->rs_lowest_valid
<= 2)){
869 arte
= &rs
->rs_rlt
[3]; /* set alternate to 1Meg */
870 for (i
=2; i
>=rs
->rs_lowest_valid
; i
--) {
871 if (bytes_per_sec
< rs
->rs_rlt
[i
].rate
) {
872 rte
= &rs
->rs_rlt
[i
];
874 previous_rate
= rs
->rs_rlt
[(i
-1)].rate
;
877 } else if ((flags
& RS_PACING_GEQ
) &&
878 (bytes_per_sec
== rs
->rs_rlt
[i
].rate
)) {
879 rte
= &rs
->rs_rlt
[i
];
881 previous_rate
= rs
->rs_rlt
[(i
-1)].rate
;
885 arte
= &rs
->rs_rlt
[i
]; /* new alternate */
888 } else if (mbits_per_sec
> RS_ONE_GIGABIT_PERSEC
) {
889 if ((bytes_per_sec
< rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)].rate
) &&
890 (rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)].flags
& HDWRPACE_INITED
)){
891 /* Our top rate is larger than the request */
892 rte
= &rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)];
893 } else if ((flags
& RS_PACING_GEQ
) &&
894 (bytes_per_sec
== rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)].rate
) &&
895 (rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)].flags
& HDWRPACE_INITED
)) {
896 /* It matches our top rate */
897 rte
= &rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)];
898 } else if (rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)].flags
& HDWRPACE_INITED
) {
899 /* The top rate is an alternative */
900 arte
= &rs
->rs_rlt
[(ALL_HARDWARE_RATES
-1)];
902 previous_rate
= rs
->rs_rlt
[(ALL_HARDWARE_RATES
-2)].rate
;
904 /* Its in our range 1Meg - 1Gig */
905 if (flags
& RS_PACING_GEQ
) {
906 ind_calc
= mbits_per_sec
/RS_ONE_MEGABIT_PERSEC
;
907 if ((ind_calc
* RS_ONE_MEGABIT_PERSEC
) == mbits_per_sec
) {
908 if (ind_calc
> (ALL_HARDWARE_RATES
-1)) {
909 /* This should not happen */
910 ind_calc
= (ALL_HARDWARE_RATES
-1);
912 rte
= &rs
->rs_rlt
[ind_calc
];
914 previous_rate
= rs
->rs_rlt
[(ind_calc
-1)].rate
;
918 ind_calc
= (mbits_per_sec
+ (RS_ONE_MEGABIT_PERSEC
-1))/RS_ONE_MEGABIT_PERSEC
;
920 if (ind_calc
> (ALL_HARDWARE_RATES
-1)) {
921 /* This should not happen */
922 ind_calc
= ALL_HARDWARE_RATES
-1;
924 if (rs
->rs_rlt
[ind_calc
].flags
& HDWRPACE_INITED
) {
925 rte
= &rs
->rs_rlt
[ind_calc
];
927 previous_rate
= rs
->rs_rlt
[(ind_calc
-1)].rate
;
934 (flags
& RS_PACING_SUB_OK
)) {
935 /* We can use the substitute */
939 *lower_rate
= previous_rate
;
944 * For an explanation of why the argument is volatile please
945 * look at the comments around rt_setup_rate().
947 static const struct tcp_hwrate_limit_table
*
948 tcp_find_suitable_rate(const volatile struct tcp_rate_set
*rs
, uint64_t bytes_per_sec
, uint32_t flags
, uint64_t *lower_rate
)
951 * Hunt the rate table with the restrictions in flags and find a
952 * suitable rate if possible.
953 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
954 * RS_PACING_GT - must be greater than.
955 * RS_PACING_GEQ - must be greater than or equal.
956 * RS_PACING_LT - must be less than.
957 * RS_PACING_SUB_OK - If we don't meet criteria a
961 struct tcp_hwrate_limit_table
*rte
= NULL
;
962 uint64_t previous_rate
= 0;
964 if ((rs
->rs_flags
& RS_INT_TBL
) &&
965 (rs
->rs_rate_cnt
>= ALL_HARDWARE_RATES
)) {
967 * Here we don't want to paw thru
968 * a big table, we have everything
969 * from 1Meg - 1000Meg in 1Meg increments.
970 * Use an alternate method to "lookup".
972 return (tcp_int_find_suitable_rate(rs
, bytes_per_sec
, flags
, lower_rate
));
974 if ((flags
& RS_PACING_LT
) ||
975 (flags
& RS_PACING_EXACT_MATCH
)) {
977 * For exact and less than we go forward through the table.
978 * This way when we find one larger we stop (exact was a
981 for (i
= rs
->rs_lowest_valid
, matched
= 0; i
<= rs
->rs_highest_valid
; i
++) {
982 if ((flags
& RS_PACING_EXACT_MATCH
) &&
983 (bytes_per_sec
== rs
->rs_rlt
[i
].rate
)) {
984 rte
= &rs
->rs_rlt
[i
];
986 if (lower_rate
!= NULL
)
987 *lower_rate
= previous_rate
;
989 } else if ((flags
& RS_PACING_LT
) &&
990 (bytes_per_sec
<= rs
->rs_rlt
[i
].rate
)) {
991 rte
= &rs
->rs_rlt
[i
];
993 if (lower_rate
!= NULL
)
994 *lower_rate
= previous_rate
;
997 previous_rate
= rs
->rs_rlt
[i
].rate
;
998 if (bytes_per_sec
> rs
->rs_rlt
[i
].rate
)
1001 if ((matched
== 0) &&
1002 (flags
& RS_PACING_LT
) &&
1003 (flags
& RS_PACING_SUB_OK
)) {
1004 /* Kick in a substitute (the lowest) */
1005 rte
= &rs
->rs_rlt
[rs
->rs_lowest_valid
];
1009 * Here we go backward through the table so that we can find
1010 * the one greater in theory faster (but its probably a
1013 for (i
= rs
->rs_highest_valid
, matched
= 0; i
>= rs
->rs_lowest_valid
; i
--) {
1014 if (rs
->rs_rlt
[i
].rate
> bytes_per_sec
) {
1015 /* A possible candidate */
1016 rte
= &rs
->rs_rlt
[i
];
1018 if ((flags
& RS_PACING_GEQ
) &&
1019 (bytes_per_sec
== rs
->rs_rlt
[i
].rate
)) {
1020 /* An exact match and we want equal */
1022 rte
= &rs
->rs_rlt
[i
];
1026 * Found one that is larger than but don't
1027 * stop, there may be a closer match.
1031 if (rs
->rs_rlt
[i
].rate
< bytes_per_sec
) {
1033 * We found a table entry that is smaller,
1034 * stop there will be none greater or equal.
1036 if (lower_rate
!= NULL
)
1037 *lower_rate
= rs
->rs_rlt
[i
].rate
;
1041 if ((matched
== 0) &&
1042 (flags
& RS_PACING_SUB_OK
)) {
1043 /* Kick in a substitute (the highest) */
1044 rte
= &rs
->rs_rlt
[rs
->rs_highest_valid
];
1050 static struct ifnet
*
1051 rt_find_real_interface(struct ifnet
*ifp
, struct inpcb
*inp
, int *error
)
1054 struct m_snd_tag
*tag
, *ntag
;
1055 union if_snd_tag_alloc_params params
= {
1056 .rate_limit
.hdr
.type
= IF_SND_TAG_TYPE_RATE_LIMIT
,
1057 .rate_limit
.hdr
.flowid
= inp
->inp_flowid
,
1058 .rate_limit
.hdr
.numa_domain
= inp
->inp_numa_domain
,
1059 .rate_limit
.max_rate
= COMMON_RATE
,
1060 .rate_limit
.flags
= M_NOWAIT
,
1064 params
.rate_limit
.hdr
.flowtype
= ((inp
->inp_vflag
& INP_IPV6
) ?
1065 M_HASHTYPE_RSS_TCP_IPV6
: M_HASHTYPE_RSS_TCP_IPV4
);
1067 params
.rate_limit
.hdr
.flowtype
= M_HASHTYPE_OPAQUE_HASH
;
1069 err
= m_snd_tag_alloc(ifp
, ¶ms
, &tag
);
1071 /* Failed to setup a tag? */
1077 while (ntag
->sw
->next_snd_tag
!= NULL
) {
1078 ntag
= ntag
->sw
->next_snd_tag(ntag
);
1081 m_snd_tag_rele(tag
);
1086 rl_increment_using(const struct tcp_hwrate_limit_table
*rte
)
1088 struct tcp_hwrate_limit_table
*decon_rte
;
1090 decon_rte
= __DECONST(struct tcp_hwrate_limit_table
*, rte
);
1091 atomic_add_long(&decon_rte
->using, 1);
1095 rl_decrement_using(const struct tcp_hwrate_limit_table
*rte
)
1097 struct tcp_hwrate_limit_table
*decon_rte
;
1099 decon_rte
= __DECONST(struct tcp_hwrate_limit_table
*, rte
);
1100 atomic_subtract_long(&decon_rte
->using, 1);
1104 tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table
*rte
)
1106 struct tcp_hwrate_limit_table
*decon_rte
;
1108 decon_rte
= __DECONST(struct tcp_hwrate_limit_table
*, rte
);
1109 atomic_add_long(&decon_rte
->rs_num_enobufs
, 1);
1113 * Do NOT take the __noinline out of the
1114 * find_rs_for_ifp() function. If you do, the inlining
1115 * of it into rt_setup_rate() will show you a
1116 * compiler bug. For some reason the compiler thinks
1117 * the list can never be empty. The consequence of
1118 * this will be a crash when we dereference NULL
1119 * if an ifp is removed just as a hw rate limit
1120 * is attempted. If you are working on the compiler
1121 * and want to "test" this go ahead and take the noinline
1122 * out; otherwise let sleeping dogs lie until such time
1123 * as we get a compiler fix. 10/2/20 -- RRS
/*
 * find_rs_for_ifp: walk the int_rs list with CK_LIST_FOREACH looking for
 * the rate set that belongs to `ifp`.  An entry matches only when both
 * its rs_ifp pointer and its rs_if_dunit equal the interface's values.
 *
 * NOTE(review): the return statements are not visible in this view;
 * callers in this file test the result against NULL, so a miss is
 * presumably reported as NULL -- confirm.
 */
1125 static __noinline
struct tcp_rate_set
*
1126 find_rs_for_ifp(struct ifnet
*ifp
)
1128 struct tcp_rate_set
*rs
;
1130 CK_LIST_FOREACH(rs
, &int_rs
, next
) {
/* Match on the ifnet pointer and on the unit number. */
1131 if ((rs
->rs_ifp
== ifp
) &&
1132 (rs
->rs_if_dunit
== ifp
->if_dunit
)) {
1133 /* Ok we found it */
/*
 * rt_setup_rate: find the rate set for `ifp`, choose a table entry for
 * the requested rate and attach a rate-limit send tag to `inp`.
 *
 *   inp           - connection the rate is being set up for.
 *   ifp           - interface to look up.  If its rate set is flagged
 *                   RS_IS_DEFF the real interface is obtained from
 *                   rt_find_real_interface() and the lookup restarts
 *                   (goto use_real_interface).
 *   bytes_per_sec - requested rate, handed to tcp_find_suitable_rate().
 *   flags         - handed to tcp_find_suitable_rate().
 *   error         - handed to rt_find_real_interface() as its error
 *                   out-parameter.
 *   lower_rate    - handed to tcp_find_suitable_rate().
 *
 * The lookup runs inside a network epoch section (NET_EPOCH_ENTER).
 * A rate set that is missing, disabled (rs_disable), flagged
 * RS_INTF_NO_SUP or RS_IS_DEAD, or already at its rs_flow_limit is
 * tested for explicitly before a rate is chosen.
 * NOTE(review): what each of those tests returns is not visible in this
 * view.  The visible tail bumps rate_limit_new and atomically adds one
 * to rs->rs_flows_using after the in_pcbattach_txrtlmt() call.
 */
1141 static const struct tcp_hwrate_limit_table
*
1142 rt_setup_rate(struct inpcb
*inp
, struct ifnet
*ifp
, uint64_t bytes_per_sec
,
1143 uint32_t flags
, int *error
, uint64_t *lower_rate
)
1145 /* First lets find the interface if it exists */
1146 const struct tcp_hwrate_limit_table
*rte
;
1148 * So why is rs volatile? This is to defeat a
1149 * compiler bug where in the compiler is convinced
1150 * that rs can never be NULL (which is not true). Because
1151 * of its conviction it nicely optimizes out the if ((rs == NULL
1152 * below which means if you get a NULL back you dereference it.
1154 volatile struct tcp_rate_set
*rs
;
1155 struct epoch_tracker et
;
/* Keep the interface the caller passed in; it is what gets attached. */
1156 struct ifnet
*oifp
= ifp
;
1159 NET_EPOCH_ENTER(et
);
1161 rs
= find_rs_for_ifp(ifp
);
1163 (rs
->rs_flags
& RS_INTF_NO_SUP
) ||
1164 (rs
->rs_flags
& RS_IS_DEAD
)) {
1166 * This means we got a packet *before*
1167 * the IF-UP was processed below, <or>
1168 * while or after we already received an interface
1169 * departed event. In either case we really don't
1170 * want to do anything with pacing, in
1171 * the departing case the packet is not
1172 * going to go very far. The new case
1173 * might be arguable, but its impossible
1174 * to tell from the departing case.
1182 if ((rs
== NULL
) || (rs
->rs_disable
!= 0)) {
1188 if (rs
->rs_flags
& RS_IS_DEFF
) {
1189 /* We need to find the real interface */
1192 tifp
= rt_find_real_interface(ifp
, inp
, error
);
1194 if (rs
->rs_disable
&& error
)
1199 KASSERT((tifp
!= ifp
),
1200 ("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n",
1203 goto use_real_interface
;
/* Per-rate-set flow cap: a limit of zero means the check is skipped. */
1205 if (rs
->rs_flow_limit
&&
1206 ((rs
->rs_flows_using
+ 1) > rs
->rs_flow_limit
)) {
1212 rte
= tcp_find_suitable_rate(rs
, bytes_per_sec
, flags
, lower_rate
);
1214 err
= in_pcbattach_txrtlmt(inp
, oifp
,
1220 /* Failed to attach */
1225 KASSERT((inp
->inp_snd_tag
!= NULL
) ,
1226 ("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p",
1227 inp
, rte
, (unsigned long long)rte
->rate
, rs
));
1229 counter_u64_add(rate_limit_new
, 1);
1235 * We use an atomic here for accounting so we don't have to
1236 * use locks when freeing.
1238 atomic_add_64(&rs
->rs_flows_using
, 1);
/*
 * tcp_rl_ifnet_link: called with an interface and its new link state.
 * The function first tests whether IFCAP_TXRTLMT is disabled on `ifp` or
 * `link_state` is not LINK_STATE_UP; per the in-line comment only
 * rate-limit capable interfaces going up are of interest.  It then looks
 * for an existing rate set with find_rs_for_ifp() inside a network epoch
 * section and calls rt_setup_new_rs() for the interface.
 *
 * NOTE(review): the branch structure between the lookup and
 * rt_setup_new_rs() is not fully visible here; the "already have
 * initialized" comment suggests an existing rate set is left alone --
 * confirm.
 */
1245 tcp_rl_ifnet_link(void *arg __unused
, struct ifnet
*ifp
, int link_state
)
1248 struct tcp_rate_set
*rs
;
1249 struct epoch_tracker et
;
1251 if (((ifp
->if_capenable
& IFCAP_TXRTLMT
) == 0) ||
1252 (link_state
!= LINK_STATE_UP
)) {
1254 * We only care on an interface going up that is rate-limit
1259 NET_EPOCH_ENTER(et
);
1261 rs
= find_rs_for_ifp(ifp
);
1263 /* We already have initialized this guy */
1264 mtx_unlock(&rs_mtx
);
1268 mtx_unlock(&rs_mtx
);
1270 rt_setup_new_rs(ifp
, &error
);
/*
 * tcp_rl_ifnet_departure: tear down the rate-limit state kept for `ifp`.
 * The rate set returned by find_rs_for_ifp() is unlinked from the list
 * and flagged RS_IS_DEAD.  Every table entry that carries
 * HDWRPACE_TAGPRESENT has its tag detached with in_pcbdetach_tag() and
 * cleared, and every entry's flags are overwritten with
 * HDWRPACE_IFPDEPARTED.  When rs_flows_using is zero the set is handed
 * to rs_defer_destroy().
 *
 * `arg` is unused.  tcp_rs_init() registers this function for
 * ifnet_departure_event, and tcp_rl_release_ifnet() calls it directly.
 */
1274 tcp_rl_ifnet_departure(void *arg __unused
, struct ifnet
*ifp
)
1276 struct tcp_rate_set
*rs
;
1277 struct epoch_tracker et
;
1280 NET_EPOCH_ENTER(et
);
1282 rs
= find_rs_for_ifp(ifp
);
1284 CK_LIST_REMOVE(rs
, next
);
1286 rs
->rs_flags
|= RS_IS_DEAD
;
/* Drop every hardware tag and mark each rate entry as departed. */
1287 for (i
= 0; i
< rs
->rs_rate_cnt
; i
++) {
1288 if (rs
->rs_rlt
[i
].flags
& HDWRPACE_TAGPRESENT
) {
1289 in_pcbdetach_tag(rs
->rs_rlt
[i
].tag
);
1290 rs
->rs_rlt
[i
].tag
= NULL
;
1292 rs
->rs_rlt
[i
].flags
= HDWRPACE_IFPDEPARTED
;
/* Nobody is using the set any more, so it can be destroyed. */
1294 if (rs
->rs_flows_using
== 0)
1295 rs_defer_destroy(rs
);
1297 mtx_unlock(&rs_mtx
);
/*
 * tcp_rl_release_ifnet: thin wrapper that runs the departure handling
 * for `ifp` by calling tcp_rl_ifnet_departure() with a NULL argument.
 */
1302 tcp_rl_release_ifnet(struct ifnet
*ifp
)
1304 tcp_rl_ifnet_departure(NULL
, ifp
);
/*
 * tcp_rl_shutdown: apply the departure teardown to every rate set on the
 * int_rs list.  Inside a network epoch section each set is unlinked
 * (the _SAFE iterator is used because entries are removed while
 * walking), flagged RS_IS_DEAD, has the tags of its
 * HDWRPACE_TAGPRESENT entries detached and cleared, and has every
 * entry's flags set to HDWRPACE_IFPDEPARTED.  Sets with
 * rs_flows_using == 0 are handed to rs_defer_destroy().
 *
 * Both arguments are unused.  tcp_rs_init() registers this function for
 * shutdown_pre_sync with SHUTDOWN_PRI_FIRST.
 */
1308 tcp_rl_shutdown(void *arg __unused
, int howto __unused
)
1310 struct tcp_rate_set
*rs
, *nrs
;
1311 struct epoch_tracker et
;
1314 NET_EPOCH_ENTER(et
);
1316 CK_LIST_FOREACH_SAFE(rs
, &int_rs
, next
, nrs
) {
1317 CK_LIST_REMOVE(rs
, next
);
1319 rs
->rs_flags
|= RS_IS_DEAD
;
1320 for (i
= 0; i
< rs
->rs_rate_cnt
; i
++) {
1321 if (rs
->rs_rlt
[i
].flags
& HDWRPACE_TAGPRESENT
) {
1322 in_pcbdetach_tag(rs
->rs_rlt
[i
].tag
);
1323 rs
->rs_rlt
[i
].tag
= NULL
;
1325 rs
->rs_rlt
[i
].flags
= HDWRPACE_IFPDEPARTED
;
1327 if (rs
->rs_flows_using
== 0)
1328 rs_defer_destroy(rs
);
1330 mtx_unlock(&rs_mtx
);
/*
 * tcp_set_pacing_rate: set up a hardware pacing rate for connection `tp`
 * on interface `ifp`.  The caller must hold the inpcb write lock
 * (INP_WLOCK_ASSERT).
 *
 * When the inpcb has no send tag yet this is a first-time setup:
 *  - the interface's IFCAP_TXRTLMT capability is tested;
 *  - for connections with t_nic_ktls_xmit set, IFCAP_TXTLS_RTLMT and
 *    the TLS session mode (TCP_TLS_MODE_IFNET) are tested as well;
 *  - the entry is obtained from rt_setup_rate(), passing through
 *    bytes_per_sec, flags, error and lower_rate, and its use count is
 *    bumped with rl_increment_using();
 *  - if a TLS send tag already exists, ktls_output_eagain() is called so
 *    that the tag is converted to a TLS rate-limit tag.
 *
 * The selected entry's rate is stored in tp->t_pacing_rate.
 * NOTE(review): the failure returns and the "modifying a rate" branch
 * are not visible in this view.
 */
1334 const struct tcp_hwrate_limit_table
*
1335 tcp_set_pacing_rate(struct tcpcb
*tp
, struct ifnet
*ifp
,
1336 uint64_t bytes_per_sec
, int flags
, int *error
, uint64_t *lower_rate
)
1338 struct inpcb
*inp
= tptoinpcb(tp
);
1339 const struct tcp_hwrate_limit_table
*rte
;
1341 struct ktls_session
*tls
;
1344 INP_WLOCK_ASSERT(inp
);
1346 if (inp
->inp_snd_tag
== NULL
) {
1348 * We are setting up a rate for the first time.
1350 if ((ifp
->if_capenable
& IFCAP_TXRTLMT
) == 0) {
1351 /* Not supported by the egress */
1358 if (tp
->t_nic_ktls_xmit
!= 0) {
1359 tls
= tptosocket(tp
)->so_snd
.sb_tls_info
;
1361 if ((ifp
->if_capenable
& IFCAP_TXTLS_RTLMT
) == 0 ||
1362 tls
->mode
!= TCP_TLS_MODE_IFNET
) {
1369 rte
= rt_setup_rate(inp
, ifp
, bytes_per_sec
, flags
, error
, lower_rate
);
1371 rl_increment_using(rte
);
1373 if (rte
!= NULL
&& tls
!= NULL
&& tls
->snd_tag
!= NULL
) {
1375 * Fake a route change error to reset the TLS
1376 * send tag. This will convert the existing
1377 * tag to a TLS ratelimit tag.
1379 MPASS(tls
->snd_tag
->sw
->type
== IF_SND_TAG_TYPE_TLS
);
1380 ktls_output_eagain(inp
, tls
);
1385 * We are modifying a rate, wrong interface?
1392 tp
->t_pacing_rate
= rte
->rate
;
/*
 * tcp_chg_pacing_rate: move connection `tp` from its current rate entry
 * `crte` to an entry suited to `bytes_per_sec`.  The caller must hold
 * the inpcb write lock (INP_WLOCK_ASSERT).
 *
 * Visible behavior, in order:
 *  - With t_nic_ktls_xmit set, the TLS session mode and the type of the
 *    TLS send tag are inspected.  If the tag is not of type
 *    IF_SND_TAG_TYPE_TLS_RATE_LIMIT and no reset is pending, `crte` is
 *    released and *error is set to EOPNOTSUPP.
 *  - A NULL inp_snd_tag is treated as "wrong interface" and `crte` is
 *    released.
 *  - If the rate set is RS_IS_DEAD or `crte` is flagged
 *    HDWRPACE_IFPDEPARTED, `crte` is released and setup starts over via
 *    tcp_set_pacing_rate().
 *  - Otherwise a new entry is picked with tcp_find_suitable_rate(), the
 *    use count moves from `crte` to `nrte`, and the new rate is applied
 *    with ktls_modify_txrtlmt() or in_pcbmodify_txrtlmt().
 *  - The section that declares `lrs` undoes the accounting: it drops the
 *    `nrte` use count and rs_flows_using, detaches any remaining send
 *    tag, and may hand a RS_IS_DEAD rate set to rs_defer_destroy().
 *    NOTE(review): this looks like the failure path of the modify call;
 *    the guarding condition is not visible -- confirm.
 *  - rate_limit_chg is incremented and tp->t_pacing_rate is set to
 *    nrte->rate.
 */
1398 const struct tcp_hwrate_limit_table
*
1399 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table
*crte
,
1400 struct tcpcb
*tp
, struct ifnet
*ifp
,
1401 uint64_t bytes_per_sec
, int flags
, int *error
, uint64_t *lower_rate
)
1403 struct inpcb
*inp
= tptoinpcb(tp
);
1404 const struct tcp_hwrate_limit_table
*nrte
;
1405 const struct tcp_rate_set
*rs
;
1407 struct ktls_session
*tls
= NULL
;
1411 INP_WLOCK_ASSERT(inp
);
1414 /* Wrong interface */
1421 if (tp
->t_nic_ktls_xmit
) {
1422 tls
= tptosocket(tp
)->so_snd
.sb_tls_info
;
1423 if (tls
->mode
!= TCP_TLS_MODE_IFNET
)
1425 else if (tls
->snd_tag
!= NULL
&&
1426 tls
->snd_tag
->sw
->type
!= IF_SND_TAG_TYPE_TLS_RATE_LIMIT
) {
1427 if (!tls
->reset_pending
) {
1429 * NIC probably doesn't support
1430 * ratelimit TLS tags if it didn't
1431 * allocate one when an existing rate
1432 * was present, so ignore.
1434 tcp_rel_pacing_rate(crte
, tp
);
1436 *error
= EOPNOTSUPP
;
1441 * The send tag is being converted, so set the
1442 * rate limit on the inpcb tag. There is a
1443 * race that the new NIC send tag might use
1444 * the current rate instead of this one.
1450 if (inp
->inp_snd_tag
== NULL
) {
1451 /* Wrong interface */
1452 tcp_rel_pacing_rate(crte
, tp
);
1458 if ((rs
->rs_flags
& RS_IS_DEAD
) ||
1459 (crte
->flags
& HDWRPACE_IFPDEPARTED
)) {
1460 /* Release the rate, and try anew */
1462 tcp_rel_pacing_rate(crte
, tp
);
1463 nrte
= tcp_set_pacing_rate(tp
, ifp
,
1464 bytes_per_sec
, flags
, error
, lower_rate
);
1467 nrte
= tcp_find_suitable_rate(rs
, bytes_per_sec
, flags
, lower_rate
);
1475 /* Release the old rate */
1478 tcp_rel_pacing_rate(crte
, tp
);
/* Move this connection's reference from the old entry to the new one. */
1481 rl_decrement_using(crte
);
1482 rl_increment_using(nrte
);
1483 /* Change rates to our new entry */
1486 err
= ktls_modify_txrtlmt(tls
, nrte
->rate
);
1489 err
= in_pcbmodify_txrtlmt(inp
, nrte
->rate
);
1491 struct tcp_rate_set
*lrs
;
1494 rl_decrement_using(nrte
);
/* The rate set pointer is const here; strip it to update the flow count. */
1495 lrs
= __DECONST(struct tcp_rate_set
*, rs
);
1496 pre
= atomic_fetchadd_64(&lrs
->rs_flows_using
, -1);
1497 /* Do we still have a snd-tag attached? */
1498 if (inp
->inp_snd_tag
)
1499 in_pcbdetach_txrtlmt(inp
);
1502 struct epoch_tracker et
;
1504 NET_EPOCH_ENTER(et
);
1509 if (lrs
->rs_flags
& RS_IS_DEAD
)
1510 rs_defer_destroy(lrs
);
1511 mtx_unlock(&rs_mtx
);
1519 counter_u64_add(rate_limit_chg
, 1);
1524 tp
->t_pacing_rate
= nrte
->rate
;
/*
 * tcp_rel_pacing_rate: release the rate entry `crte` held by connection
 * `tp`.  The caller must hold the inpcb write lock (INP_WLOCK_ASSERT).
 *
 * tp->t_pacing_rate is reset to -1, the entry's use count is dropped
 * with rl_decrement_using(), and rs_flows_using of the rate set is
 * decremented atomically (the const is stripped from `crs` with
 * __DECONST to do so).  A rate set flagged RS_IS_DEAD may then be handed
 * to rs_defer_destroy(), and the send tag is detached from the inpcb
 * with in_pcbdetach_txrtlmt().
 *
 * NOTE(review): the assignment of `crs` and the condition guarding the
 * epoch/destroy section are not visible in this view.
 */
1529 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table
*crte
, struct tcpcb
*tp
)
1531 struct inpcb
*inp
= tptoinpcb(tp
);
1532 const struct tcp_rate_set
*crs
;
1533 struct tcp_rate_set
*rs
;
1536 INP_WLOCK_ASSERT(inp
);
1538 tp
->t_pacing_rate
= -1;
1541 * Now we must break the const
1542 * in order to release our refcount.
1544 rs
= __DECONST(struct tcp_rate_set
*, crs
);
1545 rl_decrement_using(crte
);
/* atomic_fetchadd_64 hands back the value before the decrement. */
1546 pre
= atomic_fetchadd_64(&rs
->rs_flows_using
, -1);
1548 struct epoch_tracker et
;
1550 NET_EPOCH_ENTER(et
);
1555 if (rs
->rs_flags
& RS_IS_DEAD
)
1556 rs_defer_destroy(rs
);
1557 mtx_unlock(&rs_mtx
);
1562 * XXX: If this connection is using ifnet TLS, should we
1563 * switch it to using an unlimited rate, or perhaps use
1564 * ktls_output_eagain() to reset the send tag to a plain
1567 in_pcbdetach_txrtlmt(inp
);
/*
 * The three rate constants are expressed in bytes per second (bits per
 * second divided by 8); MAX_MSS_SENT is a count of segments.
 */
1570 #define ONE_POINT_TWO_MEG 150000 /* 1.2Mbps in bytes per second */
1571 #define ONE_HUNDRED_MBPS 12500000 /* 100Mbps in bytes per second */
1572 #define FIVE_HUNDRED_MBPS 62500000 /* 500Mbps in bytes per second */
1573 #define MAX_MSS_SENT 43 /* 43 mss = 43 x 1500 = 64,500 bytes */
/*
 * tcp_log_pacing_size: when tcp_bblogging_on(tp) is true, emit a
 * TCP_HDWR_PACE_SIZE log event describing a pacing-size decision.
 *
 * The log record is zeroed and then filled in its u_bbr member:
 *   flex1 = segsiz          flex2 = new_tso
 *   flex3 = time_between    flex4 = calc_time_between
 *   flex5 = segs            flex6 = res_div
 *   flex7 = mult            flex8 = mod
 *   cur_del_rate = bw       delRate = hw_rate
 *   timeStamp = tcp_get_usecs(&tv)
 * The event is recorded with TCP_LOG_EVENTP against the socket's
 * receive and send buffers.
 */
1576 tcp_log_pacing_size(struct tcpcb
*tp
, uint64_t bw
, uint32_t segsiz
, uint32_t new_tso
,
1577 uint64_t hw_rate
, uint32_t time_between
, uint32_t calc_time_between
,
1578 uint32_t segs
, uint32_t res_div
, uint16_t mult
, uint8_t mod
)
1580 if (tcp_bblogging_on(tp
)) {
1581 union tcp_log_stackspecific log
;
1584 memset(&log
, 0, sizeof(log
));
1585 log
.u_bbr
.flex1
= segsiz
;
1586 log
.u_bbr
.flex2
= new_tso
;
1587 log
.u_bbr
.flex3
= time_between
;
1588 log
.u_bbr
.flex4
= calc_time_between
;
1589 log
.u_bbr
.flex5
= segs
;
1590 log
.u_bbr
.flex6
= res_div
;
1591 log
.u_bbr
.flex7
= mult
;
1592 log
.u_bbr
.flex8
= mod
;
1593 log
.u_bbr
.timeStamp
= tcp_get_usecs(&tv
);
1594 log
.u_bbr
.cur_del_rate
= bw
;
1595 log
.u_bbr
.delRate
= hw_rate
;
1596 TCP_LOG_EVENTP(tp
, NULL
,
1597 &tptosocket(tp
)->so_rcv
,
1598 &tptosocket(tp
)->so_snd
,
1599 TCP_HDWR_PACE_SIZE
, 0,
1600 0, &log
, false, &tv
);
/*
 * tcp_get_pacing_burst_size_w_divisor: compute how much may be sent in
 * one pacing burst for connection `tp` at bandwidth `bw` with segment
 * size `segsiz`.
 *
 * Software sizing:
 *   bytes = bw / divisor; when `divisor` is zero or below
 *   RL_MIN_DIVISOR the visible alternative divides by mss_divisor
 *   instead.  The byte count is checked against 0xffff, then converted
 *   to a segment count rounded up:
 *   new_tso = (bytes + segsiz - 1) / segsiz.  The count is subject to the
 *   even_num_segs/even_threshold test, raised to rs_floor_mss or
 *   min_tso_segs, and capped at MAX_MSS_SENT.
 *
 * Hardware sizing (uses the rate entry `te`):
 *   - if te->rate equals bw the function returns segsiz * MAX_MSS_SENT;
 *   - otherwise, when `res` exceeds te->time_between,
 *       res_div = res * num_of_waits_allowed + wait_time_floor
 *       delta   = res - te->time_between
 *       segs    = ceil(res_div / delta)
 *     and segs is clamped to at least min_tso_segs and rs_hw_floor_mss
 *     and at most MAX_MSS_SENT, then compared against new_tso;
 *   - otherwise (per the in-line comment) the non-hardware size is used.
 *
 * Each decision point is recorded through tcp_log_pacing_size().
 * NOTE(review): the computation of `res`, the handling of
 * `can_use_1mss` and `err`, and most return statements are not visible
 * in this view.
 */
1605 tcp_get_pacing_burst_size_w_divisor(struct tcpcb
*tp
, uint64_t bw
, uint32_t segsiz
, int can_use_1mss
,
1606 const struct tcp_hwrate_limit_table
*te
, int *err
, int divisor
)
1609 * We use the google formula to calculate the
1614 * tso = min(bw/(div=1000), 64k)
1616 * Note for these calculations we ignore the
1617 * packet overhead (enet hdr, ip hdr and tcp hdr).
1618 * We only get the google formula when we have
1619 * divisor = 1000, which is the default for now.
1621 uint64_t lentim
, res
, bytes
;
1622 uint32_t new_tso
, min_tso_segs
;
1624 /* It can't be zero */
1625 if ((divisor
== 0) ||
1626 (divisor
< RL_MIN_DIVISOR
)) {
1628 bytes
= bw
/ mss_divisor
;
1632 bytes
= bw
/ divisor
;
1633 /* We can't ever send more than 65k in a TSO */
1634 if (bytes
> 0xffff) {
/* Round the byte budget up to a whole number of segments. */
1638 new_tso
= (bytes
+ segsiz
- 1) / segsiz
;
1639 /* Are we enforcing even boundaries? */
1640 if (even_num_segs
&& (new_tso
& 1) && (new_tso
> even_threshold
))
1646 if (rs_floor_mss
&& (new_tso
< rs_floor_mss
))
1647 new_tso
= rs_floor_mss
;
1648 else if (new_tso
< min_tso_segs
)
1649 new_tso
= min_tso_segs
;
1650 if (new_tso
> MAX_MSS_SENT
)
1651 new_tso
= MAX_MSS_SENT
;
1653 tcp_log_pacing_size(tp
, bw
, segsiz
, new_tso
,
1654 0, 0, 0, 0, 0, 0, 1);
1656 * If we are not doing hardware pacing
1665 * For hardware pacing we look at the
1666 * rate you are sending at and compare
1667 * that to the rate you have in hardware.
1669 * If the hardware rate is slower than your
1670 * software rate then you are in error and
1671 * we will build a queue in our hardware whic
1672 * is probably not desired, in such a case
1673 * just return the non-hardware TSO size.
1675 * If the rate in hardware is faster (which
1676 * it should be) then look at how long it
1677 * takes to send one ethernet segment size at
1678 * your b/w and compare that to the time it
1679 * takes to send at the rate you had selected.
1681 * If your time is greater (which we hope it is)
1682 * we get the delta between the two, and then
1683 * divide that into your pacing time. This tells
1684 * us how many MSS you can send down at once (rounded up).
1686 * Note we also double this value if the b/w is over
1687 * 100Mbps. If its over 500meg we just set you to the
1688 * max (43 segments).
1690 if (te
->rate
> FIVE_HUNDRED_MBPS
)
1692 if (te
->rate
== bw
) {
1693 /* We are pacing at exactly the hdwr rate */
1695 tcp_log_pacing_size(tp
, bw
, segsiz
, new_tso
,
1696 te
->rate
, te
->time_between
, (uint32_t)0,
1697 (segsiz
* MAX_MSS_SENT
), 0, 0, 3);
1698 return (segsiz
* MAX_MSS_SENT
);
1700 lentim
= ETHERNET_SEGMENT_SIZE
* USECS_IN_SECOND
;
1702 if (res
> te
->time_between
) {
1703 uint32_t delta
, segs
, res_div
;
1705 res_div
= ((res
* num_of_waits_allowed
) + wait_time_floor
);
1706 delta
= res
- te
->time_between
;
/* Ceiling division of res_div by delta. */
1707 segs
= (res_div
+ delta
- 1)/delta
;
1708 if (segs
< min_tso_segs
)
1709 segs
= min_tso_segs
;
1710 if (segs
< rs_hw_floor_mss
)
1711 segs
= rs_hw_floor_mss
;
1712 if (segs
> MAX_MSS_SENT
)
1713 segs
= MAX_MSS_SENT
;
1715 tcp_log_pacing_size(tp
, bw
, segsiz
, new_tso
,
1716 te
->rate
, te
->time_between
, (uint32_t)res
,
1717 segs
, res_div
, 1, 3);
1720 if (segs
< new_tso
) {
1728 * Your time is smaller which means
1729 * we will grow a queue on our
1730 * hardware. Send back the non-hardware
1733 tcp_log_pacing_size(tp
, bw
, segsiz
, new_tso
,
1734 te
->rate
, te
->time_between
, (uint32_t)res
,
/*
 * tcp_hw_highest_rate_ifp: report the highest hardware rate configured
 * for `ifp`.  The rate set is looked up with find_rs_for_ifp() inside a
 * network epoch section.  If the set is flagged RS_IS_DEFF, the real
 * interface is obtained from rt_find_real_interface() (using `inp`, with
 * no error out-parameter) and the lookup is retried through the
 * use_next_interface label.  Otherwise the result is taken from
 * rs->rs_rlt[rs->rs_highest_valid].rate.
 *
 * NOTE(review): the value reported when the interface does not do rate
 * limiting is not visible in this view.
 */
1743 tcp_hw_highest_rate_ifp(struct ifnet
*ifp
, struct inpcb
*inp
)
1745 struct epoch_tracker et
;
1746 struct tcp_rate_set
*rs
;
1749 NET_EPOCH_ENTER(et
);
1751 rs
= find_rs_for_ifp(ifp
);
1753 /* This interface does not do ratelimiting */
1755 } else if (rs
->rs_flags
& RS_IS_DEFF
) {
1756 /* We need to find the real interface */
1759 tifp
= rt_find_real_interface(ifp
, inp
, NULL
);
1765 goto use_next_interface
;
1767 /* Lets return the highest rate this guy has */
1768 rate_ret
= rs
->rs_rlt
[rs
->rs_highest_valid
].rate
;
/*
 * Handles returned by the EVENTHANDLER_REGISTER() calls in tcp_rs_init()
 * for the ifnet departure, ifnet link and shutdown events respectively.
 */
1774 static eventhandler_tag rl_ifnet_departs
;
1775 static eventhandler_tag rl_ifnet_arrives
;
1776 static eventhandler_tag rl_shutdown_start
;
/*
 * tcp_rs_init: one-time initialization of the rate-limit module.  It
 * initializes the int_rs list, zeroes rs_number_alive, initializes the
 * rs_mtx mutex, and registers event handlers for ifnet_departure_event
 * (tcp_rl_ifnet_departure), ifnet_link_event and shutdown_pre_sync
 * (tcp_rl_shutdown, at SHUTDOWN_PRI_FIRST).  A message is printed once
 * setup is complete.  The `st` argument is unused.
 */
1779 tcp_rs_init(void *st __unused
)
1781 CK_LIST_INIT(&int_rs
);
1782 rs_number_alive
= 0;
1784 mtx_init(&rs_mtx
, "tcp_rs_mtx", "rsmtx", MTX_DEF
);
1785 rl_ifnet_departs
= EVENTHANDLER_REGISTER(ifnet_departure_event
,
1786 tcp_rl_ifnet_departure
,
1787 NULL
, EVENTHANDLER_PRI_ANY
);
1788 rl_ifnet_arrives
= EVENTHANDLER_REGISTER(ifnet_link_event
,
1790 NULL
, EVENTHANDLER_PRI_ANY
);
1791 rl_shutdown_start
= EVENTHANDLER_REGISTER(shutdown_pre_sync
,
1792 tcp_rl_shutdown
, NULL
,
1793 SHUTDOWN_PRI_FIRST
);
1794 printf("TCP_ratelimit: Is now initialized\n");
/*
 * Hook tcp_rs_init() into SYSINIT under the name tcp_rl_init, at
 * subsystem SI_SUB_SMP + 1 with order SI_ORDER_ANY and a NULL argument.
 */
1797 SYSINIT(tcp_rl_init
, SI_SUB_SMP
+ 1, SI_ORDER_ANY
, tcp_rs_init
, NULL
);