1 /* Copyright (c) 2021, The Tor Project, Inc. */
2 /* See LICENSE for licensing information */
5 * \file congestion_control_common.c
6 * \brief Common code used by all congestion control algorithms.
9 #define TOR_CONGESTION_CONTROL_COMMON_PRIVATE
10 #define TOR_CONGESTION_CONTROL_PRIVATE
12 #include "core/or/or.h"
14 #include "core/crypto/onion_crypto.h"
15 #include "core/or/circuitlist.h"
16 #include "core/or/crypt_path.h"
17 #include "core/or/or_circuit_st.h"
18 #include "core/or/origin_circuit_st.h"
19 #include "core/or/channel.h"
20 #include "core/mainloop/connection.h"
21 #include "core/or/sendme.h"
22 #include "core/or/congestion_control_st.h"
23 #include "core/or/congestion_control_common.h"
24 #include "core/or/congestion_control_vegas.h"
25 #include "core/or/congestion_control_st.h"
26 #include "core/or/conflux.h"
27 #include "core/or/conflux_util.h"
28 #include "core/or/trace_probes_cc.h"
29 #include "lib/time/compat_time.h"
30 #include "feature/nodelist/networkstatus.h"
31 #include "app/config/config.h"
33 #include "trunnel/congestion_control.h"
34 #include "trunnel/extension.h"
36 /* Consensus parameter defaults.
38 * More details for each of the parameters can be found in proposal 324,
39 * section 6.5 including tuning notes. */
40 #define SENDME_INC_DFLT (TLS_RECORD_MAX_CELLS)
41 #define CIRCWINDOW_INIT (4*SENDME_INC_DFLT)
43 #define CC_ALG_DFLT (CC_ALG_VEGAS)
44 #define CC_ALG_DFLT_ALWAYS (CC_ALG_VEGAS)
46 #define CWND_INC_DFLT (1)
47 #define CWND_INC_PCT_SS_DFLT (100)
48 #define CWND_INC_RATE_DFLT (SENDME_INC_DFLT)
50 #define CWND_MIN_DFLT (CIRCWINDOW_INIT)
51 #define CWND_MAX_DFLT (INT32_MAX)
53 #define BWE_SENDME_MIN_DFLT (5)
55 #define N_EWMA_CWND_PCT_DFLT (50)
56 #define N_EWMA_MAX_DFLT (10)
57 #define N_EWMA_SS_DFLT (2)
59 #define RTT_RESET_PCT_DFLT (100)
61 /* BDP algorithms for each congestion control algorithms use the piecewise
62 * estimattor. See section 3.1.4 of proposal 324. */
63 #define WESTWOOD_BDP_ALG BDP_ALG_PIECEWISE
64 #define VEGAS_BDP_MIX_ALG BDP_ALG_PIECEWISE
65 #define NOLA_BDP_ALG BDP_ALG_PIECEWISE
67 /* Indicate OR connection buffer limitations used to stop or start accepting
68 * cells in its outbuf.
70 * These watermarks are historical to tor in a sense that they've been used
71 * almost from the genesis point. And were likely defined to fit the bounds of
72 * TLS records of 16KB which would be around 32 cells.
74 * These are defaults of the consensus parameter "orconn_high" and "orconn_low"
76 #define OR_CONN_HIGHWATER_DFLT (32*1024)
77 #define OR_CONN_LOWWATER_DFLT (16*1024)
79 /* Low and high values of circuit cell queue sizes. They are used to tell when
80 * to start or stop reading on the streams attached on the circuit.
82 * These are defaults of the consensus parameters "cellq_high" and "cellq_low".
84 #define CELL_QUEUE_LOW_DFLT (10)
85 #define CELL_QUEUE_HIGH_DFLT (256)
87 static bool congestion_control_update_circuit_bdp(congestion_control_t
*,
90 /* Number of times the RTT value was reset. For MetricsPort. */
91 static uint64_t num_rtt_reset
;
93 /* Number of times the clock was stalled. For MetricsPort. */
94 static uint64_t num_clock_stalls
;
96 /* Consensus parameters cached. The non static ones are extern. */
97 static uint32_t cwnd_max
= CWND_MAX_DFLT
;
98 int32_t cell_queue_high
= CELL_QUEUE_HIGH_DFLT
;
99 int32_t cell_queue_low
= CELL_QUEUE_LOW_DFLT
;
100 uint32_t or_conn_highwater
= OR_CONN_HIGHWATER_DFLT
;
101 uint32_t or_conn_lowwater
= OR_CONN_LOWWATER_DFLT
;
102 uint8_t cc_sendme_inc
= SENDME_INC_DFLT
;
103 STATIC cc_alg_t cc_alg
= CC_ALG_DFLT
;
106 * Number of cwnd worth of sendme acks to smooth RTT and BDP with,
108 static uint8_t n_ewma_cwnd_pct
= N_EWMA_CWND_PCT_DFLT
;
111 * Maximum number N for the N-count EWMA averaging of RTT and BDP.
113 static uint8_t n_ewma_max
= N_EWMA_MAX_DFLT
;
116 * Maximum number N for the N-count EWMA averaging of RTT in Slow Start.
118 static uint8_t n_ewma_ss
= N_EWMA_SS_DFLT
;
121 * Minimum number of sendmes before we begin BDP estimates
123 static uint8_t bwe_sendme_min
= BWE_SENDME_MIN_DFLT
;
126 * Percentage of the current RTT to use when resetting the minimum RTT
127 * for a circuit. (RTT is reset when the cwnd hits cwnd_min).
129 static uint8_t rtt_reset_pct
= RTT_RESET_PCT_DFLT
;
131 /** Metric to count the number of congestion control circuits **/
132 uint64_t cc_stats_circs_created
= 0;
134 /** Return the number of RTT reset that have been done. */
136 congestion_control_get_num_rtt_reset(void)
138 return num_rtt_reset
;
141 /** Return the number of clock stalls that have been done. */
143 congestion_control_get_num_clock_stalls(void)
145 return num_clock_stalls
;
149 * Update global congestion control related consensus parameter values,
150 * every consensus update.
153 congestion_control_new_consensus_params(const networkstatus_t
*ns
)
155 #define CELL_QUEUE_HIGH_MIN (1)
156 #define CELL_QUEUE_HIGH_MAX (1000)
157 cell_queue_high
= networkstatus_get_param(ns
, "cellq_high",
158 CELL_QUEUE_HIGH_DFLT
,
160 CELL_QUEUE_HIGH_MAX
);
162 #define CELL_QUEUE_LOW_MIN (1)
163 #define CELL_QUEUE_LOW_MAX (1000)
164 cell_queue_low
= networkstatus_get_param(ns
, "cellq_low",
169 #define OR_CONN_HIGHWATER_MIN (CELL_PAYLOAD_SIZE)
170 #define OR_CONN_HIGHWATER_MAX (INT32_MAX)
172 networkstatus_get_param(ns
, "orconn_high",
173 OR_CONN_HIGHWATER_DFLT
,
174 OR_CONN_HIGHWATER_MIN
,
175 OR_CONN_HIGHWATER_MAX
);
177 #define OR_CONN_LOWWATER_MIN (CELL_PAYLOAD_SIZE)
178 #define OR_CONN_LOWWATER_MAX (INT32_MAX)
180 networkstatus_get_param(ns
, "orconn_low",
181 OR_CONN_LOWWATER_DFLT
,
182 OR_CONN_LOWWATER_MIN
,
183 OR_CONN_LOWWATER_MAX
);
185 #define CWND_MAX_MIN 500
186 #define CWND_MAX_MAX (INT32_MAX)
188 networkstatus_get_param(NULL
, "cc_cwnd_max",
193 #define RTT_RESET_PCT_MIN (0)
194 #define RTT_RESET_PCT_MAX (100)
196 networkstatus_get_param(NULL
, "cc_rtt_reset_pct",
201 #define SENDME_INC_MIN 1
202 #define SENDME_INC_MAX (254)
204 networkstatus_get_param(NULL
, "cc_sendme_inc",
210 #define CC_ALG_MAX (NUM_CC_ALGS-1)
212 networkstatus_get_param(NULL
, "cc_alg",
216 if (cc_alg
!= CC_ALG_SENDME
&& cc_alg
!= CC_ALG_VEGAS
) {
217 // Does not need rate limiting because consensus updates
218 // are at most 1x/hour
219 log_warn(LD_BUG
, "Unsupported congestion control algorithm %d",
221 cc_alg
= CC_ALG_DFLT
;
224 #define BWE_SENDME_MIN_MIN 2
225 #define BWE_SENDME_MIN_MAX (20)
227 networkstatus_get_param(NULL
, "cc_bwe_min",
232 #define N_EWMA_CWND_PCT_MIN 1
233 #define N_EWMA_CWND_PCT_MAX (255)
235 networkstatus_get_param(NULL
, "cc_ewma_cwnd_pct",
236 N_EWMA_CWND_PCT_DFLT
,
238 N_EWMA_CWND_PCT_MAX
);
240 #define N_EWMA_MAX_MIN 2
241 #define N_EWMA_MAX_MAX (INT32_MAX)
243 networkstatus_get_param(NULL
, "cc_ewma_max",
248 #define N_EWMA_SS_MIN 2
249 #define N_EWMA_SS_MAX (INT32_MAX)
251 networkstatus_get_param(NULL
, "cc_ewma_ss",
258 * Set congestion control parameters on a circuit's congestion
259 * control object based on values from the consensus.
261 * cc_alg is the negotiated congestion control algorithm.
263 * sendme_inc is the number of packaged cells that a sendme cell
264 * acks. This parameter will come from circuit negotiation.
267 congestion_control_init_params(congestion_control_t
*cc
,
268 const circuit_params_t
*params
,
271 const or_options_t
*opts
= get_options();
272 cc
->sendme_inc
= params
->sendme_inc_cells
;
274 #define CWND_INIT_MIN SENDME_INC_DFLT
275 #define CWND_INIT_MAX (10000)
277 networkstatus_get_param(NULL
, "cc_cwnd_init",
282 #define CWND_INC_PCT_SS_MIN 1
283 #define CWND_INC_PCT_SS_MAX (500)
284 cc
->cwnd_inc_pct_ss
=
285 networkstatus_get_param(NULL
, "cc_cwnd_inc_pct_ss",
286 CWND_INC_PCT_SS_DFLT
,
288 CWND_INC_PCT_SS_MAX
);
290 #define CWND_INC_MIN 1
291 #define CWND_INC_MAX (1000)
293 networkstatus_get_param(NULL
, "cc_cwnd_inc",
298 #define CWND_INC_RATE_MIN 1
299 #define CWND_INC_RATE_MAX (250)
301 networkstatus_get_param(NULL
, "cc_cwnd_inc_rate",
306 #define CWND_MIN_MIN SENDME_INC_DFLT
307 #define CWND_MIN_MAX (1000)
309 networkstatus_get_param(NULL
, "cc_cwnd_min",
314 /* If the consensus says to use OG sendme, but torrc has
315 * always-enabled, use the default "always" alg (vegas),
316 * else use cached conensus alg. */
317 if (cc_alg
== CC_ALG_SENDME
&& opts
->AlwaysCongestionControl
) {
318 cc
->cc_alg
= CC_ALG_DFLT_ALWAYS
;
323 /* Algorithm-specific parameters */
324 if (cc
->cc_alg
== CC_ALG_VEGAS
) {
325 congestion_control_vegas_set_params(cc
, path
);
327 // This should not happen anymore
328 log_warn(LD_BUG
, "Unknown congestion control algorithm %d",
333 /** Returns true if congestion control is enabled in the most recent
334 * consensus, or if __AlwaysCongestionControl is set to true.
336 * Note that this function (and many many other functions) should not
337 * be called from the CPU worker threads when handling congestion
338 * control negotiation. Relevant values are marshaled into the
339 * `circuit_params_t` struct, in order to be used in worker threads
340 * without touching global state. Use those values in CPU worker
341 * threads, instead of calling this function.
343 * The danger is still present, in your time, as it was in ours.
346 congestion_control_enabled(void)
348 const or_options_t
*opts
= NULL
;
350 tor_assert_nonfatal_once(in_main_thread());
352 opts
= get_options();
354 /* If the user has set "__AlwaysCongesttionControl",
355 * then always try to negotiate congestion control, regardless
356 * of consensus param. This is to be used for testing and sbws.
358 * Note that we do *not* allow disabling congestion control
359 * if the consensus says to use it, as this is bad for queueing
361 if (opts
->AlwaysCongestionControl
)
364 return cc_alg
!= CC_ALG_SENDME
;
367 #ifdef TOR_UNIT_TESTS
369 * For unit tests only: set the cached consensus cc alg to
373 congestion_control_set_cc_enabled(void)
375 cc_alg
= CC_ALG_VEGAS
;
379 * For unit tests only: set the cached consensus cc alg to
383 congestion_control_set_cc_disabled(void)
385 cc_alg
= CC_ALG_SENDME
;
390 * Allocate and initialize fields in congestion control object.
392 * cc_alg is the negotiated congestion control algorithm.
394 * sendme_inc is the number of packaged cells that a sendme cell
395 * acks. This parameter will come from circuit negotiation.
398 congestion_control_init(congestion_control_t
*cc
,
399 const circuit_params_t
*params
,
402 cc
->sendme_pending_timestamps
= smartlist_new();
404 cc
->in_slow_start
= 1;
405 congestion_control_init_params(cc
, params
, path
);
407 cc
->next_cc_event
= CWND_UPDATE_RATE(cc
);
410 /** Allocate and initialize a new congestion control object */
411 congestion_control_t
*
412 congestion_control_new(const circuit_params_t
*params
, cc_path_t path
)
414 congestion_control_t
*cc
= tor_malloc_zero(sizeof(congestion_control_t
));
416 congestion_control_init(cc
, params
, path
);
418 cc_stats_circs_created
++;
424 * Free a congestion control object and its associated state.
427 congestion_control_free_(congestion_control_t
*cc
)
432 SMARTLIST_FOREACH(cc
->sendme_pending_timestamps
, uint64_t *, t
, tor_free(t
));
433 smartlist_free(cc
->sendme_pending_timestamps
);
439 * Enqueue a u64 timestamp to the end of a queue of timestamps.
442 enqueue_timestamp(smartlist_t
*timestamps_u64
, uint64_t timestamp_usec
)
444 uint64_t *timestamp_ptr
= tor_malloc(sizeof(uint64_t));
445 *timestamp_ptr
= timestamp_usec
;
447 smartlist_add(timestamps_u64
, timestamp_ptr
);
451 * Dequeue a u64 monotime usec timestamp from the front of a
452 * smartlist of pointers to 64.
454 static inline uint64_t
455 dequeue_timestamp(smartlist_t
*timestamps_u64_usecs
)
457 uint64_t *timestamp_ptr
= smartlist_get(timestamps_u64_usecs
, 0);
458 uint64_t timestamp_u64
;
460 if (BUG(!timestamp_ptr
)) {
461 log_err(LD_CIRC
, "Congestion control timestamp list became empty!");
465 timestamp_u64
= *timestamp_ptr
;
466 smartlist_del_keeporder(timestamps_u64_usecs
, 0);
467 tor_free(timestamp_ptr
);
469 return timestamp_u64
;
473 * Returns the number N of N-count EWMA, for averaging RTT and BDP over
476 * This N is bracketed between a divisor of the number of acks in a CWND
477 * and a max value. It is always at least 2.
479 static inline uint64_t
480 n_ewma_count(const congestion_control_t
*cc
)
482 uint64_t ewma_cnt
= 0;
484 if (cc
->in_slow_start
) {
485 /* In slow-start, we check the Vegas condition every sendme,
486 * so much lower ewma counts are needed. */
487 ewma_cnt
= n_ewma_ss
;
489 /* After slow-start, we check the Vegas condition only once per
490 * CWND, so it is better to average over longer periods. */
491 ewma_cnt
= MIN(CWND_UPDATE_RATE(cc
)*n_ewma_cwnd_pct
/100,
494 ewma_cnt
= MAX(ewma_cnt
, 2);
499 * Get a package window from either old sendme logic, or congestion control.
501 * A package window is how many cells you can still send.
504 congestion_control_get_package_window(const circuit_t
*circ
,
505 const crypt_path_t
*cpath
)
508 congestion_control_t
*cc
;
513 package_window
= cpath
->package_window
;
514 cc
= cpath
->ccontrol
;
516 package_window
= circ
->package_window
;
521 return package_window
;
523 /* Inflight can be above cwnd if cwnd was just reduced */
524 if (cc
->inflight
> cc
->cwnd
)
526 /* In the extremely unlikely event that cwnd-inflight is larger than
527 * INT32_MAX, just return that cap, so old code doesn't explode. */
528 else if (cc
->cwnd
- cc
->inflight
> INT32_MAX
)
531 return (int)(cc
->cwnd
- cc
->inflight
);
536 * Returns the number of cells that are acked by every sendme.
539 sendme_get_inc_count(const circuit_t
*circ
, const crypt_path_t
*layer_hint
)
541 int sendme_inc
= CIRCWINDOW_INCREMENT
;
542 congestion_control_t
*cc
= NULL
;
545 cc
= layer_hint
->ccontrol
;
551 sendme_inc
= cc
->sendme_inc
;
557 /** Return true iff the next cell we send will result in the other endpoint
560 * We are able to know that because the package or inflight window value minus
561 * one cell (the possible SENDME cell) should be a multiple of the
562 * cells-per-sendme increment value (set via consensus parameter, negotiated
563 * for the circuit, and passed in as sendme_inc).
565 * This function is used when recording a cell digest and this is done quite
566 * low in the stack when decrypting or encrypting a cell. The window is only
567 * updated once the cell is actually put in the outbuf.
570 circuit_sent_cell_for_sendme(const circuit_t
*circ
,
571 const crypt_path_t
*layer_hint
)
573 congestion_control_t
*cc
;
579 window
= layer_hint
->package_window
;
580 cc
= layer_hint
->ccontrol
;
582 window
= circ
->package_window
;
586 /* If we are using congestion control and the alg is not
587 * old-school 'fixed', then use cc->inflight to determine
588 * when sendmes will be sent */
593 /* This check must be +1 because this function is called *before*
594 * inflight is incremented for the sent cell */
595 if ((cc
->inflight
+1) % cc
->sendme_inc
!= 0)
601 /* At the start of the window, no SENDME will be expected. */
602 if (window
== CIRCWINDOW_START
) {
606 /* Are we at the limit of the increment and if not, we don't expect next
609 * We test against the window minus 1 because when we are looking if the
610 * next cell is a SENDME, the window (either package or deliver) hasn't been
611 * decremented just yet so when this is called, we are currently processing
612 * the "window - 1" cell.
614 if (((window
- 1) % CIRCWINDOW_INCREMENT
) != 0) {
618 /* Next cell is expected to be a SENDME. */
623 * Call-in to tell congestion control code that this circuit sent a cell.
625 * This updates the 'inflight' counter, and if this is a cell that will
626 * cause the other end to send a SENDME, record the current time in a list
627 * of pending timestamps, so that we can later compute the circuit RTT when
628 * the SENDME comes back. */
630 congestion_control_note_cell_sent(congestion_control_t
*cc
,
631 const circuit_t
*circ
,
632 const crypt_path_t
*cpath
)
637 /* Is this the last cell before a SENDME? The idea is that if the
638 * package_window reaches a multiple of the increment, after this cell, we
639 * should expect a SENDME. Note that this function must be called *before*
640 * we account for the sent cell. */
641 if (!circuit_sent_cell_for_sendme(circ
, cpath
)) {
648 /* Record this cell time for RTT computation when SENDME arrives */
649 enqueue_timestamp(cc
->sendme_pending_timestamps
,
650 monotime_absolute_usec());
654 * Upon receipt of a SENDME, pop the oldest timestamp off the timestamp
655 * list, and use this to update RTT.
657 * Returns true if circuit estimates were successfully updated, false
661 congestion_control_update_circuit_estimates(congestion_control_t
*cc
,
662 const circuit_t
*circ
)
664 uint64_t now_usec
= monotime_absolute_usec();
666 /* Update RTT first, then BDP. BDP needs fresh RTT */
667 uint64_t curr_rtt_usec
= congestion_control_update_circuit_rtt(cc
, now_usec
);
668 return congestion_control_update_circuit_bdp(cc
, circ
, curr_rtt_usec
);
672 * Returns true if we have enough time data to use heuristics
673 * to compare RTT to a baseline.
676 time_delta_should_use_heuristics(const congestion_control_t
*cc
)
678 /* If we have exited slow start and also have an EWMA RTT, we
679 * should have processed at least a cwnd worth of RTTs */
680 if (!cc
->in_slow_start
&& cc
->ewma_rtt_usec
) {
684 /* Not enough data to estimate clock jumps */
688 STATIC
bool is_monotime_clock_broken
= false;
691 * Returns true if the monotime delta is 0, or is significantly
692 * different than the previous delta. Either case indicates
693 * that the monotime time source stalled or jumped.
695 * Also caches the clock state in the is_monotime_clock_broken flag,
696 * so we can also provide a is_monotime_clock_reliable() function,
697 * used by flow control rate timing.
700 time_delta_stalled_or_jumped(const congestion_control_t
*cc
,
701 uint64_t old_delta
, uint64_t new_delta
)
703 #define DELTA_DISCREPENCY_RATIO_MAX 5000
704 /* If we have a 0 new_delta, that is definitely a monotime stall */
705 if (new_delta
== 0) {
706 static ratelim_t stall_info_limit
= RATELIM_INIT(60);
707 log_fn_ratelim(&stall_info_limit
, LOG_INFO
, LD_CIRC
,
708 "Congestion control cannot measure RTT due to monotime stall.");
710 is_monotime_clock_broken
= true;
715 * For the heuristic cases, we need at least a few timestamps,
716 * to average out any previous partial stalls or jumps. So until
717 * that point, let's just assume its OK.
719 if (!time_delta_should_use_heuristics(cc
)) {
723 /* If old_delta is significantly larger than new_delta, then
724 * this means that the monotime clock could have recently
725 * stopped moving forward. However, use the cache for this
726 * value, because it may also be caused by network activity,
727 * or by a previous clock jump that was not detected.
729 * So if we have not gotten a 0-delta recently, we will
730 * still allow this new low RTT, but just yell about it. */
731 if (old_delta
> new_delta
* DELTA_DISCREPENCY_RATIO_MAX
) {
732 static ratelim_t dec_notice_limit
= RATELIM_INIT(300);
733 log_fn_ratelim(&dec_notice_limit
, LOG_NOTICE
, LD_CIRC
,
734 "Sudden decrease in circuit RTT (%"PRIu64
" vs %"PRIu64
735 "), likely due to clock jump.",
736 new_delta
/1000, old_delta
/1000);
738 return is_monotime_clock_broken
;
741 /* If new_delta is significantly larger than old_delta, then
742 * this means that the monotime clock suddenly jumped forward.
743 * However, do not cache this value, because it may also be caused
744 * by network activity.
746 if (new_delta
> old_delta
* DELTA_DISCREPENCY_RATIO_MAX
) {
747 static ratelim_t dec_notice_limit
= RATELIM_INIT(300);
748 log_fn_ratelim(&dec_notice_limit
, LOG_PROTOCOL_WARN
, LD_CIRC
,
749 "Sudden increase in circuit RTT (%"PRIu64
" vs %"PRIu64
750 "), likely due to clock jump or suspended remote endpoint.",
751 new_delta
/1000, old_delta
/1000);
756 /* All good! Update cached status, too */
757 is_monotime_clock_broken
= false;
763 * Is the monotime clock stalled according to any circuits?
766 is_monotime_clock_reliable(void)
768 return !is_monotime_clock_broken
;
772 * Called when we get a SENDME. Updates circuit RTT by pulling off a
773 * timestamp of when we sent the CIRCWINDOW_INCREMENT-th cell from
774 * the queue of such timestamps, and comparing that to current time.
776 * Also updates min, max, and EWMA of RTT.
778 * Returns the current circuit RTT in usecs, or 0 if it could not be
779 * measured (due to clock jump, stall, etc).
782 congestion_control_update_circuit_rtt(congestion_control_t
*cc
,
785 uint64_t rtt
, ewma_cnt
;
786 uint64_t sent_at_timestamp
;
790 /* Get the time that we sent the cell that resulted in the other
791 * end sending this sendme. Use this to calculate RTT */
792 sent_at_timestamp
= dequeue_timestamp(cc
->sendme_pending_timestamps
);
794 rtt
= now_usec
- sent_at_timestamp
;
796 /* Do not update RTT at all if it looks fishy */
797 if (time_delta_stalled_or_jumped(cc
, cc
->ewma_rtt_usec
, rtt
)) {
798 num_clock_stalls
++; /* Accounting */
802 ewma_cnt
= n_ewma_count(cc
);
804 cc
->ewma_rtt_usec
= n_count_ewma(rtt
, cc
->ewma_rtt_usec
, ewma_cnt
);
806 if (rtt
> cc
->max_rtt_usec
) {
807 cc
->max_rtt_usec
= rtt
;
810 if (cc
->min_rtt_usec
== 0) {
811 // If we do not have a min_rtt yet, use current ewma
812 cc
->min_rtt_usec
= cc
->ewma_rtt_usec
;
813 } else if (cc
->cwnd
== cc
->cwnd_min
&& !cc
->in_slow_start
) {
814 // Raise min rtt if cwnd hit cwnd_min. This gets us out of a wedge state
815 // if we hit cwnd_min due to an abnormally low rtt.
816 uint64_t new_rtt
= percent_max_mix(cc
->ewma_rtt_usec
, cc
->min_rtt_usec
,
819 static ratelim_t rtt_notice_limit
= RATELIM_INIT(300);
820 log_fn_ratelim(&rtt_notice_limit
, LOG_NOTICE
, LD_CIRC
,
821 "Resetting circ RTT from %"PRIu64
" to %"PRIu64
" due to low cwnd",
822 cc
->min_rtt_usec
/1000, new_rtt
/1000);
824 cc
->min_rtt_usec
= new_rtt
;
825 num_rtt_reset
++; /* Accounting */
826 } else if (cc
->ewma_rtt_usec
< cc
->min_rtt_usec
) {
827 // Using the EWMA for min instead of current RTT helps average out
828 // effects from other conns
829 cc
->min_rtt_usec
= cc
->ewma_rtt_usec
;
836 * Called when we get a SENDME. Updates the bandwidth-delay-product (BDP)
837 * estimates of a circuit. Several methods of computing BDP are used,
838 * depending on scenario. While some congestion control algorithms only
839 * use one of these methods, we update them all because it's quick and easy.
841 * - now_usec is the current monotime in usecs.
842 * - curr_rtt_usec is the current circuit RTT in usecs. It may be 0 if no
843 * RTT could bemeasured.
845 * Returns true if we were able to update BDP, false otherwise.
848 congestion_control_update_circuit_bdp(congestion_control_t
*cc
,
849 const circuit_t
*circ
,
850 uint64_t curr_rtt_usec
)
853 unsigned int blocked_on_chan
= 0;
857 if (CIRCUIT_IS_ORIGIN(circ
)) {
858 /* origin circs use n_chan */
859 chan_q
= circ
->n_chan_cells
.n
;
860 blocked_on_chan
= circ
->circuit_blocked_on_n_chan
;
862 /* Both onion services and exits use or_circuit and p_chan */
863 chan_q
= CONST_TO_OR_CIRCUIT(circ
)->p_chan_cells
.n
;
864 blocked_on_chan
= circ
->circuit_blocked_on_p_chan
;
867 /* If we have no EWMA RTT, it is because monotime has been stalled
868 * or messed up the entire time so far. Set our BDP estimates directly
870 if (!cc
->ewma_rtt_usec
) {
871 uint64_t cwnd
= cc
->cwnd
;
873 tor_assert_nonfatal(cc
->cwnd
<= cwnd_max
);
875 /* If the channel is blocked, keep subtracting off the chan_q
876 * until we hit the min cwnd. */
877 if (blocked_on_chan
) {
878 /* Cast is fine because we're less than int32 */
879 if (chan_q
>= (int64_t)cwnd
) {
881 "Clock stall with large chanq: %d %"PRIu64
, chan_q
, cwnd
);
884 cwnd
= MAX(cwnd
- chan_q
, cc
->cwnd_min
);
886 cc
->blocked_chan
= 1;
888 cc
->blocked_chan
= 0;
893 static ratelim_t dec_notice_limit
= RATELIM_INIT(300);
894 log_fn_ratelim(&dec_notice_limit
, LOG_NOTICE
, LD_CIRC
,
895 "Our clock has been stalled for the entire lifetime of a circuit. "
896 "Performance may be sub-optimal.");
898 return blocked_on_chan
;
901 /* Congestion window based BDP will respond to changes in RTT only, and is
902 * relative to cwnd growth. It is useful for correcting for BDP
903 * overestimation, but if BDP is higher than the current cwnd, it will
906 * We multiply here first to avoid precision issues from min_RTT being
907 * close to ewma RTT. Since all fields are u64, there is plenty of
908 * room here to multiply first.
910 cc
->bdp
= cc
->cwnd
*cc
->min_rtt_usec
/cc
->ewma_rtt_usec
;
912 /* The orconn is blocked; use smaller of inflight vs SENDME */
913 if (blocked_on_chan
) {
914 log_info(LD_CIRC
, "CC: Streams blocked on circ channel. Chanq: %d",
917 /* A blocked channel is an immediate congestion signal, but it still
918 * happens only once per cwnd */
919 if (!cc
->blocked_chan
) {
920 cc
->next_cc_event
= 0;
921 cc
->blocked_chan
= 1;
924 /* If we were previously blocked, emit a new congestion event
925 * now that we are unblocked, to re-evaluate cwnd */
926 if (cc
->blocked_chan
) {
927 cc
->blocked_chan
= 0;
928 cc
->next_cc_event
= 0;
929 log_info(LD_CIRC
, "CC: Streams un-blocked on circ channel. Chanq: %d",
934 if (cc
->next_cc_event
== 0) {
935 if (CIRCUIT_IS_ORIGIN(circ
)) {
938 "SENDME RTT: %"PRIu64
", %"PRIu64
", %"PRIu64
", %"PRIu64
", "
939 "BDP estimate: %"PRIu64
,
940 CONST_TO_ORIGIN_CIRCUIT(circ
)->global_identifier
,
941 cc
->min_rtt_usec
/1000,
943 cc
->ewma_rtt_usec
/1000,
944 cc
->max_rtt_usec
/1000,
948 "CC: Circuit %"PRIu64
":%d "
949 "SENDME RTT: %"PRIu64
", %"PRIu64
", %"PRIu64
", %"PRIu64
", "
951 CONST_TO_OR_CIRCUIT(circ
)->p_chan
->global_identifier
,
952 CONST_TO_OR_CIRCUIT(circ
)->p_circ_id
,
953 cc
->min_rtt_usec
/1000,
955 cc
->ewma_rtt_usec
/1000,
956 cc
->max_rtt_usec
/1000,
961 /* We updated BDP this round if either we had a blocked channel, or
962 * the curr_rtt_usec was not 0. */
963 bool ret
= (blocked_on_chan
|| curr_rtt_usec
!= 0);
965 tor_trace(TR_SUBSYS(cc
), TR_EV(bdp_update
), circ
, cc
, curr_rtt_usec
);
971 * Dispatch the sendme to the appropriate congestion control algorithm.
974 congestion_control_dispatch_cc_alg(congestion_control_t
*cc
,
977 int ret
= -END_CIRC_REASON_INTERNAL
;
979 tor_assert_nonfatal_once(cc
->cc_alg
== CC_ALG_VEGAS
);
980 ret
= congestion_control_vegas_process_sendme(cc
, circ
);
982 if (cc
->cwnd
> cwnd_max
) {
983 static ratelim_t cwnd_limit
= RATELIM_INIT(60);
984 log_fn_ratelim(&cwnd_limit
, LOG_NOTICE
, LD_CIRC
,
985 "Congestion control cwnd %"PRIu64
" exceeds max %d, clamping.",
990 /* If we have a non-zero RTT measurement, update conflux. */
991 if (circ
->conflux
&& cc
->ewma_rtt_usec
)
992 conflux_update_rtt(circ
->conflux
, circ
, cc
->ewma_rtt_usec
);
998 * Build an extension field request to negotiate congestion control.
1000 * If congestion control is enabled, field TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST
1001 * is created in msg_out. It is a single 0-length field that signifies that we
1002 * want to use congestion control. The length of msg_out is provided via
1005 * If congestion control is not enabled, a payload with 0 extensions is created
1008 * If there is a failure building the request, -1 is returned, else 0.
1010 * *msg_out must be freed if the return value is 0.
1013 congestion_control_build_ext_request(uint8_t **msg_out
, size_t *msg_len_out
)
1015 uint8_t *request
= NULL
;
1016 trn_extension_t
*ext
= NULL
;
1017 trn_extension_field_t
*field
= NULL
;
1019 ext
= trn_extension_new();
1021 /* With congestion control enabled, add the request, else it is an empty
1022 * request in the payload. */
1024 if (congestion_control_enabled()) {
1025 /* Build the extension field that will hold the CC field. */
1026 field
= trn_extension_field_new();
1027 trn_extension_field_set_field_type(field
,
1028 TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST
);
1030 /* No payload indicating a request to use congestion control. */
1031 trn_extension_field_set_field_len(field
, 0);
1033 /* Build final extension. */
1034 trn_extension_add_fields(ext
, field
);
1035 trn_extension_set_num(ext
, 1);
1038 /* Encode extension. */
1039 ssize_t ret
= trn_extension_encoded_len(ext
);
1043 size_t request_len
= ret
;
1044 request
= tor_malloc_zero(request_len
);
1045 ret
= trn_extension_encode(request
, request_len
, ext
);
1051 *msg_len_out
= request_len
;
1053 /* Free everything, we've encoded the request now. */
1057 trn_extension_free(ext
);
1062 * Parse a congestion control ntorv3 request payload for extensions.
1064 * On parsing failure, -1 is returned.
1066 * If congestion control request is present, return 1. If it is not present,
1069 * WARNING: Called from CPU worker! Must not access any global state.
1072 congestion_control_parse_ext_request(const uint8_t *msg
, const size_t msg_len
)
1075 trn_extension_t
*ext
= NULL
;
1076 size_t num_fields
= 0;
1078 /* Parse extension from payload. */
1079 ret
= trn_extension_parse(&ext
, msg
, msg_len
);
1084 /* No extension implies no support for congestion control. In this case, we
1085 * simply return 0 to indicate CC is disabled. */
1086 if ((num_fields
= trn_extension_get_num(ext
)) == 0) {
1091 /* Go over all fields. If any field is TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST,
1092 * then congestion control is enabled. Ignore unknown fields. */
1093 for (size_t f
= 0; f
< num_fields
; f
++) {
1094 const trn_extension_field_t
*field
= trn_extension_get_fields(ext
, f
);
1095 if (field
== NULL
) {
1100 /* For congestion control to be enabled, we only need the field type. */
1101 if (trn_extension_field_get_field_type(field
) ==
1102 TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST
) {
1109 trn_extension_free(ext
);
1114 * Given our observed parameters for circuits and congestion control,
1115 * as well as the parameters for the resulting circuit, build a response
1116 * payload using extension fields into *msg_out, with length specified in
1119 * If congestion control will be enabled, the extension field for
1120 * TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE will contain the sendme_inc value.
1122 * If congestion control won't be enabled, an extension payload with 0
1123 * fields will be created.
1125 * Return 0 if an extension payload was created in *msg_out, and -1 on
1128 * *msg_out must be freed if the return value is 0.
1130 * WARNING: Called from CPU worker! Must not access any global state.
1133 congestion_control_build_ext_response(const circuit_params_t
*our_params
,
1134 const circuit_params_t
*circ_params
,
1135 uint8_t **msg_out
, size_t *msg_len_out
)
1138 uint8_t *request
= NULL
;
1139 trn_extension_t
*ext
= NULL
;
1140 trn_extension_field_t
*field
= NULL
;
1141 trn_extension_field_cc_t
*cc_field
= NULL
;
1143 tor_assert(our_params
);
1144 tor_assert(circ_params
);
1145 tor_assert(msg_out
);
1146 tor_assert(msg_len_out
);
1148 ext
= trn_extension_new();
1150 if (circ_params
->cc_enabled
) {
1151 /* Build the extension field that will hold the CC field. */
1152 field
= trn_extension_field_new();
1153 trn_extension_field_set_field_type(field
,
1154 TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE
);
1156 /* Build the congestion control field response. */
1157 cc_field
= trn_extension_field_cc_new();
1158 trn_extension_field_cc_set_sendme_inc(cc_field
,
1159 our_params
->sendme_inc_cells
);
1161 ret
= trn_extension_field_cc_encoded_len(cc_field
);
1162 if (BUG(ret
<= 0)) {
1163 trn_extension_field_free(field
);
1166 size_t field_len
= ret
;
1167 trn_extension_field_set_field_len(field
, field_len
);
1168 trn_extension_field_setlen_field(field
, field_len
);
1170 uint8_t *field_array
= trn_extension_field_getarray_field(field
);
1171 ret
= trn_extension_field_cc_encode(field_array
,
1172 trn_extension_field_getlen_field(field
), cc_field
);
1173 if (BUG(ret
<= 0)) {
1174 trn_extension_field_free(field
);
1178 /* Build final extension. */
1179 trn_extension_add_fields(ext
, field
);
1180 trn_extension_set_num(ext
, 1);
1183 /* Encode extension. */
1184 ret
= trn_extension_encoded_len(ext
);
1188 size_t request_len
= ret
;
1189 request
= tor_malloc_zero(request_len
);
1190 ret
= trn_extension_encode(request
, request_len
, ext
);
1196 *msg_len_out
= request_len
;
1198 /* We've just encoded the extension, clean everything. */
1202 trn_extension_free(ext
);
1203 trn_extension_field_cc_free(cc_field
);
1207 /** Return true iff the given sendme increment is within the acceptable
1210 congestion_control_validate_sendme_increment(uint8_t sendme_inc
)
1212 /* We will only accept this response (and this circuit) if sendme_inc
1213 * is within +/- 1 of the current consensus value. We should not need
1214 * to change cc_sendme_inc much, and if we do, we can spread out those
1215 * changes over smaller increments once every 4 hours. Exits that
1216 * violate this range should just not be used. */
1218 if (sendme_inc
== 0)
1221 if (sendme_inc
> (congestion_control_sendme_inc() + 1) ||
1222 sendme_inc
< (congestion_control_sendme_inc() - 1)) {
1228 /** Return 1 if CC is enabled which also will set the SENDME increment into our
1229 * params_out. Return 0 if CC is disabled. Else, return -1 on error. */
1231 congestion_control_parse_ext_response(const uint8_t *msg
,
1232 const size_t msg_len
,
1233 circuit_params_t
*params_out
)
1236 size_t num_fields
= 0;
1237 trn_extension_t
*ext
= NULL
;
1238 trn_extension_field_cc_t
*cc_field
= NULL
;
1240 /* We will only accept this response (and this circuit) if sendme_inc
1241 * is within a factor of 2 of our consensus value. We should not need
1242 * to change cc_sendme_inc much, and if we do, we can spread out those
1243 * changes over smaller increments once every 4 hours. Exits that
1244 * violate this range should just not be used. */
1245 #define MAX_SENDME_INC_NEGOTIATE_FACTOR 2
1247 /* Parse extension from payload. */
1248 ret
= trn_extension_parse(&ext
, msg
, msg_len
);
1253 if ((num_fields
= trn_extension_get_num(ext
)) == 0) {
1258 /* Go over all fields. If any field is TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE,
1259 * then congestion control is enabled. Ignore unknown fields. */
1260 for (size_t f
= 0; f
< num_fields
; f
++) {
1261 const trn_extension_field_t
*field
= trn_extension_get_fields(ext
, f
);
1262 if (field
== NULL
) {
1267 /* Only examine TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE; ignore other fields */
1268 if (trn_extension_field_get_field_type(field
) ==
1269 TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE
) {
1271 /* Parse the field into the congestion control field. */
1272 ret
= trn_extension_field_cc_parse(&cc_field
,
1273 trn_extension_field_getconstarray_field(field
),
1274 trn_extension_field_getlen_field(field
));
1279 uint8_t sendme_inc_cells
=
1280 trn_extension_field_cc_get_sendme_inc(cc_field
);
1281 if (!congestion_control_validate_sendme_increment(sendme_inc_cells
)) {
1286 /* All good. Get value and break */
1287 params_out
->sendme_inc_cells
= sendme_inc_cells
;
1294 trn_extension_free(ext
);
1295 trn_extension_field_cc_free(cc_field
);
1301 * Returns a formatted string of fields containing congestion
1302 * control information, for the CIRC_BW control port event.
1304 * An origin circuit can have a ccontrol object directly on it,
1305 * if it is an onion service, or onion client. Exit-bound clients
1306 * will have the ccontrol on the cpath associated with their exit
1307 * (the last one in the cpath list).
1309 * WARNING: This function does not support leaky-pipe topology. It
1310 * is to be used for control port information only.
1313 congestion_control_get_control_port_fields(const origin_circuit_t
*circ
)
1315 const congestion_control_t
*ccontrol
= NULL
;
1319 if (TO_CIRCUIT(circ
)->ccontrol
) {
1320 ccontrol
= TO_CIRCUIT(circ
)->ccontrol
;
1321 } else if (circ
->cpath
&& circ
->cpath
->prev
->ccontrol
) {
1322 /* Get ccontrol for last hop (exit) if it exists */
1323 ccontrol
= circ
->cpath
->prev
->ccontrol
;
1329 len
= tor_asprintf(&ret
,
1330 " SS=%d CWND=%"PRIu64
" RTT=%"PRIu64
" MIN_RTT=%"PRIu64
,
1331 ccontrol
->in_slow_start
, ccontrol
->cwnd
,
1332 ccontrol
->ewma_rtt_usec
/1000,
1333 ccontrol
->min_rtt_usec
/1000);
1335 log_warn(LD_BUG
, "Unable to format event for controller.");