Merge branch 'maint-0.4.8'
[tor.git] / src / core / or / congestion_control_common.c
blob7dc9dbafddf15ef5f55ec688a9b47d6bba3e819f
1 /* Copyright (c) 2021, The Tor Project, Inc. */
2 /* See LICENSE for licensing information */
4 /**
5 * \file congestion_control_common.c
6 * \brief Common code used by all congestion control algorithms.
7 */
9 #define TOR_CONGESTION_CONTROL_COMMON_PRIVATE
10 #define TOR_CONGESTION_CONTROL_PRIVATE
12 #include "core/or/or.h"
14 #include "core/crypto/onion_crypto.h"
15 #include "core/or/circuitlist.h"
16 #include "core/or/crypt_path.h"
17 #include "core/or/or_circuit_st.h"
18 #include "core/or/origin_circuit_st.h"
19 #include "core/or/channel.h"
20 #include "core/mainloop/connection.h"
21 #include "core/or/sendme.h"
22 #include "core/or/congestion_control_st.h"
23 #include "core/or/congestion_control_common.h"
24 #include "core/or/congestion_control_vegas.h"
25 #include "core/or/congestion_control_st.h"
26 #include "core/or/conflux.h"
27 #include "core/or/conflux_util.h"
28 #include "core/or/trace_probes_cc.h"
29 #include "lib/time/compat_time.h"
30 #include "feature/nodelist/networkstatus.h"
31 #include "app/config/config.h"
33 #include "trunnel/congestion_control.h"
34 #include "trunnel/extension.h"
36 /* Consensus parameter defaults.
38 * More details for each of the parameters can be found in proposal 324,
39 * section 6.5 including tuning notes. */
40 #define SENDME_INC_DFLT (TLS_RECORD_MAX_CELLS)
41 #define CIRCWINDOW_INIT (4*SENDME_INC_DFLT)
43 #define CC_ALG_DFLT (CC_ALG_VEGAS)
44 #define CC_ALG_DFLT_ALWAYS (CC_ALG_VEGAS)
46 #define CWND_INC_DFLT (1)
47 #define CWND_INC_PCT_SS_DFLT (100)
48 #define CWND_INC_RATE_DFLT (SENDME_INC_DFLT)
50 #define CWND_MIN_DFLT (CIRCWINDOW_INIT)
51 #define CWND_MAX_DFLT (INT32_MAX)
53 #define BWE_SENDME_MIN_DFLT (5)
55 #define N_EWMA_CWND_PCT_DFLT (50)
56 #define N_EWMA_MAX_DFLT (10)
57 #define N_EWMA_SS_DFLT (2)
59 #define RTT_RESET_PCT_DFLT (100)
61 /* BDP algorithms for each congestion control algorithms use the piecewise
62 * estimattor. See section 3.1.4 of proposal 324. */
63 #define WESTWOOD_BDP_ALG BDP_ALG_PIECEWISE
64 #define VEGAS_BDP_MIX_ALG BDP_ALG_PIECEWISE
65 #define NOLA_BDP_ALG BDP_ALG_PIECEWISE
67 /* Indicate OR connection buffer limitations used to stop or start accepting
68 * cells in its outbuf.
70 * These watermarks are historical to tor in a sense that they've been used
71 * almost from the genesis point. And were likely defined to fit the bounds of
72 * TLS records of 16KB which would be around 32 cells.
74 * These are defaults of the consensus parameter "orconn_high" and "orconn_low"
75 * values. */
76 #define OR_CONN_HIGHWATER_DFLT (32*1024)
77 #define OR_CONN_LOWWATER_DFLT (16*1024)
79 /* Low and high values of circuit cell queue sizes. They are used to tell when
80 * to start or stop reading on the streams attached on the circuit.
82 * These are defaults of the consensus parameters "cellq_high" and "cellq_low".
84 #define CELL_QUEUE_LOW_DFLT (10)
85 #define CELL_QUEUE_HIGH_DFLT (256)
87 static bool congestion_control_update_circuit_bdp(congestion_control_t *,
88 const circuit_t *,
89 uint64_t);
90 /* Number of times the RTT value was reset. For MetricsPort. */
91 static uint64_t num_rtt_reset;
93 /* Number of times the clock was stalled. For MetricsPort. */
94 static uint64_t num_clock_stalls;
96 /* Consensus parameters cached. The non static ones are extern. */
97 static uint32_t cwnd_max = CWND_MAX_DFLT;
98 int32_t cell_queue_high = CELL_QUEUE_HIGH_DFLT;
99 int32_t cell_queue_low = CELL_QUEUE_LOW_DFLT;
100 uint32_t or_conn_highwater = OR_CONN_HIGHWATER_DFLT;
101 uint32_t or_conn_lowwater = OR_CONN_LOWWATER_DFLT;
102 uint8_t cc_sendme_inc = SENDME_INC_DFLT;
103 STATIC cc_alg_t cc_alg = CC_ALG_DFLT;
106 * Number of cwnd worth of sendme acks to smooth RTT and BDP with,
107 * using N_EWMA */
108 static uint8_t n_ewma_cwnd_pct = N_EWMA_CWND_PCT_DFLT;
111 * Maximum number N for the N-count EWMA averaging of RTT and BDP.
113 static uint8_t n_ewma_max = N_EWMA_MAX_DFLT;
116 * Maximum number N for the N-count EWMA averaging of RTT in Slow Start.
118 static uint8_t n_ewma_ss = N_EWMA_SS_DFLT;
121 * Minimum number of sendmes before we begin BDP estimates
123 static uint8_t bwe_sendme_min = BWE_SENDME_MIN_DFLT;
126 * Percentage of the current RTT to use when resetting the minimum RTT
127 * for a circuit. (RTT is reset when the cwnd hits cwnd_min).
129 static uint8_t rtt_reset_pct = RTT_RESET_PCT_DFLT;
131 /** Metric to count the number of congestion control circuits **/
132 uint64_t cc_stats_circs_created = 0;
134 /** Return the number of RTT reset that have been done. */
135 uint64_t
136 congestion_control_get_num_rtt_reset(void)
138 return num_rtt_reset;
141 /** Return the number of clock stalls that have been done. */
142 uint64_t
143 congestion_control_get_num_clock_stalls(void)
145 return num_clock_stalls;
149 * Update global congestion control related consensus parameter values,
150 * every consensus update.
152 void
153 congestion_control_new_consensus_params(const networkstatus_t *ns)
155 #define CELL_QUEUE_HIGH_MIN (1)
156 #define CELL_QUEUE_HIGH_MAX (1000)
157 cell_queue_high = networkstatus_get_param(ns, "cellq_high",
158 CELL_QUEUE_HIGH_DFLT,
159 CELL_QUEUE_HIGH_MIN,
160 CELL_QUEUE_HIGH_MAX);
162 #define CELL_QUEUE_LOW_MIN (1)
163 #define CELL_QUEUE_LOW_MAX (1000)
164 cell_queue_low = networkstatus_get_param(ns, "cellq_low",
165 CELL_QUEUE_LOW_DFLT,
166 CELL_QUEUE_LOW_MIN,
167 CELL_QUEUE_LOW_MAX);
169 #define OR_CONN_HIGHWATER_MIN (CELL_PAYLOAD_SIZE)
170 #define OR_CONN_HIGHWATER_MAX (INT32_MAX)
171 or_conn_highwater =
172 networkstatus_get_param(ns, "orconn_high",
173 OR_CONN_HIGHWATER_DFLT,
174 OR_CONN_HIGHWATER_MIN,
175 OR_CONN_HIGHWATER_MAX);
177 #define OR_CONN_LOWWATER_MIN (CELL_PAYLOAD_SIZE)
178 #define OR_CONN_LOWWATER_MAX (INT32_MAX)
179 or_conn_lowwater =
180 networkstatus_get_param(ns, "orconn_low",
181 OR_CONN_LOWWATER_DFLT,
182 OR_CONN_LOWWATER_MIN,
183 OR_CONN_LOWWATER_MAX);
185 #define CWND_MAX_MIN 500
186 #define CWND_MAX_MAX (INT32_MAX)
187 cwnd_max =
188 networkstatus_get_param(NULL, "cc_cwnd_max",
189 CWND_MAX_DFLT,
190 CWND_MAX_MIN,
191 CWND_MAX_MAX);
193 #define RTT_RESET_PCT_MIN (0)
194 #define RTT_RESET_PCT_MAX (100)
195 rtt_reset_pct =
196 networkstatus_get_param(NULL, "cc_rtt_reset_pct",
197 RTT_RESET_PCT_DFLT,
198 RTT_RESET_PCT_MIN,
199 RTT_RESET_PCT_MAX);
201 #define SENDME_INC_MIN 1
202 #define SENDME_INC_MAX (254)
203 cc_sendme_inc =
204 networkstatus_get_param(NULL, "cc_sendme_inc",
205 SENDME_INC_DFLT,
206 SENDME_INC_MIN,
207 SENDME_INC_MAX);
209 #define CC_ALG_MIN 0
210 #define CC_ALG_MAX (NUM_CC_ALGS-1)
211 cc_alg =
212 networkstatus_get_param(NULL, "cc_alg",
213 CC_ALG_DFLT,
214 CC_ALG_MIN,
215 CC_ALG_MAX);
216 if (cc_alg != CC_ALG_SENDME && cc_alg != CC_ALG_VEGAS) {
217 // Does not need rate limiting because consensus updates
218 // are at most 1x/hour
219 log_warn(LD_BUG, "Unsupported congestion control algorithm %d",
220 cc_alg);
221 cc_alg = CC_ALG_DFLT;
224 #define BWE_SENDME_MIN_MIN 2
225 #define BWE_SENDME_MIN_MAX (20)
226 bwe_sendme_min =
227 networkstatus_get_param(NULL, "cc_bwe_min",
228 BWE_SENDME_MIN_DFLT,
229 BWE_SENDME_MIN_MIN,
230 BWE_SENDME_MIN_MAX);
232 #define N_EWMA_CWND_PCT_MIN 1
233 #define N_EWMA_CWND_PCT_MAX (255)
234 n_ewma_cwnd_pct =
235 networkstatus_get_param(NULL, "cc_ewma_cwnd_pct",
236 N_EWMA_CWND_PCT_DFLT,
237 N_EWMA_CWND_PCT_MIN,
238 N_EWMA_CWND_PCT_MAX);
240 #define N_EWMA_MAX_MIN 2
241 #define N_EWMA_MAX_MAX (INT32_MAX)
242 n_ewma_max =
243 networkstatus_get_param(NULL, "cc_ewma_max",
244 N_EWMA_MAX_DFLT,
245 N_EWMA_MAX_MIN,
246 N_EWMA_MAX_MAX);
248 #define N_EWMA_SS_MIN 2
249 #define N_EWMA_SS_MAX (INT32_MAX)
250 n_ewma_ss =
251 networkstatus_get_param(NULL, "cc_ewma_ss",
252 N_EWMA_SS_DFLT,
253 N_EWMA_SS_MIN,
254 N_EWMA_SS_MAX);
258 * Set congestion control parameters on a circuit's congestion
259 * control object based on values from the consensus.
261 * cc_alg is the negotiated congestion control algorithm.
263 * sendme_inc is the number of packaged cells that a sendme cell
264 * acks. This parameter will come from circuit negotiation.
266 static void
267 congestion_control_init_params(congestion_control_t *cc,
268 const circuit_params_t *params,
269 cc_path_t path)
271 const or_options_t *opts = get_options();
272 cc->sendme_inc = params->sendme_inc_cells;
274 #define CWND_INIT_MIN SENDME_INC_DFLT
275 #define CWND_INIT_MAX (10000)
276 cc->cwnd =
277 networkstatus_get_param(NULL, "cc_cwnd_init",
278 CIRCWINDOW_INIT,
279 CWND_INIT_MIN,
280 CWND_INIT_MAX);
282 #define CWND_INC_PCT_SS_MIN 1
283 #define CWND_INC_PCT_SS_MAX (500)
284 cc->cwnd_inc_pct_ss =
285 networkstatus_get_param(NULL, "cc_cwnd_inc_pct_ss",
286 CWND_INC_PCT_SS_DFLT,
287 CWND_INC_PCT_SS_MIN,
288 CWND_INC_PCT_SS_MAX);
290 #define CWND_INC_MIN 1
291 #define CWND_INC_MAX (1000)
292 cc->cwnd_inc =
293 networkstatus_get_param(NULL, "cc_cwnd_inc",
294 CWND_INC_DFLT,
295 CWND_INC_MIN,
296 CWND_INC_MAX);
298 #define CWND_INC_RATE_MIN 1
299 #define CWND_INC_RATE_MAX (250)
300 cc->cwnd_inc_rate =
301 networkstatus_get_param(NULL, "cc_cwnd_inc_rate",
302 CWND_INC_RATE_DFLT,
303 CWND_INC_RATE_MIN,
304 CWND_INC_RATE_MAX);
306 #define CWND_MIN_MIN SENDME_INC_DFLT
307 #define CWND_MIN_MAX (1000)
308 cc->cwnd_min =
309 networkstatus_get_param(NULL, "cc_cwnd_min",
310 CWND_MIN_DFLT,
311 CWND_MIN_MIN,
312 CWND_MIN_MAX);
314 /* If the consensus says to use OG sendme, but torrc has
315 * always-enabled, use the default "always" alg (vegas),
316 * else use cached conensus alg. */
317 if (cc_alg == CC_ALG_SENDME && opts->AlwaysCongestionControl) {
318 cc->cc_alg = CC_ALG_DFLT_ALWAYS;
319 } else {
320 cc->cc_alg = cc_alg;
323 /* Algorithm-specific parameters */
324 if (cc->cc_alg == CC_ALG_VEGAS) {
325 congestion_control_vegas_set_params(cc, path);
326 } else {
327 // This should not happen anymore
328 log_warn(LD_BUG, "Unknown congestion control algorithm %d",
329 cc->cc_alg);
333 /** Returns true if congestion control is enabled in the most recent
334 * consensus, or if __AlwaysCongestionControl is set to true.
336 * Note that this function (and many many other functions) should not
337 * be called from the CPU worker threads when handling congestion
338 * control negotiation. Relevant values are marshaled into the
339 * `circuit_params_t` struct, in order to be used in worker threads
340 * without touching global state. Use those values in CPU worker
341 * threads, instead of calling this function.
343 * The danger is still present, in your time, as it was in ours.
345 bool
346 congestion_control_enabled(void)
348 const or_options_t *opts = NULL;
350 tor_assert_nonfatal_once(in_main_thread());
352 opts = get_options();
354 /* If the user has set "__AlwaysCongesttionControl",
355 * then always try to negotiate congestion control, regardless
356 * of consensus param. This is to be used for testing and sbws.
358 * Note that we do *not* allow disabling congestion control
359 * if the consensus says to use it, as this is bad for queueing
360 * and fairness. */
361 if (opts->AlwaysCongestionControl)
362 return 1;
364 return cc_alg != CC_ALG_SENDME;
367 #ifdef TOR_UNIT_TESTS
369 * For unit tests only: set the cached consensus cc alg to
370 * specified value.
372 void
373 congestion_control_set_cc_enabled(void)
375 cc_alg = CC_ALG_VEGAS;
379 * For unit tests only: set the cached consensus cc alg to
380 * specified value.
382 void
383 congestion_control_set_cc_disabled(void)
385 cc_alg = CC_ALG_SENDME;
387 #endif
390 * Allocate and initialize fields in congestion control object.
392 * cc_alg is the negotiated congestion control algorithm.
394 * sendme_inc is the number of packaged cells that a sendme cell
395 * acks. This parameter will come from circuit negotiation.
397 static void
398 congestion_control_init(congestion_control_t *cc,
399 const circuit_params_t *params,
400 cc_path_t path)
402 cc->sendme_pending_timestamps = smartlist_new();
404 cc->in_slow_start = 1;
405 congestion_control_init_params(cc, params, path);
407 cc->next_cc_event = CWND_UPDATE_RATE(cc);
410 /** Allocate and initialize a new congestion control object */
411 congestion_control_t *
412 congestion_control_new(const circuit_params_t *params, cc_path_t path)
414 congestion_control_t *cc = tor_malloc_zero(sizeof(congestion_control_t));
416 congestion_control_init(cc, params, path);
418 cc_stats_circs_created++;
420 return cc;
424 * Free a congestion control object and its associated state.
426 void
427 congestion_control_free_(congestion_control_t *cc)
429 if (!cc)
430 return;
432 SMARTLIST_FOREACH(cc->sendme_pending_timestamps, uint64_t *, t, tor_free(t));
433 smartlist_free(cc->sendme_pending_timestamps);
435 tor_free(cc);
439 * Enqueue a u64 timestamp to the end of a queue of timestamps.
441 STATIC inline void
442 enqueue_timestamp(smartlist_t *timestamps_u64, uint64_t timestamp_usec)
444 uint64_t *timestamp_ptr = tor_malloc(sizeof(uint64_t));
445 *timestamp_ptr = timestamp_usec;
447 smartlist_add(timestamps_u64, timestamp_ptr);
451 * Dequeue a u64 monotime usec timestamp from the front of a
452 * smartlist of pointers to 64.
454 static inline uint64_t
455 dequeue_timestamp(smartlist_t *timestamps_u64_usecs)
457 uint64_t *timestamp_ptr = smartlist_get(timestamps_u64_usecs, 0);
458 uint64_t timestamp_u64;
460 if (BUG(!timestamp_ptr)) {
461 log_err(LD_CIRC, "Congestion control timestamp list became empty!");
462 return 0;
465 timestamp_u64 = *timestamp_ptr;
466 smartlist_del_keeporder(timestamps_u64_usecs, 0);
467 tor_free(timestamp_ptr);
469 return timestamp_u64;
473 * Returns the number N of N-count EWMA, for averaging RTT and BDP over
474 * N SENDME acks.
476 * This N is bracketed between a divisor of the number of acks in a CWND
477 * and a max value. It is always at least 2.
479 static inline uint64_t
480 n_ewma_count(const congestion_control_t *cc)
482 uint64_t ewma_cnt = 0;
484 if (cc->in_slow_start) {
485 /* In slow-start, we check the Vegas condition every sendme,
486 * so much lower ewma counts are needed. */
487 ewma_cnt = n_ewma_ss;
488 } else {
489 /* After slow-start, we check the Vegas condition only once per
490 * CWND, so it is better to average over longer periods. */
491 ewma_cnt = MIN(CWND_UPDATE_RATE(cc)*n_ewma_cwnd_pct/100,
492 n_ewma_max);
494 ewma_cnt = MAX(ewma_cnt, 2);
495 return ewma_cnt;
499 * Get a package window from either old sendme logic, or congestion control.
501 * A package window is how many cells you can still send.
504 congestion_control_get_package_window(const circuit_t *circ,
505 const crypt_path_t *cpath)
507 int package_window;
508 congestion_control_t *cc;
510 tor_assert(circ);
512 if (cpath) {
513 package_window = cpath->package_window;
514 cc = cpath->ccontrol;
515 } else {
516 package_window = circ->package_window;
517 cc = circ->ccontrol;
520 if (!cc) {
521 return package_window;
522 } else {
523 /* Inflight can be above cwnd if cwnd was just reduced */
524 if (cc->inflight > cc->cwnd)
525 return 0;
526 /* In the extremely unlikely event that cwnd-inflight is larger than
527 * INT32_MAX, just return that cap, so old code doesn't explode. */
528 else if (cc->cwnd - cc->inflight > INT32_MAX)
529 return INT32_MAX;
530 else
531 return (int)(cc->cwnd - cc->inflight);
536 * Returns the number of cells that are acked by every sendme.
539 sendme_get_inc_count(const circuit_t *circ, const crypt_path_t *layer_hint)
541 int sendme_inc = CIRCWINDOW_INCREMENT;
542 congestion_control_t *cc = NULL;
544 if (layer_hint) {
545 cc = layer_hint->ccontrol;
546 } else {
547 cc = circ->ccontrol;
550 if (cc) {
551 sendme_inc = cc->sendme_inc;
554 return sendme_inc;
557 /** Return true iff the next cell we send will result in the other endpoint
558 * sending a SENDME.
560 * We are able to know that because the package or inflight window value minus
561 * one cell (the possible SENDME cell) should be a multiple of the
562 * cells-per-sendme increment value (set via consensus parameter, negotiated
563 * for the circuit, and passed in as sendme_inc).
565 * This function is used when recording a cell digest and this is done quite
566 * low in the stack when decrypting or encrypting a cell. The window is only
567 * updated once the cell is actually put in the outbuf.
569 bool
570 circuit_sent_cell_for_sendme(const circuit_t *circ,
571 const crypt_path_t *layer_hint)
573 congestion_control_t *cc;
574 int window;
576 tor_assert(circ);
578 if (layer_hint) {
579 window = layer_hint->package_window;
580 cc = layer_hint->ccontrol;
581 } else {
582 window = circ->package_window;
583 cc = circ->ccontrol;
586 /* If we are using congestion control and the alg is not
587 * old-school 'fixed', then use cc->inflight to determine
588 * when sendmes will be sent */
589 if (cc) {
590 if (!cc->inflight)
591 return false;
593 /* This check must be +1 because this function is called *before*
594 * inflight is incremented for the sent cell */
595 if ((cc->inflight+1) % cc->sendme_inc != 0)
596 return false;
598 return true;
601 /* At the start of the window, no SENDME will be expected. */
602 if (window == CIRCWINDOW_START) {
603 return false;
606 /* Are we at the limit of the increment and if not, we don't expect next
607 * cell is a SENDME.
609 * We test against the window minus 1 because when we are looking if the
610 * next cell is a SENDME, the window (either package or deliver) hasn't been
611 * decremented just yet so when this is called, we are currently processing
612 * the "window - 1" cell.
614 if (((window - 1) % CIRCWINDOW_INCREMENT) != 0) {
615 return false;
618 /* Next cell is expected to be a SENDME. */
619 return true;
623 * Call-in to tell congestion control code that this circuit sent a cell.
625 * This updates the 'inflight' counter, and if this is a cell that will
626 * cause the other end to send a SENDME, record the current time in a list
627 * of pending timestamps, so that we can later compute the circuit RTT when
628 * the SENDME comes back. */
629 void
630 congestion_control_note_cell_sent(congestion_control_t *cc,
631 const circuit_t *circ,
632 const crypt_path_t *cpath)
634 tor_assert(circ);
635 tor_assert(cc);
637 /* Is this the last cell before a SENDME? The idea is that if the
638 * package_window reaches a multiple of the increment, after this cell, we
639 * should expect a SENDME. Note that this function must be called *before*
640 * we account for the sent cell. */
641 if (!circuit_sent_cell_for_sendme(circ, cpath)) {
642 cc->inflight++;
643 return;
646 cc->inflight++;
648 /* Record this cell time for RTT computation when SENDME arrives */
649 enqueue_timestamp(cc->sendme_pending_timestamps,
650 monotime_absolute_usec());
654 * Upon receipt of a SENDME, pop the oldest timestamp off the timestamp
655 * list, and use this to update RTT.
657 * Returns true if circuit estimates were successfully updated, false
658 * otherwise.
660 bool
661 congestion_control_update_circuit_estimates(congestion_control_t *cc,
662 const circuit_t *circ)
664 uint64_t now_usec = monotime_absolute_usec();
666 /* Update RTT first, then BDP. BDP needs fresh RTT */
667 uint64_t curr_rtt_usec = congestion_control_update_circuit_rtt(cc, now_usec);
668 return congestion_control_update_circuit_bdp(cc, circ, curr_rtt_usec);
672 * Returns true if we have enough time data to use heuristics
673 * to compare RTT to a baseline.
675 static bool
676 time_delta_should_use_heuristics(const congestion_control_t *cc)
678 /* If we have exited slow start and also have an EWMA RTT, we
679 * should have processed at least a cwnd worth of RTTs */
680 if (!cc->in_slow_start && cc->ewma_rtt_usec) {
681 return true;
684 /* Not enough data to estimate clock jumps */
685 return false;
688 STATIC bool is_monotime_clock_broken = false;
691 * Returns true if the monotime delta is 0, or is significantly
692 * different than the previous delta. Either case indicates
693 * that the monotime time source stalled or jumped.
695 * Also caches the clock state in the is_monotime_clock_broken flag,
696 * so we can also provide a is_monotime_clock_reliable() function,
697 * used by flow control rate timing.
699 STATIC bool
700 time_delta_stalled_or_jumped(const congestion_control_t *cc,
701 uint64_t old_delta, uint64_t new_delta)
703 #define DELTA_DISCREPENCY_RATIO_MAX 5000
704 /* If we have a 0 new_delta, that is definitely a monotime stall */
705 if (new_delta == 0) {
706 static ratelim_t stall_info_limit = RATELIM_INIT(60);
707 log_fn_ratelim(&stall_info_limit, LOG_INFO, LD_CIRC,
708 "Congestion control cannot measure RTT due to monotime stall.");
710 is_monotime_clock_broken = true;
711 return true;
715 * For the heuristic cases, we need at least a few timestamps,
716 * to average out any previous partial stalls or jumps. So until
717 * that point, let's just assume its OK.
719 if (!time_delta_should_use_heuristics(cc)) {
720 return false;
723 /* If old_delta is significantly larger than new_delta, then
724 * this means that the monotime clock could have recently
725 * stopped moving forward. However, use the cache for this
726 * value, because it may also be caused by network activity,
727 * or by a previous clock jump that was not detected.
729 * So if we have not gotten a 0-delta recently, we will
730 * still allow this new low RTT, but just yell about it. */
731 if (old_delta > new_delta * DELTA_DISCREPENCY_RATIO_MAX) {
732 static ratelim_t dec_notice_limit = RATELIM_INIT(300);
733 log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC,
734 "Sudden decrease in circuit RTT (%"PRIu64" vs %"PRIu64
735 "), likely due to clock jump.",
736 new_delta/1000, old_delta/1000);
738 return is_monotime_clock_broken;
741 /* If new_delta is significantly larger than old_delta, then
742 * this means that the monotime clock suddenly jumped forward.
743 * However, do not cache this value, because it may also be caused
744 * by network activity.
746 if (new_delta > old_delta * DELTA_DISCREPENCY_RATIO_MAX) {
747 static ratelim_t dec_notice_limit = RATELIM_INIT(300);
748 log_fn_ratelim(&dec_notice_limit, LOG_PROTOCOL_WARN, LD_CIRC,
749 "Sudden increase in circuit RTT (%"PRIu64" vs %"PRIu64
750 "), likely due to clock jump or suspended remote endpoint.",
751 new_delta/1000, old_delta/1000);
753 return true;
756 /* All good! Update cached status, too */
757 is_monotime_clock_broken = false;
759 return false;
763 * Is the monotime clock stalled according to any circuits?
765 bool
766 is_monotime_clock_reliable(void)
768 return !is_monotime_clock_broken;
772 * Called when we get a SENDME. Updates circuit RTT by pulling off a
773 * timestamp of when we sent the CIRCWINDOW_INCREMENT-th cell from
774 * the queue of such timestamps, and comparing that to current time.
776 * Also updates min, max, and EWMA of RTT.
778 * Returns the current circuit RTT in usecs, or 0 if it could not be
779 * measured (due to clock jump, stall, etc).
781 STATIC uint64_t
782 congestion_control_update_circuit_rtt(congestion_control_t *cc,
783 uint64_t now_usec)
785 uint64_t rtt, ewma_cnt;
786 uint64_t sent_at_timestamp;
788 tor_assert(cc);
790 /* Get the time that we sent the cell that resulted in the other
791 * end sending this sendme. Use this to calculate RTT */
792 sent_at_timestamp = dequeue_timestamp(cc->sendme_pending_timestamps);
794 rtt = now_usec - sent_at_timestamp;
796 /* Do not update RTT at all if it looks fishy */
797 if (time_delta_stalled_or_jumped(cc, cc->ewma_rtt_usec, rtt)) {
798 num_clock_stalls++; /* Accounting */
799 return 0;
802 ewma_cnt = n_ewma_count(cc);
804 cc->ewma_rtt_usec = n_count_ewma(rtt, cc->ewma_rtt_usec, ewma_cnt);
806 if (rtt > cc->max_rtt_usec) {
807 cc->max_rtt_usec = rtt;
810 if (cc->min_rtt_usec == 0) {
811 // If we do not have a min_rtt yet, use current ewma
812 cc->min_rtt_usec = cc->ewma_rtt_usec;
813 } else if (cc->cwnd == cc->cwnd_min && !cc->in_slow_start) {
814 // Raise min rtt if cwnd hit cwnd_min. This gets us out of a wedge state
815 // if we hit cwnd_min due to an abnormally low rtt.
816 uint64_t new_rtt = percent_max_mix(cc->ewma_rtt_usec, cc->min_rtt_usec,
817 rtt_reset_pct);
819 static ratelim_t rtt_notice_limit = RATELIM_INIT(300);
820 log_fn_ratelim(&rtt_notice_limit, LOG_NOTICE, LD_CIRC,
821 "Resetting circ RTT from %"PRIu64" to %"PRIu64" due to low cwnd",
822 cc->min_rtt_usec/1000, new_rtt/1000);
824 cc->min_rtt_usec = new_rtt;
825 num_rtt_reset++; /* Accounting */
826 } else if (cc->ewma_rtt_usec < cc->min_rtt_usec) {
827 // Using the EWMA for min instead of current RTT helps average out
828 // effects from other conns
829 cc->min_rtt_usec = cc->ewma_rtt_usec;
832 return rtt;
836 * Called when we get a SENDME. Updates the bandwidth-delay-product (BDP)
837 * estimates of a circuit. Several methods of computing BDP are used,
838 * depending on scenario. While some congestion control algorithms only
839 * use one of these methods, we update them all because it's quick and easy.
841 * - now_usec is the current monotime in usecs.
842 * - curr_rtt_usec is the current circuit RTT in usecs. It may be 0 if no
843 * RTT could bemeasured.
845 * Returns true if we were able to update BDP, false otherwise.
847 static bool
848 congestion_control_update_circuit_bdp(congestion_control_t *cc,
849 const circuit_t *circ,
850 uint64_t curr_rtt_usec)
852 int chan_q = 0;
853 unsigned int blocked_on_chan = 0;
855 tor_assert(cc);
857 if (CIRCUIT_IS_ORIGIN(circ)) {
858 /* origin circs use n_chan */
859 chan_q = circ->n_chan_cells.n;
860 blocked_on_chan = circ->circuit_blocked_on_n_chan;
861 } else {
862 /* Both onion services and exits use or_circuit and p_chan */
863 chan_q = CONST_TO_OR_CIRCUIT(circ)->p_chan_cells.n;
864 blocked_on_chan = circ->circuit_blocked_on_p_chan;
867 /* If we have no EWMA RTT, it is because monotime has been stalled
868 * or messed up the entire time so far. Set our BDP estimates directly
869 * to current cwnd */
870 if (!cc->ewma_rtt_usec) {
871 uint64_t cwnd = cc->cwnd;
873 tor_assert_nonfatal(cc->cwnd <= cwnd_max);
875 /* If the channel is blocked, keep subtracting off the chan_q
876 * until we hit the min cwnd. */
877 if (blocked_on_chan) {
878 /* Cast is fine because we're less than int32 */
879 if (chan_q >= (int64_t)cwnd) {
880 log_notice(LD_CIRC,
881 "Clock stall with large chanq: %d %"PRIu64, chan_q, cwnd);
882 cwnd = cc->cwnd_min;
883 } else {
884 cwnd = MAX(cwnd - chan_q, cc->cwnd_min);
886 cc->blocked_chan = 1;
887 } else {
888 cc->blocked_chan = 0;
891 cc->bdp = cwnd;
893 static ratelim_t dec_notice_limit = RATELIM_INIT(300);
894 log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC,
895 "Our clock has been stalled for the entire lifetime of a circuit. "
896 "Performance may be sub-optimal.");
898 return blocked_on_chan;
901 /* Congestion window based BDP will respond to changes in RTT only, and is
902 * relative to cwnd growth. It is useful for correcting for BDP
903 * overestimation, but if BDP is higher than the current cwnd, it will
904 * underestimate it.
906 * We multiply here first to avoid precision issues from min_RTT being
907 * close to ewma RTT. Since all fields are u64, there is plenty of
908 * room here to multiply first.
910 cc->bdp = cc->cwnd*cc->min_rtt_usec/cc->ewma_rtt_usec;
912 /* The orconn is blocked; use smaller of inflight vs SENDME */
913 if (blocked_on_chan) {
914 log_info(LD_CIRC, "CC: Streams blocked on circ channel. Chanq: %d",
915 chan_q);
917 /* A blocked channel is an immediate congestion signal, but it still
918 * happens only once per cwnd */
919 if (!cc->blocked_chan) {
920 cc->next_cc_event = 0;
921 cc->blocked_chan = 1;
923 } else {
924 /* If we were previously blocked, emit a new congestion event
925 * now that we are unblocked, to re-evaluate cwnd */
926 if (cc->blocked_chan) {
927 cc->blocked_chan = 0;
928 cc->next_cc_event = 0;
929 log_info(LD_CIRC, "CC: Streams un-blocked on circ channel. Chanq: %d",
930 chan_q);
934 if (cc->next_cc_event == 0) {
935 if (CIRCUIT_IS_ORIGIN(circ)) {
936 log_info(LD_CIRC,
937 "CC: Circuit %d "
938 "SENDME RTT: %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", "
939 "BDP estimate: %"PRIu64,
940 CONST_TO_ORIGIN_CIRCUIT(circ)->global_identifier,
941 cc->min_rtt_usec/1000,
942 curr_rtt_usec/1000,
943 cc->ewma_rtt_usec/1000,
944 cc->max_rtt_usec/1000,
945 cc->bdp);
946 } else {
947 log_info(LD_CIRC,
948 "CC: Circuit %"PRIu64":%d "
949 "SENDME RTT: %"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64", "
950 "%"PRIu64,
951 CONST_TO_OR_CIRCUIT(circ)->p_chan->global_identifier,
952 CONST_TO_OR_CIRCUIT(circ)->p_circ_id,
953 cc->min_rtt_usec/1000,
954 curr_rtt_usec/1000,
955 cc->ewma_rtt_usec/1000,
956 cc->max_rtt_usec/1000,
957 cc->bdp);
961 /* We updated BDP this round if either we had a blocked channel, or
962 * the curr_rtt_usec was not 0. */
963 bool ret = (blocked_on_chan || curr_rtt_usec != 0);
964 if (ret) {
965 tor_trace(TR_SUBSYS(cc), TR_EV(bdp_update), circ, cc, curr_rtt_usec);
967 return ret;
971 * Dispatch the sendme to the appropriate congestion control algorithm.
974 congestion_control_dispatch_cc_alg(congestion_control_t *cc,
975 circuit_t *circ)
977 int ret = -END_CIRC_REASON_INTERNAL;
979 tor_assert_nonfatal_once(cc->cc_alg == CC_ALG_VEGAS);
980 ret = congestion_control_vegas_process_sendme(cc, circ);
982 if (cc->cwnd > cwnd_max) {
983 static ratelim_t cwnd_limit = RATELIM_INIT(60);
984 log_fn_ratelim(&cwnd_limit, LOG_NOTICE, LD_CIRC,
985 "Congestion control cwnd %"PRIu64" exceeds max %d, clamping.",
986 cc->cwnd, cwnd_max);
987 cc->cwnd = cwnd_max;
990 /* If we have a non-zero RTT measurement, update conflux. */
991 if (circ->conflux && cc->ewma_rtt_usec)
992 conflux_update_rtt(circ->conflux, circ, cc->ewma_rtt_usec);
994 return ret;
998 * Build an extension field request to negotiate congestion control.
1000 * If congestion control is enabled, field TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST
1001 * is created in msg_out. It is a single 0-length field that signifies that we
1002 * want to use congestion control. The length of msg_out is provided via
1003 * msg_len_out.
1005 * If congestion control is not enabled, a payload with 0 extensions is created
1006 * and returned.
1008 * If there is a failure building the request, -1 is returned, else 0.
1010 * *msg_out must be freed if the return value is 0.
1013 congestion_control_build_ext_request(uint8_t **msg_out, size_t *msg_len_out)
1015 uint8_t *request = NULL;
1016 trn_extension_t *ext = NULL;
1017 trn_extension_field_t *field = NULL;
1019 ext = trn_extension_new();
1021 /* With congestion control enabled, add the request, else it is an empty
1022 * request in the payload. */
1024 if (congestion_control_enabled()) {
1025 /* Build the extension field that will hold the CC field. */
1026 field = trn_extension_field_new();
1027 trn_extension_field_set_field_type(field,
1028 TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST);
1030 /* No payload indicating a request to use congestion control. */
1031 trn_extension_field_set_field_len(field, 0);
1033 /* Build final extension. */
1034 trn_extension_add_fields(ext, field);
1035 trn_extension_set_num(ext, 1);
1038 /* Encode extension. */
1039 ssize_t ret = trn_extension_encoded_len(ext);
1040 if (BUG(ret < 0)) {
1041 goto err;
1043 size_t request_len = ret;
1044 request = tor_malloc_zero(request_len);
1045 ret = trn_extension_encode(request, request_len, ext);
1046 if (BUG(ret < 0)) {
1047 tor_free(request);
1048 goto err;
1050 *msg_out = request;
1051 *msg_len_out = request_len;
1053 /* Free everything, we've encoded the request now. */
1054 ret = 0;
1056 err:
1057 trn_extension_free(ext);
1058 return (int)ret;
1062 * Parse a congestion control ntorv3 request payload for extensions.
1064 * On parsing failure, -1 is returned.
1066 * If congestion control request is present, return 1. If it is not present,
1067 * return 0.
1069 * WARNING: Called from CPU worker! Must not access any global state.
1072 congestion_control_parse_ext_request(const uint8_t *msg, const size_t msg_len)
1074 ssize_t ret = 0;
1075 trn_extension_t *ext = NULL;
1076 size_t num_fields = 0;
1078 /* Parse extension from payload. */
1079 ret = trn_extension_parse(&ext, msg, msg_len);
1080 if (ret < 0) {
1081 goto end;
1084 /* No extension implies no support for congestion control. In this case, we
1085 * simply return 0 to indicate CC is disabled. */
1086 if ((num_fields = trn_extension_get_num(ext)) == 0) {
1087 ret = 0;
1088 goto end;
1091 /* Go over all fields. If any field is TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST,
1092 * then congestion control is enabled. Ignore unknown fields. */
1093 for (size_t f = 0; f < num_fields; f++) {
1094 const trn_extension_field_t *field = trn_extension_get_fields(ext, f);
1095 if (field == NULL) {
1096 ret = -1;
1097 goto end;
1100 /* For congestion control to be enabled, we only need the field type. */
1101 if (trn_extension_field_get_field_type(field) ==
1102 TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST) {
1103 ret = 1;
1104 break;
1108 end:
1109 trn_extension_free(ext);
1110 return (int)ret;
1114 * Given our observed parameters for circuits and congestion control,
1115 * as well as the parameters for the resulting circuit, build a response
1116 * payload using extension fields into *msg_out, with length specified in
1117 * *msg_out_len.
1119 * If congestion control will be enabled, the extension field for
1120 * TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE will contain the sendme_inc value.
1122 * If congestion control won't be enabled, an extension payload with 0
1123 * fields will be created.
1125 * Return 0 if an extension payload was created in *msg_out, and -1 on
1126 * error.
1128 * *msg_out must be freed if the return value is 0.
1130 * WARNING: Called from CPU worker! Must not access any global state.
1133 congestion_control_build_ext_response(const circuit_params_t *our_params,
1134 const circuit_params_t *circ_params,
1135 uint8_t **msg_out, size_t *msg_len_out)
1137 ssize_t ret;
1138 uint8_t *request = NULL;
1139 trn_extension_t *ext = NULL;
1140 trn_extension_field_t *field = NULL;
1141 trn_extension_field_cc_t *cc_field = NULL;
1143 tor_assert(our_params);
1144 tor_assert(circ_params);
1145 tor_assert(msg_out);
1146 tor_assert(msg_len_out);
1148 ext = trn_extension_new();
1150 if (circ_params->cc_enabled) {
1151 /* Build the extension field that will hold the CC field. */
1152 field = trn_extension_field_new();
1153 trn_extension_field_set_field_type(field,
1154 TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE);
1156 /* Build the congestion control field response. */
1157 cc_field = trn_extension_field_cc_new();
1158 trn_extension_field_cc_set_sendme_inc(cc_field,
1159 our_params->sendme_inc_cells);
1161 ret = trn_extension_field_cc_encoded_len(cc_field);
1162 if (BUG(ret <= 0)) {
1163 trn_extension_field_free(field);
1164 goto err;
1166 size_t field_len = ret;
1167 trn_extension_field_set_field_len(field, field_len);
1168 trn_extension_field_setlen_field(field, field_len);
1170 uint8_t *field_array = trn_extension_field_getarray_field(field);
1171 ret = trn_extension_field_cc_encode(field_array,
1172 trn_extension_field_getlen_field(field), cc_field);
1173 if (BUG(ret <= 0)) {
1174 trn_extension_field_free(field);
1175 goto err;
1178 /* Build final extension. */
1179 trn_extension_add_fields(ext, field);
1180 trn_extension_set_num(ext, 1);
1183 /* Encode extension. */
1184 ret = trn_extension_encoded_len(ext);
1185 if (BUG(ret < 0)) {
1186 goto err;
1188 size_t request_len = ret;
1189 request = tor_malloc_zero(request_len);
1190 ret = trn_extension_encode(request, request_len, ext);
1191 if (BUG(ret < 0)) {
1192 tor_free(request);
1193 goto err;
1195 *msg_out = request;
1196 *msg_len_out = request_len;
1198 /* We've just encoded the extension, clean everything. */
1199 ret = 0;
1201 err:
1202 trn_extension_free(ext);
1203 trn_extension_field_cc_free(cc_field);
1204 return (int)ret;
1207 /** Return true iff the given sendme increment is within the acceptable
1208 * margins. */
1209 bool
1210 congestion_control_validate_sendme_increment(uint8_t sendme_inc)
1212 /* We will only accept this response (and this circuit) if sendme_inc
1213 * is within +/- 1 of the current consensus value. We should not need
1214 * to change cc_sendme_inc much, and if we do, we can spread out those
1215 * changes over smaller increments once every 4 hours. Exits that
1216 * violate this range should just not be used. */
1218 if (sendme_inc == 0)
1219 return false;
1221 if (sendme_inc > (congestion_control_sendme_inc() + 1) ||
1222 sendme_inc < (congestion_control_sendme_inc() - 1)) {
1223 return false;
1225 return true;
1228 /** Return 1 if CC is enabled which also will set the SENDME increment into our
1229 * params_out. Return 0 if CC is disabled. Else, return -1 on error. */
1231 congestion_control_parse_ext_response(const uint8_t *msg,
1232 const size_t msg_len,
1233 circuit_params_t *params_out)
1235 ssize_t ret = 0;
1236 size_t num_fields = 0;
1237 trn_extension_t *ext = NULL;
1238 trn_extension_field_cc_t *cc_field = NULL;
1240 /* We will only accept this response (and this circuit) if sendme_inc
1241 * is within a factor of 2 of our consensus value. We should not need
1242 * to change cc_sendme_inc much, and if we do, we can spread out those
1243 * changes over smaller increments once every 4 hours. Exits that
1244 * violate this range should just not be used. */
1245 #define MAX_SENDME_INC_NEGOTIATE_FACTOR 2
1247 /* Parse extension from payload. */
1248 ret = trn_extension_parse(&ext, msg, msg_len);
1249 if (ret < 0) {
1250 goto end;
1253 if ((num_fields = trn_extension_get_num(ext)) == 0) {
1254 ret = 0;
1255 goto end;
1258 /* Go over all fields. If any field is TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE,
1259 * then congestion control is enabled. Ignore unknown fields. */
1260 for (size_t f = 0; f < num_fields; f++) {
1261 const trn_extension_field_t *field = trn_extension_get_fields(ext, f);
1262 if (field == NULL) {
1263 ret = -1;
1264 goto end;
1267 /* Only examine TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE; ignore other fields */
1268 if (trn_extension_field_get_field_type(field) ==
1269 TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE) {
1271 /* Parse the field into the congestion control field. */
1272 ret = trn_extension_field_cc_parse(&cc_field,
1273 trn_extension_field_getconstarray_field(field),
1274 trn_extension_field_getlen_field(field));
1275 if (ret < 0) {
1276 goto end;
1279 uint8_t sendme_inc_cells =
1280 trn_extension_field_cc_get_sendme_inc(cc_field);
1281 if (!congestion_control_validate_sendme_increment(sendme_inc_cells)) {
1282 ret = -1;
1283 goto end;
1286 /* All good. Get value and break */
1287 params_out->sendme_inc_cells = sendme_inc_cells;
1288 ret = 1;
1289 break;
1293 end:
1294 trn_extension_free(ext);
1295 trn_extension_field_cc_free(cc_field);
1297 return (int)ret;
1301 * Returns a formatted string of fields containing congestion
1302 * control information, for the CIRC_BW control port event.
1304 * An origin circuit can have a ccontrol object directly on it,
1305 * if it is an onion service, or onion client. Exit-bound clients
1306 * will have the ccontrol on the cpath associated with their exit
1307 * (the last one in the cpath list).
1309 * WARNING: This function does not support leaky-pipe topology. It
1310 * is to be used for control port information only.
1312 char *
1313 congestion_control_get_control_port_fields(const origin_circuit_t *circ)
1315 const congestion_control_t *ccontrol = NULL;
1316 char *ret = NULL;
1317 int len;
1319 if (TO_CIRCUIT(circ)->ccontrol) {
1320 ccontrol = TO_CIRCUIT(circ)->ccontrol;
1321 } else if (circ->cpath && circ->cpath->prev->ccontrol) {
1322 /* Get ccontrol for last hop (exit) if it exists */
1323 ccontrol = circ->cpath->prev->ccontrol;
1326 if (!ccontrol)
1327 return NULL;
1329 len = tor_asprintf(&ret,
1330 " SS=%d CWND=%"PRIu64" RTT=%"PRIu64" MIN_RTT=%"PRIu64,
1331 ccontrol->in_slow_start, ccontrol->cwnd,
1332 ccontrol->ewma_rtt_usec/1000,
1333 ccontrol->min_rtt_usec/1000);
1334 if (len < 0) {
1335 log_warn(LD_BUG, "Unable to format event for controller.");
1336 return NULL;
1339 return ret;