src/core/or/scheduler_kist.c

   1 /* Copyright (c) 2017-2021, The Tor Project, Inc. */
   2 /* See LICENSE for licensing information */
   3
   4 /**
   5  * @file scheduler_kist.c
   6  * @brief Implements the KIST cell scheduler.
   7  **/
   8
   9 #define SCHEDULER_KIST_PRIVATE
  10
  11 #include "core/or/or.h"
  12 #include "lib/buf/buffers.h"
  13 #include "app/config/config.h"
  14 #include "core/mainloop/connection.h"
  15 #include "feature/nodelist/networkstatus.h"
  16 #include "feature/relay/routermode.h"
  17 #define CHANNEL_OBJECT_PRIVATE
  18 #include "core/or/channel.h"
  19 #include "core/or/channeltls.h"
  20 #define SCHEDULER_PRIVATE
  21 #include "core/or/scheduler.h"
  22 #include "lib/math/fp.h"
  23
  24 #include "core/or/or_connection_st.h"
  25
  26 #ifdef HAVE_SYS_IOCTL_H
  27 #include <sys/ioctl.h>
  28 #endif
  29
  30 #ifdef HAVE_KIST_SUPPORT
  31 /* Kernel interface needed for KIST. */
  32 #include <netinet/tcp.h>
  33 #include <linux/sockios.h>
  34 #endif /* HAVE_KIST_SUPPORT */
  35
  36 /*****************************************************************************
  37  * Data structures and supporting functions
  38  *****************************************************************************/
  39
  /* socket_table hash table stuff. The socket_table keeps track of the
   * per-socket limit information that KIST both computes and consumes. */
  42
  43 static uint32_t
  44 socket_table_ent_hash(const socket_table_ent_t *ent)
  45 {
  46   return (uint32_t)ent->chan->global_identifier;
  47 }
  48
  49 static unsigned
  50 socket_table_ent_eq(const socket_table_ent_t *a, const socket_table_ent_t *b)
  51 {
  52   return a->chan == b->chan;
  53 }
  54
  55 typedef HT_HEAD(socket_table_s, socket_table_ent_t) socket_table_t;
  56
  57 static socket_table_t socket_table = HT_INITIALIZER();
  58
  59 HT_PROTOTYPE(socket_table_s, socket_table_ent_t, node, socket_table_ent_hash,
  60              socket_table_ent_eq);
  61 HT_GENERATE2(socket_table_s, socket_table_ent_t, node, socket_table_ent_hash,
  62              socket_table_ent_eq, 0.6, tor_reallocarray, tor_free_);
  63
  64 /* outbuf_table hash table stuff. The outbuf_table keeps track of which
  65  * channels have data sitting in their outbuf so the kist scheduler can force
  66  * a write from outbuf to kernel periodically during a run and at the end of a
  67  * run. */
  68
  69 typedef struct outbuf_table_ent_t {
  70   HT_ENTRY(outbuf_table_ent_t) node;
  71   channel_t *chan;
  72 } outbuf_table_ent_t;
  73
  74 static uint32_t
  75 outbuf_table_ent_hash(const outbuf_table_ent_t *ent)
  76 {
  77   return (uint32_t)ent->chan->global_identifier;
  78 }
  79
  80 static unsigned
  81 outbuf_table_ent_eq(const outbuf_table_ent_t *a, const outbuf_table_ent_t *b)
  82 {
  83   return a->chan->global_identifier == b->chan->global_identifier;
  84 }
  85
  86 HT_PROTOTYPE(outbuf_table_s, outbuf_table_ent_t, node, outbuf_table_ent_hash,
  87              outbuf_table_ent_eq);
  88 HT_GENERATE2(outbuf_table_s, outbuf_table_ent_t, node, outbuf_table_ent_hash,
  89              outbuf_table_ent_eq, 0.6, tor_reallocarray, tor_free_);
  90
  91 /*****************************************************************************
  92  * Other internal data
  93  *****************************************************************************/
  94
  95 /* Store the last time the scheduler was run so we can decide when to next run
  96  * the scheduler based on it. */
  97 static monotime_t scheduler_last_run;
  98 /* This is a factor for the extra_space calculation in kist per-socket limits.
  99  * It is the number of extra congestion windows we want to write to the kernel.
 100  */
 101 static double sock_buf_size_factor = 1.0;
 102 /* How often the scheduler runs. */
 103 STATIC int sched_run_interval = KIST_SCHED_RUN_INTERVAL_DEFAULT;
 104
 105 #ifdef HAVE_KIST_SUPPORT
 106 /* Indicate if KIST lite mode is on or off. We can disable it at runtime.
 107  * Important to have because of the KISTLite -> KIST possible transition. */
 108 static unsigned int kist_lite_mode = 0;
 109 /* Indicate if we don't have the kernel support. This can happen if the kernel
 110  * changed and it doesn't recognized the values passed to the syscalls needed
 111  * by KIST. In that case, fallback to the naive approach. */
 112 static unsigned int kist_no_kernel_support = 0;
 113 #else /* !defined(HAVE_KIST_SUPPORT) */
 114 static unsigned int kist_lite_mode = 1;
 115 #endif /* defined(HAVE_KIST_SUPPORT) */
 116
 117 /*****************************************************************************
 118  * Internally called function implementations
 119  *****************************************************************************/
 120
 121 /* Little helper function to get the length of a channel's output buffer */
 122 static inline size_t
 123 channel_outbuf_length(channel_t *chan)
 124 {
 125   tor_assert(chan);
 126   /* In theory, this can not happen because we can not scheduler a channel
 127    * without a connection that has its outbuf initialized. Just in case, bug
 128    * on this so we can understand a bit more why it happened. */
 129   if (SCHED_BUG(BASE_CHAN_TO_TLS(chan)->conn == NULL, chan)) {
 130     return 0;
 131   }
 132   return buf_datalen(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn)->outbuf);
 133 }
 134
 135 /* Little helper function for HT_FOREACH_FN. */
 136 static int
 137 each_channel_write_to_kernel(outbuf_table_ent_t *ent, void *data)
 138 {
 139   (void) data; /* Make compiler happy. */
 140   channel_write_to_kernel(ent->chan);
 141   return 0; /* Returning non-zero removes the element from the table. */
 142 }
 143
 144 /* Free the given outbuf table entry ent. */
 145 static int
 146 free_outbuf_info_by_ent(outbuf_table_ent_t *ent, void *data)
 147 {
 148   (void) data; /* Make compiler happy. */
 149   log_debug(LD_SCHED, "Freeing outbuf table entry from chan=%" PRIu64,
 150             ent->chan->global_identifier);
 151   tor_free(ent);
 152   return 1; /* So HT_FOREACH_FN will remove the element */
 153 }
 154
 155 /* Free the given socket table entry ent. */
 156 static int
 157 free_socket_info_by_ent(socket_table_ent_t *ent, void *data)
 158 {
 159   (void) data; /* Make compiler happy. */
 160   log_debug(LD_SCHED, "Freeing socket table entry from chan=%" PRIu64,
 161             ent->chan->global_identifier);
 162   tor_free(ent);
 163   return 1; /* So HT_FOREACH_FN will remove the element */
 164 }
 165
 166 /* Clean up socket_table. Probably because the KIST sched impl is going away */
 167 static void
 168 free_all_socket_info(void)
 169 {
 170   HT_FOREACH_FN(socket_table_s, &socket_table, free_socket_info_by_ent, NULL);
 171   HT_CLEAR(socket_table_s, &socket_table);
 172 }
 173
 174 static socket_table_ent_t *
 175 socket_table_search(socket_table_t *table, const channel_t *chan)
 176 {
 177   socket_table_ent_t search, *ent = NULL;
 178   search.chan = chan;
 179   ent = HT_FIND(socket_table_s, table, &search);
 180   return ent;
 181 }
 182
 183 /* Free a socket entry in table for the given chan. */
 184 static void
 185 free_socket_info_by_chan(socket_table_t *table, const channel_t *chan)
 186 {
 187   socket_table_ent_t *ent = NULL;
 188   ent = socket_table_search(table, chan);
 189   if (!ent)
 190     return;
 191   log_debug(LD_SCHED, "scheduler free socket info for chan=%" PRIu64,
 192             chan->global_identifier);
 193   HT_REMOVE(socket_table_s, table, ent);
 194   free_socket_info_by_ent(ent, NULL);
 195 }
 196
 197 /* Perform system calls for the given socket in order to calculate kist's
 198  * per-socket limit as documented in the function body. */
 199 MOCK_IMPL(void,
 200 update_socket_info_impl, (socket_table_ent_t *ent))
 201 {
 202 #ifdef HAVE_KIST_SUPPORT
 203   int64_t tcp_space, extra_space;
 204   tor_assert(ent);
 205   tor_assert(ent->chan);
 206   const tor_socket_t sock =
 207     TO_CONN(CONST_BASE_CHAN_TO_TLS(ent->chan)->conn)->s;
 208   struct tcp_info tcp;
 209   socklen_t tcp_info_len = sizeof(tcp);
 210
 211   if (kist_no_kernel_support || kist_lite_mode) {
 212     goto fallback;
 213   }
 214
 215   /* Gather information */
 216   if (getsockopt(sock, SOL_TCP, TCP_INFO, (void *)&(tcp), &tcp_info_len) < 0) {
 217     if (errno == EINVAL) {
 218       /* Oops, this option is not provided by the kernel, we'll have to
 219        * disable KIST entirely. This can happen if tor was built on a machine
 220        * with the support previously or if the kernel was updated and lost the
 221        * support. */
 222       log_notice(LD_SCHED, "Looks like our kernel doesn't have the support "
 223                            "for KIST anymore. We will fallback to the naive "
 224                            "approach. Remove KIST from the Schedulers list "
 225                            "to disable.");
 226       kist_no_kernel_support = 1;
 227     }
 228     goto fallback;
 229   }
 230   if (ioctl(sock, SIOCOUTQNSD, &(ent->notsent)) < 0) {
 231     if (errno == EINVAL) {
 232       log_notice(LD_SCHED, "Looks like our kernel doesn't have the support "
 233                            "for KIST anymore. We will fallback to the naive "
 234                            "approach. Remove KIST from the Schedulers list "
 235                            "to disable.");
 236       /* Same reason as the above. */
 237       kist_no_kernel_support = 1;
 238     }
 239     goto fallback;
 240   }
 241   ent->cwnd = tcp.tcpi_snd_cwnd;
 242   ent->unacked = tcp.tcpi_unacked;
 243   ent->mss = tcp.tcpi_snd_mss;
 244
 245   /* In order to reduce outbound kernel queuing delays and thus improve Tor's
 246    * ability to prioritize circuits, KIST wants to set a socket write limit
 247    * that is near the amount that the socket would be able to immediately send
 248    * into the Internet.
 249    *
 250    * We first calculate how much the socket could send immediately (assuming
 251    * completely full packets) according to the congestion window and the number
 252    * of unacked packets.
 253    *
 254    * Then we add a little extra space in a controlled way. We do this so any
 255    * when the kernel gets ACKs back for data currently sitting in the "TCP
 256    * space", it will already have some more data to send immediately. It will
 257    * not have to wait for the scheduler to run again. The amount of extra space
 258    * is a factor of the current congestion window. With the suggested
 259    * sock_buf_size_factor value of 1.0, we allow at most 2*cwnd bytes to sit in
 260    * the kernel: 1 cwnd on the wire waiting for ACKs and 1 cwnd ready and
 261    * waiting to be sent when those ACKs finally come.
 262    *
 263    * In the below diagram, we see some bytes in the TCP-space (denoted by '*')
 264    * that have be sent onto the wire and are waiting for ACKs. We have a little
 265    * more room in "TCP space" that we can fill with data that will be
 266    * immediately sent. We also see the "extra space" KIST calculates. The sum
 267    * of the empty "TCP space" and the "extra space" is the kist-imposed write
 268    * limit for this socket.
 269    *
 270    * <----------------kernel-outbound-socket-queue----------------|
 271    * <*********---------------------------------------------------|
 272    * |----TCP-space-----|----extra-space-----|
 273    * |------------------|
 274    *                    ^ ((cwnd - unacked) * mss) bytes
 275    *                    |--------------------|
 276    *                                         ^ ((cwnd * mss) * factor) bytes
 277    */
 278
 279   /* These values from the kernel are uint32_t, they will always fit into a
 280    * int64_t tcp_space variable but if the congestion window cwnd is smaller
 281    * than the unacked packets, the remaining TCP space is set to 0. */
 282   if (ent->cwnd >= ent->unacked) {
 283     tcp_space = (ent->cwnd - ent->unacked) * (int64_t)(ent->mss);
 284   } else {
 285     tcp_space = 0;
 286   }
 287
 288   /* The clamp_double_to_int64 makes sure the first part fits into an int64_t.
 289    * In fact, if sock_buf_size_factor is still forced to be >= 0 in config.c,
 290    * then it will be positive for sure. Then we subtract a uint32_t. Getting a
 291    * negative value is OK, see after how it is being handled. */
 292   extra_space =
 293     clamp_double_to_int64(
 294                  (ent->cwnd * (int64_t)ent->mss) * sock_buf_size_factor) -
 295     ent->notsent - (int64_t)channel_outbuf_length((channel_t *) ent->chan);
 296   if ((tcp_space + extra_space) < 0) {
 297     /* This means that the "notsent" queue is just too big so we shouldn't put
 298      * more in the kernel for now. */
 299     ent->limit = 0;
 300   } else {
 301     /* The positive sum of two int64_t will always fit into an uint64_t.
 302      * And we know this will always be positive, since we checked above. */
 303     ent->limit = (uint64_t)tcp_space + (uint64_t)extra_space;
 304   }
 305   return;
 306
 307 #else /* !defined(HAVE_KIST_SUPPORT) */
 308   goto fallback;
 309 #endif /* defined(HAVE_KIST_SUPPORT) */
 310
 311  fallback:
 312   /* If all of a sudden we don't have kist support, we just zero out all the
 313    * variables for this socket since we don't know what they should be. We
 314    * also allow the socket to write as much as it can from the estimated
 315    * number of cells the lower layer can accept, effectively returning it to
 316    * Vanilla scheduler behavior. */
 317   ent->cwnd = ent->unacked = ent->mss = ent->notsent = 0;
 318   /* This function calls the specialized channel object (currently channeltls)
 319    * and ask how many cells it can write on the outbuf which we then multiply
 320    * by the size of the cells for this channel. The cast is because this
 321    * function requires a non-const channel object, meh. */
 322   ent->limit = channel_num_cells_writeable((channel_t *) ent->chan) *
 323                (get_cell_network_size(ent->chan->wide_circ_ids) +
 324                 TLS_PER_CELL_OVERHEAD);
 325 }
 326
 327 /* Given a socket that isn't in the table, add it.
 328  * Given a socket that is in the table, re-init values that need init-ing
 329  * every scheduling run
 330  */
 331 static void
 332 init_socket_info(socket_table_t *table, const channel_t *chan)
 333 {
 334   socket_table_ent_t *ent = NULL;
 335   ent = socket_table_search(table, chan);
 336   if (!ent) {
 337     log_debug(LD_SCHED, "scheduler init socket info for chan=%" PRIu64,
 338               chan->global_identifier);
 339     ent = tor_malloc_zero(sizeof(*ent));
 340     ent->chan = chan;
 341     HT_INSERT(socket_table_s, table, ent);
 342   }
 343   ent->written = 0;
 344 }
 345
 346 /* Add chan to the outbuf table if it isn't already in it. If it is, then don't
 347  * do anything */
 348 static void
 349 outbuf_table_add(outbuf_table_t *table, channel_t *chan)
 350 {
 351   outbuf_table_ent_t search, *ent;
 352   search.chan = chan;
 353   ent = HT_FIND(outbuf_table_s, table, &search);
 354   if (!ent) {
 355     log_debug(LD_SCHED, "scheduler init outbuf info for chan=%" PRIu64,
 356               chan->global_identifier);
 357     ent = tor_malloc_zero(sizeof(*ent));
 358     ent->chan = chan;
 359     HT_INSERT(outbuf_table_s, table, ent);
 360   }
 361 }
 362
 363 static void
 364 outbuf_table_remove(outbuf_table_t *table, channel_t *chan)
 365 {
 366   outbuf_table_ent_t search, *ent;
 367   search.chan = chan;
 368   ent = HT_FIND(outbuf_table_s, table, &search);
 369   if (ent) {
 370     HT_REMOVE(outbuf_table_s, table, ent);
 371     free_outbuf_info_by_ent(ent, NULL);
 372   }
 373 }
 374
 375 /* Set the scheduler running interval. */
 376 static void
 377 set_scheduler_run_interval(void)
 378 {
 379   int old_sched_run_interval = sched_run_interval;
 380   sched_run_interval = kist_scheduler_run_interval();
 381   if (old_sched_run_interval != sched_run_interval) {
 382     log_info(LD_SCHED, "Scheduler KIST changing its running interval "
 383                        "from %" PRId32 " to %" PRId32,
 384              old_sched_run_interval, sched_run_interval);
 385   }
 386 }
 387
 388 /* Return true iff the channel hasn't hit its kist-imposed write limit yet */
 389 static int
 390 socket_can_write(socket_table_t *table, const channel_t *chan)
 391 {
 392   socket_table_ent_t *ent = NULL;
 393   ent = socket_table_search(table, chan);
 394   if (SCHED_BUG(!ent, chan)) {
 395     return 1; // Just return true, saying that kist wouldn't limit the socket
 396   }
 397
 398   /* We previously calculated a write limit for this socket. In the below
 399    * calculation, first determine how much room is left in bytes. Then divide
 400    * that by the amount of space a cell takes. If there's room for at least 1
 401    * cell, then KIST will allow the socket to write. */
 402   int64_t kist_limit_space =
 403     (int64_t) (ent->limit - ent->written) /
 404     (CELL_MAX_NETWORK_SIZE + TLS_PER_CELL_OVERHEAD);
 405   return kist_limit_space > 0;
 406 }
 407
 408 /* Update the channel's socket kernel information. */
 409 static void
 410 update_socket_info(socket_table_t *table, const channel_t *chan)
 411 {
 412   socket_table_ent_t *ent = NULL;
 413   ent = socket_table_search(table, chan);
 414   if (SCHED_BUG(!ent, chan)) {
 415     return; // Whelp. Entry didn't exist for some reason so nothing to do.
 416   }
 417   update_socket_info_impl(ent);
 418   log_debug(LD_SCHED, "chan=%" PRIu64 " updated socket info, limit: %" PRIu64
 419                       ", cwnd: %" PRIu32 ", unacked: %" PRIu32
 420                       ", notsent: %" PRIu32 ", mss: %" PRIu32,
 421             ent->chan->global_identifier, ent->limit, ent->cwnd, ent->unacked,
 422             ent->notsent, ent->mss);
 423 }
 424
 425 /* Increment the channel's socket written value by the number of bytes. */
 426 static void
 427 update_socket_written(socket_table_t *table, channel_t *chan, size_t bytes)
 428 {
 429   socket_table_ent_t *ent = NULL;
 430   ent = socket_table_search(table, chan);
 431   if (SCHED_BUG(!ent, chan)) {
 432     return; // Whelp. Entry didn't exist so nothing to do.
 433   }
 434
 435   log_debug(LD_SCHED, "chan=%" PRIu64 " wrote %lu bytes, old was %" PRIi64,
 436             chan->global_identifier, (unsigned long) bytes, ent->written);
 437
 438   ent->written += bytes;
 439 }
 440
 441 /*
 442  * A naive KIST impl would write every single cell all the way to the kernel.
 443  * That would take a lot of system calls. A less bad KIST impl would write a
 444  * channel's outbuf to the kernel only when we are switching to a different
 445  * channel. But if we have two channels with equal priority, we end up writing
 446  * one cell for each and bouncing back and forth. This KIST impl avoids that
 447  * by only writing a channel's outbuf to the kernel if it has 8 cells or more
 448  * in it.
 449  *
 450  * Note: The number 8 was picked so that, when using 512-byte cells, it
 451  * would produce 4096 bytes: a common number for buffering.  A TLS
 452  * record can hold up to 16KiB; thus, using 8 512-byte cells means that
 453  * a relay will at most send a TLS record of 4KiB or 1/4 of the maximum
 454  * capacity of a TLS record.
 455  *
 456  * Of course, the above calculation became incorrect when we moved to
 457  * 514-byte cells in order to accommodate a 4-byte circuit ID; we may
 458  * want to consider profiling with '7' to see if it produces better
 459  * results.  (TODO)
 460  */
 461 MOCK_IMPL(int, channel_should_write_to_kernel,
 462           (outbuf_table_t *table, channel_t *chan))
 463 {
 464   outbuf_table_add(table, chan);
 465   /* CELL_MAX_NETWORK_SIZE * 8 because we only want to write the outbuf to the
 466    * kernel if there's 8 or more cells waiting */
 467   return channel_outbuf_length(chan) > (CELL_MAX_NETWORK_SIZE * 8);
 468 }
 469
 470 /* Little helper function to write a channel's outbuf all the way to the
 471  * kernel */
 472 MOCK_IMPL(void, channel_write_to_kernel, (channel_t *chan))
 473 {
 474   tor_assert(chan);
 475
 476   /* This is possible because a channel might have an outbuf table entry even
 477    * though it has no more cells in its outbuf. Just move on. */
 478   size_t outbuf_len = channel_outbuf_length(chan);
 479   if (outbuf_len == 0) {
 480     return;
 481   }
 482
 483   log_debug(LD_SCHED, "Writing %lu bytes to kernel for chan %" PRIu64,
 484             (unsigned long) outbuf_len, chan->global_identifier);
 485
 486   /* Note that 'connection_handle_write()' may change the scheduler state of
 487    * the channel during the scheduling loop with
 488    * 'connection_or_flushed_some()' -> 'scheduler_channel_wants_writes()'.
 489    * This side-effect will only occur if the channel is currently in the
 490    * 'SCHED_CHAN_WAITING_TO_WRITE' or 'SCHED_CHAN_IDLE' states, which KIST
 491    * rarely uses, so it should be fine unless KIST begins using these states
 492    * in the future. */
 493   connection_handle_write(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn), 0);
 494 }
 495
 496 /* Return true iff the scheduler has work to perform. */
 497 static int
 498 have_work(void)
 499 {
 500   smartlist_t *cp = get_channels_pending();
 501   IF_BUG_ONCE(!cp) {
 502     return 0; // channels_pending doesn't exist so... no work?
 503   }
 504   return smartlist_len(cp) > 0;
 505 }
 506
 /* Function of the scheduler interface: free_all()
  *
  * Drops every entry of the socket table. */
 static void
 kist_free_all(void)
 {
   free_all_socket_info();
 }
 513
 514 /* Function of the scheduler interface: on_channel_free() */
 515 static void
 516 kist_on_channel_free_fn(const channel_t *chan)
 517 {
 518   free_socket_info_by_chan(&socket_table, chan);
 519 }
 520
 /* Function of the scheduler interface: on_new_consensus()
  *
  * Re-evaluates the scheduler run interval. */
 static void
 kist_scheduler_on_new_consensus(void)
 {
   set_scheduler_run_interval();
 }
 527
 528 /* Function of the scheduler interface: on_new_options() */
 529 static void
 530 kist_scheduler_on_new_options(void)
 531 {
 532   sock_buf_size_factor = get_options()->KISTSockBufSizeFactor;
 533
 534   /* Calls kist_scheduler_run_interval which calls get_options(). */
 535   set_scheduler_run_interval();
 536 }
 537
 538 /* Function of the scheduler interface: init() */
 539 static void
 540 kist_scheduler_init(void)
 541 {
 542   /* When initializing the scheduler, the last run could be 0 because it is
 543    * declared static or a value in the past that was set when it was last
 544    * used. In both cases, we want to initialize it to now so we don't risk
 545    * using the value 0 which doesn't play well with our monotonic time
 546    * interface.
 547    *
 548    * One side effect is that the first scheduler run will be at the next tick
 549    * that is in now + 10 msec (KIST_SCHED_RUN_INTERVAL_DEFAULT) by default. */
 550   monotime_get(&scheduler_last_run);
 551
 552   kist_scheduler_on_new_options();
 553   IF_BUG_ONCE(sched_run_interval == 0) {
 554     log_warn(LD_SCHED, "We are initing the KIST scheduler and noticed the "
 555              "KISTSchedRunInterval is telling us to not use KIST. That's "
 556              "weird! We'll continue using KIST, but at %" PRId32 "ms.",
 557              KIST_SCHED_RUN_INTERVAL_DEFAULT);
 558     sched_run_interval = KIST_SCHED_RUN_INTERVAL_DEFAULT;
 559   }
 560 }
 561
 562 /* Function of the scheduler interface: schedule() */
 563 static void
 564 kist_scheduler_schedule(void)
 565 {
 566   struct monotime_t now;
 567   struct timeval next_run;
 568   int64_t diff;
 569
 570   if (!have_work()) {
 571     return;
 572   }
 573   monotime_get(&now);
 574
 575   /* If time is really monotonic, we can never have now being smaller than the
 576    * last scheduler run. The scheduler_last_run at first is set to 0.
 577    * Unfortunately, not all platforms guarantee monotonic time so we log at
 578    * info level but don't make it more noisy. */
 579   diff = monotime_diff_msec(&scheduler_last_run, &now);
 580   if (diff < 0) {
 581     log_info(LD_SCHED, "Monotonic time between now and last run of scheduler "
 582                        "is negative: %" PRId64 ". Setting diff to 0.", diff);
 583     diff = 0;
 584   }
 585   if (diff < sched_run_interval) {
 586     next_run.tv_sec = 0;
 587     /* Takes 1000 ms -> us. This will always be valid because diff can NOT be
 588      * negative and can NOT be bigger than sched_run_interval so values can
 589      * only go from 1000 usec (diff set to interval - 1) to 100000 usec (diff
 590      * set to 0) for the maximum allowed run interval (100ms). */
 591     next_run.tv_usec = (int) ((sched_run_interval - diff) * 1000);
 592     /* Re-adding an event reschedules it. It does not duplicate it. */
 593     scheduler_ev_add(&next_run);
 594   } else {
 595     scheduler_ev_active();
 596   }
 597 }
 598
 599 /* Function of the scheduler interface: run() */
 600 static void
 601 kist_scheduler_run(void)
 602 {
 603   /* Define variables */
 604   channel_t *chan = NULL; // current working channel
 605   /* The last distinct chan served in a sched loop. */
 606   channel_t *prev_chan = NULL;
 607   int flush_result; // temporarily store results from flush calls
 608   /* Channels to be re-adding to pending at the end */
 609   smartlist_t *to_readd = NULL;
 610   smartlist_t *cp = get_channels_pending();
 611
 612   outbuf_table_t outbuf_table = HT_INITIALIZER();
 613
 614   /* For each pending channel, collect new kernel information */
 615   SMARTLIST_FOREACH_BEGIN(cp, const channel_t *, pchan) {
 616       init_socket_info(&socket_table, pchan);
 617       update_socket_info(&socket_table, pchan);
 618   } SMARTLIST_FOREACH_END(pchan);
 619
 620   log_debug(LD_SCHED, "Running the scheduler. %d channels pending",
 621             smartlist_len(cp));
 622
 623   /* The main scheduling loop. Loop until there are no more pending channels */
 624   while (smartlist_len(cp) > 0) {
 625     /* get best channel */
 626     chan = smartlist_pqueue_pop(cp, scheduler_compare_channels,
 627                                 offsetof(channel_t, sched_heap_idx));
 628     if (SCHED_BUG(!chan, NULL)) {
 629       /* Some-freaking-how a NULL got into the channels_pending. That should
 630        * never happen, but it should be harmless to ignore it and keep looping.
 631        */
 632       continue;
 633     }
 634     outbuf_table_add(&outbuf_table, chan);
 635
 636     /* if we have switched to a new channel, consider writing the previous
 637      * channel's outbuf to the kernel. */
 638     if (!prev_chan) {
 639       prev_chan = chan;
 640     }
 641     if (prev_chan != chan) {
 642       if (channel_should_write_to_kernel(&outbuf_table, prev_chan)) {
 643         channel_write_to_kernel(prev_chan);
 644         outbuf_table_remove(&outbuf_table, prev_chan);
 645       }
 646       prev_chan = chan;
 647     }
 648
 649     /* Only flush and write if the per-socket limit hasn't been hit */
 650     if (socket_can_write(&socket_table, chan)) {
 651       /* flush to channel queue/outbuf */
 652       flush_result = (int)channel_flush_some_cells(chan, 1); // 1 for num cells
 653       /* XXX: While flushing cells, it is possible that the connection write
 654        * fails leading to the channel to be closed which triggers a release
 655        * and free its entry in the socket table. And because of a engineering
 656        * design issue, the error is not propagated back so we don't get an
 657        * error at this point. So before we continue, make sure the channel is
 658        * open and if not just ignore it. See #23751. */
 659       if (!CHANNEL_IS_OPEN(chan)) {
 660         /* Channel isn't open so we put it back in IDLE mode. It is either
 661          * renegotiating its TLS session or about to be released. */
 662         scheduler_set_channel_state(chan, SCHED_CHAN_IDLE);
 663         continue;
 664       }
 665       /* flush_result has the # cells flushed */
 666       if (flush_result > 0) {
 667         update_socket_written(&socket_table, chan, flush_result *
 668                               (CELL_MAX_NETWORK_SIZE + TLS_PER_CELL_OVERHEAD));
 669       } else {
 670         /* XXX: This can happen because tor sometimes does flush in an
 671          * opportunistic way cells from the circuit to the outbuf so the
 672          * channel can end up here without having anything to flush nor needed
 673          * to write to the kernel. Hopefully we'll fix that soon but for now
 674          * we have to handle this case which happens kind of often. */
 675         log_debug(LD_SCHED,
 676                  "We didn't flush anything on a chan that we think "
 677                  "can write and wants to write. The channel's state is '%s' "
 678                  "and in scheduler state '%s'. We're going to mark it as "
 679                  "waiting_for_cells (as that's most likely the issue) and "
 680                  "stop scheduling it this round.",
 681                  channel_state_to_string(chan->state),
 682                  get_scheduler_state_string(chan->scheduler_state));
 683         scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS);
 684         continue;
 685       }
 686     }
 687
 688     /* Decide what to do with the channel now */
 689
 690     if (!channel_more_to_flush(chan) &&
 691         !socket_can_write(&socket_table, chan)) {
 692
 693       /* Case 1: no more cells to send, and cannot write */
 694
 695       /*
 696        * You might think we should put the channel in SCHED_CHAN_IDLE. And
 697        * you're probably correct. While implementing KIST, we found that the
 698        * scheduling system would sometimes lose track of channels when we did
 699        * that. We suspect it has to do with the difference between "can't
 700        * write because socket/outbuf is full" and KIST's "can't write because
 701        * we've arbitrarily decided that that's enough for now." Sometimes
 702        * channels run out of cells at the same time they hit their
 703        * kist-imposed write limit and maybe the rest of Tor doesn't put the
 704        * channel back in pending when it is supposed to.
 705        *
 706        * This should be investigated again. It is as simple as changing
 707        * SCHED_CHAN_WAITING_FOR_CELLS to SCHED_CHAN_IDLE and seeing if Tor
 708        * starts having serious throughput issues. Best done in shadow/chutney.
 709        */
 710       scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS);
 711     } else if (!channel_more_to_flush(chan)) {
 712
 713       /* Case 2: no more cells to send, but still open for writes */
 714
 715       scheduler_set_channel_state(chan, SCHED_CHAN_WAITING_FOR_CELLS);
 716     } else if (!socket_can_write(&socket_table, chan)) {
 717
 718       /* Case 3: cells to send, but cannot write */
 719
 720       /*
 721        * We want to write, but can't. If we left the channel in
 722        * channels_pending, we would never exit the scheduling loop. We need to
 723        * add it to a temporary list of channels to be added to channels_pending
 724        * after the scheduling loop is over. They can hopefully be taken care of
 725        * in the next scheduling round.
 726        */
 727       if (!to_readd) {
 728         to_readd = smartlist_new();
 729       }
 730       smartlist_add(to_readd, chan);
 731     } else {
 732
 733       /* Case 4: cells to send, and still open for writes */
 734
 735       scheduler_set_channel_state(chan, SCHED_CHAN_PENDING);
 736       if (!SCHED_BUG(chan->sched_heap_idx != -1, chan)) {
 737         smartlist_pqueue_add(cp, scheduler_compare_channels,
 738                              offsetof(channel_t, sched_heap_idx), chan);
 739       }
 740     }
 741   } /* End of main scheduling loop */
 742
 743   /* Write the outbuf of any channels that still have data */
 744   HT_FOREACH_FN(outbuf_table_s, &outbuf_table, each_channel_write_to_kernel,
 745                 NULL);
 746   /* We are done with it. */
 747   HT_FOREACH_FN(outbuf_table_s, &outbuf_table, free_outbuf_info_by_ent, NULL);
 748   HT_CLEAR(outbuf_table_s, &outbuf_table);
 749
 750   log_debug(LD_SCHED, "len pending=%d, len to_readd=%d",
 751             smartlist_len(cp),
 752             (to_readd ? smartlist_len(to_readd) : -1));
 753
 754   /* Re-add any channels we need to */
 755   if (to_readd) {
 756     SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, readd_chan) {
 757       scheduler_set_channel_state(readd_chan, SCHED_CHAN_PENDING);
 758       if (!smartlist_contains(cp, readd_chan)) {
 759         if (!SCHED_BUG(readd_chan->sched_heap_idx != -1, readd_chan)) {
 760           /* XXXX Note that the check above is in theory redundant with
 761            * the smartlist_contains check.  But let's make sure we're
 762            * not messing anything up, and leave them both for now. */
 763           smartlist_pqueue_add(cp, scheduler_compare_channels,
 764                              offsetof(channel_t, sched_heap_idx), readd_chan);
 765         }
 766       }
 767     } SMARTLIST_FOREACH_END(readd_chan);
 768     smartlist_free(to_readd);
 769   }
 770
 771   monotime_get(&scheduler_last_run);
 772 }
 773
 774 /*****************************************************************************
 775  * Externally called function implementations not called through scheduler_t
 776  *****************************************************************************/
 777
 778 /* Stores the kist scheduler function pointers. */
 779 static scheduler_t kist_scheduler = {
 780   .type = SCHEDULER_KIST,
 781   .free_all = kist_free_all,
 782   .on_channel_free = kist_on_channel_free_fn,
 783   .init = kist_scheduler_init,
 784   .on_new_consensus = kist_scheduler_on_new_consensus,
 785   .schedule = kist_scheduler_schedule,
 786   .run = kist_scheduler_run,
 787   .on_new_options = kist_scheduler_on_new_options,
 788 };
 789
 790 /* Return the KIST scheduler object. If it didn't exists, return a newly
 791  * allocated one but init() is not called. */
 792 scheduler_t *
 793 get_kist_scheduler(void)
 794 {
 795   return &kist_scheduler;
 796 }
 797
 798 /* Check the torrc (and maybe consensus) for the configured KIST scheduler run
 799  * interval.
 800  * - If torrc > 0, then return the positive torrc value (should use KIST, and
 801  *   should use the set value)
 802  * - If torrc == 0, then look in the consensus for what the value should be.
 803  *   - If == 0, then return 0 (don't use KIST)
 804  *   - If > 0, then return the positive consensus value
 805  *   - If consensus doesn't say anything, return 10 milliseconds, default.
 806  */
 807 int
 808 kist_scheduler_run_interval(void)
 809 {
 810   int run_interval = get_options()->KISTSchedRunInterval;
 811
 812   if (run_interval != 0) {
 813     log_debug(LD_SCHED, "Found KISTSchedRunInterval=%" PRId32 " in torrc. "
 814                         "Using that.", run_interval);
 815     return run_interval;
 816   }
 817
 818   log_debug(LD_SCHED, "KISTSchedRunInterval=0, turning to the consensus.");
 819
 820   /* Clients and relays have a separate consensus parameter. Clients
 821    * need a lower KIST interval, since they have only a couple connections */
 822   if (server_mode(get_options())) {
 823     return networkstatus_get_param(NULL, "KISTSchedRunInterval",
 824                                  KIST_SCHED_RUN_INTERVAL_DEFAULT,
 825                                  KIST_SCHED_RUN_INTERVAL_MIN,
 826                                  KIST_SCHED_RUN_INTERVAL_MAX);
 827   } else {
 828     return networkstatus_get_param(NULL, "KISTSchedRunIntervalClient",
 829                                  KIST_SCHED_RUN_INTERVAL_DEFAULT,
 830                                  KIST_SCHED_RUN_INTERVAL_MIN,
 831                                  KIST_SCHED_RUN_INTERVAL_MAX);
 832   }
 833 }
 834
 835 /* Set KISTLite mode that is KIST without kernel support. */
 836 void
 837 scheduler_kist_set_lite_mode(void)
 838 {
 839   kist_lite_mode = 1;
 840   kist_scheduler.type = SCHEDULER_KIST_LITE;
 841   log_info(LD_SCHED,
 842            "Setting KIST scheduler without kernel support (KISTLite mode)");
 843 }
 844
 845 /* Set KIST mode that is KIST with kernel support. */
 846 void
 847 scheduler_kist_set_full_mode(void)
 848 {
 849   kist_lite_mode = 0;
 850   kist_scheduler.type = SCHEDULER_KIST;
 851   log_info(LD_SCHED,
 852            "Setting KIST scheduler with kernel support (KIST mode)");
 853 }
 854
 855 #ifdef HAVE_KIST_SUPPORT
 856
 857 /* Return true iff the scheduler subsystem should use KIST. */
 858 int
 859 scheduler_can_use_kist(void)
 860 {
 861   if (kist_no_kernel_support) {
 862     /* We have no kernel support so we can't use KIST. */
 863     return 0;
 864   }
 865
 866   /* We do have the support, time to check if we can get the interval that the
 867    * consensus can be disabling. */
 868   int run_interval = kist_scheduler_run_interval();
 869   log_debug(LD_SCHED, "Determined KIST sched_run_interval should be "
 870                       "%" PRId32 ". Can%s use KIST.",
 871            run_interval, (run_interval > 0 ? "" : " not"));
 872   return run_interval > 0;
 873 }
 874
 875 #else /* !defined(HAVE_KIST_SUPPORT) */
 876
 877 int
 878 scheduler_can_use_kist(void)
 879 {
 880   return 0;
 881 }
 882
 883 #endif /* defined(HAVE_KIST_SUPPORT) */