perl/lib/Thrasher/ConnectionManager.pm

   1 package Thrasher::ConnectionManager;
   2 use strict;
   3 use warnings;
   4
   5 =pod
   6
   7 =head1 NAME
   8
   9 Thrasher::ConnectionManager - manage the act of connecting to the
  10 legacy services
  11
  12 =head1 DESCRIPTION
  13
  14 Thrasher::ConnectionManager centralizes and separates the management
  15 of whether or not we want to connect to a legacy server. Likely this
  16 will get more complicated later.
  17
  18 Suppose we have a couple hundred users on the gateway when the
  19 connection is lost to the legacy service, that is, I<entirely> lost.
  20 Later, it comes back. We don't want to hammer the service with
  21 repeated connection attempts from each user.
  22
  23 We also want to be able to control the rate at which re-connection
  24 attempts are made. If someone tries to log on during the period when
  25 the connection is down, we don't really want them to add to the
  26 failure proceedings.
  27
  28 ::ConnectionManager implements the ability to schedule connection
  29 attempts, collect information about whether the logins are successful
  30 or not, and decide when to attempt to log in again.
  31
  32 This module sort of functions as a singleton (sort of), exporting the
  33 following function:
  34
  35 =over 4
  36
  37 =item *
  38
  39 C<request_connect>($connection_closure): Request that the given
  40 closure be run at some point in the future. This closure B<must>
  41 eventually result in a call to C<connection_success> or
  42 C<connection_failure>, which may itself arise from a callback
  43 or something (like for a connection), which is why we can't
  44 just use the return value. The connection manager has no
  45 way to enforce this, but the queue will be scheduled improperly
  46 if it does not occur.
  47
  48 A connection is successful if the user successfully logged on.
  49 That is, a password failure is a failure too. This avoids the
  50 case where we could hammer a service that was having
  51 authentication problems.
  52
  53 =back
  54
  55 =cut
  56
  57 use Carp qw(confess);
  58
  59 our $NOW = undef;
  60
  61 # Setting this to 1 will simply bypass this entire thing.
  62 our $IGNORE_CONNECTION_MANAGER = 0;
  63
  64 use base 'Exporter';
  65 our @EXPORT_OK = qw(decayable connection_success connection_failure
  66                     request_connect schedule_request);
  67 our %EXPORT_TAGS = (all => \@EXPORT_OK);
  68
  69 use Thrasher::Log qw(log debug);
  70 require Thrasher::Component;
  71
  72 sub decayable {
  73     return new Thrasher::ConnectionManager::Decayable(@_);
  74 }
  75
  76 # So, the design goals:
  77 # * If there's no reason to suspect network troubles, connect away.
  78 # * If there's a total network disconnect, a bare minimum of
  79 #   connection attempts should be made, not a straight
  80 #   (number of logins) * (constant frequency) * time
  81 # * It can't take too long to recover from a total network failure.
  82 #   Even for hundreds of people, it should recover relatively quickly.
# * If at all possible, this should adapt to rate limiting. (It may
#   not be possible if the legacy service simply cuts off your
#   service, rather than metering it.)
  86 #
  87 # Approach: We define a "decayable number", which exponentially
  88 # decays over time, as seen in
  89 # Thrasher::ConnectionManager::Decayable. As successful connections
  90 # are made, the count of successful connections is incremented. As
  91 # unsuccessful connections are made, the count of unsuccessful
  92 # connections is incremented. The ratio of the two is used to
  93 # determine connection time, bounded on both ends by a limit
  94 # (if it's below a certain number, we simply connect immediately,
  95 # if it's above a certain number we assume it needs to be bounded
  96 # to something like one attempt every 30 seconds). We also, after
  97 # a certain point, start queuing up connection requests. This way, the
  98 # system reacts to the current apparent state of the connection
  99 # to the legacy service without hammering on the legacy system,
 100 # which may be considered hostile.
 101
 102
### TUNABLE PARAMETERS

# This is the base decay rate (per minute, see
# Thrasher::ConnectionManager::Decayable) applied to our two counters:
# directly to the failure counter and indirectly, softened by
# $optimism, to the success counter.
our $decay_rate = .65;

# Optimism is how much more likely we are to think that we can
# connect again. Values above 1 make the success counter decay more
# slowly than the failure counter, which causes the system to
# "remember" success more strongly than failure.
our $optimism = 3;

# Natural logarithm of 30. schedule_request() caps the delay at 30
# seconds once the failure/success ratio exceeds this. CORE::log is
# spelled out because "log" is imported from Thrasher::Log above.
my $log30 = CORE::log(30);

### Values

# Decaying tallies of connection outcomes. $success starts at 5 and
# decays at 1 - ((1 - $decay_rate) / $optimism); $failure starts at 0
# and decays at $decay_rate.
our $success = decayable(5, 1 - ((1-$decay_rate) / $optimism));
our $failure = decayable(0, $decay_rate);

# $SIMULTANEOUS_CONNECT_BOUND's initial value sets the maximum number
# of simultaneous, possibly asynchronous, connection attempts. It will
# be decremented each time a connection attempt starts and incremented
# when it finishes. Thus, at any given time while the
# ConnectionManager is managing it will be the current number of new
# connect attempts that could start at that time.
# The bound is only enforced when $USE_SIMULTANEOUS_CONNECT_BOUND is
# true; the counter itself is maintained either way.
our $USE_SIMULTANEOUS_CONNECT_BOUND = 0;
our $SIMULTANEOUS_CONNECT_BOUND = 2;

# $hammering (below) is incremented once per connection attempt and is
# used to limit the total rate of reconnect attempts; $hammering_limit
# is the threshold it is compared against.
# Please, Thrasher, don't hammer 'em.
our $hammering_limit = 15;
# "After about $SIMULTANEOUS_CONNECT_BOUND connections, $hammering
# should be strictly greater than $hammering_limit":
our $hammering = decayable(($hammering_limit + 1 - $SIMULTANEOUS_CONNECT_BOUND),
                           .02);

# Connection closures queued up because we can't try to connect yet.
# request_connect() pushes onto it; connection_executor() shifts off it.
my @queue;

# Time (same clock as $NOW || time) at which the most recently
# scheduled executor run is due. Set by schedule_connection_executor()
# and consulted by request_connect(); undef until the first scheduling.
my $next_connection_attempt_time;
 144
 145 # This takes the clousure to run and the time to run it. This
 146 # allows you to either call out to the event loop, or, in testing,
 147 # override this and capture the execution attempts to verify
 148 # the time is correct.
 149 our $scheduler = sub {
 150     my $closure = shift;
 151     my $timeout = shift;
 152
 153     return $Thrasher::Component::COMPONENT->{'event_loop'}->schedule($closure,
 154                                                                      $timeout);
 155 };
 156
 157 # "If I make a request right now, what will the delay and
 158 # forcibly_enqueue values be?"
 159 sub schedule_request {
 160     # Check that we aren't hammering the remote server.
 161     if ($hammering->value > $hammering_limit) {
 162         log("Hammering: " . $hammering->value . " of $hammering_limit."
 163             ." Delaying connection request.");
 164         return [0, 1];
 165     }
 166
 167     my $current_success = $success->value;
 168     my $current_failure = $failure->value;
 169
 170     if ($current_failure < 0.0001) {
 171         log("Never failed, immediately connecting.");
 172         return [0, 0];
 173     }
 174     if ($current_success < 0.00001 && $current_failure > 0.00001) {
 175         log("Can't remember having had failure or success, immediately connecting.");
 176         return [0, 1];
 177     }
 178     if ($current_success < 0.0001) {
 179         log("Can't remember having succeeded ($current_success), "
 180             ."but I remember failure ($current_failure). Max delay.");
 181         return [30, 0];
 182     }
 183
 184     my $ratio = $current_failure / $current_success;
 185
 186     if ($ratio > $log30) {
 187         log("Failure/success ratio: $ratio, ($current_failure / $current_success), capped to 30 second delay");
 188         return [30, 0];
 189     }
 190
 191     my $delay = exp($ratio) - 1;
 192     log("Failure/success ratio: $ratio ($current_failure / $current_success), comes out to $delay for delay");
 193     return [$delay, 0];
 194 }
 195
 196 # This returns true if the connection will be immediate,
 197 # otherwise it returns false.
 198 sub request_connect {
 199     my $connection_closure = shift;
 200
 201     # Deal with this when we once again reschedule things.
 202     if (defined($next_connection_attempt_time) &&
 203         ($NOW || time) < $next_connection_attempt_time) {
 204         push @queue, $connection_closure;
 205         return 0;
 206     }
 207
 208     if ($USE_SIMULTANEOUS_CONNECT_BOUND
 209           && $SIMULTANEOUS_CONNECT_BOUND <= 0) {
 210         log('Enough connect attempts in progress; delaying.');
 211         push(@queue, $connection_closure);
 212         schedule_connection_executor(1);
 213         return 0;
 214     }
 215     elsif (!@queue) {
 216         my ($delay, $forcibly_enqueue) = @{schedule_request()};
 217
 218         # Catch this in the next scheduling run.
 219         if ($forcibly_enqueue) {
 220             push @queue, $connection_closure;
 221             schedule_connection_executor(1);
 222             return 0;
 223         }
 224
 225         # Little to no delay called for, just go for it
 226         # (leaning on hammering protection to prevent hammering)
 227         if ($delay < 1) {
 228             service_request($connection_closure);
 229             return 1;
 230         }
 231
 232         # We're not in a hammering delay and we need to
 233         # reschedule to delay beyond a second
 234         push @queue, $connection_closure;
 235         schedule_connection_executor($delay);
 236         return 0;
 237     } else {
 238         push @queue, $connection_closure;
 239         return 0;
 240     }
 241 }
 242
 243 # schedule_connection_executor($timeout_in_seconds)
 244 my $run_scheduled = 0;
 245 sub schedule_connection_executor {
 246     my $timeout = shift;
 247
 248     if ($run_scheduled) {
 249         # request_connect() called in between scheduled
 250         # execution_runner() calls.
 251         return;
 252     }
 253
 254     $next_connection_attempt_time = ($NOW || time) + $timeout;
 255
 256     my $execution_runner = sub {
 257         log(scalar(@queue) . ' connection attempts queued.');
 258         if ($USE_SIMULTANEOUS_CONNECT_BOUND
 259               && @queue && $SIMULTANEOUS_CONNECT_BOUND <= 0) {
 260             debug('Enough connect attempts in progress (not rescheduled).');
 261             return 1;
 262         }
 263
 264         my $new_timeout = eval {
 265             return connection_executor();
 266         };
 267         if ($@) {
 268             $run_scheduled = 0;
 269             die;
 270         }
 271         elsif (! $new_timeout) {
 272             debug("Executor done.");
 273             $run_scheduled = 0;
 274             return 0;
 275         }
 276         elsif ($new_timeout == $timeout) {
 277             debug("Next executor in $timeout seconds (not rescheduled).");
 278             return 1;
 279         }
 280         else {
 281             debug("Next executor in $new_timeout seconds (rescheduled).");
 282             $run_scheduled = 0;
 283             schedule_connection_executor($new_timeout);
 284             return 0;
 285         }
 286     };
 287
 288     $run_scheduled = $scheduler->($execution_runner,
 289                                   $timeout * 1000);
 290 }
 291
 292 # connection_executor() is what is run by the scheduler, which will
 293 # propogate itself until it runs out of work.
 294 #
 295 # Returns the new $timeout_in_seconds until the desired next run, or
 296 # false when no next run should be scheduled.
 297 sub connection_executor {
 298     # If we're still in a hammering lock, delay another second
 299     if ($hammering->value >= $hammering_limit) {
 300         return 1;
 301     }
 302
 303     if (!@queue) {
 304         # We've finally emptied the queue. Resume normal usage.
 305         return 0;
 306     }
 307
 308     elsif ($USE_SIMULTANEOUS_CONNECT_BOUND
 309              && $SIMULTANEOUS_CONNECT_BOUND <= 0) {
 310         # Delay repeat until some connection attempts finish.
 311         return 1;
 312     }
 313
 314     # Otherwise, it's time to take one thing off the queue and run it.
 315     my $current_closure = shift @queue;
 316     service_request($current_closure);
 317
 318     # And now it's time to schedule the next request
 319     my ($delay, $forcibly_enqueue) = @{schedule_request()};
 320     if ($forcibly_enqueue) {
 321         return 1;
 322     }
 323
 324     if ($delay < 1) {
 325         goto &connection_executor;
 326     }
 327
 328     # Otherwise, reschedule $delay seconds
 329     return $delay;
 330 }
 331
 332 sub service_request {
 333     my $closure = shift;
 334     $hammering->add(1);
 335     local $@;
 336
 337     if (!defined($closure)) {
 338         confess "Undefined closure passed to service_request.";
 339     }
 340     log("Servicing connection request.");
 341     my $orig_simultaneous_connect_bound = $SIMULTANEOUS_CONNECT_BOUND;
 342     $SIMULTANEOUS_CONNECT_BOUND--;
 343     eval {
 344         $closure->();
 345     };
 346     if ($@) {
 347         log("Failure in connection request closure: $@");
 348
 349         # $closure may or may not have called
 350         # connection_{success,failure} before dying. Reset the bound
 351         # (instead of incrementing) in case it did.
 352         $SIMULTANEOUS_CONNECT_BOUND = $orig_simultaneous_connect_bound;
 353     }
 354     # Else *rely* on $closure to increment $SIMULTANEOUS_CONNECT_BOUND.
 355     # Can't detect here if connection_{success,failure} will be called
 356     # later or synchronously.
 357 }
 358
 359 # connection_failure($local_only)
 360 #
 361 # Update tracking information to note that a connection succeeded.
 362 sub connection_success {
 363     $SIMULTANEOUS_CONNECT_BOUND++;
 364
 365     debug('Connection success managed.');
 366     $success->add(1);
 367 }
 368
 369 # connection_failure($local_only)
 370 #
 371 # Update tracking information to note that a connection failed. Set
 372 # $local_only to indicate that the failure did not involve the remote
 373 # service and thus shouldn't influence the load scheduling.
 374 sub connection_failure {
 375     my ($local_only) = @_;
 376
 377     $SIMULTANEOUS_CONNECT_BOUND++;
 378
 379     debug('Connection failure managed.');
 380     if (! $local_only) {
 381         my $current_schedule = schedule_request;
 382         if ($current_schedule->[0] != 30) {
 383             $failure->add(10);
 384         }
 385         else {
 386             log("Declining to increment failure.");
 387         }
 388     }
 389 }
 390
 391 package Thrasher::ConnectionManager::Decayable;
 392 use strict;
 393 use warnings;
 394
 395 # This implements a "decayable" number. It starts with the value
 396 # you give it, and is multiplied by the decay rate raise to the
 397 # power of the number of minutes since the last time it was queried.
 398 # The fact that this tends to decay to zero is desired.
 399 # Example: new Thrasher::ConnectionManager::Decayable(16, .5)
 400 # will be 8 in one minute, 4 in the next, 2 in the next, and so on.
 401 # The transition is smooth; it will be 11.3137... after 30 seconds.
 402
 403 use Time::HiRes qw(time tv_interval);
 404
 405 sub new {
 406     my $class = shift;
 407     my $current_value = shift;
 408     my $decay_rate = shift;
 409     my $now = $Thrasher::ConnectionManager::NOW || scalar(time);
 410
 411     my $decayable = { value => $current_value,
 412                       decay_rate => $decay_rate,
 413                       last_computed => $now };
 414     bless $decayable, $class;
 415
 416     return $decayable;
 417 }
 418
 419 sub value {
 420     my $self = shift;
 421     my $now = $Thrasher::ConnectionManager::NOW || scalar(time);
 422
 423     my $since_last_computation = $now - $self->{last_computed};
 424     my $minutes = $since_last_computation / 60;
 425
 426     my $decay = $self->{decay_rate} ** $minutes;
 427     my $current_value = $self->{value} * $decay;
 428
 429     $self->{value} = $current_value;
 430     $self->{last_computed} = $now;
 431
 432     return $self->{value};
 433 }
 434
 435 sub add {
 436     my $self = shift;
 437     my $inc = shift;
 438
 439     my $value = $self->value;
 440     $self->{value} = $value + $inc;
 441
 442     return $self->{value};
 443 }
 444
 445 sub subtract {
 446     my $self = shift;
 447     my $dec = shift;
 448     $self->add(-$dec);
 449 }
 450
 451 1;