ConnectionManager: Disable hard limit in favor of twiddled hammering values.
[thrasher.git] / perl / lib / Thrasher / ConnectionManager.pm
blob53140af0cbc69ddd2066eafb4296580cc03f5d19
1 package Thrasher::ConnectionManager;
2 use strict;
3 use warnings;
5 =pod
7 =head1 NAME
9 Thrasher::ConnectionManager - manage the act of connecting to the
10 legacy services
12 =head1 DESCRIPTION
14 Thrasher::ConnectionManager centralizes and separates the management
15 of whether or not we want to connect to a legacy server. Likely this
16 will get more complicated later.
18 Suppose we have a couple hundred users on the gateway when the
19 connection is lost to the legacy service, that is, I<entirely> lost.
20 Later, it comes back. We don't want to hammer the service with
21 repeated connection attempts from each user.
23 We also want to be able to control the rate at which re-connection
24 attempts are made. If someone tries to log on during the period when
25 the connection is down, we don't really want them to add to the
26 failure proceedings.
28 ::ConnectionManager implements the ability to schedule connection
29 attempts, collect information about whether the logins are successful
30 or not, and decide when to attempt to log in again.
32 This module sort of functions as a singleton (sort of), exporting the
33 following function:
35 =over 4
37 =item *
39 C<request_connect>($connection_closure): Request that the given
40 closure be run at some point in the future. This closure B<must>
41 eventually result in a call to C<connection_success> or
42 C<connection_failure>, which may itself arise from a callback
43 or something (like for a connection), which is why we can't
44 just use the return value. The connection manager has no
45 way to enforce this, but the queue will be scheduled improperly
46 if it does not occur.
48 A connection is successful if the user successfully logged on.
49 That is, a password failure is a failure too. This avoids the
50 case where we could hammer a service that was having
51 authentication problems.
53 =back
55 =cut
57 use Carp qw(confess);
59 our $NOW = undef;
61 # Setting this to 1 will simply bypass this entire thing.
62 our $IGNORE_CONNECTION_MANAGER = 0;
64 use base 'Exporter';
65 our @EXPORT_OK = qw(decayable connection_success connection_failure
66 request_connect schedule_request);
67 our %EXPORT_TAGS = (all => \@EXPORT_OK);
69 use Thrasher::Log qw(log debug);
70 require Thrasher::Component;
# decayable($initial_value, $decay_rate)
#
# Convenience constructor: forwards all of its arguments to
# Thrasher::ConnectionManager::Decayable->new and returns the new object.
#
# Uses the explicit Class->method form rather than indirect-object syntax
# ("new Class(...)"), whose parse depends on which subs happen to be in
# scope at compile time.
sub decayable {
    return Thrasher::ConnectionManager::Decayable->new(@_);
}
76 # So, the design goals:
77 # * If there's no reason to suspect network troubles, connect away.
78 # * If there's a total network disconnect, a bare minimum of
79 # connection attempts should be made, not a straight
80 # (number of logins) * (constant frequency) * time
81 # * It can't take too long to recover from a total network failure.
82 # Even for hundreds of people, it should recover relatively quickly.
83 # * If at all possible, this should adapt to rate limiting. (It may
84 # not be possible if the legacy service simply cuts off your
85 # service, rather than metering it.)
87 # Approach: We define a "decayable number", which exponentially
88 # decays over time, as seen in
89 # Thrasher::ConnectionManager::Decayable. As successful connections
90 # are made, the count of successful connections is incremented. As
91 # unsuccessful connections are made, the count of unsuccessful
92 # connections is incremented. The ratio of the two is used to
93 # determine connection time, bounded on both ends by a limit
94 # (if it's below a certain number, we simply connect immediately,
95 # if it's above a certain number we assume it needs to be bounded
96 # to something like one attempt every 30 seconds). We also, after
97 # a certain point, start queuing up connection requests. This way, the
98 # system reacts to the current apparent state of the connection
99 # to the legacy service without hammering on the legacy system,
100 # which may be considered hostile.
### TUNABLE PARAMETERS

# Base decay rate for the two counters below. The failure counter decays
# at exactly this rate; the success counter's rate is derived from it
# (see $optimism).
our $decay_rate = .65;

# Optimism is how much more likely we are to think that we can connect
# again. It divides the success counter's decay (1 - $decay_rate), so
# values above 1 make the success counter decay more slowly than the
# failure counter: the system "remembers" success more strongly than
# failure.
our $optimism = 3;

# Natural log of 30: the failure/success ratio above which
# schedule_request() caps the delay at 30 seconds. CORE::log is spelled
# out because a log() function is imported from Thrasher::Log.
my $log30 = CORE::log(30);

### Values

# Decaying tallies of successful and failed connection attempts. The
# success tally starts at 5, the failure tally at 0.
our $success = decayable(5, 1 - (1 - $decay_rate) / $optimism);
our $failure = decayable(0, $decay_rate);
# The initial value of $SIMULTANEOUS_CONNECT_BOUND is the maximum number
# of simultaneous (possibly asynchronous) connection attempts. It is
# decremented when an attempt starts and incremented when one finishes,
# so at any moment it holds the number of new attempts that could still
# be started. The bound is only enforced when
# $USE_SIMULTANEOUS_CONNECT_BOUND is true.
our $USE_SIMULTANEOUS_CONNECT_BOUND = 0;
our $SIMULTANEOUS_CONNECT_BOUND     = 2;

# $hammering is bumped once per connection attempt and compared against
# $hammering_limit to cap the overall rate of reconnect attempts.
# Please, Thrasher, don't hammer 'em.
our $hammering_limit = 15;

# Seeded so that "after about $SIMULTANEOUS_CONNECT_BOUND connections,
# $hammering should be strictly greater than $hammering_limit".
our $hammering = decayable($hammering_limit + 1 - $SIMULTANEOUS_CONNECT_BOUND,
                           .02);

# Connection closures waiting because we can't try to connect yet.
my @queue;

# Time (in seconds, same clock as $NOW || time) before which new requests
# are queued instead of being considered immediately; undef until the
# executor has been scheduled at least once.
my $next_connection_attempt_time;
# $scheduler->($closure, $timeout)
#
# Takes the closure to run and the timeout to run it after, and hands
# both to the component's event loop, returning whatever the event
# loop's schedule() returns. It is a package variable so that tests can
# override it and capture the scheduling attempts to verify that the
# requested times are correct.
our $scheduler = sub {
    my ($closure, $timeout) = @_;

    my $event_loop = $Thrasher::Component::COMPONENT->{'event_loop'};
    return $event_loop->schedule($closure, $timeout);
};
# schedule_request()
#
# Answers: "If I make a request right now, what will the delay and
# forcibly_enqueue values be?"
#
# Returns an array reference [$delay_in_seconds, $forcibly_enqueue].
# Reading the counters applies their decay, and every path logs the
# decision that was taken.
sub schedule_request {
    # Hammering guard: refuse to add to an already excessive attempt rate.
    if ($hammering->value > $hammering_limit) {
        my $message = 'Hammering: ' . $hammering->value
            . " of $hammering_limit. Delaying connection request.";
        log($message);
        return [0, 1];
    }

    my $succeeded = $success->value;
    my $failed    = $failure->value;

    if ($failed < 0.0001) {
        log("Never failed, immediately connecting.");
        return [0, 0];
    }

    # NOTE(review): the log text talks about remembering neither failure
    # nor success, but this branch requires a non-trivial failure count
    # and asks for a forced enqueue -- confirm the intended condition.
    if ($succeeded < 0.00001 && $failed > 0.00001) {
        log("Can't remember having had failure or success, immediately connecting.");
        return [0, 1];
    }

    if ($succeeded < 0.0001) {
        log("Can't remember having succeeded ($succeeded), "
            . "but I remember failure ($failed). Max delay.");
        return [30, 0];
    }

    my $ratio = $failed / $succeeded;

    # exp($ratio) - 1 grows quickly; past log(30) just use the maximum.
    if ($ratio > $log30) {
        log("Failure/success ratio: $ratio, ($failed / $succeeded), capped to 30 second delay");
        return [30, 0];
    }

    my $delay = exp($ratio) - 1;
    log("Failure/success ratio: $ratio ($failed / $succeeded), comes out to $delay for delay");
    return [$delay, 0];
}
# request_connect($connection_closure)
#
# Ask for $connection_closure to be run, either right now or later from
# the queue. Returns 1 when the closure was serviced immediately and 0
# when it was queued.
sub request_connect {
    my ($connection_closure) = @_;

    # An executor run is already pending for a later time; let it pick
    # this request up when it reschedules things.
    if (defined($next_connection_attempt_time)
        && ($NOW || time) < $next_connection_attempt_time) {
        push @queue, $connection_closure;
        return 0;
    }

    # Too many attempts in flight: queue, and look again in a second.
    if ($USE_SIMULTANEOUS_CONNECT_BOUND && $SIMULTANEOUS_CONNECT_BOUND <= 0) {
        log('Enough connect attempts in progress; delaying.');
        push @queue, $connection_closure;
        schedule_connection_executor(1);
        return 0;
    }

    # Others are already waiting; simply join the back of the queue.
    if (@queue) {
        push @queue, $connection_closure;
        return 0;
    }

    my ($delay, $forcibly_enqueue) = @{ schedule_request() };

    # Little to no delay called for, just go for it (leaning on hammering
    # protection to prevent hammering).
    if (!$forcibly_enqueue && $delay < 1) {
        service_request($connection_closure);
        return 1;
    }

    # Either we were told to enqueue (caught by the next scheduling run,
    # one second from now) or the computed delay is a second or more.
    push @queue, $connection_closure;
    schedule_connection_executor($forcibly_enqueue ? 1 : $delay);
    return 0;
}
# schedule_connection_executor($timeout_in_seconds)
#
# Arrange for the queue to be worked on $timeout_in_seconds from now,
# unless a run is already scheduled. $run_scheduled holds whatever the
# scheduler returned for the pending run and is reset to 0 whenever the
# runner stops or reschedules itself.
my $run_scheduled = 0;
sub schedule_connection_executor {
    my ($timeout) = @_;

    # request_connect() called in between scheduled execution_runner()
    # calls: the pending run will take care of it.
    return if $run_scheduled;

    $next_connection_attempt_time = ($NOW || time) + $timeout;

    # The runner returns 1 to be run again at the same interval and 0
    # when it is finished (or has rescheduled itself at a new interval).
    my $execution_runner = sub {
        log(scalar(@queue) . ' connection attempts queued.');

        my $bound_exhausted = $USE_SIMULTANEOUS_CONNECT_BOUND
            && @queue
            && $SIMULTANEOUS_CONNECT_BOUND <= 0;
        if ($bound_exhausted) {
            debug('Enough connect attempts in progress (not rescheduled).');
            return 1;
        }

        my $new_timeout = eval { connection_executor() };
        if ($@) {
            # Clear the flag so a later request can schedule a fresh
            # run, then re-throw the executor's error.
            $run_scheduled = 0;
            die;
        }

        if (!$new_timeout) {
            debug("Executor done.");
            $run_scheduled = 0;
            return 0;
        }

        if ($new_timeout == $timeout) {
            debug("Next executor in $timeout seconds (not rescheduled).");
            return 1;
        }

        debug("Next executor in $new_timeout seconds (rescheduled).");
        $run_scheduled = 0;
        schedule_connection_executor($new_timeout);
        return 0;
    };

    # The scheduler is handed the timeout multiplied by 1000.
    $run_scheduled = $scheduler->($execution_runner, $timeout * 1000);
}
# connection_executor() is what is run by the scheduler, which will
# propagate itself until it runs out of work.
#
# Returns the new $timeout_in_seconds until the desired next run, or
# false when no next run should be scheduled.
sub connection_executor {
    # Keep servicing queued requests back to back for as long as the
    # computed delay stays under a second.
    while (1) {
        # If we're still in a hammering lock, delay another second.
        return 1 if $hammering->value >= $hammering_limit;

        # We've finally emptied the queue. Resume normal usage.
        return 0 unless @queue;

        # Delay repeat until some connection attempts finish.
        return 1
            if $USE_SIMULTANEOUS_CONNECT_BOUND
            && $SIMULTANEOUS_CONNECT_BOUND <= 0;

        # Otherwise, take one thing off the queue and run it.
        my $next_closure = shift @queue;
        service_request($next_closure);

        # And now work out when the next request may go.
        my ($delay, $forcibly_enqueue) = @{ schedule_request() };
        return 1 if $forcibly_enqueue;

        # A second or more: hand the delay back to be rescheduled.
        return $delay unless $delay < 1;
    }
}
# service_request($closure)
#
# Run one connection closure right now. The attempt is counted against
# the hammering counter and takes one slot of
# $SIMULTANEOUS_CONNECT_BOUND.
#
# Dies (via confess) if $closure is undefined. An exception thrown by
# the closure itself is logged and swallowed.
sub service_request {
    my ($closure) = @_;

    # Validate before touching any state: a caller bug must not be
    # counted as a connection attempt, and the error must be raised
    # before $@ is localized so that it reliably reaches the caller's
    # eval (older perls could lose it when "local $@" was unwound).
    if (!defined($closure)) {
        confess "Undefined closure passed to service_request.";
    }

    $hammering->add(1);
    log("Servicing connection request.");

    my $orig_simultaneous_connect_bound = $SIMULTANEOUS_CONNECT_BOUND;
    $SIMULTANEOUS_CONNECT_BOUND--;

    local $@;
    eval {
        $closure->();
    };
    if ($@) {
        log("Failure in connection request closure: $@");

        # $closure may or may not have called
        # connection_{success,failure} before dying. Reset the bound
        # (instead of incrementing) in case it did.
        $SIMULTANEOUS_CONNECT_BOUND = $orig_simultaneous_connect_bound;
    }

    # Else *rely* on $closure to increment $SIMULTANEOUS_CONNECT_BOUND.
    # Can't detect here if connection_{success,failure} will be called
    # later or synchronously.
}
# connection_success()
#
# Update tracking information to note that a connection succeeded: the
# attempt's slot in $SIMULTANEOUS_CONNECT_BOUND is given back and the
# success counter is incremented by one. Returns the success counter's
# new value.
sub connection_success {
    $SIMULTANEOUS_CONNECT_BOUND += 1;

    debug('Connection success managed.');
    return $success->add(1);
}
# connection_failure($local_only)
#
# Update tracking information to note that a connection failed. Pass a
# true $local_only to indicate that the failure did not involve the
# remote service and thus shouldn't influence the load scheduling.
sub connection_failure {
    my $local_only = shift;

    # The attempt is over either way, so give its slot back.
    $SIMULTANEOUS_CONNECT_BOUND += 1;

    debug('Connection failure managed.');
    if (!$local_only) {
        my ($current_delay) = @{ schedule_request() };

        # Once the schedule already asks for the 30 second delay, stop
        # adding to the failure counter.
        if ($current_delay == 30) {
            log("Declining to increment failure.");
        }
        else {
            $failure->add(10);
        }
    }
}
package Thrasher::ConnectionManager::Decayable;
use strict;
use warnings;

# This implements a "decayable" number. It starts with the value
# you give it, and is multiplied by the decay rate raised to the
# power of the number of minutes since the last time it was queried.
# The fact that this tends to decay to zero is desired.
# Example: Thrasher::ConnectionManager::Decayable->new(16, .5)
# will be 8 in one minute, 4 in the next, 2 in the next, and so on.
# The transition is smooth; it will be 11.3137... after 30 seconds.
#
# The current time is taken from $Thrasher::ConnectionManager::NOW when
# that is set to a true value, otherwise from Time::HiRes::time.

use Time::HiRes qw(time tv_interval);

# new($initial_value, $decay_rate)
#
# Build a decayable number holding $initial_value as of now, decaying by
# a factor of $decay_rate per minute.
sub new {
    my ($class, $current_value, $decay_rate) = @_;
    my $now = $Thrasher::ConnectionManager::NOW || scalar(time);

    my $decayable = { value         => $current_value,
                      decay_rate    => $decay_rate,
                      last_computed => $now };
    return bless $decayable, $class;
}

# value()
#
# Apply the decay accumulated since the last computation, store the
# result together with the current time, and return the decayed value.
sub value {
    my $self = shift;
    my $now = $Thrasher::ConnectionManager::NOW || scalar(time);

    my $since_last_computation = $now - $self->{last_computed};

    # If "now" is earlier than the last computation (the clock stepped
    # back, or $NOW was switched on after the object was created), a
    # negative exponent would inflate the value instead of decaying it,
    # and can turn a zero value into NaN. Treat it as no time elapsed.
    $since_last_computation = 0 if $since_last_computation < 0;

    my $minutes = $since_last_computation / 60;
    my $decay = $self->{decay_rate} ** $minutes;

    $self->{value} = $self->{value} * $decay;
    $self->{last_computed} = $now;

    return $self->{value};
}

# add($inc)
#
# Decay the value up to now, then add $inc. Returns the new value.
sub add {
    my $self = shift;
    my $inc = shift;

    my $value = $self->value;
    $self->{value} = $value + $inc;

    return $self->{value};
}

# subtract($dec)
#
# Decay the value up to now, then subtract $dec. Returns the new value.
sub subtract {
    my $self = shift;
    my $dec = shift;
    return $self->add(-$dec);
}