// SPDX-License-Identifier: GPL-2.0-only
/*
 * Historical Service Time
 *
 *  Keeps a time-weighted exponential moving average of the historical
 *  service time. Estimates future service time based on the historical
 *  service time and the number of outstanding requests.
 *
 *  Marks paths stale if they have not finished within hst *
 *  num_paths. If a path is stale and unused, we will send a single
 *  request to probe in case the path has improved. This situation
 *  generally arises if the path is so much worse than others that it
 *  will never have the best estimated service time, or if the entire
 *  multipath device is unused. If a path is stale and in use, limit the
 *  number of requests it can receive with the assumption that the path
 *  has become degraded.
 *
 *  To avoid repeatedly calculating exponents for time weighting, times
 *  are split into HST_WEIGHT_COUNT buckets each (1 << HST_BUCKET_SHIFT)
 *  ns, and the weighting is pre-calculated.
 *
 */

#include "dm-path-selector.h"

#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "multipath historical-service-time"

#define HST_MIN_IO 1
#define HST_VERSION "0.1.1"

#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
#define HST_FIXED_MAX   (ULLONG_MAX >> HST_FIXED_SHIFT)
#define HST_FIXED_1     (1 << HST_FIXED_SHIFT)
#define HST_FIXED_95    972
#define HST_MAX_INFLIGHT HST_FIXED_1
#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
#define HST_WEIGHT_COUNT 64ULL
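
/*
 * Quick reference for the fixed-point scale above (derived from the
 * constants, for illustration): HST_FIXED_1 == 1024 represents 1.0, so
 * HST_FIXED_95 == 972 is ~0.949 (~95%), and service times are stored as
 * nanoseconds scaled by 2^10. Each weight bucket covers 1 << 24 ns
 * (~16.8 ms), so the 64 buckets span roughly one second of service time.
 */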

struct selector {
        struct list_head valid_paths;
        struct list_head failed_paths;
        int valid_count;
        spinlock_t lock;

        unsigned int weights[HST_WEIGHT_COUNT];
        unsigned int threshold_multiplier;
};

struct path_info {
        struct list_head list;
        struct dm_path *path;
        unsigned int repeat_count;

        spinlock_t lock;
        u64 historical_service_time; /* Fixed point */

        u64 stale_after;
        u64 last_finish;

        u64 outstanding;
};

/**
 * fixed_power - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 *
 * (see: kernel/sched/loadavg.c)
 */
static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        if (n) {
                for (;;) {
                        if (n & 1) {
                                result *= x;
                                result += 1UL << (frac_bits - 1);
                                result >>= frac_bits;
                        }

                        n >>= 1;
                        if (!n)
                                break;

                        x *= x;
                        x += 1UL << (frac_bits - 1);
                        x >>= frac_bits;
                }
        }

        return result;
}
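
/*
 * Worked example (illustrative): fixed_power(HST_FIXED_95, HST_FIXED_SHIFT, 2)
 * squares 972/1024 ~= 0.949 in 10-bit fixed point. The squaring step gives
 * (972 * 972 + 512) >> 10 == 923, i.e. ~0.901, matching 0.95^2 ~= 0.90.
 */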

/*
 * Calculate the next value of an exponential moving average
 * a_1 = a_0 * e + a * (1 - e)
 *
 * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
 * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
 * @weight: [0, HST_FIXED_1]
 *
 * To account for multiple periods in the same calculation,
 * a_n = a_0 * e^n + a * (1 - e^n),
 * so call fixed_ema(last, next, pow(weight, N))
 */
static u64 fixed_ema(u64 last, u64 next, u64 weight)
{
        last *= weight;
        last += next * (HST_FIXED_1 - weight);
        last += 1ULL << (HST_FIXED_SHIFT - 1);
        return last >> HST_FIXED_SHIFT;
}
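
/*
 * Worked example (illustrative, using the constants above): with
 * weight == HST_FIXED_95 == 972, last == 10240 (10.0 in fixed point) and
 * next == 20480 (20.0), the sum is 10240 * 972 + 20480 * 52 == 11018240;
 * adding the rounding term and shifting gives 10760, i.e. ~10.5, which
 * matches 0.949 * 10 + 0.051 * 20.
 */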

static struct selector *alloc_selector(void)
{
        struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (s) {
                INIT_LIST_HEAD(&s->valid_paths);
                INIT_LIST_HEAD(&s->failed_paths);
                spin_lock_init(&s->lock);
        }

        return s;
}

/*
 * Get the weight for a given time span.
 */
static u64 hst_weight(struct path_selector *ps, u64 delta)
{
        struct selector *s = ps->context;
        int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
                           HST_WEIGHT_COUNT - 1);

        return s->weights[bucket];
}
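
/*
 * For example (illustrative): a service time delta of 50 ms is 50000000 ns,
 * and 50000000 >> HST_BUCKET_SHIFT == 2, so it lands in bucket 2 and is
 * weighted by weights[2].
 */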

/*
 * Set up the weights array.
 *
 * weights[len-1] = 0
 * weights[n] = base ^ (n + 1)
 */
static void hst_set_weights(struct path_selector *ps, unsigned int base)
{
        struct selector *s = ps->context;
        int i;

        if (base >= HST_FIXED_1)
                return;

        for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
                s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);

        s->weights[HST_WEIGHT_COUNT - 1] = 0;
}
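
/*
 * With the default base of HST_FIXED_95 this yields (illustrative values)
 * weights[0] == 972 (~0.95), weights[1] == 923 (~0.90), and so on, so
 * samples older than one bucket are progressively discounted; the last
 * bucket has weight 0, which makes fixed_ema() discard history entirely
 * for very old samples.
 */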

static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
{
        struct selector *s;
        unsigned int base_weight = HST_FIXED_95;
        unsigned int threshold_multiplier = 0;
        char dummy;

        /*
         * Arguments: [<base_weight> [<threshold_multiplier>]]
         *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
         *       value of 0 will completely ignore any history.
         *       If not given, default (HST_FIXED_95) is used.
         *   <threshold_multiplier>: Minimum threshold multiplier for paths to
         *       be considered different. That is, a path is
         *       considered different iff (p1 > N * p2) where p1
         *       is the path with higher service time. A threshold
         *       of 1 or 0 has no effect. Defaults to 0.
         */
        if (argc > 2)
                return -EINVAL;

        if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
             base_weight >= HST_FIXED_1)) {
                DMERR("invalid base weight \"%s\"", argv[0]);
                return -EINVAL;
        }

        if (argc > 1 && (sscanf(argv[1], "%u%c",
                                &threshold_multiplier, &dummy) != 1)) {
                DMERR("invalid threshold_multiplier \"%s\"", argv[1]);
                return -EINVAL;
        }

        s = alloc_selector();
        if (!s)
                return -ENOMEM;

        ps->context = s;

        hst_set_weights(ps, base_weight);
        s->threshold_multiplier = threshold_multiplier;
        return 0;
}
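
/*
 * Usage sketch (illustrative, the values are hypothetical): if the
 * multipath table passes the optional selector arguments "972 4" to this
 * selector, hst_create() sees argc == 2 with argv == {"972", "4"}, i.e.
 * base_weight 972 (~0.95) and threshold_multiplier 4; with no arguments
 * the defaults above are used.
 */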

static void free_paths(struct list_head *paths)
{
        struct path_info *pi, *next;

        list_for_each_entry_safe(pi, next, paths, list) {
                list_del(&pi->list);
                kfree(pi);
        }
}

static void hst_destroy(struct path_selector *ps)
{
        struct selector *s = ps->context;

        free_paths(&s->valid_paths);
        free_paths(&s->failed_paths);
        kfree(s);
        ps->context = NULL;
}

static int hst_status(struct path_selector *ps, struct dm_path *path,
                      status_type_t type, char *result, unsigned int maxlen)
{
        unsigned int sz = 0;
        struct path_info *pi;

        if (!path) {
                struct selector *s = ps->context;

                DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
        } else {
                pi = path->pscontext;

                switch (type) {
                case STATUSTYPE_INFO:
                        DMEMIT("%llu %llu %llu ", pi->historical_service_time,
                               pi->outstanding, pi->stale_after);
                        break;
                case STATUSTYPE_TABLE:
                        DMEMIT("0 ");
                        break;
                }
        }

        return sz;
}
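
/*
 * For example (illustrative): with the default arguments the selector
 * status above reads "2 972 0" (argument count, weights[0],
 * threshold_multiplier), and each path's STATUSTYPE_INFO output is its
 * historical service time (fixed point), outstanding request count and
 * stale_after timestamp.
 */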

static int hst_add_path(struct path_selector *ps, struct dm_path *path,
                        int argc, char **argv, char **error)
{
        struct selector *s = ps->context;
        struct path_info *pi;
        unsigned int repeat_count = HST_MIN_IO;
        char dummy;
        unsigned long flags;

        /*
         * Arguments: [<repeat_count>]
         *   <repeat_count>: The number of I/Os before switching path.
         *                   If not given, default (HST_MIN_IO) is used.
         */
        if (argc > 1) {
                *error = "historical-service-time ps: incorrect number of arguments";
                return -EINVAL;
        }

        if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
                *error = "historical-service-time ps: invalid repeat count";
                return -EINVAL;
        }

        /* allocate the path */
        pi = kmalloc(sizeof(*pi), GFP_KERNEL);
        if (!pi) {
                *error = "historical-service-time ps: Error allocating path context";
                return -ENOMEM;
        }

        pi->path = path;
        pi->repeat_count = repeat_count;

        pi->historical_service_time = HST_FIXED_1;

        spin_lock_init(&pi->lock);
        pi->outstanding = 0;

        pi->stale_after = 0;
        pi->last_finish = 0;

        path->pscontext = pi;

        spin_lock_irqsave(&s->lock, flags);
        list_add_tail(&pi->list, &s->valid_paths);
        s->valid_count++;
        spin_unlock_irqrestore(&s->lock, flags);

        return 0;
}

static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
{
        struct selector *s = ps->context;
        struct path_info *pi = path->pscontext;
        unsigned long flags;

        spin_lock_irqsave(&s->lock, flags);
        list_move(&pi->list, &s->failed_paths);
        s->valid_count--;
        spin_unlock_irqrestore(&s->lock, flags);
}

static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
{
        struct selector *s = ps->context;
        struct path_info *pi = path->pscontext;
        unsigned long flags;

        spin_lock_irqsave(&s->lock, flags);
        list_move_tail(&pi->list, &s->valid_paths);
        s->valid_count++;
        spin_unlock_irqrestore(&s->lock, flags);

        return 0;
}

static void hst_fill_compare(struct path_info *pi, u64 *hst,
                             u64 *out, u64 *stale)
{
        unsigned long flags;

        spin_lock_irqsave(&pi->lock, flags);
        *hst = pi->historical_service_time;
        *out = pi->outstanding;
        *stale = pi->stale_after;
        spin_unlock_irqrestore(&pi->lock, flags);
}

/*
 * Compare the estimated service time of 2 paths, pi1 and pi2,
 * for the incoming I/O.
 *
 * Returns:
 * < 0 : pi1 is better
 * 0   : no difference between pi1 and pi2
 * > 0 : pi2 is better
 *
 */
static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
                             u64 time_now, struct path_selector *ps)
{
        struct selector *s = ps->context;
        u64 hst1, hst2;
        long long out1, out2, stale1, stale2;
        int pi2_better, over_threshold;

        hst_fill_compare(pi1, &hst1, &out1, &stale1);
        hst_fill_compare(pi2, &hst2, &out2, &stale2);

        /* Check here if the estimated latencies for the two paths are too
         * similar. If this is the case, we skip the extra calculation and
         * just compare outstanding requests, so any unloaded paths will
         * be preferred.
         */
        if (hst1 > hst2)
                over_threshold = hst1 > (s->threshold_multiplier * hst2);
        else
                over_threshold = hst2 > (s->threshold_multiplier * hst1);

        if (!over_threshold)
                return out1 - out2;

        /*
         * If an unloaded path is stale, choose it. If both paths are unloaded,
         * choose path that is the most stale.
         * (If one path is loaded, choose the other)
         */
        if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
            (!out1 && !out2))
                return (!out2 * stale1) - (!out1 * stale2);

        /* Compare estimated service time. If outstanding is the same, we
         * don't need to multiply.
         */
        if (out1 == out2) {
                pi2_better = hst1 > hst2;
        } else {
                /* Potential overflow with out >= 1024 */
                if (unlikely(out1 >= HST_MAX_INFLIGHT ||
                             out2 >= HST_MAX_INFLIGHT)) {
                        /* If over 1023 in-flights, we may overflow if hst
                         * is at max. (With this shift we still overflow at
                         * 1048576 in-flights, which is high enough).
                         */
                        hst1 >>= HST_FIXED_SHIFT;
                        hst2 >>= HST_FIXED_SHIFT;
                }
                pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
        }

        /* In the case that the 'winner' is stale, limit to equal usage. */
        if (pi2_better) {
                if (stale2 < time_now)
                        return out1 - out2;
                return 1;
        }
        if (stale1 < time_now)
                return out1 - out2;
        return -1;
}
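
/*
 * Worked example (illustrative, with threshold_multiplier == 0 so the
 * threshold check always passes): pi1 with hst1 == 10240 (10.0 in fixed
 * point) and out1 == 2 vs. pi2 with hst2 == 20480 (20.0) and out2 == 0,
 * both fresh. The estimates are (1 + 2) * 10240 == 30720 vs.
 * (1 + 0) * 20480 == 20480, so pi2_better is true and hst_compare()
 * returns 1: the slower but idle path is preferred over the faster but
 * loaded one.
 */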

static struct dm_path *hst_select_path(struct path_selector *ps,
                                       size_t nr_bytes)
{
        struct selector *s = ps->context;
        struct path_info *pi = NULL, *best = NULL;
        u64 time_now = ktime_get_ns();
        struct dm_path *ret = NULL;
        unsigned long flags;

        spin_lock_irqsave(&s->lock, flags);
        if (list_empty(&s->valid_paths))
                goto out;

        list_for_each_entry(pi, &s->valid_paths, list) {
                if (!best || (hst_compare(pi, best, time_now, ps) < 0))
                        best = pi;
        }

        if (!best)
                goto out;

        /* Move last used path to end (least preferred in case of ties) */
        list_move_tail(&best->list, &s->valid_paths);

        ret = best->path;

out:
        spin_unlock_irqrestore(&s->lock, flags);
        return ret;
}

static int hst_start_io(struct path_selector *ps, struct dm_path *path,
                        size_t nr_bytes)
{
        struct path_info *pi = path->pscontext;
        unsigned long flags;

        spin_lock_irqsave(&pi->lock, flags);
        pi->outstanding++;
        spin_unlock_irqrestore(&pi->lock, flags);

        return 0;
}

static u64 path_service_time(struct path_info *pi, u64 start_time)
{
        u64 now = ktime_get_ns();

        /* if a previous disk request has finished after this IO was
         * sent to the hardware, pretend the submission happened
         * serially.
         */
        if (time_after64(pi->last_finish, start_time))
                start_time = pi->last_finish;

        pi->last_finish = now;
        if (time_before64(now, start_time))
                return 0;

        return now - start_time;
}
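
/*
 * For example (illustrative): if request A is submitted at t=0 and finishes
 * at t=10, while request B was submitted at t=2 and finishes at t=12, B's
 * service time is accounted as 12 - 10 = 2 rather than 12 - 2 = 10, so
 * queueing at the device does not inflate the per-request estimate.
 */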

static int hst_end_io(struct path_selector *ps, struct dm_path *path,
                      size_t nr_bytes, u64 start_time)
{
        struct path_info *pi = path->pscontext;
        struct selector *s = ps->context;
        unsigned long flags;
        u64 st;

        spin_lock_irqsave(&pi->lock, flags);

        st = path_service_time(pi, start_time);
        pi->outstanding--;
        pi->historical_service_time =
                fixed_ema(pi->historical_service_time,
                          min(st * HST_FIXED_1, HST_FIXED_MAX),
                          hst_weight(ps, st));

        /*
         * On request end, mark path as fresh. If a path hasn't
         * finished any requests within the fresh period, the estimated
         * service time is considered too optimistic and we limit the
         * maximum requests on that path.
         */
        pi->stale_after = pi->last_finish +
                (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));

        spin_unlock_irqrestore(&pi->lock, flags);

        return 0;
}
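
/*
 * Illustrative numbers for the stale window: with 4 valid paths and a
 * historical service time of ~5 ms (5000000 ns), stale_after lands about
 * 4 * 5 ms == 20 ms after the last completion; a path that has not
 * completed anything within that window is treated as stale by
 * hst_compare() above.
 */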

static struct path_selector_type hst_ps = {
        .name           = "historical-service-time",
        .module         = THIS_MODULE,
        .features       = DM_PS_USE_HR_TIMER,
        .table_args     = 1,
        .info_args      = 3,
        .create         = hst_create,
        .destroy        = hst_destroy,
        .status         = hst_status,
        .add_path       = hst_add_path,
        .fail_path      = hst_fail_path,
        .reinstate_path = hst_reinstate_path,
        .select_path    = hst_select_path,
        .start_io       = hst_start_io,
        .end_io         = hst_end_io,
};

static int __init dm_hst_init(void)
{
        int r = dm_register_path_selector(&hst_ps);

        if (r < 0)
                DMERR("register failed %d", r);

        DMINFO("version " HST_VERSION " loaded");

        return r;
}

static void __exit dm_hst_exit(void)
{
        int r = dm_unregister_path_selector(&hst_ps);

        if (r < 0)
                DMERR("unregister failed %d", r);
}

module_init(dm_hst_init);
module_exit(dm_hst_exit);

MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
MODULE_LICENSE("GPL");