/*
 * Copyright (C) 2015 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-cache-policy.h"
#include "dm-cache-policy-internal.h"

#include <linux/hash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/math64.h>

#define DM_MSG_PREFIX "cache-policy-smq"

/*----------------------------------------------------------------*/
/*
 * Safe division functions that return zero on divide by zero.
 */
static unsigned safe_div(unsigned n, unsigned d)
{
	return d ? n / d : 0u;
}

static unsigned safe_mod(unsigned n, unsigned d)
{
	return d ? n % d : 0u;
}
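/*
 * For illustration (not part of the original code): safe_div(7, 2) == 3,
 * while safe_div(7, 0) == 0 and safe_mod(7, 0) == 0, so callers such as
 * q_set_targets_subrange_() never need to guard against a zero divisor.
 */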
/*----------------------------------------------------------------*/

struct entry {
	unsigned hash_next:28;
	unsigned prev:28;
	unsigned next:28;
	unsigned level:7;
	bool dirty:1;
	bool allocated:1;
	bool sentinel:1;

	dm_oblock_t oblock;
};

/*----------------------------------------------------------------*/

#define INDEXER_NULL ((1u << 28u) - 1u)
/*
 * An entry_space manages a set of entries that we use for the queues.
 * The clean and dirty queues share entries, so this object is separate
 * from the queue itself.
 */
struct entry_space {
	struct entry *begin;
	struct entry *end;
};

static int space_init(struct entry_space *es, unsigned nr_entries)
{
	if (!nr_entries) {
		es->begin = es->end = NULL;
		return 0;
	}

	es->begin = vzalloc(sizeof(struct entry) * nr_entries);
	if (!es->begin)
		return -ENOMEM;

	es->end = es->begin + nr_entries;
	return 0;
}
static void space_exit(struct entry_space *es)
{
	vfree(es->begin);
}

static struct entry *__get_entry(struct entry_space *es, unsigned block)
{
	struct entry *e;

	e = es->begin + block;
	BUG_ON(e >= es->end);

	return e;
}

static unsigned to_index(struct entry_space *es, struct entry *e)
{
	BUG_ON(e < es->begin || e >= es->end);

	return e - es->begin;
}

static struct entry *to_entry(struct entry_space *es, unsigned block)
{
	if (block == INDEXER_NULL)
		return NULL;

	return __get_entry(es, block);
}
/*----------------------------------------------------------------*/

struct ilist {
	unsigned nr_elts;	/* excluding sentinel entries */
	unsigned head, tail;
};

static void l_init(struct ilist *l)
{
	l->nr_elts = 0;
	l->head = l->tail = INDEXER_NULL;
}

static struct entry *l_head(struct entry_space *es, struct ilist *l)
{
	return to_entry(es, l->head);
}

static struct entry *l_tail(struct entry_space *es, struct ilist *l)
{
	return to_entry(es, l->tail);
}

static struct entry *l_next(struct entry_space *es, struct entry *e)
{
	return to_entry(es, e->next);
}

static struct entry *l_prev(struct entry_space *es, struct entry *e)
{
	return to_entry(es, e->prev);
}

static bool l_empty(struct ilist *l)
{
	return l->head == INDEXER_NULL;
}
static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *head = l_head(es, l);

	e->next = l->head;
	e->prev = INDEXER_NULL;

	if (head)
		head->prev = l->head = to_index(es, e);
	else
		l->head = l->tail = to_index(es, e);

	if (!e->sentinel)
		l->nr_elts++;
}

static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *tail = l_tail(es, l);

	e->next = INDEXER_NULL;
	e->prev = l->tail;

	if (tail)
		tail->next = l->tail = to_index(es, e);
	else
		l->head = l->tail = to_index(es, e);

	if (!e->sentinel)
		l->nr_elts++;
}

static void l_add_before(struct entry_space *es, struct ilist *l,
			 struct entry *old, struct entry *e)
{
	struct entry *prev = l_prev(es, old);

	if (!prev)
		l_add_head(es, l, e);

	else {
		e->prev = old->prev;
		e->next = to_index(es, old);
		prev->next = old->prev = to_index(es, e);

		if (!e->sentinel)
			l->nr_elts++;
	}
}

static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *prev = l_prev(es, e);
	struct entry *next = l_next(es, e);

	if (prev)
		prev->next = e->next;
	else
		l->head = e->next;

	if (next)
		next->prev = e->prev;
	else
		l->tail = e->prev;

	if (!e->sentinel)
		l->nr_elts--;
}

static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
{
	struct entry *e;

	for (e = l_tail(es, l); e; e = l_prev(es, e))
		if (!e->sentinel) {
			l_del(es, l, e);
			return e;
		}

	return NULL;
}
/*----------------------------------------------------------------*/

/*
 * The stochastic-multi-queue is a set of lru lists stacked into levels.
 * Entries are moved up levels when they are used, which loosely orders the
 * most accessed entries in the top levels and least in the bottom.  This
 * structure is *much* better than a single lru list.
 */
#define MAX_LEVELS 64u

struct queue {
	struct entry_space *es;

	unsigned nr_elts;
	unsigned nr_levels;
	struct ilist qs[MAX_LEVELS];

	/*
	 * We maintain a count of the number of entries we would like in each
	 * level.
	 */
	unsigned last_target_nr_elts;
	unsigned nr_top_levels;
	unsigned nr_in_top_levels;
	unsigned target_count[MAX_LEVELS];
};
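/*
 * For illustration (not part of the original code): with MAX_LEVELS == 64,
 * an entry that keeps getting hit migrates towards level 63, while an idle
 * entry drifts towards level 0 as redistribution pushes other entries past
 * it.  Eviction candidates are then simply taken from the lowest levels.
 */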
static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
{
	unsigned i;

	q->es = es;
	q->nr_elts = 0;
	q->nr_levels = nr_levels;

	for (i = 0; i < q->nr_levels; i++) {
		l_init(q->qs + i);
		q->target_count[i] = 0u;
	}

	q->last_target_nr_elts = 0u;
	q->nr_top_levels = 0u;
	q->nr_in_top_levels = 0u;
}

static unsigned q_size(struct queue *q)
{
	return q->nr_elts;
}
/*
 * Insert an entry to the back of the given level.
 */
static void q_push(struct queue *q, struct entry *e)
{
	if (!e->sentinel)
		q->nr_elts++;

	l_add_tail(q->es, q->qs + e->level, e);
}

static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
{
	if (!e->sentinel)
		q->nr_elts++;

	l_add_before(q->es, q->qs + e->level, old, e);
}

static void q_del(struct queue *q, struct entry *e)
{
	l_del(q->es, q->qs + e->level, e);
	if (!e->sentinel)
		q->nr_elts--;
}
/*
 * Return the oldest entry of the lowest populated level.
 */
static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
{
	unsigned level;
	struct entry *e;

	max_level = min(max_level, q->nr_levels);

	for (level = 0; level < max_level; level++)
		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
			if (e->sentinel) {
				if (can_cross_sentinel)
					continue;
				else
					break;
			}

			return e;
		}

	return NULL;
}

static struct entry *q_pop(struct queue *q)
{
	struct entry *e = q_peek(q, q->nr_levels, true);

	if (e)
		q_del(q, e);

	return e;
}

/*
 * Pops an entry from a level that is not past a sentinel.
 */
static struct entry *q_pop_old(struct queue *q, unsigned max_level)
{
	struct entry *e = q_peek(q, max_level, false);

	if (e)
		q_del(q, e);

	return e;
}
/*
 * This function assumes there is a non-sentinel entry to pop.  It's only
 * used by redistribute, so we know this is true.  It also doesn't adjust
 * the q->nr_elts count.
 */
static struct entry *__redist_pop_from(struct queue *q, unsigned level)
{
	struct entry *e;

	for (; level < q->nr_levels; level++)
		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
			if (!e->sentinel) {
				l_del(q->es, q->qs + e->level, e);
				return e;
			}

	return NULL;
}
static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
{
	unsigned level, nr_levels, entries_per_level, remainder;

	BUG_ON(lbegin > lend);
	BUG_ON(lend > q->nr_levels);
	nr_levels = lend - lbegin;
	entries_per_level = safe_div(nr_elts, nr_levels);
	remainder = safe_mod(nr_elts, nr_levels);

	for (level = lbegin; level < lend; level++)
		q->target_count[level] =
			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
}
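/*
 * Worked example (illustrative, not from the original source): spreading
 * nr_elts == 100 over levels [0, 8) gives entries_per_level == 12 and
 * remainder == 4, so levels 0-3 get a target of 13 entries and levels 4-7
 * get 12, which sums back to 100.
 */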
/*
 * Typically we have fewer elements in the top few levels which allows us
 * to adjust the promote threshold nicely.
 */
static void q_set_targets(struct queue *q)
{
	if (q->last_target_nr_elts == q->nr_elts)
		return;

	q->last_target_nr_elts = q->nr_elts;

	if (q->nr_top_levels > q->nr_levels)
		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);

	else {
		q_set_targets_subrange_(q, q->nr_in_top_levels,
					q->nr_levels - q->nr_top_levels, q->nr_levels);

		if (q->nr_in_top_levels < q->nr_elts)
			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
						0, q->nr_levels - q->nr_top_levels);
		else
			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
	}
}
static void q_redistribute(struct queue *q)
{
	unsigned target, level;
	struct ilist *l, *l_above;
	struct entry *e;

	q_set_targets(q);

	for (level = 0u; level < q->nr_levels - 1u; level++) {
		l = q->qs + level;
		target = q->target_count[level];

		/*
		 * Pull down some entries from the level above.
		 */
		while (l->nr_elts < target) {
			e = __redist_pop_from(q, level + 1u);
			if (!e)
				break;

			e->level = level;
			l_add_tail(q->es, l, e);
		}

		/*
		 * Push some entries up.
		 */
		l_above = q->qs + level + 1u;
		while (l->nr_elts > target) {
			e = l_pop_tail(q->es, l);
			if (!e)
				break;

			e->level = level + 1u;
			l_add_head(q->es, l_above, e);
		}
	}
}
static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
{
	struct entry *de;
	unsigned new_level;

	q_del(q, e);

	if (extra_levels && (e->level < q->nr_levels - 1u)) {
		new_level = min(q->nr_levels - 1u, e->level + extra_levels);
		for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
			if (de->sentinel)
				continue;

			q_del(q, de);
			de->level = e->level;

			if (dest)
				q_push_before(q, dest, de);
			else
				q_push(q, de);
			break;
		}

		e->level = new_level;
	}

	q_push(q, e);
}

static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
{
	q_requeue_before(q, NULL, e, extra_levels);
}
/*----------------------------------------------------------------*/

#define FP_SHIFT 8
#define SIXTEENTH (1u << (FP_SHIFT - 4u))
#define EIGHTH (1u << (FP_SHIFT - 3u))

struct stats {
	unsigned hit_threshold;
	unsigned hits;
	unsigned misses;
};

enum performance {
	Q_POOR,
	Q_FAIR,
	Q_WELL
};

static void stats_init(struct stats *s, unsigned nr_levels)
{
	s->hit_threshold = (nr_levels * 3u) / 4u;
	s->hits = 0u;
	s->misses = 0u;
}

static void stats_reset(struct stats *s)
{
	s->hits = s->misses = 0u;
}

static void stats_level_accessed(struct stats *s, unsigned level)
{
	if (level >= s->hit_threshold)
		s->hits++;
	else
		s->misses++;
}

static void stats_miss(struct stats *s)
{
	s->misses++;
}

/*
 * There are times when we don't have any confidence in the hotspot queue.
 * Such as when a fresh cache is created and the blocks have been spread
 * out across the levels, or if an io load changes.  We detect this by
 * seeing how often a lookup is in the top levels of the hotspot queue.
 */
static enum performance stats_assess(struct stats *s)
{
	unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);

	if (confidence < SIXTEENTH)
		return Q_POOR;

	else if (confidence < EIGHTH)
		return Q_FAIR;

	else
		return Q_WELL;
}
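/*
 * Worked example (illustrative, not from the original source): with
 * FP_SHIFT == 8, SIXTEENTH == 16 and EIGHTH == 32.  If the queue has seen
 * 10 hits and 90 misses, confidence == (10 << 8) / 100 == 25, which lies
 * between SIXTEENTH and EIGHTH, so stats_assess() reports only fair
 * performance.
 */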
/*----------------------------------------------------------------*/

struct hash_table {
	struct entry_space *es;
	unsigned long long hash_bits;
	unsigned *buckets;
};

/*
 * All cache entries are stored in a chained hash table.  To save space we
 * use indexing again, and only store indexes to the next entry.
 */
static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
{
	unsigned i, nr_buckets;

	ht->es = es;
	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
	ht->hash_bits = ffs(nr_buckets) - 1;

	ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
	if (!ht->buckets)
		return -ENOMEM;

	for (i = 0; i < nr_buckets; i++)
		ht->buckets[i] = INDEXER_NULL;

	return 0;
}
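/*
 * Worked example (illustrative, not from the original source): for a cache
 * of 10000 entries, nr_buckets == roundup_pow_of_two(max(2500, 16)) == 4096
 * and hash_bits == 12, so on average fewer than three entries hash to each
 * bucket.
 */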
static void h_exit(struct hash_table *ht)
{
	vfree(ht->buckets);
}

static struct entry *h_head(struct hash_table *ht, unsigned bucket)
{
	return to_entry(ht->es, ht->buckets[bucket]);
}

static struct entry *h_next(struct hash_table *ht, struct entry *e)
{
	return to_entry(ht->es, e->hash_next);
}

static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
{
	e->hash_next = ht->buckets[bucket];
	ht->buckets[bucket] = to_index(ht->es, e);
}

static void h_insert(struct hash_table *ht, struct entry *e)
{
	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
	__h_insert(ht, h, e);
}

static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
				struct entry **prev)
{
	struct entry *e;

	*prev = NULL;
	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
		if (e->oblock == oblock)
			return e;

		*prev = e;
	}

	return NULL;
}

static void __h_unlink(struct hash_table *ht, unsigned h,
		       struct entry *e, struct entry *prev)
{
	if (prev)
		prev->hash_next = e->hash_next;
	else
		ht->buckets[h] = e->hash_next;
}

/*
 * Also moves each entry to the front of the bucket.
 */
static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
{
	struct entry *e, *prev;
	unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);

	e = __h_lookup(ht, h, oblock, &prev);
	if (e && prev) {
		/*
		 * Move to the front because this entry is likely
		 * to be hit again.
		 */
		__h_unlink(ht, h, e, prev);
		__h_insert(ht, h, e);
	}

	return e;
}

static void h_remove(struct hash_table *ht, struct entry *e)
{
	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
	struct entry *prev;

	/*
	 * The down side of using a singly linked list is we have to
	 * iterate the bucket to remove an item.
	 */
	e = __h_lookup(ht, h, e->oblock, &prev);
	if (e)
		__h_unlink(ht, h, e, prev);
}

/*----------------------------------------------------------------*/
struct entry_alloc {
	struct entry_space *es;
	unsigned begin;

	unsigned nr_allocated;
	struct ilist free;
};

static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
			   unsigned begin, unsigned end)
{
	unsigned i;

	ea->es = es;
	ea->nr_allocated = 0u;
	ea->begin = begin;

	l_init(&ea->free);
	for (i = begin; i != end; i++)
		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
}

static void init_entry(struct entry *e)
{
	/*
	 * We can't memset because that would clear the hotspot and
	 * sentinel bits which remain constant.
	 */
	e->hash_next = INDEXER_NULL;
	e->next = INDEXER_NULL;
	e->prev = INDEXER_NULL;
	e->allocated = true;
}

static struct entry *alloc_entry(struct entry_alloc *ea)
{
	struct entry *e;

	if (l_empty(&ea->free))
		return NULL;

	e = l_pop_tail(ea->es, &ea->free);
	init_entry(e);
	ea->nr_allocated++;

	return e;
}

/*
 * This assumes the cblock hasn't already been allocated.
 */
static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
{
	struct entry *e = __get_entry(ea->es, ea->begin + i);

	BUG_ON(e->allocated);

	l_del(ea->es, &ea->free, e);
	init_entry(e);
	ea->nr_allocated++;

	return e;
}

static void free_entry(struct entry_alloc *ea, struct entry *e)
{
	BUG_ON(!ea->nr_allocated);
	BUG_ON(!e->allocated);

	ea->nr_allocated--;
	e->allocated = false;
	l_add_tail(ea->es, &ea->free, e);
}

static bool allocator_empty(struct entry_alloc *ea)
{
	return l_empty(&ea->free);
}

static unsigned get_index(struct entry_alloc *ea, struct entry *e)
{
	return to_index(ea->es, e) - ea->begin;
}

static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
{
	return __get_entry(ea->es, ea->begin + index);
}
/*----------------------------------------------------------------*/

#define NR_HOTSPOT_LEVELS 64u
#define NR_CACHE_LEVELS 64u

#define WRITEBACK_PERIOD (10 * HZ)
#define DEMOTE_PERIOD (60 * HZ)

#define HOTSPOT_UPDATE_PERIOD (HZ)
#define CACHE_UPDATE_PERIOD (10u * HZ)

struct smq_policy {
	struct dm_cache_policy policy;

	/* protects everything */
	struct mutex lock;
	dm_cblock_t cache_size;
	sector_t cache_block_size;

	sector_t hotspot_block_size;
	unsigned nr_hotspot_blocks;
	unsigned cache_blocks_per_hotspot_block;
	unsigned hotspot_level_jump;

	struct entry_space es;
	struct entry_alloc writeback_sentinel_alloc;
	struct entry_alloc demote_sentinel_alloc;
	struct entry_alloc hotspot_alloc;
	struct entry_alloc cache_alloc;

	unsigned long *hotspot_hit_bits;
	unsigned long *cache_hit_bits;

	/*
	 * We maintain three queues of entries.  The cache proper,
	 * consisting of a clean and dirty queue, containing the currently
	 * active mappings.  The hotspot queue uses a larger block size to
	 * track blocks that are being hit frequently and potential
	 * candidates for promotion to the cache.
	 */
	struct queue hotspot;
	struct queue clean;
	struct queue dirty;

	struct stats hotspot_stats;
	struct stats cache_stats;

	/*
	 * Keeps track of time, incremented by the core.  We use this to
	 * avoid attributing multiple hits within the same tick.
	 *
	 * Access to tick_protected should be done with the spin lock held.
	 * It's copied to tick at the start of the map function (within the
	 * mutex).
	 */
	spinlock_t tick_lock;
	unsigned tick_protected;
	unsigned tick;

	/*
	 * The hash tables allow us to quickly find an entry by origin
	 * block.
	 */
	struct hash_table table;
	struct hash_table hotspot_table;

	bool current_writeback_sentinels;
	unsigned long next_writeback_period;

	bool current_demote_sentinels;
	unsigned long next_demote_period;

	unsigned write_promote_level;
	unsigned read_promote_level;

	unsigned long next_hotspot_period;
	unsigned long next_cache_period;
};

/*----------------------------------------------------------------*/
static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
{
	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
}
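/*
 * For illustration (not part of the original code): each sentinel
 * allocator holds 2 * NR_CACHE_LEVELS == 128 entries, two per queue level.
 * 'which' selects between the two banks (indexes 0-63 versus 64-127), so
 * flipping current_writeback_sentinels/current_demote_sentinels each
 * period swaps a level's active sentinel for its spare.
 */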
static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
{
	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
}

static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
{
	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
}

static void __update_writeback_sentinels(struct smq_policy *mq)
{
	unsigned level;
	struct queue *q = &mq->dirty;
	struct entry *sentinel;

	for (level = 0; level < q->nr_levels; level++) {
		sentinel = writeback_sentinel(mq, level);
		q_del(q, sentinel);
		q_push(q, sentinel);
	}
}

static void __update_demote_sentinels(struct smq_policy *mq)
{
	unsigned level;
	struct queue *q = &mq->clean;
	struct entry *sentinel;

	for (level = 0; level < q->nr_levels; level++) {
		sentinel = demote_sentinel(mq, level);
		q_del(q, sentinel);
		q_push(q, sentinel);
	}
}

static void update_sentinels(struct smq_policy *mq)
{
	if (time_after(jiffies, mq->next_writeback_period)) {
		__update_writeback_sentinels(mq);
		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
	}

	if (time_after(jiffies, mq->next_demote_period)) {
		__update_demote_sentinels(mq);
		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
		mq->current_demote_sentinels = !mq->current_demote_sentinels;
	}
}

static void __sentinels_init(struct smq_policy *mq)
{
	unsigned level;
	struct entry *sentinel;

	for (level = 0; level < NR_CACHE_LEVELS; level++) {
		sentinel = writeback_sentinel(mq, level);
		sentinel->level = level;
		q_push(&mq->dirty, sentinel);

		sentinel = demote_sentinel(mq, level);
		sentinel->level = level;
		q_push(&mq->clean, sentinel);
	}
}

static void sentinels_init(struct smq_policy *mq)
{
	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
	mq->next_demote_period = jiffies + DEMOTE_PERIOD;

	mq->current_writeback_sentinels = false;
	mq->current_demote_sentinels = false;
	__sentinels_init(mq);

	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
	mq->current_demote_sentinels = !mq->current_demote_sentinels;
	__sentinels_init(mq);
}
/*----------------------------------------------------------------*/

/*
 * These methods tie together the dirty queue, clean queue and hash table.
 */
static void push_new(struct smq_policy *mq, struct entry *e)
{
	struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
	h_insert(&mq->table, e);
	q_push(q, e);
}

static void push(struct smq_policy *mq, struct entry *e)
{
	struct entry *sentinel;

	h_insert(&mq->table, e);

	/*
	 * Punch this into the queue just in front of the sentinel, to
	 * ensure it's cleaned straight away.
	 */
	if (e->dirty) {
		sentinel = writeback_sentinel(mq, e->level);
		q_push_before(&mq->dirty, sentinel, e);
	} else {
		sentinel = demote_sentinel(mq, e->level);
		q_push_before(&mq->clean, sentinel, e);
	}
}

/*
 * Removes an entry from cache.  Removes from the hash table.
 */
static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
{
	q_del(q, e);
	h_remove(&mq->table, e);
}

static void del(struct smq_policy *mq, struct entry *e)
{
	__del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
}

static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
{
	struct entry *e = q_pop_old(q, max_level);
	if (e)
		h_remove(&mq->table, e);
	return e;
}

static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
{
	return to_cblock(get_index(&mq->cache_alloc, e));
}

static void requeue(struct smq_policy *mq, struct entry *e)
{
	struct entry *sentinel;

	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
		if (e->dirty) {
			sentinel = writeback_sentinel(mq, e->level);
			q_requeue_before(&mq->dirty, sentinel, e, 1u);
		} else {
			sentinel = demote_sentinel(mq, e->level);
			q_requeue_before(&mq->clean, sentinel, e, 1u);
		}
	}
}
static unsigned default_promote_level(struct smq_policy *mq)
{
	/*
	 * The promote level depends on the current performance of the
	 * cache.
	 *
	 * If the cache is performing badly, then we can't afford
	 * to promote much without causing performance to drop below that
	 * of the origin device.
	 *
	 * If the cache is performing well, then we don't need to promote
	 * much.  If it isn't broken, don't fix it.
	 *
	 * If the cache is middling then we promote more.
	 *
	 * This scheme reminds me of a graph of entropy vs probability of a
	 * binary variable.
	 */
	static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};

	unsigned hits = mq->cache_stats.hits;
	unsigned misses = mq->cache_stats.misses;
	unsigned index = safe_div(hits << 4u, hits + misses);
	return table[index];
}
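/*
 * Worked example (illustrative, not from the original source): the table
 * has 17 slots because index == (hits << 4) / (hits + misses) ranges from
 * 0 (all misses) to 16 (all hits).  A middling hit ratio of 45% gives
 * index == 7 and the most aggressive promote level of 8, while 100% hits
 * gives index == 16 and a promote level of only 1.
 */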
static void update_promote_levels(struct smq_policy *mq)
{
	/*
	 * If there are unused cache entries then we want to be really
	 * eager to promote.
	 */
	unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);

	/*
	 * If the hotspot queue is performing badly then we have little
	 * confidence that we know which blocks to promote.  So we cut down
	 * the amount of promotions.
	 */
	switch (stats_assess(&mq->hotspot_stats)) {
	case Q_POOR:
		threshold_level /= 4u;
		break;

	case Q_FAIR:
		threshold_level /= 2u;
		break;

	case Q_WELL:
		break;
	}

	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
}

/*
 * If the hotspot queue is performing badly, then we try and move entries
 * around more quickly.
 */
static void update_level_jump(struct smq_policy *mq)
{
	switch (stats_assess(&mq->hotspot_stats)) {
	case Q_POOR:
		mq->hotspot_level_jump = 4u;
		break;

	case Q_FAIR:
		mq->hotspot_level_jump = 2u;
		break;

	case Q_WELL:
		mq->hotspot_level_jump = 1u;
		break;
	}
}
static void end_hotspot_period(struct smq_policy *mq)
{
	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
	update_promote_levels(mq);

	if (time_after(jiffies, mq->next_hotspot_period)) {
		update_level_jump(mq);
		q_redistribute(&mq->hotspot);
		stats_reset(&mq->hotspot_stats);
		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
	}
}

static void end_cache_period(struct smq_policy *mq)
{
	if (time_after(jiffies, mq->next_cache_period)) {
		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));

		q_redistribute(&mq->dirty);
		q_redistribute(&mq->clean);
		stats_reset(&mq->cache_stats);

		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
	}
}
static int demote_cblock(struct smq_policy *mq,
			 struct policy_locker *locker,
			 dm_oblock_t *oblock)
{
	struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
	if (!demoted)
		/*
		 * We could get a block from mq->dirty, but that
		 * would add extra latency to the triggering bio as it
		 * waits for the writeback.  Better to not promote this
		 * time and hope there's a clean block next time this block
		 * is hit.
		 */
		return -ENOSPC;

	if (locker->fn(locker, demoted->oblock))
		/*
		 * We couldn't lock this block.
		 */
		return -EBUSY;

	del(mq, demoted);
	*oblock = demoted->oblock;
	free_entry(&mq->cache_alloc, demoted);

	return 0;
}
enum promote_result {
	PROMOTE_NOT,
	PROMOTE_TEMPORARY,
	PROMOTE_PERMANENT
};

/*
 * Converts a boolean into a promote result.
 */
static enum promote_result maybe_promote(bool promote)
{
	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
}

static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
					   bool fast_promote)
{
	if (bio_data_dir(bio) == WRITE) {
		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
			return PROMOTE_TEMPORARY;

		else
			return maybe_promote(hs_e->level >= mq->write_promote_level);
	} else
		return maybe_promote(hs_e->level >= mq->read_promote_level);
}

static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
			    struct policy_locker *locker,
			    struct policy_result *result, enum promote_result pr)
{
	int r;
	struct entry *e;

	if (allocator_empty(&mq->cache_alloc)) {
		result->op = POLICY_REPLACE;
		r = demote_cblock(mq, locker, &result->old_oblock);
		if (r) {
			result->op = POLICY_MISS;
			return;
		}

	} else
		result->op = POLICY_NEW;

	e = alloc_entry(&mq->cache_alloc);
	BUG_ON(!e);
	e->oblock = oblock;

	if (pr == PROMOTE_TEMPORARY)
		push(mq, e);
	else
		push_new(mq, e);

	result->cblock = infer_cblock(mq, e);
}
static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
{
	sector_t r = from_oblock(b);
	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
	return to_oblock(r);
}
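/*
 * For illustration (not part of the original code): with the default of
 * 16 cache blocks per hotspot block (before calc_hotspot_params() shrinks
 * it), origin blocks 0-15 all map to hotspot block 0, blocks 16-31 to
 * hotspot block 1, and so on, so hits on any of them are accounted to the
 * same hotspot entry.
 */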
static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
{
	unsigned hi;
	dm_oblock_t hb = to_hblock(mq, b);
	struct entry *e = h_lookup(&mq->hotspot_table, hb);

	if (e) {
		stats_level_accessed(&mq->hotspot_stats, e->level);

		hi = get_index(&mq->hotspot_alloc, e);
		q_requeue(&mq->hotspot, e,
			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
			  0u : mq->hotspot_level_jump);

	} else {
		stats_miss(&mq->hotspot_stats);

		e = alloc_entry(&mq->hotspot_alloc);
		if (!e) {
			e = q_pop(&mq->hotspot);
			if (e) {
				h_remove(&mq->hotspot_table, e);
				hi = get_index(&mq->hotspot_alloc, e);
				clear_bit(hi, mq->hotspot_hit_bits);
			}
		}

		if (e) {
			e->oblock = hb;
			q_push(&mq->hotspot, e);
			h_insert(&mq->hotspot_table, e);
		}
	}

	return e;
}
/*
 * Looks the oblock up in the hash table, then decides whether to put in
 * pre_cache, or cache etc.
 */
static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
	       bool can_migrate, bool fast_promote,
	       struct policy_locker *locker, struct policy_result *result)
{
	struct entry *e, *hs_e;
	enum promote_result pr;

	hs_e = update_hotspot_queue(mq, oblock, bio);

	e = h_lookup(&mq->table, oblock);
	if (e) {
		stats_level_accessed(&mq->cache_stats, e->level);

		requeue(mq, e);
		result->op = POLICY_HIT;
		result->cblock = infer_cblock(mq, e);

	} else {
		stats_miss(&mq->cache_stats);

		pr = should_promote(mq, hs_e, bio, fast_promote);
		if (pr == PROMOTE_NOT)
			result->op = POLICY_MISS;

		else {
			if (!can_migrate) {
				result->op = POLICY_MISS;
				return -EWOULDBLOCK;
			}

			insert_in_cache(mq, oblock, locker, result, pr);
		}
	}

	return 0;
}
/*----------------------------------------------------------------*/

/*
 * Public interface, via the policy struct.  See dm-cache-policy.h for a
 * description of these.
 */

static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
{
	return container_of(p, struct smq_policy, policy);
}

static void smq_destroy(struct dm_cache_policy *p)
{
	struct smq_policy *mq = to_smq_policy(p);

	h_exit(&mq->hotspot_table);
	h_exit(&mq->table);
	free_bitset(mq->hotspot_hit_bits);
	free_bitset(mq->cache_hit_bits);
	space_exit(&mq->es);
	kfree(mq);
}
static void copy_tick(struct smq_policy *mq)
{
	unsigned long flags, tick;

	spin_lock_irqsave(&mq->tick_lock, flags);
	tick = mq->tick_protected;
	if (tick != mq->tick) {
		update_sentinels(mq);
		end_hotspot_period(mq);
		end_cache_period(mq);
		mq->tick = tick;
	}
	spin_unlock_irqrestore(&mq->tick_lock, flags);
}

static bool maybe_lock(struct smq_policy *mq, bool can_block)
{
	if (can_block) {
		mutex_lock(&mq->lock);
		return true;
	} else
		return mutex_trylock(&mq->lock);
}
static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
		   bool can_block, bool can_migrate, bool fast_promote,
		   struct bio *bio, struct policy_locker *locker,
		   struct policy_result *result)
{
	int r;
	struct smq_policy *mq = to_smq_policy(p);

	result->op = POLICY_MISS;

	if (!maybe_lock(mq, can_block))
		return -EWOULDBLOCK;

	copy_tick(mq);
	r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
	mutex_unlock(&mq->lock);

	return r;
}

static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
{
	int r;
	struct entry *e;
	struct smq_policy *mq = to_smq_policy(p);

	if (!mutex_trylock(&mq->lock))
		return -EWOULDBLOCK;

	e = h_lookup(&mq->table, oblock);
	if (e) {
		*cblock = infer_cblock(mq, e);
		r = 0;
	} else
		r = -ENOENT;

	mutex_unlock(&mq->lock);

	return r;
}
static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
{
	struct entry *e;

	e = h_lookup(&mq->table, oblock);
	BUG_ON(!e);

	del(mq, e);
	e->dirty = set;
	push(mq, e);
}

static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct smq_policy *mq = to_smq_policy(p);

	mutex_lock(&mq->lock);
	__smq_set_clear_dirty(mq, oblock, true);
	mutex_unlock(&mq->lock);
}

static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct smq_policy *mq = to_smq_policy(p);

	mutex_lock(&mq->lock);
	__smq_set_clear_dirty(mq, oblock, false);
	mutex_unlock(&mq->lock);
}
static int smq_load_mapping(struct dm_cache_policy *p,
			    dm_oblock_t oblock, dm_cblock_t cblock,
			    uint32_t hint, bool hint_valid)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e;

	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
	e->oblock = oblock;
	e->dirty = false;	/* this gets corrected in a minute */
	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1;
	push(mq, e);

	return 0;
}

static int smq_save_hints(struct smq_policy *mq, struct queue *q,
			  policy_walk_fn fn, void *context)
{
	int r;
	unsigned level;
	struct entry *e;

	for (level = 0; level < q->nr_levels; level++)
		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
			if (!e->sentinel) {
				r = fn(context, infer_cblock(mq, e),
				       e->oblock, e->level);
				if (r)
					return r;
			}
		}

	return 0;
}

static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
			     void *context)
{
	struct smq_policy *mq = to_smq_policy(p);
	int r = 0;

	mutex_lock(&mq->lock);

	r = smq_save_hints(mq, &mq->clean, fn, context);
	if (!r)
		r = smq_save_hints(mq, &mq->dirty, fn, context);

	mutex_unlock(&mq->lock);

	return r;
}
static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
{
	struct entry *e;

	e = h_lookup(&mq->table, oblock);
	BUG_ON(!e);

	del(mq, e);
	free_entry(&mq->cache_alloc, e);
}

static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct smq_policy *mq = to_smq_policy(p);

	mutex_lock(&mq->lock);
	__remove_mapping(mq, oblock);
	mutex_unlock(&mq->lock);
}

static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
{
	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

	if (!e || !e->allocated)
		return -ENODATA;

	del(mq, e);
	free_entry(&mq->cache_alloc, e);

	return 0;
}

static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	int r;
	struct smq_policy *mq = to_smq_policy(p);

	mutex_lock(&mq->lock);
	r = __remove_cblock(mq, cblock);
	mutex_unlock(&mq->lock);

	return r;
}
#define CLEAN_TARGET_CRITICAL 5u /* percent */

static bool clean_target_met(struct smq_policy *mq, bool critical)
{
	if (critical) {
		/*
		 * Cache entries may not be populated.  So we cannot rely on
		 * the size of the clean queue.
		 */
		unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
		unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;

		return nr_clean >= target;
	} else
		return !q_size(&mq->dirty);
}
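/*
 * Worked example (illustrative, not from the original source): for a
 * 10000 block cache in critical mode the target is 10000 * 5 / 100 == 500
 * clean blocks, so writeback only eases off once at least 5% of the cache
 * is clean.  In non-critical mode the target is an entirely clean dirty
 * queue.
 */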
static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
				dm_cblock_t *cblock, bool critical_only)
{
	struct entry *e = NULL;
	bool target_met = clean_target_met(mq, critical_only);

	if (critical_only)
		/*
		 * Always try and keep the bottom level clean.
		 */
		e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);

	else
		e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);

	if (!e)
		return -ENODATA;

	*oblock = e->oblock;
	*cblock = infer_cblock(mq, e);
	e->dirty = false;
	push_new(mq, e);

	return 0;
}

static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
			      dm_cblock_t *cblock, bool critical_only)
{
	int r;
	struct smq_policy *mq = to_smq_policy(p);

	mutex_lock(&mq->lock);
	r = __smq_writeback_work(mq, oblock, cblock, critical_only);
	mutex_unlock(&mq->lock);

	return r;
}
static void __force_mapping(struct smq_policy *mq,
			    dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
	struct entry *e = h_lookup(&mq->table, current_oblock);

	if (e) {
		del(mq, e);
		e->oblock = new_oblock;
		e->dirty = true;
		push(mq, e);
	}
}

static void smq_force_mapping(struct dm_cache_policy *p,
			      dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
	struct smq_policy *mq = to_smq_policy(p);

	mutex_lock(&mq->lock);
	__force_mapping(mq, current_oblock, new_oblock);
	mutex_unlock(&mq->lock);
}

static dm_cblock_t smq_residency(struct dm_cache_policy *p)
{
	dm_cblock_t r;
	struct smq_policy *mq = to_smq_policy(p);

	mutex_lock(&mq->lock);
	r = to_cblock(mq->cache_alloc.nr_allocated);
	mutex_unlock(&mq->lock);

	return r;
}
static void smq_tick(struct dm_cache_policy *p, bool can_block)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->tick_lock, flags);
	mq->tick_protected++;
	spin_unlock_irqrestore(&mq->tick_lock, flags);

	if (can_block) {
		mutex_lock(&mq->lock);
		copy_tick(mq);
		mutex_unlock(&mq->lock);
	}
}
/* Init the policy plugin interface function pointers. */
static void init_policy_functions(struct smq_policy *mq)
{
	mq->policy.destroy = smq_destroy;
	mq->policy.map = smq_map;
	mq->policy.lookup = smq_lookup;
	mq->policy.set_dirty = smq_set_dirty;
	mq->policy.clear_dirty = smq_clear_dirty;
	mq->policy.load_mapping = smq_load_mapping;
	mq->policy.walk_mappings = smq_walk_mappings;
	mq->policy.remove_mapping = smq_remove_mapping;
	mq->policy.remove_cblock = smq_remove_cblock;
	mq->policy.writeback_work = smq_writeback_work;
	mq->policy.force_mapping = smq_force_mapping;
	mq->policy.residency = smq_residency;
	mq->policy.tick = smq_tick;
}
static bool too_many_hotspot_blocks(sector_t origin_size,
				    sector_t hotspot_block_size,
				    unsigned nr_hotspot_blocks)
{
	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
}

static void calc_hotspot_params(sector_t origin_size,
				sector_t cache_block_size,
				unsigned nr_cache_blocks,
				sector_t *hotspot_block_size,
				unsigned *nr_hotspot_blocks)
{
	*hotspot_block_size = cache_block_size * 16u;
	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);

	while ((*hotspot_block_size > cache_block_size) &&
	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
		*hotspot_block_size /= 2u;
}
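/*
 * Worked example (illustrative, not from the original source): for a cache
 * of 100000 blocks of 64 sectors each, the initial guess is a hotspot
 * block of 1024 sectors and max(100000 / 4, 1024) == 25000 hotspot blocks,
 * covering 25,600,000 sectors.  If the origin device is smaller than that,
 * the hotspot block size is repeatedly halved (but never below the cache
 * block size) until the hotspot blocks fit within the origin.
 */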
static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
					  sector_t origin_size,
					  sector_t cache_block_size)
{
	unsigned i;
	unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
	unsigned total_sentinels = 2u * nr_sentinels_per_queue;
	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);

	if (!mq)
		return NULL;

	init_policy_functions(mq);
	mq->cache_size = cache_size;
	mq->cache_block_size = cache_block_size;

	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);

	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
	mq->hotspot_level_jump = 1u;
	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
		DMERR("couldn't initialize entry space");
		goto bad_pool_init;
	}

	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
	for (i = 0; i < nr_sentinels_per_queue; i++)
		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;

	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
	for (i = 0; i < nr_sentinels_per_queue; i++)
		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;

	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
		       total_sentinels + mq->nr_hotspot_blocks);

	init_allocator(&mq->cache_alloc, &mq->es,
		       total_sentinels + mq->nr_hotspot_blocks,
		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));

	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
	if (!mq->hotspot_hit_bits) {
		DMERR("couldn't allocate hotspot hit bitset");
		goto bad_hotspot_hit_bits;
	}
	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);

	if (from_cblock(cache_size)) {
		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
		if (!mq->cache_hit_bits) {
			DMERR("couldn't allocate cache hit bitset");
			goto bad_cache_hit_bits;
		}
		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
	} else
		mq->cache_hit_bits = NULL;

	mq->tick_protected = 0;
	mq->tick = 0;
	mutex_init(&mq->lock);
	spin_lock_init(&mq->tick_lock);

	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
	mq->hotspot.nr_top_levels = 8;
	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);

	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);

	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);

	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
		goto bad_alloc_table;

	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
		goto bad_alloc_hotspot_table;

	sentinels_init(mq);
	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;

	mq->next_hotspot_period = jiffies;
	mq->next_cache_period = jiffies;

	return &mq->policy;

bad_alloc_hotspot_table:
	h_exit(&mq->table);
bad_alloc_table:
	free_bitset(mq->cache_hit_bits);
bad_cache_hit_bits:
	free_bitset(mq->hotspot_hit_bits);
bad_hotspot_hit_bits:
	space_exit(&mq->es);
bad_pool_init:
	kfree(mq);

	return NULL;
}
/*----------------------------------------------------------------*/

static struct dm_cache_policy_type smq_policy_type = {
	.name = "smq",
	.version = {1, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = smq_create
};

static struct dm_cache_policy_type default_policy_type = {
	.name = "default",
	.version = {1, 4, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = smq_create,
	.real = &smq_policy_type
};

static int __init smq_init(void)
{
	int r;

	r = dm_cache_policy_register(&smq_policy_type);
	if (r) {
		DMERR("register failed %d", r);
		return -ENOMEM;
	}

	r = dm_cache_policy_register(&default_policy_type);
	if (r) {
		DMERR("register failed (as default) %d", r);
		dm_cache_policy_unregister(&smq_policy_type);
		return -ENOMEM;
	}

	return 0;
}

static void __exit smq_exit(void)
{
	dm_cache_policy_unregister(&smq_policy_type);
	dm_cache_policy_unregister(&default_policy_type);
}

module_init(smq_init);
module_exit(smq_exit);

MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("smq cache policy");

MODULE_ALIAS("dm-cache-default");