/*
 * Copyright (C) 2015 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-cache-policy.h"
#include "dm-cache-policy-internal.h"

#include <linux/hash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/math64.h>

#define DM_MSG_PREFIX "cache-policy-smq"
/*----------------------------------------------------------------*/

/*
 * Safe division functions that return zero on divide by zero.
 */
static unsigned safe_div(unsigned n, unsigned d)
{
	return d ? n / d : 0u;
}

static unsigned safe_mod(unsigned n, unsigned d)
{
	return d ? n % d : 0u;
}

/*----------------------------------------------------------------*/
struct entry {
	unsigned hash_next:28;
	unsigned prev:28;
	unsigned next:28;
	unsigned level:7;
	bool dirty:1;
	bool allocated:1;
	bool sentinel:1;

	dm_oblock_t oblock;
};

/*----------------------------------------------------------------*/

#define INDEXER_NULL ((1u << 28u) - 1u)
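
/*
 * Entries refer to each other by 28 bit indexes into an entry_space rather
 * than by pointer, which keeps the links in each entry small.  The maximum
 * 28 bit value, INDEXER_NULL, plays the role a NULL pointer would play in
 * a conventional linked list.
 */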
/*
 * An entry_space manages a set of entries that we use for the queues.
 * The clean and dirty queues share entries, so this object is separate
 * from the queue itself.
 */
struct entry_space {
	struct entry *begin;
	struct entry *end;
};

static int space_init(struct entry_space *es, unsigned nr_entries)
{
	if (!nr_entries) {
		es->begin = es->end = NULL;
		return 0;
	}

	es->begin = vzalloc(sizeof(struct entry) * nr_entries);
	if (!es->begin)
		return -ENOMEM;

	es->end = es->begin + nr_entries;
	return 0;
}

static void space_exit(struct entry_space *es)
{
	vfree(es->begin);
}

static struct entry *__get_entry(struct entry_space *es, unsigned block)
{
	struct entry *e;

	e = es->begin + block;
	BUG_ON(e >= es->end);

	return e;
}

static unsigned to_index(struct entry_space *es, struct entry *e)
{
	BUG_ON(e < es->begin || e >= es->end);
	return e - es->begin;
}

static struct entry *to_entry(struct entry_space *es, unsigned block)
{
	if (block == INDEXER_NULL)
		return NULL;

	return __get_entry(es, block);
}

/*----------------------------------------------------------------*/
struct ilist {
	unsigned nr_elts;	/* excluding sentinel entries */
	unsigned head, tail;
};

static void l_init(struct ilist *l)
{
	l->nr_elts = 0;
	l->head = l->tail = INDEXER_NULL;
}

static struct entry *l_head(struct entry_space *es, struct ilist *l)
{
	return to_entry(es, l->head);
}

static struct entry *l_tail(struct entry_space *es, struct ilist *l)
{
	return to_entry(es, l->tail);
}

static struct entry *l_next(struct entry_space *es, struct entry *e)
{
	return to_entry(es, e->next);
}

static struct entry *l_prev(struct entry_space *es, struct entry *e)
{
	return to_entry(es, e->prev);
}

static bool l_empty(struct ilist *l)
{
	return l->head == INDEXER_NULL;
}
static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *head = l_head(es, l);

	e->next = l->head;
	e->prev = INDEXER_NULL;

	if (head)
		head->prev = l->head = to_index(es, e);
	else
		l->head = l->tail = to_index(es, e);

	if (!e->sentinel)
		l->nr_elts++;
}

static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *tail = l_tail(es, l);

	e->next = INDEXER_NULL;
	e->prev = l->tail;

	if (tail)
		tail->next = l->tail = to_index(es, e);
	else
		l->head = l->tail = to_index(es, e);

	if (!e->sentinel)
		l->nr_elts++;
}

static void l_add_before(struct entry_space *es, struct ilist *l,
			 struct entry *old, struct entry *e)
{
	struct entry *prev = l_prev(es, old);

	if (!prev)
		l_add_head(es, l, e);

	else {
		e->prev = old->prev;
		e->next = to_index(es, old);
		prev->next = old->prev = to_index(es, e);

		if (!e->sentinel)
			l->nr_elts++;
	}
}

static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *prev = l_prev(es, e);
	struct entry *next = l_next(es, e);

	if (prev)
		prev->next = e->next;
	else
		l->head = e->next;

	if (next)
		next->prev = e->prev;
	else
		l->tail = e->prev;

	if (!e->sentinel)
		l->nr_elts--;
}

static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
{
	struct entry *e;

	for (e = l_tail(es, l); e; e = l_prev(es, e))
		if (!e->sentinel) {
			l_del(es, l, e);
			return e;
		}

	return NULL;
}
/*----------------------------------------------------------------*/

/*
 * The stochastic-multi-queue is a set of lru lists stacked into levels.
 * Entries are moved up levels when they are used, which loosely orders the
 * most accessed entries in the top levels and least in the bottom.  This
 * structure is *much* better than a single lru list.
 */
#define MAX_LEVELS 64u
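
/*
 * For example, an entry that keeps being hit is requeued a level or more
 * higher each time (see q_requeue() below) and so drifts towards the top
 * level, while entries that are never re-hit sink back down as
 * q_redistribute() rebalances the levels.  Eviction candidates are then
 * taken from the lowest populated levels (q_peek()).
 */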
struct queue {
	struct entry_space *es;

	unsigned nr_elts;
	unsigned nr_levels;
	struct ilist qs[MAX_LEVELS];

	/*
	 * We maintain a count of the number of entries we would like in each
	 * level.
	 */
	unsigned last_target_nr_elts;
	unsigned nr_top_levels;
	unsigned nr_in_top_levels;
	unsigned target_count[MAX_LEVELS];
};

static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
{
	unsigned i;

	q->es = es;
	q->nr_elts = 0;
	q->nr_levels = nr_levels;

	for (i = 0; i < q->nr_levels; i++) {
		l_init(q->qs + i);
		q->target_count[i] = 0u;
	}

	q->last_target_nr_elts = 0u;
	q->nr_top_levels = 0u;
	q->nr_in_top_levels = 0u;
}

static unsigned q_size(struct queue *q)
{
	return q->nr_elts;
}
/*
 * Insert an entry to the back of the given level.
 */
static void q_push(struct queue *q, struct entry *e)
{
	if (!e->sentinel)
		q->nr_elts++;

	l_add_tail(q->es, q->qs + e->level, e);
}

static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
{
	if (!e->sentinel)
		q->nr_elts++;

	l_add_before(q->es, q->qs + e->level, old, e);
}

static void q_del(struct queue *q, struct entry *e)
{
	l_del(q->es, q->qs + e->level, e);
	if (!e->sentinel)
		q->nr_elts--;
}

/*
 * Return the oldest entry of the lowest populated level.
 */
static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
{
	unsigned level;
	struct entry *e;

	max_level = min(max_level, q->nr_levels);

	for (level = 0; level < max_level; level++)
		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
			if (e->sentinel) {
				if (can_cross_sentinel)
					continue;
				else
					break;
			}

			return e;
		}

	return NULL;
}

static struct entry *q_pop(struct queue *q)
{
	struct entry *e = q_peek(q, q->nr_levels, true);

	if (e)
		q_del(q, e);

	return e;
}

/*
 * Pops an entry from a level that is not past a sentinel.
 */
static struct entry *q_pop_old(struct queue *q, unsigned max_level)
{
	struct entry *e = q_peek(q, max_level, false);

	if (e)
		q_del(q, e);

	return e;
}
/*
 * This function assumes there is a non-sentinel entry to pop.  It's only
 * used by redistribute, so we know this is true.  It also doesn't adjust
 * the q->nr_elts count.
 */
static struct entry *__redist_pop_from(struct queue *q, unsigned level)
{
	struct entry *e;

	for (; level < q->nr_levels; level++)
		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
			if (!e->sentinel) {
				l_del(q->es, q->qs + e->level, e);
				return e;
			}

	return NULL;
}
static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
{
	unsigned level, nr_levels, entries_per_level, remainder;

	BUG_ON(lbegin > lend);
	BUG_ON(lend > q->nr_levels);
	nr_levels = lend - lbegin;
	entries_per_level = safe_div(nr_elts, nr_levels);
	remainder = safe_mod(nr_elts, nr_levels);

	for (level = lbegin; level < lend; level++)
		q->target_count[level] =
			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
}
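
/*
 * For example, spreading nr_elts = 10 across the subrange [0, 4) gives
 * entries_per_level = 2 and remainder = 2, so the target counts for
 * levels 0..3 become 3, 3, 2, 2.
 */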
/*
 * Typically we have fewer elements in the top few levels which allows us
 * to adjust the promote threshold nicely.
 */
static void q_set_targets(struct queue *q)
{
	if (q->last_target_nr_elts == q->nr_elts)
		return;

	q->last_target_nr_elts = q->nr_elts;

	if (q->nr_top_levels > q->nr_levels)
		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);

	else {
		q_set_targets_subrange_(q, q->nr_in_top_levels,
					q->nr_levels - q->nr_top_levels, q->nr_levels);

		if (q->nr_in_top_levels < q->nr_elts)
			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
						0, q->nr_levels - q->nr_top_levels);
		else
			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
	}
}
static void q_redistribute(struct queue *q)
{
	unsigned target, level;
	struct ilist *l, *l_above;
	struct entry *e;

	q_set_targets(q);

	for (level = 0u; level < q->nr_levels - 1u; level++) {
		l = q->qs + level;
		target = q->target_count[level];

		/*
		 * Pull down some entries from the level above.
		 */
		while (l->nr_elts < target) {
			e = __redist_pop_from(q, level + 1u);
			if (!e) {
				/* bug in nr_elts */
				break;
			}

			e->level = level;
			l_add_tail(q->es, l, e);
		}

		/*
		 * Push some entries up.
		 */
		l_above = q->qs + level + 1u;
		while (l->nr_elts > target) {
			e = l_pop_tail(q->es, l);

			if (!e)
				/* bug in nr_elts */
				break;

			e->level = level + 1u;
			l_add_head(q->es, l_above, e);
		}
	}
}
static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
{
	struct entry *de;
	unsigned new_level;

	q_del(q, e);

	if (extra_levels && (e->level < q->nr_levels - 1u)) {
		new_level = min(q->nr_levels - 1u, e->level + extra_levels);
		for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
			if (de->sentinel)
				continue;

			q_del(q, de);
			de->level = e->level;

			if (dest)
				q_push_before(q, dest, de);
			else
				q_push(q, de);
			break;
		}

		e->level = new_level;
	}

	q_push(q, e);
}

static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
{
	q_requeue_before(q, NULL, e, extra_levels);
}

/*----------------------------------------------------------------*/
#define FP_SHIFT 8
#define SIXTEENTH (1u << (FP_SHIFT - 4u))
#define EIGHTH (1u << (FP_SHIFT - 3u))

struct stats {
	unsigned hit_threshold;
	unsigned hits;
	unsigned misses;
};

enum performance {
	Q_POOR,
	Q_FAIR,
	Q_WELL
};
static void stats_init(struct stats *s, unsigned nr_levels)
{
	s->hit_threshold = (nr_levels * 3u) / 4u;
	s->hits = 0u;
	s->misses = 0u;
}

static void stats_reset(struct stats *s)
{
	s->hits = s->misses = 0u;
}

static void stats_level_accessed(struct stats *s, unsigned level)
{
	if (level >= s->hit_threshold)
		s->hits++;
	else
		s->misses++;
}

static void stats_miss(struct stats *s)
{
	s->misses++;
}
/*
 * There are times when we don't have any confidence in the hotspot queue.
 * Such as when a fresh cache is created and the blocks have been spread
 * out across the levels, or if an io load changes.  We detect this by
 * seeing how often a lookup is in the top levels of the hotspot queue.
 */
static enum performance stats_assess(struct stats *s)
{
	unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);

	if (confidence < SIXTEENTH)
		return Q_POOR;

	else if (confidence < EIGHTH)
		return Q_FAIR;

	else
		return Q_WELL;
}
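
/*
 * confidence is a fixed point fraction of hits over total lookups.  For
 * example, with FP_SHIFT = 8, 1 hit and 31 misses gives (1 << 8) / 32 = 8,
 * which is below SIXTEENTH (16), so the queue is assessed as performing
 * poorly; 8 hits and 24 misses gives 64, above EIGHTH (32), so it is
 * assessed as performing well.
 */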
/*----------------------------------------------------------------*/
struct hash_table {
	struct entry_space *es;
	unsigned long long hash_bits;
	unsigned *buckets;
};

/*
 * All cache entries are stored in a chained hash table.  To save space we
 * use indexing again, and only store indexes to the next entry.
 */
static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
{
	unsigned i, nr_buckets;

	ht->es = es;
	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
	ht->hash_bits = ffs(nr_buckets) - 1;

	ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
	if (!ht->buckets)
		return -ENOMEM;

	for (i = 0; i < nr_buckets; i++)
		ht->buckets[i] = INDEXER_NULL;

	return 0;
}
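
/*
 * For example, with nr_entries = 1000 this gives
 * nr_buckets = roundup_pow_of_two(max(250, 16)) = 256 and hash_bits = 8,
 * i.e. roughly four entries per bucket when the table is full.
 */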
static void h_exit(struct hash_table *ht)
{
	vfree(ht->buckets);
}

static struct entry *h_head(struct hash_table *ht, unsigned bucket)
{
	return to_entry(ht->es, ht->buckets[bucket]);
}

static struct entry *h_next(struct hash_table *ht, struct entry *e)
{
	return to_entry(ht->es, e->hash_next);
}

static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
{
	e->hash_next = ht->buckets[bucket];
	ht->buckets[bucket] = to_index(ht->es, e);
}

static void h_insert(struct hash_table *ht, struct entry *e)
{
	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
	__h_insert(ht, h, e);
}

static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
				struct entry **prev)
{
	struct entry *e;

	*prev = NULL;
	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
		if (e->oblock == oblock)
			return e;

		*prev = e;
	}

	return NULL;
}
static void __h_unlink(struct hash_table *ht, unsigned h,
		       struct entry *e, struct entry *prev)
{
	if (prev)
		prev->hash_next = e->hash_next;
	else
		ht->buckets[h] = e->hash_next;
}

/*
 * Also moves each entry to the front of the bucket.
 */
static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
{
	struct entry *e, *prev;
	unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);

	e = __h_lookup(ht, h, oblock, &prev);
	if (e && prev) {
		/*
		 * Move to the front because this entry is likely
		 * to be hit again.
		 */
		__h_unlink(ht, h, e, prev);
		__h_insert(ht, h, e);
	}

	return e;
}

static void h_remove(struct hash_table *ht, struct entry *e)
{
	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
	struct entry *prev;

	/*
	 * The down side of using a singly linked list is we have to
	 * iterate the bucket to remove an item.
	 */
	e = __h_lookup(ht, h, e->oblock, &prev);
	if (e)
		__h_unlink(ht, h, e, prev);
}
/*----------------------------------------------------------------*/

struct entry_alloc {
	struct entry_space *es;
	unsigned begin;

	unsigned nr_allocated;
	struct ilist free;
};

static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
			   unsigned begin, unsigned end)
{
	unsigned i;

	ea->es = es;
	ea->nr_allocated = 0u;
	ea->begin = begin;

	l_init(&ea->free);
	for (i = begin; i != end; i++)
		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
}
static void init_entry(struct entry *e)
{
	/*
	 * We can't memset because that would clear the hotspot and
	 * sentinel bits which remain constant.
	 */
	e->hash_next = INDEXER_NULL;
	e->next = INDEXER_NULL;
	e->prev = INDEXER_NULL;
	e->level = 0u;
	e->dirty = true;	/* FIXME: audit */
	e->allocated = true;
}
static struct entry *alloc_entry(struct entry_alloc *ea)
{
	struct entry *e;

	if (l_empty(&ea->free))
		return NULL;

	e = l_pop_tail(ea->es, &ea->free);
	init_entry(e);
	ea->nr_allocated++;

	return e;
}

/*
 * This assumes the cblock hasn't already been allocated.
 */
static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
{
	struct entry *e = __get_entry(ea->es, ea->begin + i);

	BUG_ON(e->allocated);

	l_del(ea->es, &ea->free, e);
	init_entry(e);
	ea->nr_allocated++;

	return e;
}

static void free_entry(struct entry_alloc *ea, struct entry *e)
{
	BUG_ON(!ea->nr_allocated);
	BUG_ON(!e->allocated);

	ea->nr_allocated--;
	e->allocated = false;
	l_add_tail(ea->es, &ea->free, e);
}

static bool allocator_empty(struct entry_alloc *ea)
{
	return l_empty(&ea->free);
}

static unsigned get_index(struct entry_alloc *ea, struct entry *e)
{
	return to_index(ea->es, e) - ea->begin;
}

static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
{
	return __get_entry(ea->es, ea->begin + index);
}

/*----------------------------------------------------------------*/
#define NR_HOTSPOT_LEVELS 64u
#define NR_CACHE_LEVELS 64u

#define WRITEBACK_PERIOD (10 * HZ)
#define DEMOTE_PERIOD (60 * HZ)

#define HOTSPOT_UPDATE_PERIOD (HZ)
#define CACHE_UPDATE_PERIOD (10u * HZ)
struct smq_policy {
	struct dm_cache_policy policy;

	/* protects everything */
	spinlock_t lock;
	dm_cblock_t cache_size;
	sector_t cache_block_size;

	sector_t hotspot_block_size;
	unsigned nr_hotspot_blocks;
	unsigned cache_blocks_per_hotspot_block;
	unsigned hotspot_level_jump;

	struct entry_space es;
	struct entry_alloc writeback_sentinel_alloc;
	struct entry_alloc demote_sentinel_alloc;
	struct entry_alloc hotspot_alloc;
	struct entry_alloc cache_alloc;

	unsigned long *hotspot_hit_bits;
	unsigned long *cache_hit_bits;

	/*
	 * We maintain three queues of entries.  The cache proper,
	 * consisting of a clean and dirty queue, containing the currently
	 * active mappings.  The hotspot queue uses a larger block size to
	 * track blocks that are being hit frequently and potential
	 * candidates for promotion to the cache.
	 */
	struct queue hotspot;
	struct queue clean;
	struct queue dirty;

	struct stats hotspot_stats;
	struct stats cache_stats;

	/*
	 * Keeps track of time, incremented by the core.  We use this to
	 * avoid attributing multiple hits within the same tick.
	 */
	unsigned tick;

	/*
	 * The hash tables allow us to quickly find an entry by origin
	 * block.
	 */
	struct hash_table table;
	struct hash_table hotspot_table;

	bool current_writeback_sentinels;
	unsigned long next_writeback_period;

	bool current_demote_sentinels;
	unsigned long next_demote_period;

	unsigned write_promote_level;
	unsigned read_promote_level;

	unsigned long next_hotspot_period;
	unsigned long next_cache_period;
};

/*----------------------------------------------------------------*/
static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
{
	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
}

static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
{
	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
}

static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
{
	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
}
*mq
)
853 struct queue
*q
= &mq
->dirty
;
854 struct entry
*sentinel
;
856 for (level
= 0; level
< q
->nr_levels
; level
++) {
857 sentinel
= writeback_sentinel(mq
, level
);
863 static void __update_demote_sentinels(struct smq_policy
*mq
)
866 struct queue
*q
= &mq
->clean
;
867 struct entry
*sentinel
;
869 for (level
= 0; level
< q
->nr_levels
; level
++) {
870 sentinel
= demote_sentinel(mq
, level
);
static void update_sentinels(struct smq_policy *mq)
{
	if (time_after(jiffies, mq->next_writeback_period)) {
		__update_writeback_sentinels(mq);
		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
	}

	if (time_after(jiffies, mq->next_demote_period)) {
		__update_demote_sentinels(mq);
		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
		mq->current_demote_sentinels = !mq->current_demote_sentinels;
	}
}

static void __sentinels_init(struct smq_policy *mq)
{
	unsigned level;
	struct entry *sentinel;

	for (level = 0; level < NR_CACHE_LEVELS; level++) {
		sentinel = writeback_sentinel(mq, level);
		sentinel->level = level;
		q_push(&mq->dirty, sentinel);

		sentinel = demote_sentinel(mq, level);
		sentinel->level = level;
		q_push(&mq->clean, sentinel);
	}
}

static void sentinels_init(struct smq_policy *mq)
{
	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
	mq->next_demote_period = jiffies + DEMOTE_PERIOD;

	mq->current_writeback_sentinels = false;
	mq->current_demote_sentinels = false;
	__sentinels_init(mq);

	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
	mq->current_demote_sentinels = !mq->current_demote_sentinels;
	__sentinels_init(mq);
}

/*----------------------------------------------------------------*/
/*
 * These methods tie together the dirty queue, clean queue and hash table.
 */
static void push_new(struct smq_policy *mq, struct entry *e)
{
	struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
	h_insert(&mq->table, e);
	q_push(q, e);
}

static void push(struct smq_policy *mq, struct entry *e)
{
	struct entry *sentinel;

	h_insert(&mq->table, e);

	/*
	 * Punch this into the queue just in front of the sentinel, to
	 * ensure it's cleaned straight away.
	 */
	if (e->dirty) {
		sentinel = writeback_sentinel(mq, e->level);
		q_push_before(&mq->dirty, sentinel, e);
	} else {
		sentinel = demote_sentinel(mq, e->level);
		q_push_before(&mq->clean, sentinel, e);
	}
}

/*
 * Removes an entry from cache.  Removes from the hash table.
 */
static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
{
	q_del(q, e);
	h_remove(&mq->table, e);
}

static void del(struct smq_policy *mq, struct entry *e)
{
	__del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
}
static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
{
	struct entry *e = q_pop_old(q, max_level);

	if (e)
		h_remove(&mq->table, e);

	return e;
}

static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
{
	return to_cblock(get_index(&mq->cache_alloc, e));
}

static void requeue(struct smq_policy *mq, struct entry *e)
{
	struct entry *sentinel;

	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
		if (e->dirty) {
			sentinel = writeback_sentinel(mq, e->level);
			q_requeue_before(&mq->dirty, sentinel, e, 1u);
		} else {
			sentinel = demote_sentinel(mq, e->level);
			q_requeue_before(&mq->clean, sentinel, e, 1u);
		}
	}
}
static unsigned default_promote_level(struct smq_policy *mq)
{
	/*
	 * The promote level depends on the current performance of the
	 * cache.
	 *
	 * If the cache is performing badly, then we can't afford
	 * to promote much without causing performance to drop below that
	 * of the origin device.
	 *
	 * If the cache is performing well, then we don't need to promote
	 * much.  If it isn't broken, don't fix it.
	 *
	 * If the cache is middling then we promote more.
	 *
	 * This scheme reminds me of a graph of entropy vs probability of a
	 * binary variable.
	 */
	static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};

	unsigned hits = mq->cache_stats.hits;
	unsigned misses = mq->cache_stats.misses;
	unsigned index = safe_div(hits << 4u, hits + misses);
	return table[index];
}
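
/*
 * For example, an all-miss or all-hit history gives index 0 or 16
 * respectively, selecting 1 from either end of the table, while an equal
 * mix of hits and misses gives index 8, selecting 7, close to the peak of
 * the table.
 */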
static void update_promote_levels(struct smq_policy *mq)
{
	/*
	 * If there are unused cache entries then we want to be really
	 * eager to promote.
	 */
	unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);

	/*
	 * If the hotspot queue is performing badly then we have little
	 * confidence that we know which blocks to promote.  So we cut down
	 * the amount of promotions.
	 */
	switch (stats_assess(&mq->hotspot_stats)) {
	case Q_POOR:
		threshold_level /= 4u;
		break;

	case Q_FAIR:
		threshold_level /= 2u;
		break;

	case Q_WELL:
		break;
	}

	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
}
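
/*
 * e.g. with threshold_level = 8 a hotspot entry must have reached level
 * 56 (NR_HOTSPOT_LEVELS - 8) before a read will promote it, and level 58
 * before a write will (see should_promote() below).
 */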
/*
 * If the hotspot queue is performing badly, then we try and move entries
 * around more quickly.
 */
static void update_level_jump(struct smq_policy *mq)
{
	switch (stats_assess(&mq->hotspot_stats)) {
	case Q_POOR:
		mq->hotspot_level_jump = 4u;
		break;

	case Q_FAIR:
		mq->hotspot_level_jump = 2u;
		break;

	case Q_WELL:
		mq->hotspot_level_jump = 1u;
		break;
	}
}

static void end_hotspot_period(struct smq_policy *mq)
{
	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
	update_promote_levels(mq);

	if (time_after(jiffies, mq->next_hotspot_period)) {
		update_level_jump(mq);
		q_redistribute(&mq->hotspot);
		stats_reset(&mq->hotspot_stats);
		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
	}
}

static void end_cache_period(struct smq_policy *mq)
{
	if (time_after(jiffies, mq->next_cache_period)) {
		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));

		q_redistribute(&mq->dirty);
		q_redistribute(&mq->clean);
		stats_reset(&mq->cache_stats);

		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
	}
}
static int demote_cblock(struct smq_policy *mq,
			 struct policy_locker *locker,
			 dm_oblock_t *oblock)
{
	struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
	if (!demoted)
		/*
		 * We could get a block from mq->dirty, but that
		 * would add extra latency to the triggering bio as it
		 * waits for the writeback.  Better to not promote this
		 * time and hope there's a clean block next time this block
		 * is hit.
		 */
		return -ENOSPC;

	if (locker->fn(locker, demoted->oblock))
		/*
		 * We couldn't lock this block.
		 */
		return -EBUSY;

	del(mq, demoted);
	*oblock = demoted->oblock;
	free_entry(&mq->cache_alloc, demoted);

	return 0;
}

enum promote_result {
	PROMOTE_NOT,
	PROMOTE_TEMPORARY,
	PROMOTE_PERMANENT
};

/*
 * Converts a boolean into a promote result.
 */
static enum promote_result maybe_promote(bool promote)
{
	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
}
static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
					  bool fast_promote)
{
	if (bio_data_dir(bio) == WRITE) {
		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
			return PROMOTE_TEMPORARY;

		else
			return maybe_promote(hs_e->level >= mq->write_promote_level);
	} else
		return maybe_promote(hs_e->level >= mq->read_promote_level);
}

static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
			    struct policy_locker *locker,
			    struct policy_result *result, enum promote_result pr)
{
	int r;
	struct entry *e;

	if (allocator_empty(&mq->cache_alloc)) {
		result->op = POLICY_REPLACE;
		r = demote_cblock(mq, locker, &result->old_oblock);
		if (r) {
			result->op = POLICY_MISS;
			return;
		}

	} else
		result->op = POLICY_NEW;

	e = alloc_entry(&mq->cache_alloc);
	BUG_ON(!e);
	e->oblock = oblock;

	if (pr == PROMOTE_TEMPORARY)
		push(mq, e);
	else
		push_new(mq, e);

	result->cblock = infer_cblock(mq, e);
}
static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
{
	sector_t r = from_oblock(b);
	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
	return to_oblock(r);
}
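
/*
 * e.g. with cache_blocks_per_hotspot_block = 16, origin blocks 0..15 all
 * map to hotspot block 0, blocks 16..31 to hotspot block 1, and so on.
 */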
static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
{
	unsigned hi;
	dm_oblock_t hb = to_hblock(mq, b);
	struct entry *e = h_lookup(&mq->hotspot_table, hb);

	if (e) {
		stats_level_accessed(&mq->hotspot_stats, e->level);

		hi = get_index(&mq->hotspot_alloc, e);
		q_requeue(&mq->hotspot, e,
			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
			  0u : mq->hotspot_level_jump);

	} else {
		stats_miss(&mq->hotspot_stats);

		e = alloc_entry(&mq->hotspot_alloc);
		if (!e) {
			e = q_pop(&mq->hotspot);
			if (e) {
				h_remove(&mq->hotspot_table, e);
				hi = get_index(&mq->hotspot_alloc, e);
				clear_bit(hi, mq->hotspot_hit_bits);
			}
		}

		if (e) {
			e->oblock = hb;
			q_push(&mq->hotspot, e);
			h_insert(&mq->hotspot_table, e);
		}
	}

	return e;
}
/*
 * Looks the oblock up in the hash table, then decides whether to put in
 * pre_cache, or cache etc.
 */
static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
	       bool can_migrate, bool fast_promote,
	       struct policy_locker *locker, struct policy_result *result)
{
	struct entry *e, *hs_e;
	enum promote_result pr;

	hs_e = update_hotspot_queue(mq, oblock, bio);

	e = h_lookup(&mq->table, oblock);
	if (e) {
		stats_level_accessed(&mq->cache_stats, e->level);

		requeue(mq, e);
		result->op = POLICY_HIT;
		result->cblock = infer_cblock(mq, e);

	} else {
		stats_miss(&mq->cache_stats);

		pr = should_promote(mq, hs_e, bio, fast_promote);
		if (pr == PROMOTE_NOT)
			result->op = POLICY_MISS;

		else {
			if (!can_migrate) {
				result->op = POLICY_MISS;
				return -EWOULDBLOCK;
			}

			insert_in_cache(mq, oblock, locker, result, pr);
		}
	}

	return 0;
}

/*----------------------------------------------------------------*/
/*
 * Public interface, via the policy struct.  See dm-cache-policy.h for a
 * description of these.
 */

static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
{
	return container_of(p, struct smq_policy, policy);
}

static void smq_destroy(struct dm_cache_policy *p)
{
	struct smq_policy *mq = to_smq_policy(p);

	h_exit(&mq->hotspot_table);
	h_exit(&mq->table);
	free_bitset(mq->hotspot_hit_bits);
	free_bitset(mq->cache_hit_bits);
	space_exit(&mq->es);
	kfree(mq);
}
static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
		   bool can_block, bool can_migrate, bool fast_promote,
		   struct bio *bio, struct policy_locker *locker,
		   struct policy_result *result)
{
	int r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	result->op = POLICY_MISS;

	spin_lock_irqsave(&mq->lock, flags);
	r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
{
	int r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e;

	spin_lock_irqsave(&mq->lock, flags);
	e = h_lookup(&mq->table, oblock);
	if (e) {
		*cblock = infer_cblock(mq, e);
		r = 0;
	} else
		r = -ENOENT;
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}
static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
{
	struct entry *e;

	e = h_lookup(&mq->table, oblock);
	BUG_ON(!e);

	del(mq, e);
	e->dirty = set;
	push(mq, e);
}

static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	__smq_set_clear_dirty(mq, oblock, true);
	spin_unlock_irqrestore(&mq->lock, flags);
}

static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->lock, flags);
	__smq_set_clear_dirty(mq, oblock, false);
	spin_unlock_irqrestore(&mq->lock, flags);
}
static int smq_load_mapping(struct dm_cache_policy *p,
			    dm_oblock_t oblock, dm_cblock_t cblock,
			    uint32_t hint, bool hint_valid)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e;

	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
	e->oblock = oblock;
	e->dirty = false;	/* this gets corrected in a minute */
	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1;
	push(mq, e);

	return 0;
}

static int smq_save_hints(struct smq_policy *mq, struct queue *q,
			  policy_walk_fn fn, void *context)
{
	int r;
	unsigned level;
	struct entry *e;

	for (level = 0; level < q->nr_levels; level++)
		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
			if (!e->sentinel) {
				r = fn(context, infer_cblock(mq, e),
				       e->oblock, e->level);
				if (r)
					return r;
			}
		}

	return 0;
}
static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
			     void *context)
{
	struct smq_policy *mq = to_smq_policy(p);
	int r = 0;

	/*
	 * We don't need to lock here since this method is only called once
	 * the IO has stopped.
	 */
	r = smq_save_hints(mq, &mq->clean, fn, context);
	if (!r)
		r = smq_save_hints(mq, &mq->dirty, fn, context);

	return r;
}

static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
{
	struct entry *e;

	e = h_lookup(&mq->table, oblock);
	BUG_ON(!e);

	del(mq, e);
	free_entry(&mq->cache_alloc, e);
}

static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->lock, flags);
	__remove_mapping(mq, oblock);
	spin_unlock_irqrestore(&mq->lock, flags);
}

static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
{
	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

	if (!e || !e->allocated)
		return -ENODATA;

	del(mq, e);
	free_entry(&mq->cache_alloc, e);

	return 0;
}
static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	int r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = __remove_cblock(mq, cblock);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

#define CLEAN_TARGET_CRITICAL 5u /* percent */

static bool clean_target_met(struct smq_policy *mq, bool critical)
{
	if (critical) {
		/*
		 * Cache entries may not be populated.  So we cannot rely on
		 * the size of the clean queue.
		 */
		unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
		unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;

		return nr_clean >= target;
	} else
		return !q_size(&mq->dirty);
}
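
/*
 * e.g. with a cache of 1000 blocks the critical target is met once at
 * least 50 blocks (5%) are clean; outside of the critical case the target
 * is only met when there are no dirty blocks left at all.
 */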
static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
				dm_cblock_t *cblock, bool critical_only)
{
	struct entry *e = NULL;
	bool target_met = clean_target_met(mq, critical_only);

	if (critical_only)
		/*
		 * Always try and keep the bottom level clean.
		 */
		e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
	else
		e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);

	if (!e)
		return -ENODATA;

	*oblock = e->oblock;
	*cblock = infer_cblock(mq, e);
	e->dirty = false;
	push_new(mq, e);

	return 0;
}

static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
			      dm_cblock_t *cblock, bool critical_only)
{
	int r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = __smq_writeback_work(mq, oblock, cblock, critical_only);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}
static void __force_mapping(struct smq_policy *mq,
			    dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
	struct entry *e = h_lookup(&mq->table, current_oblock);

	if (e) {
		del(mq, e);
		e->oblock = new_oblock;
		e->dirty = true;
		push(mq, e);
	}
}

static void smq_force_mapping(struct dm_cache_policy *p,
			      dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	__force_mapping(mq, current_oblock, new_oblock);
	spin_unlock_irqrestore(&mq->lock, flags);
}

static dm_cblock_t smq_residency(struct dm_cache_policy *p)
{
	dm_cblock_t r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = to_cblock(mq->cache_alloc.nr_allocated);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}
static void smq_tick(struct dm_cache_policy *p, bool can_block)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->lock, flags);
	mq->tick++;
	update_sentinels(mq);
	end_hotspot_period(mq);
	end_cache_period(mq);
	spin_unlock_irqrestore(&mq->lock, flags);
}

/* Init the policy plugin interface function pointers. */
static void init_policy_functions(struct smq_policy *mq)
{
	mq->policy.destroy = smq_destroy;
	mq->policy.map = smq_map;
	mq->policy.lookup = smq_lookup;
	mq->policy.set_dirty = smq_set_dirty;
	mq->policy.clear_dirty = smq_clear_dirty;
	mq->policy.load_mapping = smq_load_mapping;
	mq->policy.walk_mappings = smq_walk_mappings;
	mq->policy.remove_mapping = smq_remove_mapping;
	mq->policy.remove_cblock = smq_remove_cblock;
	mq->policy.writeback_work = smq_writeback_work;
	mq->policy.force_mapping = smq_force_mapping;
	mq->policy.residency = smq_residency;
	mq->policy.tick = smq_tick;
}
static bool too_many_hotspot_blocks(sector_t origin_size,
				    sector_t hotspot_block_size,
				    unsigned nr_hotspot_blocks)
{
	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
}

static void calc_hotspot_params(sector_t origin_size,
				sector_t cache_block_size,
				unsigned nr_cache_blocks,
				sector_t *hotspot_block_size,
				unsigned *nr_hotspot_blocks)
{
	*hotspot_block_size = cache_block_size * 16u;
	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);

	while ((*hotspot_block_size > cache_block_size) &&
	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
		*hotspot_block_size /= 2u;
}
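
/*
 * e.g. a cache of 65536 blocks of 128 sectors (64KiB) fronting a 1TiB
 * origin starts with a hotspot block of 16 cache blocks (1MiB) and
 * max(65536 / 4, 1024) = 16384 hotspot blocks; 16384 * 1MiB covers only
 * 16GiB of origin, so the loop above never halves the hotspot block size.
 */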
static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
					  sector_t origin_size,
					  sector_t cache_block_size)
{
	unsigned i;
	unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
	unsigned total_sentinels = 2u * nr_sentinels_per_queue;
	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);

	if (!mq)
		return NULL;

	init_policy_functions(mq);
	mq->cache_size = cache_size;
	mq->cache_block_size = cache_block_size;

	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);

	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
	mq->hotspot_level_jump = 1u;
	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
		DMERR("couldn't initialize entry space");
		goto bad_pool_init;
	}

	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
	for (i = 0; i < nr_sentinels_per_queue; i++)
		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;

	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
	for (i = 0; i < nr_sentinels_per_queue; i++)
		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;

	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
		       total_sentinels + mq->nr_hotspot_blocks);

	init_allocator(&mq->cache_alloc, &mq->es,
		       total_sentinels + mq->nr_hotspot_blocks,
		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));

	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
	if (!mq->hotspot_hit_bits) {
		DMERR("couldn't allocate hotspot hit bitset");
		goto bad_hotspot_hit_bits;
	}
	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);

	if (from_cblock(cache_size)) {
		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
		if (!mq->cache_hit_bits) {
			DMERR("couldn't allocate cache hit bitset");
			goto bad_cache_hit_bits;
		}
		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
	} else
		mq->cache_hit_bits = NULL;

	mq->tick = 0;
	spin_lock_init(&mq->lock);

	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
	mq->hotspot.nr_top_levels = 8;
	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);

	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);

	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);

	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
		goto bad_alloc_table;

	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
		goto bad_alloc_hotspot_table;

	sentinels_init(mq);
	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;

	mq->next_hotspot_period = jiffies;
	mq->next_cache_period = jiffies;

	return &mq->policy;

bad_alloc_hotspot_table:
	h_exit(&mq->table);
bad_alloc_table:
	free_bitset(mq->cache_hit_bits);
bad_cache_hit_bits:
	free_bitset(mq->hotspot_hit_bits);
bad_hotspot_hit_bits:
	space_exit(&mq->es);
bad_pool_init:
	kfree(mq);

	return NULL;
}
/*----------------------------------------------------------------*/

static struct dm_cache_policy_type smq_policy_type = {
	.name = "smq",
	.version = {1, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = smq_create
};

static struct dm_cache_policy_type default_policy_type = {
	.name = "default",
	.version = {1, 4, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = smq_create,
	.real = &smq_policy_type
};
static int __init smq_init(void)
{
	int r;

	r = dm_cache_policy_register(&smq_policy_type);
	if (r) {
		DMERR("register failed %d", r);
		return -ENOMEM;
	}

	r = dm_cache_policy_register(&default_policy_type);
	if (r) {
		DMERR("register failed (as default) %d", r);
		dm_cache_policy_unregister(&smq_policy_type);
		return -ENOMEM;
	}

	return 0;
}

static void __exit smq_exit(void)
{
	dm_cache_policy_unregister(&smq_policy_type);
	dm_cache_policy_unregister(&default_policy_type);
}

module_init(smq_init);
module_exit(smq_exit);
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("smq cache policy");

MODULE_ALIAS("dm-cache-default");