// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "sparse-cache.h"

#include <linux/cache.h>
#include <linux/delay.h>
#include <linux/dm-bufio.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

#include "chapter-index.h"
#include "config.h"
#include "index.h"

/*
 * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a
 * specific virtual chapter is implemented as a linear search. The cache replacement policy is
 * least-recently-used (LRU). Again, the small size of the cache allows the LRU order to be
 * maintained by shifting entries in an array list.
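 *
 * For illustration (a hypothetical cache state): if the search list orders chapters as
 * [12, 9, 14, 7] and chapter 14 is then accessed, the two entries ahead of it shift down one slot
 * and 14 moves to the front, giving [14, 12, 9, 7]. With only a handful of entries, this shift is
 * cheaper than maintaining linked-list pointers.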
 *
 * Changing the contents of the cache requires the coordinated participation of all zone threads
 * via the careful use of barrier messages sent to all the index zones by the triage queue worker
 * thread. The critical invariant for coordination is that the cache membership must not change
 * between updates, so that all calls to uds_sparse_cache_contains() from the zone threads must
 * receive the same results for every virtual chapter number. To ensure that critical invariant,
 * state changes such as "that virtual chapter is no longer in the volume" and "skip searching that
 * chapter because it has had too many cache misses" are represented separately from the cache
 * membership information (the virtual chapter number).
 *
 * As a result of this invariant, we have the guarantee that every zone thread will call
 * uds_update_sparse_cache() once and exactly once to request a chapter that is not in the cache,
 * and the serialization of the barrier requests from the triage queue ensures they will all
 * request the same chapter number. This means the only synchronization we need can be provided by
 * a pair of thread barriers used only in the uds_update_sparse_cache() call, providing a critical
 * section where a single zone thread can drive the cache update while all the other zone threads
 * are known to be blocked, waiting in the second barrier. Outside that critical section, all the
 * zone threads implicitly hold a shared lock. Inside it, the thread for zone zero holds an
 * exclusive lock. No other threads may access or modify the cache entries.
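 *
 * A sketch of one update cycle (a zone count of three is hypothetical):
 *
 *   zone 0:      enter begin barrier; purge list; load chapter; copy lists; enter end barrier
 *   zones 1, 2:  enter begin barrier; enter end barrier and block until zone 0 arrives
 *
 * Between the two barriers only the zone zero thread runs, so it can modify shared cache state
 * without any locking.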
 *
 * Chapter statistics must only be modified by a single thread, which is also the zone zero thread.
 * All fields that might be frequently updated by that thread are kept in separate cache-aligned
 * structures so they will not cause cache contention via "false sharing" with the fields that are
 * frequently accessed by all of the zone threads.
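 *
 * (See struct cached_index_counters below: the frequently written consecutive_misses counter is
 * kept in its own __aligned(L1_CACHE_BYTES) structure, so updating it does not invalidate the
 * cache lines holding the fields read by every zone thread.)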
 *
 * The LRU order is managed independently by each zone thread, and each zone uses its own list for
 * searching and cache membership queries. The zone zero list is used to decide which chapter to
 * evict when the cache is updated, and its search list is copied to the other threads at that
 * time.
 *
 * The virtual chapter number field of the cache entry is the single field indicating whether a
 * chapter is a member of the cache or not. The value NO_CHAPTER is used to represent a null or
 * undefined chapter number. When present in the virtual chapter number field of a
 * cached_chapter_index, it indicates that the cache entry is dead, and all the other fields of
 * that entry (other than immutable pointers to cache memory) are undefined and irrelevant. Any
 * cache entry that is not marked as dead is fully defined and a member of the cache, and
 * uds_sparse_cache_contains() will always return true for any virtual chapter number that appears
 * in any of the cache entries.
 *
 * A chapter index that is a member of the cache may be excluded from searches between calls to
 * uds_update_sparse_cache() in two different ways. First, when a chapter falls off the end of the
 * volume, its virtual chapter number will be less than the oldest virtual chapter number. Since
 * that chapter is no longer part of the volume, there's no point in continuing to search that
 * chapter index. Once invalidated, that virtual chapter will still be considered a member of the
 * cache, but it will no longer be searched for matching names.
 *
 * The second mechanism is a heuristic based on keeping track of the number of consecutive search
 * misses in a given chapter index. Once that count exceeds a threshold, the skip_search flag will
 * be set to true, causing the chapter to be skipped when searching the entire cache, but still
 * allowing it to be found when searching for a hook in that specific chapter. Finding a hook will
 * clear the skip_search flag, once again allowing the non-hook searches to use that cache entry.
 * Again, regardless of the state of the skip_search flag, the virtual chapter must still be
 * considered to be a member of the cache for uds_sparse_cache_contains().
 */

#define SKIP_SEARCH_THRESHOLD 20000
#define ZONE_ZERO 0

/*
 * These counters are essentially fields of the struct cached_chapter_index, but are segregated
 * into this structure because they are frequently modified. They are grouped and aligned to keep
 * them on different cache lines from the chapter fields that are accessed far more often than they
 * are updated.
 */
struct __aligned(L1_CACHE_BYTES) cached_index_counters {
	u64 consecutive_misses;
};

struct __aligned(L1_CACHE_BYTES) cached_chapter_index {
	/*
	 * The virtual chapter number of the cached chapter index. NO_CHAPTER means this cache
	 * entry is unused. This field must only be modified in the critical section in
	 * uds_update_sparse_cache().
	 */
	u64 virtual_chapter;

	u32 index_pages_count;

	/*
	 * These pointers are immutable during the life of the cache. The contents of the arrays
	 * change when the cache entry is replaced.
	 */
	struct delta_index_page *index_pages;
	struct dm_buffer **page_buffers;

	/*
	 * If set, skip the chapter when searching the entire cache. This flag is just a
	 * performance optimization. This flag is mutable between cache updates, but it rarely
	 * changes and is frequently accessed, so it groups with the immutable fields.
	 */
	bool skip_search;

	/*
	 * The cache-aligned counters change often and are placed at the end of the structure to
	 * prevent false sharing with the more stable fields above.
	 */
	struct cached_index_counters counters;
};

/*
 * A search_list represents an ordering of the sparse chapter index cache entry array, from most
 * recently accessed to least recently accessed, which is the order in which the indexes should be
 * searched and the reverse order in which they should be evicted from the cache.
 *
 * Cache entries that are dead or empty are kept at the end of the list, avoiding the need to even
 * iterate over them to search, and ensuring that dead entries are replaced before any live entries
 * are evicted.
 *
 * The search list is instantiated for each zone thread, avoiding any need for synchronization. The
 * structure is allocated on a cache boundary to avoid false sharing of memory cache lines between
 * zone threads.
 */
struct search_list {
	u8 capacity;
	u8 first_dead_entry;
	struct cached_chapter_index *entries[];
};

struct threads_barrier {
	/* Lock for this barrier object */
	struct semaphore lock;
	/* Semaphore for threads waiting at this barrier */
	struct semaphore wait;
	/* Number of threads which have arrived */
	int arrived;
	/* Total number of threads using this barrier */
	int thread_count;
};

struct sparse_cache {
	const struct index_geometry *geometry;
	unsigned int capacity;
	unsigned int zone_count;

	unsigned int skip_threshold;
	struct search_list *search_lists[MAX_ZONES];
	struct cached_chapter_index **scratch_entries;

	struct threads_barrier begin_update_barrier;
	struct threads_barrier end_update_barrier;

	struct cached_chapter_index chapters[];
};
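
/*
 * Note that chapters[] is a flexible array member: a cache with capacity N is allocated as one
 * cache-aligned block of sizeof(struct sparse_cache) + N * sizeof(struct cached_chapter_index)
 * bytes, exactly the size computed in uds_make_sparse_cache() below.
 */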

static void initialize_threads_barrier(struct threads_barrier *barrier,
				       unsigned int thread_count)
{
	sema_init(&barrier->lock, 1);
	barrier->arrived = 0;
	barrier->thread_count = thread_count;
	sema_init(&barrier->wait, 0);
}

static inline void __down(struct semaphore *semaphore)
{
	/*
	 * Do not use down(semaphore). Instead use down_interruptible so that
	 * we do not get 120 second stall messages in kern.log.
	 */
	while (down_interruptible(semaphore) != 0) {
		/*
		 * If we're called from a user-mode process (e.g., "dmsetup
		 * remove") while waiting for an operation that may take a
		 * while (e.g., UDS index save), and a signal is sent (SIGINT,
		 * SIGUSR2), then down_interruptible will not block. If that
		 * happens, sleep briefly to avoid keeping the CPU locked up in
		 * this loop. We could just call cond_resched, but then we'd
		 * still keep consuming CPU time slices and swamp other threads
		 * trying to do computational work.
		 */
		fsleep(1000);
	}
}

static void enter_threads_barrier(struct threads_barrier *barrier)
{
	__down(&barrier->lock);
	if (++barrier->arrived == barrier->thread_count) {
		/* The last thread to arrive releases all of the waiters. */
		int i;

		for (i = 1; i < barrier->thread_count; i++)
			up(&barrier->wait);

		barrier->arrived = 0;
		up(&barrier->lock);
	} else {
		up(&barrier->lock);
		__down(&barrier->wait);
	}
}
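
/*
 * Usage sketch (a thread count of three is hypothetical): the first two callers of
 * enter_threads_barrier() release the lock and block in __down(&barrier->wait); the third caller
 * posts the wait semaphore twice and resets arrived to zero, so all three threads continue and
 * the barrier is immediately reusable for the next update cycle.
 */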

static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter,
							 const struct index_geometry *geometry)
{
	int result;

	chapter->virtual_chapter = NO_CHAPTER;
	chapter->index_pages_count = geometry->index_pages_per_chapter;

	result = vdo_allocate(chapter->index_pages_count, struct delta_index_page,
			      __func__, &chapter->index_pages);
	if (result != VDO_SUCCESS)
		return result;

	return vdo_allocate(chapter->index_pages_count, struct dm_buffer *,
			    "sparse index volume pages", &chapter->page_buffers);
}

static int __must_check make_search_list(struct sparse_cache *cache,
					 struct search_list **list_ptr)
{
	struct search_list *list;
	unsigned int bytes;
	u8 i;
	int result;

	bytes = (sizeof(struct search_list) +
		 (cache->capacity * sizeof(struct cached_chapter_index *)));
	result = vdo_allocate_cache_aligned(bytes, "search list", &list);
	if (result != VDO_SUCCESS)
		return result;

	list->capacity = cache->capacity;
	list->first_dead_entry = 0;

	/* Initially, the search list references the chapter array in index order. */
	for (i = 0; i < list->capacity; i++)
		list->entries[i] = &cache->chapters[i];

	*list_ptr = list;
	return UDS_SUCCESS;
}

int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int capacity,
			  unsigned int zone_count, struct sparse_cache **cache_ptr)
{
	unsigned int i;
	struct sparse_cache *cache;
	unsigned int bytes;
	int result;

	bytes = (sizeof(struct sparse_cache) + (capacity * sizeof(struct cached_chapter_index)));
	result = vdo_allocate_cache_aligned(bytes, "sparse cache", &cache);
	if (result != VDO_SUCCESS)
		return result;

	cache->geometry = geometry;
	cache->capacity = capacity;
	cache->zone_count = zone_count;

	/*
	 * Scale down the skip threshold since the cache only counts cache misses in zone zero, but
	 * requests are being handled in all zones.
	 */
	cache->skip_threshold = (SKIP_SEARCH_THRESHOLD / zone_count);

	initialize_threads_barrier(&cache->begin_update_barrier, zone_count);
	initialize_threads_barrier(&cache->end_update_barrier, zone_count);

	for (i = 0; i < capacity; i++) {
		result = initialize_cached_chapter_index(&cache->chapters[i], geometry);
		if (result != UDS_SUCCESS)
			goto out_of_memory;
	}

	for (i = 0; i < zone_count; i++) {
		result = make_search_list(cache, &cache->search_lists[i]);
		if (result != UDS_SUCCESS)
			goto out_of_memory;
	}

	/* purge_search_list() needs some temporary lists for sorting. */
	result = vdo_allocate(capacity * 2, struct cached_chapter_index *,
			      "scratch entries", &cache->scratch_entries);
	if (result != VDO_SUCCESS)
		goto out_of_memory;

	*cache_ptr = cache;
	return UDS_SUCCESS;

out_of_memory:
	uds_free_sparse_cache(cache);
	return result;
}
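
/*
 * Worked example (a zone count of four is hypothetical): skip_threshold becomes 20000 / 4 = 5000,
 * so a chapter is marked skip_search after zone zero alone has counted 5000 consecutive misses,
 * which stands in for roughly 20000 misses spread across all four zones.
 */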

static inline void set_skip_search(struct cached_chapter_index *chapter,
				   bool skip_search)
{
	/* Check before setting to reduce cache line contention. */
	if (READ_ONCE(chapter->skip_search) != skip_search)
		WRITE_ONCE(chapter->skip_search, skip_search);
}

static void score_search_hit(struct cached_chapter_index *chapter)
{
	chapter->counters.consecutive_misses = 0;
	set_skip_search(chapter, false);
}

static void score_search_miss(struct sparse_cache *cache,
			      struct cached_chapter_index *chapter)
{
	chapter->counters.consecutive_misses++;
	if (chapter->counters.consecutive_misses > cache->skip_threshold)
		set_skip_search(chapter, true);
}

static void release_cached_chapter_index(struct cached_chapter_index *chapter)
{
	unsigned int i;

	chapter->virtual_chapter = NO_CHAPTER;
	if (chapter->page_buffers == NULL)
		return;

	for (i = 0; i < chapter->index_pages_count; i++) {
		if (chapter->page_buffers[i] != NULL)
			dm_bufio_release(vdo_forget(chapter->page_buffers[i]));
	}
}

void uds_free_sparse_cache(struct sparse_cache *cache)
{
	unsigned int i;

	if (cache == NULL)
		return;

	vdo_free(cache->scratch_entries);

	for (i = 0; i < cache->zone_count; i++)
		vdo_free(cache->search_lists[i]);

	for (i = 0; i < cache->capacity; i++) {
		release_cached_chapter_index(&cache->chapters[i]);
		vdo_free(cache->chapters[i].index_pages);
		vdo_free(cache->chapters[i].page_buffers);
	}

	vdo_free(cache);
}

/*
 * Take the indicated element of the search list and move it to the start, pushing the pointers
 * previously before it back down the list.
 */
static inline void set_newest_entry(struct search_list *search_list, u8 index)
{
	struct cached_chapter_index *newest;

	if (index > 0) {
		newest = search_list->entries[index];
		memmove(&search_list->entries[1], &search_list->entries[0],
			index * sizeof(struct cached_chapter_index *));
		search_list->entries[0] = newest;
	}

	/*
	 * This function may have moved a dead chapter to the front of the list for reuse, in which
	 * case the set of dead chapters becomes smaller.
	 */
	if (search_list->first_dead_entry <= index)
		search_list->first_dead_entry++;
}
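
/*
 * Worked example (entry names are hypothetical): with entries [A, B, C, D] and first_dead_entry
 * of 3, set_newest_entry(list, 2) produces [C, A, B, D] and leaves first_dead_entry at 3, since C
 * was already live. Promoting the dead entry at index 3 instead bumps first_dead_entry to 4,
 * making the reused entry part of the live prefix.
 */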

bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter,
			       unsigned int zone_number)
{
	struct search_list *search_list;
	struct cached_chapter_index *chapter;
	u8 i;

	/*
	 * The correctness of the barriers depends on the invariant that between calls to
	 * uds_update_sparse_cache(), the answers this function returns must never vary: the result
	 * for a given chapter must be identical across zones. That invariant must be maintained
	 * even if the chapter falls off the end of the volume, or if searching it is disabled
	 * because of too many search misses.
	 */
	search_list = cache->search_lists[zone_number];
	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];

		if (virtual_chapter == chapter->virtual_chapter) {
			if (zone_number == ZONE_ZERO)
				score_search_hit(chapter);

			set_newest_entry(search_list, i);
			return true;
		}
	}

	return false;
}

/*
 * Re-sort cache entries into three sets (active, skippable, and dead) while maintaining the LRU
 * ordering that already existed. This operation must only be called during the critical section in
 * uds_update_sparse_cache().
 */
static void purge_search_list(struct search_list *search_list,
			      struct sparse_cache *cache, u64 oldest_virtual_chapter)
{
	struct cached_chapter_index **entries;
	struct cached_chapter_index **skipped;
	struct cached_chapter_index **dead;
	struct cached_chapter_index *chapter;
	unsigned int next_alive = 0;
	unsigned int next_skipped = 0;
	unsigned int next_dead = 0;
	unsigned int i;

	entries = &search_list->entries[0];
	skipped = &cache->scratch_entries[0];
	dead = &cache->scratch_entries[search_list->capacity];

	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];
		if ((chapter->virtual_chapter < oldest_virtual_chapter) ||
		    (chapter->virtual_chapter == NO_CHAPTER))
			dead[next_dead++] = chapter;
		else if (chapter->skip_search)
			skipped[next_skipped++] = chapter;
		else
			entries[next_alive++] = chapter;
	}

	memcpy(&entries[next_alive], skipped,
	       next_skipped * sizeof(struct cached_chapter_index *));
	memcpy(&entries[next_alive + next_skipped], dead,
	       next_dead * sizeof(struct cached_chapter_index *));
	search_list->first_dead_entry = next_alive + next_skipped;
}
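
/*
 * Worked example (chapter numbers are hypothetical): with a live prefix [8, 3, 9, 5] where
 * chapter 9 has skip_search set and the oldest virtual chapter is 4, the partition produces
 * [8, 5, 9, 3]: alive entries first, then the skipped entry, then the dead one, with
 * first_dead_entry set to 3. The LRU order within each set is preserved.
 */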

static int __must_check cache_chapter_index(struct cached_chapter_index *chapter,
					    u64 virtual_chapter,
					    const struct volume *volume)
{
	int result;

	release_cached_chapter_index(chapter);

	result = uds_read_chapter_index_from_volume(volume, virtual_chapter,
						    chapter->page_buffers,
						    chapter->index_pages);
	if (result != UDS_SUCCESS)
		return result;

	chapter->counters.consecutive_misses = 0;
	chapter->virtual_chapter = virtual_chapter;
	chapter->skip_search = false;

	return UDS_SUCCESS;
}

static inline void copy_search_list(const struct search_list *source,
				    struct search_list *target)
{
	*target = *source;
	memcpy(target->entries, source->entries,
	       source->capacity * sizeof(struct cached_chapter_index *));
}

/*
 * Update the sparse cache to contain a chapter index. This function must be called by all the zone
 * threads with the same chapter number to correctly enter the thread barriers used to synchronize
 * the cache updates.
 */
int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter)
{
	int result = UDS_SUCCESS;
	const struct uds_index *index = zone->index;
	struct sparse_cache *cache = index->volume->sparse_cache;

	if (uds_sparse_cache_contains(cache, virtual_chapter, zone->id))
		return UDS_SUCCESS;

	/*
	 * Wait for every zone thread to reach its corresponding barrier request and invoke this
	 * function before starting to modify the cache.
	 */
	enter_threads_barrier(&cache->begin_update_barrier);

	/*
	 * This is the start of the critical section: the zone zero thread is captain, effectively
	 * holding an exclusive lock on the sparse cache. All the other zone threads must do
	 * nothing between the two barriers. They will wait at the end_update_barrier again for the
	 * captain to finish the update.
	 */

	if (zone->id == ZONE_ZERO) {
		unsigned int z;
		struct search_list *list = cache->search_lists[ZONE_ZERO];

		purge_search_list(list, cache, zone->oldest_virtual_chapter);

		if (virtual_chapter >= index->oldest_virtual_chapter) {
			set_newest_entry(list, list->capacity - 1);
			result = cache_chapter_index(list->entries[0], virtual_chapter,
						     index->volume);
		}

		for (z = 1; z < cache->zone_count; z++)
			copy_search_list(list, cache->search_lists[z]);
	}

	/*
	 * This is the end of the critical section. All cache invariants must have been restored.
	 */
	enter_threads_barrier(&cache->end_update_barrier);
	return result;
}
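
/*
 * Design note: purge_search_list() leaves the eviction victim (a dead entry if one exists,
 * otherwise the least recently used remaining entry) at the tail of the list, so
 * set_newest_entry(list, list->capacity - 1) rotates that victim to slot zero, where
 * cache_chapter_index() reloads it with the requested chapter.
 */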

void uds_invalidate_sparse_cache(struct sparse_cache *cache)
{
	unsigned int i;

	for (i = 0; i < cache->capacity; i++)
		release_cached_chapter_index(&cache->chapters[i]);
}

static inline bool should_skip_chapter(struct cached_chapter_index *chapter,
				       u64 oldest_chapter, u64 requested_chapter)
{
	if ((chapter->virtual_chapter == NO_CHAPTER) ||
	    (chapter->virtual_chapter < oldest_chapter))
		return true;

	if (requested_chapter != NO_CHAPTER)
		return requested_chapter != chapter->virtual_chapter;

	return READ_ONCE(chapter->skip_search);
}
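
/*
 * In other words: chapters that are dead or have fallen off the volume are always skipped; a
 * targeted (hook) search skips everything except the requested chapter and ignores skip_search;
 * and a full-cache search honors the skip_search heuristic.
 */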

static int __must_check search_cached_chapter_index(struct cached_chapter_index *chapter,
						    const struct index_geometry *geometry,
						    const struct index_page_map *index_page_map,
						    const struct uds_record_name *name,
						    u16 *record_page_ptr)
{
	u32 physical_chapter =
		uds_map_to_physical_chapter(geometry, chapter->virtual_chapter);
	u32 index_page_number =
		uds_find_index_page_number(index_page_map, name, physical_chapter);
	struct delta_index_page *index_page =
		&chapter->index_pages[index_page_number];

	return uds_search_chapter_index_page(index_page, geometry, name,
					     record_page_ptr);
}

int uds_search_sparse_cache(struct index_zone *zone, const struct uds_record_name *name,
			    u64 *virtual_chapter_ptr, u16 *record_page_ptr)
{
	int result;
	struct volume *volume = zone->index->volume;
	struct sparse_cache *cache = volume->sparse_cache;
	struct cached_chapter_index *chapter;
	struct search_list *search_list;
	u8 i;
	/* Search the entire cache unless a specific chapter was requested. */
	bool search_one = (*virtual_chapter_ptr != NO_CHAPTER);

	*record_page_ptr = NO_CHAPTER_INDEX_ENTRY;
	search_list = cache->search_lists[zone->id];
	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];

		if (should_skip_chapter(chapter, zone->oldest_virtual_chapter,
					*virtual_chapter_ptr))
			continue;

		result = search_cached_chapter_index(chapter, cache->geometry,
						     volume->index_page_map, name,
						     record_page_ptr);
		if (result != UDS_SUCCESS)
			return result;

		if (*record_page_ptr != NO_CHAPTER_INDEX_ENTRY) {
			/*
			 * In theory, this might be a false match while a true match exists in
			 * another chapter, but that's a very rare case and not worth the extra
			 * search complexity.
			 */
			set_newest_entry(search_list, i);
			if (zone->id == ZONE_ZERO)
				score_search_hit(chapter);

			*virtual_chapter_ptr = chapter->virtual_chapter;
			return UDS_SUCCESS;
		}

		if (zone->id == ZONE_ZERO)
			score_search_miss(cache, chapter);

		if (search_one)
			break;
	}

	return UDS_SUCCESS;
}
);