1 /* SPDX-License-Identifier: GPL-2.0-only */
3 * Copyright 2023 Red Hat
6 #ifndef VDO_SLAB_DEPOT_H
7 #define VDO_SLAB_DEPOT_H
9 #include <linux/atomic.h>
10 #include <linux/dm-kcopyd.h>
11 #include <linux/list.h>
15 #include "admin-state.h"
16 #include "completion.h"
18 #include "encodings.h"
19 #include "physical-zone.h"
20 #include "priority-table.h"
21 #include "recovery-journal.h"
22 #include "statistics.h"
25 #include "wait-queue.h"
/*
 * A slab_depot is responsible for managing all of the slabs and block allocators of a VDO. It has
 * a single array of slabs in order to eliminate the need for additional math in order to compute
 * which physical zone a PBN is in. It also has a block_allocator per zone.
 *
 * Each physical zone has a single dedicated queue and thread for performing all updates to the
 * slabs assigned to that zone. The concurrency guarantees of this single-threaded model allow the
 * code to omit more fine-grained locking for the various slab structures. Each physical zone
 * maintains a separate copy of the slab summary to remove the need for explicit locking on that
 * copy.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as allocations
 * and reference count updates, must be performed on the appropriate physical zone thread. Requests
 * from the recovery journal to commit slab journal tail blocks must be scheduled from the recovery
 * journal thread to run on the appropriate physical zone thread. Save operations must be launched
 * from the same admin thread as the original load operation.
 */
/*
 * NOTE(review): the enclosing enum header and terminator were absent from the damaged text; an
 * anonymous enum is the minimal wrapper that makes the enumerator valid - confirm no tag is needed.
 */
enum {
	/* The number of vios in the vio pool is proportional to the throughput of the VDO. */
	BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,
};
/*
 * Represents the possible status of a block.
 *
 * No explicit values are assigned, so the enumerators count up from 0 in declaration order.
 */
enum reference_status {
	RS_FREE, /* this block is free */
	RS_SINGLE, /* this block is singly-referenced */
	RS_SHARED, /* this block is shared */
	RS_PROVISIONAL /* this block is provisionally allocated */
};
64 sequence_number_t recovery_start
;
68 /* A waiter object for getting a VIO pool entry */
69 struct vdo_waiter resource_waiter
;
70 /* A waiter object for updating the slab summary */
71 struct vdo_waiter slab_summary_waiter
;
72 /* A waiter object for getting a vio with which to flush */
73 struct vdo_waiter flush_waiter
;
74 /* The queue of VIOs waiting to make an entry */
75 struct vdo_wait_queue entry_waiters
;
76 /* The parent slab reference of this journal */
77 struct vdo_slab
*slab
;
79 /* Whether a tail block commit is pending */
80 bool waiting_to_commit
;
81 /* Whether the journal is updating the slab summary */
82 bool updating_slab_summary
;
83 /* Whether the journal is adding entries from the entry_waiters queue */
85 /* Whether a partial write is in progress */
86 bool partial_write_in_progress
;
88 /* The oldest block in the journal on disk */
89 sequence_number_t head
;
90 /* The oldest block in the journal which may not be reaped */
91 sequence_number_t unreapable
;
92 /* The end of the half-open interval of the active journal */
93 sequence_number_t tail
;
94 /* The next journal block to be committed */
95 sequence_number_t next_commit
;
96 /* The tail sequence number that is written in the slab summary */
97 sequence_number_t summarized
;
98 /* The tail sequence number that was last summarized in slab summary */
99 sequence_number_t last_summarized
;
101 /* The sequence number of the recovery journal lock */
102 sequence_number_t recovery_lock
;
105 * The number of entries which fit in a single block. Can't use the constant because unit
106 * tests change this number.
108 journal_entry_count_t entries_per_block
;
110 * The number of full entries which fit in a single block. Can't use the constant because
111 * unit tests change this number.
113 journal_entry_count_t full_entries_per_block
;
115 /* The recovery journal of the VDO (slab journal holds locks on it) */
116 struct recovery_journal
*recovery_journal
;
118 /* The statistics shared by all slab journals in our physical zone */
119 struct slab_journal_statistics
*events
;
120 /* A list of the VIO pool entries for outstanding journal block writes */
121 struct list_head uncommitted_blocks
;
124 * The current tail block header state. This will be packed into the block just before it
127 struct slab_journal_block_header tail_header
;
128 /* A pointer to a block-sized buffer holding the packed block data */
129 struct packed_slab_journal_block
*block
;
131 /* The number of blocks in the on-disk journal */
133 /* The number of blocks at which to start pushing reference blocks */
134 block_count_t flushing_threshold
;
135 /* The number of blocks at which all reference blocks should be writing */
136 block_count_t flushing_deadline
;
137 /* The number of blocks at which to wait for reference blocks to write */
138 block_count_t blocking_threshold
;
139 /* The number of blocks at which to scrub the slab before coming online */
140 block_count_t scrubbing_threshold
;
142 /* This list entry is for block_allocator to keep a queue of dirty journals */
143 struct list_head dirty_entry
;
145 /* The lock for the oldest unreaped block of the journal */
146 struct journal_lock
*reap_lock
;
147 /* The locks for each on disk block */
148 struct journal_lock
*locks
;
152 * Reference_block structure
154 * Blocks are used as a proxy, permitting saves of partial refcounts.
156 struct reference_block
{
157 /* This block waits on the ref_counts to tell it to write */
158 struct vdo_waiter waiter
;
159 /* The slab to which this reference_block belongs */
160 struct vdo_slab
*slab
;
161 /* The number of references in this block that represent allocations */
162 block_size_t allocated_count
;
163 /* The slab journal block on which this block must hold a lock */
164 sequence_number_t slab_journal_lock
;
165 /* The slab journal block which should be released when this block is committed */
166 sequence_number_t slab_journal_lock_to_release
;
167 /* The point up to which each sector is accurate on disk */
168 struct journal_point commit_points
[VDO_SECTORS_PER_BLOCK
];
169 /* Whether this block has been modified since it was written to disk */
171 /* Whether this block is currently writing */
175 /* The search_cursor represents the saved position of a free block search. */
176 struct search_cursor
{
177 /* The reference block containing the current search index */
178 struct reference_block
*block
;
179 /* The position at which to start searching for the next free counter */
180 slab_block_number index
;
181 /* The position just past the last valid counter in the current block */
182 slab_block_number end_index
;
184 /* A pointer to the first reference block in the slab */
185 struct reference_block
*first_block
;
186 /* A pointer to the last reference block in the slab */
187 struct reference_block
*last_block
;
190 enum slab_rebuild_status
{
193 VDO_SLAB_REQUIRES_SCRUBBING
,
194 VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING
,
/*
 * This is the type declaration for the vdo_slab type. A vdo_slab currently consists of a run of
 * 2^23 data blocks, but that will soon change to dedicate a small number of those blocks for
 * metadata storage for the reference counts and slab journal for the slab.
 *
 * A reference count is maintained for each physical block number. The vast majority of blocks have
 * a very small reference count (usually 0 or 1). For references less than or equal to MAXIMUM_REFS
 * (254) the reference count is stored in counters[pbn].
 */
208 /* A list entry to queue this slab in a block_allocator list */
209 struct list_head allocq_entry
;
211 /* The struct block_allocator that owns this slab */
212 struct block_allocator
*allocator
;
214 /* The journal for this slab */
215 struct slab_journal journal
;
217 /* The slab number of this slab */
218 slab_count_t slab_number
;
219 /* The offset in the allocator partition of the first block in this slab */
220 physical_block_number_t start
;
221 /* The offset of the first block past the end of this slab */
222 physical_block_number_t end
;
223 /* The starting translated PBN of the slab journal */
224 physical_block_number_t journal_origin
;
225 /* The starting translated PBN of the reference counts */
226 physical_block_number_t ref_counts_origin
;
228 /* The administrative state of the slab */
229 struct admin_state state
;
230 /* The status of the slab */
231 enum slab_rebuild_status status
;
232 /* Whether the slab was ever queued for scrubbing */
233 bool was_queued_for_scrubbing
;
235 /* The priority at which this slab has been queued for allocation */
238 /* Fields beyond this point are the reference counts for the data blocks in this slab. */
239 /* The size of the counters array */
241 /* The number of free blocks */
243 /* The array of reference counts */
244 vdo_refcount_t
*counters
; /* use vdo_allocate() to align data ptr */
246 /* The saved block pointer and array indexes for the free block search */
247 struct search_cursor search_cursor
;
249 /* A list of the dirty blocks waiting to be written out */
250 struct vdo_wait_queue dirty_blocks
;
251 /* The number of blocks which are currently writing */
254 /* A waiter object for updating the slab summary */
255 struct vdo_waiter summary_waiter
;
257 /* The latest slab journal for which there has been a reference count update */
258 struct journal_point slab_journal_point
;
260 /* The number of reference count blocks */
261 u32 reference_block_count
;
262 /* reference count block array */
263 struct reference_block
*reference_blocks
;
/*
 * The named steps of a block allocator drain. No explicit values are assigned, so the
 * enumerators count up from 0 (START) to 4 (FINISHED) in declaration order.
 */
enum block_allocator_drain_step {
	VDO_DRAIN_ALLOCATOR_START,
	VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER,
	VDO_DRAIN_ALLOCATOR_STEP_SLABS,
	VDO_DRAIN_ALLOCATOR_STEP_SUMMARY,
	VDO_DRAIN_ALLOCATOR_STEP_FINISHED,
};
274 struct slab_scrubber
{
275 /* The queue of slabs to scrub first */
276 struct list_head high_priority_slabs
;
277 /* The queue of slabs to scrub once there are no high_priority_slabs */
278 struct list_head slabs
;
279 /* The queue of VIOs waiting for a slab to be scrubbed */
280 struct vdo_wait_queue waiters
;
283 * The number of slabs that are unrecovered or being scrubbed. This field is modified by
284 * the physical zone thread, but is queried by other threads.
286 slab_count_t slab_count
;
288 /* The administrative state of the scrubber */
289 struct admin_state admin_state
;
290 /* Whether to only scrub high-priority slabs */
291 bool high_priority_only
;
292 /* The slab currently being scrubbed */
293 struct vdo_slab
*slab
;
294 /* The vio for loading slab journal blocks */
298 /* A sub-structure for applying actions in parallel to all an allocator's slabs. */
300 /* The number of slabs performing a slab action */
301 slab_count_t slab_action_count
;
302 /* The method to call when a slab action has been completed by all slabs */
303 vdo_action_fn callback
;
306 /* A slab_iterator is a structure for iterating over a set of slabs. */
307 struct slab_iterator
{
308 struct vdo_slab
**slabs
;
309 struct vdo_slab
*next
;
/*
 * The slab_summary provides hints during load and recovery about the state of the slabs in order
 * to avoid the need to read the slab journals in their entirety before a VDO can come online.
 *
 * The information in the summary for each slab includes the rough number of free blocks (which is
 * used to prioritize scrubbing), the cleanliness of a slab (so that clean slabs containing free
 * space will be used on restart), and the location of the tail block of the slab's journal.
 *
 * The slab_summary has its own partition at the end of the volume which is sized to allow for a
 * complete copy of the summary for each of up to 16 physical zones.
 *
 * During resize, the slab_summary moves its backing partition and is saved once moved; the
 * slab_summary is not permitted to overwrite the previous recovery journal space.
 *
 * The slab_summary does not have its own version information, but relies on the VDO volume
 * version.
 */
333 * A slab status is a very small structure for use in determining the ordering of slabs in the
337 slab_count_t slab_number
;
342 struct slab_summary_block
{
343 /* The block_allocator to which this block belongs */
344 struct block_allocator
*allocator
;
345 /* The index of this block in its zone's summary */
347 /* Whether this block has a write outstanding */
349 /* Ring of updates waiting on the outstanding write */
350 struct vdo_wait_queue current_update_waiters
;
351 /* Ring of updates waiting on the next write */
352 struct vdo_wait_queue next_update_waiters
;
353 /* The active slab_summary_entry array for this block */
354 struct slab_summary_entry
*entries
;
355 /* The vio used to write this block */
357 /* The packed entries, one block long, backing the vio */
358 char *outgoing_entries
;
362 * The statistics for all the slab summary zones owned by this slab summary. These fields are all
363 * mutated only by their physical zone threads, but are read by other threads when gathering
364 * statistics for the entire depot.
366 struct atomic_slab_summary_statistics
{
367 /* Number of blocks written */
368 atomic64_t blocks_written
;
371 struct block_allocator
{
372 struct vdo_completion completion
;
373 /* The slab depot for this allocator */
374 struct slab_depot
*depot
;
375 /* The nonce of the VDO */
377 /* The physical zone number of this allocator */
378 zone_count_t zone_number
;
379 /* The thread ID for this allocator's physical zone */
380 thread_id_t thread_id
;
381 /* The number of slabs in this allocator */
382 slab_count_t slab_count
;
383 /* The number of the last slab owned by this allocator */
384 slab_count_t last_slab
;
385 /* The reduced priority level used to preserve unopened slabs */
386 unsigned int unopened_slab_priority
;
387 /* The state of this allocator */
388 struct admin_state state
;
389 /* The actor for applying an action to all slabs */
390 struct slab_actor slab_actor
;
392 /* The slab from which blocks are currently being allocated */
393 struct vdo_slab
*open_slab
;
394 /* A priority queue containing all slabs available for allocation */
395 struct priority_table
*prioritized_slabs
;
396 /* The slab scrubber */
397 struct slab_scrubber scrubber
;
398 /* What phase of the close operation the allocator is to perform */
399 enum block_allocator_drain_step drain_step
;
402 * These statistics are all mutated only by the physical zone thread, but are read by other
403 * threads when gathering statistics for the entire depot.
406 * The count of allocated blocks in this zone. Not in block_allocator_statistics for
407 * historical reasons.
409 u64 allocated_blocks
;
410 /* Statistics for this block allocator */
411 struct block_allocator_statistics statistics
;
412 /* Cumulative statistics for the slab journals in this zone */
413 struct slab_journal_statistics slab_journal_statistics
;
414 /* Cumulative statistics for the reference counters in this zone */
415 struct ref_counts_statistics ref_counts_statistics
;
418 * This is the head of a queue of slab journals which have entries in their tail blocks
419 * which have not yet started to commit. When the recovery journal is under space pressure,
420 * slab journals which have uncommitted entries holding a lock on the recovery journal head
421 * are forced to commit their blocks early. This list is kept in order, with the tail
422 * containing the slab journal holding the most recent recovery journal lock.
424 struct list_head dirty_slab_journals
;
426 /* The vio pool for reading and writing block allocator metadata */
427 struct vio_pool
*vio_pool
;
428 /* The dm_kcopyd client for erasing slab journals */
429 struct dm_kcopyd_client
*eraser
;
430 /* Iterator over the slabs to be erased */
431 struct slab_iterator slabs_to_erase
;
433 /* The portion of the slab summary managed by this allocator */
434 /* The state of the slab summary */
435 struct admin_state summary_state
;
436 /* The number of outstanding summary writes */
437 block_count_t summary_write_count
;
438 /* The array (owned by the blocks) of all entries */
439 struct slab_summary_entry
*summary_entries
;
440 /* The array of slab_summary_blocks */
441 struct slab_summary_block
*summary_blocks
;
/*
 * The kinds of slab depot load. No explicit values are assigned, so the enumerators count up
 * from 0 in declaration order.
 */
enum slab_depot_load_type {
	VDO_SLAB_DEPOT_NORMAL_LOAD,
	VDO_SLAB_DEPOT_RECOVERY_LOAD,
	VDO_SLAB_DEPOT_REBUILD_LOAD
};
451 zone_count_t zone_count
;
452 zone_count_t old_zone_count
;
454 struct slab_config slab_config
;
455 struct action_manager
*action_manager
;
457 physical_block_number_t first_block
;
458 physical_block_number_t last_block
;
459 physical_block_number_t origin
;
461 /* slab_size == (1 << slab_size_shift) */
462 unsigned int slab_size_shift
;
464 /* Determines how slabs should be queued during load */
465 enum slab_depot_load_type load_type
;
467 /* The state for notifying slab journals to release recovery journal */
468 sequence_number_t active_release_request
;
469 sequence_number_t new_release_request
;
471 /* State variables for scrubbing complete handling */
472 atomic_t zones_to_scrub
;
474 /* Array of pointers to individually allocated slabs */
475 struct vdo_slab
**slabs
;
476 /* The number of slabs currently allocated and stored in 'slabs' */
477 slab_count_t slab_count
;
479 /* Array of pointers to a larger set of slabs (used during resize) */
480 struct vdo_slab
**new_slabs
;
481 /* The number of slabs currently allocated and stored in 'new_slabs' */
482 slab_count_t new_slab_count
;
483 /* The size that 'new_slabs' was allocated for */
484 block_count_t new_size
;
486 /* The last block before resize, for rollback */
487 physical_block_number_t old_last_block
;
488 /* The last block after resize, for resize */
489 physical_block_number_t new_last_block
;
491 /* The statistics for the slab summary */
492 struct atomic_slab_summary_statistics summary_statistics
;
493 /* The start of the slab summary partition */
494 physical_block_number_t summary_origin
;
495 /* The number of bits to shift to get a 7-bit fullness hint */
496 unsigned int hint_shift
;
497 /* The slab summary entries for all of the zones the partition can hold */
498 struct slab_summary_entry
*summary_entries
;
500 /* The block allocators for this depot */
501 struct block_allocator allocators
[];
/* Forward declaration: only pointers to this type are used by the prototypes in this header. */
struct reference_updater;
506 bool __must_check
vdo_attempt_replay_into_slab(struct vdo_slab
*slab
,
507 physical_block_number_t pbn
,
508 enum journal_operation operation
,
510 struct journal_point
*recovery_point
,
511 struct vdo_completion
*parent
);
513 int __must_check
vdo_adjust_reference_count_for_rebuild(struct slab_depot
*depot
,
514 physical_block_number_t pbn
,
515 enum journal_operation operation
);
517 static inline struct block_allocator
*vdo_as_block_allocator(struct vdo_completion
*completion
)
519 vdo_assert_completion_type(completion
, VDO_BLOCK_ALLOCATOR_COMPLETION
);
520 return container_of(completion
, struct block_allocator
, completion
);
523 int __must_check
vdo_acquire_provisional_reference(struct vdo_slab
*slab
,
524 physical_block_number_t pbn
,
525 struct pbn_lock
*lock
);
527 int __must_check
vdo_allocate_block(struct block_allocator
*allocator
,
528 physical_block_number_t
*block_number_ptr
);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a block allocator and a waiter.
 *
 * Return: int. Unlike most int-returning prototypes in this header it is not marked __must_check.
 */
int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
				  struct vdo_waiter *waiter);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a completion and a reference_updater; returns nothing.
 */
void vdo_modify_reference_count(struct vdo_completion *completion,
				struct reference_updater *updater);
536 int __must_check
vdo_release_block_reference(struct block_allocator
*allocator
,
537 physical_block_number_t pbn
);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a single completion; returns nothing.
 */
void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a block allocator through a pointer-to-const; returns nothing.
 */
void vdo_dump_block_allocator(const struct block_allocator *allocator);
543 int __must_check
vdo_decode_slab_depot(struct slab_depot_state_2_0 state
,
545 struct partition
*summary_partition
,
546 struct slab_depot
**depot_ptr
);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a slab depot; returns nothing.
 */
void vdo_free_slab_depot(struct slab_depot *depot);
550 struct slab_depot_state_2_0 __must_check
vdo_record_slab_depot(const struct slab_depot
*depot
);
552 int __must_check
vdo_allocate_reference_counters(struct slab_depot
*depot
);
554 struct vdo_slab
* __must_check
vdo_get_slab(const struct slab_depot
*depot
,
555 physical_block_number_t pbn
);
557 u8 __must_check
vdo_get_increment_limit(struct slab_depot
*depot
,
558 physical_block_number_t pbn
);
560 bool __must_check
vdo_is_physical_data_block(const struct slab_depot
*depot
,
561 physical_block_number_t pbn
);
563 block_count_t __must_check
vdo_get_slab_depot_allocated_blocks(const struct slab_depot
*depot
);
565 block_count_t __must_check
vdo_get_slab_depot_data_blocks(const struct slab_depot
*depot
);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a slab depot through a pointer-to-const and a non-const vdo_statistics pointer
 * (presumably the structure that receives the results - confirm against the definition).
 */
void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
				   struct vdo_statistics *stats);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a slab depot, an admin state code (pointer-to-const), a parent completion and an
 * untyped context pointer; returns nothing.
 */
void vdo_load_slab_depot(struct slab_depot *depot,
			 const struct admin_state_code *operation,
			 struct vdo_completion *parent, void *context);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a slab depot, a slab_depot_load_type value and a parent completion; returns nothing.
 */
void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
					enum slab_depot_load_type load_type,
					struct vdo_completion *parent);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a slab depot; returns nothing.
 */
void vdo_update_slab_depot_size(struct slab_depot *depot);
580 int __must_check
vdo_prepare_to_grow_slab_depot(struct slab_depot
*depot
,
581 const struct partition
*partition
);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a slab depot and a parent completion; returns nothing.
 */
void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a slab depot; returns nothing.
 */
void vdo_abandon_new_slabs(struct slab_depot *depot);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a slab depot, an admin state code (pointer-to-const) and a parent completion; returns
 * nothing.
 */
void vdo_drain_slab_depot(struct slab_depot *depot,
			  const struct admin_state_code *operation,
			  struct vdo_completion *parent);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a slab depot and a parent completion; returns nothing.
 */
void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent);
593 void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot
*depot
,
594 sequence_number_t recovery_block_number
);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a slab depot and a parent completion; returns nothing.
 */
void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
				     struct vdo_completion *parent);
/*
 * Prototype only; the body is defined outside this header.
 * Takes a slab depot through a pointer-to-const; returns nothing.
 */
void vdo_dump_slab_depot(const struct slab_depot *depot);
601 #endif /* VDO_SLAB_DEPOT_H */