/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_BLOCK_MAP_H
#define VDO_BLOCK_MAP_H

#include <linux/list.h>

#include "numeric.h"

#include "admin-state.h"
#include "completion.h"
#include "encodings.h"
#include "int-map.h"
#include "statistics.h"
#include "types.h"
#include "vio.h"
#include "wait-queue.h"

/*
 * The block map is responsible for tracking all the logical to physical mappings of a VDO. It
 * consists of a collection of 60 radix trees gradually allocated as logical addresses are used.
 * Each tree is assigned to a logical zone such that it is easy to compute which zone must handle
 * each logical address. Each logical zone also has a dedicated portion of the leaf page cache.
 *
 * Each logical zone has a single dedicated queue and thread for performing all updates to the
 * radix trees assigned to that zone. The concurrency guarantees of this single-threaded model
 * allow the code to omit more fine-grained locking for the block map structures.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as reading and
 * updating mappings, must be performed on the appropriate logical zone thread. Save operations
 * must be launched from the same admin thread as the original load operation.
 */
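
/*
 * Illustrative call pattern (a sketch, not a required sequence; it assumes the
 * caller's machinery queues work to the zone threads as described above):
 *
 *	zone_count_t zone = vdo_compute_logical_zone(data_vio);
 *	// ...queue the data_vio to the thread of block_map->zones[zone]...
 *	// Then, on that zone's thread:
 *	vdo_find_block_map_slot(data_vio);
 */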

enum {
	BLOCK_MAP_VIO_POOL_SIZE = 64,
};

/*
 * Generation counter for page references.
 */
typedef u32 vdo_page_generation;

extern const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY;

/* The VDO Page Cache abstraction. */
struct vdo_page_cache {
	/* the VDO which owns this cache */
	struct vdo *vdo;
	/* number of pages in cache */
	page_count_t page_count;
	/* number of pages to write in the current batch */
	page_count_t pages_in_batch;
	/* Whether the VDO is doing a read-only rebuild */
	bool rebuilding;

	/* array of page information entries */
	struct page_info *infos;
	/* raw memory for pages */
	char *pages;
	/* cache last found page info */
	struct page_info *last_found;
	/* map of page number to info */
	struct int_map *page_map;
	/* main LRU list (all infos) */
	struct list_head lru_list;
	/* free page list (oldest first) */
	struct list_head free_list;
	/* outgoing page list */
	struct list_head outgoing_list;
	/* number of read I/O operations pending */
	page_count_t outstanding_reads;
	/* number of write I/O operations pending */
	page_count_t outstanding_writes;
	/* number of pages covered by the current flush */
	page_count_t pages_in_flush;
	/* number of pages waiting to be included in the next flush */
	page_count_t pages_to_flush;
	/* number of discards in progress */
	unsigned int discard_count;
	/* how many VPCs are waiting for a free page */
	unsigned int waiter_count;
	/* queue of waiters who want a free page */
	struct vdo_wait_queue free_waiters;
	/*
	 * Statistics are only updated on the logical zone thread, but are accessed from other
	 * threads.
	 */
	struct block_map_statistics stats;
	/* counter for pressure reports */
	u32 pressure_report;
	/* the block map zone to which this cache belongs */
	struct block_map_zone *zone;
};

/*
 * The state of a page buffer. If the page buffer is free, no particular page is bound to it;
 * otherwise the page buffer is bound to a particular page whose absolute pbn is in the pbn field.
 * If the page is resident or dirty, the page data is stable and may be accessed. Otherwise the
 * page is in flight (incoming or outgoing) and its data should not be accessed.
 *
 * @note Update the static data in get_page_state_name() if you change this enumeration.
 */
enum vdo_page_buffer_state {
	/* this page buffer is not being used */
	PS_FREE,
	/* this page is being read from store */
	PS_INCOMING,
	/* attempt to load this page failed */
	PS_FAILED,
	/* this page is valid and un-modified */
	PS_RESIDENT,
	/* this page is valid and modified */
	PS_DIRTY,
	/* this page is being written and should not be used */
	PS_OUTGOING,
	/* not a state - the number of states */
	PAGE_STATE_COUNT,
} __packed;
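
/*
 * A sketch of the typical lifecycle implied by the states above (not an
 * exhaustive transition table): PS_FREE -> PS_INCOMING -> PS_RESIDENT ->
 * PS_DIRTY -> PS_OUTGOING -> PS_RESIDENT, with PS_INCOMING -> PS_FAILED when
 * a read from store fails.
 */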

/*
 * The write status of a page.
 */
enum vdo_page_write_status {
	WRITE_STATUS_NORMAL,
	WRITE_STATUS_DISCARD,
	WRITE_STATUS_DEFERRED,
} __packed;

/* Per-page-slot information. */
struct page_info {
	/* Preallocated page struct vio */
	struct vio *vio;
	/* back-link for references */
	struct vdo_page_cache *cache;
	/* the pbn of the page */
	physical_block_number_t pbn;
	/* page is busy (temporarily locked) */
	u16 busy;
	/* the write status of the page */
	enum vdo_page_write_status write_status;
	/* page state */
	enum vdo_page_buffer_state state;
	/* queue of completions awaiting this item */
	struct vdo_wait_queue waiting;
	/* state linked list entry */
	struct list_head state_entry;
	/* LRU entry */
	struct list_head lru_entry;
	/*
	 * The earliest recovery journal block containing uncommitted updates to the block map page
	 * associated with this page_info. A reference (lock) is held on that block to prevent it
	 * from being reaped. When this value changes, the reference on the old value must be
	 * released and a reference on the new value must be acquired.
	 */
	sequence_number_t recovery_lock;
};

/*
 * A completion awaiting a specific page. Also a live reference into the page once completed, until
 * freed.
 */
struct vdo_page_completion {
	/* The generic completion */
	struct vdo_completion completion;
	/* The cache involved */
	struct vdo_page_cache *cache;
	/* The waiter for the pending list */
	struct vdo_waiter waiter;
	/* The absolute physical block number of the page on disk */
	physical_block_number_t pbn;
	/* Whether the page may be modified */
	bool writable;
	/* Whether the page is available */
	bool ready;
	/* The info structure for the page, only valid when ready */
	struct page_info *info;
};

struct tree_page {
	struct vdo_waiter waiter;

	/* Dirty list entry */
	struct list_head entry;

	/* If dirty, the tree zone flush generation in which it was last dirtied. */
	u8 generation;

	/* Whether this page is an interior tree page being written out. */
	bool writing;

	/* If writing, the tree zone flush generation of the copy being written. */
	u8 writing_generation;

	/*
	 * Sequence number of the earliest recovery journal block containing uncommitted updates to
	 * this page.
	 */
	sequence_number_t recovery_lock;

	/* The value of recovery_lock when this page last started writing */
	sequence_number_t writing_recovery_lock;

	char page_buffer[VDO_BLOCK_SIZE];
};

enum block_map_page_type {
	VDO_TREE_PAGE,
	VDO_CACHE_PAGE,
};

typedef struct list_head dirty_era_t[2];
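
/* A pair of dirty lists, apparently indexed by enum block_map_page_type above. */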

struct dirty_lists {
	/* The number of periods after which an element will be expired */
	block_count_t maximum_age;
	/* The oldest period which has unexpired elements */
	sequence_number_t oldest_period;
	/* One more than the current period */
	sequence_number_t next_period;
	/* The offset in the array of lists of the oldest period */
	block_count_t offset;
	/* Expired pages */
	dirty_era_t expired;
	/* The lists of dirty pages */
	dirty_era_t eras[];
};
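
/*
 * Sketch of the aging scheme implied by the fields above: each dirty page sits
 * in the era list for the period in which it was dirtied. As
 * vdo_advance_block_map_era() moves the era point forward, lists older than
 * maximum_age periods are drained into "expired" and those pages are written
 * out, bounding how far back the recovery journal must be replayed.
 */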

struct block_map_zone {
	zone_count_t zone_number;
	thread_id_t thread_id;
	struct admin_state state;
	struct block_map *block_map;
	/* Dirty pages, by era */
	struct dirty_lists *dirty_lists;
	struct vdo_page_cache page_cache;
	data_vio_count_t active_lookups;
	struct int_map *loading_pages;
	struct vio_pool *vio_pool;
	/* The tree page which has issued or will be issuing a flush */
	struct tree_page *flusher;
	struct vdo_wait_queue flush_waiters;
	/* The generation after the most recent flush */
	u8 generation;
	u8 oldest_generation;
	/* The counts of dirty pages in each generation */
	u32 dirty_page_counts[256];
};

struct block_map {
	struct vdo *vdo;
	struct action_manager *action_manager;
	/* The absolute PBN of the first root of the tree part of the block map */
	physical_block_number_t root_origin;
	block_count_t root_count;

	/* The era point we are currently distributing to the zones */
	sequence_number_t current_era_point;
	/* The next era point */
	sequence_number_t pending_era_point;

	/* The number of entries in the block map */
	block_count_t entry_count;
	nonce_t nonce;
	struct recovery_journal *journal;

	/* The trees for finding block map pages */
	struct forest *forest;
	/* The expanded trees awaiting growth */
	struct forest *next_forest;
	/* The number of entries after growth */
	block_count_t next_entry_count;

	zone_count_t zone_count;
	struct block_map_zone zones[];
};

/**
 * typedef vdo_entry_callback_fn - A function to be called for each allocated PBN when traversing
 *                                 the forest.
 * @pbn: A PBN of a tree node.
 * @completion: The parent completion of the traversal.
 *
 * Return: VDO_SUCCESS or an error.
 */
typedef int (*vdo_entry_callback_fn)(physical_block_number_t pbn,
				     struct vdo_completion *completion);
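
/*
 * Example (hypothetical; "count_tree_pages" and "struct traversal_count" are
 * illustrations, not part of this API): a callback counting allocated tree
 * pages might look like
 *
 *	static int count_tree_pages(physical_block_number_t pbn,
 *				    struct vdo_completion *completion)
 *	{
 *		((struct traversal_count *) completion->parent)->pages++;
 *		return VDO_SUCCESS;
 *	}
 *
 * and be passed to vdo_traverse_forest() below.
 */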

static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
	return container_of(completion, struct vdo_page_completion, completion);
}

void vdo_release_page_completion(struct vdo_completion *completion);

void vdo_get_page(struct vdo_page_completion *page_completion,
		  struct block_map_zone *zone, physical_block_number_t pbn,
		  bool writable, void *parent, vdo_action_fn callback,
		  vdo_action_fn error_handler, bool requeue);
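
/*
 * Sketch of the expected usage (illustrative only; "handle_page" and
 * "handle_error" are hypothetical vdo_action_fn callbacks, and "vpc" is a
 * caller-owned struct vdo_page_completion):
 *
 *	vdo_get_page(&vpc, zone, pbn, true, parent, handle_page, handle_error, false);
 *
 *	static void handle_page(struct vdo_completion *completion)
 *	{
 *		struct vdo_page_completion *vpc = as_vdo_page_completion(completion);
 *
 *		// ...use vpc->info while this completion holds its reference...
 *		vdo_release_page_completion(completion);
 *	}
 */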

void vdo_request_page_write(struct vdo_completion *completion);

int __must_check vdo_get_cached_page(struct vdo_completion *completion,
				     struct block_map_page **page_ptr);

int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);

static inline struct block_map_page * __must_check
vdo_as_block_map_page(struct tree_page *tree_page)
{
	return (struct block_map_page *) tree_page->page_buffer;
}

bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
			 physical_block_number_t pbn,
			 struct block_map_page *page);

void vdo_find_block_map_slot(struct data_vio *data_vio);

physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
						    page_number_t page_number);

void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);

void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
			 struct vdo_completion *completion);

int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
				      block_count_t logical_blocks, struct vdo *vdo,
				      struct recovery_journal *journal, nonce_t nonce,
				      page_count_t cache_size, block_count_t maximum_age,
				      struct block_map **map_ptr);

void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
			 struct vdo_completion *parent);

void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);

int __must_check vdo_prepare_to_grow_block_map(struct block_map *map,
					       block_count_t new_logical_blocks);

void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);

void vdo_abandon_block_map_growth(struct block_map *map);

void vdo_free_block_map(struct block_map *map);

struct block_map_state_2_0 __must_check vdo_record_block_map(const struct block_map *map);

void vdo_initialize_block_map_from_journal(struct block_map *map,
					   struct recovery_journal *journal);

zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);

void vdo_advance_block_map_era(struct block_map *map,
			       sequence_number_t recovery_block_number);

void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
			       physical_block_number_t pbn,
			       enum block_mapping_state mapping_state,
			       sequence_number_t *recovery_lock);

void vdo_get_mapped_block(struct data_vio *data_vio);

void vdo_put_mapped_block(struct data_vio *data_vio);

struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);

/**
 * vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format.
 * @age: The configured maximum age.
 *
 * Return: The converted age.
 *
 * In the old recovery journal format, each journal block held 311 entries, and every write bio
 * made two entries. The old maximum age was half the usable journal length. In the new format,
 * each block holds only 217 entries, but each bio only makes one entry. We convert the configured
 * age so that the number of writes in a block map era is the same in the old and new formats. This
 * keeps the bound on the amount of work required to recover the block map from the recovery
 * journal the same across the format change. It also keeps the amortization of block map page
 * writes to write bios the same.
 */
static inline block_count_t vdo_convert_maximum_age(block_count_t age)
{
	return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK,
			    2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK);
}

#endif /* VDO_BLOCK_MAP_H */