// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "repair.h"

#include <linux/min_heap.h>
#include <linux/minmax.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

#include "block-map.h"
#include "completion.h"
#include "constants.h"
#include "encodings.h"
#include "io-submitter.h"
#include "recovery-journal.h"
#include "slab-depot.h"
#include "types.h"
#include "vdo.h"
#include "wait-queue.h"
/*
 * An explicitly numbered block mapping. Numbering the mappings allows them to be sorted by
 * logical block number during repair while still preserving the relative order of journal
 * entries with the same logical block number.
 */
struct numbered_block_mapping {
	struct block_map_slot block_map_slot;
	struct block_map_entry block_map_entry;
	/* A serial number to use during replay */
	u32 number;
} __packed;
/*
 * The absolute position of an entry in the recovery journal, including the sector number and
 * the entry number within the sector.
 */
struct recovery_point {
	/* Block sequence number */
	sequence_number_t sequence_number;
	/* Sector number */
	u8 sector_count;
	/* Entry number */
	journal_entry_count_t entry_count;
	/* Whether or not the increment portion of the current entry has been applied */
	bool increment_applied;
};
DEFINE_MIN_HEAP(struct numbered_block_mapping, replay_heap);
struct repair_completion {
	/* The completion header */
	struct vdo_completion completion;

	/* A buffer to hold the data read off disk */
	char *journal_data;

	/* For loading the journal */
	data_vio_count_t vio_count;
	data_vio_count_t vios_complete;
	struct vio *vios;

	/* The number of entries to be applied to the block map */
	size_t block_map_entry_count;
	/* The sequence number of the first valid block for block map recovery */
	sequence_number_t block_map_head;
	/* The sequence number of the first valid block for slab journal replay */
	sequence_number_t slab_journal_head;
	/* The sequence number of the last valid block of the journal (if known) */
	sequence_number_t tail;
	/*
	 * The highest sequence number of the journal. During recovery (vs read-only rebuild),
	 * not the same as the tail, since the tail ignores blocks after the first hole.
	 */
	sequence_number_t highest_tail;

	/* The number of logical blocks currently known to be in use */
	block_count_t logical_blocks_used;
	/* The number of block map data blocks known to be allocated */
	block_count_t block_map_data_blocks;

	/* These fields are for playing the journal into the block map */
	/* The entry data for the block map recovery */
	struct numbered_block_mapping *entries;
	/* The number of entries in the entry array */
	size_t entry_count;
	/* number of pending (non-ready) requests*/
	page_count_t outstanding;
	/* number of page completions */
	page_count_t page_count;
	/* Whether the initial batch of page fetches is still being launched */
	bool launching;
	/*
	 * a heap wrapping journal_entries. It re-orders and sorts journal entries in ascending
	 * LBN order, then original journal order. This permits efficient iteration over the
	 * journal entries in order.
	 */
	struct replay_heap replay_heap;
	/* Fields tracking progress through the journal entries. */
	struct numbered_block_mapping *current_entry;
	struct numbered_block_mapping *current_unfetched_entry;
	/* Current requested page's PBN */
	physical_block_number_t pbn;

	/* These fields are only used during recovery. */
	/* A location just beyond the last valid entry of the journal */
	struct recovery_point tail_recovery_point;
	/* The location of the next recovery journal entry to apply */
	struct recovery_point next_recovery_point;
	/* The journal point to give to the next synthesized decref */
	struct journal_point next_journal_point;
	/* The number of entries played into slab journals */
	size_t entries_added_to_slab_journals;

	/* These fields are only used during read-only rebuild */
	page_count_t page_to_fetch;
	/* the number of leaf pages in the block map */
	page_count_t leaf_pages;
	/* the last slot of the block map */
	struct block_map_slot last_slot;

	/*
	 * The page completions used for playing the journal into the block map, and, during
	 * read-only rebuild, for rebuilding the reference counts from the block map.
	 */
	struct vdo_page_completion page_completions[];
};
/*
 * This is a min_heap callback function that orders numbered_block_mappings using the
 * 'block_map_slot' field as the primary key and the mapping 'number' field as the secondary key.
 * Using the mapping number preserves the journal order of entries for the same slot, allowing us
 * to sort by slot while still ensuring we replay all entries with the same slot in the exact
 * order as they appeared in the journal.
 */
static bool mapping_is_less_than(const void *item1, const void *item2,
				 void __always_unused *args)
{
	const struct numbered_block_mapping *mapping1 =
		(const struct numbered_block_mapping *) item1;
	const struct numbered_block_mapping *mapping2 =
		(const struct numbered_block_mapping *) item2;

	if (mapping1->block_map_slot.pbn != mapping2->block_map_slot.pbn)
		return mapping1->block_map_slot.pbn < mapping2->block_map_slot.pbn;

	if (mapping1->block_map_slot.slot != mapping2->block_map_slot.slot)
		return mapping1->block_map_slot.slot < mapping2->block_map_slot.slot;

	if (mapping1->number != mapping2->number)
		return mapping1->number < mapping2->number;

	return 0;
}
static void swap_mappings(void *item1, void *item2, void __always_unused *args)
{
	struct numbered_block_mapping *mapping1 = item1;
	struct numbered_block_mapping *mapping2 = item2;

	swap(*mapping1, *mapping2);
}
static const struct min_heap_callbacks repair_min_heap = {
	.less = mapping_is_less_than,
	.swp = swap_mappings,
};
static struct numbered_block_mapping *sort_next_heap_element(struct repair_completion *repair)
{
	struct replay_heap *heap = &repair->replay_heap;
	struct numbered_block_mapping *last;

	if (heap->nr == 0)
		return NULL;

	/*
	 * Swap the next heap element with the last one on the heap, popping it off the heap,
	 * restore the heap invariant, and return a pointer to the popped element.
	 */
	last = &repair->entries[--heap->nr];
	swap_mappings(heap->data, last, NULL);
	min_heap_sift_down(heap, 0, &repair_min_heap, NULL);
	return last;
}
/**
 * as_repair_completion() - Convert a generic completion to a repair_completion.
 * @completion: The completion to convert.
 *
 * Return: The repair_completion.
 */
static inline struct repair_completion * __must_check
as_repair_completion(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_REPAIR_COMPLETION);
	return container_of(completion, struct repair_completion, completion);
}
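
/*
 * prepare_repair_completion() - Reset the repair completion and register the next callback on
 * the appropriate thread: logical zone 0 for block map work, the admin thread otherwise.
 */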
static void prepare_repair_completion(struct repair_completion *repair,
				      vdo_action_fn callback, enum vdo_zone_type zone_type)
{
	struct vdo_completion *completion = &repair->completion;
	const struct thread_config *thread_config = &completion->vdo->thread_config;
	thread_id_t thread_id;

	/* All blockmap access is done on single thread, so use logical zone 0. */
	thread_id = ((zone_type == VDO_ZONE_TYPE_LOGICAL) ?
		     thread_config->logical_threads[0] :
		     thread_config->admin_thread);
	vdo_reset_completion(completion);
	vdo_set_completion_callback(completion, callback, thread_id);
}
static void launch_repair_completion(struct repair_completion *repair,
				     vdo_action_fn callback, enum vdo_zone_type zone_type)
{
	prepare_repair_completion(repair, callback, zone_type);
	vdo_launch_completion(&repair->completion);
}
static void uninitialize_vios(struct repair_completion *repair)
{
	while (repair->vio_count > 0)
		free_vio_components(&repair->vios[--repair->vio_count]);

	vdo_free(vdo_forget(repair->vios));
}
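
/*
 * free_repair_completion() - Release everything held by a repair completion, clearing the page
 * cache rebuilding flag on the way out since this is the common cleanup path.
 */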
static void free_repair_completion(struct repair_completion *repair)
{
	if (repair == NULL)
		return;

	/*
	 * We do this here because this function is the only common bottleneck for all clean up
	 * paths.
	 */
	repair->completion.vdo->block_map->zones[0].page_cache.rebuilding = false;

	uninitialize_vios(repair);
	vdo_free(vdo_forget(repair->journal_data));
	vdo_free(vdo_forget(repair->entries));
	vdo_free(repair);
}
static void finish_repair(struct vdo_completion *completion)
{
	struct vdo_completion *parent = completion->parent;
	struct vdo *vdo = completion->vdo;
	struct repair_completion *repair = as_repair_completion(completion);

	vdo_assert_on_admin_thread(vdo, __func__);

	if (vdo->load_state != VDO_REBUILD_FOR_UPGRADE)
		vdo->states.vdo.complete_recoveries++;

	vdo_initialize_recovery_journal_post_repair(vdo->recovery_journal,
						    vdo->states.vdo.complete_recoveries,
						    repair->highest_tail,
						    repair->logical_blocks_used,
						    repair->block_map_data_blocks);
	free_repair_completion(vdo_forget(repair));

	if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
		vdo_log_info("Read-only rebuild complete");
		vdo_launch_completion(parent);
		return;
	}

	/* FIXME: shouldn't this say either "recovery" or "repair"? */
	vdo_log_info("Rebuild complete");

	/*
	 * Now that we've freed the repair completion and its vast array of journal entries, we
	 * can allocate refcounts.
	 */
	vdo_continue_completion(parent, vdo_allocate_reference_counters(vdo->depot));
}
/**
 * abort_repair() - Handle a repair error.
 * @completion: The repair completion.
 */
static void abort_repair(struct vdo_completion *completion)
{
	struct vdo_completion *parent = completion->parent;
	int result = completion->result;
	struct repair_completion *repair = as_repair_completion(completion);

	if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state))
		vdo_log_info("Read-only rebuild aborted");
	else
		vdo_log_warning("Recovery aborted");

	free_repair_completion(vdo_forget(repair));
	vdo_continue_completion(parent, result);
}
/**
 * abort_on_error() - Abort a repair if there is an error.
 * @result: The result to check.
 * @repair: The repair completion.
 *
 * Return: true if the result was an error.
 */
static bool __must_check abort_on_error(int result, struct repair_completion *repair)
{
	if (result == VDO_SUCCESS)
		return false;

	vdo_fail_completion(&repair->completion, result);
	return true;
}
/**
 * drain_slab_depot() - Flush out all dirty refcounts blocks now that they have been rebuilt or
 *                      recovered.
 * @completion: The repair completion.
 */
static void drain_slab_depot(struct vdo_completion *completion)
{
	struct vdo *vdo = completion->vdo;
	struct repair_completion *repair = as_repair_completion(completion);
	const struct admin_state_code *operation;

	vdo_assert_on_admin_thread(vdo, __func__);

	prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
	if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
		vdo_log_info("Saving rebuilt state");
		operation = VDO_ADMIN_STATE_REBUILDING;
	} else {
		vdo_log_info("Replayed %zu journal entries into slab journals",
			     repair->entries_added_to_slab_journals);
		operation = VDO_ADMIN_STATE_RECOVERING;
	}

	vdo_drain_slab_depot(vdo->depot, operation, completion);
}
/**
 * flush_block_map_updates() - Flush the block map now that all the reference counts are rebuilt.
 * @completion: The repair completion.
 *
 * This callback is registered in finish_if_done().
 */
static void flush_block_map_updates(struct vdo_completion *completion)
{
	vdo_assert_on_admin_thread(completion->vdo, __func__);

	vdo_log_info("Flushing block map changes");
	prepare_repair_completion(as_repair_completion(completion), drain_slab_depot,
				  VDO_ZONE_TYPE_ADMIN);
	vdo_drain_block_map(completion->vdo->block_map, VDO_ADMIN_STATE_RECOVERING,
			    completion);
}
static bool fetch_page(struct repair_completion *repair,
		       struct vdo_completion *completion);
/**
 * handle_page_load_error() - Handle an error loading a page.
 * @completion: The vdo_page_completion.
 */
static void handle_page_load_error(struct vdo_completion *completion)
{
	struct repair_completion *repair = completion->parent;

	repair->outstanding--;
	vdo_set_completion_result(&repair->completion, completion->result);
	vdo_release_page_completion(completion);
	fetch_page(repair, completion);
}
/**
 * unmap_entry() - Unmap an invalid entry and indicate that its page must be written out.
 * @page: The page containing the entries
 * @completion: The page_completion for writing the page
 * @slot: The slot to unmap
 */
static void unmap_entry(struct block_map_page *page, struct vdo_completion *completion,
			slot_number_t slot)
{
	page->entries[slot] = UNMAPPED_BLOCK_MAP_ENTRY;
	vdo_request_page_write(completion);
}
/**
 * remove_out_of_bounds_entries() - Unmap entries which are outside the logical space.
 * @page: The page containing the entries
 * @completion: The page_completion for writing the page
 * @start: The first slot to check
 */
static void remove_out_of_bounds_entries(struct block_map_page *page,
					 struct vdo_completion *completion,
					 slot_number_t start)
{
	slot_number_t slot;

	for (slot = start; slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; slot++) {
		struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]);

		if (vdo_is_mapped_location(&mapping))
			unmap_entry(page, completion, slot);
	}
}
/**
 * process_slot() - Update the reference counts for a single entry.
 * @page: The page containing the entries
 * @completion: The page_completion for writing the page
 * @slot: The slot to check
 *
 * Return: true if the entry was a valid mapping
 */
static bool process_slot(struct block_map_page *page, struct vdo_completion *completion,
			 slot_number_t slot)
{
	struct slab_depot *depot = completion->vdo->depot;
	int result;
	struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]);

	if (!vdo_is_valid_location(&mapping)) {
		/* This entry is invalid, so remove it from the page. */
		unmap_entry(page, completion, slot);
		return false;
	}

	if (!vdo_is_mapped_location(&mapping))
		return false;

	if (mapping.pbn == VDO_ZERO_BLOCK)
		return true;

	if (!vdo_is_physical_data_block(depot, mapping.pbn)) {
		/*
		 * This is a nonsense mapping. Remove it from the map so we're at least
		 * consistent and mark the page dirty.
		 */
		unmap_entry(page, completion, slot);
		return false;
	}

	result = vdo_adjust_reference_count_for_rebuild(depot, mapping.pbn,
							VDO_JOURNAL_DATA_REMAPPING);
	if (result == VDO_SUCCESS)
		return true;

	vdo_log_error_strerror(result,
			       "Could not adjust reference count for PBN %llu, slot %u mapped to PBN %llu",
			       (unsigned long long) vdo_get_block_map_page_pbn(page),
			       slot, (unsigned long long) mapping.pbn);
	unmap_entry(page, completion, slot);
	return false;
}
/**
 * rebuild_reference_counts_from_page() - Rebuild reference counts from a block map page.
 * @repair: The repair completion.
 * @completion: The page completion holding the page.
 */
static void rebuild_reference_counts_from_page(struct repair_completion *repair,
					       struct vdo_completion *completion)
{
	slot_number_t slot, last_slot;
	struct block_map_page *page;
	int result;

	result = vdo_get_cached_page(completion, &page);
	if (result != VDO_SUCCESS) {
		vdo_set_completion_result(&repair->completion, result);
		return;
	}

	if (!page->header.initialized)
		return;

	/* Remove any bogus entries which exist beyond the end of the logical space. */
	if (vdo_get_block_map_page_pbn(page) == repair->last_slot.pbn) {
		last_slot = repair->last_slot.slot;
		remove_out_of_bounds_entries(page, completion, last_slot);
	} else {
		last_slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
	}

	/* Inform the slab depot of all entries on this page. */
	for (slot = 0; slot < last_slot; slot++) {
		if (process_slot(page, completion, slot))
			repair->logical_blocks_used++;
	}
}
/**
 * page_loaded() - Process a page which has just been loaded.
 * @completion: The vdo_page_completion for the fetched page.
 *
 * This callback is registered by fetch_page().
 */
static void page_loaded(struct vdo_completion *completion)
{
	struct repair_completion *repair = completion->parent;

	repair->outstanding--;
	rebuild_reference_counts_from_page(repair, completion);
	vdo_release_page_completion(completion);

	/* Advance progress to the next page, and fetch the next page we haven't yet requested. */
	fetch_page(repair, completion);
}
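
/*
 * get_pbn_to_fetch() - Find the PBN of the next allocated leaf page to read, or VDO_ZERO_BLOCK
 * if there are no more pages to fetch or an error has already been recorded.
 */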
static physical_block_number_t get_pbn_to_fetch(struct repair_completion *repair,
						struct block_map *block_map)
{
	physical_block_number_t pbn = VDO_ZERO_BLOCK;

	if (repair->completion.result != VDO_SUCCESS)
		return VDO_ZERO_BLOCK;

	while ((pbn == VDO_ZERO_BLOCK) && (repair->page_to_fetch < repair->leaf_pages))
		pbn = vdo_find_block_map_page_pbn(block_map, repair->page_to_fetch++);

	if (vdo_is_physical_data_block(repair->completion.vdo->depot, pbn))
		return pbn;

	vdo_set_completion_result(&repair->completion, VDO_BAD_MAPPING);
	return VDO_ZERO_BLOCK;
}
/**
 * fetch_page() - Fetch a page from the block map.
 * @repair: The repair_completion.
 * @completion: The page completion to use.
 *
 * Return: true if the rebuild is complete.
 */
static bool fetch_page(struct repair_completion *repair,
		       struct vdo_completion *completion)
{
	struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;
	struct block_map *block_map = repair->completion.vdo->block_map;
	physical_block_number_t pbn = get_pbn_to_fetch(repair, block_map);

	if (pbn != VDO_ZERO_BLOCK) {
		repair->outstanding++;
		/*
		 * We must set the requeue flag here to ensure that we don't blow the stack if
		 * all the requested pages are already in the cache or get load errors.
		 */
		vdo_get_page(page_completion, &block_map->zones[0], pbn, true, repair,
			     page_loaded, handle_page_load_error, true);
	}

	if (repair->outstanding > 0)
		return false;

	launch_repair_completion(repair, flush_block_map_updates, VDO_ZONE_TYPE_ADMIN);
	return true;
}
/**
 * rebuild_from_leaves() - Rebuild reference counts from the leaf block map pages.
 * @completion: The repair completion.
 *
 * Rebuilds reference counts from the leaf block map pages now that reference counts have been
 * rebuilt from the interior tree pages (which have been loaded in the process). This callback is
 * registered in rebuild_reference_counts().
 */
static void rebuild_from_leaves(struct vdo_completion *completion)
{
	page_count_t i;
	struct repair_completion *repair = as_repair_completion(completion);
	struct block_map *map = completion->vdo->block_map;

	repair->logical_blocks_used = 0;

	/*
	 * The PBN calculation doesn't work until the tree pages have been loaded, so we can't
	 * set this value at the start of repair.
	 */
	repair->leaf_pages = vdo_compute_block_map_page_count(map->entry_count);
	repair->last_slot = (struct block_map_slot) {
		.slot = map->entry_count % VDO_BLOCK_MAP_ENTRIES_PER_PAGE,
		.pbn = vdo_find_block_map_page_pbn(map, repair->leaf_pages - 1),
	};
	if (repair->last_slot.slot == 0)
		repair->last_slot.slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE;

	for (i = 0; i < repair->page_count; i++) {
		if (fetch_page(repair, &repair->page_completions[i].completion)) {
			/*
			 * The rebuild has already moved on, so it isn't safe nor is there a
			 * need to launch any more fetches.
			 */
			break;
		}
	}
}
/**
 * process_entry() - Process a single entry from the block map tree.
 * @pbn: A pbn which holds a block map tree page.
 * @completion: The parent completion of the traversal.
 *
 * Implements vdo_entry_callback_fn.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int process_entry(physical_block_number_t pbn, struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	struct slab_depot *depot = completion->vdo->depot;
	int result;

	if ((pbn == VDO_ZERO_BLOCK) || !vdo_is_physical_data_block(depot, pbn)) {
		return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
					      "PBN %llu out of range",
					      (unsigned long long) pbn);
	}

	result = vdo_adjust_reference_count_for_rebuild(depot, pbn,
							VDO_JOURNAL_BLOCK_MAP_REMAPPING);
	if (result != VDO_SUCCESS) {
		return vdo_log_error_strerror(result,
					      "Could not adjust reference count for block map tree PBN %llu",
					      (unsigned long long) pbn);
	}

	repair->block_map_data_blocks++;
	return VDO_SUCCESS;
}
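
/*
 * rebuild_reference_counts() - Rebuild all reference counts during a read-only rebuild, first
 * from the block map tree pages and then, via rebuild_from_leaves(), from the leaf pages.
 */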
static void rebuild_reference_counts(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	struct vdo *vdo = completion->vdo;
	struct vdo_page_cache *cache = &vdo->block_map->zones[0].page_cache;

	/* We must allocate ref_counts before we can rebuild them. */
	if (abort_on_error(vdo_allocate_reference_counters(vdo->depot), repair))
		return;

	/*
	 * Completion chaining from page cache hits can lead to stack overflow during the
	 * rebuild, so clear out the cache before this rebuild phase.
	 */
	if (abort_on_error(vdo_invalidate_page_cache(cache), repair))
		return;

	prepare_repair_completion(repair, rebuild_from_leaves, VDO_ZONE_TYPE_LOGICAL);
	vdo_traverse_forest(vdo->block_map, process_entry, completion);
}
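
/*
 * increment_recovery_point() - Advance a recovery point to the next entry, moving on to the
 * next sector or journal block when the current one is exhausted.
 */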
static void increment_recovery_point(struct recovery_point *point)
{
	if (++point->entry_count < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR)
		return;

	point->entry_count = 0;
	if (point->sector_count < (VDO_SECTORS_PER_BLOCK - 1)) {
		point->sector_count++;
		return;
	}

	point->sequence_number++;
	point->sector_count = 1;
}
/**
 * advance_points() - Advance the current recovery and journal points.
 * @repair: The repair_completion whose points are to be advanced.
 * @entries_per_block: The number of entries in a recovery journal block.
 */
static void advance_points(struct repair_completion *repair,
			   journal_entry_count_t entries_per_block)
{
	if (!repair->next_recovery_point.increment_applied) {
		repair->next_recovery_point.increment_applied = true;
		return;
	}

	increment_recovery_point(&repair->next_recovery_point);
	vdo_advance_journal_point(&repair->next_journal_point, entries_per_block);
	repair->next_recovery_point.increment_applied = false;
}
/**
 * before_recovery_point() - Check whether the first point precedes the second point.
 * @first: The first recovery point.
 * @second: The second recovery point.
 *
 * Return: true if the first point precedes the second point.
 */
static bool __must_check before_recovery_point(const struct recovery_point *first,
					       const struct recovery_point *second)
{
	if (first->sequence_number < second->sequence_number)
		return true;

	if (first->sequence_number > second->sequence_number)
		return false;

	if (first->sector_count < second->sector_count)
		return true;

	return ((first->sector_count == second->sector_count) &&
		(first->entry_count < second->entry_count));
}
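
/*
 * get_sector() - Get a pointer to the given sector of the given journal block within the
 * in-memory copy of the journal data.
 */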
static struct packed_journal_sector * __must_check get_sector(struct recovery_journal *journal,
							      char *journal_data,
							      sequence_number_t sequence,
							      u8 sector_number)
{
	off_t offset;

	offset = ((vdo_get_recovery_journal_block_number(journal, sequence) * VDO_BLOCK_SIZE) +
		  (VDO_SECTOR_SIZE * sector_number));
	return (struct packed_journal_sector *) (journal_data + offset);
}
/**
 * get_entry() - Unpack the recovery journal entry associated with the given recovery point.
 * @repair: The repair completion.
 * @point: The recovery point.
 *
 * Return: The unpacked contents of the matching recovery journal entry.
 */
static struct recovery_journal_entry get_entry(const struct repair_completion *repair,
					       const struct recovery_point *point)
{
	struct packed_journal_sector *sector;

	sector = get_sector(repair->completion.vdo->recovery_journal,
			    repair->journal_data, point->sequence_number,
			    point->sector_count);
	return vdo_unpack_recovery_journal_entry(&sector->entries[point->entry_count]);
}
/**
 * validate_recovery_journal_entry() - Validate a recovery journal entry.
 * @vdo: The vdo.
 * @entry: The entry to validate.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int validate_recovery_journal_entry(const struct vdo *vdo,
					   const struct recovery_journal_entry *entry)
{
	if ((entry->slot.pbn >= vdo->states.vdo.config.physical_blocks) ||
	    (entry->slot.slot >= VDO_BLOCK_MAP_ENTRIES_PER_PAGE) ||
	    !vdo_is_valid_location(&entry->mapping) ||
	    !vdo_is_valid_location(&entry->unmapping) ||
	    !vdo_is_physical_data_block(vdo->depot, entry->mapping.pbn) ||
	    !vdo_is_physical_data_block(vdo->depot, entry->unmapping.pbn)) {
		return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
					      "Invalid entry: %s (%llu, %u) from %llu to %llu is not within bounds",
					      vdo_get_journal_operation_name(entry->operation),
					      (unsigned long long) entry->slot.pbn,
					      entry->slot.slot,
					      (unsigned long long) entry->unmapping.pbn,
					      (unsigned long long) entry->mapping.pbn);
	}

	if ((entry->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) &&
	    (vdo_is_state_compressed(entry->mapping.state) ||
	     (entry->mapping.pbn == VDO_ZERO_BLOCK) ||
	     (entry->unmapping.state != VDO_MAPPING_STATE_UNMAPPED) ||
	     (entry->unmapping.pbn != VDO_ZERO_BLOCK))) {
		return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
					      "Invalid entry: %s (%llu, %u) from %llu to %llu is not a valid tree mapping",
					      vdo_get_journal_operation_name(entry->operation),
					      (unsigned long long) entry->slot.pbn,
					      entry->slot.slot,
					      (unsigned long long) entry->unmapping.pbn,
					      (unsigned long long) entry->mapping.pbn);
	}

	return VDO_SUCCESS;
}
/**
 * add_slab_journal_entries() - Replay recovery journal entries into the slab journals of the
 *                              allocator currently being recovered.
 * @completion: The allocator completion.
 *
 * Waits for slab journal tailblock space when necessary. This method is its own callback.
 */
static void add_slab_journal_entries(struct vdo_completion *completion)
{
	struct recovery_point *recovery_point;
	struct repair_completion *repair = completion->parent;
	struct vdo *vdo = completion->vdo;
	struct recovery_journal *journal = vdo->recovery_journal;
	struct block_allocator *allocator = vdo_as_block_allocator(completion);

	/* Get ready in case we need to enqueue again. */
	vdo_prepare_completion(completion, add_slab_journal_entries,
			       vdo_notify_slab_journals_are_recovered,
			       completion->callback_thread_id, repair);
	for (recovery_point = &repair->next_recovery_point;
	     before_recovery_point(recovery_point, &repair->tail_recovery_point);
	     advance_points(repair, journal->entries_per_block)) {
		int result;
		physical_block_number_t pbn;
		struct vdo_slab *slab;
		struct recovery_journal_entry entry = get_entry(repair, recovery_point);
		bool increment = !repair->next_recovery_point.increment_applied;

		result = validate_recovery_journal_entry(vdo, &entry);
		if (result != VDO_SUCCESS) {
			vdo_enter_read_only_mode(vdo, result);
			vdo_fail_completion(completion, result);
			return;
		}

		if (increment)
			pbn = entry.mapping.pbn;
		else
			pbn = entry.unmapping.pbn;

		if (pbn == VDO_ZERO_BLOCK)
			continue;

		slab = vdo_get_slab(vdo->depot, pbn);
		if (slab->allocator != allocator)
			continue;

		if (!vdo_attempt_replay_into_slab(slab, pbn, entry.operation, increment,
						  &repair->next_journal_point,
						  completion))
			return;

		repair->entries_added_to_slab_journals++;
	}

	vdo_notify_slab_journals_are_recovered(completion);
}
/**
 * vdo_replay_into_slab_journals() - Replay recovery journal entries in the slab journals of
 *                                   slabs owned by a given block_allocator.
 * @allocator: The allocator whose slab journals are to be recovered.
 * @context: The slab depot load context supplied by a recovery when it loads the depot.
 */
void vdo_replay_into_slab_journals(struct block_allocator *allocator, void *context)
{
	struct vdo_completion *completion = &allocator->completion;
	struct repair_completion *repair = context;
	struct vdo *vdo = completion->vdo;

	vdo_assert_on_physical_zone_thread(vdo, allocator->zone_number, __func__);
	if (repair->entry_count == 0) {
		/* there's nothing to replay */
		repair->logical_blocks_used = vdo->recovery_journal->logical_blocks_used;
		repair->block_map_data_blocks = vdo->recovery_journal->block_map_data_blocks;
		vdo_notify_slab_journals_are_recovered(completion);
		return;
	}

	repair->next_recovery_point = (struct recovery_point) {
		.sequence_number = repair->slab_journal_head,
		.sector_count = 1,
		.entry_count = 0,
	};

	repair->next_journal_point = (struct journal_point) {
		.sequence_number = repair->slab_journal_head,
		.entry_count = 0,
	};

	vdo_log_info("Replaying entries into slab journals for zone %u",
		     allocator->zone_number);
	completion->parent = repair;
	add_slab_journal_entries(completion);
}
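
/*
 * load_slab_depot() - Load the slab depot for the appropriate operation (rebuild or recovery)
 * and register the next phase of the repair as the callback.
 */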
static void load_slab_depot(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	const struct admin_state_code *operation;

	vdo_assert_on_admin_thread(completion->vdo, __func__);

	if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state)) {
		prepare_repair_completion(repair, rebuild_reference_counts,
					  VDO_ZONE_TYPE_LOGICAL);
		operation = VDO_ADMIN_STATE_LOADING_FOR_REBUILD;
	} else {
		prepare_repair_completion(repair, drain_slab_depot, VDO_ZONE_TYPE_ADMIN);
		operation = VDO_ADMIN_STATE_LOADING_FOR_RECOVERY;
	}

	vdo_load_slab_depot(completion->vdo->depot, operation, completion, repair);
}
static void flush_block_map(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	const struct admin_state_code *operation;

	vdo_assert_on_admin_thread(completion->vdo, __func__);

	vdo_log_info("Flushing block map changes");
	prepare_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
	operation = (vdo_state_requires_read_only_rebuild(completion->vdo->load_state) ?
		     VDO_ADMIN_STATE_REBUILDING :
		     VDO_ADMIN_STATE_RECOVERING);
	vdo_drain_block_map(completion->vdo->block_map, operation, completion);
}
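
/*
 * finish_if_done() - Check whether the block map replay is done, either because an error has
 * occurred or because all entries have been applied, and launch the next phase if so.
 *
 * Return: true if the repair completion has been launched and no more replay work remains.
 */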
static bool finish_if_done(struct repair_completion *repair)
{
	/* Pages are still being launched or there is still work to do */
	if (repair->launching || (repair->outstanding > 0))
		return false;

	if (repair->completion.result != VDO_SUCCESS) {
		page_count_t i;

		for (i = 0; i < repair->page_count; i++) {
			struct vdo_page_completion *page_completion =
				&repair->page_completions[i];

			if (page_completion->ready)
				vdo_release_page_completion(&page_completion->completion);
		}

		vdo_launch_completion(&repair->completion);
		return true;
	}

	if (repair->current_entry >= repair->entries)
		return false;

	launch_repair_completion(repair, flush_block_map, VDO_ZONE_TYPE_ADMIN);
	return true;
}
static void abort_block_map_recovery(struct repair_completion *repair, int result)
{
	vdo_set_completion_result(&repair->completion, result);
	finish_if_done(repair);
}
/**
 * find_entry_starting_next_page() - Find the first journal entry after a given entry which is
 *                                   not on the same block map page.
 * @repair: The repair completion.
 * @current_entry: The entry to search from.
 * @needs_sort: Whether sorting is needed to proceed.
 *
 * Return: Pointer to the first later journal entry on a different block map page, or a pointer
 *         to just before the journal entries if no subsequent entry is on a different block map
 *         page.
 */
static struct numbered_block_mapping *
find_entry_starting_next_page(struct repair_completion *repair,
			      struct numbered_block_mapping *current_entry, bool needs_sort)
{
	size_t current_page;

	/* If current_entry is invalid, return immediately. */
	if (current_entry < repair->entries)
		return current_entry;

	current_page = current_entry->block_map_slot.pbn;

	/* Decrement current_entry until it's out of bounds or on a different page. */
	while ((current_entry >= repair->entries) &&
	       (current_entry->block_map_slot.pbn == current_page)) {
		if (needs_sort) {
			struct numbered_block_mapping *just_sorted_entry =
				sort_next_heap_element(repair);

			VDO_ASSERT_LOG_ONLY(just_sorted_entry < current_entry,
					    "heap is returning elements in an unexpected order");
		}

		current_entry--;
	}

	return current_entry;
}
/*
 * Apply a range of journal entries [starting_entry, ending_entry) to a block map page.
 */
static void apply_journal_entries_to_page(struct block_map_page *page,
					  struct numbered_block_mapping *starting_entry,
					  struct numbered_block_mapping *ending_entry)
{
	struct numbered_block_mapping *current_entry = starting_entry;

	while (current_entry != ending_entry) {
		page->entries[current_entry->block_map_slot.slot] = current_entry->block_map_entry;
		current_entry--;
	}
}
static void recover_ready_pages(struct repair_completion *repair,
				struct vdo_completion *completion);
static void block_map_page_loaded(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion->parent);

	repair->outstanding--;
	if (!repair->launching)
		recover_ready_pages(repair, completion);
}
static void handle_block_map_page_load_error(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion->parent);

	repair->outstanding--;
	abort_block_map_recovery(repair, completion->result);
}
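
/*
 * fetch_block_map_page() - Request the block map page containing the next unfetched journal
 * entry, and advance current_unfetched_entry past all entries destined for that page.
 */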
static void fetch_block_map_page(struct repair_completion *repair,
				 struct vdo_completion *completion)
{
	physical_block_number_t pbn;

	if (repair->current_unfetched_entry < repair->entries)
		/* Nothing left to fetch. */
		return;

	/* Fetch the next page we haven't yet requested. */
	pbn = repair->current_unfetched_entry->block_map_slot.pbn;
	repair->current_unfetched_entry =
		find_entry_starting_next_page(repair, repair->current_unfetched_entry,
					      true);
	repair->outstanding++;
	vdo_get_page(((struct vdo_page_completion *) completion),
		     &repair->completion.vdo->block_map->zones[0], pbn, true,
		     &repair->completion, block_map_page_loaded,
		     handle_block_map_page_load_error, false);
}
static struct vdo_page_completion *get_next_page_completion(struct repair_completion *repair,
							     struct vdo_page_completion *completion)
{
	completion++;
	if (completion == (&repair->page_completions[repair->page_count]))
		completion = &repair->page_completions[0];

	return completion;
}
static void recover_ready_pages(struct repair_completion *repair,
				struct vdo_completion *completion)
{
	struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;

	if (finish_if_done(repair))
		return;

	if (repair->pbn != page_completion->pbn)
		return;

	while (page_completion->ready) {
		struct numbered_block_mapping *start_of_next_page;
		struct block_map_page *page;
		int result;

		result = vdo_get_cached_page(completion, &page);
		if (result != VDO_SUCCESS) {
			abort_block_map_recovery(repair, result);
			return;
		}

		start_of_next_page =
			find_entry_starting_next_page(repair, repair->current_entry,
						      false);
		apply_journal_entries_to_page(page, repair->current_entry,
					      start_of_next_page);
		repair->current_entry = start_of_next_page;
		vdo_request_page_write(completion);
		vdo_release_page_completion(completion);

		if (finish_if_done(repair))
			return;

		repair->pbn = repair->current_entry->block_map_slot.pbn;
		fetch_block_map_page(repair, completion);
		page_completion = get_next_page_completion(repair, page_completion);
		completion = &page_completion->completion;
	}
}
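
/*
 * recover_block_map() - Replay the extracted journal entries into the block map, using the min
 * heap to process them grouped by block map page while preserving journal order within a slot.
 */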
static void recover_block_map(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	struct vdo *vdo = completion->vdo;
	struct numbered_block_mapping *first_sorted_entry;
	page_count_t i;

	vdo_assert_on_logical_zone_thread(vdo, 0, __func__);

	/* Suppress block map errors. */
	vdo->block_map->zones[0].page_cache.rebuilding =
		vdo_state_requires_read_only_rebuild(vdo->load_state);

	if (repair->block_map_entry_count == 0) {
		vdo_log_info("Replaying 0 recovery entries into block map");
		vdo_free(vdo_forget(repair->journal_data));
		launch_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
		return;
	}

	/*
	 * Organize the journal entries into a binary heap so we can iterate over them in sorted
	 * order incrementally, avoiding an expensive sort call.
	 */
	repair->replay_heap = (struct replay_heap) {
		.data = repair->entries,
		.nr = repair->block_map_entry_count,
		.size = repair->block_map_entry_count,
	};
	min_heapify_all(&repair->replay_heap, &repair_min_heap, NULL);

	vdo_log_info("Replaying %zu recovery entries into block map",
		     repair->block_map_entry_count);

	repair->current_entry = &repair->entries[repair->block_map_entry_count - 1];
	first_sorted_entry = sort_next_heap_element(repair);
	VDO_ASSERT_LOG_ONLY(first_sorted_entry == repair->current_entry,
			    "heap is returning elements in an unexpected order");

	/* Prevent any page from being processed until all pages have been launched. */
	repair->launching = true;
	repair->pbn = repair->current_entry->block_map_slot.pbn;
	repair->current_unfetched_entry = repair->current_entry;
	for (i = 0; i < repair->page_count; i++) {
		if (repair->current_unfetched_entry < repair->entries)
			break;

		fetch_block_map_page(repair, &repair->page_completions[i].completion);
	}
	repair->launching = false;

	/* Process any ready pages. */
	recover_ready_pages(repair, &repair->page_completions[0].completion);
}
/**
 * get_recovery_journal_block_header() - Get the block header for a block at a position in the
 *                                       journal data and unpack it.
 * @journal: The recovery journal.
 * @data: The recovery journal data.
 * @sequence: The sequence number.
 *
 * Return: The unpacked header.
 */
static struct recovery_block_header __must_check
get_recovery_journal_block_header(struct recovery_journal *journal, char *data,
				  sequence_number_t sequence)
{
	physical_block_number_t pbn =
		vdo_get_recovery_journal_block_number(journal, sequence);
	char *header = &data[pbn * VDO_BLOCK_SIZE];

	return vdo_unpack_recovery_block_header((struct packed_journal_header *) header);
}
/**
 * is_valid_recovery_journal_block() - Determine whether the given header describes a valid
 *                                     block for the given journal.
 * @journal: The journal to use.
 * @header: The unpacked block header to check.
 * @old_ok: Whether an old format header is valid.
 *
 * A block is not valid if it is unformatted, or if it is older than the last successful
 * recovery or reformat.
 *
 * Return: True if the header is valid.
 */
static bool __must_check is_valid_recovery_journal_block(const struct recovery_journal *journal,
							 const struct recovery_block_header *header,
							 bool old_ok)
{
	if ((header->nonce != journal->nonce) ||
	    (header->recovery_count != journal->recovery_count))
		return false;

	if (header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL_2)
		return (header->entry_count <= journal->entries_per_block);

	return (old_ok &&
		(header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL) &&
		(header->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK));
}
/**
 * is_exact_recovery_journal_block() - Determine whether the given header describes the exact
 *                                     block indicated.
 * @journal: The journal to use.
 * @header: The unpacked block header to check.
 * @sequence: The expected sequence number.
 *
 * Return: True if the block matches.
 */
static bool __must_check is_exact_recovery_journal_block(const struct recovery_journal *journal,
							  const struct recovery_block_header *header,
							  sequence_number_t sequence)
{
	return ((header->sequence_number == sequence) &&
		(is_valid_recovery_journal_block(journal, header, true)));
}
/**
 * find_recovery_journal_head_and_tail() - Find the tail and head of the journal.
 * @repair: The repair completion.
 *
 * Return: True if there were valid journal blocks.
 */
static bool find_recovery_journal_head_and_tail(struct repair_completion *repair)
{
	struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
	bool found_entries = false;
	physical_block_number_t i;

	/*
	 * Ensure that we don't replay old entries since we know the tail recorded in the super
	 * block must be a lower bound. Not doing so can result in extra data loss by setting
	 * the tail too early.
	 */
	repair->highest_tail = journal->tail;
	for (i = 0; i < journal->size; i++) {
		struct recovery_block_header header =
			get_recovery_journal_block_header(journal, repair->journal_data, i);

		if (!is_valid_recovery_journal_block(journal, &header, true)) {
			/* This block is old or incorrectly formatted */
			continue;
		}

		if (vdo_get_recovery_journal_block_number(journal, header.sequence_number) != i) {
			/* This block is in the wrong location */
			continue;
		}

		if (header.sequence_number >= repair->highest_tail) {
			found_entries = true;
			repair->highest_tail = header.sequence_number;
		}

		if (!found_entries)
			continue;

		if (header.block_map_head > repair->block_map_head)
			repair->block_map_head = header.block_map_head;

		if (header.slab_journal_head > repair->slab_journal_head)
			repair->slab_journal_head = header.slab_journal_head;
	}

	return found_entries;
}
/**
 * unpack_entry() - Unpack a recovery journal entry in either format.
 * @vdo: The vdo.
 * @packed: The entry to unpack.
 * @format: The expected format of the entry.
 * @entry: The unpacked entry.
 *
 * Return: true if the entry should be applied.
 */
static bool unpack_entry(struct vdo *vdo, char *packed, enum vdo_metadata_type format,
			 struct recovery_journal_entry *entry)
{
	if (format == VDO_METADATA_RECOVERY_JOURNAL_2) {
		struct packed_recovery_journal_entry *packed_entry =
			(struct packed_recovery_journal_entry *) packed;

		*entry = vdo_unpack_recovery_journal_entry(packed_entry);
	} else {
		physical_block_number_t low32, high4;

		struct packed_recovery_journal_entry_1 *packed_entry =
			(struct packed_recovery_journal_entry_1 *) packed;

		if (packed_entry->operation == VDO_JOURNAL_DATA_INCREMENT)
			entry->operation = VDO_JOURNAL_DATA_REMAPPING;
		else if (packed_entry->operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT)
			entry->operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING;
		else
			return false;

		low32 = __le32_to_cpu(packed_entry->pbn_low_word);
		high4 = packed_entry->pbn_high_nibble;
		entry->slot = (struct block_map_slot) {
			.pbn = ((high4 << 32) | low32),
			.slot = (packed_entry->slot_low | (packed_entry->slot_high << 6)),
		};
		entry->mapping = vdo_unpack_block_map_entry(&packed_entry->block_map_entry);
		entry->unmapping = (struct data_location) {
			.pbn = VDO_ZERO_BLOCK,
			.state = VDO_MAPPING_STATE_UNMAPPED,
		};
	}

	return (validate_recovery_journal_entry(vdo, entry) == VDO_SUCCESS);
}
/**
 * append_sector_entries() - Append an array of recovery journal entries from a journal block
 *                           sector to the array of numbered mappings in the repair completion,
 *                           numbering each entry in the order they are appended.
 * @repair: The repair completion.
 * @entries: The entries in the sector.
 * @format: The format of the sector.
 * @entry_count: The number of entries to append.
 */
static void append_sector_entries(struct repair_completion *repair, char *entries,
				  enum vdo_metadata_type format,
				  journal_entry_count_t entry_count)
{
	journal_entry_count_t i;
	struct vdo *vdo = repair->completion.vdo;
	off_t increment = ((format == VDO_METADATA_RECOVERY_JOURNAL_2)
			   ? sizeof(struct packed_recovery_journal_entry)
			   : sizeof(struct packed_recovery_journal_entry_1));

	for (i = 0; i < entry_count; i++, entries += increment) {
		struct recovery_journal_entry entry;

		if (!unpack_entry(vdo, entries, format, &entry))
			/* When recovering from read-only mode, ignore damaged entries. */
			continue;

		repair->entries[repair->block_map_entry_count] =
			(struct numbered_block_mapping) {
			.block_map_slot = entry.slot,
			.block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn,
								    entry.mapping.state),
			.number = repair->block_map_entry_count,
		};
		repair->block_map_entry_count++;
	}
}
static journal_entry_count_t entries_per_sector(enum vdo_metadata_type format,
						u8 sector_number)
{
	if (format == VDO_METADATA_RECOVERY_JOURNAL_2)
		return RECOVERY_JOURNAL_ENTRIES_PER_SECTOR;

	return ((sector_number == (VDO_SECTORS_PER_BLOCK - 1))
		? RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR
		: RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR);
}
static void extract_entries_from_block(struct repair_completion *repair,
				       struct recovery_journal *journal,
				       sequence_number_t sequence,
				       enum vdo_metadata_type format,
				       journal_entry_count_t entries)
{
	u8 i;
	struct recovery_block_header header =
		get_recovery_journal_block_header(journal, repair->journal_data,
						  sequence);

	if (!is_exact_recovery_journal_block(journal, &header, sequence) ||
	    (header.metadata_type != format)) {
		/* This block is invalid, so skip it. */
		return;
	}

	entries = min(entries, header.entry_count);
	for (i = 1; i < VDO_SECTORS_PER_BLOCK; i++) {
		struct packed_journal_sector *sector =
			get_sector(journal, repair->journal_data, sequence, i);
		journal_entry_count_t sector_entries =
			min(entries, entries_per_sector(format, i));

		if (vdo_is_valid_recovery_journal_sector(&header, sector, i)) {
			/* Only extract as many as the block header calls for. */
			append_sector_entries(repair, (char *) sector->entries, format,
					      min_t(journal_entry_count_t,
						    sector->entry_count,
						    sector_entries));
		}

		/*
		 * Even if the sector wasn't full, count it as full when counting up to the
		 * entry count the block header claims.
		 */
		entries -= sector_entries;
	}
}
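
/*
 * parse_journal_for_rebuild() - For a read-only rebuild, transcribe the entries from every
 * valid journal block between the block map head and the highest tail into the numbered
 * mapping array.
 */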
static int parse_journal_for_rebuild(struct repair_completion *repair)
{
	int result;
	sequence_number_t i;
	block_count_t count;
	enum vdo_metadata_type format;
	struct vdo *vdo = repair->completion.vdo;
	struct recovery_journal *journal = vdo->recovery_journal;
	journal_entry_count_t entries_per_block = journal->entries_per_block;

	format = get_recovery_journal_block_header(journal, repair->journal_data,
						   repair->highest_tail).metadata_type;
	if (format == VDO_METADATA_RECOVERY_JOURNAL)
		entries_per_block = RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK;

	/*
	 * Allocate an array of numbered_block_mapping structures large enough to transcribe
	 * every packed_recovery_journal_entry from every valid journal block.
	 */
	count = ((repair->highest_tail - repair->block_map_head + 1) * entries_per_block);
	result = vdo_allocate(count, struct numbered_block_mapping, __func__,
			      &repair->entries);
	if (result != VDO_SUCCESS)
		return result;

	for (i = repair->block_map_head; i <= repair->highest_tail; i++)
		extract_entries_from_block(repair, journal, i, format, entries_per_block);

	return VDO_SUCCESS;
}
static int validate_heads(struct repair_completion *repair)
{
	/* Both reap heads must be behind the tail. */
	if ((repair->block_map_head <= repair->tail) &&
	    (repair->slab_journal_head <= repair->tail))
		return VDO_SUCCESS;

	return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
				      "Journal tail too early. block map head: %llu, slab journal head: %llu, tail: %llu",
				      (unsigned long long) repair->block_map_head,
				      (unsigned long long) repair->slab_journal_head,
				      (unsigned long long) repair->tail);
}
/**
 * extract_new_mappings() - Find all valid new mappings to be applied to the block map.
 * @repair: The repair completion.
 *
 * The mappings are extracted from the journal and stored in a sortable array so that all of
 * the mappings to be applied to a given block map page can be done in a single page fetch.
 */
static int extract_new_mappings(struct repair_completion *repair)
{
	int result;
	struct vdo *vdo = repair->completion.vdo;
	struct recovery_point recovery_point = {
		.sequence_number = repair->block_map_head,
		.sector_count = 1,
		.entry_count = 0,
	};

	/*
	 * Allocate an array of numbered_block_mapping structs just large enough to transcribe
	 * every packed_recovery_journal_entry from every valid journal block.
	 */
	result = vdo_allocate(repair->entry_count, struct numbered_block_mapping,
			      __func__, &repair->entries);
	if (result != VDO_SUCCESS)
		return result;

	for (; before_recovery_point(&recovery_point, &repair->tail_recovery_point);
	     increment_recovery_point(&recovery_point)) {
		struct recovery_journal_entry entry = get_entry(repair, &recovery_point);

		result = validate_recovery_journal_entry(vdo, &entry);
		if (result != VDO_SUCCESS) {
			vdo_enter_read_only_mode(vdo, result);
			return result;
		}

		repair->entries[repair->block_map_entry_count] =
			(struct numbered_block_mapping) {
			.block_map_slot = entry.slot,
			.block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn,
								    entry.mapping.state),
			.number = repair->block_map_entry_count,
		};
		repair->block_map_entry_count++;
	}

	result = VDO_ASSERT((repair->block_map_entry_count <= repair->entry_count),
			    "approximate entry count is an upper bound");
	if (result != VDO_SUCCESS)
		vdo_enter_read_only_mode(vdo, result);

	return result;
}
/**
 * compute_usages() - Compute the lbns in use and block map data blocks counts from the tail of
 *                    the journal.
 * @repair: The repair completion.
 */
static noinline int compute_usages(struct repair_completion *repair)
{
	/*
	 * This function is declared noinline to avoid a spurious valgrind error regarding the
	 * following structure being uninitialized.
	 */
	struct recovery_point recovery_point = {
		.sequence_number = repair->tail,
		.sector_count = 1,
		.entry_count = 0,
	};
	struct vdo *vdo = repair->completion.vdo;
	struct recovery_journal *journal = vdo->recovery_journal;
	struct recovery_block_header header =
		get_recovery_journal_block_header(journal, repair->journal_data,
						  repair->tail);

	repair->logical_blocks_used = header.logical_blocks_used;
	repair->block_map_data_blocks = header.block_map_data_blocks;

	for (; before_recovery_point(&recovery_point, &repair->tail_recovery_point);
	     increment_recovery_point(&recovery_point)) {
		int result;
		struct recovery_journal_entry entry = get_entry(repair, &recovery_point);

		result = validate_recovery_journal_entry(vdo, &entry);
		if (result != VDO_SUCCESS) {
			vdo_enter_read_only_mode(vdo, result);
			return result;
		}

		if (entry.operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
			repair->block_map_data_blocks++;
			continue;
		}

		if (vdo_is_mapped_location(&entry.mapping))
			repair->logical_blocks_used++;

		if (vdo_is_mapped_location(&entry.unmapping))
			repair->logical_blocks_used--;
	}

	return VDO_SUCCESS;
}
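
/*
 * parse_journal_for_recovery() - For a normal recovery, walk the journal from the oldest head
 * to the first torn or missing block to establish the usable tail, then extract the mappings
 * and compute the logical block and block map usage counts.
 */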
static int parse_journal_for_recovery(struct repair_completion *repair)
{
	int result;
	sequence_number_t i, head;
	bool found_entries = false;
	struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
	struct recovery_block_header header;
	enum vdo_metadata_type expected_format;

	head = min(repair->block_map_head, repair->slab_journal_head);
	header = get_recovery_journal_block_header(journal, repair->journal_data, head);
	expected_format = header.metadata_type;
	for (i = head; i <= repair->highest_tail; i++) {
		journal_entry_count_t block_entries;
		u8 j;

		repair->tail = i;
		repair->tail_recovery_point = (struct recovery_point) {
			.sequence_number = i,
			.sector_count = 0,
			.entry_count = 0,
		};

		header = get_recovery_journal_block_header(journal, repair->journal_data, i);
		if (!is_exact_recovery_journal_block(journal, &header, i)) {
			/* A bad block header was found so this must be the end of the journal. */
			break;
		} else if (header.metadata_type != expected_format) {
			/* There is a mix of old and new format blocks, so we need to rebuild. */
			vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
					       "Recovery journal is in an invalid format, a read-only rebuild is required.");
			vdo_enter_read_only_mode(repair->completion.vdo, VDO_CORRUPT_JOURNAL);
			return VDO_CORRUPT_JOURNAL;
		}

		block_entries = header.entry_count;

		/* Examine each sector in turn to determine the last valid sector. */
		for (j = 1; j < VDO_SECTORS_PER_BLOCK; j++) {
			struct packed_journal_sector *sector =
				get_sector(journal, repair->journal_data, i, j);
			journal_entry_count_t sector_entries =
				min_t(journal_entry_count_t, sector->entry_count,
				      block_entries);

			/* A bad sector means that this block was torn. */
			if (!vdo_is_valid_recovery_journal_sector(&header, sector, j))
				break;

			if (sector_entries > 0) {
				found_entries = true;
				repair->tail_recovery_point.sector_count++;
				repair->tail_recovery_point.entry_count = sector_entries;
				block_entries -= sector_entries;
				repair->entry_count += sector_entries;
			}

			/* If this sector is short, the later sectors can't matter. */
			if ((sector_entries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) ||
			    (block_entries == 0))
				break;
		}

		/* If this block was not filled, or if it tore, no later block can matter. */
		if ((header.entry_count != journal->entries_per_block) || (block_entries > 0))
			break;
	}

	if (!found_entries) {
		return validate_heads(repair);
	} else if (expected_format == VDO_METADATA_RECOVERY_JOURNAL) {
		/* All journal blocks have the old format, so we need to upgrade. */
		vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION,
				       "Recovery journal is in the old format. Downgrade and complete recovery, then upgrade with a clean volume");
		return VDO_UNSUPPORTED_VERSION;
	}

	/* Set the tail to the last valid tail block, if there is one. */
	if (repair->tail_recovery_point.sector_count == 0)
		repair->tail--;

	result = validate_heads(repair);
	if (result != VDO_SUCCESS)
		return result;

	vdo_log_info("Highest-numbered recovery journal block has sequence number %llu, and the highest-numbered usable block is %llu",
		     (unsigned long long) repair->highest_tail,
		     (unsigned long long) repair->tail);

	result = extract_new_mappings(repair);
	if (result != VDO_SUCCESS)
		return result;

	return compute_usages(repair);
}
static int parse_journal(struct repair_completion *repair)
{
	if (!find_recovery_journal_head_and_tail(repair))
		return VDO_SUCCESS;

	return (vdo_state_requires_read_only_rebuild(repair->completion.vdo->load_state) ?
		parse_journal_for_rebuild(repair) :
		parse_journal_for_recovery(repair));
}
static void finish_journal_load(struct vdo_completion *completion)
{
	struct repair_completion *repair = completion->parent;

	if (++repair->vios_complete != repair->vio_count)
		return;

	vdo_log_info("Finished reading recovery journal");
	uninitialize_vios(repair);
	prepare_repair_completion(repair, recover_block_map, VDO_ZONE_TYPE_LOGICAL);
	vdo_continue_completion(&repair->completion, parse_journal(repair));
}
static void handle_journal_load_error(struct vdo_completion *completion)
{
	struct repair_completion *repair = completion->parent;

	/* Preserve the error */
	vdo_set_completion_result(&repair->completion, completion->result);
	vio_record_metadata_io_error(as_vio(completion));
	completion->callback(completion);
}
static void read_journal_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct vdo *vdo = vio->completion.vdo;

	continue_vio_after_io(vio, finish_journal_load, vdo->thread_config.admin_thread);
}
/**
 * vdo_repair() - Load the recovery journal and then recover or rebuild a vdo.
 * @parent: The completion to notify when the operation is complete
 */
void vdo_repair(struct vdo_completion *parent)
{
	int result;
	char *ptr;
	struct repair_completion *repair;
	struct vdo *vdo = parent->vdo;
	struct recovery_journal *journal = vdo->recovery_journal;
	physical_block_number_t pbn = journal->origin;
	block_count_t remaining = journal->size;
	block_count_t vio_count = DIV_ROUND_UP(remaining, MAX_BLOCKS_PER_VIO);
	page_count_t page_count = min_t(page_count_t,
					vdo->device_config->cache_size >> 1,
					MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS);

	vdo_assert_on_admin_thread(vdo, __func__);

	if (vdo->load_state == VDO_FORCE_REBUILD) {
		vdo_log_warning("Rebuilding reference counts to clear read-only mode");
		vdo->states.vdo.read_only_recoveries++;
	} else if (vdo->load_state == VDO_REBUILD_FOR_UPGRADE) {
		vdo_log_warning("Rebuilding reference counts for upgrade");
	} else {
		vdo_log_warning("Device was dirty, rebuilding reference counts");
	}

	result = vdo_allocate_extended(struct repair_completion, page_count,
				       struct vdo_page_completion, __func__,
				       &repair);
	if (result != VDO_SUCCESS) {
		vdo_fail_completion(parent, result);
		return;
	}

	vdo_initialize_completion(&repair->completion, vdo, VDO_REPAIR_COMPLETION);
	repair->completion.error_handler = abort_repair;
	repair->completion.parent = parent;
	prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
	repair->page_count = page_count;

	result = vdo_allocate(remaining * VDO_BLOCK_SIZE, char, __func__,
			      &repair->journal_data);
	if (abort_on_error(result, repair))
		return;

	result = vdo_allocate(vio_count, struct vio, __func__, &repair->vios);
	if (abort_on_error(result, repair))
		return;

	ptr = repair->journal_data;
	for (repair->vio_count = 0; repair->vio_count < vio_count; repair->vio_count++) {
		block_count_t blocks = min_t(block_count_t, remaining,
					     MAX_BLOCKS_PER_VIO);

		result = allocate_vio_components(vdo, VIO_TYPE_RECOVERY_JOURNAL,
						 VIO_PRIORITY_METADATA,
						 repair, blocks, ptr,
						 &repair->vios[repair->vio_count]);
		if (abort_on_error(result, repair))
			return;

		ptr += (blocks * VDO_BLOCK_SIZE);
		remaining -= blocks;
	}

	for (vio_count = 0; vio_count < repair->vio_count;
	     vio_count++, pbn += MAX_BLOCKS_PER_VIO) {
		vdo_submit_metadata_vio(&repair->vios[vio_count], pbn, read_journal_endio,
					handle_journal_load_error, REQ_OP_READ);
	}
}