// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "repair.h"

#include <linux/min_heap.h>
#include <linux/minmax.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

#include "block-map.h"
#include "completion.h"
#include "constants.h"
#include "encodings.h"
#include "io-submitter.h"
#include "recovery-journal.h"
#include "slab-depot.h"
#include "types.h"
#include "vdo.h"
#include "wait-queue.h"
/*
 * An explicitly numbered block mapping. Numbering the mappings allows them to be sorted by
 * logical block number during repair while still preserving the relative order of journal
 * entries with the same logical block number.
 */
struct numbered_block_mapping {
	struct block_map_slot block_map_slot;
	struct block_map_entry block_map_entry;
	/* A serial number to use during replay */
	u32 number;
} __packed;
/*
 * The absolute position of an entry in the recovery journal, including the sector number and
 * the entry number within the sector.
 */
struct recovery_point {
	/* Block sequence number */
	sequence_number_t sequence_number;
	/* Sector number */
	u8 sector_count;
	/* Entry number */
	journal_entry_count_t entry_count;
	/* Whether or not the increment portion of the current entry has been applied */
	bool increment_applied;
};
DEFINE_MIN_HEAP(struct numbered_block_mapping, replay_heap);
struct repair_completion {
	/* The completion header */
	struct vdo_completion completion;

	/* A buffer to hold the data read off disk */
	char *journal_data;

	/* For loading the journal */
	data_vio_count_t vio_count;
	data_vio_count_t vios_complete;
	struct vio *vios;

	/* The number of entries to be applied to the block map */
	size_t block_map_entry_count;
	/* The sequence number of the first valid block for block map recovery */
	sequence_number_t block_map_head;
	/* The sequence number of the first valid block for slab journal replay */
	sequence_number_t slab_journal_head;
	/* The sequence number of the last valid block of the journal (if known) */
	sequence_number_t tail;
	/*
	 * The highest sequence number of the journal. During recovery (vs read-only rebuild),
	 * not the same as the tail, since the tail ignores blocks after the first hole.
	 */
	sequence_number_t highest_tail;

	/* The number of logical blocks currently known to be in use */
	block_count_t logical_blocks_used;
	/* The number of block map data blocks known to be allocated */
	block_count_t block_map_data_blocks;

	/* These fields are for playing the journal into the block map */
	/* The entry data for the block map recovery */
	struct numbered_block_mapping *entries;
	/* The number of entries in the entry array */
	size_t entry_count;
	/* number of pending (non-ready) requests*/
	page_count_t outstanding;
	/* number of page completions */
	page_count_t page_count;
	/* Whether the initial batch of page fetches is still being launched */
	bool launching;
	/*
	 * a heap wrapping journal_entries. It re-orders and sorts journal entries in ascending
	 * LBN order, then original journal order. This permits efficient iteration over the
	 * journal entries in order.
	 */
	struct replay_heap replay_heap;
	/* Fields tracking progress through the journal entries. */
	struct numbered_block_mapping *current_entry;
	struct numbered_block_mapping *current_unfetched_entry;
	/* Current requested page's PBN */
	physical_block_number_t pbn;

	/* These fields are only used during recovery. */
	/* A location just beyond the last valid entry of the journal */
	struct recovery_point tail_recovery_point;
	/* The location of the next recovery journal entry to apply */
	struct recovery_point next_recovery_point;
	/* The journal point to give to the next synthesized decref */
	struct journal_point next_journal_point;
	/* The number of entries played into slab journals */
	size_t entries_added_to_slab_journals;

	/* These fields are only used during read-only rebuild */
	page_count_t page_to_fetch;
	/* the number of leaf pages in the block map */
	page_count_t leaf_pages;
	/* the last slot of the block map */
	struct block_map_slot last_slot;

	/*
	 * The page completions used for playing the journal into the block map, and, during
	 * read-only rebuild, for rebuilding the reference counts from the block map.
	 */
	struct vdo_page_completion page_completions[];
};
/*
 * This is a min_heap callback function that orders numbered_block_mappings using the
 * 'block_map_slot' field as the primary key and the mapping 'number' field as the secondary key.
 * Using the mapping number preserves the journal order of entries for the same slot, allowing us
 * to sort by slot while still ensuring we replay all entries with the same slot in the exact
 * order as they appeared in the journal.
 */
static bool mapping_is_less_than(const void *item1, const void *item2,
				 void __always_unused *args)
{
	const struct numbered_block_mapping *mapping1 =
		(const struct numbered_block_mapping *) item1;
	const struct numbered_block_mapping *mapping2 =
		(const struct numbered_block_mapping *) item2;

	if (mapping1->block_map_slot.pbn != mapping2->block_map_slot.pbn)
		return mapping1->block_map_slot.pbn < mapping2->block_map_slot.pbn;

	if (mapping1->block_map_slot.slot != mapping2->block_map_slot.slot)
		return mapping1->block_map_slot.slot < mapping2->block_map_slot.slot;

	if (mapping1->number != mapping2->number)
		return mapping1->number < mapping2->number;

	return 0;
}
static void swap_mappings(void *item1, void *item2, void __always_unused *args)
{
	struct numbered_block_mapping *mapping1 = item1;
	struct numbered_block_mapping *mapping2 = item2;

	swap(*mapping1, *mapping2);
}
static const struct min_heap_callbacks repair_min_heap = {
	.less = mapping_is_less_than,
	.swp = swap_mappings,
};
static struct numbered_block_mapping *sort_next_heap_element(struct repair_completion *repair)
{
	struct replay_heap *heap = &repair->replay_heap;
	struct numbered_block_mapping *last;

	if (heap->nr == 0)
		return NULL;

	/*
	 * Swap the next heap element with the last one on the heap, popping it off the heap,
	 * restore the heap invariant, and return a pointer to the popped element.
	 */
	last = &repair->entries[--heap->nr];
	swap_mappings(heap->data, last, NULL);
	min_heap_sift_down(heap, 0, &repair_min_heap, NULL);
	return last;
}
/**
 * as_repair_completion() - Convert a generic completion to a repair_completion.
 * @completion: The completion to convert.
 *
 * Return: The repair_completion.
 */
static inline struct repair_completion * __must_check
as_repair_completion(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_REPAIR_COMPLETION);
	return container_of(completion, struct repair_completion, completion);
}
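
/*
 * prepare_repair_completion() - Reset the repair completion and register the next callback on
 * the appropriate thread: logical zone 0 for block map work, the admin thread otherwise.
 */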
static void prepare_repair_completion(struct repair_completion *repair,
				      vdo_action_fn callback, enum vdo_zone_type zone_type)
{
	struct vdo_completion *completion = &repair->completion;
	const struct thread_config *thread_config = &completion->vdo->thread_config;
	thread_id_t thread_id;

	/* All blockmap access is done on single thread, so use logical zone 0. */
	thread_id = ((zone_type == VDO_ZONE_TYPE_LOGICAL) ?
		     thread_config->logical_threads[0] :
		     thread_config->admin_thread);
	vdo_reset_completion(completion);
	vdo_set_completion_callback(completion, callback, thread_id);
}
static void launch_repair_completion(struct repair_completion *repair,
				     vdo_action_fn callback, enum vdo_zone_type zone_type)
{
	prepare_repair_completion(repair, callback, zone_type);
	vdo_launch_completion(&repair->completion);
}
static void uninitialize_vios(struct repair_completion *repair)
{
	while (repair->vio_count > 0)
		free_vio_components(&repair->vios[--repair->vio_count]);

	vdo_free(vdo_forget(repair->vios));
}
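
/*
 * free_repair_completion() - Release everything held by a repair completion, clearing the page
 * cache rebuilding flag on the way out since this is the common cleanup path.
 */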
static void free_repair_completion(struct repair_completion *repair)
{
	if (repair == NULL)
		return;

	/*
	 * We do this here because this function is the only common bottleneck for all clean up
	 * paths.
	 */
	repair->completion.vdo->block_map->zones[0].page_cache.rebuilding = false;

	uninitialize_vios(repair);
	vdo_free(vdo_forget(repair->journal_data));
	vdo_free(vdo_forget(repair->entries));
	vdo_free(repair);
}
static void finish_repair(struct vdo_completion *completion)
{
	struct vdo_completion *parent = completion->parent;
	struct vdo *vdo = completion->vdo;
	struct repair_completion *repair = as_repair_completion(completion);

	vdo_assert_on_admin_thread(vdo, __func__);

	if (vdo->load_state != VDO_REBUILD_FOR_UPGRADE)
		vdo->states.vdo.complete_recoveries++;

	vdo_initialize_recovery_journal_post_repair(vdo->recovery_journal,
						    vdo->states.vdo.complete_recoveries,
						    repair->highest_tail,
						    repair->logical_blocks_used,
						    repair->block_map_data_blocks);
	free_repair_completion(vdo_forget(repair));

	if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
		vdo_log_info("Read-only rebuild complete");
		vdo_launch_completion(parent);
		return;
	}

	/* FIXME: shouldn't this say either "recovery" or "repair"? */
	vdo_log_info("Rebuild complete");

	/*
	 * Now that we've freed the repair completion and its vast array of journal entries, we
	 * can allocate refcounts.
	 */
	vdo_continue_completion(parent, vdo_allocate_reference_counters(vdo->depot));
}
/**
 * abort_repair() - Handle a repair error.
 * @completion: The repair completion.
 */
static void abort_repair(struct vdo_completion *completion)
{
	struct vdo_completion *parent = completion->parent;
	int result = completion->result;
	struct repair_completion *repair = as_repair_completion(completion);

	if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state))
		vdo_log_info("Read-only rebuild aborted");
	else
		vdo_log_warning("Recovery aborted");

	free_repair_completion(vdo_forget(repair));
	vdo_continue_completion(parent, result);
}
/**
 * abort_on_error() - Abort a repair if there is an error.
 * @result: The result to check.
 * @repair: The repair completion.
 *
 * Return: true if the result was an error.
 */
static bool __must_check abort_on_error(int result, struct repair_completion *repair)
{
	if (result == VDO_SUCCESS)
		return false;

	vdo_fail_completion(&repair->completion, result);
	return true;
}
/**
 * drain_slab_depot() - Flush out all dirty refcounts blocks now that they have been rebuilt or
 *                      recovered.
 * @completion: The repair completion.
 */
static void drain_slab_depot(struct vdo_completion *completion)
{
	struct vdo *vdo = completion->vdo;
	struct repair_completion *repair = as_repair_completion(completion);
	const struct admin_state_code *operation;

	vdo_assert_on_admin_thread(vdo, __func__);

	prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
	if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
		vdo_log_info("Saving rebuilt state");
		operation = VDO_ADMIN_STATE_REBUILDING;
	} else {
		vdo_log_info("Replayed %zu journal entries into slab journals",
			     repair->entries_added_to_slab_journals);
		operation = VDO_ADMIN_STATE_RECOVERING;
	}

	vdo_drain_slab_depot(vdo->depot, operation, completion);
}
/**
 * flush_block_map_updates() - Flush the block map now that all the reference counts are rebuilt.
 * @completion: The repair completion.
 *
 * This callback is registered in finish_if_done().
 */
static void flush_block_map_updates(struct vdo_completion *completion)
{
	vdo_assert_on_admin_thread(completion->vdo, __func__);

	vdo_log_info("Flushing block map changes");
	prepare_repair_completion(as_repair_completion(completion), drain_slab_depot,
				  VDO_ZONE_TYPE_ADMIN);
	vdo_drain_block_map(completion->vdo->block_map, VDO_ADMIN_STATE_RECOVERING,
			    completion);
}
static bool fetch_page(struct repair_completion *repair,
		       struct vdo_completion *completion);
/**
 * handle_page_load_error() - Handle an error loading a page.
 * @completion: The vdo_page_completion.
 */
static void handle_page_load_error(struct vdo_completion *completion)
{
	struct repair_completion *repair = completion->parent;

	repair->outstanding--;
	vdo_set_completion_result(&repair->completion, completion->result);
	vdo_release_page_completion(completion);
	fetch_page(repair, completion);
}
/**
 * unmap_entry() - Unmap an invalid entry and indicate that its page must be written out.
 * @page: The page containing the entries
 * @completion: The page_completion for writing the page
 * @slot: The slot to unmap
 */
static void unmap_entry(struct block_map_page *page, struct vdo_completion *completion,
			slot_number_t slot)
{
	page->entries[slot] = UNMAPPED_BLOCK_MAP_ENTRY;
	vdo_request_page_write(completion);
}
/**
 * remove_out_of_bounds_entries() - Unmap entries which are outside the logical space.
 * @page: The page containing the entries
 * @completion: The page_completion for writing the page
 * @start: The first slot to check
 */
static void remove_out_of_bounds_entries(struct block_map_page *page,
					 struct vdo_completion *completion,
					 slot_number_t start)
{
	slot_number_t slot;

	for (slot = start; slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; slot++) {
		struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]);

		if (vdo_is_mapped_location(&mapping))
			unmap_entry(page, completion, slot);
	}
}
/**
 * process_slot() - Update the reference counts for a single entry.
 * @page: The page containing the entries
 * @completion: The page_completion for writing the page
 * @slot: The slot to check
 *
 * Return: true if the entry was a valid mapping
 */
static bool process_slot(struct block_map_page *page, struct vdo_completion *completion,
			 slot_number_t slot)
{
	struct slab_depot *depot = completion->vdo->depot;
	int result;
	struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]);

	if (!vdo_is_valid_location(&mapping)) {
		/* This entry is invalid, so remove it from the page. */
		unmap_entry(page, completion, slot);
		return false;
	}

	if (!vdo_is_mapped_location(&mapping))
		return false;

	if (mapping.pbn == VDO_ZERO_BLOCK)
		return true;

	if (!vdo_is_physical_data_block(depot, mapping.pbn)) {
		/*
		 * This is a nonsense mapping. Remove it from the map so we're at least
		 * consistent and mark the page dirty.
		 */
		unmap_entry(page, completion, slot);
		return false;
	}

	result = vdo_adjust_reference_count_for_rebuild(depot, mapping.pbn,
							VDO_JOURNAL_DATA_REMAPPING);
	if (result == VDO_SUCCESS)
		return true;

	vdo_log_error_strerror(result,
			       "Could not adjust reference count for PBN %llu, slot %u mapped to PBN %llu",
			       (unsigned long long) vdo_get_block_map_page_pbn(page),
			       slot, (unsigned long long) mapping.pbn);
	unmap_entry(page, completion, slot);
	return false;
}
/**
 * rebuild_reference_counts_from_page() - Rebuild reference counts from a block map page.
 * @repair: The repair completion.
 * @completion: The page completion holding the page.
 */
static void rebuild_reference_counts_from_page(struct repair_completion *repair,
					       struct vdo_completion *completion)
{
	slot_number_t slot, last_slot;
	struct block_map_page *page;
	int result;

	result = vdo_get_cached_page(completion, &page);
	if (result != VDO_SUCCESS) {
		vdo_set_completion_result(&repair->completion, result);
		return;
	}

	if (!page->header.initialized)
		return;

	/* Remove any bogus entries which exist beyond the end of the logical space. */
	if (vdo_get_block_map_page_pbn(page) == repair->last_slot.pbn) {
		last_slot = repair->last_slot.slot;
		remove_out_of_bounds_entries(page, completion, last_slot);
	} else {
		last_slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
	}

	/* Inform the slab depot of all entries on this page. */
	for (slot = 0; slot < last_slot; slot++) {
		if (process_slot(page, completion, slot))
			repair->logical_blocks_used++;
	}
}
/**
 * page_loaded() - Process a page which has just been loaded.
 * @completion: The vdo_page_completion for the fetched page.
 *
 * This callback is registered by fetch_page().
 */
static void page_loaded(struct vdo_completion *completion)
{
	struct repair_completion *repair = completion->parent;

	repair->outstanding--;
	rebuild_reference_counts_from_page(repair, completion);
	vdo_release_page_completion(completion);

	/* Advance progress to the next page, and fetch the next page we haven't yet requested. */
	fetch_page(repair, completion);
}
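
/*
 * get_pbn_to_fetch() - Find the PBN of the next allocated leaf page to read, or VDO_ZERO_BLOCK
 * if there are no more pages to fetch or an error has already been recorded.
 */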
static physical_block_number_t get_pbn_to_fetch(struct repair_completion *repair,
						struct block_map *block_map)
{
	physical_block_number_t pbn = VDO_ZERO_BLOCK;

	if (repair->completion.result != VDO_SUCCESS)
		return VDO_ZERO_BLOCK;

	while ((pbn == VDO_ZERO_BLOCK) && (repair->page_to_fetch < repair->leaf_pages))
		pbn = vdo_find_block_map_page_pbn(block_map, repair->page_to_fetch++);

	if (vdo_is_physical_data_block(repair->completion.vdo->depot, pbn))
		return pbn;

	vdo_set_completion_result(&repair->completion, VDO_BAD_MAPPING);
	return VDO_ZERO_BLOCK;
}
/**
 * fetch_page() - Fetch a page from the block map.
 * @repair: The repair_completion.
 * @completion: The page completion to use.
 *
 * Return: true if the rebuild is complete.
 */
static bool fetch_page(struct repair_completion *repair,
		       struct vdo_completion *completion)
{
	struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;
	struct block_map *block_map = repair->completion.vdo->block_map;
	physical_block_number_t pbn = get_pbn_to_fetch(repair, block_map);

	if (pbn != VDO_ZERO_BLOCK) {
		repair->outstanding++;
		/*
		 * We must set the requeue flag here to ensure that we don't blow the stack if
		 * all the requested pages are already in the cache or get load errors.
		 */
		vdo_get_page(page_completion, &block_map->zones[0], pbn, true, repair,
			     page_loaded, handle_page_load_error, true);
	}

	if (repair->outstanding > 0)
		return false;

	launch_repair_completion(repair, flush_block_map_updates, VDO_ZONE_TYPE_ADMIN);
	return true;
}
/**
 * rebuild_from_leaves() - Rebuild reference counts from the leaf block map pages.
 * @completion: The repair completion.
 *
 * Rebuilds reference counts from the leaf block map pages now that reference counts have been
 * rebuilt from the interior tree pages (which have been loaded in the process). This callback is
 * registered in rebuild_reference_counts().
 */
static void rebuild_from_leaves(struct vdo_completion *completion)
{
	page_count_t i;
	struct repair_completion *repair = as_repair_completion(completion);
	struct block_map *map = completion->vdo->block_map;

	repair->logical_blocks_used = 0;

	/*
	 * The PBN calculation doesn't work until the tree pages have been loaded, so we can't
	 * set this value at the start of repair.
	 */
	repair->leaf_pages = vdo_compute_block_map_page_count(map->entry_count);
	repair->last_slot = (struct block_map_slot) {
		.slot = map->entry_count % VDO_BLOCK_MAP_ENTRIES_PER_PAGE,
		.pbn = vdo_find_block_map_page_pbn(map, repair->leaf_pages - 1),
	};
	if (repair->last_slot.slot == 0)
		repair->last_slot.slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE;

	for (i = 0; i < repair->page_count; i++) {
		if (fetch_page(repair, &repair->page_completions[i].completion)) {
			/*
			 * The rebuild has already moved on, so it isn't safe nor is there a
			 * need to launch any more fetches.
			 */
			break;
		}
	}
}
/**
 * process_entry() - Process a single entry from the block map tree.
 * @pbn: A pbn which holds a block map tree page.
 * @completion: The parent completion of the traversal.
 *
 * Implements vdo_entry_callback_fn.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int process_entry(physical_block_number_t pbn, struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	struct slab_depot *depot = completion->vdo->depot;
	int result;

	if ((pbn == VDO_ZERO_BLOCK) || !vdo_is_physical_data_block(depot, pbn)) {
		return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
					      "PBN %llu out of range",
					      (unsigned long long) pbn);
	}

	result = vdo_adjust_reference_count_for_rebuild(depot, pbn,
							VDO_JOURNAL_BLOCK_MAP_REMAPPING);
	if (result != VDO_SUCCESS) {
		return vdo_log_error_strerror(result,
					      "Could not adjust reference count for block map tree PBN %llu",
					      (unsigned long long) pbn);
	}

	repair->block_map_data_blocks++;
	return VDO_SUCCESS;
}
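
/*
 * rebuild_reference_counts() - Rebuild all reference counts during a read-only rebuild, first
 * from the block map tree pages and then, via rebuild_from_leaves(), from the leaf pages.
 */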
static void rebuild_reference_counts(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	struct vdo *vdo = completion->vdo;
	struct vdo_page_cache *cache = &vdo->block_map->zones[0].page_cache;

	/* We must allocate ref_counts before we can rebuild them. */
	if (abort_on_error(vdo_allocate_reference_counters(vdo->depot), repair))
		return;

	/*
	 * Completion chaining from page cache hits can lead to stack overflow during the
	 * rebuild, so clear out the cache before this rebuild phase.
	 */
	if (abort_on_error(vdo_invalidate_page_cache(cache), repair))
		return;

	prepare_repair_completion(repair, rebuild_from_leaves, VDO_ZONE_TYPE_LOGICAL);
	vdo_traverse_forest(vdo->block_map, process_entry, completion);
}
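
/*
 * increment_recovery_point() - Advance a recovery point to the next entry, moving on to the
 * next sector or journal block when the current one is exhausted.
 */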
static void increment_recovery_point(struct recovery_point *point)
{
	if (++point->entry_count < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR)
		return;

	point->entry_count = 0;
	if (point->sector_count < (VDO_SECTORS_PER_BLOCK - 1)) {
		point->sector_count++;
		return;
	}

	point->sequence_number++;
	point->sector_count = 1;
}
/**
 * advance_points() - Advance the current recovery and journal points.
 * @repair: The repair_completion whose points are to be advanced.
 * @entries_per_block: The number of entries in a recovery journal block.
 */
static void advance_points(struct repair_completion *repair,
			   journal_entry_count_t entries_per_block)
{
	if (!repair->next_recovery_point.increment_applied) {
		repair->next_recovery_point.increment_applied = true;
		return;
	}

	increment_recovery_point(&repair->next_recovery_point);
	vdo_advance_journal_point(&repair->next_journal_point, entries_per_block);
	repair->next_recovery_point.increment_applied = false;
}
/**
 * before_recovery_point() - Check whether the first point precedes the second point.
 * @first: The first recovery point.
 * @second: The second recovery point.
 *
 * Return: true if the first point precedes the second point.
 */
static bool __must_check before_recovery_point(const struct recovery_point *first,
					       const struct recovery_point *second)
{
	if (first->sequence_number < second->sequence_number)
		return true;

	if (first->sequence_number > second->sequence_number)
		return false;

	if (first->sector_count < second->sector_count)
		return true;

	return ((first->sector_count == second->sector_count) &&
		(first->entry_count < second->entry_count));
}
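
/*
 * get_sector() - Get a pointer to the given sector of the given journal block within the
 * in-memory copy of the journal data.
 */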
static struct packed_journal_sector * __must_check get_sector(struct recovery_journal *journal,
							      char *journal_data,
							      sequence_number_t sequence,
							      u8 sector_number)
{
	off_t offset;

	offset = ((vdo_get_recovery_journal_block_number(journal, sequence) * VDO_BLOCK_SIZE) +
		  (VDO_SECTOR_SIZE * sector_number));
	return (struct packed_journal_sector *) (journal_data + offset);
}
/**
 * get_entry() - Unpack the recovery journal entry associated with the given recovery point.
 * @repair: The repair completion.
 * @point: The recovery point.
 *
 * Return: The unpacked contents of the matching recovery journal entry.
 */
static struct recovery_journal_entry get_entry(const struct repair_completion *repair,
					       const struct recovery_point *point)
{
	struct packed_journal_sector *sector;

	sector = get_sector(repair->completion.vdo->recovery_journal,
			    repair->journal_data, point->sequence_number,
			    point->sector_count);
	return vdo_unpack_recovery_journal_entry(&sector->entries[point->entry_count]);
}
/**
 * validate_recovery_journal_entry() - Validate a recovery journal entry.
 * @vdo: The vdo.
 * @entry: The entry to validate.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int validate_recovery_journal_entry(const struct vdo *vdo,
					   const struct recovery_journal_entry *entry)
{
	if ((entry->slot.pbn >= vdo->states.vdo.config.physical_blocks) ||
	    (entry->slot.slot >= VDO_BLOCK_MAP_ENTRIES_PER_PAGE) ||
	    !vdo_is_valid_location(&entry->mapping) ||
	    !vdo_is_valid_location(&entry->unmapping) ||
	    !vdo_is_physical_data_block(vdo->depot, entry->mapping.pbn) ||
	    !vdo_is_physical_data_block(vdo->depot, entry->unmapping.pbn)) {
		return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
					      "Invalid entry: %s (%llu, %u) from %llu to %llu is not within bounds",
					      vdo_get_journal_operation_name(entry->operation),
					      (unsigned long long) entry->slot.pbn,
					      entry->slot.slot,
					      (unsigned long long) entry->unmapping.pbn,
					      (unsigned long long) entry->mapping.pbn);
	}

	if ((entry->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) &&
	    (vdo_is_state_compressed(entry->mapping.state) ||
	     (entry->mapping.pbn == VDO_ZERO_BLOCK) ||
	     (entry->unmapping.state != VDO_MAPPING_STATE_UNMAPPED) ||
	     (entry->unmapping.pbn != VDO_ZERO_BLOCK))) {
		return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
					      "Invalid entry: %s (%llu, %u) from %llu to %llu is not a valid tree mapping",
					      vdo_get_journal_operation_name(entry->operation),
					      (unsigned long long) entry->slot.pbn,
					      entry->slot.slot,
					      (unsigned long long) entry->unmapping.pbn,
					      (unsigned long long) entry->mapping.pbn);
	}

	return VDO_SUCCESS;
}
/**
 * add_slab_journal_entries() - Replay recovery journal entries into the slab journals of the
 *                              allocator currently being recovered.
 * @completion: The allocator completion.
 *
 * Waits for slab journal tailblock space when necessary. This method is its own callback.
 */
static void add_slab_journal_entries(struct vdo_completion *completion)
{
	struct recovery_point *recovery_point;
	struct repair_completion *repair = completion->parent;
	struct vdo *vdo = completion->vdo;
	struct recovery_journal *journal = vdo->recovery_journal;
	struct block_allocator *allocator = vdo_as_block_allocator(completion);

	/* Get ready in case we need to enqueue again. */
	vdo_prepare_completion(completion, add_slab_journal_entries,
			       vdo_notify_slab_journals_are_recovered,
			       completion->callback_thread_id, repair);
	for (recovery_point = &repair->next_recovery_point;
	     before_recovery_point(recovery_point, &repair->tail_recovery_point);
	     advance_points(repair, journal->entries_per_block)) {
		int result;
		physical_block_number_t pbn;
		struct vdo_slab *slab;
		struct recovery_journal_entry entry = get_entry(repair, recovery_point);
		bool increment = !repair->next_recovery_point.increment_applied;

		result = validate_recovery_journal_entry(vdo, &entry);
		if (result != VDO_SUCCESS) {
			vdo_enter_read_only_mode(vdo, result);
			vdo_fail_completion(completion, result);
			return;
		}

		if (increment)
			pbn = entry.mapping.pbn;
		else
			pbn = entry.unmapping.pbn;

		if (pbn == VDO_ZERO_BLOCK)
			continue;

		slab = vdo_get_slab(vdo->depot, pbn);
		if (slab->allocator != allocator)
			continue;

		if (!vdo_attempt_replay_into_slab(slab, pbn, entry.operation, increment,
						  &repair->next_journal_point,
						  completion))
			return;

		repair->entries_added_to_slab_journals++;
	}

	vdo_notify_slab_journals_are_recovered(completion);
}
/**
 * vdo_replay_into_slab_journals() - Replay recovery journal entries in the slab journals of
 *                                   slabs owned by a given block_allocator.
 * @allocator: The allocator whose slab journals are to be recovered.
 * @context: The slab depot load context supplied by a recovery when it loads the depot.
 */
void vdo_replay_into_slab_journals(struct block_allocator *allocator, void *context)
{
	struct vdo_completion *completion = &allocator->completion;
	struct repair_completion *repair = context;
	struct vdo *vdo = completion->vdo;

	vdo_assert_on_physical_zone_thread(vdo, allocator->zone_number, __func__);
	if (repair->entry_count == 0) {
		/* there's nothing to replay */
		repair->logical_blocks_used = vdo->recovery_journal->logical_blocks_used;
		repair->block_map_data_blocks = vdo->recovery_journal->block_map_data_blocks;
		vdo_notify_slab_journals_are_recovered(completion);
		return;
	}

	repair->next_recovery_point = (struct recovery_point) {
		.sequence_number = repair->slab_journal_head,
		.sector_count = 1,
		.entry_count = 0,
	};

	repair->next_journal_point = (struct journal_point) {
		.sequence_number = repair->slab_journal_head,
		.entry_count = 0,
	};

	vdo_log_info("Replaying entries into slab journals for zone %u",
		     allocator->zone_number);
	completion->parent = repair;
	add_slab_journal_entries(completion);
}
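
/*
 * load_slab_depot() - Load the slab depot for the appropriate operation (rebuild or recovery)
 * and register the next phase of the repair as the callback.
 */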
static void load_slab_depot(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	const struct admin_state_code *operation;

	vdo_assert_on_admin_thread(completion->vdo, __func__);

	if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state)) {
		prepare_repair_completion(repair, rebuild_reference_counts,
					  VDO_ZONE_TYPE_LOGICAL);
		operation = VDO_ADMIN_STATE_LOADING_FOR_REBUILD;
	} else {
		prepare_repair_completion(repair, drain_slab_depot, VDO_ZONE_TYPE_ADMIN);
		operation = VDO_ADMIN_STATE_LOADING_FOR_RECOVERY;
	}

	vdo_load_slab_depot(completion->vdo->depot, operation, completion, repair);
}
static void flush_block_map(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	const struct admin_state_code *operation;

	vdo_assert_on_admin_thread(completion->vdo, __func__);

	vdo_log_info("Flushing block map changes");
	prepare_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
	operation = (vdo_state_requires_read_only_rebuild(completion->vdo->load_state) ?
		     VDO_ADMIN_STATE_REBUILDING :
		     VDO_ADMIN_STATE_RECOVERING);
	vdo_drain_block_map(completion->vdo->block_map, operation, completion);
}
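
/*
 * finish_if_done() - Check whether the block map replay is done, either because an error has
 * occurred or because all entries have been applied, and launch the next phase if so.
 *
 * Return: true if the repair completion has been launched and no more replay work remains.
 */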
static bool finish_if_done(struct repair_completion *repair)
{
	/* Pages are still being launched or there is still work to do */
	if (repair->launching || (repair->outstanding > 0))
		return false;

	if (repair->completion.result != VDO_SUCCESS) {
		page_count_t i;

		for (i = 0; i < repair->page_count; i++) {
			struct vdo_page_completion *page_completion =
				&repair->page_completions[i];

			if (page_completion->ready)
				vdo_release_page_completion(&page_completion->completion);
		}

		vdo_launch_completion(&repair->completion);
		return true;
	}

	if (repair->current_entry >= repair->entries)
		return false;

	launch_repair_completion(repair, flush_block_map, VDO_ZONE_TYPE_ADMIN);
	return true;
}
static void abort_block_map_recovery(struct repair_completion *repair, int result)
{
	vdo_set_completion_result(&repair->completion, result);
	finish_if_done(repair);
}
/**
 * find_entry_starting_next_page() - Find the first journal entry after a given entry which is
 *                                   not on the same block map page.
 * @repair: The repair completion.
 * @current_entry: The entry to search from.
 * @needs_sort: Whether sorting is needed to proceed.
 *
 * Return: Pointer to the first later journal entry on a different block map page, or a pointer
 *         to just before the journal entries if no subsequent entry is on a different block map
 *         page.
 */
static struct numbered_block_mapping *
find_entry_starting_next_page(struct repair_completion *repair,
			      struct numbered_block_mapping *current_entry, bool needs_sort)
{
	size_t current_page;

	/* If current_entry is invalid, return immediately. */
	if (current_entry < repair->entries)
		return current_entry;

	current_page = current_entry->block_map_slot.pbn;

	/* Decrement current_entry until it's out of bounds or on a different page. */
	while ((current_entry >= repair->entries) &&
	       (current_entry->block_map_slot.pbn == current_page)) {
		if (needs_sort) {
			struct numbered_block_mapping *just_sorted_entry =
				sort_next_heap_element(repair);

			VDO_ASSERT_LOG_ONLY(just_sorted_entry < current_entry,
					    "heap is returning elements in an unexpected order");
		}

		current_entry--;
	}

	return current_entry;
}
/*
 * Apply a range of journal entries [starting_entry, ending_entry) to a block map page.
 */
static void apply_journal_entries_to_page(struct block_map_page *page,
					  struct numbered_block_mapping *starting_entry,
					  struct numbered_block_mapping *ending_entry)
{
	struct numbered_block_mapping *current_entry = starting_entry;

	while (current_entry != ending_entry) {
		page->entries[current_entry->block_map_slot.slot] = current_entry->block_map_entry;
		current_entry--;
	}
}
static void recover_ready_pages(struct repair_completion *repair,
				struct vdo_completion *completion);
static void block_map_page_loaded(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion->parent);

	repair->outstanding--;
	if (!repair->launching)
		recover_ready_pages(repair, completion);
}
static void handle_block_map_page_load_error(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion->parent);

	repair->outstanding--;
	abort_block_map_recovery(repair, completion->result);
}
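
/*
 * fetch_block_map_page() - Request the block map page containing the next unfetched journal
 * entry, and advance current_unfetched_entry past all entries destined for that page.
 */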
static void fetch_block_map_page(struct repair_completion *repair,
				 struct vdo_completion *completion)
{
	physical_block_number_t pbn;

	if (repair->current_unfetched_entry < repair->entries)
		/* Nothing left to fetch. */
		return;

	/* Fetch the next page we haven't yet requested. */
	pbn = repair->current_unfetched_entry->block_map_slot.pbn;
	repair->current_unfetched_entry =
		find_entry_starting_next_page(repair, repair->current_unfetched_entry,
					      true);
	repair->outstanding++;
	vdo_get_page(((struct vdo_page_completion *) completion),
		     &repair->completion.vdo->block_map->zones[0], pbn, true,
		     &repair->completion, block_map_page_loaded,
		     handle_block_map_page_load_error, false);
}
static struct vdo_page_completion *get_next_page_completion(struct repair_completion *repair,
							     struct vdo_page_completion *completion)
{
	completion++;
	if (completion == (&repair->page_completions[repair->page_count]))
		completion = &repair->page_completions[0];

	return completion;
}
static void recover_ready_pages(struct repair_completion *repair,
				struct vdo_completion *completion)
{
	struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;

	if (finish_if_done(repair))
		return;

	if (repair->pbn != page_completion->pbn)
		return;

	while (page_completion->ready) {
		struct numbered_block_mapping *start_of_next_page;
		struct block_map_page *page;
		int result;

		result = vdo_get_cached_page(completion, &page);
		if (result != VDO_SUCCESS) {
			abort_block_map_recovery(repair, result);
			return;
		}

		start_of_next_page =
			find_entry_starting_next_page(repair, repair->current_entry,
						      false);
		apply_journal_entries_to_page(page, repair->current_entry,
					      start_of_next_page);
		repair->current_entry = start_of_next_page;
		vdo_request_page_write(completion);
		vdo_release_page_completion(completion);

		if (finish_if_done(repair))
			return;

		repair->pbn = repair->current_entry->block_map_slot.pbn;
		fetch_block_map_page(repair, completion);
		page_completion = get_next_page_completion(repair, page_completion);
		completion = &page_completion->completion;
	}
}
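
/*
 * recover_block_map() - Replay the extracted journal entries into the block map, using the min
 * heap to process them grouped by block map page while preserving journal order within a slot.
 */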
static void recover_block_map(struct vdo_completion *completion)
{
	struct repair_completion *repair = as_repair_completion(completion);
	struct vdo *vdo = completion->vdo;
	struct numbered_block_mapping *first_sorted_entry;
	page_count_t i;

	vdo_assert_on_logical_zone_thread(vdo, 0, __func__);

	/* Suppress block map errors. */
	vdo->block_map->zones[0].page_cache.rebuilding =
		vdo_state_requires_read_only_rebuild(vdo->load_state);

	if (repair->block_map_entry_count == 0) {
		vdo_log_info("Replaying 0 recovery entries into block map");
		vdo_free(vdo_forget(repair->journal_data));
		launch_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
		return;
	}

	/*
	 * Organize the journal entries into a binary heap so we can iterate over them in sorted
	 * order incrementally, avoiding an expensive sort call.
	 */
	repair->replay_heap = (struct replay_heap) {
		.data = repair->entries,
		.nr = repair->block_map_entry_count,
		.size = repair->block_map_entry_count,
	};
	min_heapify_all(&repair->replay_heap, &repair_min_heap, NULL);

	vdo_log_info("Replaying %zu recovery entries into block map",
		     repair->block_map_entry_count);

	repair->current_entry = &repair->entries[repair->block_map_entry_count - 1];
	first_sorted_entry = sort_next_heap_element(repair);
	VDO_ASSERT_LOG_ONLY(first_sorted_entry == repair->current_entry,
			    "heap is returning elements in an unexpected order");

	/* Prevent any page from being processed until all pages have been launched. */
	repair->launching = true;
	repair->pbn = repair->current_entry->block_map_slot.pbn;
	repair->current_unfetched_entry = repair->current_entry;
	for (i = 0; i < repair->page_count; i++) {
		if (repair->current_unfetched_entry < repair->entries)
			break;

		fetch_block_map_page(repair, &repair->page_completions[i].completion);
	}
	repair->launching = false;

	/* Process any ready pages. */
	recover_ready_pages(repair, &repair->page_completions[0].completion);
}
/**
 * get_recovery_journal_block_header() - Get the block header for a block at a position in the
 *                                       journal data and unpack it.
 * @journal: The recovery journal.
 * @data: The recovery journal data.
 * @sequence: The sequence number.
 *
 * Return: The unpacked header.
 */
static struct recovery_block_header __must_check
get_recovery_journal_block_header(struct recovery_journal *journal, char *data,
				  sequence_number_t sequence)
{
	physical_block_number_t pbn =
		vdo_get_recovery_journal_block_number(journal, sequence);
	char *header = &data[pbn * VDO_BLOCK_SIZE];

	return vdo_unpack_recovery_block_header((struct packed_journal_header *) header);
}
/**
 * is_valid_recovery_journal_block() - Determine whether the given header describes a valid
 *                                     block for the given journal.
 * @journal: The journal to use.
 * @header: The unpacked block header to check.
 * @old_ok: Whether an old format header is valid.
 *
 * A block is not valid if it is unformatted, or if it is older than the last successful
 * recovery or reformat.
 *
 * Return: True if the header is valid.
 */
static bool __must_check is_valid_recovery_journal_block(const struct recovery_journal *journal,
							 const struct recovery_block_header *header,
							 bool old_ok)
{
	if ((header->nonce != journal->nonce) ||
	    (header->recovery_count != journal->recovery_count))
		return false;

	if (header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL_2)
		return (header->entry_count <= journal->entries_per_block);

	return (old_ok &&
		(header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL) &&
		(header->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK));
}
/**
 * is_exact_recovery_journal_block() - Determine whether the given header describes the exact
 *                                     block indicated.
 * @journal: The journal to use.
 * @header: The unpacked block header to check.
 * @sequence: The expected sequence number.
 *
 * Return: True if the block matches.
 */
static bool __must_check is_exact_recovery_journal_block(const struct recovery_journal *journal,
							  const struct recovery_block_header *header,
							  sequence_number_t sequence)
{
	return ((header->sequence_number == sequence) &&
		(is_valid_recovery_journal_block(journal, header, true)));
}
/**
 * find_recovery_journal_head_and_tail() - Find the tail and head of the journal.
 * @repair: The repair completion.
 *
 * Return: True if there were valid journal blocks.
 */
static bool find_recovery_journal_head_and_tail(struct repair_completion *repair)
{
	struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
	bool found_entries = false;
	physical_block_number_t i;

	/*
	 * Ensure that we don't replay old entries since we know the tail recorded in the super
	 * block must be a lower bound. Not doing so can result in extra data loss by setting
	 * the tail too early.
	 */
	repair->highest_tail = journal->tail;
	for (i = 0; i < journal->size; i++) {
		struct recovery_block_header header =
			get_recovery_journal_block_header(journal, repair->journal_data, i);

		if (!is_valid_recovery_journal_block(journal, &header, true)) {
			/* This block is old or incorrectly formatted */
			continue;
		}

		if (vdo_get_recovery_journal_block_number(journal, header.sequence_number) != i) {
			/* This block is in the wrong location */
			continue;
		}

		if (header.sequence_number >= repair->highest_tail) {
			found_entries = true;
			repair->highest_tail = header.sequence_number;
		}

		if (!found_entries)
			continue;

		if (header.block_map_head > repair->block_map_head)
			repair->block_map_head = header.block_map_head;

		if (header.slab_journal_head > repair->slab_journal_head)
			repair->slab_journal_head = header.slab_journal_head;
	}

	return found_entries;
}
/**
 * unpack_entry() - Unpack a recovery journal entry in either format.
 * @vdo: The vdo.
 * @packed: The entry to unpack.
 * @format: The expected format of the entry.
 * @entry: The unpacked entry.
 *
 * Return: true if the entry should be applied.
 */
static bool unpack_entry(struct vdo *vdo, char *packed, enum vdo_metadata_type format,
			 struct recovery_journal_entry *entry)
{
	if (format == VDO_METADATA_RECOVERY_JOURNAL_2) {
		struct packed_recovery_journal_entry *packed_entry =
			(struct packed_recovery_journal_entry *) packed;

		*entry = vdo_unpack_recovery_journal_entry(packed_entry);
	} else {
		physical_block_number_t low32, high4;

		struct packed_recovery_journal_entry_1 *packed_entry =
			(struct packed_recovery_journal_entry_1 *) packed;

		if (packed_entry->operation == VDO_JOURNAL_DATA_INCREMENT)
			entry->operation = VDO_JOURNAL_DATA_REMAPPING;
		else if (packed_entry->operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT)
			entry->operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING;
		else
			return false;

		low32 = __le32_to_cpu(packed_entry->pbn_low_word);
		high4 = packed_entry->pbn_high_nibble;
		entry->slot = (struct block_map_slot) {
			.pbn = ((high4 << 32) | low32),
			.slot = (packed_entry->slot_low | (packed_entry->slot_high << 6)),
		};
		entry->mapping = vdo_unpack_block_map_entry(&packed_entry->block_map_entry);
		entry->unmapping = (struct data_location) {
			.pbn = VDO_ZERO_BLOCK,
			.state = VDO_MAPPING_STATE_UNMAPPED,
		};
	}

	return (validate_recovery_journal_entry(vdo, entry) == VDO_SUCCESS);
}
/**
 * append_sector_entries() - Append an array of recovery journal entries from a journal block
 *                           sector to the array of numbered mappings in the repair completion,
 *                           numbering each entry in the order they are appended.
 * @repair: The repair completion.
 * @entries: The entries in the sector.
 * @format: The format of the sector.
 * @entry_count: The number of entries to append.
 */
static void append_sector_entries(struct repair_completion *repair, char *entries,
				  enum vdo_metadata_type format,
				  journal_entry_count_t entry_count)
{
	journal_entry_count_t i;
	struct vdo *vdo = repair->completion.vdo;
	off_t increment = ((format == VDO_METADATA_RECOVERY_JOURNAL_2)
			   ? sizeof(struct packed_recovery_journal_entry)
			   : sizeof(struct packed_recovery_journal_entry_1));

	for (i = 0; i < entry_count; i++, entries += increment) {
		struct recovery_journal_entry entry;

		if (!unpack_entry(vdo, entries, format, &entry))
			/* When recovering from read-only mode, ignore damaged entries. */
			continue;

		repair->entries[repair->block_map_entry_count] =
			(struct numbered_block_mapping) {
			.block_map_slot = entry.slot,
			.block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn,
								    entry.mapping.state),
			.number = repair->block_map_entry_count,
		};
		repair->block_map_entry_count++;
	}
}
static journal_entry_count_t entries_per_sector(enum vdo_metadata_type format,
						u8 sector_number)
{
	if (format == VDO_METADATA_RECOVERY_JOURNAL_2)
		return RECOVERY_JOURNAL_ENTRIES_PER_SECTOR;

	return ((sector_number == (VDO_SECTORS_PER_BLOCK - 1))
		? RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR
		: RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR);
}
static void extract_entries_from_block(struct repair_completion *repair,
				       struct recovery_journal *journal,
				       sequence_number_t sequence,
				       enum vdo_metadata_type format,
				       journal_entry_count_t entries)
{
	u8 i;
	struct recovery_block_header header =
		get_recovery_journal_block_header(journal, repair->journal_data,
						  sequence);

	if (!is_exact_recovery_journal_block(journal, &header, sequence) ||
	    (header.metadata_type != format)) {
		/* This block is invalid, so skip it. */
		return;
	}

	entries = min(entries, header.entry_count);
	for (i = 1; i < VDO_SECTORS_PER_BLOCK; i++) {
		struct packed_journal_sector *sector =
			get_sector(journal, repair->journal_data, sequence, i);
		journal_entry_count_t sector_entries =
			min(entries, entries_per_sector(format, i));

		if (vdo_is_valid_recovery_journal_sector(&header, sector, i)) {
			/* Only extract as many as the block header calls for. */
			append_sector_entries(repair, (char *) sector->entries, format,
					      min_t(journal_entry_count_t,
						    sector->entry_count,
						    sector_entries));
		}

		/*
		 * Even if the sector wasn't full, count it as full when counting up to the
		 * entry count the block header claims.
		 */
		entries -= sector_entries;
	}
}
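
/*
 * parse_journal_for_rebuild() - For a read-only rebuild, transcribe the entries from every
 * valid journal block between the block map head and the highest tail into the numbered
 * mapping array.
 */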
static int parse_journal_for_rebuild(struct repair_completion *repair)
{
	int result;
	sequence_number_t i;
	block_count_t count;
	enum vdo_metadata_type format;
	struct vdo *vdo = repair->completion.vdo;
	struct recovery_journal *journal = vdo->recovery_journal;
	journal_entry_count_t entries_per_block = journal->entries_per_block;

	format = get_recovery_journal_block_header(journal, repair->journal_data,
						   repair->highest_tail).metadata_type;
	if (format == VDO_METADATA_RECOVERY_JOURNAL)
		entries_per_block = RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK;

	/*
	 * Allocate an array of numbered_block_mapping structures large enough to transcribe
	 * every packed_recovery_journal_entry from every valid journal block.
	 */
	count = ((repair->highest_tail - repair->block_map_head + 1) * entries_per_block);
	result = vdo_allocate(count, struct numbered_block_mapping, __func__,
			      &repair->entries);
	if (result != VDO_SUCCESS)
		return result;

	for (i = repair->block_map_head; i <= repair->highest_tail; i++)
		extract_entries_from_block(repair, journal, i, format, entries_per_block);

	return VDO_SUCCESS;
}
static int validate_heads(struct repair_completion *repair)
{
	/* Both reap heads must be behind the tail. */
	if ((repair->block_map_head <= repair->tail) &&
	    (repair->slab_journal_head <= repair->tail))
		return VDO_SUCCESS;

	return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
				      "Journal tail too early. block map head: %llu, slab journal head: %llu, tail: %llu",
				      (unsigned long long) repair->block_map_head,
				      (unsigned long long) repair->slab_journal_head,
				      (unsigned long long) repair->tail);
}
/**
 * extract_new_mappings() - Find all valid new mappings to be applied to the block map.
 * @repair: The repair completion.
 *
 * The mappings are extracted from the journal and stored in a sortable array so that all of
 * the mappings to be applied to a given block map page can be done in a single page fetch.
 */
static int extract_new_mappings(struct repair_completion *repair)
{
	int result;
	struct vdo *vdo = repair->completion.vdo;
	struct recovery_point recovery_point = {
		.sequence_number = repair->block_map_head,
		.sector_count = 1,
		.entry_count = 0,
	};

	/*
	 * Allocate an array of numbered_block_mapping structs just large enough to transcribe
	 * every packed_recovery_journal_entry from every valid journal block.
	 */
	result = vdo_allocate(repair->entry_count, struct numbered_block_mapping,
			      __func__, &repair->entries);
	if (result != VDO_SUCCESS)
		return result;

	for (; before_recovery_point(&recovery_point, &repair->tail_recovery_point);
	     increment_recovery_point(&recovery_point)) {
		struct recovery_journal_entry entry = get_entry(repair, &recovery_point);

		result = validate_recovery_journal_entry(vdo, &entry);
		if (result != VDO_SUCCESS) {
			vdo_enter_read_only_mode(vdo, result);
			return result;
		}

		repair->entries[repair->block_map_entry_count] =
			(struct numbered_block_mapping) {
			.block_map_slot = entry.slot,
			.block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn,
								    entry.mapping.state),
			.number = repair->block_map_entry_count,
		};
		repair->block_map_entry_count++;
	}

	result = VDO_ASSERT((repair->block_map_entry_count <= repair->entry_count),
			    "approximate entry count is an upper bound");
	if (result != VDO_SUCCESS)
		vdo_enter_read_only_mode(vdo, result);

	return result;
}
/**
 * compute_usages() - Compute the lbns in use and block map data blocks counts from the tail of
 *                    the journal.
 * @repair: The repair completion.
 */
static noinline int compute_usages(struct repair_completion *repair)
{
	/*
	 * This function is declared noinline to avoid a spurious valgrind error regarding the
	 * following structure being uninitialized.
	 */
	struct recovery_point recovery_point = {
		.sequence_number = repair->tail,
		.sector_count = 1,
		.entry_count = 0,
	};
	struct vdo *vdo = repair->completion.vdo;
	struct recovery_journal *journal = vdo->recovery_journal;
	struct recovery_block_header header =
		get_recovery_journal_block_header(journal, repair->journal_data,
						  repair->tail);

	repair->logical_blocks_used = header.logical_blocks_used;
	repair->block_map_data_blocks = header.block_map_data_blocks;

	for (; before_recovery_point(&recovery_point, &repair->tail_recovery_point);
	     increment_recovery_point(&recovery_point)) {
		int result;
		struct recovery_journal_entry entry = get_entry(repair, &recovery_point);

		result = validate_recovery_journal_entry(vdo, &entry);
		if (result != VDO_SUCCESS) {
			vdo_enter_read_only_mode(vdo, result);
			return result;
		}

		if (entry.operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
			repair->block_map_data_blocks++;
			continue;
		}

		if (vdo_is_mapped_location(&entry.mapping))
			repair->logical_blocks_used++;

		if (vdo_is_mapped_location(&entry.unmapping))
			repair->logical_blocks_used--;
	}

	return VDO_SUCCESS;
}
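
/*
 * parse_journal_for_recovery() - For a normal recovery, walk the journal from the oldest head
 * to the first torn or missing block to establish the usable tail, then extract the mappings
 * and compute the logical block and block map usage counts.
 */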
static int parse_journal_for_recovery(struct repair_completion *repair)
{
	int result;
	sequence_number_t i, head;
	bool found_entries = false;
	struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
	struct recovery_block_header header;
	enum vdo_metadata_type expected_format;

	head = min(repair->block_map_head, repair->slab_journal_head);
	header = get_recovery_journal_block_header(journal, repair->journal_data, head);
	expected_format = header.metadata_type;
	for (i = head; i <= repair->highest_tail; i++) {
		journal_entry_count_t block_entries;
		u8 j;

		repair->tail = i;
		repair->tail_recovery_point = (struct recovery_point) {
			.sequence_number = i,
			.sector_count = 0,
			.entry_count = 0,
		};

		header = get_recovery_journal_block_header(journal, repair->journal_data, i);
		if (!is_exact_recovery_journal_block(journal, &header, i)) {
			/* A bad block header was found so this must be the end of the journal. */
			break;
		} else if (header.metadata_type != expected_format) {
			/* There is a mix of old and new format blocks, so we need to rebuild. */
			vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
					       "Recovery journal is in an invalid format, a read-only rebuild is required.");
			vdo_enter_read_only_mode(repair->completion.vdo, VDO_CORRUPT_JOURNAL);
			return VDO_CORRUPT_JOURNAL;
		}

		block_entries = header.entry_count;

		/* Examine each sector in turn to determine the last valid sector. */
		for (j = 1; j < VDO_SECTORS_PER_BLOCK; j++) {
			struct packed_journal_sector *sector =
				get_sector(journal, repair->journal_data, i, j);
			journal_entry_count_t sector_entries =
				min_t(journal_entry_count_t, sector->entry_count,
				      block_entries);

			/* A bad sector means that this block was torn. */
			if (!vdo_is_valid_recovery_journal_sector(&header, sector, j))
				break;

			if (sector_entries > 0) {
				found_entries = true;
				repair->tail_recovery_point.sector_count++;
				repair->tail_recovery_point.entry_count = sector_entries;
				block_entries -= sector_entries;
				repair->entry_count += sector_entries;
			}

			/* If this sector is short, the later sectors can't matter. */
			if ((sector_entries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) ||
			    (block_entries == 0))
				break;
		}

		/* If this block was not filled, or if it tore, no later block can matter. */
		if ((header.entry_count != journal->entries_per_block) || (block_entries > 0))
			break;
	}

	if (!found_entries) {
		return validate_heads(repair);
	} else if (expected_format == VDO_METADATA_RECOVERY_JOURNAL) {
		/* All journal blocks have the old format, so we need to upgrade. */
		vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION,
				       "Recovery journal is in the old format. Downgrade and complete recovery, then upgrade with a clean volume");
		return VDO_UNSUPPORTED_VERSION;
	}

	/* Set the tail to the last valid tail block, if there is one. */
	if (repair->tail_recovery_point.sector_count == 0)
		repair->tail--;

	result = validate_heads(repair);
	if (result != VDO_SUCCESS)
		return result;

	vdo_log_info("Highest-numbered recovery journal block has sequence number %llu, and the highest-numbered usable block is %llu",
		     (unsigned long long) repair->highest_tail,
		     (unsigned long long) repair->tail);

	result = extract_new_mappings(repair);
	if (result != VDO_SUCCESS)
		return result;

	return compute_usages(repair);
}
static int parse_journal(struct repair_completion *repair)
{
	if (!find_recovery_journal_head_and_tail(repair))
		return VDO_SUCCESS;

	return (vdo_state_requires_read_only_rebuild(repair->completion.vdo->load_state) ?
		parse_journal_for_rebuild(repair) :
		parse_journal_for_recovery(repair));
}
static void finish_journal_load(struct vdo_completion *completion)
{
	struct repair_completion *repair = completion->parent;

	if (++repair->vios_complete != repair->vio_count)
		return;

	vdo_log_info("Finished reading recovery journal");
	uninitialize_vios(repair);
	prepare_repair_completion(repair, recover_block_map, VDO_ZONE_TYPE_LOGICAL);
	vdo_continue_completion(&repair->completion, parse_journal(repair));
}
static void handle_journal_load_error(struct vdo_completion *completion)
{
	struct repair_completion *repair = completion->parent;

	/* Preserve the error */
	vdo_set_completion_result(&repair->completion, completion->result);
	vio_record_metadata_io_error(as_vio(completion));
	completion->callback(completion);
}
static void read_journal_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct vdo *vdo = vio->completion.vdo;

	continue_vio_after_io(vio, finish_journal_load, vdo->thread_config.admin_thread);
}
/**
 * vdo_repair() - Load the recovery journal and then recover or rebuild a vdo.
 * @parent: The completion to notify when the operation is complete
 */
void vdo_repair(struct vdo_completion *parent)
{
	int result;
	char *ptr;
	struct repair_completion *repair;
	struct vdo *vdo = parent->vdo;
	struct recovery_journal *journal = vdo->recovery_journal;
	physical_block_number_t pbn = journal->origin;
	block_count_t remaining = journal->size;
	block_count_t vio_count = DIV_ROUND_UP(remaining, MAX_BLOCKS_PER_VIO);
	page_count_t page_count = min_t(page_count_t,
					vdo->device_config->cache_size >> 1,
					MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS);

	vdo_assert_on_admin_thread(vdo, __func__);

	if (vdo->load_state == VDO_FORCE_REBUILD) {
		vdo_log_warning("Rebuilding reference counts to clear read-only mode");
		vdo->states.vdo.read_only_recoveries++;
	} else if (vdo->load_state == VDO_REBUILD_FOR_UPGRADE) {
		vdo_log_warning("Rebuilding reference counts for upgrade");
	} else {
		vdo_log_warning("Device was dirty, rebuilding reference counts");
	}

	result = vdo_allocate_extended(struct repair_completion, page_count,
				       struct vdo_page_completion, __func__,
				       &repair);
	if (result != VDO_SUCCESS) {
		vdo_fail_completion(parent, result);
		return;
	}

	vdo_initialize_completion(&repair->completion, vdo, VDO_REPAIR_COMPLETION);
	repair->completion.error_handler = abort_repair;
	repair->completion.parent = parent;
	prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
	repair->page_count = page_count;

	result = vdo_allocate(remaining * VDO_BLOCK_SIZE, char, __func__,
			      &repair->journal_data);
	if (abort_on_error(result, repair))
		return;

	result = vdo_allocate(vio_count, struct vio, __func__, &repair->vios);
	if (abort_on_error(result, repair))
		return;

	ptr = repair->journal_data;
	for (repair->vio_count = 0; repair->vio_count < vio_count; repair->vio_count++) {
		block_count_t blocks = min_t(block_count_t, remaining,
					     MAX_BLOCKS_PER_VIO);

		result = allocate_vio_components(vdo, VIO_TYPE_RECOVERY_JOURNAL,
						 VIO_PRIORITY_METADATA,
						 repair, blocks, ptr,
						 &repair->vios[repair->vio_count]);
		if (abort_on_error(result, repair))
			return;

		ptr += (blocks * VDO_BLOCK_SIZE);
		remaining -= blocks;
	}

	for (vio_count = 0; vio_count < repair->vio_count;
	     vio_count++, pbn += MAX_BLOCKS_PER_VIO) {
		vdo_submit_metadata_vio(&repair->vios[vio_count], pbn, read_journal_endio,
					handle_journal_load_error, REQ_OP_READ);
	}
}