drivers/md/dm-vdo/recovery-journal.h

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /*
   3  * Copyright 2023 Red Hat
   4  */
   5
   6 #ifndef VDO_RECOVERY_JOURNAL_H
   7 #define VDO_RECOVERY_JOURNAL_H
   8
   9 #include <linux/list.h>
  10
  11 #include "numeric.h"
  12
  13 #include "admin-state.h"
  14 #include "constants.h"
  15 #include "encodings.h"
  16 #include "flush.h"
  17 #include "statistics.h"
  18 #include "types.h"
  19 #include "wait-queue.h"
  20
  21 /**
  22  * DOC: recovery journal.
  23  *
  24  * The recovery_journal provides a log of all block mapping and reference count changes which have
  25  * not yet been stably written to the block map or slab journals. This log helps to reduce
  26  * write amplification by providing amortization of slab journal and block map page
  27  * updates.
  28  *
  29  * The recovery journal has a single dedicated queue and thread for performing all journal updates.
  30  * The concurrency guarantees of this single-threaded model allow the code to omit more
  31  * fine-grained locking for recovery journal structures.
  32  *
  33  * The journal consists of a set of on-disk blocks arranged as a circular log with monotonically
  34  * increasing sequence numbers. Three sequence numbers serve to define the active extent of the
  35  * journal. The 'head' is the oldest active block in the journal. The 'tail' is the end of the
  36  * half-open interval containing the active blocks. 'active' is the number of the block actively
  37  * receiving entries. In an empty journal, head == active == tail. Once any entries are added, tail
  38  * = active + 1, and head may be any value in the interval [tail - size, active].
  39  *
  40  * The journal also contains a set of in-memory blocks which are used to buffer up entries until
  41  * they can be committed. In general the number of in-memory blocks ('tail_buffer_count') will be
  42  * less than the on-disk size. Each in-memory block is also a vdo_completion. Each in-memory block
  43  * has a vio which is used to commit that block to disk. The vio's data is the on-disk
  44  * representation of the journal block. In addition each in-memory block has a buffer which is used
  45  * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are
  46  * kept on two lists. Free blocks live on the 'free_tail_blocks' list. When a block becomes active
  47  * (see below) it is moved to the 'active_tail_blocks' list. When a block is fully committed, it is
  48  * moved back to the 'free_tail_blocks' list.
  49  *
  50  * When entries are added to the journal, they are added to the active in-memory block, as
  51  * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be
  52  * committed, the requesting VIO will be attached to the in-memory block to which the caller's
  53  * entry was added. If the caller does wish to wait, or if the entry filled the active block, an
  54  * attempt will be made to commit that block to disk. If there is already another commit in
  55  * progress, the attempt will be ignored and then automatically retried when the in-progress commit
  56  * completes. If there is no commit in progress, any data_vios waiting on the block are transferred
  57  * to the block's vio which is then written, automatically waking all of the waiters when it
  58  * completes. When the write completes, any entries which accumulated in the block are copied to
  59  * the vio's data buffer.
  60  *
  61  * Finally, the journal maintains a set of counters, one for each on-disk journal block. These
  62  * counters are used as locks to prevent premature reaping of journal blocks. Each time a new
  63  * sequence number is used, the counter for the corresponding block is incremented. The counter is
  64  * subsequently decremented when that block is filled and then committed for the last time. This
  65  * prevents blocks from being reaped while they are still being updated. The counter is also
  66  * incremented once for each entry added to a block, and decremented once each time the block map
  67  * is updated in memory for that request. This prevents blocks from being reaped while their VIOs
  68  * are still active. Finally, each in-memory block map page tracks the oldest journal block that
  69  * contains entries corresponding to uncommitted updates to that block map page. Each time an
  70  * in-memory block map page is updated, it checks if the journal block for the VIO is earlier than
  71  * the one it references, in which case it increments the count on the earlier journal block and
  72  * decrements the count on the later journal block, maintaining a lock on the oldest journal block
  73  * containing entries for that page. When a block map page has been flushed from the cache, the
  74  * counter for the journal block it references is decremented. Whenever the counter for the head
  75  * block goes to 0, the head is advanced until it comes to a block whose counter is not 0 or until
  76  * it reaches the active block. This is the mechanism for reclaiming journal space on disk.
  77  *
  78  * If there is no in-memory space when a VIO attempts to add an entry, the VIO will be attached to
  79  * the 'commit_completion' and will be woken the next time a full block has committed. If there is
  80  * no on-disk space when a VIO attempts to add an entry, the VIO will be attached to the
  81  * 'reap_completion', and will be woken the next time a journal block is reaped.
  82  */
  83
     /*
      * The kinds of zone that can be named when taking or dropping a reference on a journal block:
      * vdo_acquire_recovery_journal_block_reference() and
      * vdo_release_recovery_journal_block_reference() each take one of these plus a zone_id.
      */
  84 enum vdo_zone_type {
             /*
              * NOTE(review): struct lock_counter declares no counter array for admin zones; confirm
              * whether this value is ever passed to the acquire/release functions.
              */
  85         VDO_ZONE_TYPE_ADMIN,
             /* Presumably counted in lock_counter.journal_counters - confirm. */
  86         VDO_ZONE_TYPE_JOURNAL,
             /* Presumably counted in lock_counter.logical_counters - confirm. */
  87         VDO_ZONE_TYPE_LOGICAL,
             /* Presumably counted in lock_counter.physical_counters - confirm. */
  88         VDO_ZONE_TYPE_PHYSICAL,
  89 };
  90
     /*
      * The counters which act as locks on the on-disk journal blocks (see the "set of counters"
      * paragraph of the DOC comment in this file). struct recovery_journal embeds one of these as
      * its 'lock_counter' member.
      *
      * NOTE(review): the lengths of the arrays below, and the index order of the per-zone, per-lock
      * arrays, are decided where they are allocated, which is not visible in this header.
      */
  91 struct lock_counter {
  92         /* The completion for notifying the owner of a lock release */
  93         struct vdo_completion completion;
  94         /* The number of logical zones which may hold locks */
  95         zone_count_t logical_zones;
  96         /* The number of physical zones which may hold locks */
  97         zone_count_t physical_zones;
  98         /* The number of locks */
  99         block_count_t locks;
 100         /* Whether the lock release notification is in flight */
 101         atomic_t state;
 102         /* The number of logical zones which hold each lock */
 103         atomic_t *logical_zone_counts;
 104         /* The number of physical zones which hold each lock */
 105         atomic_t *physical_zone_counts;
 106         /* The per-lock counts for the journal zone (plain u16, not atomic) */
 107         u16 *journal_counters;
 108         /* The per-lock decrement counts for the journal zone (atomic, unlike journal_counters) */
 109         atomic_t *journal_decrement_counts;
 110         /* The per-zone, per-lock reference counts for logical zones */
 111         u16 *logical_counters;
 112         /* The per-zone, per-lock reference counts for physical zones */
 113         u16 *physical_counters;
 114 };
 115
     /*
      * One in-memory journal block, i.e. one element of recovery_journal.blocks[] ("the tail
      * blocks"). As the DOC comment in this file describes, such a block buffers entries until they
      * can be committed, and owns the vio used to write it out. Through list_node it sits on one of
      * the journal's two lists, free_tail_blocks or active_tail_blocks.
      */
 116 struct recovery_journal_block {
 117         /* The doubly linked pointers for the free or active lists */
 118         struct list_head list_node;
 119         /* The waiter for the pending full block list */
 120         struct vdo_waiter write_waiter;
 121         /* The journal to which this block belongs */
 122         struct recovery_journal *journal;
 123         /* A pointer to the current sector in the packed block buffer */
 124         struct packed_journal_sector *sector;
 125         /* The vio for writing this block (embedded, not a pointer) */
 126         struct vio vio;
 127         /* The sequence number for this block */
 128         sequence_number_t sequence_number;
 129         /* The location of this block in the on-disk journal */
 130         physical_block_number_t block_number;
 131         /* Whether this block is being committed */
 132         bool committing;
 133         /* The total number of entries in this block */
 134         journal_entry_count_t entry_count;
 135         /* The total number of uncommitted entries (queued or committing) */
 136         journal_entry_count_t uncommitted_entry_count;
 137         /* The number of new entries in the current commit */
 138         journal_entry_count_t entries_in_commit;
 139         /* The queue of vios which will make entries for the next commit */
 140         struct vdo_wait_queue entry_waiters;
 141         /* The queue of vios waiting for the current commit */
 142         struct vdo_wait_queue commit_waiters;
 143 };
 144
     /*
      * The recovery journal itself: the sequence numbers bounding the active on-disk extent, the
      * in-memory tail blocks and the lists they live on, the queues of waiters, and the lock
      * counters guarding on-disk blocks against premature reaping. The DOC comment in this file
      * describes how these pieces interact.
      *
      * The structure ends in a flexible array member, so a journal and its tail blocks occupy one
      * allocation and 'blocks' must remain the final member.
      */
 145 struct recovery_journal {
 146         /* The thread ID of the journal zone */
 147         thread_id_t thread_id;
 148         /* The slab depot which can hold locks on this journal */
 149         struct slab_depot *depot;
 150         /* The block map which can hold locks on this journal */
 151         struct block_map *block_map;
 152         /* The queue of vios waiting to make entries */
 153         struct vdo_wait_queue entry_waiters;
 154         /* The number of free entries in the journal */
 155         u64 available_space;
 156         /* The number of decrement entries which need to be made */
 157         data_vio_count_t pending_decrement_count;
             /*
              * Whether the journal is adding entries from the increment or decrement waiters queues.
              * NOTE(review): the only waiter queue declared in this structure is entry_waiters;
              * confirm which queues this comment refers to.
              */
 159         bool adding_entries;
 160         /* The administrative state of the journal */
 161         struct admin_state state;
 162         /* Whether a reap is in progress */
 163         bool reaping;
 164         /* The location of the first journal block */
 165         physical_block_number_t origin;
 166         /* The oldest active block in the journal on disk for block map rebuild */
 167         sequence_number_t block_map_head;
 168         /* The oldest active block in the journal on disk for slab journal replay */
 169         sequence_number_t slab_journal_head;
 170         /* The newest block in the journal on disk to which a write has finished */
 171         sequence_number_t last_write_acknowledged;
 172         /* The end of the half-open interval of the active journal */
 173         sequence_number_t tail;
 174         /* The point at which the last entry will have been added */
 175         struct journal_point append_point;
 176         /* The journal point of the vio most recently released from the journal */
 177         struct journal_point commit_point;
 178         /* The nonce of the VDO */
 179         nonce_t nonce;
             /*
              * The number of recoveries completed by the VDO.
              * NOTE(review): stored as u8, but vdo_decode_recovery_journal() and
              * vdo_initialize_recovery_journal_post_repair() take recovery_count as u64; presumably
              * the value is narrowed on assignment - confirm.
              */
 181         u8 recovery_count;
 182         /* The number of entries which fit in a single block */
 183         journal_entry_count_t entries_per_block;
 184         /* Unused in-memory journal blocks */
 185         struct list_head free_tail_blocks;
 186         /* In-memory journal blocks with records */
 187         struct list_head active_tail_blocks;
 188         /* A pointer to the active block (the one we are adding entries to now) */
 189         struct recovery_journal_block *active_block;
 190         /* Journal blocks that need writing */
 191         struct vdo_wait_queue pending_writes;
 192         /* The new block map reap head after reaping */
 193         sequence_number_t block_map_reap_head;
 194         /* The head block number for the block map rebuild range */
 195         block_count_t block_map_head_block_number;
 196         /* The new slab journal reap head after reaping */
 197         sequence_number_t slab_journal_reap_head;
 198         /* The head block number for the slab journal replay range */
 199         block_count_t slab_journal_head_block_number;
 200         /* The data-less vio, usable only for flushing */
 201         struct vio *flush_vio;
             /*
              * The number of blocks in the on-disk journal. The inline helpers in this header pass
              * it to vdo_compute_recovery_journal_block_number() and divide sequence numbers by it;
              * their comments state that it is a power of two.
              */
 203         block_count_t size;
 204         /* The number of logical blocks that are in-use */
 205         block_count_t logical_blocks_used;
 206         /* The number of block map pages that are allocated */
 207         block_count_t block_map_data_blocks;
 208         /* The number of journal blocks written but not yet acknowledged */
 209         block_count_t pending_write_count;
 210         /* The threshold at which slab journal tail blocks will be written out */
 211         block_count_t slab_journal_commit_threshold;
 212         /* Counters for events in the journal that are reported as statistics */
 213         struct recovery_journal_statistics events;
 214         /* The locks for each on-disk block */
 215         struct lock_counter lock_counter;
             /*
              * The tail blocks (flexible array member; must stay last).
              * NOTE(review): presumably holds the 'tail_buffer_count' in-memory blocks mentioned in
              * the DOC comment - confirm against the allocation site.
              */
 217         struct recovery_journal_block blocks[];
 218 };
 219
 220 /**
 221  * vdo_get_recovery_journal_block_number() - Map a sequence number to the journal block which
 222  *                                           holds it.
 223  * @journal: The journal; only its size is consulted.
 224  * @sequence: The sequence number of the block being located.
 225  *
 226  * Return: The block number which corresponds to @sequence in a journal of this size.
 227  */
 228 static inline physical_block_number_t __must_check
 229 vdo_get_recovery_journal_block_number(const struct recovery_journal *journal,
 230                                       sequence_number_t sequence)
 231 {
 232         /*
 233          * The journal size is a power of two, so the block number modulus is simply the
 234          * low-order bits of the sequence number.
 235          */
             const physical_block_number_t block_number =
                     vdo_compute_recovery_journal_block_number(journal->size, sequence);

 236         return block_number;
 237 }
 238
 239 /**
 240  * vdo_compute_recovery_journal_check_byte() - Derive the check byte for a sequence number.
 241  * @journal: The journal; only its size is consulted.
 242  * @sequence: The sequence number to derive the check byte from.
 243  *
 244  * Return: The low seven bits of (@sequence / journal size), with the top bit always set.
 245  */
 246 static inline u8 __must_check
 247 vdo_compute_recovery_journal_check_byte(const struct recovery_journal *journal,
 248                                         sequence_number_t sequence)
 249 {
 250         /* The check byte changes with each trip around the journal. */
             const u8 trip_bits = (sequence / journal->size) & 0x7F;

 251         return (trip_bits | 0x80);
 252 }
 253
     /*
      * The declarations below have no definitions in this header. Each note states what the
      * signature shows; anything inferred from a name is marked for confirmation.
      */

     /*
      * Takes a saved journal state and its context, plus an out-parameter, @journal_ptr.
      * NOTE(review): presumably returns a status code and stores the new journal in *journal_ptr -
      * confirm. @recovery_count is u64 here although struct recovery_journal keeps a u8.
      */
 254 int __must_check vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state,
 255                                              nonce_t nonce, struct vdo *vdo,
 256                                              struct partition *partition,
 257                                              u64 recovery_count,
 258                                              block_count_t journal_size,
 259                                              struct recovery_journal **journal_ptr);
 260
     /* NOTE(review): presumably the counterpart of vdo_decode_recovery_journal() - confirm. */
 261 void vdo_free_recovery_journal(struct recovery_journal *journal);
 262
     /*
      * Every parameter after @journal shares its name with a member of struct recovery_journal.
      * NOTE(review): presumably those members are reset from the arguments - confirm.
      */
 263 void vdo_initialize_recovery_journal_post_repair(struct recovery_journal *journal,
 264                                                  u64 recovery_count,
 265                                                  sequence_number_t tail,
 266                                                  block_count_t logical_blocks_used,
 267                                                  block_count_t block_map_data_blocks);
 268
     /* NOTE(review): presumably reports journal->block_map_data_blocks - confirm. */
 269 block_count_t __must_check
 270 vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal);
 271
     /* NOTE(review): presumably reports journal->thread_id - confirm. */
 272 thread_id_t __must_check vdo_get_recovery_journal_thread_id(struct recovery_journal *journal);
 273
     /*
      * Takes the slab depot and block map, the two structures which struct recovery_journal
      * describes as able to hold locks on the journal.
      */
 274 void vdo_open_recovery_journal(struct recovery_journal *journal,
 275                                struct slab_depot *depot, struct block_map *block_map);
 276
     /*
      * Unlike the other value-returning getters declared here, this one is not marked
      * __must_check.
      */
 277 sequence_number_t
 278 vdo_get_recovery_journal_current_sequence_number(struct recovery_journal *journal);
 279
     /* Computed from a journal size alone; no journal object is required. */
 280 block_count_t __must_check vdo_get_recovery_journal_length(block_count_t journal_size);
 281
     /*
      * Returns, by value, the same state type which vdo_decode_recovery_journal() accepts; the
      * journal is const.
      */
 282 struct recovery_journal_state_7_0 __must_check
 283 vdo_record_recovery_journal(const struct recovery_journal *journal);
 284
     /* Returns nothing; see the DOC comment in this file for how entries and waiters are handled. */
 285 void vdo_add_recovery_journal_entry(struct recovery_journal *journal,
 286                                     struct data_vio *data_vio);
 287
     /*
      * The acquire/release pair identifies a journal block by @sequence_number and the holder by
      * @zone_type and @zone_id.
      */
 288 void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journal,
 289                                                   sequence_number_t sequence_number,
 290                                                   enum vdo_zone_type zone_type,
 291                                                   zone_count_t zone_id);
 292
 293 void vdo_release_recovery_journal_block_reference(struct recovery_journal *journal,
 294                                                   sequence_number_t sequence_number,
 295                                                   enum vdo_zone_type zone_type,
 296                                                   zone_count_t zone_id);
 297
     /*
      * Takes no zone arguments, unlike vdo_release_recovery_journal_block_reference().
      * NOTE(review): presumably drops a per-entry lock held on behalf of the journal zone - confirm.
      */
 298 void vdo_release_journal_entry_lock(struct recovery_journal *journal,
 299                                     sequence_number_t sequence_number);
 300
     /*
      * Drain and resume both take a parent completion and return nothing.
      * NOTE(review): presumably the outcome is reported through @parent - confirm.
      */
 301 void vdo_drain_recovery_journal(struct recovery_journal *journal,
 302                                 const struct admin_state_code *operation,
 303                                 struct vdo_completion *parent);
 304
 305 void vdo_resume_recovery_journal(struct recovery_journal *journal,
 306                                  struct vdo_completion *parent);
 307
     /* NOTE(review): presumably reports journal->logical_blocks_used - confirm. */
 308 block_count_t __must_check
 309 vdo_get_recovery_journal_logical_blocks_used(const struct recovery_journal *journal);
 310
     /*
      * Returns a statistics structure by value, the same type as journal->events; the journal is
      * const.
      */
 311 struct recovery_journal_statistics __must_check
 312 vdo_get_recovery_journal_statistics(const struct recovery_journal *journal);
 313
     /* Takes a const journal and returns nothing. */
 314 void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal);
 315
 316 #endif /* VDO_RECOVERY_JOURNAL_H */