1 /* SPDX-License-Identifier: GPL-2.0-only */
3 * Copyright 2023 Red Hat
6 #ifndef VDO_RECOVERY_JOURNAL_H
7 #define VDO_RECOVERY_JOURNAL_H
9 #include <linux/list.h>
13 #include "admin-state.h"
14 #include "constants.h"
15 #include "encodings.h"
17 #include "statistics.h"
19 #include "wait-queue.h"
22 * DOC: recovery journal.
 * The recovery_journal provides a log of all block mapping and reference count changes which have
 * not yet been stably written to the block map or slab journals. This log helps to reduce write
 * amplification by amortizing slab journal and block map page updates.
29 * The recovery journal has a single dedicated queue and thread for performing all journal updates.
30 * The concurrency guarantees of this single-threaded model allow the code to omit more
31 * fine-grained locking for recovery journal structures.
33 * The journal consists of a set of on-disk blocks arranged as a circular log with monotonically
34 * increasing sequence numbers. Three sequence numbers serve to define the active extent of the
35 * journal. The 'head' is the oldest active block in the journal. The 'tail' is the end of the
36 * half-open interval containing the active blocks. 'active' is the number of the block actively
37 * receiving entries. In an empty journal, head == active == tail. Once any entries are added, tail
38 * = active + 1, and head may be any value in the interval [tail - size, active].
40 * The journal also contains a set of in-memory blocks which are used to buffer up entries until
41 * they can be committed. In general the number of in-memory blocks ('tail_buffer_count') will be
42 * less than the on-disk size. Each in-memory block is also a vdo_completion. Each in-memory block
43 * has a vio which is used to commit that block to disk. The vio's data is the on-disk
44 * representation of the journal block. In addition each in-memory block has a buffer which is used
45 * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are
46 * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active
47 * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is
48 * moved back to the 'free_tail_blocks' ring.
50 * When entries are added to the journal, they are added to the active in-memory block, as
51 * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be
52 * committed, the requesting VIO will be attached to the in-memory block to which the caller's
53 * entry was added. If the caller does wish to wait, or if the entry filled the active block, an
54 * attempt will be made to commit that block to disk. If there is already another commit in
55 * progress, the attempt will be ignored and then automatically retried when the in-progress commit
56 * completes. If there is no commit in progress, any data_vios waiting on the block are transferred
57 * to the block's vio which is then written, automatically waking all of the waiters when it
58 * completes. When the write completes, any entries which accumulated in the block are copied to
59 * the vio's data buffer.
61 * Finally, the journal maintains a set of counters, one for each on disk journal block. These
62 * counters are used as locks to prevent premature reaping of journal blocks. Each time a new
63 * sequence number is used, the counter for the corresponding block is incremented. The counter is
64 * subsequently decremented when that block is filled and then committed for the last time. This
65 * prevents blocks from being reaped while they are still being updated. The counter is also
66 * incremented once for each entry added to a block, and decremented once each time the block map
67 * is updated in memory for that request. This prevents blocks from being reaped while their VIOs
68 * are still active. Finally, each in-memory block map page tracks the oldest journal block that
69 * contains entries corresponding to uncommitted updates to that block map page. Each time an
70 * in-memory block map page is updated, it checks if the journal block for the VIO is earlier than
71 * the one it references, in which case it increments the count on the earlier journal block and
72 * decrements the count on the later journal block, maintaining a lock on the oldest journal block
73 * containing entries for that page. When a block map page has been flushed from the cache, the
74 * counter for the journal block it references is decremented. Whenever the counter for the head
75 * block goes to 0, the head is advanced until it comes to a block whose counter is not 0 or until
76 * it reaches the active block. This is the mechanism for reclaiming journal space on disk.
78 * If there is no in-memory space when a VIO attempts to add an entry, the VIO will be attached to
79 * the 'commit_completion' and will be woken the next time a full block has committed. If there is
80 * no on-disk space when a VIO attempts to add an entry, the VIO will be attached to the
81 * 'reap_completion', and will be woken the next time a journal block is reaped.
/*
 * NOTE(review): the declaration that opens this enumerator list is not visible in this view.
 * Presumably these are the values of the 'enum vdo_zone_type' taken by the block reference
 * acquire/release prototypes later in this header -- confirm against the full file.
 */
VDO_ZONE_TYPE_JOURNAL,
VDO_ZONE_TYPE_LOGICAL,
VDO_ZONE_TYPE_PHYSICAL,
/*
 * NOTE(review): the 'struct' line that opens this member list is not visible in this view.
 * Presumably these are the members of the 'struct lock_counter' embedded in struct
 * recovery_journal -- confirm against the full file.
 */
/* The completion for notifying the owner of a lock release */
struct vdo_completion completion;
/* The number of logical zones which may hold locks */
zone_count_t logical_zones;
/* The number of physical zones which may hold locks */
zone_count_t physical_zones;
/* The number of locks (NOTE(review): no member declaration is visible for this comment) */
/*
 * Whether the lock release notification is in flight (NOTE(review): no member declaration is
 * visible for this comment)
 */
/* The number of logical zones which hold each lock */
atomic_t *logical_zone_counts;
/* The number of physical zones which hold each lock */
atomic_t *physical_zone_counts;
/* The per-lock counts for the journal zone */
u16 *journal_counters;
/* The per-lock decrement counts for the journal zone */
atomic_t *journal_decrement_counts;
/* The per-zone, per-lock reference counts for logical zones */
u16 *logical_counters;
/* The per-zone, per-lock reference counts for physical zones */
u16 *physical_counters;
/*
 * An in-memory journal block. As described in the DOC comment of this header, in-memory blocks
 * buffer entries until they can be committed to the on-disk journal, and move between the
 * journal's free and active lists.
 *
 * NOTE(review): in this view some member comments have no declaration after them and the closing
 * brace of the struct is not visible; confirm against the full file.
 */
struct recovery_journal_block {
	/* The doubly linked pointers for the free or active lists */
	struct list_head list_node;
	/* The waiter for the pending full block list */
	struct vdo_waiter write_waiter;
	/* The journal to which this block belongs */
	struct recovery_journal *journal;
	/* A pointer to the current sector in the packed block buffer */
	struct packed_journal_sector *sector;
	/* The vio for writing this block (NOTE(review): no member declaration visible here) */
	/* The sequence number for this block */
	sequence_number_t sequence_number;
	/* The location of this block in the on-disk journal */
	physical_block_number_t block_number;
	/* Whether this block is being committed (NOTE(review): no member declaration visible here) */
	/* The total number of entries in this block */
	journal_entry_count_t entry_count;
	/* The total number of uncommitted entries (queued or committing) */
	journal_entry_count_t uncommitted_entry_count;
	/* The number of new entries in the current commit */
	journal_entry_count_t entries_in_commit;
	/* The queue of vios which will make entries for the next commit */
	struct vdo_wait_queue entry_waiters;
	/* The queue of vios waiting for the current commit */
	struct vdo_wait_queue commit_waiters;
/*
 * The recovery journal. As described in the DOC comment of this header, the journal is a circular
 * on-disk log addressed by monotonically increasing sequence numbers, together with a set of
 * in-memory tail blocks (the trailing 'blocks' array) and per-block lock counters.
 *
 * NOTE(review): in this view some member comments have no declaration after them and the closing
 * brace of the struct is not visible; confirm against the full file.
 */
struct recovery_journal {
	/* The thread ID of the journal zone */
	thread_id_t thread_id;
	/* The slab depot which can hold locks on this journal */
	struct slab_depot *depot;
	/* The block map which can hold locks on this journal */
	struct block_map *block_map;
	/* The queue of vios waiting to make entries */
	struct vdo_wait_queue entry_waiters;
	/* The number of free entries in the journal (NOTE(review): no member declaration visible) */
	/* The number of decrement entries which need to be made */
	data_vio_count_t pending_decrement_count;
	/*
	 * Whether the journal is adding entries from the increment or decrement waiters queues
	 * (NOTE(review): no member declaration visible)
	 */
	/* The administrative state of the journal */
	struct admin_state state;
	/* Whether a reap is in progress (NOTE(review): no member declaration visible) */
	/* The location of the first journal block */
	physical_block_number_t origin;
	/* The oldest active block in the journal on disk for block map rebuild */
	sequence_number_t block_map_head;
	/* The oldest active block in the journal on disk for slab journal replay */
	sequence_number_t slab_journal_head;
	/* The newest block in the journal on disk to which a write has finished */
	sequence_number_t last_write_acknowledged;
	/* The end of the half-open interval of the active journal */
	sequence_number_t tail;
	/* The point at which the last entry will have been added */
	struct journal_point append_point;
	/* The journal point of the vio most recently released from the journal */
	struct journal_point commit_point;
	/* The nonce of the VDO (NOTE(review): no member declaration visible) */
	/* The number of recoveries completed by the VDO (NOTE(review): no member declaration visible) */
	/* The number of entries which fit in a single block */
	journal_entry_count_t entries_per_block;
	/* Unused in-memory journal blocks */
	struct list_head free_tail_blocks;
	/* In-memory journal blocks with records */
	struct list_head active_tail_blocks;
	/* A pointer to the active block (the one we are adding entries to now) */
	struct recovery_journal_block *active_block;
	/* Journal blocks that need writing */
	struct vdo_wait_queue pending_writes;
	/* The new block map reap head after reaping */
	sequence_number_t block_map_reap_head;
	/* The head block number for the block map rebuild range */
	block_count_t block_map_head_block_number;
	/* The new slab journal reap head after reaping */
	sequence_number_t slab_journal_reap_head;
	/* The head block number for the slab journal replay range */
	block_count_t slab_journal_head_block_number;
	/* The data-less vio, usable only for flushing */
	struct vio *flush_vio;
	/*
	 * The number of blocks in the on-disk journal (NOTE(review): no member declaration visible;
	 * presumably this is the 'size' member read by the inline helpers in this header -- confirm)
	 */
	/* The number of logical blocks that are in-use */
	block_count_t logical_blocks_used;
	/* The number of block map pages that are allocated */
	block_count_t block_map_data_blocks;
	/* The number of journal blocks written but not yet acknowledged */
	block_count_t pending_write_count;
	/* The threshold at which slab journal tail blocks will be written out */
	block_count_t slab_journal_commit_threshold;
	/* Counters for events in the journal that are reported as statistics */
	struct recovery_journal_statistics events;
	/* The locks for each on-disk block */
	struct lock_counter lock_counter;
	/* The tail blocks (a flexible array member, so it must remain the last member) */
	struct recovery_journal_block blocks[];
221 * vdo_get_recovery_journal_block_number() - Get the physical block number for a given sequence
223 * @journal: The journal.
224 * @sequence: The sequence number of the desired block.
226 * Return: The block number corresponding to the sequence number.
228 static inline physical_block_number_t __must_check
229 vdo_get_recovery_journal_block_number(const struct recovery_journal
*journal
,
230 sequence_number_t sequence
)
233 * Since journal size is a power of two, the block number modulus can just be extracted
234 * from the low-order bits of the sequence.
236 return vdo_compute_recovery_journal_block_number(journal
->size
, sequence
);
240 * vdo_compute_recovery_journal_check_byte() - Compute the check byte for a given sequence number.
241 * @journal: The journal.
242 * @sequence: The sequence number.
244 * Return: The check byte corresponding to the sequence number.
246 static inline u8 __must_check
247 vdo_compute_recovery_journal_check_byte(const struct recovery_journal
*journal
,
248 sequence_number_t sequence
)
250 /* The check byte must change with each trip around the journal. */
251 return (((sequence
/ journal
->size
) & 0x7F) | 0x80);
/*
 * Declarations of the recovery journal interface. Only the prototypes appear in this header; see
 * the implementation for the behavior of each function. __must_check marks functions whose return
 * value the caller is expected to use.
 */
int __must_check vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state,
					     nonce_t nonce, struct vdo *vdo,
					     struct partition *partition,
					     /* NOTE(review): a parameter line appears to be missing here in this view */
					     block_count_t journal_size,
					     struct recovery_journal **journal_ptr);

void vdo_free_recovery_journal(struct recovery_journal *journal);

void vdo_initialize_recovery_journal_post_repair(struct recovery_journal *journal,
						 /* NOTE(review): a parameter line appears to be missing here in this view */
						 sequence_number_t tail,
						 block_count_t logical_blocks_used,
						 block_count_t block_map_data_blocks);

block_count_t __must_check
vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal);

thread_id_t __must_check vdo_get_recovery_journal_thread_id(struct recovery_journal *journal);

void vdo_open_recovery_journal(struct recovery_journal *journal,
			       struct slab_depot *depot, struct block_map *block_map);

/* NOTE(review): the return type line of the next prototype is not visible in this view. */
vdo_get_recovery_journal_current_sequence_number(struct recovery_journal *journal);

block_count_t __must_check vdo_get_recovery_journal_length(block_count_t journal_size);

struct recovery_journal_state_7_0 __must_check
vdo_record_recovery_journal(const struct recovery_journal *journal);

void vdo_add_recovery_journal_entry(struct recovery_journal *journal,
				    struct data_vio *data_vio);

/*
 * The acquire/release pair below takes the same arguments: a journal block sequence number plus
 * the type and id of a zone.
 */
void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journal,
						  sequence_number_t sequence_number,
						  enum vdo_zone_type zone_type,
						  zone_count_t zone_id);

void vdo_release_recovery_journal_block_reference(struct recovery_journal *journal,
						  sequence_number_t sequence_number,
						  enum vdo_zone_type zone_type,
						  zone_count_t zone_id);

void vdo_release_journal_entry_lock(struct recovery_journal *journal,
				    sequence_number_t sequence_number);

void vdo_drain_recovery_journal(struct recovery_journal *journal,
				const struct admin_state_code *operation,
				struct vdo_completion *parent);

void vdo_resume_recovery_journal(struct recovery_journal *journal,
				 struct vdo_completion *parent);

block_count_t __must_check
vdo_get_recovery_journal_logical_blocks_used(const struct recovery_journal *journal);

struct recovery_journal_statistics __must_check
vdo_get_recovery_journal_statistics(const struct recovery_journal *journal);

void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal);
316 #endif /* VDO_RECOVERY_JOURNAL_H */