// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "slab-depot.h"

#include <linux/atomic.h>
#include <linux/bio.h>
#include <linux/err.h>
#include <linux/log2.h>
#include <linux/min_heap.h>
#include <linux/minmax.h>

#include "logger.h"
#include "memory-alloc.h"
#include "numeric.h"
#include "permassert.h"
#include "string-utils.h"

#include "action-manager.h"
#include "admin-state.h"
#include "completion.h"
#include "constants.h"
#include "data-vio.h"
#include "encodings.h"
#include "io-submitter.h"
#include "physical-zone.h"
#include "priority-table.h"
#include "recovery-journal.h"
#include "repair.h"
#include "status-codes.h"
#include "types.h"
#include "vdo.h"
#include "vio.h"
#include "wait-queue.h"

static const u64 BYTES_PER_WORD = sizeof(u64);
static const bool NORMAL_OPERATION = true;
/**
 * get_lock() - Get the lock object for a slab journal block by sequence number.
 * @journal: vdo_slab journal to retrieve from.
 * @sequence_number: Sequence number of the block.
 *
 * Return: The lock object for the given sequence number.
 */
static inline struct journal_lock * __must_check get_lock(struct slab_journal *journal,
							   sequence_number_t sequence_number)
{
	return &journal->locks[sequence_number % journal->size];
}
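
/*
 * The locks array is indexed modulo journal->size, so it behaves as a ring: with an
 * illustrative journal->size of 8, sequence numbers 3 and 11 would share locks[3]. A slot
 * is therefore only safe to reuse once the older block that maps to it has been reaped.
 */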

static bool is_slab_open(struct vdo_slab *slab)
{
	return (!vdo_is_state_quiescing(&slab->state) &&
		!vdo_is_state_quiescent(&slab->state));
}
/**
 * must_make_entries_to_flush() - Check whether there are entry waiters which should delay a flush.
 * @journal: The journal to check.
 *
 * Return: true if there are entry waiters and the slab is not currently rebuilding.
 */
static inline bool __must_check must_make_entries_to_flush(struct slab_journal *journal)
{
	return ((journal->slab->status != VDO_SLAB_REBUILDING) &&
		vdo_waitq_has_waiters(&journal->entry_waiters));
}
/**
 * is_reaping() - Check whether a reap is currently in progress.
 * @journal: The journal which may be reaping.
 *
 * Return: true if the journal is reaping.
 */
static inline bool __must_check is_reaping(struct slab_journal *journal)
{
	return (journal->head != journal->unreapable);
}

/**
 * initialize_tail_block() - Initialize tail block as a new block.
 * @journal: The journal whose tail block is being initialized.
 */
static void initialize_tail_block(struct slab_journal *journal)
{
	struct slab_journal_block_header *header = &journal->tail_header;

	header->sequence_number = journal->tail;
	header->entry_count = 0;
	header->has_block_map_increments = false;
}

/**
 * initialize_journal_state() - Set all journal fields appropriately to start journaling.
 * @journal: The journal to be reset, based on its tail sequence number.
 */
static void initialize_journal_state(struct slab_journal *journal)
{
	journal->unreapable = journal->head;
	journal->reap_lock = get_lock(journal, journal->unreapable);
	journal->next_commit = journal->tail;
	journal->summarized = journal->last_summarized = journal->tail;
	initialize_tail_block(journal);
}

/**
 * block_is_full() - Check whether a journal block is full.
 * @journal: The slab journal for the block.
 *
 * Return: true if the tail block is full.
 */
static bool __must_check block_is_full(struct slab_journal *journal)
{
	journal_entry_count_t count = journal->tail_header.entry_count;

	return (journal->tail_header.has_block_map_increments ?
		(journal->full_entries_per_block == count) :
		(journal->entries_per_block == count));
}

static void add_entries(struct slab_journal *journal);
static void update_tail_block_location(struct slab_journal *journal);
static void release_journal_locks(struct vdo_waiter *waiter, void *context);

/**
 * is_slab_journal_blank() - Check whether a slab's journal is blank.
 *
 * A slab journal is blank if it has never had any entries recorded in it.
 *
 * Return: true if the slab's journal has never been modified.
 */
static bool is_slab_journal_blank(const struct vdo_slab *slab)
{
	return ((slab->journal.tail == 1) &&
		(slab->journal.tail_header.entry_count == 0));
}
/**
 * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct
 *                             order.
 * @journal: The journal to be marked dirty.
 * @lock: The recovery journal lock held by the slab journal.
 */
static void mark_slab_journal_dirty(struct slab_journal *journal, sequence_number_t lock)
{
	struct slab_journal *dirty_journal;
	struct list_head *dirty_list = &journal->slab->allocator->dirty_slab_journals;

	VDO_ASSERT_LOG_ONLY(journal->recovery_lock == 0, "slab journal was clean");

	journal->recovery_lock = lock;
	list_for_each_entry_reverse(dirty_journal, dirty_list, dirty_entry) {
		if (dirty_journal->recovery_lock <= journal->recovery_lock)
			break;
	}

	list_move_tail(&journal->dirty_entry, dirty_journal->dirty_entry.next);
}

static void mark_slab_journal_clean(struct slab_journal *journal)
{
	journal->recovery_lock = 0;
	list_del_init(&journal->dirty_entry);
}
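
/*
 * Note on the dirty ring above: dirty_slab_journals is kept sorted by ascending
 * recovery_lock, so the reverse scan in mark_slab_journal_dirty() stops at the last
 * journal whose lock is not newer than this one's, and list_move_tail() splices this
 * journal in immediately after it.
 */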

static void check_if_slab_drained(struct vdo_slab *slab)
{
	bool read_only;
	struct slab_journal *journal = &slab->journal;
	const struct admin_state_code *code;

	if (!vdo_is_state_draining(&slab->state) ||
	    must_make_entries_to_flush(journal) ||
	    is_reaping(journal) ||
	    journal->waiting_to_commit ||
	    !list_empty(&journal->uncommitted_blocks) ||
	    journal->updating_slab_summary ||
	    (slab->active_count > 0))
		return;

	/* When not suspending or recovering, the slab must be clean. */
	code = vdo_get_admin_state_code(&slab->state);
	read_only = vdo_is_read_only(slab->allocator->depot->vdo);
	if (!read_only &&
	    vdo_waitq_has_waiters(&slab->dirty_blocks) &&
	    (code != VDO_ADMIN_STATE_SUSPENDING) &&
	    (code != VDO_ADMIN_STATE_RECOVERING))
		return;

	vdo_finish_draining_with_result(&slab->state,
					(read_only ? VDO_READ_ONLY : VDO_SUCCESS));
}
/* FULLNESS HINT COMPUTATION */

/**
 * compute_fullness_hint() - Translate a slab's free block count into a 'fullness hint' that can be
 *                           stored in a slab_summary_entry's 7 bits that are dedicated to its free
 *                           count.
 * @depot: The depot whose summary is being updated.
 * @free_blocks: The number of free blocks.
 *
 * Note: the number of free blocks must be strictly less than 2^23 blocks, even though
 * theoretically slabs could contain precisely 2^23 blocks; there is an assumption that at least
 * one block is used by metadata. This assumption is necessary; otherwise, the fullness hint might
 * overflow. The fullness hint formula is roughly (fullness >> 16) & 0x7f, but (2^23 >> 16) & 0x7f
 * is 0, which would make it impossible to distinguish completely full from completely empty.
 *
 * Return: A fullness hint, which can be stored in 7 bits.
 */
static u8 __must_check compute_fullness_hint(struct slab_depot *depot,
					     block_count_t free_blocks)
{
	block_count_t hint;

	VDO_ASSERT_LOG_ONLY((free_blocks < (1 << 23)), "free blocks must be less than 2^23");

	if (free_blocks == 0)
		return 0;

	hint = free_blocks >> depot->hint_shift;
	return ((hint == 0) ? 1 : hint);
}
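
/*
 * A worked example of the hint math, assuming a hint_shift of 16 (the shift implied by the
 * (fullness >> 16) & 0x7f formula described above): 0x12345 free blocks shifts down to a
 * hint of 1, and 0x7f0000 free blocks shifts down to 0x7f. Any nonzero free count small
 * enough to shift to 0 is reported as 1, so a hint of 0 always means "completely full".
 */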
/**
 * check_summary_drain_complete() - Check whether an allocator's summary has finished draining.
 */
static void check_summary_drain_complete(struct block_allocator *allocator)
{
	if (!vdo_is_state_draining(&allocator->summary_state) ||
	    (allocator->summary_write_count > 0))
		return;

	vdo_finish_operation(&allocator->summary_state,
			     (vdo_is_read_only(allocator->depot->vdo) ?
			      VDO_READ_ONLY : VDO_SUCCESS));
}
/**
 * notify_summary_waiters() - Wake all the waiters in a given queue.
 * @allocator: The block allocator summary which owns the queue.
 * @queue: The queue to notify.
 */
static void notify_summary_waiters(struct block_allocator *allocator,
				   struct vdo_wait_queue *queue)
{
	int result = (vdo_is_read_only(allocator->depot->vdo) ?
		      VDO_READ_ONLY : VDO_SUCCESS);

	vdo_waitq_notify_all_waiters(queue, NULL, &result);
}

static void launch_write(struct slab_summary_block *summary_block);

/**
 * finish_updating_slab_summary_block() - Finish processing a block which attempted to write,
 *                                        whether or not the attempt succeeded.
 * @block: The block.
 */
static void finish_updating_slab_summary_block(struct slab_summary_block *block)
{
	notify_summary_waiters(block->allocator, &block->current_update_waiters);
	block->writing = false;
	block->allocator->summary_write_count--;
	if (vdo_waitq_has_waiters(&block->next_update_waiters))
		launch_write(block);
	else
		check_summary_drain_complete(block->allocator);
}

/**
 * finish_update() - This is the callback for a successful summary block write.
 * @completion: The write vio.
 */
static void finish_update(struct vdo_completion *completion)
{
	struct slab_summary_block *block =
		container_of(as_vio(completion), struct slab_summary_block, vio);

	atomic64_inc(&block->allocator->depot->summary_statistics.blocks_written);
	finish_updating_slab_summary_block(block);
}

/**
 * handle_write_error() - Handle an error writing a slab summary block.
 * @completion: The write VIO.
 */
static void handle_write_error(struct vdo_completion *completion)
{
	struct slab_summary_block *block =
		container_of(as_vio(completion), struct slab_summary_block, vio);

	vio_record_metadata_io_error(as_vio(completion));
	vdo_enter_read_only_mode(completion->vdo, completion->result);
	finish_updating_slab_summary_block(block);
}

static void write_slab_summary_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct slab_summary_block *block =
		container_of(vio, struct slab_summary_block, vio);

	continue_vio_after_io(vio, finish_update, block->allocator->thread_id);
}
/**
 * launch_write() - Write a slab summary block unless it is currently out for writing.
 * @block: The block that needs to be committed.
 */
static void launch_write(struct slab_summary_block *block)
{
	struct block_allocator *allocator = block->allocator;
	struct slab_depot *depot = allocator->depot;
	physical_block_number_t pbn;

	if (block->writing)
		return;

	allocator->summary_write_count++;
	vdo_waitq_transfer_all_waiters(&block->next_update_waiters,
				       &block->current_update_waiters);
	block->writing = true;

	if (vdo_is_read_only(depot->vdo)) {
		finish_updating_slab_summary_block(block);
		return;
	}

	memcpy(block->outgoing_entries, block->entries, VDO_BLOCK_SIZE);

	/*
	 * Flush before writing to ensure that the slab journal tail blocks and reference updates
	 * covered by this summary update are stable. Otherwise, a subsequent recovery could
	 * encounter a slab summary update that refers to a slab journal tail block that has not
	 * actually been written. In such cases, the slab journal referenced will be treated as
	 * empty, causing any data within the slab which predates the existing recovery journal
	 * entries to be lost.
	 */
	pbn = (depot->summary_origin +
	       (VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE * allocator->zone_number) +
	       block->index);
	vdo_submit_metadata_vio(&block->vio, pbn, write_slab_summary_endio,
				handle_write_error, REQ_OP_WRITE | REQ_PREFLUSH);
}
/**
 * update_slab_summary_entry() - Update the entry for a slab.
 * @slab: The slab whose entry is to be updated.
 * @waiter: The waiter that is updating the summary.
 * @tail_block_offset: The offset of the slab journal's tail block.
 * @load_ref_counts: Whether the reference counts must be loaded from disk on the vdo load.
 * @is_clean: Whether the slab is clean.
 * @free_blocks: The number of free blocks.
 */
static void update_slab_summary_entry(struct vdo_slab *slab, struct vdo_waiter *waiter,
				      tail_block_offset_t tail_block_offset,
				      bool load_ref_counts, bool is_clean,
				      block_count_t free_blocks)
{
	u8 index = slab->slab_number / VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK;
	struct block_allocator *allocator = slab->allocator;
	struct slab_summary_block *block = &allocator->summary_blocks[index];
	int result;
	struct slab_summary_entry *entry;

	if (vdo_is_read_only(block->vio.completion.vdo)) {
		result = VDO_READ_ONLY;
		waiter->callback(waiter, &result);
		return;
	}

	if (vdo_is_state_draining(&allocator->summary_state) ||
	    vdo_is_state_quiescent(&allocator->summary_state)) {
		result = VDO_INVALID_ADMIN_STATE;
		waiter->callback(waiter, &result);
		return;
	}

	entry = &allocator->summary_entries[slab->slab_number];
	*entry = (struct slab_summary_entry) {
		.tail_block_offset = tail_block_offset,
		.load_ref_counts = (entry->load_ref_counts || load_ref_counts),
		.is_dirty = !is_clean,
		.fullness_hint = compute_fullness_hint(allocator->depot, free_blocks),
	};
	vdo_waitq_enqueue_waiter(&block->next_update_waiters, waiter);
	launch_write(block);
}
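
/*
 * Note that load_ref_counts is sticky in update_slab_summary_entry(): the new entry ORs
 * the caller's value with the one already recorded, so this path can set the flag but
 * never clear it.
 */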
/**
 * finish_reaping() - Actually advance the head of the journal now that any necessary flushes are
 *                    complete.
 * @journal: The journal to be reaped.
 */
static void finish_reaping(struct slab_journal *journal)
{
	journal->head = journal->unreapable;
	add_entries(journal);
	check_if_slab_drained(journal->slab);
}

static void reap_slab_journal(struct slab_journal *journal);

/**
 * complete_reaping() - Finish reaping now that we have flushed the lower layer and then try
 *                      reaping again in case we deferred reaping due to an outstanding vio.
 * @completion: The flush vio.
 */
static void complete_reaping(struct vdo_completion *completion)
{
	struct slab_journal *journal = completion->parent;

	return_vio_to_pool(journal->slab->allocator->vio_pool,
			   vio_as_pooled_vio(as_vio(vdo_forget(completion))));
	finish_reaping(journal);
	reap_slab_journal(journal);
}

/**
 * handle_flush_error() - Handle an error flushing the lower layer.
 * @completion: The flush vio.
 */
static void handle_flush_error(struct vdo_completion *completion)
{
	vio_record_metadata_io_error(as_vio(completion));
	vdo_enter_read_only_mode(completion->vdo, completion->result);
	complete_reaping(completion);
}

static void flush_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct slab_journal *journal = vio->completion.parent;

	continue_vio_after_io(vio, complete_reaping,
			      journal->slab->allocator->thread_id);
}

/**
 * flush_for_reaping() - A waiter callback for getting a vio with which to flush the lower layer
 *                       prior to reaping.
 * @waiter: The journal as a flush waiter.
 * @context: The newly acquired flush vio.
 */
static void flush_for_reaping(struct vdo_waiter *waiter, void *context)
{
	struct slab_journal *journal =
		container_of(waiter, struct slab_journal, flush_waiter);
	struct pooled_vio *pooled = context;
	struct vio *vio = &pooled->vio;

	vio->completion.parent = journal;
	vdo_submit_flush_vio(vio, flush_endio, handle_flush_error);
}

/**
 * reap_slab_journal() - Conduct a reap on a slab journal to reclaim unreferenced blocks.
 * @journal: The slab journal.
 */
static void reap_slab_journal(struct slab_journal *journal)
{
	bool reaped = false;

	if (is_reaping(journal)) {
		/* We already have a reap in progress so wait for it to finish. */
		return;
	}

	if ((journal->slab->status != VDO_SLAB_REBUILT) ||
	    !vdo_is_state_normal(&journal->slab->state) ||
	    vdo_is_read_only(journal->slab->allocator->depot->vdo)) {
		/*
		 * We must not reap in the first two cases, and there's no point in read-only mode.
		 */
		return;
	}

	/*
	 * Start reclaiming blocks only when the journal head has no references. Then stop when a
	 * block is referenced or reap reaches the most recently written block, referenced by the
	 * slab summary, which has the sequence number just before the tail.
	 */
	while ((journal->unreapable < journal->tail) && (journal->reap_lock->count == 0)) {
		reaped = true;
		journal->unreapable++;
		journal->reap_lock++;
		if (journal->reap_lock == &journal->locks[journal->size])
			journal->reap_lock = &journal->locks[0];
	}

	if (!reaped)
		return;

	/*
	 * It is never safe to reap a slab journal block without first issuing a flush, regardless
	 * of whether a user flush has been received or not. In the absence of the flush, the
	 * reference block write which released the locks allowing the slab journal to reap may not
	 * be persisted. Although slab summary writes will eventually issue flushes, multiple slab
	 * journal block writes can be issued while previous slab summary updates have not yet been
	 * made. Even though those slab journal block writes will be ignored if the slab summary
	 * update is not persisted, they may still overwrite the to-be-reaped slab journal block
	 * resulting in a loss of reference count updates.
	 */
	journal->flush_waiter.callback = flush_for_reaping;
	acquire_vio_from_pool(journal->slab->allocator->vio_pool,
			      &journal->flush_waiter);
}
/**
 * adjust_slab_journal_block_reference() - Adjust the reference count for a slab journal block.
 * @journal: The slab journal.
 * @sequence_number: The journal sequence number of the referenced block.
 * @adjustment: Amount to adjust the reference counter.
 *
 * Note that when the adjustment is negative, the slab journal will be reaped.
 */
static void adjust_slab_journal_block_reference(struct slab_journal *journal,
						sequence_number_t sequence_number,
						int adjustment)
{
	struct journal_lock *lock;

	if (sequence_number == 0)
		return;

	if (journal->slab->status == VDO_SLAB_REPLAYING) {
		/* Locks should not be used during offline replay. */
		return;
	}

	VDO_ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero");
	lock = get_lock(journal, sequence_number);
	if (adjustment < 0) {
		VDO_ASSERT_LOG_ONLY((-adjustment <= lock->count),
				    "adjustment %d of lock count %u for slab journal block %llu must not underflow",
				    adjustment, lock->count,
				    (unsigned long long) sequence_number);
	}

	lock->count += adjustment;
	if (lock->count == 0)
		reap_slab_journal(journal);
}

/**
 * release_journal_locks() - Callback invoked after a slab summary update completes.
 * @waiter: The slab summary waiter that has just been notified.
 * @context: The result code of the update.
 *
 * Registered in the constructor on behalf of update_tail_block_location().
 *
 * Implements waiter_callback_fn.
 */
static void release_journal_locks(struct vdo_waiter *waiter, void *context)
{
	sequence_number_t first, i;
	struct slab_journal *journal =
		container_of(waiter, struct slab_journal, slab_summary_waiter);
	int result = *((int *) context);

	if (result != VDO_SUCCESS) {
		if (result != VDO_READ_ONLY) {
			/*
			 * Don't bother logging what might be lots of errors if we are already in
			 * read-only mode.
			 */
			vdo_log_error_strerror(result, "failed slab summary update %llu",
					       (unsigned long long) journal->summarized);
		}

		journal->updating_slab_summary = false;
		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
		check_if_slab_drained(journal->slab);
		return;
	}

	if (journal->partial_write_in_progress && (journal->summarized == journal->tail)) {
		journal->partial_write_in_progress = false;
		add_entries(journal);
	}

	first = journal->last_summarized;
	journal->last_summarized = journal->summarized;
	for (i = journal->summarized - 1; i >= first; i--) {
		/*
		 * Release the lock the summarized block held on the recovery journal. (During
		 * replay, recovery_start will always be 0.)
		 */
		if (journal->recovery_journal != NULL) {
			zone_count_t zone_number = journal->slab->allocator->zone_number;
			struct journal_lock *lock = get_lock(journal, i);

			vdo_release_recovery_journal_block_reference(journal->recovery_journal,
								     lock->recovery_start,
								     VDO_ZONE_TYPE_PHYSICAL,
								     zone_number);
		}

		/*
		 * Release our own lock against reaping for blocks that are committed. (This
		 * function will not change locks during replay.)
		 */
		adjust_slab_journal_block_reference(journal, i, -1);
	}

	journal->updating_slab_summary = false;

	reap_slab_journal(journal);

	/* Check if the slab summary needs to be updated again. */
	update_tail_block_location(journal);
}
/**
 * update_tail_block_location() - Update the tail block location in the slab summary, if necessary.
 * @journal: The slab journal that is updating its tail block location.
 */
static void update_tail_block_location(struct slab_journal *journal)
{
	block_count_t free_block_count;
	struct vdo_slab *slab = journal->slab;

	if (journal->updating_slab_summary ||
	    vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
	    (journal->last_summarized >= journal->next_commit)) {
		check_if_slab_drained(slab);
		return;
	}

	if (slab->status != VDO_SLAB_REBUILT) {
		u8 hint = slab->allocator->summary_entries[slab->slab_number].fullness_hint;

		free_block_count = ((block_count_t) hint) << slab->allocator->depot->hint_shift;
	} else {
		free_block_count = slab->free_blocks;
	}

	journal->summarized = journal->next_commit;
	journal->updating_slab_summary = true;

	/*
	 * Update slab summary as dirty.
	 * vdo_slab journal can only reap past sequence number 1 when all the ref counts for this
	 * slab have been written to the layer. Therefore, indicate that the ref counts must be
	 * loaded when the journal head has reaped past sequence number 1.
	 */
	update_slab_summary_entry(slab, &journal->slab_summary_waiter,
				  journal->summarized % journal->size,
				  (journal->head > 1), false, free_block_count);
}
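
/*
 * For a slab which has not been rebuilt, the free block count written back above is derived
 * by reversing the fullness hint already in the summary (hint << hint_shift) rather than
 * from slab->free_blocks; presumably the in-memory count cannot be trusted until the slab
 * has been scrubbed.
 */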
/**
 * reopen_slab_journal() - Reopen a slab's journal by emptying it and then adding pending entries.
 */
static void reopen_slab_journal(struct vdo_slab *slab)
{
	struct slab_journal *journal = &slab->journal;
	sequence_number_t block;

	VDO_ASSERT_LOG_ONLY(journal->tail_header.entry_count == 0,
			    "vdo_slab journal's active block empty before reopening");
	journal->head = journal->tail;
	initialize_journal_state(journal);

	/* Ensure no locks are spuriously held on an empty journal. */
	for (block = 1; block <= journal->size; block++) {
		VDO_ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0),
				    "Scrubbed journal's block %llu is not locked",
				    (unsigned long long) block);
	}

	add_entries(journal);
}

static sequence_number_t get_committing_sequence_number(const struct pooled_vio *vio)
{
	const struct packed_slab_journal_block *block =
		(const struct packed_slab_journal_block *) vio->vio.data;

	return __le64_to_cpu(block->header.sequence_number);
}

/**
 * complete_write() - Handle post-commit processing.
 * @completion: The write vio as a completion.
 *
 * This is the callback registered by write_slab_journal_block().
 */
static void complete_write(struct vdo_completion *completion)
{
	int result = completion->result;
	struct pooled_vio *pooled = vio_as_pooled_vio(as_vio(completion));
	struct slab_journal *journal = completion->parent;
	sequence_number_t committed = get_committing_sequence_number(pooled);

	list_del_init(&pooled->list_entry);
	return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled));

	if (result != VDO_SUCCESS) {
		vio_record_metadata_io_error(as_vio(completion));
		vdo_log_error_strerror(result, "cannot write slab journal block %llu",
				       (unsigned long long) committed);
		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
		check_if_slab_drained(journal->slab);
		return;
	}

	WRITE_ONCE(journal->events->blocks_written, journal->events->blocks_written + 1);

	if (list_empty(&journal->uncommitted_blocks)) {
		/* If no blocks are outstanding, then the commit point is at the tail. */
		journal->next_commit = journal->tail;
	} else {
		/* The commit point is always the beginning of the oldest incomplete block. */
		pooled = container_of(journal->uncommitted_blocks.next,
				      struct pooled_vio, list_entry);
		journal->next_commit = get_committing_sequence_number(pooled);
	}

	update_tail_block_location(journal);
}

static void write_slab_journal_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct slab_journal *journal = vio->completion.parent;

	continue_vio_after_io(vio, complete_write, journal->slab->allocator->thread_id);
}
/**
 * write_slab_journal_block() - Write a slab journal block.
 * @waiter: The vio pool waiter which was just notified.
 * @context: The vio pool entry for the write.
 *
 * Callback from acquire_vio_from_pool() registered in commit_tail().
 */
static void write_slab_journal_block(struct vdo_waiter *waiter, void *context)
{
	struct pooled_vio *pooled = context;
	struct vio *vio = &pooled->vio;
	struct slab_journal *journal =
		container_of(waiter, struct slab_journal, resource_waiter);
	struct slab_journal_block_header *header = &journal->tail_header;
	int unused_entries = journal->entries_per_block - header->entry_count;
	physical_block_number_t block_number;
	const struct admin_state_code *operation;

	header->head = journal->head;
	list_add_tail(&pooled->list_entry, &journal->uncommitted_blocks);
	vdo_pack_slab_journal_block_header(header, &journal->block->header);

	/* Copy the tail block into the vio. */
	memcpy(pooled->vio.data, journal->block, VDO_BLOCK_SIZE);

	VDO_ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull");
	if (unused_entries > 0) {
		/*
		 * Release the per-entry locks for any unused entries in the block we are about to
		 * write.
		 */
		adjust_slab_journal_block_reference(journal, header->sequence_number,
						    -unused_entries);
		journal->partial_write_in_progress = !block_is_full(journal);
	}

	block_number = journal->slab->journal_origin +
		(header->sequence_number % journal->size);
	vio->completion.parent = journal;

	/*
	 * This block won't be read in recovery until the slab summary is updated to refer to it.
	 * The slab summary update does a flush which is sufficient to protect us from corruption
	 * due to out of order slab journal, reference block, or block map writes.
	 */
	vdo_submit_metadata_vio(vdo_forget(vio), block_number, write_slab_journal_endio,
				complete_write, REQ_OP_WRITE);

	/* Since the write is submitted, the tail block structure can be reused. */
	journal->tail++;
	initialize_tail_block(journal);
	journal->waiting_to_commit = false;

	operation = vdo_get_admin_state_code(&journal->slab->state);
	if (operation == VDO_ADMIN_STATE_WAITING_FOR_RECOVERY) {
		vdo_finish_operation(&journal->slab->state,
				     (vdo_is_read_only(journal->slab->allocator->depot->vdo) ?
				      VDO_READ_ONLY : VDO_SUCCESS));
		return;
	}

	add_entries(journal);
}

/**
 * commit_tail() - Commit the tail block of the slab journal.
 * @journal: The journal whose tail block should be committed.
 */
static void commit_tail(struct slab_journal *journal)
{
	if ((journal->tail_header.entry_count == 0) && must_make_entries_to_flush(journal)) {
		/*
		 * There are no entries at the moment, but there are some waiters, so defer
		 * initiating the flush until those entries are ready to write.
		 */
		return;
	}

	if (vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
	    journal->waiting_to_commit ||
	    (journal->tail_header.entry_count == 0)) {
		/*
		 * There is nothing to do since the tail block is empty, or writing, or the journal
		 * is in read-only mode.
		 */
		return;
	}

	/*
	 * Since we are about to commit the tail block, this journal no longer needs to be on the
	 * ring of journals which the recovery journal might ask to commit.
	 */
	mark_slab_journal_clean(journal);

	journal->waiting_to_commit = true;

	journal->resource_waiter.callback = write_slab_journal_block;
	acquire_vio_from_pool(journal->slab->allocator->vio_pool,
			      &journal->resource_waiter);
}
/**
 * encode_slab_journal_entry() - Encode a slab journal entry.
 * @tail_header: The unpacked header for the block.
 * @payload: The journal block payload to hold the entry.
 * @sbn: The slab block number of the entry to encode.
 * @operation: The type of the entry.
 * @increment: True if this is an increment.
 *
 * Exposed for unit tests.
 */
static void encode_slab_journal_entry(struct slab_journal_block_header *tail_header,
				      slab_journal_payload *payload,
				      slab_block_number sbn,
				      enum journal_operation operation,
				      bool increment)
{
	journal_entry_count_t entry_number = tail_header->entry_count++;

	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
		if (!tail_header->has_block_map_increments) {
			memset(payload->full_entries.entry_types, 0,
			       VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE);
			tail_header->has_block_map_increments = true;
		}

		payload->full_entries.entry_types[entry_number / 8] |=
			((u8)1 << (entry_number % 8));
	}

	vdo_pack_slab_journal_entry(&payload->entries[entry_number], sbn, increment);
}
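
/*
 * The entry_types bitmap above records one bit per entry: for example, entry_number 10
 * sets bit 2 (10 % 8) of entry_types[1] (10 / 8), marking that entry as a block map
 * increment.
 */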
/**
 * expand_journal_point() - Convert a recovery journal journal_point which refers to both an
 *                          increment and a decrement to a single point which refers to one or the
 *                          other.
 * @recovery_point: The journal point to convert.
 * @increment: Whether the current entry is an increment.
 *
 * Return: The expanded journal point.
 *
 * Each data_vio has only a single recovery journal point, but may need to make both increment
 * and decrement entries in the same slab journal. In order to distinguish the two entries, the
 * entry count of the expanded journal point is twice the actual recovery journal entry count
 * for increments, and one more than that for decrements.
 */
static struct journal_point expand_journal_point(struct journal_point recovery_point,
						 bool increment)
{
	recovery_point.entry_count *= 2;
	if (!increment)
		recovery_point.entry_count++;

	return recovery_point;
}
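
/*
 * For example, a recovery journal point with entry_count 5 expands to 10 for an increment
 * and to 11 for the corresponding decrement, so the two slab journal entries made from one
 * data_vio remain distinguishable and ordered.
 */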
/**
 * add_entry() - Actually add an entry to the slab journal, potentially firing off a write if a
 *               block becomes full.
 * @journal: The slab journal to append to.
 * @pbn: The pbn being adjusted.
 * @operation: The type of entry to make.
 * @increment: True if this is an increment.
 * @recovery_point: The expanded recovery point.
 *
 * This function is synchronous.
 */
static void add_entry(struct slab_journal *journal, physical_block_number_t pbn,
		      enum journal_operation operation, bool increment,
		      struct journal_point recovery_point)
{
	struct packed_slab_journal_block *block = journal->block;
	int result;

	result = VDO_ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point,
						     &recovery_point),
			    "recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u",
			    (unsigned long long) recovery_point.sequence_number,
			    recovery_point.entry_count,
			    (unsigned long long) journal->tail_header.recovery_point.sequence_number,
			    journal->tail_header.recovery_point.entry_count);
	if (result != VDO_SUCCESS) {
		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
		return;
	}

	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
		result = VDO_ASSERT((journal->tail_header.entry_count <
				     journal->full_entries_per_block),
				    "block has room for full entries");
		if (result != VDO_SUCCESS) {
			vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo,
						 result);
			return;
		}
	}

	encode_slab_journal_entry(&journal->tail_header, &block->payload,
				  pbn - journal->slab->start, operation, increment);
	journal->tail_header.recovery_point = recovery_point;
	if (block_is_full(journal))
		commit_tail(journal);
}

static inline block_count_t journal_length(const struct slab_journal *journal)
{
	return journal->tail - journal->head;
}

/**
 * vdo_attempt_replay_into_slab() - Replay a recovery journal entry into a slab's journal.
 * @slab: The slab to play into.
 * @pbn: The PBN for the entry.
 * @operation: The type of entry to add.
 * @increment: True if this entry is an increment.
 * @recovery_point: The recovery journal point corresponding to this entry.
 * @parent: The completion to notify when there is space to add the entry if the entry could not be
 *          added immediately.
 *
 * Return: true if the entry was added immediately.
 */
bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, physical_block_number_t pbn,
				  enum journal_operation operation, bool increment,
				  struct journal_point *recovery_point,
				  struct vdo_completion *parent)
{
	struct slab_journal *journal = &slab->journal;
	struct slab_journal_block_header *header = &journal->tail_header;
	struct journal_point expanded = expand_journal_point(*recovery_point, increment);

	/* Only accept entries after the current recovery point. */
	if (!vdo_before_journal_point(&journal->tail_header.recovery_point, &expanded))
		return true;

	if ((header->entry_count >= journal->full_entries_per_block) &&
	    (header->has_block_map_increments || (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING))) {
		/*
		 * The tail block does not have room for the entry we are attempting to add so
		 * commit the tail block now.
		 */
		commit_tail(journal);
	}

	if (journal->waiting_to_commit) {
		vdo_start_operation_with_waiter(&journal->slab->state,
						VDO_ADMIN_STATE_WAITING_FOR_RECOVERY,
						parent, NULL);
		return false;
	}

	if (journal_length(journal) >= journal->size) {
		/*
		 * We must have reaped the current head before the crash, since the blocked
		 * threshold keeps us from having more entries than fit in a slab journal; hence we
		 * can just advance the head (and unreapable block), as needed.
		 */
		journal->head++;
		journal->unreapable++;
	}

	if (journal->slab->status == VDO_SLAB_REBUILT)
		journal->slab->status = VDO_SLAB_REPLAYING;

	add_entry(journal, pbn, operation, increment, expanded);
	return true;
}

/**
 * requires_reaping() - Check whether the journal must be reaped before adding new entries.
 * @journal: The journal to check.
 *
 * Return: true if the journal must be reaped.
 */
static bool requires_reaping(const struct slab_journal *journal)
{
	return (journal_length(journal) >= journal->blocking_threshold);
}
/** finish_summary_update() - A waiter callback that resets the writing state of a slab. */
static void finish_summary_update(struct vdo_waiter *waiter, void *context)
{
	struct vdo_slab *slab = container_of(waiter, struct vdo_slab, summary_waiter);
	int result = *((int *) context);

	slab->active_count--;

	if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) {
		vdo_log_error_strerror(result, "failed to update slab summary");
		vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
	}

	check_if_slab_drained(slab);
}

static void write_reference_block(struct vdo_waiter *waiter, void *context);

/**
 * launch_reference_block_write() - Launch the write of a dirty reference block by first acquiring
 *                                  a VIO for it from the pool.
 * @waiter: The waiter of the block which is starting to write.
 * @context: The parent slab of the block.
 *
 * This can be asynchronous since the writer will have to wait if all VIOs in the pool are
 * currently in use.
 */
static void launch_reference_block_write(struct vdo_waiter *waiter, void *context)
{
	struct vdo_slab *slab = context;

	if (vdo_is_read_only(slab->allocator->depot->vdo))
		return;

	slab->active_count++;
	container_of(waiter, struct reference_block, waiter)->is_writing = true;
	waiter->callback = write_reference_block;
	acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
}

static void save_dirty_reference_blocks(struct vdo_slab *slab)
{
	vdo_waitq_notify_all_waiters(&slab->dirty_blocks,
				     launch_reference_block_write, slab);
	check_if_slab_drained(slab);
}

/**
 * finish_reference_block_write() - After a reference block has written, clean it, release its
 *                                  locks, and return its VIO to the pool.
 * @completion: The VIO that just finished writing.
 */
static void finish_reference_block_write(struct vdo_completion *completion)
{
	struct vio *vio = as_vio(completion);
	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
	struct reference_block *block = completion->parent;
	struct vdo_slab *slab = block->slab;
	tail_block_offset_t offset;

	slab->active_count--;

	/* Release the slab journal lock. */
	adjust_slab_journal_block_reference(&slab->journal,
					    block->slab_journal_lock_to_release, -1);
	return_vio_to_pool(slab->allocator->vio_pool, pooled);

	/*
	 * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause
	 * us to be dirtied again, but we don't want to double enqueue.
	 */
	block->is_writing = false;

	if (vdo_is_read_only(completion->vdo)) {
		check_if_slab_drained(slab);
		return;
	}

	/* Re-queue the block if it was re-dirtied while it was writing. */
	if (block->is_dirty) {
		vdo_waitq_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
		if (vdo_is_state_draining(&slab->state)) {
			/* We must be saving, and this block will otherwise not be relaunched. */
			save_dirty_reference_blocks(slab);
		}

		return;
	}

	/*
	 * Mark the slab as clean in the slab summary if there are no dirty or writing blocks
	 * and no summary update in progress.
	 */
	if ((slab->active_count > 0) || vdo_waitq_has_waiters(&slab->dirty_blocks)) {
		check_if_slab_drained(slab);
		return;
	}

	offset = slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
	slab->active_count++;
	slab->summary_waiter.callback = finish_summary_update;
	update_slab_summary_entry(slab, &slab->summary_waiter, offset,
				  true, true, slab->free_blocks);
}
/**
 * get_reference_counters_for_block() - Find the reference counters for a given block.
 * @block: The reference_block in question.
 *
 * Return: A pointer to the reference counters for this block.
 */
static vdo_refcount_t * __must_check get_reference_counters_for_block(struct reference_block *block)
{
	size_t block_index = block - block->slab->reference_blocks;

	return &block->slab->counters[block_index * COUNTS_PER_BLOCK];
}

/**
 * pack_reference_block() - Copy data from a reference block to a buffer ready to be written out.
 * @block: The block to copy.
 * @buffer: The char buffer to fill with the packed block.
 */
static void pack_reference_block(struct reference_block *block, void *buffer)
{
	struct packed_reference_block *packed = buffer;
	vdo_refcount_t *counters = get_reference_counters_for_block(block);
	sector_count_t i;
	struct packed_journal_point commit_point;

	vdo_pack_journal_point(&block->slab->slab_journal_point, &commit_point);

	for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) {
		packed->sectors[i].commit_point = commit_point;
		memcpy(packed->sectors[i].counts, counters + (i * COUNTS_PER_SECTOR),
		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));
	}
}

static void write_reference_block_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct reference_block *block = vio->completion.parent;
	thread_id_t thread_id = block->slab->allocator->thread_id;

	continue_vio_after_io(vio, finish_reference_block_write, thread_id);
}

/**
 * handle_io_error() - Handle an I/O error reading or writing a reference count block.
 * @completion: The VIO doing the I/O as a completion.
 */
static void handle_io_error(struct vdo_completion *completion)
{
	int result = completion->result;
	struct vio *vio = as_vio(completion);
	struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab;

	vio_record_metadata_io_error(vio);
	return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
	slab->active_count--;
	vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
	check_if_slab_drained(slab);
}

/**
 * write_reference_block() - After a dirty block waiter has gotten a VIO from the VIO pool, copy
 *                           its counters and associated data into the VIO, and launch the write.
 * @waiter: The waiter of the dirty block.
 * @context: The VIO returned by the pool.
 */
static void write_reference_block(struct vdo_waiter *waiter, void *context)
{
	size_t block_offset;
	physical_block_number_t pbn;
	struct pooled_vio *pooled = context;
	struct vdo_completion *completion = &pooled->vio.completion;
	struct reference_block *block = container_of(waiter, struct reference_block,
						     waiter);

	pack_reference_block(block, pooled->vio.data);
	block_offset = (block - block->slab->reference_blocks);
	pbn = (block->slab->ref_counts_origin + block_offset);
	block->slab_journal_lock_to_release = block->slab_journal_lock;
	completion->parent = block;

	/*
	 * Mark the block as clean, since we won't be committing any updates that happen after this
	 * moment. As long as VIO order is preserved, two VIOs updating this block at once will not
	 * cause complications.
	 */
	block->is_dirty = false;

	/*
	 * Flush before writing to ensure that the recovery journal and slab journal entries which
	 * cover this reference update are stable. This prevents data corruption that can be caused
	 * by out of order writes.
	 */
	WRITE_ONCE(block->slab->allocator->ref_counts_statistics.blocks_written,
		   block->slab->allocator->ref_counts_statistics.blocks_written + 1);

	completion->callback_thread_id = ((struct block_allocator *) pooled->context)->thread_id;
	vdo_submit_metadata_vio(&pooled->vio, pbn, write_reference_block_endio,
				handle_io_error, REQ_OP_WRITE | REQ_PREFLUSH);
}
static void reclaim_journal_space(struct slab_journal *journal)
{
	block_count_t length = journal_length(journal);
	struct vdo_slab *slab = journal->slab;
	block_count_t write_count = vdo_waitq_num_waiters(&slab->dirty_blocks);
	block_count_t written;

	if ((length < journal->flushing_threshold) || (write_count == 0))
		return;

	/* The slab journal is over the first threshold, schedule some reference block writes. */
	WRITE_ONCE(journal->events->flush_count, journal->events->flush_count + 1);
	if (length < journal->flushing_deadline) {
		/* Schedule more writes the closer to the deadline we get. */
		write_count /= journal->flushing_deadline - length + 1;
		write_count = max_t(block_count_t, write_count, 1);
	}

	for (written = 0; written < write_count; written++) {
		vdo_waitq_notify_next_waiter(&slab->dirty_blocks,
					     launch_reference_block_write, slab);
	}
}
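
/*
 * An illustrative reading of the scaling above: with 10 dirty reference blocks waiting and
 * the journal 3 blocks short of its flushing deadline, write_count becomes 10 / (3 + 1) = 2,
 * so two block writes are launched; once the length reaches the deadline the count is no
 * longer scaled down and every waiting block is written.
 */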
/**
 * reference_count_to_status() - Convert a reference count to a reference status.
 * @count: The count to convert.
 *
 * Return: The appropriate reference status.
 */
static enum reference_status __must_check reference_count_to_status(vdo_refcount_t count)
{
	if (count == EMPTY_REFERENCE_COUNT)
		return RS_FREE;
	else if (count == 1)
		return RS_SINGLE;
	else if (count == PROVISIONAL_REFERENCE_COUNT)
		return RS_PROVISIONAL;
	else
		return RS_SHARED;
}

/**
 * dirty_block() - Mark a reference count block as dirty, potentially adding it to the dirty queue
 *                 if it wasn't already dirty.
 * @block: The reference block to mark as dirty.
 */
static void dirty_block(struct reference_block *block)
{
	if (block->is_dirty)
		return;

	block->is_dirty = true;
	if (!block->is_writing)
		vdo_waitq_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
}

/**
 * get_reference_block() - Get the reference block that covers the given block index.
 */
static struct reference_block * __must_check get_reference_block(struct vdo_slab *slab,
								  slab_block_number index)
{
	return &slab->reference_blocks[index / COUNTS_PER_BLOCK];
}

/**
 * slab_block_number_from_pbn() - Determine the index within the slab of a particular physical
 *                                block number.
 * @slab: The slab.
 * @pbn: The physical block number.
 * @slab_block_number_ptr: A pointer to the slab block number.
 *
 * Return: VDO_SUCCESS or an error code.
 */
static int __must_check slab_block_number_from_pbn(struct vdo_slab *slab,
						   physical_block_number_t pbn,
						   slab_block_number *slab_block_number_ptr)
{
	u64 slab_block_number;

	if (pbn < slab->start)
		return VDO_OUT_OF_RANGE;

	slab_block_number = pbn - slab->start;
	if (slab_block_number >= slab->allocator->depot->slab_config.data_blocks)
		return VDO_OUT_OF_RANGE;

	*slab_block_number_ptr = slab_block_number;
	return VDO_SUCCESS;
}

/**
 * get_reference_counter() - Get the reference counter that covers the given physical block number.
 * @slab: The slab to query.
 * @pbn: The physical block number.
 * @counter_ptr: A pointer to the reference counter.
 */
static int __must_check get_reference_counter(struct vdo_slab *slab,
					      physical_block_number_t pbn,
					      vdo_refcount_t **counter_ptr)
{
	slab_block_number index;
	int result = slab_block_number_from_pbn(slab, pbn, &index);

	if (result != VDO_SUCCESS)
		return result;

	*counter_ptr = &slab->counters[index];
	return VDO_SUCCESS;
}
static unsigned int calculate_slab_priority(struct vdo_slab *slab)
{
	block_count_t free_blocks = slab->free_blocks;
	unsigned int unopened_slab_priority = slab->allocator->unopened_slab_priority;
	unsigned int priority;

	/*
	 * Wholly full slabs must be the only ones with lowest priority, 0.
	 *
	 * Slabs that have never been opened (empty, newly initialized, and never been written to)
	 * have lower priority than previously opened slabs that have a significant number of free
	 * blocks. This ranking causes VDO to avoid writing physical blocks for the first time
	 * unless there are very few free blocks that have been previously written to.
	 *
	 * Since VDO doesn't discard blocks currently, reusing previously written blocks makes VDO
	 * a better client of any underlying storage that is thinly-provisioned (though discarding
	 * would be better).
	 *
	 * For all other slabs, the priority is derived from the logarithm of the number of free
	 * blocks. Slabs with the same order of magnitude of free blocks have the same priority.
	 * With 2^23 blocks, the priority will range from 1 to 25. The reserved
	 * unopened_slab_priority divides the range and is skipped by the logarithmic mapping.
	 */

	if (free_blocks == 0)
		return 0;

	if (is_slab_journal_blank(slab))
		return unopened_slab_priority;

	priority = (1 + ilog2(free_blocks));
	return ((priority < unopened_slab_priority) ? priority : priority + 1);
}
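
/*
 * A hypothetical example of the mapping above: with 1024 free blocks the logarithmic
 * priority is 1 + ilog2(1024) = 11; if unopened_slab_priority were 7, the result would be
 * bumped to 12 so that the reserved value 7 stays dedicated to never-opened slabs.
 */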
/*
 * Slabs are essentially prioritized by an approximation of the number of free blocks in the slab
 * so slabs with lots of free blocks will be opened for allocation before slabs that have few free
 * blocks.
 */
static void prioritize_slab(struct vdo_slab *slab)
{
	VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
			    "a slab must not already be on a ring when prioritizing");
	slab->priority = calculate_slab_priority(slab);
	vdo_priority_table_enqueue(slab->allocator->prioritized_slabs,
				   slab->priority, &slab->allocq_entry);
}

/**
 * adjust_free_block_count() - Adjust the free block count and (if needed) reprioritize the slab.
 * @incremented: true if the free block count went up.
 */
static void adjust_free_block_count(struct vdo_slab *slab, bool incremented)
{
	struct block_allocator *allocator = slab->allocator;

	WRITE_ONCE(allocator->allocated_blocks,
		   allocator->allocated_blocks + (incremented ? -1 : 1));

	/* The open slab doesn't need to be reprioritized until it is closed. */
	if (slab == allocator->open_slab)
		return;

	/* Don't bother adjusting the priority table if unneeded. */
	if (slab->priority == calculate_slab_priority(slab))
		return;

	/*
	 * Reprioritize the slab to reflect the new free block count by removing it from the table
	 * and re-enqueuing it with the new priority.
	 */
	vdo_priority_table_remove(allocator->prioritized_slabs, &slab->allocq_entry);
	prioritize_slab(slab);
}

/**
 * increment_for_data() - Increment the reference count for a data block.
 * @slab: The slab which owns the block.
 * @block: The reference block which contains the block being updated.
 * @block_number: The block to update.
 * @old_status: The reference status of the data block before this increment.
 * @lock: The pbn_lock associated with this increment (may be NULL).
 * @counter_ptr: A pointer to the count for the data block (in, out).
 * @adjust_block_count: Whether to update the allocator's free block count.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int increment_for_data(struct vdo_slab *slab, struct reference_block *block,
			      slab_block_number block_number,
			      enum reference_status old_status,
			      struct pbn_lock *lock, vdo_refcount_t *counter_ptr,
			      bool adjust_block_count)
{
	switch (old_status) {
	case RS_FREE:
		*counter_ptr = 1;
		block->allocated_count++;
		slab->free_blocks--;
		if (adjust_block_count)
			adjust_free_block_count(slab, false);

		break;

	case RS_PROVISIONAL:
		*counter_ptr = 1;
		break;

	default:
		/* Single or shared */
		if (*counter_ptr >= MAXIMUM_REFERENCE_COUNT) {
			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
						      "Incrementing a block already having 254 references (slab %u, offset %u)",
						      slab->slab_number, block_number);
		}
		(*counter_ptr)++;
	}

	if (lock != NULL)
		vdo_unassign_pbn_lock_provisional_reference(lock);
	return VDO_SUCCESS;
}

/**
 * decrement_for_data() - Decrement the reference count for a data block.
 * @slab: The slab which owns the block.
 * @block: The reference block which contains the block being updated.
 * @block_number: The block to update.
 * @old_status: The reference status of the data block before this decrement.
 * @updater: The reference updater doing this operation in case we need to look up the pbn lock.
 * @counter_ptr: A pointer to the count for the data block (in, out).
 * @adjust_block_count: Whether to update the allocator's free block count.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int decrement_for_data(struct vdo_slab *slab, struct reference_block *block,
			      slab_block_number block_number,
			      enum reference_status old_status,
			      struct reference_updater *updater,
			      vdo_refcount_t *counter_ptr, bool adjust_block_count)
{
	switch (old_status) {
	case RS_FREE:
		return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
					      "Decrementing free block at offset %u in slab %u",
					      block_number, slab->slab_number);

	case RS_PROVISIONAL:
	case RS_SINGLE:
		if (updater->zpbn.zone != NULL) {
			struct pbn_lock *lock = vdo_get_physical_zone_pbn_lock(updater->zpbn.zone,
									       updater->zpbn.pbn);

			if (lock != NULL) {
				/*
				 * There is a read lock on this block, so the block must not become
				 * unreferenced.
				 */
				*counter_ptr = PROVISIONAL_REFERENCE_COUNT;
				vdo_assign_pbn_lock_provisional_reference(lock);
				break;
			}
		}

		*counter_ptr = EMPTY_REFERENCE_COUNT;
		block->allocated_count--;
		slab->free_blocks++;
		if (adjust_block_count)
			adjust_free_block_count(slab, true);

		break;

	default:
		(*counter_ptr)--;
	}

	return VDO_SUCCESS;
}

/**
 * increment_for_block_map() - Increment the reference count for a block map page.
 * @slab: The slab which owns the block.
 * @block: The reference block which contains the block being updated.
 * @block_number: The block to update.
 * @old_status: The reference status of the block before this increment.
 * @lock: The pbn_lock associated with this increment (may be NULL).
 * @normal_operation: Whether we are in normal operation vs. recovery or rebuild.
 * @counter_ptr: A pointer to the count for the block (in, out).
 * @adjust_block_count: Whether to update the allocator's free block count.
 *
 * All block map increments should be from provisional to MAXIMUM_REFERENCE_COUNT. Since block map
 * blocks never dedupe they should never be adjusted from any other state. The adjustment always
 * results in MAXIMUM_REFERENCE_COUNT as this value is used to prevent dedupe against block map
 * blocks.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int increment_for_block_map(struct vdo_slab *slab, struct reference_block *block,
				   slab_block_number block_number,
				   enum reference_status old_status,
				   struct pbn_lock *lock, bool normal_operation,
				   vdo_refcount_t *counter_ptr, bool adjust_block_count)
{
	switch (old_status) {
	case RS_FREE:
		if (normal_operation) {
			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
						      "Incrementing unallocated block map block (slab %u, offset %u)",
						      slab->slab_number, block_number);
		}

		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
		block->allocated_count++;
		slab->free_blocks--;
		if (adjust_block_count)
			adjust_free_block_count(slab, false);

		return VDO_SUCCESS;

	case RS_PROVISIONAL:
		if (!normal_operation)
			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
						      "Block map block had provisional reference during replay (slab %u, offset %u)",
						      slab->slab_number, block_number);

		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
		if (lock != NULL)
			vdo_unassign_pbn_lock_provisional_reference(lock);
		return VDO_SUCCESS;

	default:
		return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
					      "Incrementing a block map block which is already referenced %u times (slab %u, offset %u)",
					      *counter_ptr, slab->slab_number,
					      block_number);
	}
}

static bool __must_check is_valid_journal_point(const struct journal_point *point)
{
	return ((point != NULL) && (point->sequence_number > 0));
}
/**
 * update_reference_count() - Update the reference count of a block.
 * @slab: The slab which owns the block.
 * @block: The reference block which contains the block being updated.
 * @block_number: The block to update.
 * @slab_journal_point: The slab journal point at which this update is journaled.
 * @updater: The reference updater.
 * @normal_operation: Whether we are in normal operation vs. recovery or rebuild.
 * @adjust_block_count: Whether to update the slab's free block count.
 * @provisional_decrement_ptr: A pointer which will be set to true if this update was a decrement
 *                             of a provisional reference.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int update_reference_count(struct vdo_slab *slab, struct reference_block *block,
				  slab_block_number block_number,
				  const struct journal_point *slab_journal_point,
				  struct reference_updater *updater,
				  bool normal_operation, bool adjust_block_count,
				  bool *provisional_decrement_ptr)
{
	vdo_refcount_t *counter_ptr = &slab->counters[block_number];
	enum reference_status old_status = reference_count_to_status(*counter_ptr);
	int result;

	if (!updater->increment) {
		result = decrement_for_data(slab, block, block_number, old_status,
					    updater, counter_ptr, adjust_block_count);
		if ((result == VDO_SUCCESS) && (old_status == RS_PROVISIONAL)) {
			if (provisional_decrement_ptr != NULL)
				*provisional_decrement_ptr = true;
			return VDO_SUCCESS;
		}
	} else if (updater->operation == VDO_JOURNAL_DATA_REMAPPING) {
		result = increment_for_data(slab, block, block_number, old_status,
					    updater->lock, counter_ptr, adjust_block_count);
	} else {
		result = increment_for_block_map(slab, block, block_number, old_status,
						 updater->lock, normal_operation,
						 counter_ptr, adjust_block_count);
	}

	if (result != VDO_SUCCESS)
		return result;

	if (is_valid_journal_point(slab_journal_point))
		slab->slab_journal_point = *slab_journal_point;

	return VDO_SUCCESS;
}

static int __must_check adjust_reference_count(struct vdo_slab *slab,
					       struct reference_updater *updater,
					       const struct journal_point *slab_journal_point)
{
	slab_block_number block_number;
	int result;
	struct reference_block *block;
	bool provisional_decrement = false;

	if (!is_slab_open(slab))
		return VDO_INVALID_ADMIN_STATE;

	result = slab_block_number_from_pbn(slab, updater->zpbn.pbn, &block_number);
	if (result != VDO_SUCCESS)
		return result;

	block = get_reference_block(slab, block_number);
	result = update_reference_count(slab, block, block_number, slab_journal_point,
					updater, NORMAL_OPERATION, true,
					&provisional_decrement);
	if ((result != VDO_SUCCESS) || provisional_decrement)
		return result;

	if (block->is_dirty && (block->slab_journal_lock > 0)) {
		sequence_number_t entry_lock = slab_journal_point->sequence_number;
		/*
		 * This block is already dirty and a slab journal entry has been made for it since
		 * the last time it was clean. We must release the per-entry slab journal lock for
		 * the entry associated with the update we are now doing.
		 */
		result = VDO_ASSERT(is_valid_journal_point(slab_journal_point),
				    "Reference count adjustments need slab journal points.");
		if (result != VDO_SUCCESS)
			return result;

		adjust_slab_journal_block_reference(&slab->journal, entry_lock, -1);
		return VDO_SUCCESS;
	}

	/*
	 * This may be the first time we are applying an update for which there is a slab journal
	 * entry to this block since the block was cleaned. Therefore, we convert the per-entry
	 * slab journal lock to an uncommitted reference block lock, if there is a per-entry lock.
	 */
	if (is_valid_journal_point(slab_journal_point))
		block->slab_journal_lock = slab_journal_point->sequence_number;
	else
		block->slab_journal_lock = 0;

	dirty_block(block);
	return VDO_SUCCESS;
}
/**
 * add_entry_from_waiter() - Add an entry to the slab journal.
 * @waiter: The vio which should make an entry now.
 * @context: The slab journal to make an entry in.
 *
 * This callback is invoked by add_entries() once it has determined that we are ready to make
 * another entry in the slab journal. Implements waiter_callback_fn.
 */
static void add_entry_from_waiter(struct vdo_waiter *waiter, void *context)
{
	int result;
	struct reference_updater *updater =
		container_of(waiter, struct reference_updater, waiter);
	struct data_vio *data_vio = data_vio_from_reference_updater(updater);
	struct slab_journal *journal = context;
	struct slab_journal_block_header *header = &journal->tail_header;
	struct journal_point slab_journal_point = {
		.sequence_number = header->sequence_number,
		.entry_count = header->entry_count,
	};
	sequence_number_t recovery_block = data_vio->recovery_journal_point.sequence_number;

	if (header->entry_count == 0) {
		/*
		 * This is the first entry in the current tail block, so get a lock on the recovery
		 * journal which we will hold until this tail block is committed.
		 */
		get_lock(journal, header->sequence_number)->recovery_start = recovery_block;
		if (journal->recovery_journal != NULL) {
			zone_count_t zone_number = journal->slab->allocator->zone_number;

			vdo_acquire_recovery_journal_block_reference(journal->recovery_journal,
								     recovery_block,
								     VDO_ZONE_TYPE_PHYSICAL,
								     zone_number);
		}

		mark_slab_journal_dirty(journal, recovery_block);
		reclaim_journal_space(journal);
	}

	add_entry(journal, updater->zpbn.pbn, updater->operation, updater->increment,
		  expand_journal_point(data_vio->recovery_journal_point,
				       updater->increment));

	if (journal->slab->status != VDO_SLAB_REBUILT) {
		/*
		 * If the slab is unrecovered, scrubbing will take care of the count since the
		 * update is now recorded in the journal.
		 */
		adjust_slab_journal_block_reference(journal,
						    slab_journal_point.sequence_number, -1);
		result = VDO_SUCCESS;
	} else {
		/* Now that an entry has been made in the slab journal, update the counter. */
		result = adjust_reference_count(journal->slab, updater,
						&slab_journal_point);
	}

	if (updater->increment)
		continue_data_vio_with_error(data_vio, result);
	else
		vdo_continue_completion(&data_vio->decrement_completion, result);
}
/**
 * is_next_entry_a_block_map_increment() - Check whether the next entry to be made is a block map
 *                                         increment.
 * @journal: The journal.
 *
 * Return: true if the first entry waiter's operation is a block map increment.
 */
static inline bool is_next_entry_a_block_map_increment(struct slab_journal *journal)
{
	struct vdo_waiter *waiter = vdo_waitq_get_first_waiter(&journal->entry_waiters);
	struct reference_updater *updater =
		container_of(waiter, struct reference_updater, waiter);

	return (updater->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING);
}
/**
 * add_entries() - Add as many entries as possible from the queue of vios waiting to make entries.
 * @journal: The journal to which entries may be added.
 *
 * By processing the queue in order, we ensure that slab journal entries are made in the same order
 * as recovery journal entries for the same increment or decrement.
 */
static void add_entries(struct slab_journal *journal)
{
	if (journal->adding_entries) {
		/* Protect against re-entrancy. */
		return;
	}

	journal->adding_entries = true;
	while (vdo_waitq_has_waiters(&journal->entry_waiters)) {
		struct slab_journal_block_header *header = &journal->tail_header;

		if (journal->partial_write_in_progress ||
		    (journal->slab->status == VDO_SLAB_REBUILDING)) {
			/*
			 * Don't add entries while rebuilding or while a partial write is
			 * outstanding, as it could result in reference count corruption.
			 */
			break;
		}

		if (journal->waiting_to_commit) {
			/*
			 * If we are waiting for resources to write the tail block, and the tail
			 * block is full, we can't make another entry.
			 */
			WRITE_ONCE(journal->events->tail_busy_count,
				   journal->events->tail_busy_count + 1);
			break;
		} else if (is_next_entry_a_block_map_increment(journal) &&
			   (header->entry_count >= journal->full_entries_per_block)) {
			/*
			 * The tail block does not have room for a block map increment, so commit
			 * it now.
			 */
			commit_tail(journal);
			if (journal->waiting_to_commit) {
				WRITE_ONCE(journal->events->tail_busy_count,
					   journal->events->tail_busy_count + 1);
				break;
			}
		}

		/* If the slab is over the blocking threshold, make the vio wait. */
		if (requires_reaping(journal)) {
			WRITE_ONCE(journal->events->blocked_count,
				   journal->events->blocked_count + 1);
			save_dirty_reference_blocks(journal->slab);
			break;
		}

		if (header->entry_count == 0) {
			struct journal_lock *lock =
				get_lock(journal, header->sequence_number);

			/*
			 * Check if the on disk slab journal is full. Because of the blocking and
			 * scrubbing thresholds, this should never happen.
			 */
			if (lock->count > 0) {
				VDO_ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail,
						    "New block has locks, but journal is not full");

				/*
				 * The blocking threshold must let the journal fill up if the new
				 * block has locks; if the blocking threshold is smaller than the
				 * journal size, the new block cannot possibly have locks already.
				 */
				VDO_ASSERT_LOG_ONLY((journal->blocking_threshold >= journal->size),
						    "New block can have locks already iff blocking threshold is at the end of the journal");

				WRITE_ONCE(journal->events->disk_full_count,
					   journal->events->disk_full_count + 1);
				save_dirty_reference_blocks(journal->slab);
				break;
			}

			/*
			 * Don't allow the new block to be reaped until all of the reference count
			 * blocks are written and the journal block has been fully committed as
			 * well.
			 */
			lock->count = journal->entries_per_block + 1;

			if (header->sequence_number == 1) {
				struct vdo_slab *slab = journal->slab;
				block_count_t i;

				/*
				 * This is the first entry in this slab journal, ever. Dirty all of
				 * the reference count blocks. Each will acquire a lock on the tail
				 * block so that the journal won't be reaped until the reference
				 * counts are initialized. The lock acquisition must be done by the
				 * ref_counts since here we don't know how many reference blocks
				 * the ref_counts has.
				 */
				for (i = 0; i < slab->reference_block_count; i++) {
					slab->reference_blocks[i].slab_journal_lock = 1;
					dirty_block(&slab->reference_blocks[i]);
				}

				adjust_slab_journal_block_reference(journal, 1,
								    slab->reference_block_count);
			}
		}

		vdo_waitq_notify_next_waiter(&journal->entry_waiters,
					     add_entry_from_waiter, journal);
	}

	journal->adding_entries = false;

	/* If there are no waiters, and we are flushing or saving, commit the tail block. */
	if (vdo_is_state_draining(&journal->slab->state) &&
	    !vdo_is_state_suspending(&journal->slab->state) &&
	    !vdo_waitq_has_waiters(&journal->entry_waiters))
		commit_tail(journal);
}
/**
 * reset_search_cursor() - Reset the free block search back to the first reference counter in the
 *                         first reference block of a slab.
 */
static void reset_search_cursor(struct vdo_slab *slab)
{
	struct search_cursor *cursor = &slab->search_cursor;

	cursor->block = cursor->first_block;
	cursor->index = 0;
	/* Unit tests have slabs with only one reference block (and it's a runt). */
	cursor->end_index = min_t(u32, COUNTS_PER_BLOCK, slab->block_count);
}

/**
 * advance_search_cursor() - Advance the search cursor to the start of the next reference block in
 *                           a slab.
 *
 * Wraps around to the first reference block if the current block is the last reference block.
 *
 * Return: true unless the cursor was at the last reference block.
 */
static bool advance_search_cursor(struct vdo_slab *slab)
{
	struct search_cursor *cursor = &slab->search_cursor;

	/*
	 * If we just finished searching the last reference block, then wrap back around to the
	 * start of the array.
	 */
	if (cursor->block == cursor->last_block) {
		reset_search_cursor(slab);
		return false;
	}

	/* We're not already at the end, so advance the cursor to the next block. */
	cursor->block++;
	cursor->index = cursor->end_index;

	if (cursor->block == cursor->last_block) {
		/* The last reference block will usually be a runt. */
		cursor->end_index = slab->block_count;
	} else {
		cursor->end_index += COUNTS_PER_BLOCK;
	}

	return true;
}
/**
 * vdo_adjust_reference_count_for_rebuild() - Adjust the reference count of a block during rebuild.
 *
 * Return: VDO_SUCCESS or an error.
 */
int vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
					   physical_block_number_t pbn,
					   enum journal_operation operation)
{
	int result;
	slab_block_number block_number;
	struct reference_block *block;
	struct vdo_slab *slab = vdo_get_slab(depot, pbn);
	struct reference_updater updater = {
		.operation = operation,
		.increment = true,
	};

	result = slab_block_number_from_pbn(slab, pbn, &block_number);
	if (result != VDO_SUCCESS)
		return result;

	block = get_reference_block(slab, block_number);
	result = update_reference_count(slab, block, block_number, NULL,
					&updater, !NORMAL_OPERATION, false, NULL);
	if (result != VDO_SUCCESS)
		return result;

	dirty_block(block);
	return VDO_SUCCESS;
}
/**
 * replay_reference_count_change() - Replay the reference count adjustment from a slab journal
 *                                   entry into the reference count for a block.
 * @slab: The slab.
 * @entry_point: The slab journal point for the entry.
 * @entry: The slab journal entry being replayed.
 *
 * The adjustment will be ignored if it was already recorded in the reference count.
 *
 * Return: VDO_SUCCESS or an error code.
 */
static int replay_reference_count_change(struct vdo_slab *slab,
					 const struct journal_point *entry_point,
					 struct slab_journal_entry entry)
{
	int result;
	struct reference_block *block = get_reference_block(slab, entry.sbn);
	sector_count_t sector = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR;
	struct reference_updater updater = {
		.operation = entry.operation,
		.increment = entry.increment,
	};

	if (!vdo_before_journal_point(&block->commit_points[sector], entry_point)) {
		/* This entry is already reflected in the existing counts, so do nothing. */
		return VDO_SUCCESS;
	}

	/* This entry is not yet counted in the reference counts. */
	result = update_reference_count(slab, block, entry.sbn, entry_point,
					&updater, !NORMAL_OPERATION, false, NULL);
	if (result != VDO_SUCCESS)
		return result;

	dirty_block(block);
	return VDO_SUCCESS;
}
/**
 * find_zero_byte_in_word() - Find the array index of the first zero byte in word-sized range of
 *                            reference counters.
 * @word_ptr: A pointer to the eight counter bytes to check.
 * @start_index: The array index corresponding to word_ptr[0].
 * @fail_index: The array index to return if no zero byte is found.
 *
 * The search does no bounds checking; the function relies on the array being sufficiently padded.
 *
 * Return: The array index of the first zero byte in the word, or the value passed as fail_index if
 *         no zero byte was found.
 */
static inline slab_block_number find_zero_byte_in_word(const u8 *word_ptr,
						       slab_block_number start_index,
						       slab_block_number fail_index)
{
	u64 word = get_unaligned_le64(word_ptr);

	/* This looks like a loop, but GCC will unroll the eight iterations for us. */
	unsigned int offset;

	for (offset = 0; offset < BYTES_PER_WORD; offset++) {
		/* Assumes little-endian byte order, which we have on X86. */
		if ((word & 0xFF) == 0)
			return (start_index + offset);
		word >>= 8;
	}

	return fail_index;
}
/**
 * find_free_block() - Find the first block with a reference count of zero in the specified
 *                     range of reference counter indexes.
 * @slab: The slab counters to scan.
 * @index_ptr: A pointer to hold the array index of the free block.
 *
 * Exposed for unit testing.
 *
 * Return: true if a free block was found in the specified range.
 */
static bool find_free_block(const struct vdo_slab *slab, slab_block_number *index_ptr)
{
	slab_block_number zero_index;
	slab_block_number next_index = slab->search_cursor.index;
	slab_block_number end_index = slab->search_cursor.end_index;
	u8 *next_counter = &slab->counters[next_index];
	u8 *end_counter = &slab->counters[end_index];

	/*
	 * Search every byte of the first unaligned word. (Array is padded so reading past end is
	 * safe.)
	 */
	zero_index = find_zero_byte_in_word(next_counter, next_index, end_index);
	if (zero_index < end_index) {
		*index_ptr = zero_index;
		return true;
	}

	/*
	 * On architectures where unaligned word access is expensive, this would be a good place to
	 * advance to an alignment boundary.
	 */
	next_index += BYTES_PER_WORD;
	next_counter += BYTES_PER_WORD;

	/*
	 * Now we're word-aligned; check a word at a time until we find a word containing a zero.
	 * (Array is padded so reading past end is safe.)
	 */
	while (next_counter < end_counter) {
		/*
		 * The following code is currently an exact copy of the code preceding the loop,
		 * but if you try to merge them by using a do loop, it runs slower because a jump
		 * instruction gets added at the start of the iteration.
		 */
		zero_index = find_zero_byte_in_word(next_counter, next_index, end_index);
		if (zero_index < end_index) {
			*index_ptr = zero_index;
			return true;
		}

		next_index += BYTES_PER_WORD;
		next_counter += BYTES_PER_WORD;
	}

	return false;
}
/**
 * search_current_reference_block() - Search the reference block currently saved in the search
 *                                    cursor for a reference count of zero, starting at the saved
 *                                    counter index.
 * @slab: The slab to search.
 * @free_index_ptr: A pointer to receive the array index of the zero reference count.
 *
 * Return: true if an unreferenced counter was found.
 */
static bool search_current_reference_block(const struct vdo_slab *slab,
					   slab_block_number *free_index_ptr)
{
	/* Don't bother searching if the current block is known to be full. */
	return ((slab->search_cursor.block->allocated_count < COUNTS_PER_BLOCK) &&
		find_free_block(slab, free_index_ptr));
}

/**
 * search_reference_blocks() - Search each reference block for a reference count of zero.
 * @slab: The slab to search.
 * @free_index_ptr: A pointer to receive the array index of the zero reference count.
 *
 * Searches each reference block for a reference count of zero, starting at the reference block and
 * counter index saved in the search cursor and searching up to the end of the last reference
 * block. The search does not wrap.
 *
 * Return: true if an unreferenced counter was found.
 */
static bool search_reference_blocks(struct vdo_slab *slab,
				    slab_block_number *free_index_ptr)
{
	/* Start searching at the saved search position in the current block. */
	if (search_current_reference_block(slab, free_index_ptr))
		return true;

	/* Search each reference block up to the end of the slab. */
	while (advance_search_cursor(slab)) {
		if (search_current_reference_block(slab, free_index_ptr))
			return true;
	}

	return false;
}
/**
 * make_provisional_reference() - Do the bookkeeping for making a provisional reference.
 */
static void make_provisional_reference(struct vdo_slab *slab,
				       slab_block_number block_number)
{
	struct reference_block *block = get_reference_block(slab, block_number);

	/*
	 * Make the initial transition from an unreferenced block to a
	 * provisionally allocated block.
	 */
	slab->counters[block_number] = PROVISIONAL_REFERENCE_COUNT;

	/* Account for the allocation. */
	block->allocated_count++;
	slab->free_blocks--;
}

/**
 * dirty_all_reference_blocks() - Mark all reference count blocks in a slab as dirty.
 */
static void dirty_all_reference_blocks(struct vdo_slab *slab)
{
	block_count_t i;

	for (i = 0; i < slab->reference_block_count; i++)
		dirty_block(&slab->reference_blocks[i]);
}

/**
 * clear_provisional_references() - Clear the provisional reference counts from a reference block.
 * @block: The block to clear.
 */
static void clear_provisional_references(struct reference_block *block)
{
	vdo_refcount_t *counters = get_reference_counters_for_block(block);
	block_count_t j;

	for (j = 0; j < COUNTS_PER_BLOCK; j++) {
		if (counters[j] == PROVISIONAL_REFERENCE_COUNT) {
			counters[j] = EMPTY_REFERENCE_COUNT;
			block->allocated_count--;
		}
	}
}

static inline bool journal_points_equal(struct journal_point first,
					struct journal_point second)
{
	return ((first.sequence_number == second.sequence_number) &&
		(first.entry_count == second.entry_count));
}
/**
 * unpack_reference_block() - Unpack reference counts blocks into the internal memory structure.
 * @packed: The written reference block to be unpacked.
 * @block: The internal reference block to be loaded.
 */
static void unpack_reference_block(struct packed_reference_block *packed,
				   struct reference_block *block)
{
	block_count_t index;
	sector_count_t i;
	struct vdo_slab *slab = block->slab;
	vdo_refcount_t *counters = get_reference_counters_for_block(block);

	for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) {
		struct packed_reference_sector *sector = &packed->sectors[i];

		vdo_unpack_journal_point(&sector->commit_point, &block->commit_points[i]);
		memcpy(counters + (i * COUNTS_PER_SECTOR), sector->counts,
		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));
		/* The slab_journal_point must be the latest point found in any sector. */
		if (vdo_before_journal_point(&slab->slab_journal_point,
					     &block->commit_points[i]))
			slab->slab_journal_point = block->commit_points[i];

		if ((i > 0) &&
		    !journal_points_equal(block->commit_points[0],
					  block->commit_points[i])) {
			size_t block_index = block - block->slab->reference_blocks;

			vdo_log_warning("Torn write detected in sector %u of reference block %zu of slab %u",
					i, block_index, block->slab->slab_number);
		}
	}

	block->allocated_count = 0;
	for (index = 0; index < COUNTS_PER_BLOCK; index++) {
		if (counters[index] != EMPTY_REFERENCE_COUNT)
			block->allocated_count++;
	}
}
/**
 * finish_reference_block_load() - After a reference block has been read, unpack it.
 * @completion: The VIO that just finished reading.
 */
static void finish_reference_block_load(struct vdo_completion *completion)
{
	struct vio *vio = as_vio(completion);
	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
	struct reference_block *block = completion->parent;
	struct vdo_slab *slab = block->slab;

	unpack_reference_block((struct packed_reference_block *) vio->data, block);
	return_vio_to_pool(slab->allocator->vio_pool, pooled);
	slab->active_count--;
	clear_provisional_references(block);

	slab->free_blocks -= block->allocated_count;
	check_if_slab_drained(slab);
}
static void load_reference_block_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct reference_block *block = vio->completion.parent;

	continue_vio_after_io(vio, finish_reference_block_load,
			      block->slab->allocator->thread_id);
}

/**
 * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the
 *                          block.
 * @waiter: The waiter of the block to load.
 * @context: The VIO returned by the pool.
 */
static void load_reference_block(struct vdo_waiter *waiter, void *context)
{
	struct pooled_vio *pooled = context;
	struct vio *vio = &pooled->vio;
	struct reference_block *block =
		container_of(waiter, struct reference_block, waiter);
	size_t block_offset = (block - block->slab->reference_blocks);

	vio->completion.parent = block;
	vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset,
				load_reference_block_endio, handle_io_error,
				REQ_OP_READ);
}
/**
 * load_reference_blocks() - Load a slab's reference blocks from the underlying storage into a
 *                           pre-allocated reference counter.
 */
static void load_reference_blocks(struct vdo_slab *slab)
{
	block_count_t i;

	slab->free_blocks = slab->block_count;
	slab->active_count = slab->reference_block_count;
	for (i = 0; i < slab->reference_block_count; i++) {
		struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter;

		waiter->callback = load_reference_block;
		acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
	}
}
/**
 * drain_slab() - Drain all reference count I/O.
 *
 * Depending upon the type of drain being performed (as recorded in the ref_count's vdo_slab), the
 * reference blocks may be loaded from disk or dirty reference blocks may be written out.
 */
static void drain_slab(struct vdo_slab *slab)
{
	bool save;
	bool load;
	const struct admin_state_code *state = vdo_get_admin_state_code(&slab->state);

	if (state == VDO_ADMIN_STATE_SUSPENDING)
		return;

	if ((state != VDO_ADMIN_STATE_REBUILDING) &&
	    (state != VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING))
		commit_tail(&slab->journal);

	if ((state == VDO_ADMIN_STATE_RECOVERING) || (slab->counters == NULL))
		return;

	save = false;
	load = slab->allocator->summary_entries[slab->slab_number].load_ref_counts;
	if (state == VDO_ADMIN_STATE_SCRUBBING) {
		if (load) {
			load_reference_blocks(slab);
			return;
		}
	} else if (state == VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING) {
		if (!load) {
			/* These reference counts were never written, so mark them all dirty. */
			dirty_all_reference_blocks(slab);
		}
		save = true;
	} else if (state == VDO_ADMIN_STATE_REBUILDING) {
		/*
		 * Write out the counters if the slab has written them before, or it has any
		 * non-zero reference counts, or there are any slab journal blocks.
		 */
		block_count_t data_blocks = slab->allocator->depot->slab_config.data_blocks;

		if (load || (slab->free_blocks != data_blocks) ||
		    !is_slab_journal_blank(slab)) {
			dirty_all_reference_blocks(slab);
			save = true;
		}
	} else if (state == VDO_ADMIN_STATE_SAVING) {
		save = (slab->status == VDO_SLAB_REBUILT);
	} else {
		vdo_finish_draining_with_result(&slab->state, VDO_SUCCESS);
		return;
	}

	if (save)
		save_dirty_reference_blocks(slab);
}
static int allocate_slab_counters(struct vdo_slab *slab)
{
	int result;
	size_t index, bytes;

	result = VDO_ASSERT(slab->reference_blocks == NULL,
			    "vdo_slab %u doesn't allocate refcounts twice",
			    slab->slab_number);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(slab->reference_block_count, struct reference_block,
			      __func__, &slab->reference_blocks);
	if (result != VDO_SUCCESS)
		return result;

	/*
	 * Allocate such that the runt slab has a full-length memory array, plus a little padding
	 * so we can word-search even at the very end.
	 */
	bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD);
	result = vdo_allocate(bytes, vdo_refcount_t, "ref counts array",
			      &slab->counters);
	if (result != VDO_SUCCESS) {
		vdo_free(vdo_forget(slab->reference_blocks));
		return result;
	}

	slab->search_cursor.first_block = slab->reference_blocks;
	slab->search_cursor.last_block = &slab->reference_blocks[slab->reference_block_count - 1];
	reset_search_cursor(slab);

	for (index = 0; index < slab->reference_block_count; index++) {
		slab->reference_blocks[index] = (struct reference_block) {
			.slab = slab,
		};
	}

	return VDO_SUCCESS;
}

static int allocate_counters_if_clean(struct vdo_slab *slab)
{
	if (vdo_is_state_clean_load(&slab->state))
		return allocate_slab_counters(slab);

	return VDO_SUCCESS;
}
static void finish_loading_journal(struct vdo_completion *completion)
{
	struct vio *vio = as_vio(completion);
	struct slab_journal *journal = completion->parent;
	struct vdo_slab *slab = journal->slab;
	struct packed_slab_journal_block *block = (struct packed_slab_journal_block *) vio->data;
	struct slab_journal_block_header header;

	vdo_unpack_slab_journal_block_header(&block->header, &header);

	/* FIXME: should it be an error if the following conditional fails? */
	if ((header.metadata_type == VDO_METADATA_SLAB_JOURNAL) &&
	    (header.nonce == slab->allocator->nonce)) {
		journal->tail = header.sequence_number + 1;

		/*
		 * If the slab is clean, this implies the slab journal is empty, so advance the
		 * head appropriately.
		 */
		journal->head = (slab->allocator->summary_entries[slab->slab_number].is_dirty ?
				 header.head : journal->tail);
		journal->tail_header = header;
		initialize_journal_state(journal);
	}

	return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
	vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab));
}

static void read_slab_journal_tail_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct slab_journal *journal = vio->completion.parent;

	continue_vio_after_io(vio, finish_loading_journal,
			      journal->slab->allocator->thread_id);
}

static void handle_load_error(struct vdo_completion *completion)
{
	int result = completion->result;
	struct slab_journal *journal = completion->parent;
	struct vio *vio = as_vio(completion);

	vio_record_metadata_io_error(vio);
	return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio));
	vdo_finish_loading_with_result(&journal->slab->state, result);
}
/**
 * read_slab_journal_tail() - Read the slab journal tail block by using a vio acquired from the vio
 *                            pool.
 * @waiter: The vio pool waiter which has just been notified.
 * @context: The vio pool entry given to the waiter.
 *
 * This is the success callback from acquire_vio_from_pool() when loading a slab journal.
 */
static void read_slab_journal_tail(struct vdo_waiter *waiter, void *context)
{
	struct slab_journal *journal =
		container_of(waiter, struct slab_journal, resource_waiter);
	struct vdo_slab *slab = journal->slab;
	struct pooled_vio *pooled = context;
	struct vio *vio = &pooled->vio;
	tail_block_offset_t last_commit_point =
		slab->allocator->summary_entries[slab->slab_number].tail_block_offset;

	/*
	 * Slab summary keeps the commit point offset, so the tail block is the block before that.
	 * Calculation supports small journals in unit tests.
	 */
	tail_block_offset_t tail_block = ((last_commit_point == 0) ?
					  (tail_block_offset_t)(journal->size - 1) :
					  (last_commit_point - 1));

	vio->completion.parent = journal;
	vio->completion.callback_thread_id = slab->allocator->thread_id;
	vdo_submit_metadata_vio(vio, slab->journal_origin + tail_block,
				read_slab_journal_tail_endio, handle_load_error,
				REQ_OP_READ);
}

/**
 * load_slab_journal() - Load a slab's journal by reading the journal's tail.
 */
static void load_slab_journal(struct vdo_slab *slab)
{
	struct slab_journal *journal = &slab->journal;
	tail_block_offset_t last_commit_point;

	last_commit_point = slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
	if ((last_commit_point == 0) &&
	    !slab->allocator->summary_entries[slab->slab_number].load_ref_counts) {
		/*
		 * This slab claims that it has a tail block at (journal->size - 1), but a head of
		 * 1. This is impossible, due to the scrubbing threshold, on a real system, so
		 * don't bother reading the (bogus) data off disk.
		 */
		VDO_ASSERT_LOG_ONLY(((journal->size < 16) ||
				     (journal->scrubbing_threshold < (journal->size - 1))),
				    "Scrubbing threshold protects against reads of unwritten slab journal blocks");
		vdo_finish_loading_with_result(&slab->state,
					       allocate_counters_if_clean(slab));
		return;
	}

	journal->resource_waiter.callback = read_slab_journal_tail;
	acquire_vio_from_pool(slab->allocator->vio_pool, &journal->resource_waiter);
}
static void register_slab_for_scrubbing(struct vdo_slab *slab, bool high_priority)
{
	struct slab_scrubber *scrubber = &slab->allocator->scrubber;

	VDO_ASSERT_LOG_ONLY((slab->status != VDO_SLAB_REBUILT),
			    "slab to be scrubbed is unrecovered");

	if (slab->status != VDO_SLAB_REQUIRES_SCRUBBING)
		return;

	list_del_init(&slab->allocq_entry);
	if (!slab->was_queued_for_scrubbing) {
		WRITE_ONCE(scrubber->slab_count, scrubber->slab_count + 1);
		slab->was_queued_for_scrubbing = true;
	}

	if (high_priority) {
		slab->status = VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING;
		list_add_tail(&slab->allocq_entry, &scrubber->high_priority_slabs);
		return;
	}

	list_add_tail(&slab->allocq_entry, &scrubber->slabs);
}
/* Queue a slab for allocation or scrubbing. */
static void queue_slab(struct vdo_slab *slab)
{
	struct block_allocator *allocator = slab->allocator;
	block_count_t free_blocks;
	int result;

	VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
			    "a requeued slab must not already be on a ring");

	if (vdo_is_read_only(allocator->depot->vdo))
		return;

	free_blocks = slab->free_blocks;
	result = VDO_ASSERT((free_blocks <= allocator->depot->slab_config.data_blocks),
			    "rebuilt slab %u must have a valid free block count (has %llu, expected maximum %llu)",
			    slab->slab_number, (unsigned long long) free_blocks,
			    (unsigned long long) allocator->depot->slab_config.data_blocks);
	if (result != VDO_SUCCESS) {
		vdo_enter_read_only_mode(allocator->depot->vdo, result);
		return;
	}

	if (slab->status != VDO_SLAB_REBUILT) {
		register_slab_for_scrubbing(slab, false);
		return;
	}

	if (!vdo_is_state_resuming(&slab->state)) {
		/*
		 * If the slab is resuming, we've already accounted for it here, so don't do it
		 * again.
		 * FIXME: under what situation would the slab be resuming here?
		 */
		WRITE_ONCE(allocator->allocated_blocks,
			   allocator->allocated_blocks - free_blocks);
		if (!is_slab_journal_blank(slab)) {
			WRITE_ONCE(allocator->statistics.slabs_opened,
				   allocator->statistics.slabs_opened + 1);
		}
	}

	if (allocator->depot->vdo->suspend_type == VDO_ADMIN_STATE_SAVING)
		reopen_slab_journal(slab);

	prioritize_slab(slab);
}
/**
 * initiate_slab_action() - Initiate a slab action.
 *
 * Implements vdo_admin_initiator_fn.
 */
static void initiate_slab_action(struct admin_state *state)
{
	struct vdo_slab *slab = container_of(state, struct vdo_slab, state);

	if (vdo_is_state_draining(state)) {
		const struct admin_state_code *operation = vdo_get_admin_state_code(state);

		if (operation == VDO_ADMIN_STATE_SCRUBBING)
			slab->status = VDO_SLAB_REBUILDING;

		drain_slab(slab);
		check_if_slab_drained(slab);
		return;
	}

	if (vdo_is_state_loading(state)) {
		load_slab_journal(slab);
		return;
	}

	if (vdo_is_state_resuming(state)) {
		queue_slab(slab);
		vdo_finish_resuming(state);
		return;
	}

	vdo_finish_operation(state, VDO_INVALID_ADMIN_STATE);
}
/**
 * get_next_slab() - Get the next slab to scrub.
 * @scrubber: The slab scrubber.
 *
 * Return: The next slab to scrub or NULL if there are none.
 */
static struct vdo_slab *get_next_slab(struct slab_scrubber *scrubber)
{
	struct vdo_slab *slab;

	slab = list_first_entry_or_null(&scrubber->high_priority_slabs,
					struct vdo_slab, allocq_entry);
	if (slab != NULL)
		return slab;

	return list_first_entry_or_null(&scrubber->slabs, struct vdo_slab,
					allocq_entry);
}

/**
 * has_slabs_to_scrub() - Check whether a scrubber has slabs to scrub.
 * @scrubber: The scrubber to check.
 *
 * Return: true if the scrubber has slabs to scrub.
 */
static inline bool __must_check has_slabs_to_scrub(struct slab_scrubber *scrubber)
{
	return (get_next_slab(scrubber) != NULL);
}

/**
 * uninitialize_scrubber_vio() - Clean up the slab_scrubber's vio.
 * @scrubber: The scrubber.
 */
static void uninitialize_scrubber_vio(struct slab_scrubber *scrubber)
{
	vdo_free(vdo_forget(scrubber->vio.data));
	free_vio_components(&scrubber->vio);
}
/**
 * finish_scrubbing() - Stop scrubbing, either because there are no more slabs to scrub or because
 *                      there's been an error.
 * @scrubber: The scrubber.
 */
static void finish_scrubbing(struct slab_scrubber *scrubber, int result)
{
	bool notify = vdo_waitq_has_waiters(&scrubber->waiters);
	bool done = !has_slabs_to_scrub(scrubber);
	struct block_allocator *allocator =
		container_of(scrubber, struct block_allocator, scrubber);

	if (done)
		uninitialize_scrubber_vio(scrubber);

	if (scrubber->high_priority_only) {
		scrubber->high_priority_only = false;
		vdo_fail_completion(vdo_forget(scrubber->vio.completion.parent), result);
	} else if (done && (atomic_add_return(-1, &allocator->depot->zones_to_scrub) == 0)) {
		/* All of our slabs were scrubbed, and we're the last allocator to finish. */
		enum vdo_state prior_state =
			atomic_cmpxchg(&allocator->depot->vdo->state, VDO_RECOVERING,
				       VDO_DIRTY);

		/*
		 * To be safe, even if the CAS failed, ensure anything that follows is ordered with
		 * respect to whatever state change did happen.
		 */
		smp_mb__after_atomic();

		/*
		 * We must check the VDO state here and not the depot's read_only_notifier since
		 * the compare-swap-above could have failed due to a read-only entry which our own
		 * thread does not yet know about.
		 */
		if (prior_state == VDO_DIRTY)
			vdo_log_info("VDO commencing normal operation");
		else if (prior_state == VDO_RECOVERING)
			vdo_log_info("Exiting recovery mode");
	}

	/*
	 * Note that the scrubber has stopped, and inform anyone who might be waiting for that to
	 * happen.
	 */
	if (!vdo_finish_draining(&scrubber->admin_state))
		WRITE_ONCE(scrubber->admin_state.current_state,
			   VDO_ADMIN_STATE_SUSPENDED);

	/*
	 * We can't notify waiters until after we've finished draining or they'll just requeue.
	 * Fortunately if there were waiters, we can't have been freed yet.
	 */
	if (notify)
		vdo_waitq_notify_all_waiters(&scrubber->waiters, NULL, NULL);
}
static void scrub_next_slab(struct slab_scrubber *scrubber);

/**
 * slab_scrubbed() - Notify the scrubber that a slab has been scrubbed.
 * @completion: The slab rebuild completion.
 *
 * This callback is registered in apply_journal_entries().
 */
static void slab_scrubbed(struct vdo_completion *completion)
{
	struct slab_scrubber *scrubber =
		container_of(as_vio(completion), struct slab_scrubber, vio);
	struct vdo_slab *slab = scrubber->slab;

	slab->status = VDO_SLAB_REBUILT;
	queue_slab(slab);
	reopen_slab_journal(slab);
	WRITE_ONCE(scrubber->slab_count, scrubber->slab_count - 1);
	scrub_next_slab(scrubber);
}

/**
 * abort_scrubbing() - Abort scrubbing due to an error.
 * @scrubber: The slab scrubber.
 * @result: The error.
 */
static void abort_scrubbing(struct slab_scrubber *scrubber, int result)
{
	vdo_enter_read_only_mode(scrubber->vio.completion.vdo, result);
	finish_scrubbing(scrubber, result);
}

/**
 * handle_scrubber_error() - Handle errors while rebuilding a slab.
 * @completion: The slab rebuild completion.
 */
static void handle_scrubber_error(struct vdo_completion *completion)
{
	struct vio *vio = as_vio(completion);

	vio_record_metadata_io_error(vio);
	abort_scrubbing(container_of(vio, struct slab_scrubber, vio),
			completion->result);
}
/**
 * apply_block_entries() - Apply all the entries in a block to the reference counts.
 * @block: A block with entries to apply.
 * @entry_count: The number of entries to apply.
 * @block_number: The sequence number of the block.
 * @slab: The slab to apply the entries to.
 *
 * Return: VDO_SUCCESS or an error code.
 */
static int apply_block_entries(struct packed_slab_journal_block *block,
			       journal_entry_count_t entry_count,
			       sequence_number_t block_number, struct vdo_slab *slab)
{
	struct journal_point entry_point = {
		.sequence_number = block_number,
		.entry_count = 0,
	};
	int result;
	slab_block_number max_sbn = slab->end - slab->start;

	while (entry_point.entry_count < entry_count) {
		struct slab_journal_entry entry =
			vdo_decode_slab_journal_entry(block, entry_point.entry_count);

		if (entry.sbn > max_sbn) {
			/* This entry is out of bounds. */
			return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
						      "vdo_slab journal entry (%llu, %u) had invalid offset %u in slab (size %u blocks)",
						      (unsigned long long) block_number,
						      entry_point.entry_count,
						      entry.sbn, max_sbn);
		}

		result = replay_reference_count_change(slab, &entry_point, entry);
		if (result != VDO_SUCCESS) {
			vdo_log_error_strerror(result,
					       "vdo_slab journal entry (%llu, %u) (%s of offset %u) could not be applied in slab %u",
					       (unsigned long long) block_number,
					       entry_point.entry_count,
					       vdo_get_journal_operation_name(entry.operation),
					       entry.sbn, slab->slab_number);
			return result;
		}
		entry_point.entry_count++;
	}

	return VDO_SUCCESS;
}
/**
 * apply_journal_entries() - Find the relevant vio of the slab journal and apply all valid entries.
 * @completion: The metadata read vio completion.
 *
 * This is a callback registered in start_scrubbing().
 */
static void apply_journal_entries(struct vdo_completion *completion)
{
	int result;
	struct slab_scrubber *scrubber =
		container_of(as_vio(completion), struct slab_scrubber, vio);
	struct vdo_slab *slab = scrubber->slab;
	struct slab_journal *journal = &slab->journal;

	/* Find the boundaries of the useful part of the journal. */
	sequence_number_t tail = journal->tail;
	tail_block_offset_t end_index = (tail - 1) % journal->size;
	char *end_data = scrubber->vio.data + (end_index * VDO_BLOCK_SIZE);
	struct packed_slab_journal_block *end_block =
		(struct packed_slab_journal_block *) end_data;

	sequence_number_t head = __le64_to_cpu(end_block->header.head);
	tail_block_offset_t head_index = head % journal->size;
	block_count_t index = head_index;

	struct journal_point ref_counts_point = slab->slab_journal_point;
	struct journal_point last_entry_applied = ref_counts_point;
	sequence_number_t sequence;

	for (sequence = head; sequence < tail; sequence++) {
		char *block_data = scrubber->vio.data + (index * VDO_BLOCK_SIZE);
		struct packed_slab_journal_block *block =
			(struct packed_slab_journal_block *) block_data;
		struct slab_journal_block_header header;

		vdo_unpack_slab_journal_block_header(&block->header, &header);

		if ((header.nonce != slab->allocator->nonce) ||
		    (header.metadata_type != VDO_METADATA_SLAB_JOURNAL) ||
		    (header.sequence_number != sequence) ||
		    (header.entry_count > journal->entries_per_block) ||
		    (header.has_block_map_increments &&
		     (header.entry_count > journal->full_entries_per_block))) {
			/* The block is not what we expect it to be. */
			vdo_log_error("vdo_slab journal block for slab %u was invalid",
				      slab->slab_number);
			abort_scrubbing(scrubber, VDO_CORRUPT_JOURNAL);
			return;
		}

		result = apply_block_entries(block, header.entry_count, sequence, slab);
		if (result != VDO_SUCCESS) {
			abort_scrubbing(scrubber, result);
			return;
		}

		last_entry_applied.sequence_number = sequence;
		last_entry_applied.entry_count = header.entry_count - 1;
		index++;
		if (index == journal->size)
			index = 0;
	}

	/*
	 * At the end of rebuild, the reference counters should be accurate to the end of the
	 * journal we just applied.
	 */
	result = VDO_ASSERT(!vdo_before_journal_point(&last_entry_applied,
						      &ref_counts_point),
			    "Refcounts are not more accurate than the slab journal");
	if (result != VDO_SUCCESS) {
		abort_scrubbing(scrubber, result);
		return;
	}

	/* Save out the rebuilt reference blocks. */
	vdo_prepare_completion(completion, slab_scrubbed, handle_scrubber_error,
			       slab->allocator->thread_id, completion->parent);
	vdo_start_operation_with_waiter(&slab->state,
					VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING,
					completion, initiate_slab_action);
}
static void read_slab_journal_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct slab_scrubber *scrubber = container_of(vio, struct slab_scrubber, vio);

	continue_vio_after_io(bio->bi_private, apply_journal_entries,
			      scrubber->slab->allocator->thread_id);
}

/**
 * start_scrubbing() - Read the current slab's journal from disk now that it has been flushed.
 * @completion: The scrubber's vio completion.
 *
 * This callback is registered in scrub_next_slab().
 */
static void start_scrubbing(struct vdo_completion *completion)
{
	struct slab_scrubber *scrubber =
		container_of(as_vio(completion), struct slab_scrubber, vio);
	struct vdo_slab *slab = scrubber->slab;

	if (!slab->allocator->summary_entries[slab->slab_number].is_dirty) {
		slab_scrubbed(completion);
		return;
	}

	vdo_submit_metadata_vio(&scrubber->vio, slab->journal_origin,
				read_slab_journal_endio, handle_scrubber_error,
				REQ_OP_READ);
}
/**
 * scrub_next_slab() - Scrub the next slab if there is one.
 * @scrubber: The scrubber.
 */
static void scrub_next_slab(struct slab_scrubber *scrubber)
{
	struct vdo_completion *completion = &scrubber->vio.completion;
	struct vdo_slab *slab;

	/*
	 * Note: this notify call is always safe only because scrubbing can only be started when
	 * the VDO is quiescent.
	 */
	vdo_waitq_notify_all_waiters(&scrubber->waiters, NULL, NULL);

	if (vdo_is_read_only(completion->vdo)) {
		finish_scrubbing(scrubber, VDO_READ_ONLY);
		return;
	}

	slab = get_next_slab(scrubber);
	if ((slab == NULL) ||
	    (scrubber->high_priority_only && list_empty(&scrubber->high_priority_slabs))) {
		finish_scrubbing(scrubber, VDO_SUCCESS);
		return;
	}

	if (vdo_finish_draining(&scrubber->admin_state))
		return;

	list_del_init(&slab->allocq_entry);
	scrubber->slab = slab;
	vdo_prepare_completion(completion, start_scrubbing, handle_scrubber_error,
			       slab->allocator->thread_id, completion->parent);
	vdo_start_operation_with_waiter(&slab->state, VDO_ADMIN_STATE_SCRUBBING,
					completion, initiate_slab_action);
}
/**
 * scrub_slabs() - Scrub all of an allocator's slabs that are eligible for scrubbing.
 * @allocator: The block_allocator to scrub.
 * @parent: The completion to notify when scrubbing is done, implies high_priority, may be NULL.
 */
static void scrub_slabs(struct block_allocator *allocator, struct vdo_completion *parent)
{
	struct slab_scrubber *scrubber = &allocator->scrubber;

	scrubber->vio.completion.parent = parent;
	scrubber->high_priority_only = (parent != NULL);
	if (!has_slabs_to_scrub(scrubber)) {
		finish_scrubbing(scrubber, VDO_SUCCESS);
		return;
	}

	if (scrubber->high_priority_only &&
	    vdo_is_priority_table_empty(allocator->prioritized_slabs) &&
	    list_empty(&scrubber->high_priority_slabs))
		register_slab_for_scrubbing(get_next_slab(scrubber), true);

	vdo_resume_if_quiescent(&scrubber->admin_state);
	scrub_next_slab(scrubber);
}
static inline void assert_on_allocator_thread(thread_id_t thread_id,
					      const char *function_name)
{
	VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == thread_id),
			    "%s called on correct thread", function_name);
}

static void register_slab_with_allocator(struct block_allocator *allocator,
					 struct vdo_slab *slab)
{
	allocator->slab_count++;
	allocator->last_slab = slab->slab_number;
}

/**
 * get_depot_slab_iterator() - Return a slab_iterator over the slabs in a slab_depot.
 * @depot: The depot over which to iterate.
 * @start: The number of the slab to start iterating from.
 * @end: The number of the last slab which may be returned.
 * @stride: The difference in slab number between successive slabs.
 *
 * Iteration always occurs from higher to lower numbered slabs.
 *
 * Return: An initialized iterator structure.
 */
static struct slab_iterator get_depot_slab_iterator(struct slab_depot *depot,
						    slab_count_t start, slab_count_t end,
						    slab_count_t stride)
{
	struct vdo_slab **slabs = depot->slabs;

	return (struct slab_iterator) {
		.slabs = slabs,
		.next = (((slabs == NULL) || (start < end)) ? NULL : slabs[start]),
		.end = end,
		.stride = stride,
	};
}

static struct slab_iterator get_slab_iterator(const struct block_allocator *allocator)
{
	return get_depot_slab_iterator(allocator->depot, allocator->last_slab,
				       allocator->zone_number,
				       allocator->depot->zone_count);
}
/**
 * next_slab() - Get the next slab from a slab_iterator and advance the iterator.
 * @iterator: The slab_iterator.
 *
 * Return: The next slab or NULL if the iterator is exhausted.
 */
static struct vdo_slab *next_slab(struct slab_iterator *iterator)
{
	struct vdo_slab *slab = iterator->next;

	if ((slab == NULL) || (slab->slab_number < iterator->end + iterator->stride))
		iterator->next = NULL;
	else
		iterator->next = iterator->slabs[slab->slab_number - iterator->stride];

	return slab;
}
/**
 * abort_waiter() - Abort vios waiting to make journal entries when read-only.
 *
 * This callback is invoked on all vios waiting to make slab journal entries after the VDO has gone
 * into read-only mode. Implements waiter_callback_fn.
 */
static void abort_waiter(struct vdo_waiter *waiter, void *context __always_unused)
{
	struct reference_updater *updater =
		container_of(waiter, struct reference_updater, waiter);
	struct data_vio *data_vio = data_vio_from_reference_updater(updater);

	if (updater->increment) {
		continue_data_vio_with_error(data_vio, VDO_READ_ONLY);
		return;
	}

	vdo_continue_completion(&data_vio->decrement_completion, VDO_READ_ONLY);
}
/* Implements vdo_read_only_notification_fn. */
static void notify_block_allocator_of_read_only_mode(void *listener,
						     struct vdo_completion *parent)
{
	struct block_allocator *allocator = listener;
	struct slab_iterator iterator;

	assert_on_allocator_thread(allocator->thread_id, __func__);
	iterator = get_slab_iterator(allocator);
	while (iterator.next != NULL) {
		struct vdo_slab *slab = next_slab(&iterator);

		vdo_waitq_notify_all_waiters(&slab->journal.entry_waiters,
					     abort_waiter, &slab->journal);
		check_if_slab_drained(slab);
	}

	vdo_finish_completion(parent);
}
/**
 * vdo_acquire_provisional_reference() - Acquire a provisional reference on behalf of a PBN lock if
 *                                       the block it locks is unreferenced.
 * @slab: The slab which contains the block.
 * @pbn: The physical block to reference.
 * @lock: The lock.
 *
 * Return: VDO_SUCCESS or an error.
 */
int vdo_acquire_provisional_reference(struct vdo_slab *slab, physical_block_number_t pbn,
				      struct pbn_lock *lock)
{
	slab_block_number block_number;
	int result;

	if (vdo_pbn_lock_has_provisional_reference(lock))
		return VDO_SUCCESS;

	if (!is_slab_open(slab))
		return VDO_INVALID_ADMIN_STATE;

	result = slab_block_number_from_pbn(slab, pbn, &block_number);
	if (result != VDO_SUCCESS)
		return result;

	if (slab->counters[block_number] == EMPTY_REFERENCE_COUNT) {
		make_provisional_reference(slab, block_number);
		if (lock != NULL)
			vdo_assign_pbn_lock_provisional_reference(lock);
	}

	if (vdo_pbn_lock_has_provisional_reference(lock))
		adjust_free_block_count(slab, false);

	return VDO_SUCCESS;
}

static int __must_check allocate_slab_block(struct vdo_slab *slab,
					    physical_block_number_t *block_number_ptr)
{
	slab_block_number free_index;

	if (!is_slab_open(slab))
		return VDO_INVALID_ADMIN_STATE;

	if (!search_reference_blocks(slab, &free_index))
		return VDO_NO_SPACE;

	VDO_ASSERT_LOG_ONLY((slab->counters[free_index] == EMPTY_REFERENCE_COUNT),
			    "free block must have ref count of zero");
	make_provisional_reference(slab, free_index);
	adjust_free_block_count(slab, false);

	/*
	 * Update the search hint so the next search will start at the array index just past the
	 * free block we just found.
	 */
	slab->search_cursor.index = (free_index + 1);

	*block_number_ptr = slab->start + free_index;
	return VDO_SUCCESS;
}
/**
 * open_slab() - Prepare a slab to be allocated from.
 * @slab: The slab.
 */
static void open_slab(struct vdo_slab *slab)
{
	reset_search_cursor(slab);
	if (is_slab_journal_blank(slab)) {
		WRITE_ONCE(slab->allocator->statistics.slabs_opened,
			   slab->allocator->statistics.slabs_opened + 1);
		dirty_all_reference_blocks(slab);
	} else {
		WRITE_ONCE(slab->allocator->statistics.slabs_reopened,
			   slab->allocator->statistics.slabs_reopened + 1);
	}

	slab->allocator->open_slab = slab;
}


/*
 * The block allocated will have a provisional reference and the reference must be either confirmed
 * with a subsequent increment or vacated with a subsequent decrement via
 * vdo_release_block_reference().
 */
int vdo_allocate_block(struct block_allocator *allocator,
		       physical_block_number_t *block_number_ptr)
{
	int result;

	if (allocator->open_slab != NULL) {
		/* Try to allocate the next block in the currently open slab. */
		result = allocate_slab_block(allocator->open_slab, block_number_ptr);
		if ((result == VDO_SUCCESS) || (result != VDO_NO_SPACE))
			return result;

		/* Put the exhausted open slab back into the priority table. */
		prioritize_slab(allocator->open_slab);
	}

	/* Remove the highest priority slab from the priority table and make it the open slab. */
	open_slab(list_entry(vdo_priority_table_dequeue(allocator->prioritized_slabs),
			     struct vdo_slab, allocq_entry));

	/*
	 * Try allocating again. If we're out of space immediately after opening a slab, then every
	 * slab must be fully allocated.
	 */
	return allocate_slab_block(allocator->open_slab, block_number_ptr);
}
/**
 * vdo_enqueue_clean_slab_waiter() - Wait for a clean slab.
 * @allocator: The block_allocator on which to wait.
 * @waiter: The waiter.
 *
 * Return: VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no slabs to scrub, and
 *         some other error otherwise.
 */
int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
				  struct vdo_waiter *waiter)
{
	if (vdo_is_read_only(allocator->depot->vdo))
		return VDO_READ_ONLY;

	if (vdo_is_state_quiescent(&allocator->scrubber.admin_state))
		return VDO_NO_SPACE;

	vdo_waitq_enqueue_waiter(&allocator->scrubber.waiters, waiter);
	return VDO_SUCCESS;
}

/**
 * vdo_modify_reference_count() - Modify the reference count of a block by first making a slab
 *                                journal entry and then updating the reference counter.
 * @completion: The data_vio completion for which to add the entry.
 * @updater: Which of the data_vio's reference updaters is being submitted.
 */
void vdo_modify_reference_count(struct vdo_completion *completion,
				struct reference_updater *updater)
{
	struct vdo_slab *slab = vdo_get_slab(completion->vdo->depot, updater->zpbn.pbn);

	if (!is_slab_open(slab)) {
		vdo_continue_completion(completion, VDO_INVALID_ADMIN_STATE);
		return;
	}

	if (vdo_is_read_only(completion->vdo)) {
		vdo_continue_completion(completion, VDO_READ_ONLY);
		return;
	}

	vdo_waitq_enqueue_waiter(&slab->journal.entry_waiters, &updater->waiter);
	if ((slab->status != VDO_SLAB_REBUILT) && requires_reaping(&slab->journal))
		register_slab_for_scrubbing(slab, true);

	add_entries(&slab->journal);
}

/* Release an unused provisional reference. */
int vdo_release_block_reference(struct block_allocator *allocator,
				physical_block_number_t pbn)
{
	struct reference_updater updater;

	if (pbn == VDO_ZERO_BLOCK)
		return VDO_SUCCESS;

	updater = (struct reference_updater) {
		.operation = VDO_JOURNAL_DATA_REMAPPING,
		.increment = false,
		.zpbn = {
			.pbn = pbn,
		},
	};

	return adjust_reference_count(vdo_get_slab(allocator->depot, pbn),
				      &updater, NULL);
}
/*
 * This is a min_heap callback function that orders slab_status structures using the 'is_clean'
 * field as the primary key and the 'emptiness' field as the secondary key.
 *
 * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping
 * should always get the most empty first, so pushing should be from most empty to least empty.
 * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements
 * before larger ones.
 */
static bool slab_status_is_less_than(const void *item1, const void *item2,
				     void __always_unused *args)
{
	const struct slab_status *info1 = item1;
	const struct slab_status *info2 = item2;

	if (info1->is_clean != info2->is_clean)
		return info1->is_clean;
	if (info1->emptiness != info2->emptiness)
		return info1->emptiness > info2->emptiness;
	return info1->slab_number < info2->slab_number;
}

static const struct min_heap_callbacks slab_status_min_heap = {
	.less = slab_status_is_less_than,
	.swp = NULL,
};
/* Inform the slab actor that an action has finished on some slab; used by apply_to_slabs(). */
static void slab_action_callback(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);
	struct slab_actor *actor = &allocator->slab_actor;

	if (--actor->slab_action_count == 0) {
		actor->callback(completion);
		return;
	}

	vdo_reset_completion(completion);
}

/* Preserve the error from part of an action and continue. */
static void handle_operation_error(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);

	if (allocator->state.waiter != NULL)
		vdo_set_completion_result(allocator->state.waiter, completion->result);
	completion->callback(completion);
}
/* Perform an action on each of an allocator's slabs in parallel. */
static void apply_to_slabs(struct block_allocator *allocator, vdo_action_fn callback)
{
	struct slab_iterator iterator;

	vdo_prepare_completion(&allocator->completion, slab_action_callback,
			       handle_operation_error, allocator->thread_id, NULL);
	allocator->completion.requeue = false;

	/*
	 * Since we are going to dequeue all of the slabs, the open slab will become invalid, so
	 * clear the pointer.
	 */
	allocator->open_slab = NULL;

	/* Ensure that we don't finish before we're done starting. */
	allocator->slab_actor = (struct slab_actor) {
		.slab_action_count = 1,
		.callback = callback,
	};

	iterator = get_slab_iterator(allocator);
	while (iterator.next != NULL) {
		const struct admin_state_code *operation =
			vdo_get_admin_state_code(&allocator->state);
		struct vdo_slab *slab = next_slab(&iterator);

		list_del_init(&slab->allocq_entry);
		allocator->slab_actor.slab_action_count++;
		vdo_start_operation_with_waiter(&slab->state, operation,
						&allocator->completion,
						initiate_slab_action);
	}

	slab_action_callback(&allocator->completion);
}
static void finish_loading_allocator(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);
	const struct admin_state_code *operation =
		vdo_get_admin_state_code(&allocator->state);

	if (allocator->eraser != NULL)
		dm_kcopyd_client_destroy(vdo_forget(allocator->eraser));

	if (operation == VDO_ADMIN_STATE_LOADING_FOR_RECOVERY) {
		void *context =
			vdo_get_current_action_context(allocator->depot->action_manager);

		vdo_replay_into_slab_journals(allocator, context);
		return;
	}

	vdo_finish_loading(&allocator->state);
}

static void erase_next_slab_journal(struct block_allocator *allocator);

static void copy_callback(int read_err, unsigned long write_err, void *context)
{
	struct block_allocator *allocator = context;
	int result = (((read_err == 0) && (write_err == 0)) ? VDO_SUCCESS : -EIO);

	if (result != VDO_SUCCESS) {
		vdo_fail_completion(&allocator->completion, result);
		return;
	}

	erase_next_slab_journal(allocator);
}
/* erase_next_slab_journal() - Erase the next slab journal. */
static void erase_next_slab_journal(struct block_allocator *allocator)
{
	struct vdo_slab *slab;
	physical_block_number_t pbn;
	struct dm_io_region regions[1];
	struct slab_depot *depot = allocator->depot;
	block_count_t blocks = depot->slab_config.slab_journal_blocks;

	if (allocator->slabs_to_erase.next == NULL) {
		vdo_finish_completion(&allocator->completion);
		return;
	}

	slab = next_slab(&allocator->slabs_to_erase);
	pbn = slab->journal_origin - depot->vdo->geometry.bio_offset;
	regions[0] = (struct dm_io_region) {
		.bdev = vdo_get_backing_device(depot->vdo),
		.sector = pbn * VDO_SECTORS_PER_BLOCK,
		.count = blocks * VDO_SECTORS_PER_BLOCK,
	};
	dm_kcopyd_zero(allocator->eraser, 1, regions, 0, copy_callback, allocator);
}

/* Implements vdo_admin_initiator_fn. */
static void initiate_load(struct admin_state *state)
{
	struct block_allocator *allocator =
		container_of(state, struct block_allocator, state);
	const struct admin_state_code *operation = vdo_get_admin_state_code(state);

	if (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD) {
		/*
		 * Must requeue because the kcopyd client cannot be freed in the same stack frame
		 * as the kcopyd callback, lest it deadlock.
		 */
		vdo_prepare_completion_for_requeue(&allocator->completion,
						   finish_loading_allocator,
						   handle_operation_error,
						   allocator->thread_id, NULL);
		allocator->eraser = dm_kcopyd_client_create(NULL);
		if (IS_ERR(allocator->eraser)) {
			vdo_fail_completion(&allocator->completion,
					    PTR_ERR(allocator->eraser));
			allocator->eraser = NULL;
			return;
		}

		allocator->slabs_to_erase = get_slab_iterator(allocator);
		erase_next_slab_journal(allocator);
		return;
	}

	apply_to_slabs(allocator, finish_loading_allocator);
}

/**
 * vdo_notify_slab_journals_are_recovered() - Inform a block allocator that its slab journals have
 *                                            been recovered from the recovery journal.
 * @completion: The allocator completion.
 */
void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);

	vdo_finish_loading_with_result(&allocator->state, completion->result);
}

static int get_slab_statuses(struct block_allocator *allocator,
			     struct slab_status **statuses_ptr)
{
	int result;
	struct slab_status *statuses;
	struct slab_iterator iterator = get_slab_iterator(allocator);

	result = vdo_allocate(allocator->slab_count, struct slab_status, __func__,
			      &statuses);
	if (result != VDO_SUCCESS)
		return result;

	*statuses_ptr = statuses;

	while (iterator.next != NULL) {
		slab_count_t slab_number = next_slab(&iterator)->slab_number;

		*statuses++ = (struct slab_status) {
			.slab_number = slab_number,
			.is_clean = !allocator->summary_entries[slab_number].is_dirty,
			.emptiness = allocator->summary_entries[slab_number].fullness_hint,
		};
	}

	return VDO_SUCCESS;
}

/* Prepare slabs for allocation or scrubbing. */
static int __must_check vdo_prepare_slabs_for_allocation(struct block_allocator *allocator)
{
	struct slab_status current_slab_status;
	DEFINE_MIN_HEAP(struct slab_status, heap) heap;
	int result;
	struct slab_status *slab_statuses;
	struct slab_depot *depot = allocator->depot;

	WRITE_ONCE(allocator->allocated_blocks,
		   allocator->slab_count * depot->slab_config.data_blocks);
	result = get_slab_statuses(allocator, &slab_statuses);
	if (result != VDO_SUCCESS)
		return result;

	/* Sort the slabs by cleanliness, then by emptiness hint. */
	heap = (struct heap) {
		.data = slab_statuses,
		.nr = allocator->slab_count,
		.size = allocator->slab_count,
	};
	min_heapify_all(&heap, &slab_status_min_heap, NULL);

	while (heap.nr > 0) {
		bool high_priority;
		struct vdo_slab *slab;
		struct slab_journal *journal;

		current_slab_status = slab_statuses[0];
		min_heap_pop(&heap, &slab_status_min_heap, NULL);
		slab = depot->slabs[current_slab_status.slab_number];

		if ((depot->load_type == VDO_SLAB_DEPOT_REBUILD_LOAD) ||
		    (!allocator->summary_entries[slab->slab_number].load_ref_counts &&
		     current_slab_status.is_clean)) {
			queue_slab(slab);
			continue;
		}

		slab->status = VDO_SLAB_REQUIRES_SCRUBBING;
		journal = &slab->journal;
		high_priority = ((current_slab_status.is_clean &&
				  (depot->load_type == VDO_SLAB_DEPOT_NORMAL_LOAD)) ||
				 (journal_length(journal) >= journal->scrubbing_threshold));
		register_slab_for_scrubbing(slab, high_priority);
	}

	vdo_free(slab_statuses);
	return VDO_SUCCESS;
}
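
/*
 * Scrubbing registration in vdo_prepare_slabs_for_allocation(): a slab which cannot simply be
 * requeued is marked VDO_SLAB_REQUIRES_SCRUBBING, and is treated as high priority either when it
 * is clean but the depot is doing a normal load, or when its slab journal has reached the
 * scrubbing threshold. For example (threshold value hypothetical), with scrubbing_threshold = 126,
 * a dirty slab whose journal currently spans 130 blocks is queued for high-priority scrubbing.
 */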

static const char *status_to_string(enum slab_rebuild_status status)
{
	switch (status) {
	case VDO_SLAB_REBUILT:
		return "REBUILT";
	case VDO_SLAB_REQUIRES_SCRUBBING:
		return "SCRUBBING";
	case VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING:
		return "PRIORITY_SCRUBBING";
	case VDO_SLAB_REBUILDING:
		return "REBUILDING";
	case VDO_SLAB_REPLAYING:
		return "REPLAYING";
	default:
		return "UNKNOWN";
	}
}

void vdo_dump_block_allocator(const struct block_allocator *allocator)
{
	unsigned int pause_counter = 0;
	struct slab_iterator iterator = get_slab_iterator(allocator);
	const struct slab_scrubber *scrubber = &allocator->scrubber;

	vdo_log_info("block_allocator zone %u", allocator->zone_number);
	while (iterator.next != NULL) {
		struct vdo_slab *slab = next_slab(&iterator);
		struct slab_journal *journal = &slab->journal;

		if (slab->reference_blocks != NULL) {
			/* Terse because there are a lot of slabs to dump and syslog is lossy. */
			vdo_log_info("slab %u: P%u, %llu free", slab->slab_number,
				     slab->priority,
				     (unsigned long long) slab->free_blocks);
		} else {
			vdo_log_info("slab %u: status %s", slab->slab_number,
				     status_to_string(slab->status));
		}

		vdo_log_info("  slab journal: entry_waiters=%zu waiting_to_commit=%s updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s",
			     vdo_waitq_num_waiters(&journal->entry_waiters),
			     vdo_bool_to_string(journal->waiting_to_commit),
			     vdo_bool_to_string(journal->updating_slab_summary),
			     (unsigned long long) journal->head,
			     (unsigned long long) journal->unreapable,
			     (unsigned long long) journal->tail,
			     (unsigned long long) journal->next_commit,
			     (unsigned long long) journal->summarized,
			     (unsigned long long) journal->last_summarized,
			     (unsigned long long) journal->recovery_lock,
			     vdo_bool_to_string(journal->recovery_lock != 0));
		/*
		 * Given the frequency with which the locks are just a tiny bit off, it might be
		 * worth dumping all the locks, but that might be too much logging.
		 */

		if (slab->counters != NULL) {
			/* Terse because there are a lot of slabs to dump and syslog is lossy. */
			vdo_log_info("  slab: free=%u/%u blocks=%u dirty=%zu active=%zu journal@(%llu,%u)",
				     slab->free_blocks, slab->block_count,
				     slab->reference_block_count,
				     vdo_waitq_num_waiters(&slab->dirty_blocks),
				     slab->active_count,
				     (unsigned long long) slab->slab_journal_point.sequence_number,
				     slab->slab_journal_point.entry_count);
		} else {
			vdo_log_info("  no counters");
		}

		/*
		 * Wait for a while after each batch of 32 slabs dumped, an arbitrary number,
		 * allowing the kernel log a chance to be flushed instead of being overrun.
		 */
		if (pause_counter++ == 31) {
			pause_counter = 0;
			vdo_pause_for_logger();
		}
	}

	vdo_log_info("slab_scrubber slab_count %u waiters %zu %s%s",
		     READ_ONCE(scrubber->slab_count),
		     vdo_waitq_num_waiters(&scrubber->waiters),
		     vdo_get_admin_state_code(&scrubber->admin_state)->name,
		     scrubber->high_priority_only ? ", high_priority_only " : "");
}

static void free_slab(struct vdo_slab *slab)
{
	if (slab == NULL)
		return;

	list_del(&slab->allocq_entry);
	vdo_free(vdo_forget(slab->journal.block));
	vdo_free(vdo_forget(slab->journal.locks));
	vdo_free(vdo_forget(slab->counters));
	vdo_free(vdo_forget(slab->reference_blocks));
	vdo_free(slab);
}

static int initialize_slab_journal(struct vdo_slab *slab)
{
	struct slab_journal *journal = &slab->journal;
	const struct slab_config *slab_config = &slab->allocator->depot->slab_config;
	int result;

	result = vdo_allocate(slab_config->slab_journal_blocks, struct journal_lock,
			      __func__, &journal->locks);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(VDO_BLOCK_SIZE, char, "struct packed_slab_journal_block",
			      (char **) &journal->block);
	if (result != VDO_SUCCESS)
		return result;

	journal->slab = slab;
	journal->size = slab_config->slab_journal_blocks;
	journal->flushing_threshold = slab_config->slab_journal_flushing_threshold;
	journal->blocking_threshold = slab_config->slab_journal_blocking_threshold;
	journal->scrubbing_threshold = slab_config->slab_journal_scrubbing_threshold;
	journal->entries_per_block = VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK;
	journal->full_entries_per_block = VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK;
	journal->events = &slab->allocator->slab_journal_statistics;
	journal->recovery_journal = slab->allocator->depot->vdo->recovery_journal;
	journal->tail = 1;
	journal->head = 1;

	journal->flushing_deadline = journal->flushing_threshold;
	/*
	 * Leave some time between the deadline and the blocking threshold, so that hopefully all
	 * the flushing is done before anything has to block.
	 */
	if ((journal->blocking_threshold - journal->flushing_threshold) > 5)
		journal->flushing_deadline = journal->blocking_threshold - 5;

	journal->slab_summary_waiter.callback = release_journal_locks;

	INIT_LIST_HEAD(&journal->dirty_entry);
	INIT_LIST_HEAD(&journal->uncommitted_blocks);

	journal->tail_header.nonce = slab->allocator->nonce;
	journal->tail_header.metadata_type = VDO_METADATA_SLAB_JOURNAL;
	initialize_journal_state(journal);
	return VDO_SUCCESS;
}
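
/*
 * Worked example for the deadline adjustment in initialize_slab_journal(), using hypothetical
 * thresholds: with flushing_threshold = 60 and blocking_threshold = 100 the gap is 40 > 5, so
 * flushing_deadline becomes 100 - 5 = 95; with a gap of 5 or less, the deadline simply stays at
 * the flushing threshold.
 */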

/**
 * make_slab() - Construct a new, empty slab.
 * @slab_origin: The physical block number within the block allocator partition of the first block
 *               in the slab.
 * @allocator: The block allocator to which the slab belongs.
 * @slab_number: The slab number of the slab.
 * @is_new: true if this slab is being allocated as part of a resize.
 * @slab_ptr: A pointer to receive the new slab.
 *
 * Return: VDO_SUCCESS or an error code.
 */
static int __must_check make_slab(physical_block_number_t slab_origin,
				  struct block_allocator *allocator,
				  slab_count_t slab_number, bool is_new,
				  struct vdo_slab **slab_ptr)
{
	const struct slab_config *slab_config = &allocator->depot->slab_config;
	struct vdo_slab *slab;
	int result;

	result = vdo_allocate(1, struct vdo_slab, __func__, &slab);
	if (result != VDO_SUCCESS)
		return result;

	*slab = (struct vdo_slab) {
		.allocator = allocator,
		.start = slab_origin,
		.end = slab_origin + slab_config->slab_blocks,
		.slab_number = slab_number,
		.ref_counts_origin = slab_origin + slab_config->data_blocks,
		.journal_origin =
			vdo_get_slab_journal_start_block(slab_config, slab_origin),
		.block_count = slab_config->data_blocks,
		.free_blocks = slab_config->data_blocks,
		.reference_block_count =
			vdo_get_saved_reference_count_size(slab_config->data_blocks),
	};
	INIT_LIST_HEAD(&slab->allocq_entry);

	result = initialize_slab_journal(slab);
	if (result != VDO_SUCCESS) {
		free_slab(slab);
		return result;
	}

	if (is_new) {
		vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NEW);
		result = allocate_slab_counters(slab);
		if (result != VDO_SUCCESS) {
			free_slab(slab);
			return result;
		}
	} else {
		vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
	}

	*slab_ptr = slab;
	return VDO_SUCCESS;
}
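
/*
 * Layout sketch for make_slab(), with hypothetical block counts: given slab_blocks = 8192 and
 * data_blocks = 8000, a slab whose slab_origin is 65536 spans [65536, 73728); its reference count
 * blocks begin at ref_counts_origin = 65536 + 8000 = 73536, and the slab journal occupies the
 * remaining blocks up to the slab's end, starting at the block returned by
 * vdo_get_slab_journal_start_block().
 */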

/**
 * allocate_slabs() - Allocate a new slab pointer array.
 * @depot: The depot.
 * @slab_count: The number of slabs the depot should have in the new array.
 *
 * Any existing slab pointers will be copied into the new array, and slabs will be allocated as
 * needed. The newly allocated slabs will not be distributed for use by the block allocators.
 *
 * Return: VDO_SUCCESS or an error code.
 */
static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count)
{
	block_count_t slab_size;
	bool resizing = false;
	physical_block_number_t slab_origin;
	int result;

	result = vdo_allocate(slab_count, struct vdo_slab *,
			      "slab pointer array", &depot->new_slabs);
	if (result != VDO_SUCCESS)
		return result;

	if (depot->slabs != NULL) {
		memcpy(depot->new_slabs, depot->slabs,
		       depot->slab_count * sizeof(struct vdo_slab *));
		resizing = true;
	}

	slab_size = depot->slab_config.slab_blocks;
	slab_origin = depot->first_block + (depot->slab_count * slab_size);

	for (depot->new_slab_count = depot->slab_count;
	     depot->new_slab_count < slab_count;
	     depot->new_slab_count++, slab_origin += slab_size) {
		struct block_allocator *allocator =
			&depot->allocators[depot->new_slab_count % depot->zone_count];
		struct vdo_slab **slab_ptr = &depot->new_slabs[depot->new_slab_count];

		result = make_slab(slab_origin, allocator, depot->new_slab_count,
				   resizing, slab_ptr);
		if (result != VDO_SUCCESS)
			return result;
	}

	return VDO_SUCCESS;
}
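
/*
 * Slab-to-zone assignment in allocate_slabs() is round-robin: slab N belongs to allocator
 * N % zone_count. For example (counts hypothetical), with zone_count = 3, new slabs 9, 10, and 11
 * are handed to allocators 0, 1, and 2 respectively, each starting slab_size blocks further into
 * the depot than the previous slab.
 */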

/**
 * vdo_abandon_new_slabs() - Abandon any new slabs in this depot, freeing them as needed.
 * @depot: The depot.
 */
void vdo_abandon_new_slabs(struct slab_depot *depot)
{
	slab_count_t i;

	if (depot->new_slabs == NULL)
		return;

	for (i = depot->slab_count; i < depot->new_slab_count; i++)
		free_slab(vdo_forget(depot->new_slabs[i]));
	depot->new_slab_count = 0;
	depot->new_size = 0;
	vdo_free(vdo_forget(depot->new_slabs));
}

/**
 * get_allocator_thread_id() - Get the ID of the thread on which a given allocator operates.
 *
 * Implements vdo_zone_thread_getter_fn.
 */
static thread_id_t get_allocator_thread_id(void *context, zone_count_t zone_number)
{
	return ((struct slab_depot *) context)->allocators[zone_number].thread_id;
}

/**
 * release_recovery_journal_lock() - Request the slab journal to release the recovery journal lock
 *                                   it may hold on a specified recovery journal block.
 * @journal: The slab journal.
 * @recovery_lock: The sequence number of the recovery journal block whose locks should be
 *                 released.
 *
 * Return: true if the journal does hold a lock on the specified block (which it will release).
 */
static bool __must_check release_recovery_journal_lock(struct slab_journal *journal,
							sequence_number_t recovery_lock)
{
	if (recovery_lock > journal->recovery_lock) {
		VDO_ASSERT_LOG_ONLY((recovery_lock < journal->recovery_lock),
				    "slab journal recovery lock is not older than the recovery journal head");
		return false;
	}

	if ((recovery_lock < journal->recovery_lock) ||
	    vdo_is_read_only(journal->slab->allocator->depot->vdo))
		return false;

	/* All locks are held by the block which is in progress; write it. */
	commit_tail(journal);
	return true;
}

/*
 * Request a commit of all dirty tail blocks which are locking the recovery journal block the depot
 * is seeking to release.
 *
 * Implements vdo_zone_action_fn.
 */
static void release_tail_block_locks(void *context, zone_count_t zone_number,
				     struct vdo_completion *parent)
{
	struct slab_journal *journal, *tmp;
	struct slab_depot *depot = context;
	struct list_head *list = &depot->allocators[zone_number].dirty_slab_journals;

	list_for_each_entry_safe(journal, tmp, list, dirty_entry) {
		if (!release_recovery_journal_lock(journal,
						   depot->active_release_request))
			break;
	}

	vdo_finish_completion(parent);
}

/**
 * prepare_for_tail_block_commit() - Prepare to commit oldest tail blocks.
 *
 * Implements vdo_action_preamble_fn.
 */
static void prepare_for_tail_block_commit(void *context, struct vdo_completion *parent)
{
	struct slab_depot *depot = context;

	depot->active_release_request = depot->new_release_request;
	vdo_finish_completion(parent);
}

/**
 * schedule_tail_block_commit() - Schedule a tail block commit if necessary.
 *
 * This method should not be called directly. Rather, call vdo_schedule_default_action() on the
 * depot's action manager.
 *
 * Implements vdo_action_scheduler_fn.
 */
static bool schedule_tail_block_commit(void *context)
{
	struct slab_depot *depot = context;

	if (depot->new_release_request == depot->active_release_request)
		return false;

	return vdo_schedule_action(depot->action_manager,
				   prepare_for_tail_block_commit,
				   release_tail_block_locks,
				   NULL, NULL);
}
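
/*
 * How a tail block commit request flows through the functions above:
 * vdo_commit_oldest_slab_journal_tail_blocks() (defined later in this file) records the requested
 * sequence number in depot->new_release_request and schedules the depot's default action. The
 * action manager then calls schedule_tail_block_commit(); when the new request differs from the
 * active one, prepare_for_tail_block_commit() copies it into depot->active_release_request and
 * release_tail_block_locks() walks each zone's dirty slab journals, committing tail blocks until
 * it reaches a journal which does not hold a lock on the requested recovery journal block.
 */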

/**
 * initialize_slab_scrubber() - Initialize an allocator's slab scrubber.
 * @allocator: The allocator being initialized.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int initialize_slab_scrubber(struct block_allocator *allocator)
{
	struct slab_scrubber *scrubber = &allocator->scrubber;
	block_count_t slab_journal_size =
		allocator->depot->slab_config.slab_journal_blocks;
	char *journal_data;
	int result;

	result = vdo_allocate(VDO_BLOCK_SIZE * slab_journal_size,
			      char, __func__, &journal_data);
	if (result != VDO_SUCCESS)
		return result;

	result = allocate_vio_components(allocator->completion.vdo,
					 VIO_TYPE_SLAB_JOURNAL,
					 VIO_PRIORITY_METADATA,
					 allocator, slab_journal_size,
					 journal_data, &scrubber->vio);
	if (result != VDO_SUCCESS) {
		vdo_free(journal_data);
		return result;
	}

	INIT_LIST_HEAD(&scrubber->high_priority_slabs);
	INIT_LIST_HEAD(&scrubber->slabs);
	vdo_set_admin_state_code(&scrubber->admin_state, VDO_ADMIN_STATE_SUSPENDED);
	return VDO_SUCCESS;
}

/**
 * initialize_slab_summary_block() - Initialize a slab_summary_block.
 * @allocator: The allocator which owns the block.
 * @index: The index of this block in its zone's summary.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int __must_check initialize_slab_summary_block(struct block_allocator *allocator,
						       block_count_t index)
{
	struct slab_summary_block *block = &allocator->summary_blocks[index];
	int result;

	result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &block->outgoing_entries);
	if (result != VDO_SUCCESS)
		return result;

	result = allocate_vio_components(allocator->depot->vdo, VIO_TYPE_SLAB_SUMMARY,
					 VIO_PRIORITY_METADATA, NULL, 1,
					 block->outgoing_entries, &block->vio);
	if (result != VDO_SUCCESS)
		return result;

	block->allocator = allocator;
	block->entries = &allocator->summary_entries[VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK * index];
	block->index = index;
	return VDO_SUCCESS;
}

static int __must_check initialize_block_allocator(struct slab_depot *depot,
						   zone_count_t zone)
{
	int result;
	block_count_t i;
	struct block_allocator *allocator = &depot->allocators[zone];
	struct vdo *vdo = depot->vdo;
	block_count_t max_free_blocks = depot->slab_config.data_blocks;
	unsigned int max_priority = (2 + ilog2(max_free_blocks));

	*allocator = (struct block_allocator) {
		.depot = depot,
		.zone_number = zone,
		.thread_id = vdo->thread_config.physical_threads[zone],
		.nonce = vdo->states.vdo.nonce,
	};

	INIT_LIST_HEAD(&allocator->dirty_slab_journals);
	vdo_set_admin_state_code(&allocator->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
	result = vdo_register_read_only_listener(vdo, allocator,
						 notify_block_allocator_of_read_only_mode,
						 allocator->thread_id);
	if (result != VDO_SUCCESS)
		return result;

	vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION);
	result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id,
			       VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
			       allocator, &allocator->vio_pool);
	if (result != VDO_SUCCESS)
		return result;

	result = initialize_slab_scrubber(allocator);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_make_priority_table(max_priority, &allocator->prioritized_slabs);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE,
			      struct slab_summary_block, __func__,
			      &allocator->summary_blocks);
	if (result != VDO_SUCCESS)
		return result;

	vdo_set_admin_state_code(&allocator->summary_state,
				 VDO_ADMIN_STATE_NORMAL_OPERATION);
	allocator->summary_entries = depot->summary_entries + (MAX_VDO_SLABS * zone);

	/* Initialize each summary block. */
	for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) {
		result = initialize_slab_summary_block(allocator, i);
		if (result != VDO_SUCCESS)
			return result;
	}

	/*
	 * Performing well atop thin provisioned storage requires either that VDO discards freed
	 * blocks, or that the block allocator try to use slabs that already have allocated blocks
	 * in preference to slabs that have never been opened. For reasons we have not been able to
	 * fully understand, some SSD machines have been very sensitive (50% reduction in test
	 * throughput) to very slight differences in the timing and locality of block allocation.
	 * Assigning a low priority to unopened slabs (max_priority/2, say) would be ideal for the
	 * story, but anything less than a very high threshold (max_priority - 1) hurts on these
	 * machines.
	 *
	 * This sets the free block threshold for preferring to open an unopened slab to the binary
	 * floor of 3/4ths the total number of data blocks in a slab, which will generally evaluate
	 * to about half the slab size.
	 */
	allocator->unopened_slab_priority = (1 + ilog2((max_free_blocks * 3) / 4));

	return VDO_SUCCESS;
}
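
/*
 * Worked example for the priority calculation above (the numbers are illustrative, not a required
 * configuration): a slab with 2^15 = 32768 data blocks gives max_priority = 2 + ilog2(32768) = 17
 * and unopened_slab_priority = 1 + ilog2((32768 * 3) / 4) = 1 + ilog2(24576) = 15, i.e. just below
 * the top of the priority table.
 */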

static int allocate_components(struct slab_depot *depot,
			       struct partition *summary_partition)
{
	int result;
	zone_count_t zone;
	slab_count_t slab_count;
	u8 hint;
	u32 i;
	const struct thread_config *thread_config = &depot->vdo->thread_config;

	result = vdo_make_action_manager(depot->zone_count, get_allocator_thread_id,
					 thread_config->journal_thread, depot,
					 schedule_tail_block_commit,
					 depot->vdo, &depot->action_manager);
	if (result != VDO_SUCCESS)
		return result;

	depot->origin = depot->first_block;

	/* block size must be a multiple of entry size */
	BUILD_BUG_ON((VDO_BLOCK_SIZE % sizeof(struct slab_summary_entry)) != 0);

	depot->summary_origin = summary_partition->offset;
	depot->hint_shift = vdo_get_slab_summary_hint_shift(depot->slab_size_shift);
	result = vdo_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES,
			      struct slab_summary_entry, __func__,
			      &depot->summary_entries);
	if (result != VDO_SUCCESS)
		return result;

	/* Initialize all the entries. */
	hint = compute_fullness_hint(depot, depot->slab_config.data_blocks);
	for (i = 0; i < MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES; i++) {
		/*
		 * This default tail block offset must be reflected in
		 * slabJournal.c::read_slab_journal_tail().
		 */
		depot->summary_entries[i] = (struct slab_summary_entry) {
			.tail_block_offset = 0,
			.fullness_hint = hint,
			.load_ref_counts = false,
			.is_dirty = false,
		};
	}

	slab_count = vdo_compute_slab_count(depot->first_block, depot->last_block,
					    depot->slab_size_shift);
	if (thread_config->physical_zone_count > slab_count) {
		return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
					      "%u physical zones exceeds slab count %u",
					      thread_config->physical_zone_count,
					      slab_count);
	}

	/* Initialize the block allocators. */
	for (zone = 0; zone < depot->zone_count; zone++) {
		result = initialize_block_allocator(depot, zone);
		if (result != VDO_SUCCESS)
			return result;
	}

	/* Allocate slabs. */
	result = allocate_slabs(depot, slab_count);
	if (result != VDO_SUCCESS)
		return result;

	/* Use the new slabs. */
	for (i = depot->slab_count; i < depot->new_slab_count; i++) {
		struct vdo_slab *slab = depot->new_slabs[i];

		register_slab_with_allocator(slab->allocator, slab);
		WRITE_ONCE(depot->slab_count, depot->slab_count + 1);
	}

	depot->slabs = depot->new_slabs;
	depot->new_slabs = NULL;
	depot->new_slab_count = 0;

	return VDO_SUCCESS;
}

/**
 * vdo_decode_slab_depot() - Make a slab depot and configure it with the state read from the super
 *                           block.
 * @state: The slab depot state from the super block.
 * @vdo: The VDO which will own the depot.
 * @summary_partition: The partition which holds the slab summary.
 * @depot_ptr: A pointer to hold the depot.
 *
 * Return: A success or error code.
 */
int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, struct vdo *vdo,
			  struct partition *summary_partition,
			  struct slab_depot **depot_ptr)
{
	unsigned int slab_size_shift;
	struct slab_depot *depot;
	int result;

	/*
	 * Calculate the bit shift for efficiently mapping block numbers to slabs. Using a shift
	 * requires that the slab size be a power of two.
	 */
	block_count_t slab_size = state.slab_config.slab_blocks;

	if (!is_power_of_2(slab_size)) {
		return vdo_log_error_strerror(UDS_INVALID_ARGUMENT,
					      "slab size must be a power of two");
	}
	slab_size_shift = ilog2(slab_size);

	result = vdo_allocate_extended(struct slab_depot,
				       vdo->thread_config.physical_zone_count,
				       struct block_allocator, __func__, &depot);
	if (result != VDO_SUCCESS)
		return result;

	depot->vdo = vdo;
	depot->old_zone_count = state.zone_count;
	depot->zone_count = vdo->thread_config.physical_zone_count;
	depot->slab_config = state.slab_config;
	depot->first_block = state.first_block;
	depot->last_block = state.last_block;
	depot->slab_size_shift = slab_size_shift;

	result = allocate_components(depot, summary_partition);
	if (result != VDO_SUCCESS) {
		vdo_free_slab_depot(depot);
		return result;
	}

	*depot_ptr = depot;
	return VDO_SUCCESS;
}

static void uninitialize_allocator_summary(struct block_allocator *allocator)
{
	block_count_t i;

	if (allocator->summary_blocks == NULL)
		return;

	for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) {
		free_vio_components(&allocator->summary_blocks[i].vio);
		vdo_free(vdo_forget(allocator->summary_blocks[i].outgoing_entries));
	}

	vdo_free(vdo_forget(allocator->summary_blocks));
}

/**
 * vdo_free_slab_depot() - Destroy a slab depot.
 * @depot: The depot to destroy.
 */
void vdo_free_slab_depot(struct slab_depot *depot)
{
	zone_count_t zone = 0;

	if (depot == NULL)
		return;

	vdo_abandon_new_slabs(depot);

	for (zone = 0; zone < depot->zone_count; zone++) {
		struct block_allocator *allocator = &depot->allocators[zone];

		if (allocator->eraser != NULL)
			dm_kcopyd_client_destroy(vdo_forget(allocator->eraser));

		uninitialize_allocator_summary(allocator);
		uninitialize_scrubber_vio(&allocator->scrubber);
		free_vio_pool(vdo_forget(allocator->vio_pool));
		vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs));
	}

	if (depot->slabs != NULL) {
		slab_count_t i;

		for (i = 0; i < depot->slab_count; i++)
			free_slab(vdo_forget(depot->slabs[i]));
	}

	vdo_free(vdo_forget(depot->slabs));
	vdo_free(vdo_forget(depot->action_manager));
	vdo_free(vdo_forget(depot->summary_entries));
	vdo_free(depot);
}

/**
 * vdo_record_slab_depot() - Record the state of a slab depot for encoding into the super block.
 * @depot: The depot to encode.
 *
 * Return: The depot state.
 */
struct slab_depot_state_2_0 vdo_record_slab_depot(const struct slab_depot *depot)
{
	/*
	 * If this depot is currently using 0 zones, it must have been synchronously loaded by a
	 * tool and is now being saved. We did not load and combine the slab summary, so we still
	 * need to do that next time we load with the old zone count rather than 0.
	 */
	struct slab_depot_state_2_0 state;
	zone_count_t zones_to_record = depot->zone_count;

	if (depot->zone_count == 0)
		zones_to_record = depot->old_zone_count;

	state = (struct slab_depot_state_2_0) {
		.slab_config = depot->slab_config,
		.first_block = depot->first_block,
		.last_block = depot->last_block,
		.zone_count = zones_to_record,
	};

	return state;
}

/**
 * vdo_allocate_reference_counters() - Allocate the reference counters for all slabs in the depot.
 *
 * Context: This method may be called only before entering normal operation from the load thread.
 *
 * Return: VDO_SUCCESS or an error.
 */
int vdo_allocate_reference_counters(struct slab_depot *depot)
{
	struct slab_iterator iterator =
		get_depot_slab_iterator(depot, depot->slab_count - 1, 0, 1);

	while (iterator.next != NULL) {
		int result = allocate_slab_counters(next_slab(&iterator));

		if (result != VDO_SUCCESS)
			return result;
	}

	return VDO_SUCCESS;
}

/**
 * get_slab_number() - Get the number of the slab that contains a specified block.
 * @depot: The slab depot.
 * @pbn: The physical block number.
 * @slab_number_ptr: A pointer to hold the slab number.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int __must_check get_slab_number(const struct slab_depot *depot,
					physical_block_number_t pbn,
					slab_count_t *slab_number_ptr)
{
	slab_count_t slab_number;

	if (pbn < depot->first_block)
		return VDO_OUT_OF_RANGE;

	slab_number = (pbn - depot->first_block) >> depot->slab_size_shift;
	if (slab_number >= depot->slab_count)
		return VDO_OUT_OF_RANGE;

	*slab_number_ptr = slab_number;
	return VDO_SUCCESS;
}
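
/*
 * Worked example for get_slab_number(), with hypothetical values: given first_block = 1024 and
 * slab_size_shift = 15 (32768-block slabs), pbn = 100000 maps to slab
 * (100000 - 1024) >> 15 = 98976 >> 15 = 3, which is in range only if the depot has at least four
 * slabs.
 */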

/**
 * vdo_get_slab() - Get the slab object for the slab that contains a specified block.
 * @depot: The slab depot.
 * @pbn: The physical block number.
 *
 * Will put the VDO in read-only mode if the PBN is not a valid data block nor the zero block.
 *
 * Return: The slab containing the block, or NULL if the block number is the zero block or
 *         otherwise out of range.
 */
struct vdo_slab *vdo_get_slab(const struct slab_depot *depot,
			      physical_block_number_t pbn)
{
	slab_count_t slab_number;
	int result;

	if (pbn == VDO_ZERO_BLOCK)
		return NULL;

	result = get_slab_number(depot, pbn, &slab_number);
	if (result != VDO_SUCCESS) {
		vdo_enter_read_only_mode(depot->vdo, result);
		return NULL;
	}

	return depot->slabs[slab_number];
}

/**
 * vdo_get_increment_limit() - Determine how many new references a block can acquire.
 * @depot: The slab depot.
 * @pbn: The physical block number that is being queried.
 *
 * Context: This method must be called from the physical zone thread of the PBN.
 *
 * Return: The number of available references.
 */
u8 vdo_get_increment_limit(struct slab_depot *depot, physical_block_number_t pbn)
{
	struct vdo_slab *slab = vdo_get_slab(depot, pbn);
	vdo_refcount_t *counter_ptr = NULL;
	int result;

	if ((slab == NULL) || (slab->status != VDO_SLAB_REBUILT))
		return 0;

	result = get_reference_counter(slab, pbn, &counter_ptr);
	if (result != VDO_SUCCESS)
		return 0;

	if (*counter_ptr == PROVISIONAL_REFERENCE_COUNT)
		return (MAXIMUM_REFERENCE_COUNT - 1);

	return (MAXIMUM_REFERENCE_COUNT - *counter_ptr);
}
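
/*
 * Example for vdo_get_increment_limit(), assuming for illustration that MAXIMUM_REFERENCE_COUNT
 * is 254: a block whose counter currently reads 10 can take 254 - 10 = 244 more references, while
 * a block holding only a provisional reference reports 254 - 1 = 253. Blocks in slabs which have
 * not yet been rebuilt report 0.
 */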

/**
 * vdo_is_physical_data_block() - Determine whether the given PBN refers to a data block.
 * @depot: The depot.
 * @pbn: The physical block number to ask about.
 *
 * Return: True if the PBN corresponds to a data block.
 */
bool vdo_is_physical_data_block(const struct slab_depot *depot,
				physical_block_number_t pbn)
{
	slab_count_t slab_number;
	slab_block_number sbn;

	return ((pbn == VDO_ZERO_BLOCK) ||
		((get_slab_number(depot, pbn, &slab_number) == VDO_SUCCESS) &&
		 (slab_block_number_from_pbn(depot->slabs[slab_number], pbn, &sbn) ==
		  VDO_SUCCESS)));
}

/**
 * vdo_get_slab_depot_allocated_blocks() - Get the total number of data blocks allocated across all
 *                                         the slabs in the depot.
 * @depot: The slab depot.
 *
 * This is the total number of blocks with a non-zero reference count.
 *
 * Context: This may be called from any thread.
 *
 * Return: The total number of blocks with a non-zero reference count.
 */
block_count_t vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot)
{
	block_count_t total = 0;
	zone_count_t zone;

	for (zone = 0; zone < depot->zone_count; zone++) {
		/* The allocators are responsible for thread safety. */
		total += READ_ONCE(depot->allocators[zone].allocated_blocks);
	}

	return total;
}

/**
 * vdo_get_slab_depot_data_blocks() - Get the total number of data blocks in all the slabs in the
 *                                    depot.
 * @depot: The slab depot.
 *
 * Context: This may be called from any thread.
 *
 * Return: The total number of data blocks in all slabs.
 */
block_count_t vdo_get_slab_depot_data_blocks(const struct slab_depot *depot)
{
	return (READ_ONCE(depot->slab_count) * depot->slab_config.data_blocks);
}

/**
 * finish_combining_zones() - Clean up after saving out the combined slab summary.
 * @completion: The vio which was used to write the summary data.
 */
static void finish_combining_zones(struct vdo_completion *completion)
{
	int result = completion->result;
	struct vdo_completion *parent = completion->parent;

	free_vio(as_vio(vdo_forget(completion)));
	vdo_fail_completion(parent, result);
}

static void handle_combining_error(struct vdo_completion *completion)
{
	vio_record_metadata_io_error(as_vio(completion));
	finish_combining_zones(completion);
}

static void write_summary_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct vdo *vdo = vio->completion.vdo;

	continue_vio_after_io(vio, finish_combining_zones,
			      vdo->thread_config.admin_thread);
}

/**
 * combine_summaries() - Treating the current entries buffer as the on-disk value of all zones,
 *                       update every zone to the correct values for every slab.
 * @depot: The depot whose summary entries should be combined.
 */
static void combine_summaries(struct slab_depot *depot)
{
	/*
	 * Combine all the old summary data into the portion of the buffer corresponding to the
	 * first zone.
	 */
	zone_count_t zone = 0;
	struct slab_summary_entry *entries = depot->summary_entries;

	if (depot->old_zone_count > 1) {
		slab_count_t entry_number;

		for (entry_number = 0; entry_number < MAX_VDO_SLABS; entry_number++) {
			if (zone != 0) {
				memcpy(entries + entry_number,
				       entries + (zone * MAX_VDO_SLABS) + entry_number,
				       sizeof(struct slab_summary_entry));
			}

			zone++;
			if (zone == depot->old_zone_count)
				zone = 0;
		}
	}

	/* Copy the combined data to each zone's region of the buffer. */
	for (zone = 1; zone < MAX_VDO_PHYSICAL_ZONES; zone++) {
		memcpy(entries + (zone * MAX_VDO_SLABS), entries,
		       MAX_VDO_SLABS * sizeof(struct slab_summary_entry));
	}
}
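
/*
 * Example of what combine_summaries() does (the zone count here is hypothetical): slabs are
 * assigned to zones round-robin, so with old_zone_count = 2 the saved summary holds slab 0's
 * entry in zone 0's region, slab 1's entry in zone 1's region, slab 2's in zone 0's, and so on.
 * The first loop gathers those interleaved entries into zone 0's region; the second loop then
 * copies the combined region to every other zone so each allocator sees a complete, identical
 * summary.
 */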

/**
 * finish_loading_summary() - Finish loading slab summary data.
 * @completion: The vio which was used to read the summary data.
 *
 * Combines the slab summary data from all the previously written zones and copies the combined
 * summary to each partition's data region. Then writes the combined summary back out to disk. This
 * callback is registered in load_summary_endio().
 */
static void finish_loading_summary(struct vdo_completion *completion)
{
	struct slab_depot *depot = completion->vdo->depot;

	/* Combine the summary from each zone so each zone is correct for all slabs. */
	combine_summaries(depot);

	/* Write the combined summary back out. */
	vdo_submit_metadata_vio(as_vio(completion), depot->summary_origin,
				write_summary_endio, handle_combining_error,
				REQ_OP_WRITE);
}

static void load_summary_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct vdo *vdo = vio->completion.vdo;

	continue_vio_after_io(vio, finish_loading_summary,
			      vdo->thread_config.admin_thread);
}

/**
 * load_slab_summary() - The preamble of a load operation.
 *
 * Implements vdo_action_preamble_fn.
 */
static void load_slab_summary(void *context, struct vdo_completion *parent)
{
	int result;
	struct vio *vio;
	struct slab_depot *depot = context;
	const struct admin_state_code *operation =
		vdo_get_current_manager_operation(depot->action_manager);

	result = create_multi_block_metadata_vio(depot->vdo, VIO_TYPE_SLAB_SUMMARY,
						 VIO_PRIORITY_METADATA, parent,
						 VDO_SLAB_SUMMARY_BLOCKS,
						 (char *) depot->summary_entries, &vio);
	if (result != VDO_SUCCESS) {
		vdo_fail_completion(parent, result);
		return;
	}

	if ((operation == VDO_ADMIN_STATE_FORMATTING) ||
	    (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD)) {
		finish_loading_summary(&vio->completion);
		return;
	}

	vdo_submit_metadata_vio(vio, depot->summary_origin, load_summary_endio,
				handle_combining_error, REQ_OP_READ);
}

/* Implements vdo_zone_action_fn. */
static void load_allocator(void *context, zone_count_t zone_number,
			   struct vdo_completion *parent)
{
	struct slab_depot *depot = context;

	vdo_start_loading(&depot->allocators[zone_number].state,
			  vdo_get_current_manager_operation(depot->action_manager),
			  parent, initiate_load);
}

/**
 * vdo_load_slab_depot() - Asynchronously load any slab depot state that isn't included in the
 *                         super_block component.
 * @depot: The depot to load.
 * @operation: The type of load to perform.
 * @parent: The completion to notify when the load is complete.
 * @context: Additional context for the load operation; may be NULL.
 *
 * This method may be called only before entering normal operation from the load thread.
 */
void vdo_load_slab_depot(struct slab_depot *depot,
			 const struct admin_state_code *operation,
			 struct vdo_completion *parent, void *context)
{
	if (!vdo_assert_load_operation(operation, parent))
		return;

	vdo_schedule_operation_with_context(depot->action_manager, operation,
					    load_slab_summary, load_allocator,
					    NULL, context, parent);
}

/* Implements vdo_zone_action_fn. */
static void prepare_to_allocate(void *context, zone_count_t zone_number,
				struct vdo_completion *parent)
{
	struct slab_depot *depot = context;
	struct block_allocator *allocator = &depot->allocators[zone_number];
	int result;

	result = vdo_prepare_slabs_for_allocation(allocator);
	if (result != VDO_SUCCESS) {
		vdo_fail_completion(parent, result);
		return;
	}

	scrub_slabs(allocator, parent);
}

/**
 * vdo_prepare_slab_depot_to_allocate() - Prepare the slab depot to come online and start
 *                                        allocating blocks.
 * @depot: The depot to prepare.
 * @load_type: The load type.
 * @parent: The completion to notify when the operation is complete.
 *
 * This method may be called only before entering normal operation from the load thread. It must be
 * called before allocation may proceed.
 */
void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
					enum slab_depot_load_type load_type,
					struct vdo_completion *parent)
{
	depot->load_type = load_type;
	atomic_set(&depot->zones_to_scrub, depot->zone_count);
	vdo_schedule_action(depot->action_manager, NULL,
			    prepare_to_allocate, NULL, parent);
}

/**
 * vdo_update_slab_depot_size() - Update the slab depot to reflect its new size in memory.
 * @depot: The depot to update.
 *
 * This size is saved to disk as part of the super block.
 */
void vdo_update_slab_depot_size(struct slab_depot *depot)
{
	depot->last_block = depot->new_last_block;
}

/**
 * vdo_prepare_to_grow_slab_depot() - Allocate new memory needed for a resize of a slab depot to
 *                                    the given size.
 * @depot: The depot to prepare to resize.
 * @partition: The new depot partition.
 *
 * Return: VDO_SUCCESS or an error.
 */
int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot,
				   const struct partition *partition)
{
	struct slab_depot_state_2_0 new_state;
	int result;
	slab_count_t new_slab_count;

	if ((partition->count >> depot->slab_size_shift) <= depot->slab_count)
		return VDO_INCREMENT_TOO_SMALL;

	/* Generate the depot configuration for the new block count. */
	VDO_ASSERT_LOG_ONLY(depot->first_block == partition->offset,
			    "New slab depot partition doesn't change origin");
	result = vdo_configure_slab_depot(partition, depot->slab_config,
					  depot->zone_count, &new_state);
	if (result != VDO_SUCCESS)
		return result;

	new_slab_count = vdo_compute_slab_count(depot->first_block,
						new_state.last_block,
						depot->slab_size_shift);
	if (new_slab_count <= depot->slab_count)
		return vdo_log_error_strerror(VDO_INCREMENT_TOO_SMALL,
					      "Depot can only grow");
	if (new_slab_count == depot->new_slab_count) {
		/* Check it out, we've already got all the new slabs allocated! */
		return VDO_SUCCESS;
	}

	vdo_abandon_new_slabs(depot);
	result = allocate_slabs(depot, new_slab_count);
	if (result != VDO_SUCCESS) {
		vdo_abandon_new_slabs(depot);
		return result;
	}

	depot->new_size = partition->count;
	depot->old_last_block = depot->last_block;
	depot->new_last_block = new_state.last_block;

	return VDO_SUCCESS;
}
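
/*
 * Example of the minimum-growth check in vdo_prepare_to_grow_slab_depot() (numbers hypothetical):
 * with slab_size_shift = 15, a new partition of 10,000,000 blocks can hold at most
 * 10000000 >> 15 = 305 slabs, so if the depot already has 305 or more slabs the resize is rejected
 * with VDO_INCREMENT_TOO_SMALL before any new configuration is computed.
 */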

/**
 * finish_registration() - Finish registering new slabs now that all of the allocators have
 *                         received their new slabs.
 *
 * Implements vdo_action_conclusion_fn.
 */
static int finish_registration(void *context)
{
	struct slab_depot *depot = context;

	WRITE_ONCE(depot->slab_count, depot->new_slab_count);
	vdo_free(depot->slabs);
	depot->slabs = depot->new_slabs;
	depot->new_slabs = NULL;
	depot->new_slab_count = 0;
	return VDO_SUCCESS;
}

/* Implements vdo_zone_action_fn. */
static void register_new_slabs(void *context, zone_count_t zone_number,
			       struct vdo_completion *parent)
{
	struct slab_depot *depot = context;
	struct block_allocator *allocator = &depot->allocators[zone_number];
	slab_count_t i;

	for (i = depot->slab_count; i < depot->new_slab_count; i++) {
		struct vdo_slab *slab = depot->new_slabs[i];

		if (slab->allocator == allocator)
			register_slab_with_allocator(allocator, slab);
	}

	vdo_finish_completion(parent);
}

/**
 * vdo_use_new_slabs() - Use the new slabs allocated for resize.
 * @depot: The depot.
 * @parent: The object to notify when complete.
 */
void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent)
{
	VDO_ASSERT_LOG_ONLY(depot->new_slabs != NULL, "Must have new slabs to use");
	vdo_schedule_operation(depot->action_manager,
			       VDO_ADMIN_STATE_SUSPENDED_OPERATION,
			       NULL, register_new_slabs,
			       finish_registration, parent);
}

/**
 * stop_scrubbing() - Tell the scrubber to stop scrubbing after it finishes the slab it is
 *                    currently working on.
 * @allocator: The block allocator owning the scrubber to stop.
 */
static void stop_scrubbing(struct block_allocator *allocator)
{
	struct slab_scrubber *scrubber = &allocator->scrubber;

	if (vdo_is_state_quiescent(&scrubber->admin_state)) {
		vdo_finish_completion(&allocator->completion);
	} else {
		vdo_start_draining(&scrubber->admin_state,
				   VDO_ADMIN_STATE_SUSPENDING,
				   &allocator->completion, NULL);
	}
}

/* Implements vdo_admin_initiator_fn. */
static void initiate_summary_drain(struct admin_state *state)
{
	check_summary_drain_complete(container_of(state, struct block_allocator,
						  summary_state));
}

static void do_drain_step(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);

	vdo_prepare_completion_for_requeue(&allocator->completion, do_drain_step,
					   handle_operation_error, allocator->thread_id,
					   NULL);
	switch (++allocator->drain_step) {
	case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER:
		stop_scrubbing(allocator);
		return;

	case VDO_DRAIN_ALLOCATOR_STEP_SLABS:
		apply_to_slabs(allocator, do_drain_step);
		return;

	case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY:
		vdo_start_draining(&allocator->summary_state,
				   vdo_get_admin_state_code(&allocator->state),
				   completion, initiate_summary_drain);
		return;

	case VDO_DRAIN_ALLOCATOR_STEP_FINISHED:
		VDO_ASSERT_LOG_ONLY(!is_vio_pool_busy(allocator->vio_pool),
				    "vio pool not busy");
		vdo_finish_draining_with_result(&allocator->state, completion->result);
		return;

	default:
		vdo_finish_draining_with_result(&allocator->state, UDS_BAD_STATE);
	}
}
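
/*
 * Note on ordering: do_drain_step() advances through the allocator's drain steps with
 * ++allocator->drain_step, while do_resume_step() below walks the same enum backwards with
 * --allocator->drain_step, so the summary, slabs, and scrubber are resumed in the reverse of the
 * order in which they were drained.
 */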

/* Implements vdo_admin_initiator_fn. */
static void initiate_drain(struct admin_state *state)
{
	struct block_allocator *allocator =
		container_of(state, struct block_allocator, state);

	allocator->drain_step = VDO_DRAIN_ALLOCATOR_START;
	do_drain_step(&allocator->completion);
}

/*
 * Drain all allocator I/O. Depending upon the type of drain, some or all dirty metadata may be
 * written to disk. The type of drain will be determined from the state of the allocator's depot.
 *
 * Implements vdo_zone_action_fn.
 */
static void drain_allocator(void *context, zone_count_t zone_number,
			    struct vdo_completion *parent)
{
	struct slab_depot *depot = context;

	vdo_start_draining(&depot->allocators[zone_number].state,
			   vdo_get_current_manager_operation(depot->action_manager),
			   parent, initiate_drain);
}

/**
 * vdo_drain_slab_depot() - Drain all slab depot I/O.
 * @depot: The depot to drain.
 * @operation: The drain operation (flush, rebuild, suspend, or save).
 * @parent: The completion to finish when the drain is complete.
 *
 * If saving, or flushing, all dirty depot metadata will be written out. If saving or suspending,
 * the depot will be left in a suspended state.
 */
void vdo_drain_slab_depot(struct slab_depot *depot,
			  const struct admin_state_code *operation,
			  struct vdo_completion *parent)
{
	vdo_schedule_operation(depot->action_manager, operation,
			       NULL, drain_allocator, NULL, parent);
}

/**
 * resume_scrubbing() - Tell the scrubber to resume scrubbing if it has been stopped.
 * @allocator: The allocator being resumed.
 */
static void resume_scrubbing(struct block_allocator *allocator)
{
	int result;
	struct slab_scrubber *scrubber = &allocator->scrubber;

	if (!has_slabs_to_scrub(scrubber)) {
		vdo_finish_completion(&allocator->completion);
		return;
	}

	result = vdo_resume_if_quiescent(&scrubber->admin_state);
	if (result != VDO_SUCCESS) {
		vdo_fail_completion(&allocator->completion, result);
		return;
	}

	scrub_next_slab(scrubber);
	vdo_finish_completion(&allocator->completion);
}

static void do_resume_step(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);

	vdo_prepare_completion_for_requeue(&allocator->completion, do_resume_step,
					   handle_operation_error,
					   allocator->thread_id, NULL);
	switch (--allocator->drain_step) {
	case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY:
		vdo_fail_completion(completion,
				    vdo_resume_if_quiescent(&allocator->summary_state));
		return;

	case VDO_DRAIN_ALLOCATOR_STEP_SLABS:
		apply_to_slabs(allocator, do_resume_step);
		return;

	case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER:
		resume_scrubbing(allocator);
		return;

	case VDO_DRAIN_ALLOCATOR_START:
		vdo_finish_resuming_with_result(&allocator->state, completion->result);
		return;

	default:
		vdo_finish_resuming_with_result(&allocator->state, UDS_BAD_STATE);
	}
}

/* Implements vdo_admin_initiator_fn. */
static void initiate_resume(struct admin_state *state)
{
	struct block_allocator *allocator =
		container_of(state, struct block_allocator, state);

	allocator->drain_step = VDO_DRAIN_ALLOCATOR_STEP_FINISHED;
	do_resume_step(&allocator->completion);
}

/* Implements vdo_zone_action_fn. */
static void resume_allocator(void *context, zone_count_t zone_number,
			     struct vdo_completion *parent)
{
	struct slab_depot *depot = context;

	vdo_start_resuming(&depot->allocators[zone_number].state,
			   vdo_get_current_manager_operation(depot->action_manager),
			   parent, initiate_resume);
}

/**
 * vdo_resume_slab_depot() - Resume a suspended slab depot.
 * @depot: The depot to resume.
 * @parent: The completion to finish when the depot has resumed.
 */
void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent)
{
	if (vdo_is_read_only(depot->vdo)) {
		vdo_continue_completion(parent, VDO_READ_ONLY);
		return;
	}

	vdo_schedule_operation(depot->action_manager, VDO_ADMIN_STATE_RESUMING,
			       NULL, resume_allocator, NULL, parent);
}

/**
 * vdo_commit_oldest_slab_journal_tail_blocks() - Commit all dirty tail blocks which are locking a
 *                                                given recovery journal block.
 * @depot: The depot.
 * @recovery_block_number: The sequence number of the recovery journal block whose locks should be
 *                         released.
 *
 * Context: This method must be called from the journal zone thread.
 */
void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
						sequence_number_t recovery_block_number)
{
	if (depot == NULL)
		return;

	depot->new_release_request = recovery_block_number;
	vdo_schedule_default_action(depot->action_manager);
}

/* Implements vdo_zone_action_fn. */
static void scrub_all_unrecovered_slabs(void *context, zone_count_t zone_number,
					struct vdo_completion *parent)
{
	struct slab_depot *depot = context;

	scrub_slabs(&depot->allocators[zone_number], NULL);
	vdo_launch_completion(parent);
}

/**
 * vdo_scrub_all_unrecovered_slabs() - Scrub all unrecovered slabs.
 * @depot: The depot to scrub.
 * @parent: The object to notify when scrubbing has been launched for all zones.
 */
void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
				     struct vdo_completion *parent)
{
	vdo_schedule_action(depot->action_manager, NULL,
			    scrub_all_unrecovered_slabs,
			    NULL, parent);
}

/**
 * get_block_allocator_statistics() - Get the total of the statistics from all the block allocators
 *                                    in the depot.
 * @depot: The slab depot.
 *
 * Return: The statistics from all block allocators in the depot.
 */
static struct block_allocator_statistics __must_check
get_block_allocator_statistics(const struct slab_depot *depot)
{
	struct block_allocator_statistics totals;
	zone_count_t zone;

	memset(&totals, 0, sizeof(totals));

	for (zone = 0; zone < depot->zone_count; zone++) {
		const struct block_allocator *allocator = &depot->allocators[zone];
		const struct block_allocator_statistics *stats = &allocator->statistics;

		totals.slab_count += allocator->slab_count;
		totals.slabs_opened += READ_ONCE(stats->slabs_opened);
		totals.slabs_reopened += READ_ONCE(stats->slabs_reopened);
	}

	return totals;
}

/**
 * get_ref_counts_statistics() - Get the cumulative ref_counts statistics for the depot.
 * @depot: The slab depot.
 *
 * Return: The cumulative statistics for all ref_counts in the depot.
 */
static struct ref_counts_statistics __must_check
get_ref_counts_statistics(const struct slab_depot *depot)
{
	struct ref_counts_statistics totals;
	zone_count_t zone;

	memset(&totals, 0, sizeof(totals));

	for (zone = 0; zone < depot->zone_count; zone++) {
		totals.blocks_written +=
			READ_ONCE(depot->allocators[zone].ref_counts_statistics.blocks_written);
	}

	return totals;
}

/**
 * get_slab_journal_statistics() - Get the aggregated slab journal statistics for the depot.
 * @depot: The slab depot.
 *
 * Return: The aggregated statistics for all slab journals in the depot.
 */
static struct slab_journal_statistics __must_check
get_slab_journal_statistics(const struct slab_depot *depot)
{
	struct slab_journal_statistics totals;
	zone_count_t zone;

	memset(&totals, 0, sizeof(totals));

	for (zone = 0; zone < depot->zone_count; zone++) {
		const struct slab_journal_statistics *stats =
			&depot->allocators[zone].slab_journal_statistics;

		totals.disk_full_count += READ_ONCE(stats->disk_full_count);
		totals.flush_count += READ_ONCE(stats->flush_count);
		totals.blocked_count += READ_ONCE(stats->blocked_count);
		totals.blocks_written += READ_ONCE(stats->blocks_written);
		totals.tail_busy_count += READ_ONCE(stats->tail_busy_count);
	}

	return totals;
}

/**
 * vdo_get_slab_depot_statistics() - Get all the vdo_statistics fields that are properties of the
 *                                   slab depot.
 * @depot: The slab depot.
 * @stats: The vdo statistics structure to partially fill.
 */
void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
				   struct vdo_statistics *stats)
{
	slab_count_t slab_count = READ_ONCE(depot->slab_count);
	slab_count_t unrecovered = 0;
	zone_count_t zone;

	for (zone = 0; zone < depot->zone_count; zone++) {
		/* The allocators are responsible for thread safety. */
		unrecovered += READ_ONCE(depot->allocators[zone].scrubber.slab_count);
	}

	stats->recovery_percentage = (slab_count - unrecovered) * 100 / slab_count;
	stats->allocator = get_block_allocator_statistics(depot);
	stats->ref_counts = get_ref_counts_statistics(depot);
	stats->slab_journal = get_slab_journal_statistics(depot);
	stats->slab_summary = (struct slab_summary_statistics) {
		.blocks_written = atomic64_read(&depot->summary_statistics.blocks_written),
	};
}
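
/*
 * Example of the recovery percentage computed in vdo_get_slab_depot_statistics(), with a
 * hypothetical 640-slab depot in which the scrubbers still hold 64 unrecovered slabs:
 * recovery_percentage = (640 - 64) * 100 / 640 = 90.
 */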

/**
 * vdo_dump_slab_depot() - Dump the slab depot, in a thread-unsafe fashion.
 * @depot: The slab depot.
 */
void vdo_dump_slab_depot(const struct slab_depot *depot)
{
	vdo_log_info("vdo slab depot");
	vdo_log_info("  zone_count=%u old_zone_count=%u slabCount=%u active_release_request=%llu new_release_request=%llu",
		     (unsigned int) depot->zone_count,
		     (unsigned int) depot->old_zone_count, READ_ONCE(depot->slab_count),
		     (unsigned long long) depot->active_release_request,
		     (unsigned long long) depot->new_release_request);
}