// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */
5 #include "volume-index.h"
7 #include <linux/bitops.h>
8 #include <linux/bits.h>
9 #include <linux/cache.h>
10 #include <linux/compiler.h>
11 #include <linux/log2.h>
15 #include "memory-alloc.h"
17 #include "permassert.h"
18 #include "thread-utils.h"
22 #include "hash-utils.h"
/*
 * The volume index is a combination of two separate subindexes, one containing sparse hook entries
 * (retained for all chapters), and one containing the remaining entries (retained only for the
 * dense chapters). If there are no sparse chapters, only the non-hook sub index is used, and it
 * will contain all records for all chapters.
 *
 * The volume index is also divided into zones, with one thread operating on each zone. Each
 * incoming request is dispatched to the appropriate thread, and then to the appropriate subindex.
 * Each delta list is handled by a single zone. To ensure that the distribution of delta lists to
 * zones doesn't underflow (leaving some zone with no delta lists), the minimum number of delta
 * lists must be the square of the maximum zone count for both subindexes.
 *
 * Each subindex zone is a delta index where the payload is a chapter number. The volume index can
 * compute the delta list number, address, and zone number from the record name in order to
 * dispatch record handling to the correct structures.
 *
 * Most operations that use all the zones take place either before request processing is allowed,
 * or after all requests have been flushed in order to shut down. The only multi-threaded operation
 * supported during normal operation is the uds_lookup_volume_index_name() method, used to determine
 * whether a new chapter should be loaded into the sparse index cache. This operation only uses the
 * sparse hook subindex, and the zone mutexes are used to make this operation safe.
 *
 * There are three ways of expressing chapter numbers in the volume index: virtual, index, and
 * rolling. The interface to the volume index uses virtual chapter numbers, which are 64 bits long.
 * Internally the subindex stores only the minimal number of bits necessary by masking away the
 * high-order bits. When the index needs to deal with ordering of index chapter numbers, as when
 * flushing entries from older chapters, it rolls the index chapter number around so that the
 * smallest one in use is mapped to 0. See convert_index_to_virtual() or flush_invalid_entries()
 * for an example of this technique.
 *
 * For efficiency, when older chapter numbers become invalid, the index does not immediately remove
 * the invalidated entries. Instead it lazily removes them from a given delta list the next time it
 * walks that list during normal operation. Because of this, the index size must be increased
 * somewhat to accommodate all the invalid entries that have not yet been removed. For the standard
 * index sizes, this requires about 4 chapters of old entries per 1024 chapters of valid entries in
 * the index.
 */
63 struct sub_index_parameters
{
64 /* The number of bits in address mask */
66 /* The number of bits in chapter number */
70 /* The number of delta lists */
72 /* The number of chapters used */
74 /* The number of bits per chapter */
75 size_t chapter_size_in_bits
;
76 /* The number of bytes of delta list memory */
78 /* The number of bytes the index should keep free at all times */
79 size_t target_free_bytes
;
83 /* The hook subindex configuration */
84 struct uds_configuration hook_config
;
85 struct index_geometry hook_geometry
;
87 /* The non-hook subindex configuration */
88 struct uds_configuration non_hook_config
;
89 struct index_geometry non_hook_geometry
;
92 struct chapter_range
{
99 static const char MAGIC_START_5
[] = "MI5-0005";
101 struct sub_index_data
{
102 char magic
[MAGIC_SIZE
]; /* MAGIC_START_5 */
104 u64 virtual_chapter_low
;
105 u64 virtual_chapter_high
;
110 static const char MAGIC_START_6
[] = "MI6-0001";
112 struct volume_index_data
{
113 char magic
[MAGIC_SIZE
]; /* MAGIC_START_6 */
114 u32 sparse_sample_rate
;
117 static inline u32
extract_address(const struct volume_sub_index
*sub_index
,
118 const struct uds_record_name
*name
)
120 return uds_extract_volume_index_bytes(name
) & sub_index
->address_mask
;
123 static inline u32
extract_dlist_num(const struct volume_sub_index
*sub_index
,
124 const struct uds_record_name
*name
)
126 u64 bits
= uds_extract_volume_index_bytes(name
);
128 return (bits
>> sub_index
->address_bits
) % sub_index
->list_count
;
131 static inline const struct volume_sub_index_zone
*
132 get_zone_for_record(const struct volume_index_record
*record
)
134 return &record
->sub_index
->zones
[record
->zone_number
];
137 static inline u64
convert_index_to_virtual(const struct volume_index_record
*record
,
140 const struct volume_sub_index_zone
*volume_index_zone
= get_zone_for_record(record
);
141 u32 rolling_chapter
= ((index_chapter
- volume_index_zone
->virtual_chapter_low
) &
142 record
->sub_index
->chapter_mask
);
144 return volume_index_zone
->virtual_chapter_low
+ rolling_chapter
;
147 static inline u32
convert_virtual_to_index(const struct volume_sub_index
*sub_index
,
150 return virtual_chapter
& sub_index
->chapter_mask
;
153 static inline bool is_virtual_chapter_indexed(const struct volume_index_record
*record
,
156 const struct volume_sub_index_zone
*volume_index_zone
= get_zone_for_record(record
);
158 return ((virtual_chapter
>= volume_index_zone
->virtual_chapter_low
) &&
159 (virtual_chapter
<= volume_index_zone
->virtual_chapter_high
));
162 static inline bool has_sparse(const struct volume_index
*volume_index
)
164 return volume_index
->sparse_sample_rate
> 0;
167 bool uds_is_volume_index_sample(const struct volume_index
*volume_index
,
168 const struct uds_record_name
*name
)
170 if (!has_sparse(volume_index
))
173 return (uds_extract_sampling_bytes(name
) % volume_index
->sparse_sample_rate
) == 0;
176 static inline const struct volume_sub_index
*
177 get_volume_sub_index(const struct volume_index
*volume_index
,
178 const struct uds_record_name
*name
)
180 return (uds_is_volume_index_sample(volume_index
, name
) ?
181 &volume_index
->vi_hook
:
182 &volume_index
->vi_non_hook
);
185 static unsigned int get_volume_sub_index_zone(const struct volume_sub_index
*sub_index
,
186 const struct uds_record_name
*name
)
188 return extract_dlist_num(sub_index
, name
) / sub_index
->delta_index
.lists_per_zone
;
/* Map a record name to its zone in whichever subindex owns it. */
unsigned int uds_get_volume_index_zone(const struct volume_index *volume_index,
				       const struct uds_record_name *name)
{
	const struct volume_sub_index *sub_index =
		get_volume_sub_index(volume_index, name);

	return get_volume_sub_index_zone(sub_index, name);
}
197 #define DELTA_LIST_SIZE 256
199 static int compute_volume_sub_index_parameters(const struct uds_configuration
*config
,
200 struct sub_index_parameters
*params
)
202 u64 entries_in_volume_index
, address_span
;
203 u32 chapters_in_volume_index
, invalid_chapters
;
204 u32 rounded_chapters
;
205 u64 delta_list_records
;
207 u64 index_size_in_bits
;
208 size_t expected_index_size
;
209 u64 min_delta_lists
= MAX_ZONES
* MAX_ZONES
;
210 struct index_geometry
*geometry
= config
->geometry
;
211 u64 records_per_chapter
= geometry
->records_per_chapter
;
213 params
->chapter_count
= geometry
->chapters_per_volume
;
215 * Make sure that the number of delta list records in the volume index does not change when
216 * the volume is reduced by one chapter. This preserves the mapping from name to volume
219 rounded_chapters
= params
->chapter_count
;
220 if (uds_is_reduced_index_geometry(geometry
))
221 rounded_chapters
+= 1;
222 delta_list_records
= records_per_chapter
* rounded_chapters
;
223 address_count
= config
->volume_index_mean_delta
* DELTA_LIST_SIZE
;
224 params
->list_count
= max(delta_list_records
/ DELTA_LIST_SIZE
, min_delta_lists
);
225 params
->address_bits
= bits_per(address_count
- 1);
226 params
->chapter_bits
= bits_per(rounded_chapters
- 1);
227 if ((u32
) params
->list_count
!= params
->list_count
) {
228 return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT
,
229 "cannot initialize volume index with %llu delta lists",
230 (unsigned long long) params
->list_count
);
233 if (params
->address_bits
> 31) {
234 return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT
,
235 "cannot initialize volume index with %u address bits",
236 params
->address_bits
);
240 * The probability that a given delta list is not touched during the writing of an entire
243 * double p_not_touched = pow((double) (params->list_count - 1) / params->list_count,
244 * records_per_chapter);
246 * For the standard index sizes, about 78% of the delta lists are not touched, and
247 * therefore contain old index entries that have not been eliminated by the lazy LRU
248 * processing. Then the number of old index entries that accumulate over the entire index,
249 * in terms of full chapters worth of entries, is:
251 * double invalid_chapters = p_not_touched / (1.0 - p_not_touched);
253 * For the standard index sizes, the index needs about 3.5 chapters of space for the old
254 * entries in a 1024 chapter index, so round this up to use 4 chapters per 1024 chapters in
257 invalid_chapters
= max(rounded_chapters
/ 256, 2U);
258 chapters_in_volume_index
= rounded_chapters
+ invalid_chapters
;
259 entries_in_volume_index
= records_per_chapter
* chapters_in_volume_index
;
261 address_span
= params
->list_count
<< params
->address_bits
;
262 params
->mean_delta
= address_span
/ entries_in_volume_index
;
265 * Compute the expected size of a full index, then set the total memory to be 6% larger
266 * than that expected size. This number should be large enough that there are not many
267 * rebalances when the index is full.
269 params
->chapter_size_in_bits
= uds_compute_delta_index_size(records_per_chapter
,
271 params
->chapter_bits
);
272 index_size_in_bits
= params
->chapter_size_in_bits
* chapters_in_volume_index
;
273 expected_index_size
= index_size_in_bits
/ BITS_PER_BYTE
;
274 params
->memory_size
= expected_index_size
* 106 / 100;
276 params
->target_free_bytes
= expected_index_size
/ 20;
280 static void uninitialize_volume_sub_index(struct volume_sub_index
*sub_index
)
282 vdo_free(vdo_forget(sub_index
->flush_chapters
));
283 vdo_free(vdo_forget(sub_index
->zones
));
284 uds_uninitialize_delta_index(&sub_index
->delta_index
);
287 void uds_free_volume_index(struct volume_index
*volume_index
)
289 if (volume_index
== NULL
)
292 if (volume_index
->zones
!= NULL
)
293 vdo_free(vdo_forget(volume_index
->zones
));
295 uninitialize_volume_sub_index(&volume_index
->vi_non_hook
);
296 uninitialize_volume_sub_index(&volume_index
->vi_hook
);
297 vdo_free(volume_index
);
301 static int compute_volume_sub_index_save_bytes(const struct uds_configuration
*config
,
304 struct sub_index_parameters params
= { .address_bits
= 0 };
307 result
= compute_volume_sub_index_parameters(config
, ¶ms
);
308 if (result
!= UDS_SUCCESS
)
311 *bytes
= (sizeof(struct sub_index_data
) + params
.list_count
* sizeof(u64
) +
312 uds_compute_delta_index_save_bytes(params
.list_count
,
313 params
.memory_size
));
317 /* This function is only useful if the configuration includes sparse chapters. */
318 static void split_configuration(const struct uds_configuration
*config
,
319 struct split_config
*split
)
321 u64 sample_rate
, sample_records
;
322 u64 dense_chapters
, sparse_chapters
;
324 /* Start with copies of the base configuration. */
325 split
->hook_config
= *config
;
326 split
->hook_geometry
= *config
->geometry
;
327 split
->hook_config
.geometry
= &split
->hook_geometry
;
328 split
->non_hook_config
= *config
;
329 split
->non_hook_geometry
= *config
->geometry
;
330 split
->non_hook_config
.geometry
= &split
->non_hook_geometry
;
332 sample_rate
= config
->sparse_sample_rate
;
333 sparse_chapters
= config
->geometry
->sparse_chapters_per_volume
;
334 dense_chapters
= config
->geometry
->chapters_per_volume
- sparse_chapters
;
335 sample_records
= config
->geometry
->records_per_chapter
/ sample_rate
;
337 /* Adjust the number of records indexed for each chapter. */
338 split
->hook_geometry
.records_per_chapter
= sample_records
;
339 split
->non_hook_geometry
.records_per_chapter
-= sample_records
;
341 /* Adjust the number of chapters indexed. */
342 split
->hook_geometry
.sparse_chapters_per_volume
= 0;
343 split
->non_hook_geometry
.sparse_chapters_per_volume
= 0;
344 split
->non_hook_geometry
.chapters_per_volume
= dense_chapters
;
347 static int compute_volume_index_save_bytes(const struct uds_configuration
*config
,
350 size_t hook_bytes
, non_hook_bytes
;
351 struct split_config split
;
354 if (!uds_is_sparse_index_geometry(config
->geometry
))
355 return compute_volume_sub_index_save_bytes(config
, bytes
);
357 split_configuration(config
, &split
);
358 result
= compute_volume_sub_index_save_bytes(&split
.hook_config
, &hook_bytes
);
359 if (result
!= UDS_SUCCESS
)
362 result
= compute_volume_sub_index_save_bytes(&split
.non_hook_config
,
364 if (result
!= UDS_SUCCESS
)
367 *bytes
= sizeof(struct volume_index_data
) + hook_bytes
+ non_hook_bytes
;
371 int uds_compute_volume_index_save_blocks(const struct uds_configuration
*config
,
372 size_t block_size
, u64
*block_count
)
377 result
= compute_volume_index_save_bytes(config
, &bytes
);
378 if (result
!= UDS_SUCCESS
)
381 bytes
+= sizeof(struct delta_list_save_info
);
382 *block_count
= DIV_ROUND_UP(bytes
, block_size
) + MAX_ZONES
;
386 /* Flush invalid entries while walking the delta list. */
387 static inline int flush_invalid_entries(struct volume_index_record
*record
,
388 struct chapter_range
*flush_range
,
389 u32
*next_chapter_to_invalidate
)
393 result
= uds_next_delta_index_entry(&record
->delta_entry
);
394 if (result
!= UDS_SUCCESS
)
397 while (!record
->delta_entry
.at_end
) {
398 u32 index_chapter
= uds_get_delta_entry_value(&record
->delta_entry
);
399 u32 relative_chapter
= ((index_chapter
- flush_range
->chapter_start
) &
400 record
->sub_index
->chapter_mask
);
402 if (likely(relative_chapter
>= flush_range
->chapter_count
)) {
403 if (relative_chapter
< *next_chapter_to_invalidate
)
404 *next_chapter_to_invalidate
= relative_chapter
;
408 result
= uds_remove_delta_index_entry(&record
->delta_entry
);
409 if (result
!= UDS_SUCCESS
)
416 /* Find the matching record, or the list offset where the record would go. */
417 static int get_volume_index_entry(struct volume_index_record
*record
, u32 list_number
,
418 u32 key
, struct chapter_range
*flush_range
)
420 struct volume_index_record other_record
;
421 const struct volume_sub_index
*sub_index
= record
->sub_index
;
422 u32 next_chapter_to_invalidate
= sub_index
->chapter_mask
;
425 result
= uds_start_delta_index_search(&sub_index
->delta_index
, list_number
, 0,
426 &record
->delta_entry
);
427 if (result
!= UDS_SUCCESS
)
431 result
= flush_invalid_entries(record
, flush_range
,
432 &next_chapter_to_invalidate
);
433 if (result
!= UDS_SUCCESS
)
435 } while (!record
->delta_entry
.at_end
&& (key
> record
->delta_entry
.key
));
437 result
= uds_remember_delta_index_offset(&record
->delta_entry
);
438 if (result
!= UDS_SUCCESS
)
441 /* Check any collision records for a more precise match. */
442 other_record
= *record
;
443 if (!other_record
.delta_entry
.at_end
&& (key
== other_record
.delta_entry
.key
)) {
445 u8 collision_name
[UDS_RECORD_NAME_SIZE
];
447 result
= flush_invalid_entries(&other_record
, flush_range
,
448 &next_chapter_to_invalidate
);
449 if (result
!= UDS_SUCCESS
)
452 if (other_record
.delta_entry
.at_end
||
453 !other_record
.delta_entry
.is_collision
)
456 result
= uds_get_delta_entry_collision(&other_record
.delta_entry
,
458 if (result
!= UDS_SUCCESS
)
461 if (memcmp(collision_name
, record
->name
, UDS_RECORD_NAME_SIZE
) == 0) {
462 *record
= other_record
;
467 while (!other_record
.delta_entry
.at_end
) {
468 result
= flush_invalid_entries(&other_record
, flush_range
,
469 &next_chapter_to_invalidate
);
470 if (result
!= UDS_SUCCESS
)
473 next_chapter_to_invalidate
+= flush_range
->chapter_start
;
474 next_chapter_to_invalidate
&= sub_index
->chapter_mask
;
475 flush_range
->chapter_start
= next_chapter_to_invalidate
;
476 flush_range
->chapter_count
= 0;
480 static int get_volume_sub_index_record(struct volume_sub_index
*sub_index
,
481 const struct uds_record_name
*name
,
482 struct volume_index_record
*record
)
485 const struct volume_sub_index_zone
*volume_index_zone
;
486 u32 address
= extract_address(sub_index
, name
);
487 u32 delta_list_number
= extract_dlist_num(sub_index
, name
);
488 u64 flush_chapter
= sub_index
->flush_chapters
[delta_list_number
];
490 record
->sub_index
= sub_index
;
491 record
->mutex
= NULL
;
493 record
->zone_number
= delta_list_number
/ sub_index
->delta_index
.lists_per_zone
;
494 volume_index_zone
= get_zone_for_record(record
);
496 if (flush_chapter
< volume_index_zone
->virtual_chapter_low
) {
497 struct chapter_range range
;
498 u64 flush_count
= volume_index_zone
->virtual_chapter_low
- flush_chapter
;
500 range
.chapter_start
= convert_virtual_to_index(sub_index
, flush_chapter
);
501 range
.chapter_count
= (flush_count
> sub_index
->chapter_mask
?
502 sub_index
->chapter_mask
+ 1 :
504 result
= get_volume_index_entry(record
, delta_list_number
, address
,
506 flush_chapter
= convert_index_to_virtual(record
, range
.chapter_start
);
507 if (flush_chapter
> volume_index_zone
->virtual_chapter_high
)
508 flush_chapter
= volume_index_zone
->virtual_chapter_high
;
509 sub_index
->flush_chapters
[delta_list_number
] = flush_chapter
;
511 result
= uds_get_delta_index_entry(&sub_index
->delta_index
,
512 delta_list_number
, address
,
513 name
->name
, &record
->delta_entry
);
516 if (result
!= UDS_SUCCESS
)
520 (!record
->delta_entry
.at_end
&& (record
->delta_entry
.key
== address
));
521 if (record
->is_found
) {
522 u32 index_chapter
= uds_get_delta_entry_value(&record
->delta_entry
);
524 record
->virtual_chapter
= convert_index_to_virtual(record
, index_chapter
);
527 record
->is_collision
= record
->delta_entry
.is_collision
;
531 int uds_get_volume_index_record(struct volume_index
*volume_index
,
532 const struct uds_record_name
*name
,
533 struct volume_index_record
*record
)
537 if (uds_is_volume_index_sample(volume_index
, name
)) {
539 * Other threads cannot be allowed to call uds_lookup_volume_index_name() while
540 * this thread is finding the volume index record. Due to the lazy LRU flushing of
541 * the volume index, uds_get_volume_index_record() is not a read-only operation.
544 get_volume_sub_index_zone(&volume_index
->vi_hook
, name
);
545 struct mutex
*mutex
= &volume_index
->zones
[zone
].hook_mutex
;
548 result
= get_volume_sub_index_record(&volume_index
->vi_hook
, name
,
551 /* Remember the mutex so that other operations on the index record can use it. */
552 record
->mutex
= mutex
;
554 result
= get_volume_sub_index_record(&volume_index
->vi_non_hook
, name
,
561 int uds_put_volume_index_record(struct volume_index_record
*record
, u64 virtual_chapter
)
565 const struct volume_sub_index
*sub_index
= record
->sub_index
;
567 if (!is_virtual_chapter_indexed(record
, virtual_chapter
)) {
568 u64 low
= get_zone_for_record(record
)->virtual_chapter_low
;
569 u64 high
= get_zone_for_record(record
)->virtual_chapter_high
;
571 return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT
,
572 "cannot put record into chapter number %llu that is out of the valid range %llu to %llu",
573 (unsigned long long) virtual_chapter
,
574 (unsigned long long) low
,
575 (unsigned long long) high
);
577 address
= extract_address(sub_index
, record
->name
);
578 if (unlikely(record
->mutex
!= NULL
))
579 mutex_lock(record
->mutex
);
580 result
= uds_put_delta_index_entry(&record
->delta_entry
, address
,
581 convert_virtual_to_index(sub_index
,
583 record
->is_found
? record
->name
->name
: NULL
);
584 if (unlikely(record
->mutex
!= NULL
))
585 mutex_unlock(record
->mutex
);
588 record
->virtual_chapter
= virtual_chapter
;
589 record
->is_collision
= record
->delta_entry
.is_collision
;
590 record
->is_found
= true;
593 vdo_log_ratelimit(vdo_log_warning_strerror
, UDS_OVERFLOW
,
594 "Volume index entry dropped due to overflow condition");
595 uds_log_delta_index_entry(&record
->delta_entry
);
604 int uds_remove_volume_index_record(struct volume_index_record
*record
)
608 if (!record
->is_found
)
609 return vdo_log_warning_strerror(UDS_BAD_STATE
,
610 "illegal operation on new record");
612 /* Mark the record so that it cannot be used again */
613 record
->is_found
= false;
614 if (unlikely(record
->mutex
!= NULL
))
615 mutex_lock(record
->mutex
);
616 result
= uds_remove_delta_index_entry(&record
->delta_entry
);
617 if (unlikely(record
->mutex
!= NULL
))
618 mutex_unlock(record
->mutex
);
622 static void set_volume_sub_index_zone_open_chapter(struct volume_sub_index
*sub_index
,
623 unsigned int zone_number
,
627 struct volume_sub_index_zone
*zone
= &sub_index
->zones
[zone_number
];
628 struct delta_zone
*delta_zone
;
631 zone
->virtual_chapter_low
= (virtual_chapter
>= sub_index
->chapter_count
?
632 virtual_chapter
- sub_index
->chapter_count
+ 1 :
634 zone
->virtual_chapter_high
= virtual_chapter
;
636 /* Check to see if the new zone data is too large. */
637 delta_zone
= &sub_index
->delta_index
.delta_zones
[zone_number
];
638 for (i
= 1; i
<= delta_zone
->list_count
; i
++)
639 used_bits
+= delta_zone
->delta_lists
[i
].size
;
641 if (used_bits
> sub_index
->max_zone_bits
) {
642 /* Expire enough chapters to free the desired space. */
644 1 + (used_bits
- sub_index
->max_zone_bits
) / sub_index
->chapter_zone_bits
;
646 if (expire_count
== 1) {
647 vdo_log_ratelimit(vdo_log_info
,
648 "zone %u: At chapter %llu, expiring chapter %llu early",
650 (unsigned long long) virtual_chapter
,
651 (unsigned long long) zone
->virtual_chapter_low
);
652 zone
->early_flushes
++;
653 zone
->virtual_chapter_low
++;
655 u64 first_expired
= zone
->virtual_chapter_low
;
657 if (first_expired
+ expire_count
< zone
->virtual_chapter_high
) {
658 zone
->early_flushes
+= expire_count
;
659 zone
->virtual_chapter_low
+= expire_count
;
661 zone
->early_flushes
+=
662 zone
->virtual_chapter_high
- zone
->virtual_chapter_low
;
663 zone
->virtual_chapter_low
= zone
->virtual_chapter_high
;
665 vdo_log_ratelimit(vdo_log_info
,
666 "zone %u: At chapter %llu, expiring chapters %llu to %llu early",
668 (unsigned long long) virtual_chapter
,
669 (unsigned long long) first_expired
,
670 (unsigned long long) zone
->virtual_chapter_low
- 1);
675 void uds_set_volume_index_zone_open_chapter(struct volume_index
*volume_index
,
676 unsigned int zone_number
,
679 struct mutex
*mutex
= &volume_index
->zones
[zone_number
].hook_mutex
;
681 set_volume_sub_index_zone_open_chapter(&volume_index
->vi_non_hook
, zone_number
,
685 * Other threads cannot be allowed to call uds_lookup_volume_index_name() while the open
686 * chapter number is changing.
688 if (has_sparse(volume_index
)) {
690 set_volume_sub_index_zone_open_chapter(&volume_index
->vi_hook
,
691 zone_number
, virtual_chapter
);
697 * Set the newest open chapter number for the index, while also advancing the oldest valid chapter
700 void uds_set_volume_index_open_chapter(struct volume_index
*volume_index
,
705 for (zone
= 0; zone
< volume_index
->zone_count
; zone
++)
706 uds_set_volume_index_zone_open_chapter(volume_index
, zone
, virtual_chapter
);
709 int uds_set_volume_index_record_chapter(struct volume_index_record
*record
,
712 const struct volume_sub_index
*sub_index
= record
->sub_index
;
715 if (!record
->is_found
)
716 return vdo_log_warning_strerror(UDS_BAD_STATE
,
717 "illegal operation on new record");
719 if (!is_virtual_chapter_indexed(record
, virtual_chapter
)) {
720 u64 low
= get_zone_for_record(record
)->virtual_chapter_low
;
721 u64 high
= get_zone_for_record(record
)->virtual_chapter_high
;
723 return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT
,
724 "cannot set chapter number %llu that is out of the valid range %llu to %llu",
725 (unsigned long long) virtual_chapter
,
726 (unsigned long long) low
,
727 (unsigned long long) high
);
730 if (unlikely(record
->mutex
!= NULL
))
731 mutex_lock(record
->mutex
);
732 result
= uds_set_delta_entry_value(&record
->delta_entry
,
733 convert_virtual_to_index(sub_index
,
735 if (unlikely(record
->mutex
!= NULL
))
736 mutex_unlock(record
->mutex
);
737 if (result
!= UDS_SUCCESS
)
740 record
->virtual_chapter
= virtual_chapter
;
744 static u64
lookup_volume_sub_index_name(const struct volume_sub_index
*sub_index
,
745 const struct uds_record_name
*name
)
748 u32 address
= extract_address(sub_index
, name
);
749 u32 delta_list_number
= extract_dlist_num(sub_index
, name
);
750 unsigned int zone_number
= get_volume_sub_index_zone(sub_index
, name
);
751 const struct volume_sub_index_zone
*zone
= &sub_index
->zones
[zone_number
];
755 struct delta_index_entry delta_entry
;
757 result
= uds_get_delta_index_entry(&sub_index
->delta_index
, delta_list_number
,
758 address
, name
->name
, &delta_entry
);
759 if (result
!= UDS_SUCCESS
)
762 if (delta_entry
.at_end
|| (delta_entry
.key
!= address
))
765 index_chapter
= uds_get_delta_entry_value(&delta_entry
);
766 rolling_chapter
= (index_chapter
- zone
->virtual_chapter_low
) & sub_index
->chapter_mask
;
768 virtual_chapter
= zone
->virtual_chapter_low
+ rolling_chapter
;
769 if (virtual_chapter
> zone
->virtual_chapter_high
)
772 return virtual_chapter
;
775 /* Do a read-only lookup of the record name for sparse cache management. */
776 u64
uds_lookup_volume_index_name(const struct volume_index
*volume_index
,
777 const struct uds_record_name
*name
)
779 unsigned int zone_number
= uds_get_volume_index_zone(volume_index
, name
);
780 struct mutex
*mutex
= &volume_index
->zones
[zone_number
].hook_mutex
;
783 if (!uds_is_volume_index_sample(volume_index
, name
))
787 virtual_chapter
= lookup_volume_sub_index_name(&volume_index
->vi_hook
, name
);
790 return virtual_chapter
;
793 static void abort_restoring_volume_sub_index(struct volume_sub_index
*sub_index
)
795 uds_reset_delta_index(&sub_index
->delta_index
);
798 static void abort_restoring_volume_index(struct volume_index
*volume_index
)
800 abort_restoring_volume_sub_index(&volume_index
->vi_non_hook
);
801 if (has_sparse(volume_index
))
802 abort_restoring_volume_sub_index(&volume_index
->vi_hook
);
805 static int start_restoring_volume_sub_index(struct volume_sub_index
*sub_index
,
806 struct buffered_reader
**readers
,
807 unsigned int reader_count
)
811 u64 virtual_chapter_low
= 0, virtual_chapter_high
= 0;
814 for (i
= 0; i
< reader_count
; i
++) {
815 struct sub_index_data header
;
816 u8 buffer
[sizeof(struct sub_index_data
)];
820 result
= uds_read_from_buffered_reader(readers
[i
], buffer
,
822 if (result
!= UDS_SUCCESS
) {
823 return vdo_log_warning_strerror(result
,
824 "failed to read volume index header");
827 memcpy(&header
.magic
, buffer
, MAGIC_SIZE
);
828 offset
+= MAGIC_SIZE
;
829 decode_u64_le(buffer
, &offset
, &header
.volume_nonce
);
830 decode_u64_le(buffer
, &offset
, &header
.virtual_chapter_low
);
831 decode_u64_le(buffer
, &offset
, &header
.virtual_chapter_high
);
832 decode_u32_le(buffer
, &offset
, &header
.first_list
);
833 decode_u32_le(buffer
, &offset
, &header
.list_count
);
835 result
= VDO_ASSERT(offset
== sizeof(buffer
),
836 "%zu bytes decoded of %zu expected", offset
,
838 if (result
!= VDO_SUCCESS
)
839 result
= UDS_CORRUPT_DATA
;
841 if (memcmp(header
.magic
, MAGIC_START_5
, MAGIC_SIZE
) != 0) {
842 return vdo_log_warning_strerror(UDS_CORRUPT_DATA
,
843 "volume index file had bad magic number");
846 if (sub_index
->volume_nonce
== 0) {
847 sub_index
->volume_nonce
= header
.volume_nonce
;
848 } else if (header
.volume_nonce
!= sub_index
->volume_nonce
) {
849 return vdo_log_warning_strerror(UDS_CORRUPT_DATA
,
850 "volume index volume nonce incorrect");
854 virtual_chapter_low
= header
.virtual_chapter_low
;
855 virtual_chapter_high
= header
.virtual_chapter_high
;
856 } else if (virtual_chapter_high
!= header
.virtual_chapter_high
) {
857 u64 low
= header
.virtual_chapter_low
;
858 u64 high
= header
.virtual_chapter_high
;
860 return vdo_log_warning_strerror(UDS_CORRUPT_DATA
,
861 "Inconsistent volume index zone files: Chapter range is [%llu,%llu], chapter range %d is [%llu,%llu]",
862 (unsigned long long) virtual_chapter_low
,
863 (unsigned long long) virtual_chapter_high
,
864 i
, (unsigned long long) low
,
865 (unsigned long long) high
);
866 } else if (virtual_chapter_low
< header
.virtual_chapter_low
) {
867 virtual_chapter_low
= header
.virtual_chapter_low
;
870 for (j
= 0; j
< header
.list_count
; j
++) {
871 u8 decoded
[sizeof(u64
)];
873 result
= uds_read_from_buffered_reader(readers
[i
], decoded
,
875 if (result
!= UDS_SUCCESS
) {
876 return vdo_log_warning_strerror(result
,
877 "failed to read volume index flush ranges");
880 sub_index
->flush_chapters
[header
.first_list
+ j
] =
881 get_unaligned_le64(decoded
);
885 for (z
= 0; z
< sub_index
->zone_count
; z
++) {
886 memset(&sub_index
->zones
[z
], 0, sizeof(struct volume_sub_index_zone
));
887 sub_index
->zones
[z
].virtual_chapter_low
= virtual_chapter_low
;
888 sub_index
->zones
[z
].virtual_chapter_high
= virtual_chapter_high
;
891 result
= uds_start_restoring_delta_index(&sub_index
->delta_index
, readers
,
893 if (result
!= UDS_SUCCESS
)
894 return vdo_log_warning_strerror(result
, "restoring delta index failed");
899 static int start_restoring_volume_index(struct volume_index
*volume_index
,
900 struct buffered_reader
**buffered_readers
,
901 unsigned int reader_count
)
906 if (!has_sparse(volume_index
)) {
907 return start_restoring_volume_sub_index(&volume_index
->vi_non_hook
,
908 buffered_readers
, reader_count
);
911 for (i
= 0; i
< reader_count
; i
++) {
912 struct volume_index_data header
;
913 u8 buffer
[sizeof(struct volume_index_data
)];
916 result
= uds_read_from_buffered_reader(buffered_readers
[i
], buffer
,
918 if (result
!= UDS_SUCCESS
) {
919 return vdo_log_warning_strerror(result
,
920 "failed to read volume index header");
923 memcpy(&header
.magic
, buffer
, MAGIC_SIZE
);
924 offset
+= MAGIC_SIZE
;
925 decode_u32_le(buffer
, &offset
, &header
.sparse_sample_rate
);
927 result
= VDO_ASSERT(offset
== sizeof(buffer
),
928 "%zu bytes decoded of %zu expected", offset
,
930 if (result
!= VDO_SUCCESS
)
931 result
= UDS_CORRUPT_DATA
;
933 if (memcmp(header
.magic
, MAGIC_START_6
, MAGIC_SIZE
) != 0)
934 return vdo_log_warning_strerror(UDS_CORRUPT_DATA
,
935 "volume index file had bad magic number");
938 volume_index
->sparse_sample_rate
= header
.sparse_sample_rate
;
939 } else if (volume_index
->sparse_sample_rate
!= header
.sparse_sample_rate
) {
940 vdo_log_warning_strerror(UDS_CORRUPT_DATA
,
941 "Inconsistent sparse sample rate in delta index zone files: %u vs. %u",
942 volume_index
->sparse_sample_rate
,
943 header
.sparse_sample_rate
);
944 return UDS_CORRUPT_DATA
;
948 result
= start_restoring_volume_sub_index(&volume_index
->vi_non_hook
,
949 buffered_readers
, reader_count
);
950 if (result
!= UDS_SUCCESS
)
953 return start_restoring_volume_sub_index(&volume_index
->vi_hook
, buffered_readers
,
957 static int finish_restoring_volume_sub_index(struct volume_sub_index
*sub_index
,
958 struct buffered_reader
**buffered_readers
,
959 unsigned int reader_count
)
961 return uds_finish_restoring_delta_index(&sub_index
->delta_index
,
962 buffered_readers
, reader_count
);
965 static int finish_restoring_volume_index(struct volume_index
*volume_index
,
966 struct buffered_reader
**buffered_readers
,
967 unsigned int reader_count
)
971 result
= finish_restoring_volume_sub_index(&volume_index
->vi_non_hook
,
972 buffered_readers
, reader_count
);
973 if ((result
== UDS_SUCCESS
) && has_sparse(volume_index
)) {
974 result
= finish_restoring_volume_sub_index(&volume_index
->vi_hook
,
982 int uds_load_volume_index(struct volume_index
*volume_index
,
983 struct buffered_reader
**readers
, unsigned int reader_count
)
987 /* Start by reading the header section of the stream. */
988 result
= start_restoring_volume_index(volume_index
, readers
, reader_count
);
989 if (result
!= UDS_SUCCESS
)
992 result
= finish_restoring_volume_index(volume_index
, readers
, reader_count
);
993 if (result
!= UDS_SUCCESS
) {
994 abort_restoring_volume_index(volume_index
);
998 /* Check the final guard lists to make sure there is no extra data. */
999 result
= uds_check_guard_delta_lists(readers
, reader_count
);
1000 if (result
!= UDS_SUCCESS
)
1001 abort_restoring_volume_index(volume_index
);
/*
 * Begin saving one zone of a subindex: write the zone's header (magic number, volume nonce,
 * the zone's virtual chapter range, and the range of delta lists it owns), then the saved
 * flush chapter for each of those lists, and finally hand off to the delta index save.
 * Returns UDS_SUCCESS or an error code; write failures are logged before returning.
 */
static int start_saving_volume_sub_index(const struct volume_sub_index *sub_index,
					 unsigned int zone_number,
					 struct buffered_writer *buffered_writer)
{
	int result;
	struct volume_sub_index_zone *volume_index_zone = &sub_index->zones[zone_number];
	u32 first_list = sub_index->delta_index.delta_zones[zone_number].first_list;
	u32 list_count = sub_index->delta_index.delta_zones[zone_number].list_count;
	u8 buffer[sizeof(struct sub_index_data)];
	size_t offset = 0;
	u32 i;

	/* Encode the fixed-size header fields in little-endian order. */
	memcpy(buffer, MAGIC_START_5, MAGIC_SIZE);
	offset += MAGIC_SIZE;
	encode_u64_le(buffer, &offset, sub_index->volume_nonce);
	encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_low);
	encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_high);
	encode_u32_le(buffer, &offset, first_list);
	encode_u32_le(buffer, &offset, list_count);

	/* The encoded length must match the on-disk layout exactly. */
	result = VDO_ASSERT(offset == sizeof(struct sub_index_data),
			    "%zu bytes of config written, of %zu expected", offset,
			    sizeof(struct sub_index_data));
	if (result != VDO_SUCCESS)
		return result;

	result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
	if (result != UDS_SUCCESS)
		return vdo_log_warning_strerror(result,
						"failed to write volume index header");

	/* Save the flush chapter for each delta list owned by this zone. */
	for (i = 0; i < list_count; i++) {
		u8 encoded[sizeof(u64)];

		put_unaligned_le64(sub_index->flush_chapters[first_list + i], &encoded);
		result = uds_write_to_buffered_writer(buffered_writer, encoded,
						      sizeof(u64));
		if (result != UDS_SUCCESS) {
			return vdo_log_warning_strerror(result,
							"failed to write volume index flush ranges");
		}
	}

	return uds_start_saving_delta_index(&sub_index->delta_index, zone_number,
					    buffered_writer);
}
/*
 * Begin saving one zone of the volume index. A dense-only index has no volume index header
 * of its own, so only the non-hook subindex is saved; a sparse index writes a header with
 * the sparse sample rate and then saves both subindexes.
 */
static int start_saving_volume_index(const struct volume_index *volume_index,
				     unsigned int zone_number,
				     struct buffered_writer *writer)
{
	u8 buffer[sizeof(struct volume_index_data)];
	size_t offset = 0;
	int result;

	if (!has_sparse(volume_index)) {
		return start_saving_volume_sub_index(&volume_index->vi_non_hook,
						     zone_number, writer);
	}

	/* Encode the sparse header: magic number plus the sample rate. */
	memcpy(buffer, MAGIC_START_6, MAGIC_SIZE);
	offset += MAGIC_SIZE;
	encode_u32_le(buffer, &offset, volume_index->sparse_sample_rate);
	result = VDO_ASSERT(offset == sizeof(struct volume_index_data),
			    "%zu bytes of header written, of %zu expected", offset,
			    sizeof(struct volume_index_data));
	if (result != VDO_SUCCESS)
		return result;

	result = uds_write_to_buffered_writer(writer, buffer, offset);
	if (result != UDS_SUCCESS) {
		vdo_log_warning_strerror(result, "failed to write volume index header");
		return result;
	}

	result = start_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number,
					       writer);
	if (result != UDS_SUCCESS)
		return result;

	return start_saving_volume_sub_index(&volume_index->vi_hook, zone_number,
					     writer);
}
1090 static int finish_saving_volume_sub_index(const struct volume_sub_index
*sub_index
,
1091 unsigned int zone_number
)
1093 return uds_finish_saving_delta_index(&sub_index
->delta_index
, zone_number
);
1096 static int finish_saving_volume_index(const struct volume_index
*volume_index
,
1097 unsigned int zone_number
)
1101 result
= finish_saving_volume_sub_index(&volume_index
->vi_non_hook
, zone_number
);
1102 if ((result
== UDS_SUCCESS
) && has_sparse(volume_index
))
1103 result
= finish_saving_volume_sub_index(&volume_index
->vi_hook
, zone_number
);
/*
 * Save the entire volume index, one zone per writer stream. Each zone's save writes its
 * headers and delta lists, a trailing guard delta list, and then flushes the writer. The
 * first failure stops the save and its error code is returned.
 */
int uds_save_volume_index(struct volume_index *volume_index,
			  struct buffered_writer **writers, unsigned int writer_count)
{
	int result = UDS_SUCCESS;
	unsigned int zone;

	for (zone = 0; zone < writer_count; zone++) {
		result = start_saving_volume_index(volume_index, zone, writers[zone]);
		if (result != UDS_SUCCESS)
			break;

		result = finish_saving_volume_index(volume_index, zone);
		if (result != UDS_SUCCESS)
			break;

		/* The guard list marks the end of this zone's data on restore. */
		result = uds_write_guard_delta_list(writers[zone]);
		if (result != UDS_SUCCESS)
			break;

		result = uds_flush_buffered_writer(writers[zone]);
		if (result != UDS_SUCCESS) {
			vdo_log_warning("failed to flush volume index zone saves");
			break;
		}
	}

	return result;
}
1134 static void get_volume_sub_index_stats(const struct volume_sub_index
*sub_index
,
1135 struct volume_index_stats
*stats
)
1137 struct delta_index_stats dis
;
1140 uds_get_delta_index_stats(&sub_index
->delta_index
, &dis
);
1141 stats
->rebalance_time
= dis
.rebalance_time
;
1142 stats
->rebalance_count
= dis
.rebalance_count
;
1143 stats
->record_count
= dis
.record_count
;
1144 stats
->collision_count
= dis
.collision_count
;
1145 stats
->discard_count
= dis
.discard_count
;
1146 stats
->overflow_count
= dis
.overflow_count
;
1147 stats
->delta_lists
= dis
.list_count
;
1148 stats
->early_flushes
= 0;
1149 for (z
= 0; z
< sub_index
->zone_count
; z
++)
1150 stats
->early_flushes
+= sub_index
->zones
[z
].early_flushes
;
1153 void uds_get_volume_index_stats(const struct volume_index
*volume_index
,
1154 struct volume_index_stats
*stats
)
1156 struct volume_index_stats sparse_stats
;
1158 get_volume_sub_index_stats(&volume_index
->vi_non_hook
, stats
);
1159 if (!has_sparse(volume_index
))
1162 get_volume_sub_index_stats(&volume_index
->vi_hook
, &sparse_stats
);
1163 stats
->rebalance_time
+= sparse_stats
.rebalance_time
;
1164 stats
->rebalance_count
+= sparse_stats
.rebalance_count
;
1165 stats
->record_count
+= sparse_stats
.record_count
;
1166 stats
->collision_count
+= sparse_stats
.collision_count
;
1167 stats
->discard_count
+= sparse_stats
.discard_count
;
1168 stats
->overflow_count
+= sparse_stats
.overflow_count
;
1169 stats
->delta_lists
+= sparse_stats
.delta_lists
;
1170 stats
->early_flushes
+= sparse_stats
.early_flushes
;
/*
 * Initialize one subindex from the configuration: compute its geometry parameters, set up
 * the underlying delta index, derive the per-zone memory budget, and allocate the flush
 * chapter and zone arrays. The tag byte identifies the subindex type in saved delta index
 * state. Returns UDS_SUCCESS or an error code; the caller frees on failure.
 */
static int initialize_volume_sub_index(const struct uds_configuration *config,
				       u64 volume_nonce, u8 tag,
				       struct volume_sub_index *sub_index)
{
	struct sub_index_parameters params = { .address_bits = 0 };
	unsigned int zone_count = config->zone_count;
	u64 available_bytes = 0;
	unsigned int z;
	int result;

	result = compute_volume_sub_index_parameters(config, &params);
	if (result != UDS_SUCCESS)
		return result;

	sub_index->address_bits = params.address_bits;
	sub_index->address_mask = (1u << params.address_bits) - 1;
	sub_index->chapter_bits = params.chapter_bits;
	sub_index->chapter_mask = (1u << params.chapter_bits) - 1;
	sub_index->chapter_count = params.chapter_count;
	sub_index->list_count = params.list_count;
	sub_index->zone_count = zone_count;
	sub_index->chapter_zone_bits = params.chapter_size_in_bits / zone_count;
	sub_index->volume_nonce = volume_nonce;

	result = uds_initialize_delta_index(&sub_index->delta_index, zone_count,
					    params.list_count, params.mean_delta,
					    params.chapter_bits, params.memory_size,
					    tag);
	if (result != UDS_SUCCESS)
		return result;

	/*
	 * The per-zone bit budget is the total delta zone size, minus the free space
	 * target, spread evenly across the zones.
	 */
	for (z = 0; z < sub_index->delta_index.zone_count; z++)
		available_bytes += sub_index->delta_index.delta_zones[z].size;
	available_bytes -= params.target_free_bytes;
	sub_index->max_zone_bits = (available_bytes * BITS_PER_BYTE) / zone_count;
	sub_index->memory_size = (sub_index->delta_index.memory_size +
				  sizeof(struct volume_sub_index) +
				  (params.list_count * sizeof(u64)) +
				  (zone_count * sizeof(struct volume_sub_index_zone)));

	/* The following arrays are initialized to all zeros. */
	result = vdo_allocate(params.list_count, u64, "first chapter to flush",
			      &sub_index->flush_chapters);
	if (result != VDO_SUCCESS)
		return result;

	return vdo_allocate(zone_count, struct volume_sub_index_zone,
			    "volume index zones", &sub_index->zones);
}
1223 int uds_make_volume_index(const struct uds_configuration
*config
, u64 volume_nonce
,
1224 struct volume_index
**volume_index_ptr
)
1226 struct split_config split
;
1228 struct volume_index
*volume_index
;
1231 result
= vdo_allocate(1, struct volume_index
, "volume index", &volume_index
);
1232 if (result
!= VDO_SUCCESS
)
1235 volume_index
->zone_count
= config
->zone_count
;
1237 if (!uds_is_sparse_index_geometry(config
->geometry
)) {
1238 result
= initialize_volume_sub_index(config
, volume_nonce
, 'm',
1239 &volume_index
->vi_non_hook
);
1240 if (result
!= UDS_SUCCESS
) {
1241 uds_free_volume_index(volume_index
);
1245 volume_index
->memory_size
= volume_index
->vi_non_hook
.memory_size
;
1246 *volume_index_ptr
= volume_index
;
1250 volume_index
->sparse_sample_rate
= config
->sparse_sample_rate
;
1252 result
= vdo_allocate(config
->zone_count
, struct volume_index_zone
,
1253 "volume index zones", &volume_index
->zones
);
1254 if (result
!= VDO_SUCCESS
) {
1255 uds_free_volume_index(volume_index
);
1259 for (zone
= 0; zone
< config
->zone_count
; zone
++)
1260 mutex_init(&volume_index
->zones
[zone
].hook_mutex
);
1262 split_configuration(config
, &split
);
1263 result
= initialize_volume_sub_index(&split
.non_hook_config
, volume_nonce
, 'd',
1264 &volume_index
->vi_non_hook
);
1265 if (result
!= UDS_SUCCESS
) {
1266 uds_free_volume_index(volume_index
);
1267 return vdo_log_error_strerror(result
,
1268 "Error creating non hook volume index");
1271 result
= initialize_volume_sub_index(&split
.hook_config
, volume_nonce
, 's',
1272 &volume_index
->vi_hook
);
1273 if (result
!= UDS_SUCCESS
) {
1274 uds_free_volume_index(volume_index
);
1275 return vdo_log_error_strerror(result
,
1276 "Error creating hook volume index");
1279 volume_index
->memory_size
=
1280 volume_index
->vi_non_hook
.memory_size
+ volume_index
->vi_hook
.memory_size
;
1281 *volume_index_ptr
= volume_index
;