// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "delta-index.h"

#include <linux/bitops.h>
#include <linux/bits.h>
#include <linux/compiler.h>
#include <linux/limits.h>
#include <linux/log2.h>

#include "cpu.h"
#include "errors.h"
#include "logger.h"
#include "memory-alloc.h"
#include "numeric.h"
#include "permassert.h"
#include "string-utils.h"
#include "time-utils.h"
/*
 * The entries in a delta index could be stored in a single delta list, but to reduce search times
 * and update costs it uses multiple delta lists. These lists are stored in a single chunk of
 * memory managed by the delta_zone structure. The delta_zone can move the data around within its
 * memory, so the location of each delta list is recorded as a bit offset into the memory. Because
 * the volume index can contain over a million delta lists, we want to be efficient with the size
 * of the delta list header information. This information is encoded into 16 bytes per list. The
 * volume index delta list memory can easily exceed 4 gigabits, so a 64 bit value is needed to
 * address the memory. The volume index delta lists average around 6 kilobits, so 16 bits are
 * sufficient to store the size of a delta list.
 *
 * Each delta list is stored as a bit stream. Within the delta list encoding, bits and bytes are
 * numbered in little endian order. Within a byte, bit 0 is the least significant bit (0x1), and
 * bit 7 is the most significant bit (0x80). Within a bit stream, bit 7 is the most significant bit
 * of byte 0, and bit 8 is the least significant bit of byte 1. Within a byte array, a byte's
 * number corresponds to its index in the array.
 *
 * A standard delta list entry is stored as a fixed length payload (the value) followed by a
 * variable length key (the delta). A collision entry is used when two block names have the same
 * delta list address. A collision entry always follows a standard entry for the hash with which it
 * collides, and is encoded with DELTA == 0 with an additional 256 bits field at the end,
 * containing the full block name. An entry with a delta of 0 at the beginning of a delta list
 * indicates a normal entry.
 *
 * The delta in each entry is encoded with a variable-length Huffman code to minimize the memory
 * used by small deltas. The Huffman code is specified by three parameters, which can be computed
 * from the desired mean delta when the index is full. (See compute_coding_constants() for
 * details.)
 *
 * The bit field utilities used to read and write delta entries assume that it is possible to read
 * some bytes beyond the end of the bit field, so a delta_zone memory allocation is guarded by two
 * invalid delta lists to prevent reading outside the delta_zone memory. The valid delta lists are
 * numbered 1 to N, and the guard lists are numbered 0 and N+1. The function to decode the bit
 * stream includes a step that skips over bits set to 0 until the first 1 bit is found. A corrupted
 * delta list could cause this step to run off the end of the delta_zone memory, so as extra
 * protection against this happening, the tail guard list is set to all ones.
 *
 * The delta_index supports two different forms. The mutable form is created by
 * uds_initialize_delta_index(), and is used for the volume index and for open chapter indexes. The
 * immutable form is created by uds_initialize_delta_index_page(), and is used for closed (and
 * cached) chapter index pages. The immutable form does not allocate delta list headers or
 * temporary offsets, and thus is somewhat more memory efficient.
 */
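/*
 * For example (illustrative figures only, assuming an 8-bit value payload and the coding
 * constants for a mean delta of 4096): a typical standard entry is 8 value bits followed by a
 * delta code of roughly 12 bits, while a collision entry is the same encoding with DELTA == 0
 * followed by the 256-bit block name, about 276 bits in total.
 */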
/*
 * This is the largest field size supported by get_field() and set_field(). Any field that is
 * larger is not guaranteed to fit in a single byte-aligned u32.
 */
#define MAX_FIELD_BITS ((sizeof(u32) - 1) * BITS_PER_BYTE + 1)

/*
 * This is the largest field size supported by get_big_field() and set_big_field(). Any field that
 * is larger is not guaranteed to fit in a single byte-aligned u64.
 */
#define MAX_BIG_FIELD_BITS ((sizeof(u64) - 1) * BITS_PER_BYTE + 1)

/*
 * This is the number of guard bytes needed at the end of the memory byte array when using the bit
 * utilities. These utilities call get_big_field() and set_big_field(), which can access up to 7
 * bytes beyond the end of the desired field. The definition is written to make it clear how this
 * value is derived.
 */
#define POST_FIELD_GUARD_BYTES (sizeof(u64) - 1)

/* The number of guard bits that are needed in the tail guard list */
#define GUARD_BITS (POST_FIELD_GUARD_BYTES * BITS_PER_BYTE)

/*
 * The maximum size of a single delta list in bytes. We count guard bytes in this value because a
 * buffer of this size can be used with move_bits().
 */
#define DELTA_LIST_MAX_BYTE_COUNT \
	((U16_MAX + BITS_PER_BYTE) / BITS_PER_BYTE + POST_FIELD_GUARD_BYTES)
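/*
 * For example: with U16_MAX == 65535 and BITS_PER_BYTE == 8, the list data needs at most
 * (65535 + 8) / 8 == 8192 bytes, and adding the 7 POST_FIELD_GUARD_BYTES gives a maximum
 * buffer size of 8199 bytes per delta list.
 */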
/* The number of extra bytes and bits needed to store a collision entry */
#define COLLISION_BYTES UDS_RECORD_NAME_SIZE
#define COLLISION_BITS (COLLISION_BYTES * BITS_PER_BYTE)

/*
 * Immutable delta lists are packed into pages containing a header that encodes the delta list
 * information into 19 bits per list (64KB bit offset).
 */
#define IMMUTABLE_HEADER_SIZE 19

/*
 * Constants and structures for the saved delta index. "DI" is for delta_index, and -##### is a
 * number to increment when the format of the data changes.
 */
#define MAGIC_SIZE 8

static const char DELTA_INDEX_MAGIC[] = "DI-00002";

struct delta_index_header {
	char magic[MAGIC_SIZE];
	u32 zone_number;
	u32 zone_count;
	u32 first_list;
	u32 list_count;
	u64 record_count;
	u64 collision_count;
};

/*
 * Header data used for immutable delta index pages. This data is followed by the delta list offset
 * table.
 */
struct delta_page_header {
	/* Externally-defined nonce */
	u64 nonce;
	/* The virtual chapter number */
	u64 virtual_chapter_number;
	/* Index of the first delta list on the page */
	u16 first_list;
	/* Number of delta lists on the page */
	u16 list_count;
} __packed;
static inline u64 get_delta_list_byte_start(const struct delta_list *delta_list)
{
	return delta_list->start / BITS_PER_BYTE;
}

static inline u16 get_delta_list_byte_size(const struct delta_list *delta_list)
{
	unsigned int bit_offset = delta_list->start % BITS_PER_BYTE;

	return BITS_TO_BYTES(bit_offset + delta_list->size);
}

static void rebalance_delta_zone(const struct delta_zone *delta_zone, u32 first,
				 u32 last)
{
	struct delta_list *delta_list;
	u64 new_start;

	if (first == last) {
		/* Only one list is moving, and we know there is space. */
		delta_list = &delta_zone->delta_lists[first];
		new_start = delta_zone->new_offsets[first];
		if (delta_list->start != new_start) {
			u64 source;
			u64 destination;

			source = get_delta_list_byte_start(delta_list);
			delta_list->start = new_start;
			destination = get_delta_list_byte_start(delta_list);
			memmove(delta_zone->memory + destination,
				delta_zone->memory + source,
				get_delta_list_byte_size(delta_list));
		}
	} else {
		/*
		 * There is more than one list. Divide the problem in half, and use recursive calls
		 * to process each half. Note that after this computation, first <= middle, and
		 * middle < last.
		 */
		u32 middle = (first + last) / 2;

		delta_list = &delta_zone->delta_lists[middle];
		new_start = delta_zone->new_offsets[middle];

		/*
		 * The direction that our middle list is moving determines which half of the
		 * problem must be processed first.
		 */
		if (new_start > delta_list->start) {
			rebalance_delta_zone(delta_zone, middle + 1, last);
			rebalance_delta_zone(delta_zone, first, middle);
		} else {
			rebalance_delta_zone(delta_zone, first, middle);
			rebalance_delta_zone(delta_zone, middle + 1, last);
		}
	}
}

static inline size_t get_zone_memory_size(unsigned int zone_count, size_t memory_size)
{
	/* Round up so that each zone is a multiple of 64K in size. */
	size_t ALLOC_BOUNDARY = 64 * 1024;

	return (memory_size / zone_count + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY;
}
void uds_reset_delta_index(const struct delta_index *delta_index)
{
	unsigned int z;

	/*
	 * Initialize all delta lists to be empty. We keep 2 extra delta list descriptors, one
	 * before the first real entry and one after so that we don't need to bounds check the
	 * array access when calculating preceding and following gap sizes.
	 */
	for (z = 0; z < delta_index->zone_count; z++) {
		u64 list_bits;
		u64 spacing;
		u64 offset;
		u32 i;
		struct delta_zone *zone = &delta_index->delta_zones[z];
		struct delta_list *delta_lists = zone->delta_lists;

		/* Zeroing the delta list headers initializes the head guard list correctly. */
		memset(delta_lists, 0,
		       (zone->list_count + 2) * sizeof(struct delta_list));

		/* Set all the bits in the end guard list. */
		list_bits = (u64) zone->size * BITS_PER_BYTE - GUARD_BITS;
		delta_lists[zone->list_count + 1].start = list_bits;
		delta_lists[zone->list_count + 1].size = GUARD_BITS;
		memset(zone->memory + (list_bits / BITS_PER_BYTE), ~0,
		       POST_FIELD_GUARD_BYTES);

		/* Evenly space out the real delta lists by setting regular offsets. */
		spacing = list_bits / zone->list_count;
		offset = spacing / 2;
		for (i = 1; i <= zone->list_count; i++) {
			delta_lists[i].start = offset;
			offset += spacing;
		}

		/* Update the statistics. */
		zone->discard_count += zone->record_count;
		zone->record_count = 0;
		zone->collision_count = 0;
	}
}
/*
 * Compute the Huffman coding parameters for the given mean delta. The Huffman code is specified by
 * three parameters:
 *
 *  MINBITS   The number of bits in the smallest code
 *  BASE      The number of values coded using a code of length MINBITS
 *  INCR      The number of values coded by using one additional bit
 *
 * These parameters are related by this equation:
 *
 *	BASE + INCR == 1 << MINBITS
 *
 * The math for the Huffman code of an exponential distribution says that
 *
 *	INCR = log(2) * MEAN_DELTA
 *
 * Then use the smallest MINBITS value so that
 *
 *	(1 << MINBITS) > INCR
 *
 * And then
 *
 *	BASE = (1 << MINBITS) - INCR
 *
 * Now the index can generate a code such that
 * - The first BASE values code using MINBITS bits.
 * - The next INCR values code using MINBITS+1 bits.
 * - The next INCR values code using MINBITS+2 bits.
 *   And so on.
 */
static void compute_coding_constants(u32 mean_delta, u16 *min_bits, u32 *min_keys,
				     u32 *incr_keys)
{
	/*
	 * We want to compute the rounded value of log(2) * mean_delta. Since we cannot always use
	 * floating point, use a really good integer approximation.
	 */
	*incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL;
	*min_bits = bits_per(*incr_keys + 1);
	*min_keys = (1 << *min_bits) - *incr_keys;
}
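/*
 * For example (illustrative numbers): with mean_delta == 4096, the integer approximation gives
 * *incr_keys == 2839, so *min_bits == bits_per(2840) == 12 and *min_keys == 4096 - 2839 == 1257.
 * Deltas 0..1256 are then coded in 12 bits, the next 2839 deltas in 13 bits, and so on.
 */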
void uds_uninitialize_delta_index(struct delta_index *delta_index)
{
	unsigned int z;

	if (delta_index->delta_zones == NULL)
		return;

	for (z = 0; z < delta_index->zone_count; z++) {
		vdo_free(vdo_forget(delta_index->delta_zones[z].new_offsets));
		vdo_free(vdo_forget(delta_index->delta_zones[z].delta_lists));
		vdo_free(vdo_forget(delta_index->delta_zones[z].memory));
	}

	vdo_free(delta_index->delta_zones);
	memset(delta_index, 0, sizeof(struct delta_index));
}

static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size,
				 u32 first_list, u32 list_count, u32 mean_delta,
				 u32 payload_bits, u8 tag)
{
	int result;

	result = vdo_allocate(size, u8, "delta list", &delta_zone->memory);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(list_count + 2, u64, "delta list temp",
			      &delta_zone->new_offsets);
	if (result != VDO_SUCCESS)
		return result;

	/* Allocate the delta lists. */
	result = vdo_allocate(list_count + 2, struct delta_list, "delta lists",
			      &delta_zone->delta_lists);
	if (result != VDO_SUCCESS)
		return result;

	compute_coding_constants(mean_delta, &delta_zone->min_bits,
				 &delta_zone->min_keys, &delta_zone->incr_keys);
	delta_zone->value_bits = payload_bits;
	delta_zone->buffered_writer = NULL;
	delta_zone->size = size;
	delta_zone->rebalance_time = 0;
	delta_zone->rebalance_count = 0;
	delta_zone->record_count = 0;
	delta_zone->collision_count = 0;
	delta_zone->discard_count = 0;
	delta_zone->overflow_count = 0;
	delta_zone->first_list = first_list;
	delta_zone->list_count = list_count;
	delta_zone->tag = tag;
	return UDS_SUCCESS;
}
int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zone_count,
			       u32 list_count, u32 mean_delta, u32 payload_bits,
			       size_t memory_size, u8 tag)
{
	int result;
	unsigned int z;
	size_t zone_memory;

	result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones",
			      &delta_index->delta_zones);
	if (result != VDO_SUCCESS)
		return result;

	delta_index->zone_count = zone_count;
	delta_index->list_count = list_count;
	delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count);
	delta_index->memory_size = 0;
	delta_index->mutable = true;
	delta_index->tag = tag;

	for (z = 0; z < zone_count; z++) {
		u32 lists_in_zone = delta_index->lists_per_zone;
		u32 first_list_in_zone = z * lists_in_zone;

		if (z == zone_count - 1) {
			/*
			 * The last zone gets fewer lists if zone_count doesn't evenly divide
			 * list_count. We'll have an underflow if the assertion below doesn't hold.
			 */
			if (delta_index->list_count <= first_list_in_zone) {
				uds_uninitialize_delta_index(delta_index);
				return vdo_log_error_strerror(UDS_INVALID_ARGUMENT,
							      "%u delta lists not enough for %u zones",
							      list_count, zone_count);
			}

			lists_in_zone = delta_index->list_count - first_list_in_zone;
		}

		zone_memory = get_zone_memory_size(zone_count, memory_size);
		result = initialize_delta_zone(&delta_index->delta_zones[z], zone_memory,
					       first_list_in_zone, lists_in_zone,
					       mean_delta, payload_bits, tag);
		if (result != UDS_SUCCESS) {
			uds_uninitialize_delta_index(delta_index);
			return result;
		}

		delta_index->memory_size +=
			(sizeof(struct delta_zone) + zone_memory +
			 (lists_in_zone + 2) * (sizeof(struct delta_list) + sizeof(u64)));
	}

	uds_reset_delta_index(delta_index);
	return UDS_SUCCESS;
}
/* Read a bit field from an arbitrary bit boundary. */
static inline u32 get_field(const u8 *memory, u64 offset, u8 size)
{
	const void *addr = memory + offset / BITS_PER_BYTE;

	return (get_unaligned_le32(addr) >> (offset % BITS_PER_BYTE)) & ((1 << size) - 1);
}

/* Write a bit field to an arbitrary bit boundary. */
static inline void set_field(u32 value, u8 *memory, u64 offset, u8 size)
{
	void *addr = memory + offset / BITS_PER_BYTE;
	int shift = offset % BITS_PER_BYTE;
	u32 data = get_unaligned_le32(addr);

	data &= ~(((1 << size) - 1) << shift);
	data |= value << shift;
	put_unaligned_le32(data, addr);
}
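/*
 * For example (illustrative): set_field(0x5, memory, 13, 3) addresses memory + 13 / 8 ==
 * memory + 1 with shift 13 % 8 == 5, so it stores the value 101b in bits 5..7 of byte 1, and
 * get_field(memory, 13, 3) reads those same three bits back. Any field of at most
 * MAX_FIELD_BITS bits fits within the single unaligned 32-bit access used here, since
 * 25 + 7 == 32.
 */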
/* Get the bit offset to the immutable delta list header. */
static inline u32 get_immutable_header_offset(u32 list_number)
{
	return sizeof(struct delta_page_header) * BITS_PER_BYTE +
		list_number * IMMUTABLE_HEADER_SIZE;
}

/* Get the bit offset to the start of the immutable delta list bit stream. */
static inline u32 get_immutable_start(const u8 *memory, u32 list_number)
{
	return get_field(memory, get_immutable_header_offset(list_number),
			 IMMUTABLE_HEADER_SIZE);
}

/* Set the bit offset to the start of the immutable delta list bit stream. */
static inline void set_immutable_start(u8 *memory, u32 list_number, u32 start)
{
	set_field(start, memory, get_immutable_header_offset(list_number),
		  IMMUTABLE_HEADER_SIZE);
}

static bool verify_delta_index_page(u64 nonce, u16 list_count, u64 expected_nonce,
				    u8 *memory, size_t memory_size)
{
	unsigned int i;

	/*
	 * Verify the nonce. A mismatch can happen here during rebuild if we haven't written the
	 * entire volume at least once.
	 */
	if (nonce != expected_nonce)
		return false;

	/* Verify that the number of delta lists can fit in the page. */
	if (list_count > ((memory_size - sizeof(struct delta_page_header)) *
			  BITS_PER_BYTE / IMMUTABLE_HEADER_SIZE))
		return false;

	/*
	 * Verify that the first delta list is immediately after the last delta
	 * list header.
	 */
	if (get_immutable_start(memory, 0) != get_immutable_header_offset(list_count + 1))
		return false;

	/* Verify that the lists are in the correct order. */
	for (i = 0; i < list_count; i++) {
		if (get_immutable_start(memory, i) > get_immutable_start(memory, i + 1))
			return false;
	}

	/*
	 * Verify that the last list ends on the page, and that there is room
	 * for the post-field guard bits.
	 */
	if (get_immutable_start(memory, list_count) >
	    (memory_size - POST_FIELD_GUARD_BYTES) * BITS_PER_BYTE)
		return false;

	/* Verify that the guard bytes are correctly set to all ones. */
	for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) {
		if (memory[memory_size - POST_FIELD_GUARD_BYTES + i] != (u8) ~0)
			return false;
	}

	/* All verifications passed. */
	return true;
}
/* Initialize a delta index page to refer to a supplied page. */
int uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
				    u64 expected_nonce, u32 mean_delta, u32 payload_bits,
				    u8 *memory, size_t memory_size)
{
	u64 nonce;
	u64 vcn;
	u64 first_list;
	u64 list_count;
	struct delta_page_header *header = (struct delta_page_header *) memory;
	struct delta_zone *delta_zone = &delta_index_page->delta_zone;
	const u8 *nonce_addr = (const u8 *) &header->nonce;
	const u8 *vcn_addr = (const u8 *) &header->virtual_chapter_number;
	const u8 *first_list_addr = (const u8 *) &header->first_list;
	const u8 *list_count_addr = (const u8 *) &header->list_count;

	/* First assume that the header is little endian. */
	nonce = get_unaligned_le64(nonce_addr);
	vcn = get_unaligned_le64(vcn_addr);
	first_list = get_unaligned_le16(first_list_addr);
	list_count = get_unaligned_le16(list_count_addr);
	if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
				     memory_size)) {
		/* If that fails, try big endian. */
		nonce = get_unaligned_be64(nonce_addr);
		vcn = get_unaligned_be64(vcn_addr);
		first_list = get_unaligned_be16(first_list_addr);
		list_count = get_unaligned_be16(list_count_addr);
		if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
					     memory_size)) {
			/*
			 * Both attempts failed. Do not log this as an error, because it can happen
			 * during a rebuild if we haven't written the entire volume at least once.
			 */
			return UDS_CORRUPT_DATA;
		}
	}

	delta_index_page->delta_index.delta_zones = delta_zone;
	delta_index_page->delta_index.zone_count = 1;
	delta_index_page->delta_index.list_count = list_count;
	delta_index_page->delta_index.lists_per_zone = list_count;
	delta_index_page->delta_index.mutable = false;
	delta_index_page->delta_index.tag = 'p';
	delta_index_page->virtual_chapter_number = vcn;
	delta_index_page->lowest_list_number = first_list;
	delta_index_page->highest_list_number = first_list + list_count - 1;

	compute_coding_constants(mean_delta, &delta_zone->min_bits,
				 &delta_zone->min_keys, &delta_zone->incr_keys);
	delta_zone->value_bits = payload_bits;
	delta_zone->memory = memory;
	delta_zone->delta_lists = NULL;
	delta_zone->new_offsets = NULL;
	delta_zone->buffered_writer = NULL;
	delta_zone->size = memory_size;
	delta_zone->rebalance_time = 0;
	delta_zone->rebalance_count = 0;
	delta_zone->record_count = 0;
	delta_zone->collision_count = 0;
	delta_zone->discard_count = 0;
	delta_zone->overflow_count = 0;
	delta_zone->first_list = 0;
	delta_zone->list_count = list_count;
	delta_zone->tag = 'p';
	return UDS_SUCCESS;
}
/* Read a large bit field from an arbitrary bit boundary. */
static inline u64 get_big_field(const u8 *memory, u64 offset, u8 size)
{
	const void *addr = memory + offset / BITS_PER_BYTE;

	return (get_unaligned_le64(addr) >> (offset % BITS_PER_BYTE)) & ((1UL << size) - 1);
}

/* Write a large bit field to an arbitrary bit boundary. */
static inline void set_big_field(u64 value, u8 *memory, u64 offset, u8 size)
{
	void *addr = memory + offset / BITS_PER_BYTE;
	u8 shift = offset % BITS_PER_BYTE;
	u64 data = get_unaligned_le64(addr);

	data &= ~(((1UL << size) - 1) << shift);
	data |= value << shift;
	put_unaligned_le64(data, addr);
}

/* Set a sequence of bits to all zeros. */
static inline void set_zero(u8 *memory, u64 offset, u32 size)
{
	if (size > 0) {
		u8 *addr = memory + offset / BITS_PER_BYTE;
		u8 shift = offset % BITS_PER_BYTE;
		u32 count = size + shift > BITS_PER_BYTE ? (u32) BITS_PER_BYTE - shift : size;

		*addr++ &= ~(((1 << count) - 1) << shift);
		for (size -= count; size > BITS_PER_BYTE; size -= BITS_PER_BYTE)
			*addr++ = 0;

		if (size > 0)
			*addr &= 0xFF << size;
	}
}
/*
 * Move several bits from a higher to a lower address, moving the lower addressed bits first. The
 * size and memory offsets are measured in bits.
 */
static void move_bits_down(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
{
	const u8 *source;
	u8 *destination;
	u8 offset;
	u8 count;
	u64 field;

	/* Start by moving one field that ends on a to int boundary. */
	count = (MAX_BIG_FIELD_BITS - ((to_offset + MAX_BIG_FIELD_BITS) % BITS_PER_TYPE(u32)));
	field = get_big_field(from, from_offset, count);
	set_big_field(field, to, to_offset, count);
	from_offset += count;
	to_offset += count;
	size -= count;

	/* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
	offset = from_offset % BITS_PER_TYPE(u32);
	source = from + (from_offset - offset) / BITS_PER_BYTE;
	destination = to + to_offset / BITS_PER_BYTE;
	while (size > MAX_BIG_FIELD_BITS) {
		put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
		source += sizeof(u32);
		destination += sizeof(u32);
		from_offset += BITS_PER_TYPE(u32);
		to_offset += BITS_PER_TYPE(u32);
		size -= BITS_PER_TYPE(u32);
	}

	/* Finish up by moving any remaining bits. */
	if (size > 0) {
		field = get_big_field(from, from_offset, size);
		set_big_field(field, to, to_offset, size);
	}
}

/*
 * Move several bits from a lower to a higher address, moving the higher addressed bits first. The
 * size and memory offsets are measured in bits.
 */
static void move_bits_up(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
{
	const u8 *source;
	u8 *destination;
	u8 offset;
	u8 count;
	u64 field;

	/* Start by moving one field that begins on a destination int boundary. */
	count = (to_offset + size) % BITS_PER_TYPE(u32);
	if (count > 0) {
		size -= count;
		field = get_big_field(from, from_offset + size, count);
		set_big_field(field, to, to_offset + size, count);
	}

	/* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
	offset = (from_offset + size) % BITS_PER_TYPE(u32);
	source = from + (from_offset + size - offset) / BITS_PER_BYTE;
	destination = to + (to_offset + size) / BITS_PER_BYTE;
	while (size > MAX_BIG_FIELD_BITS) {
		source -= sizeof(u32);
		destination -= sizeof(u32);
		size -= BITS_PER_TYPE(u32);
		put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
	}

	/* Finish up by moving any remaining bits. */
	if (size > 0) {
		field = get_big_field(from, from_offset, size);
		set_big_field(field, to, to_offset, size);
	}
}
/*
 * Move bits from one field to another. When the fields overlap, behave as if we first move all the
 * bits from the source to a temporary value, and then move all the bits from the temporary value
 * to the destination. The size and memory offsets are measured in bits.
 */
static void move_bits(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
{
	u64 field;

	/* A small move doesn't require special handling. */
	if (size <= MAX_BIG_FIELD_BITS) {
		if (size > 0) {
			field = get_big_field(from, from_offset, size);
			set_big_field(field, to, to_offset, size);
		}

		return;
	}

	if (from_offset > to_offset)
		move_bits_down(from, from_offset, to, to_offset, size);
	else
		move_bits_up(from, from_offset, to, to_offset, size);
}
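/*
 * For example (illustrative): shifting a 1000-bit list four bits lower in memory overlaps its
 * own source, so move_bits_down() copies the lowest-addressed chunk first and never overwrites
 * bits it has yet to read; shifting the same list four bits higher uses move_bits_up(), which
 * copies the highest-addressed chunk first for the same reason.
 */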
/*
 * Pack delta lists from a mutable delta index into an immutable delta index page. A range of delta
 * lists (starting with a specified list index) is copied from the mutable delta index into a
 * memory page used in the immutable index. The number of lists copied onto the page is returned in
 * list_count.
 */
int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_nonce,
			      u8 *memory, size_t memory_size, u64 virtual_chapter_number,
			      u32 first_list, u32 *list_count)
{
	const struct delta_zone *delta_zone;
	struct delta_list *delta_lists;
	u32 max_lists;
	u32 n_lists = 0;
	u32 offset;
	u32 i;
	int free_bits;
	int bits;
	struct delta_page_header *header;

	delta_zone = &delta_index->delta_zones[0];
	delta_lists = &delta_zone->delta_lists[first_list + 1];
	max_lists = delta_index->list_count - first_list;

	/*
	 * Compute how many lists will fit on the page. Subtract the size of the fixed header, one
	 * delta list offset, and the guard bytes from the page size to determine how much space is
	 * available for delta lists.
	 */
	free_bits = memory_size * BITS_PER_BYTE;
	free_bits -= get_immutable_header_offset(1);
	free_bits -= GUARD_BITS;
	if (free_bits < IMMUTABLE_HEADER_SIZE) {
		/* This page is too small to store any delta lists. */
		return vdo_log_error_strerror(UDS_OVERFLOW,
					      "Chapter Index Page of %zu bytes is too small",
					      memory_size);
	}

	while (n_lists < max_lists) {
		/* Each list requires a delta list offset and the list data. */
		bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size;
		if (bits > free_bits)
			break;

		n_lists++;
		free_bits -= bits;
	}

	*list_count = n_lists;

	header = (struct delta_page_header *) memory;
	put_unaligned_le64(header_nonce, (u8 *) &header->nonce);
	put_unaligned_le64(virtual_chapter_number,
			   (u8 *) &header->virtual_chapter_number);
	put_unaligned_le16(first_list, (u8 *) &header->first_list);
	put_unaligned_le16(n_lists, (u8 *) &header->list_count);

	/* Construct the delta list offset table. */
	offset = get_immutable_header_offset(n_lists + 1);
	set_immutable_start(memory, 0, offset);
	for (i = 0; i < n_lists; i++) {
		offset += delta_lists[i].size;
		set_immutable_start(memory, i + 1, offset);
	}

	/* Copy the delta list data onto the memory page. */
	for (i = 0; i < n_lists; i++) {
		move_bits(delta_zone->memory, delta_lists[i].start, memory,
			  get_immutable_start(memory, i), delta_lists[i].size);
	}

	/* Set all the bits in the guard bytes. */
	memset(memory + memory_size - POST_FIELD_GUARD_BYTES, ~0,
	       POST_FIELD_GUARD_BYTES);
	return UDS_SUCCESS;
}
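/*
 * A minimal sketch of a hypothetical caller (the names chapter_index, page_buffer, page_size and
 * nonce are illustrative and not part of this file), packing successive pages until every delta
 * list of a chapter index has been copied:
 *
 *	u32 first_list = 0, lists_packed;
 *
 *	while (first_list < chapter_index->list_count) {
 *		result = uds_pack_delta_index_page(chapter_index, nonce, page_buffer, page_size,
 *						   virtual_chapter_number, first_list,
 *						   &lists_packed);
 *		if (result != UDS_SUCCESS)
 *			return result;
 *		first_list += lists_packed;
 *	}
 */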
/* Compute the new offsets of the delta lists. */
static void compute_new_list_offsets(struct delta_zone *delta_zone, u32 growing_index,
				     size_t growing_size, size_t used_space)
{
	size_t spacing;
	u32 i;
	struct delta_list *delta_lists = delta_zone->delta_lists;
	u32 tail_guard_index = delta_zone->list_count + 1;

	spacing = (delta_zone->size - used_space) / delta_zone->list_count;
	delta_zone->new_offsets[0] = 0;
	for (i = 0; i <= delta_zone->list_count; i++) {
		delta_zone->new_offsets[i + 1] =
			(delta_zone->new_offsets[i] +
			 get_delta_list_byte_size(&delta_lists[i]) + spacing);
		delta_zone->new_offsets[i] *= BITS_PER_BYTE;
		delta_zone->new_offsets[i] += delta_lists[i].start % BITS_PER_BYTE;
		if (i == 0)
			delta_zone->new_offsets[i + 1] -= spacing / 2;
		if (i + 1 == growing_index)
			delta_zone->new_offsets[i + 1] += growing_size;
	}

	delta_zone->new_offsets[tail_guard_index] =
		(delta_zone->size * BITS_PER_BYTE - delta_lists[tail_guard_index].size);
}

static void rebalance_lists(struct delta_zone *delta_zone)
{
	struct delta_list *delta_lists;
	u32 i;
	size_t used_space = 0;

	/* Extend and balance memory to receive the delta lists */
	delta_lists = delta_zone->delta_lists;
	for (i = 0; i <= delta_zone->list_count + 1; i++)
		used_space += get_delta_list_byte_size(&delta_lists[i]);

	compute_new_list_offsets(delta_zone, 0, 0, used_space);
	for (i = 1; i <= delta_zone->list_count + 1; i++)
		delta_lists[i].start = delta_zone->new_offsets[i];
}
/* Start restoring a delta index from multiple input streams. */
int uds_start_restoring_delta_index(struct delta_index *delta_index,
				    struct buffered_reader **buffered_readers,
				    unsigned int reader_count)
{
	int result;
	unsigned int zone_count = reader_count;
	u64 record_count = 0;
	u64 collision_count = 0;
	u32 first_list[MAX_ZONES];
	u32 list_count[MAX_ZONES];
	unsigned int z;
	u32 list_next = 0;
	const struct delta_zone *delta_zone;

	/* Read and validate each header. */
	for (z = 0; z < zone_count; z++) {
		struct delta_index_header header;
		u8 buffer[sizeof(struct delta_index_header)];
		size_t offset = 0;

		result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
						       sizeof(buffer));
		if (result != UDS_SUCCESS) {
			return vdo_log_warning_strerror(result,
							"failed to read delta index header");
		}

		memcpy(&header.magic, buffer, MAGIC_SIZE);
		offset += MAGIC_SIZE;
		decode_u32_le(buffer, &offset, &header.zone_number);
		decode_u32_le(buffer, &offset, &header.zone_count);
		decode_u32_le(buffer, &offset, &header.first_list);
		decode_u32_le(buffer, &offset, &header.list_count);
		decode_u64_le(buffer, &offset, &header.record_count);
		decode_u64_le(buffer, &offset, &header.collision_count);

		result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
				    "%zu bytes decoded of %zu expected", offset,
				    sizeof(struct delta_index_header));
		if (result != VDO_SUCCESS) {
			return vdo_log_warning_strerror(result,
							"failed to read delta index header");
		}

		if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) {
			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
							"delta index file has bad magic number");
		}

		if (zone_count != header.zone_count) {
			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
							"delta index files contain mismatched zone counts (%u,%u)",
							zone_count, header.zone_count);
		}

		if (header.zone_number != z) {
			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
							"delta index zone %u found in slot %u",
							header.zone_number, z);
		}

		first_list[z] = header.first_list;
		list_count[z] = header.list_count;
		record_count += header.record_count;
		collision_count += header.collision_count;

		if (first_list[z] != list_next) {
			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
							"delta index file for zone %u starts with list %u instead of list %u",
							z, first_list[z], list_next);
		}

		list_next += list_count[z];
	}

	if (list_next != delta_index->list_count) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"delta index files contain %u delta lists instead of %u delta lists",
						list_next, delta_index->list_count);
	}

	if (collision_count > record_count) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"delta index files contain %llu collisions and %llu records",
						(unsigned long long) collision_count,
						(unsigned long long) record_count);
	}

	uds_reset_delta_index(delta_index);
	delta_index->delta_zones[0].record_count = record_count;
	delta_index->delta_zones[0].collision_count = collision_count;

	/* Read the delta lists and distribute them to the proper zones. */
	for (z = 0; z < zone_count; z++) {
		u32 i;

		delta_index->load_lists[z] = 0;
		for (i = 0; i < list_count[z]; i++) {
			u16 delta_list_size;
			u32 list_number;
			unsigned int zone_number;
			u8 size_data[sizeof(u16)];

			result = uds_read_from_buffered_reader(buffered_readers[z],
							       size_data,
							       sizeof(size_data));
			if (result != UDS_SUCCESS) {
				return vdo_log_warning_strerror(result,
								"failed to read delta index size");
			}

			delta_list_size = get_unaligned_le16(size_data);
			if (delta_list_size > 0)
				delta_index->load_lists[z] += 1;

			list_number = first_list[z] + i;
			zone_number = list_number / delta_index->lists_per_zone;
			delta_zone = &delta_index->delta_zones[zone_number];
			list_number -= delta_zone->first_list;
			delta_zone->delta_lists[list_number + 1].size = delta_list_size;
		}
	}

	/* Prepare each zone to start receiving the delta list data. */
	for (z = 0; z < delta_index->zone_count; z++)
		rebalance_lists(&delta_index->delta_zones[z]);

	return UDS_SUCCESS;
}
static int restore_delta_list_to_zone(struct delta_zone *delta_zone,
				      const struct delta_list_save_info *save_info,
				      const u8 *data)
{
	struct delta_list *delta_list;
	u16 bit_count;
	u16 byte_count;
	u32 list_number = save_info->index - delta_zone->first_list;

	if (list_number >= delta_zone->list_count) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"invalid delta list number %u not in range [%u,%u)",
						save_info->index, delta_zone->first_list,
						delta_zone->first_list + delta_zone->list_count);
	}

	delta_list = &delta_zone->delta_lists[list_number + 1];
	if (delta_list->size == 0) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"unexpected delta list number %u",
						save_info->index);
	}

	bit_count = delta_list->size + save_info->bit_offset;
	byte_count = BITS_TO_BYTES(bit_count);
	if (save_info->byte_count != byte_count) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"unexpected delta list size %u != %u",
						save_info->byte_count, byte_count);
	}

	move_bits(data, save_info->bit_offset, delta_zone->memory, delta_list->start,
		  delta_list->size);
	return UDS_SUCCESS;
}

static int restore_delta_list_data(struct delta_index *delta_index, unsigned int load_zone,
				   struct buffered_reader *buffered_reader, u8 *data)
{
	int result;
	struct delta_list_save_info save_info;
	u8 buffer[sizeof(struct delta_list_save_info)];
	unsigned int new_zone;

	result = uds_read_from_buffered_reader(buffered_reader, buffer, sizeof(buffer));
	if (result != UDS_SUCCESS) {
		return vdo_log_warning_strerror(result,
						"failed to read delta list data");
	}

	save_info = (struct delta_list_save_info) {
		.tag = buffer[0],
		.bit_offset = buffer[1],
		.byte_count = get_unaligned_le16(&buffer[2]),
		.index = get_unaligned_le32(&buffer[4]),
	};

	if ((save_info.bit_offset >= BITS_PER_BYTE) ||
	    (save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT)) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"corrupt delta list data");
	}

	/* Make sure the data is intended for this delta index. */
	if (save_info.tag != delta_index->tag)
		return UDS_CORRUPT_DATA;

	if (save_info.index >= delta_index->list_count) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"invalid delta list number %u of %u",
						save_info.index,
						delta_index->list_count);
	}

	result = uds_read_from_buffered_reader(buffered_reader, data,
					       save_info.byte_count);
	if (result != UDS_SUCCESS) {
		return vdo_log_warning_strerror(result,
						"failed to read delta list data");
	}

	delta_index->load_lists[load_zone] -= 1;
	new_zone = save_info.index / delta_index->lists_per_zone;
	return restore_delta_list_to_zone(&delta_index->delta_zones[new_zone],
					  &save_info, data);
}
/* Restore delta lists from saved data. */
int uds_finish_restoring_delta_index(struct delta_index *delta_index,
				     struct buffered_reader **buffered_readers,
				     unsigned int reader_count)
{
	int result;
	int saved_result = UDS_SUCCESS;
	unsigned int z;
	u8 *data;

	result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data);
	if (result != VDO_SUCCESS)
		return result;

	for (z = 0; z < reader_count; z++) {
		while (delta_index->load_lists[z] > 0) {
			result = restore_delta_list_data(delta_index, z,
							 buffered_readers[z], data);
			if (result != UDS_SUCCESS) {
				saved_result = result;
				break;
			}
		}
	}

	vdo_free(data);
	return saved_result;
}

int uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
				unsigned int reader_count)
{
	int result;
	unsigned int z;
	u8 buffer[sizeof(struct delta_list_save_info)];

	for (z = 0; z < reader_count; z++) {
		result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
						       sizeof(buffer));
		if (result != UDS_SUCCESS)
			return result;

		if (buffer[0] != 'z')
			return UDS_CORRUPT_DATA;
	}

	return UDS_SUCCESS;
}
static int flush_delta_list(struct delta_zone *zone, u32 flush_index)
{
	struct delta_list *delta_list;
	u8 buffer[sizeof(struct delta_list_save_info)];
	int result;

	delta_list = &zone->delta_lists[flush_index + 1];

	buffer[0] = zone->tag;
	buffer[1] = delta_list->start % BITS_PER_BYTE;
	put_unaligned_le16(get_delta_list_byte_size(delta_list), &buffer[2]);
	put_unaligned_le32(zone->first_list + flush_index, &buffer[4]);

	result = uds_write_to_buffered_writer(zone->buffered_writer, buffer,
					      sizeof(buffer));
	if (result != UDS_SUCCESS) {
		vdo_log_warning_strerror(result, "failed to write delta list memory");
		return result;
	}

	result = uds_write_to_buffered_writer(zone->buffered_writer,
					      zone->memory + get_delta_list_byte_start(delta_list),
					      get_delta_list_byte_size(delta_list));
	if (result != UDS_SUCCESS)
		vdo_log_warning_strerror(result, "failed to write delta list memory");

	return result;
}

/* Start saving a delta index zone to a buffered output stream. */
int uds_start_saving_delta_index(const struct delta_index *delta_index,
				 unsigned int zone_number,
				 struct buffered_writer *buffered_writer)
{
	int result;
	u32 i;
	struct delta_zone *delta_zone;
	u8 buffer[sizeof(struct delta_index_header)];
	size_t offset = 0;

	delta_zone = &delta_index->delta_zones[zone_number];
	memcpy(buffer, DELTA_INDEX_MAGIC, MAGIC_SIZE);
	offset += MAGIC_SIZE;
	encode_u32_le(buffer, &offset, zone_number);
	encode_u32_le(buffer, &offset, delta_index->zone_count);
	encode_u32_le(buffer, &offset, delta_zone->first_list);
	encode_u32_le(buffer, &offset, delta_zone->list_count);
	encode_u64_le(buffer, &offset, delta_zone->record_count);
	encode_u64_le(buffer, &offset, delta_zone->collision_count);

	result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
			    "%zu bytes encoded of %zu expected", offset,
			    sizeof(struct delta_index_header));
	if (result != VDO_SUCCESS)
		return result;

	result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
	if (result != UDS_SUCCESS)
		return vdo_log_warning_strerror(result,
						"failed to write delta index header");

	for (i = 0; i < delta_zone->list_count; i++) {
		u8 data[sizeof(u16)];
		struct delta_list *delta_list;

		delta_list = &delta_zone->delta_lists[i + 1];
		put_unaligned_le16(delta_list->size, data);
		result = uds_write_to_buffered_writer(buffered_writer, data,
						      sizeof(data));
		if (result != UDS_SUCCESS)
			return vdo_log_warning_strerror(result,
							"failed to write delta list size");
	}

	delta_zone->buffered_writer = buffered_writer;
	return UDS_SUCCESS;
}
int uds_finish_saving_delta_index(const struct delta_index *delta_index,
				  unsigned int zone_number)
{
	int result;
	int first_error = UDS_SUCCESS;
	u32 i;
	struct delta_zone *delta_zone;
	struct delta_list *delta_list;

	delta_zone = &delta_index->delta_zones[zone_number];
	for (i = 0; i < delta_zone->list_count; i++) {
		delta_list = &delta_zone->delta_lists[i + 1];
		if (delta_list->size > 0) {
			result = flush_delta_list(delta_zone, i);
			if ((result != UDS_SUCCESS) && (first_error == UDS_SUCCESS))
				first_error = result;
		}
	}

	delta_zone->buffered_writer = NULL;
	return first_error;
}

int uds_write_guard_delta_list(struct buffered_writer *buffered_writer)
{
	int result;
	u8 buffer[sizeof(struct delta_list_save_info)];

	memset(buffer, 0, sizeof(struct delta_list_save_info));
	buffer[0] = 'z';

	result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer));
	if (result != UDS_SUCCESS)
		vdo_log_warning_strerror(result, "failed to write guard delta list");

	return result;
}

size_t uds_compute_delta_index_save_bytes(u32 list_count, size_t memory_size)
{
	/* One zone will use at least as much memory as other zone counts. */
	return (sizeof(struct delta_index_header) +
		list_count * (sizeof(struct delta_list_save_info) + 1) +
		get_zone_memory_size(1, memory_size));
}
static int assert_not_at_end(const struct delta_index_entry *delta_entry)
{
	int result = VDO_ASSERT(!delta_entry->at_end,
				"operation is invalid because the list entry is at the end of the delta list");
	if (result != VDO_SUCCESS)
		result = UDS_BAD_STATE;

	return result;
}

/*
 * Prepare to search for an entry in the specified delta list.
 *
 * This is always the first function to be called when dealing with delta index entries. It is
 * always followed by calls to uds_next_delta_index_entry() to iterate through a delta list. The
 * fields of the delta_index_entry argument will be set up for iteration, but will not contain an
 * entry from the list.
 */
int uds_start_delta_index_search(const struct delta_index *delta_index, u32 list_number,
				 u32 key, struct delta_index_entry *delta_entry)
{
	int result;
	unsigned int zone_number;
	struct delta_zone *delta_zone;
	struct delta_list *delta_list;

	result = VDO_ASSERT((list_number < delta_index->list_count),
			    "Delta list number (%u) is out of range (%u)", list_number,
			    delta_index->list_count);
	if (result != VDO_SUCCESS)
		return UDS_CORRUPT_DATA;

	zone_number = list_number / delta_index->lists_per_zone;
	delta_zone = &delta_index->delta_zones[zone_number];
	list_number -= delta_zone->first_list;
	result = VDO_ASSERT((list_number < delta_zone->list_count),
			    "Delta list number (%u) is out of range (%u) for zone (%u)",
			    list_number, delta_zone->list_count, zone_number);
	if (result != VDO_SUCCESS)
		return UDS_CORRUPT_DATA;

	if (delta_index->mutable) {
		delta_list = &delta_zone->delta_lists[list_number + 1];
	} else {
		u32 end_offset;

		/*
		 * Translate the immutable delta list header into a temporary
		 * full delta list header.
		 */
		delta_list = &delta_entry->temp_delta_list;
		delta_list->start = get_immutable_start(delta_zone->memory, list_number);
		end_offset = get_immutable_start(delta_zone->memory, list_number + 1);
		delta_list->size = end_offset - delta_list->start;
		delta_list->save_key = 0;
		delta_list->save_offset = 0;
	}

	if (key > delta_list->save_key) {
		delta_entry->key = delta_list->save_key;
		delta_entry->offset = delta_list->save_offset;
	} else {
		delta_entry->key = 0;
		delta_entry->offset = 0;
		if (key == 0) {
			/*
			 * This usually means we're about to walk the entire delta list, so get all
			 * of it into the CPU cache.
			 */
			uds_prefetch_range(&delta_zone->memory[delta_list->start / BITS_PER_BYTE],
					   delta_list->size / BITS_PER_BYTE, false);
		}
	}

	delta_entry->at_end = false;
	delta_entry->delta_zone = delta_zone;
	delta_entry->delta_list = delta_list;
	delta_entry->entry_bits = 0;
	delta_entry->is_collision = false;
	delta_entry->list_number = list_number;
	delta_entry->list_overflow = false;
	delta_entry->value_bits = delta_zone->value_bits;
	return UDS_SUCCESS;
}

static inline u64 get_delta_entry_offset(const struct delta_index_entry *delta_entry)
{
	return delta_entry->delta_list->start + delta_entry->offset;
}
/*
 * Decode a delta index entry delta value. The delta_index_entry basically describes the previous
 * list entry, and has had its offset field changed to point to the subsequent entry. We decode the
 * bit stream and update the delta_list_entry to describe the entry.
 */
static inline void decode_delta(struct delta_index_entry *delta_entry)
{
	int key_bits;
	u32 delta;
	const struct delta_zone *delta_zone = delta_entry->delta_zone;
	const u8 *memory = delta_zone->memory;
	u64 delta_offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
	const u8 *addr = memory + delta_offset / BITS_PER_BYTE;
	int offset = delta_offset % BITS_PER_BYTE;
	u32 data = get_unaligned_le32(addr) >> offset;

	addr += sizeof(u32);
	key_bits = delta_zone->min_bits;
	delta = data & ((1 << key_bits) - 1);
	if (delta >= delta_zone->min_keys) {
		data >>= key_bits;
		if (data == 0) {
			key_bits = sizeof(u32) * BITS_PER_BYTE - offset;
			while ((data = get_unaligned_le32(addr)) == 0) {
				addr += sizeof(u32);
				key_bits += sizeof(u32) * BITS_PER_BYTE;
			}
		}
		key_bits += ffs(data);
		delta += ((key_bits - delta_zone->min_bits - 1) * delta_zone->incr_keys);
	}

	delta_entry->delta = delta;
	delta_entry->key += delta;

	/* Check for a collision, a delta of zero after the start. */
	if (unlikely((delta == 0) && (delta_entry->offset > 0))) {
		delta_entry->is_collision = true;
		delta_entry->entry_bits = delta_entry->value_bits + key_bits + COLLISION_BITS;
	} else {
		delta_entry->is_collision = false;
		delta_entry->entry_bits = delta_entry->value_bits + key_bits;
	}
}
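/*
 * Note (illustrative, using the example constants min_bits == 12, min_keys == 1257 and
 * incr_keys == 2839): when the 12-bit field holds a value below 1257 it is the complete delta;
 * otherwise the zero-skipping loop above counts 0 bits until the terminating 1 bit, and each
 * skipped 0 bit adds incr_keys to the delta. The all-ones tail guard list guarantees that this
 * loop terminates even when a corrupted list would otherwise run past the end of the zone memory.
 */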
noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry)
{
	int result;
	const struct delta_list *delta_list;
	u32 next_offset;
	u32 size;

	result = assert_not_at_end(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	delta_list = delta_entry->delta_list;
	delta_entry->offset += delta_entry->entry_bits;
	size = delta_list->size;
	if (unlikely(delta_entry->offset >= size)) {
		delta_entry->at_end = true;
		delta_entry->delta = 0;
		delta_entry->is_collision = false;
		result = VDO_ASSERT((delta_entry->offset == size),
				    "next offset past end of delta list");
		if (result != VDO_SUCCESS)
			result = UDS_CORRUPT_DATA;

		return result;
	}

	decode_delta(delta_entry);

	next_offset = delta_entry->offset + delta_entry->entry_bits;
	if (next_offset > size) {
		/*
		 * This is not an assertion because uds_validate_chapter_index_page() wants to
		 * handle this error.
		 */
		vdo_log_warning("Decoded past the end of the delta list");
		return UDS_CORRUPT_DATA;
	}

	return UDS_SUCCESS;
}

int uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry)
{
	int result;
	struct delta_list *delta_list = delta_entry->delta_list;

	result = VDO_ASSERT(!delta_entry->is_collision, "entry is not a collision");
	if (result != VDO_SUCCESS)
		return result;

	delta_list->save_key = delta_entry->key - delta_entry->delta;
	delta_list->save_offset = delta_entry->offset;
	return UDS_SUCCESS;
}

static void set_delta(struct delta_index_entry *delta_entry, u32 delta)
{
	const struct delta_zone *delta_zone = delta_entry->delta_zone;
	u32 key_bits = (delta_zone->min_bits +
			((delta_zone->incr_keys - delta_zone->min_keys + delta) /
			 delta_zone->incr_keys));

	delta_entry->delta = delta;
	delta_entry->entry_bits = delta_entry->value_bits + key_bits;
}
static void get_collision_name(const struct delta_index_entry *entry, u8 *name)
{
	u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
	const u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
	int size = COLLISION_BYTES;
	int shift = offset % BITS_PER_BYTE;

	while (--size >= 0)
		*name++ = get_unaligned_le16(addr++) >> shift;
}

static void set_collision_name(const struct delta_index_entry *entry, const u8 *name)
{
	u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
	u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
	int size = COLLISION_BYTES;
	int shift = offset % BITS_PER_BYTE;
	u16 mask = ~((u16) 0xFF << shift);
	u16 data;

	while (--size >= 0) {
		data = (get_unaligned_le16(addr) & mask) | (*name++ << shift);
		put_unaligned_le16(data, addr++);
	}
}

int uds_get_delta_index_entry(const struct delta_index *delta_index, u32 list_number,
			      u32 key, const u8 *name,
			      struct delta_index_entry *delta_entry)
{
	int result;

	result = uds_start_delta_index_search(delta_index, list_number, key,
					      delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	do {
		result = uds_next_delta_index_entry(delta_entry);
		if (result != UDS_SUCCESS)
			return result;
	} while (!delta_entry->at_end && (key > delta_entry->key));

	result = uds_remember_delta_index_offset(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	if (!delta_entry->at_end && (key == delta_entry->key)) {
		struct delta_index_entry collision_entry = *delta_entry;

		for (;;) {
			u8 full_name[COLLISION_BYTES];

			result = uds_next_delta_index_entry(&collision_entry);
			if (result != UDS_SUCCESS)
				return result;

			if (collision_entry.at_end || !collision_entry.is_collision)
				break;

			get_collision_name(&collision_entry, full_name);
			if (memcmp(full_name, name, COLLISION_BYTES) == 0) {
				*delta_entry = collision_entry;
				break;
			}
		}
	}

	return UDS_SUCCESS;
}
int uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, u8 *name)
{
	int result;

	result = assert_not_at_end(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	result = VDO_ASSERT(delta_entry->is_collision,
			    "Cannot get full block name from a non-collision delta index entry");
	if (result != VDO_SUCCESS)
		return UDS_BAD_STATE;

	get_collision_name(delta_entry, name);
	return UDS_SUCCESS;
}

u32 uds_get_delta_entry_value(const struct delta_index_entry *delta_entry)
{
	return get_field(delta_entry->delta_zone->memory,
			 get_delta_entry_offset(delta_entry), delta_entry->value_bits);
}

static int assert_mutable_entry(const struct delta_index_entry *delta_entry)
{
	int result = VDO_ASSERT((delta_entry->delta_list != &delta_entry->temp_delta_list),
				"delta index is mutable");
	if (result != VDO_SUCCESS)
		result = UDS_BAD_STATE;

	return result;
}

int uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value)
{
	int result;
	u32 value_mask = (1 << delta_entry->value_bits) - 1;

	result = assert_mutable_entry(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	result = assert_not_at_end(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	result = VDO_ASSERT((value & value_mask) == value,
			    "Value (%u) being set in a delta index is too large (must fit in %u bits)",
			    value, delta_entry->value_bits);
	if (result != VDO_SUCCESS)
		return UDS_INVALID_ARGUMENT;

	set_field(value, delta_entry->delta_zone->memory,
		  get_delta_entry_offset(delta_entry), delta_entry->value_bits);
	return UDS_SUCCESS;
}
/*
 * Extend the memory used by the delta lists by adding growing_size bytes before the list indicated
 * by growing_index, then rebalancing the lists in the new chunk.
 */
static int extend_delta_zone(struct delta_zone *delta_zone, u32 growing_index,
			     size_t growing_size)
{
	ktime_t start_time;
	ktime_t end_time;
	struct delta_list *delta_lists;
	u32 i;
	size_t used_space;

	/* Calculate the amount of space that is or will be in use. */
	start_time = current_time_ns(CLOCK_MONOTONIC);
	delta_lists = delta_zone->delta_lists;
	used_space = growing_size;
	for (i = 0; i <= delta_zone->list_count + 1; i++)
		used_space += get_delta_list_byte_size(&delta_lists[i]);

	if (delta_zone->size < used_space)
		return UDS_OVERFLOW;

	/* Compute the new offsets of the delta lists. */
	compute_new_list_offsets(delta_zone, growing_index, growing_size, used_space);

	/*
	 * When we rebalance the delta list, we will include the end guard list in the rebalancing.
	 * It contains the end guard data, which must be copied.
	 */
	rebalance_delta_zone(delta_zone, 1, delta_zone->list_count + 1);
	end_time = current_time_ns(CLOCK_MONOTONIC);
	delta_zone->rebalance_count++;
	delta_zone->rebalance_time += ktime_sub(end_time, start_time);
	return UDS_SUCCESS;
}

static int insert_bits(struct delta_index_entry *delta_entry, u16 size)
{
	u64 free_before;
	u64 free_after;
	u64 source;
	u64 destination;
	u32 count;
	bool before_flag;
	u8 *memory;
	struct delta_zone *delta_zone = delta_entry->delta_zone;
	struct delta_list *delta_list = delta_entry->delta_list;
	/* Compute bits in use before and after the inserted bits. */
	u32 total_size = delta_list->size;
	u32 before_size = delta_entry->offset;
	u32 after_size = total_size - delta_entry->offset;

	if (total_size + size > U16_MAX) {
		delta_entry->list_overflow = true;
		delta_zone->overflow_count++;
		return UDS_OVERFLOW;
	}

	/* Compute bits available before and after the delta list. */
	free_before = (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
	free_after = (delta_list[1].start - (delta_list[0].start + delta_list[0].size));

	if ((size <= free_before) && (size <= free_after)) {
		/*
		 * We have enough space to use either before or after the list. Select the smaller
		 * amount of data. If it is exactly the same, try to take from the larger amount of
		 * free space.
		 */
		if (before_size < after_size)
			before_flag = true;
		else if (after_size < before_size)
			before_flag = false;
		else
			before_flag = free_before > free_after;
	} else if (size <= free_before) {
		/* There is space before but not after. */
		before_flag = true;
	} else if (size <= free_after) {
		/* There is space after but not before. */
		before_flag = false;
	} else {
		/*
		 * Neither of the surrounding spaces is large enough for this request. Extend
		 * and/or rebalance the delta list memory choosing to move the least amount of
		 * data.
		 */
		int result;
		u32 growing_index = delta_entry->list_number + 1;

		before_flag = before_size < after_size;
		if (!before_flag)
			growing_index++;
		result = extend_delta_zone(delta_zone, growing_index,
					   BITS_TO_BYTES(size));
		if (result != UDS_SUCCESS)
			return result;
	}

	delta_list->size += size;
	if (before_flag) {
		source = delta_list->start;
		destination = source - size;
		delta_list->start -= size;
		count = before_size;
	} else {
		source = delta_list->start + delta_entry->offset;
		destination = source + size;
		count = after_size;
	}

	memory = delta_zone->memory;
	move_bits(memory, source, memory, destination, count);
	return UDS_SUCCESS;
}
static void encode_delta(const struct delta_index_entry *delta_entry)
{
	u32 temp;
	u32 t1;
	u32 t2;
	u64 offset;
	const struct delta_zone *delta_zone = delta_entry->delta_zone;
	u8 *memory = delta_zone->memory;

	offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
	if (delta_entry->delta < delta_zone->min_keys) {
		set_field(delta_entry->delta, memory, offset, delta_zone->min_bits);
		return;
	}

	temp = delta_entry->delta - delta_zone->min_keys;
	t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys;
	t2 = temp / delta_zone->incr_keys;
	set_field(t1, memory, offset, delta_zone->min_bits);
	set_zero(memory, offset + delta_zone->min_bits, t2);
	set_field(1, memory, offset + delta_zone->min_bits + t2, 1);
}
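/*
 * For example (illustrative numbers, matching the constants used above): with min_bits == 12,
 * min_keys == 1257 and incr_keys == 2839, a delta of 6978 gives temp == 5721, t1 == 5721 % 2839
 * + 1257 == 1300 and t2 == 2, so the encoder writes 1300 in 12 bits, two zero bits, and a final
 * 1 bit. decode_delta() reverses this to recover 1300 + 2 * 2839 == 6978.
 */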
static void encode_entry(const struct delta_index_entry *delta_entry, u32 value,
			 const u8 *name)
{
	u8 *memory = delta_entry->delta_zone->memory;
	u64 offset = get_delta_entry_offset(delta_entry);

	set_field(value, memory, offset, delta_entry->value_bits);
	encode_delta(delta_entry);
	if (name != NULL)
		set_collision_name(delta_entry, name);
}

/*
 * Create a new entry in the delta index. If the entry is a collision, the full 256 bit name must
 * be provided.
 */
int uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, u32 value,
			      const u8 *name)
{
	int result;
	struct delta_zone *delta_zone;

	result = assert_mutable_entry(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	if (delta_entry->is_collision) {
		/*
		 * The caller wants us to insert a collision entry onto a collision entry. This
		 * happens when we find a collision and attempt to add the name again to the index.
		 * This is normally a fatal error unless we are replaying a closed chapter while we
		 * are rebuilding a volume index.
		 */
		return UDS_DUPLICATE_NAME;
	}

	if (delta_entry->offset < delta_entry->delta_list->save_offset) {
		/*
		 * The saved entry offset is after the new entry and will no longer be valid, so
		 * replace it with the insertion point.
		 */
		result = uds_remember_delta_index_offset(delta_entry);
		if (result != UDS_SUCCESS)
			return result;
	}

	if (name != NULL) {
		/* Insert a collision entry which is placed after this entry. */
		result = assert_not_at_end(delta_entry);
		if (result != UDS_SUCCESS)
			return result;

		result = VDO_ASSERT((key == delta_entry->key),
				    "incorrect key for collision entry");
		if (result != VDO_SUCCESS)
			return result;

		delta_entry->offset += delta_entry->entry_bits;
		set_delta(delta_entry, 0);
		delta_entry->is_collision = true;
		delta_entry->entry_bits += COLLISION_BITS;
		result = insert_bits(delta_entry, delta_entry->entry_bits);
	} else if (delta_entry->at_end) {
		/* Insert a new entry at the end of the delta list. */
		result = VDO_ASSERT((key >= delta_entry->key), "key past end of list");
		if (result != VDO_SUCCESS)
			return result;

		set_delta(delta_entry, key - delta_entry->key);
		delta_entry->key = key;
		delta_entry->at_end = false;
		result = insert_bits(delta_entry, delta_entry->entry_bits);
	} else {
		u16 old_entry_size;
		u16 additional_size;
		struct delta_index_entry next_entry;
		u32 next_value;

		/*
		 * Insert a new entry which requires the delta in the following entry to be
		 * updated.
		 */
		result = VDO_ASSERT((key < delta_entry->key),
				    "key precedes following entry");
		if (result != VDO_SUCCESS)
			return result;

		result = VDO_ASSERT((key >= delta_entry->key - delta_entry->delta),
				    "key effects following entry's delta");
		if (result != VDO_SUCCESS)
			return result;

		old_entry_size = delta_entry->entry_bits;
		next_entry = *delta_entry;
		next_value = uds_get_delta_entry_value(&next_entry);
		set_delta(delta_entry, key - (delta_entry->key - delta_entry->delta));
		delta_entry->key = key;
		set_delta(&next_entry, next_entry.key - key);
		next_entry.offset += delta_entry->entry_bits;
		/* The two new entries are always bigger than the single entry being replaced. */
		additional_size = (delta_entry->entry_bits +
				   next_entry.entry_bits - old_entry_size);
		result = insert_bits(delta_entry, additional_size);
		if (result != UDS_SUCCESS)
			return result;

		encode_entry(&next_entry, next_value, NULL);
	}

	if (result != UDS_SUCCESS)
		return result;

	encode_entry(delta_entry, value, name);
	delta_zone = delta_entry->delta_zone;
	delta_zone->record_count++;
	delta_zone->collision_count += delta_entry->is_collision ? 1 : 0;
	return UDS_SUCCESS;
}
static void delete_bits(const struct delta_index_entry *delta_entry, int size)
{
	u64 source;
	u64 destination;
	u32 count;
	bool before_flag;
	struct delta_list *delta_list = delta_entry->delta_list;
	u8 *memory = delta_entry->delta_zone->memory;
	/* Compute bits retained before and after the deleted bits. */
	u32 total_size = delta_list->size;
	u32 before_size = delta_entry->offset;
	u32 after_size = total_size - delta_entry->offset - size;

	/*
	 * Determine whether to add to the available space either before or after the delta list.
	 * We prefer to move the least amount of data. If it is exactly the same, try to add to the
	 * smaller amount of free space.
	 */
	if (before_size < after_size) {
		before_flag = true;
	} else if (after_size < before_size) {
		before_flag = false;
	} else {
		u64 free_before =
			(delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
		u64 free_after =
			(delta_list[1].start - (delta_list[0].start + delta_list[0].size));

		before_flag = (free_before < free_after);
	}

	delta_list->size -= size;
	if (before_flag) {
		source = delta_list->start;
		destination = source + size;
		delta_list->start += size;
		count = before_size;
	} else {
		destination = delta_list->start + delta_entry->offset;
		source = destination + size;
		count = after_size;
	}

	move_bits(memory, source, memory, destination, count);
}

int uds_remove_delta_index_entry(struct delta_index_entry *delta_entry)
{
	int result;
	struct delta_index_entry next_entry;
	struct delta_zone *delta_zone;
	struct delta_list *delta_list;

	result = assert_mutable_entry(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	next_entry = *delta_entry;
	result = uds_next_delta_index_entry(&next_entry);
	if (result != UDS_SUCCESS)
		return result;

	delta_zone = delta_entry->delta_zone;

	if (delta_entry->is_collision) {
		/* This is a collision entry, so just remove it. */
		delete_bits(delta_entry, delta_entry->entry_bits);
		next_entry.offset = delta_entry->offset;
		delta_zone->collision_count -= 1;
	} else if (next_entry.at_end) {
		/* This entry is at the end of the list, so just remove it. */
		delete_bits(delta_entry, delta_entry->entry_bits);
		next_entry.key -= delta_entry->delta;
		next_entry.offset = delta_entry->offset;
	} else {
		/* The delta in the next entry needs to be updated. */
		u32 next_value = uds_get_delta_entry_value(&next_entry);
		u16 old_size = delta_entry->entry_bits + next_entry.entry_bits;

		if (next_entry.is_collision) {
			next_entry.is_collision = false;
			delta_zone->collision_count -= 1;
		}

		set_delta(&next_entry, delta_entry->delta + next_entry.delta);
		next_entry.offset = delta_entry->offset;
		/* The one new entry is always smaller than the two entries being replaced. */
		delete_bits(delta_entry, old_size - next_entry.entry_bits);
		encode_entry(&next_entry, next_value, NULL);
	}

	delta_zone->record_count--;
	delta_zone->discard_count++;
	*delta_entry = next_entry;

	delta_list = delta_entry->delta_list;
	if (delta_entry->offset < delta_list->save_offset) {
		/* The saved entry offset is no longer valid. */
		delta_list->save_key = 0;
		delta_list->save_offset = 0;
	}

	return UDS_SUCCESS;
}
void uds_get_delta_index_stats(const struct delta_index *delta_index,
			       struct delta_index_stats *stats)
{
	unsigned int z;
	const struct delta_zone *delta_zone;

	memset(stats, 0, sizeof(struct delta_index_stats));
	for (z = 0; z < delta_index->zone_count; z++) {
		delta_zone = &delta_index->delta_zones[z];
		stats->rebalance_time += delta_zone->rebalance_time;
		stats->rebalance_count += delta_zone->rebalance_count;
		stats->record_count += delta_zone->record_count;
		stats->collision_count += delta_zone->collision_count;
		stats->discard_count += delta_zone->discard_count;
		stats->overflow_count += delta_zone->overflow_count;
		stats->list_count += delta_zone->list_count;
	}
}

size_t uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, u32 payload_bits)
{
	u16 min_bits;
	u32 incr_keys;
	u32 min_keys;

	compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys);
	/* On average, each delta is encoded into about min_bits + 1.5 bits. */
	return entry_count * (payload_bits + min_bits + 1) + entry_count / 2;
}
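/*
 * For example (illustrative numbers): with mean_delta == 4096 (so min_bits == 12) and
 * payload_bits == 8, each entry needs about 8 + 12 + 1.5 == 21.5 bits, so one million entries
 * need roughly 21.5 million bits, which is what this expression returns.
 */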
u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
				   u32 payload_bits, size_t bytes_per_page)
{
	unsigned int bits_per_delta_list;
	unsigned int bits_per_page;
	size_t bits_per_index;

	/* Compute the expected number of bits needed for all the entries. */
	bits_per_index = uds_compute_delta_index_size(entry_count, mean_delta,
						      payload_bits);
	bits_per_delta_list = bits_per_index / list_count;

	/* Add in the immutable delta list headers. */
	bits_per_index += list_count * IMMUTABLE_HEADER_SIZE;
	/* Compute the number of usable bits on an immutable index page. */
	bits_per_page = ((bytes_per_page - sizeof(struct delta_page_header)) * BITS_PER_BYTE);
	/*
	 * Reduce the bits per page by one immutable delta list header and one delta list to
	 * account for internal fragmentation.
	 */
	bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list;
	/* Now compute the number of pages needed. */
	return DIV_ROUND_UP(bits_per_index, bits_per_page);
}

void uds_log_delta_index_entry(struct delta_index_entry *delta_entry)
{
	vdo_log_ratelimit(vdo_log_info,
			  "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s",
			  delta_entry->list_number, delta_entry->key,
			  delta_entry->offset, delta_entry->at_end ? " end" : "",
			  delta_entry->is_collision ? " collision" : "",
			  delta_entry->delta_list->size,
			  delta_entry->list_overflow ? " overflow" : "");
	delta_entry->list_overflow = false;
}