Merge tag 'trace-printf-v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/trace...
[drm/drm-misc.git] / drivers / md / dm-vdo / indexer / delta-index.c
blob0ac2443f0df363cf7897f40c1dbc23d074d741b7
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright 2023 Red Hat
4 */
5 #include "delta-index.h"
7 #include <linux/bitops.h>
8 #include <linux/bits.h>
9 #include <linux/compiler.h>
10 #include <linux/limits.h>
11 #include <linux/log2.h>
13 #include "cpu.h"
14 #include "errors.h"
15 #include "logger.h"
16 #include "memory-alloc.h"
17 #include "numeric.h"
18 #include "permassert.h"
19 #include "string-utils.h"
20 #include "time-utils.h"
22 #include "config.h"
23 #include "indexer.h"
26 * The entries in a delta index could be stored in a single delta list, but to reduce search times
27 * and update costs it uses multiple delta lists. These lists are stored in a single chunk of
28 * memory managed by the delta_zone structure. The delta_zone can move the data around within its
29 * memory, so the location of each delta list is recorded as a bit offset into the memory. Because
30 * the volume index can contain over a million delta lists, we want to be efficient with the size
31 * of the delta list header information. This information is encoded into 16 bytes per list. The
32 * volume index delta list memory can easily exceed 4 gigabits, so a 64 bit value is needed to
33 * address the memory. The volume index delta lists average around 6 kilobits, so 16 bits are
34 * sufficient to store the size of a delta list.
36 * Each delta list is stored as a bit stream. Within the delta list encoding, bits and bytes are
37 * numbered in little endian order. Within a byte, bit 0 is the least significant bit (0x1), and
38 * bit 7 is the most significant bit (0x80). Within a bit stream, bit 7 is the most significant bit
39 * of byte 0, and bit 8 is the least significant bit of byte 1. Within a byte array, a byte's
40 * number corresponds to its index in the array.
42 * A standard delta list entry is stored as a fixed length payload (the value) followed by a
43 * variable length key (the delta). A collision entry is used when two block names have the same
44 * delta list address. A collision entry always follows a standard entry for the hash with which it
45 * collides, and is encoded with DELTA == 0 with an additional 256 bits field at the end,
46 * containing the full block name. An entry with a delta of 0 at the beginning of a delta list
47 * indicates a normal entry.
49 * The delta in each entry is encoded with a variable-length Huffman code to minimize the memory
50 * used by small deltas. The Huffman code is specified by three parameters, which can be computed
51 * from the desired mean delta when the index is full. (See compute_coding_constants() for
52 * details.)
54 * The bit field utilities used to read and write delta entries assume that it is possible to read
55 * some bytes beyond the end of the bit field, so a delta_zone memory allocation is guarded by two
56 * invalid delta lists to prevent reading outside the delta_zone memory. The valid delta lists are
57 * numbered 1 to N, and the guard lists are numbered 0 and N+1. The function to decode the bit
58 * stream include a step that skips over bits set to 0 until the first 1 bit is found. A corrupted
59 * delta list could cause this step to run off the end of the delta_zone memory, so as extra
60 * protection against this happening, the tail guard list is set to all ones.
62 * The delta_index supports two different forms. The mutable form is created by
63 * uds_initialize_delta_index(), and is used for the volume index and for open chapter indexes. The
64 * immutable form is created by uds_initialize_delta_index_page(), and is used for closed (and
65 * cached) chapter index pages. The immutable form does not allocate delta list headers or
66 * temporary offsets, and thus is somewhat more memory efficient.
70 * This is the largest field size supported by get_field() and set_field(). Any field that is
71 * larger is not guaranteed to fit in a single byte-aligned u32.
73 #define MAX_FIELD_BITS ((sizeof(u32) - 1) * BITS_PER_BYTE + 1)
76 * This is the largest field size supported by get_big_field() and set_big_field(). Any field that
77 * is larger is not guaranteed to fit in a single byte-aligned u64.
79 #define MAX_BIG_FIELD_BITS ((sizeof(u64) - 1) * BITS_PER_BYTE + 1)
82 * This is the number of guard bytes needed at the end of the memory byte array when using the bit
83 * utilities. These utilities call get_big_field() and set_big_field(), which can access up to 7
84 * bytes beyond the end of the desired field. The definition is written to make it clear how this
85 * value is derived.
87 #define POST_FIELD_GUARD_BYTES (sizeof(u64) - 1)
89 /* The number of guard bits that are needed in the tail guard list */
90 #define GUARD_BITS (POST_FIELD_GUARD_BYTES * BITS_PER_BYTE)
93 * The maximum size of a single delta list in bytes. We count guard bytes in this value because a
94 * buffer of this size can be used with move_bits().
96 #define DELTA_LIST_MAX_BYTE_COUNT \
97 ((U16_MAX + BITS_PER_BYTE) / BITS_PER_BYTE + POST_FIELD_GUARD_BYTES)
99 /* The number of extra bytes and bits needed to store a collision entry */
100 #define COLLISION_BYTES UDS_RECORD_NAME_SIZE
101 #define COLLISION_BITS (COLLISION_BYTES * BITS_PER_BYTE)
104 * Immutable delta lists are packed into pages containing a header that encodes the delta list
105 * information into 19 bits per list (64KB bit offset).
107 #define IMMUTABLE_HEADER_SIZE 19
110 * Constants and structures for the saved delta index. "DI" is for delta_index, and -##### is a
111 * number to increment when the format of the data changes.
113 #define MAGIC_SIZE 8
115 static const char DELTA_INDEX_MAGIC[] = "DI-00002";
117 struct delta_index_header {
118 char magic[MAGIC_SIZE];
119 u32 zone_number;
120 u32 zone_count;
121 u32 first_list;
122 u32 list_count;
123 u64 record_count;
124 u64 collision_count;
128 * Header data used for immutable delta index pages. This data is followed by the delta list offset
129 * table.
131 struct delta_page_header {
132 /* Externally-defined nonce */
133 u64 nonce;
134 /* The virtual chapter number */
135 u64 virtual_chapter_number;
136 /* Index of the first delta list on the page */
137 u16 first_list;
138 /* Number of delta lists on the page */
139 u16 list_count;
140 } __packed;
142 static inline u64 get_delta_list_byte_start(const struct delta_list *delta_list)
144 return delta_list->start / BITS_PER_BYTE;
147 static inline u16 get_delta_list_byte_size(const struct delta_list *delta_list)
149 unsigned int bit_offset = delta_list->start % BITS_PER_BYTE;
151 return BITS_TO_BYTES(bit_offset + delta_list->size);
154 static void rebalance_delta_zone(const struct delta_zone *delta_zone, u32 first,
155 u32 last)
157 struct delta_list *delta_list;
158 u64 new_start;
160 if (first == last) {
161 /* Only one list is moving, and we know there is space. */
162 delta_list = &delta_zone->delta_lists[first];
163 new_start = delta_zone->new_offsets[first];
164 if (delta_list->start != new_start) {
165 u64 source;
166 u64 destination;
168 source = get_delta_list_byte_start(delta_list);
169 delta_list->start = new_start;
170 destination = get_delta_list_byte_start(delta_list);
171 memmove(delta_zone->memory + destination,
172 delta_zone->memory + source,
173 get_delta_list_byte_size(delta_list));
175 } else {
177 * There is more than one list. Divide the problem in half, and use recursive calls
178 * to process each half. Note that after this computation, first <= middle, and
179 * middle < last.
181 u32 middle = (first + last) / 2;
183 delta_list = &delta_zone->delta_lists[middle];
184 new_start = delta_zone->new_offsets[middle];
187 * The direction that our middle list is moving determines which half of the
188 * problem must be processed first.
190 if (new_start > delta_list->start) {
191 rebalance_delta_zone(delta_zone, middle + 1, last);
192 rebalance_delta_zone(delta_zone, first, middle);
193 } else {
194 rebalance_delta_zone(delta_zone, first, middle);
195 rebalance_delta_zone(delta_zone, middle + 1, last);
200 static inline size_t get_zone_memory_size(unsigned int zone_count, size_t memory_size)
202 /* Round up so that each zone is a multiple of 64K in size. */
203 size_t ALLOC_BOUNDARY = 64 * 1024;
205 return (memory_size / zone_count + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY;
208 void uds_reset_delta_index(const struct delta_index *delta_index)
210 unsigned int z;
213 * Initialize all delta lists to be empty. We keep 2 extra delta list descriptors, one
214 * before the first real entry and one after so that we don't need to bounds check the
215 * array access when calculating preceding and following gap sizes.
217 for (z = 0; z < delta_index->zone_count; z++) {
218 u64 list_bits;
219 u64 spacing;
220 u64 offset;
221 unsigned int i;
222 struct delta_zone *zone = &delta_index->delta_zones[z];
223 struct delta_list *delta_lists = zone->delta_lists;
225 /* Zeroing the delta list headers initializes the head guard list correctly. */
226 memset(delta_lists, 0,
227 (zone->list_count + 2) * sizeof(struct delta_list));
229 /* Set all the bits in the end guard list. */
230 list_bits = (u64) zone->size * BITS_PER_BYTE - GUARD_BITS;
231 delta_lists[zone->list_count + 1].start = list_bits;
232 delta_lists[zone->list_count + 1].size = GUARD_BITS;
233 memset(zone->memory + (list_bits / BITS_PER_BYTE), ~0,
234 POST_FIELD_GUARD_BYTES);
236 /* Evenly space out the real delta lists by setting regular offsets. */
237 spacing = list_bits / zone->list_count;
238 offset = spacing / 2;
239 for (i = 1; i <= zone->list_count; i++) {
240 delta_lists[i].start = offset;
241 offset += spacing;
244 /* Update the statistics. */
245 zone->discard_count += zone->record_count;
246 zone->record_count = 0;
247 zone->collision_count = 0;
251 /* Compute the Huffman coding parameters for the given mean delta. The Huffman code is specified by
252 * three parameters:
254 * MINBITS The number of bits in the smallest code
255 * BASE The number of values coded using a code of length MINBITS
256 * INCR The number of values coded by using one additional bit
258 * These parameters are related by this equation:
260 * BASE + INCR == 1 << MINBITS
262 * The math for the Huffman code of an exponential distribution says that
264 * INCR = log(2) * MEAN_DELTA
266 * Then use the smallest MINBITS value so that
268 * (1 << MINBITS) > INCR
270 * And then
272 * BASE = (1 << MINBITS) - INCR
274 * Now the index can generate a code such that
275 * - The first BASE values code using MINBITS bits.
276 * - The next INCR values code using MINBITS+1 bits.
277 * - The next INCR values code using MINBITS+2 bits.
278 * - (and so on).
280 static void compute_coding_constants(u32 mean_delta, u16 *min_bits, u32 *min_keys, u32 *incr_keys)
283 * We want to compute the rounded value of log(2) * mean_delta. Since we cannot always use
284 * floating point, use a really good integer approximation.
286 *incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL;
287 *min_bits = bits_per(*incr_keys + 1);
288 *min_keys = (1 << *min_bits) - *incr_keys;
291 void uds_uninitialize_delta_index(struct delta_index *delta_index)
293 unsigned int z;
295 if (delta_index->delta_zones == NULL)
296 return;
298 for (z = 0; z < delta_index->zone_count; z++) {
299 vdo_free(vdo_forget(delta_index->delta_zones[z].new_offsets));
300 vdo_free(vdo_forget(delta_index->delta_zones[z].delta_lists));
301 vdo_free(vdo_forget(delta_index->delta_zones[z].memory));
304 vdo_free(delta_index->delta_zones);
305 memset(delta_index, 0, sizeof(struct delta_index));
308 static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size,
309 u32 first_list, u32 list_count, u32 mean_delta,
310 u32 payload_bits, u8 tag)
312 int result;
314 result = vdo_allocate(size, u8, "delta list", &delta_zone->memory);
315 if (result != VDO_SUCCESS)
316 return result;
318 result = vdo_allocate(list_count + 2, u64, "delta list temp",
319 &delta_zone->new_offsets);
320 if (result != VDO_SUCCESS)
321 return result;
323 /* Allocate the delta lists. */
324 result = vdo_allocate(list_count + 2, struct delta_list, "delta lists",
325 &delta_zone->delta_lists);
326 if (result != VDO_SUCCESS)
327 return result;
329 compute_coding_constants(mean_delta, &delta_zone->min_bits,
330 &delta_zone->min_keys, &delta_zone->incr_keys);
331 delta_zone->value_bits = payload_bits;
332 delta_zone->buffered_writer = NULL;
333 delta_zone->size = size;
334 delta_zone->rebalance_time = 0;
335 delta_zone->rebalance_count = 0;
336 delta_zone->record_count = 0;
337 delta_zone->collision_count = 0;
338 delta_zone->discard_count = 0;
339 delta_zone->overflow_count = 0;
340 delta_zone->first_list = first_list;
341 delta_zone->list_count = list_count;
342 delta_zone->tag = tag;
344 return UDS_SUCCESS;
347 int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zone_count,
348 u32 list_count, u32 mean_delta, u32 payload_bits,
349 size_t memory_size, u8 tag)
351 int result;
352 unsigned int z;
353 size_t zone_memory;
355 result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones",
356 &delta_index->delta_zones);
357 if (result != VDO_SUCCESS)
358 return result;
360 delta_index->zone_count = zone_count;
361 delta_index->list_count = list_count;
362 delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count);
363 delta_index->memory_size = 0;
364 delta_index->mutable = true;
365 delta_index->tag = tag;
367 for (z = 0; z < zone_count; z++) {
368 u32 lists_in_zone = delta_index->lists_per_zone;
369 u32 first_list_in_zone = z * lists_in_zone;
371 if (z == zone_count - 1) {
373 * The last zone gets fewer lists if zone_count doesn't evenly divide
374 * list_count. We'll have an underflow if the assertion below doesn't hold.
376 if (delta_index->list_count <= first_list_in_zone) {
377 uds_uninitialize_delta_index(delta_index);
378 return vdo_log_error_strerror(UDS_INVALID_ARGUMENT,
379 "%u delta lists not enough for %u zones",
380 list_count, zone_count);
382 lists_in_zone = delta_index->list_count - first_list_in_zone;
385 zone_memory = get_zone_memory_size(zone_count, memory_size);
386 result = initialize_delta_zone(&delta_index->delta_zones[z], zone_memory,
387 first_list_in_zone, lists_in_zone,
388 mean_delta, payload_bits, tag);
389 if (result != UDS_SUCCESS) {
390 uds_uninitialize_delta_index(delta_index);
391 return result;
394 delta_index->memory_size +=
395 (sizeof(struct delta_zone) + zone_memory +
396 (lists_in_zone + 2) * (sizeof(struct delta_list) + sizeof(u64)));
399 uds_reset_delta_index(delta_index);
400 return UDS_SUCCESS;
403 /* Read a bit field from an arbitrary bit boundary. */
404 static inline u32 get_field(const u8 *memory, u64 offset, u8 size)
406 const void *addr = memory + offset / BITS_PER_BYTE;
408 return (get_unaligned_le32(addr) >> (offset % BITS_PER_BYTE)) & ((1 << size) - 1);
411 /* Write a bit field to an arbitrary bit boundary. */
412 static inline void set_field(u32 value, u8 *memory, u64 offset, u8 size)
414 void *addr = memory + offset / BITS_PER_BYTE;
415 int shift = offset % BITS_PER_BYTE;
416 u32 data = get_unaligned_le32(addr);
418 data &= ~(((1 << size) - 1) << shift);
419 data |= value << shift;
420 put_unaligned_le32(data, addr);
423 /* Get the bit offset to the immutable delta list header. */
424 static inline u32 get_immutable_header_offset(u32 list_number)
426 return sizeof(struct delta_page_header) * BITS_PER_BYTE +
427 list_number * IMMUTABLE_HEADER_SIZE;
430 /* Get the bit offset to the start of the immutable delta list bit stream. */
431 static inline u32 get_immutable_start(const u8 *memory, u32 list_number)
433 return get_field(memory, get_immutable_header_offset(list_number),
434 IMMUTABLE_HEADER_SIZE);
437 /* Set the bit offset to the start of the immutable delta list bit stream. */
438 static inline void set_immutable_start(u8 *memory, u32 list_number, u32 start)
440 set_field(start, memory, get_immutable_header_offset(list_number),
441 IMMUTABLE_HEADER_SIZE);
444 static bool verify_delta_index_page(u64 nonce, u16 list_count, u64 expected_nonce,
445 u8 *memory, size_t memory_size)
447 unsigned int i;
450 * Verify the nonce. A mismatch can happen here during rebuild if we haven't written the
451 * entire volume at least once.
453 if (nonce != expected_nonce)
454 return false;
456 /* Verify that the number of delta lists can fit in the page. */
457 if (list_count > ((memory_size - sizeof(struct delta_page_header)) *
458 BITS_PER_BYTE / IMMUTABLE_HEADER_SIZE))
459 return false;
462 * Verify that the first delta list is immediately after the last delta
463 * list header.
465 if (get_immutable_start(memory, 0) != get_immutable_header_offset(list_count + 1))
466 return false;
468 /* Verify that the lists are in the correct order. */
469 for (i = 0; i < list_count; i++) {
470 if (get_immutable_start(memory, i) > get_immutable_start(memory, i + 1))
471 return false;
475 * Verify that the last list ends on the page, and that there is room
476 * for the post-field guard bits.
478 if (get_immutable_start(memory, list_count) >
479 (memory_size - POST_FIELD_GUARD_BYTES) * BITS_PER_BYTE)
480 return false;
482 /* Verify that the guard bytes are correctly set to all ones. */
483 for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) {
484 if (memory[memory_size - POST_FIELD_GUARD_BYTES + i] != (u8) ~0)
485 return false;
488 /* All verifications passed. */
489 return true;
492 /* Initialize a delta index page to refer to a supplied page. */
493 int uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
494 u64 expected_nonce, u32 mean_delta, u32 payload_bits,
495 u8 *memory, size_t memory_size)
497 u64 nonce;
498 u64 vcn;
499 u64 first_list;
500 u64 list_count;
501 struct delta_page_header *header = (struct delta_page_header *) memory;
502 struct delta_zone *delta_zone = &delta_index_page->delta_zone;
503 const u8 *nonce_addr = (const u8 *) &header->nonce;
504 const u8 *vcn_addr = (const u8 *) &header->virtual_chapter_number;
505 const u8 *first_list_addr = (const u8 *) &header->first_list;
506 const u8 *list_count_addr = (const u8 *) &header->list_count;
508 /* First assume that the header is little endian. */
509 nonce = get_unaligned_le64(nonce_addr);
510 vcn = get_unaligned_le64(vcn_addr);
511 first_list = get_unaligned_le16(first_list_addr);
512 list_count = get_unaligned_le16(list_count_addr);
513 if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
514 memory_size)) {
515 /* If that fails, try big endian. */
516 nonce = get_unaligned_be64(nonce_addr);
517 vcn = get_unaligned_be64(vcn_addr);
518 first_list = get_unaligned_be16(first_list_addr);
519 list_count = get_unaligned_be16(list_count_addr);
520 if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
521 memory_size)) {
523 * Both attempts failed. Do not log this as an error, because it can happen
524 * during a rebuild if we haven't written the entire volume at least once.
526 return UDS_CORRUPT_DATA;
530 delta_index_page->delta_index.delta_zones = delta_zone;
531 delta_index_page->delta_index.zone_count = 1;
532 delta_index_page->delta_index.list_count = list_count;
533 delta_index_page->delta_index.lists_per_zone = list_count;
534 delta_index_page->delta_index.mutable = false;
535 delta_index_page->delta_index.tag = 'p';
536 delta_index_page->virtual_chapter_number = vcn;
537 delta_index_page->lowest_list_number = first_list;
538 delta_index_page->highest_list_number = first_list + list_count - 1;
540 compute_coding_constants(mean_delta, &delta_zone->min_bits,
541 &delta_zone->min_keys, &delta_zone->incr_keys);
542 delta_zone->value_bits = payload_bits;
543 delta_zone->memory = memory;
544 delta_zone->delta_lists = NULL;
545 delta_zone->new_offsets = NULL;
546 delta_zone->buffered_writer = NULL;
547 delta_zone->size = memory_size;
548 delta_zone->rebalance_time = 0;
549 delta_zone->rebalance_count = 0;
550 delta_zone->record_count = 0;
551 delta_zone->collision_count = 0;
552 delta_zone->discard_count = 0;
553 delta_zone->overflow_count = 0;
554 delta_zone->first_list = 0;
555 delta_zone->list_count = list_count;
556 delta_zone->tag = 'p';
558 return UDS_SUCCESS;
561 /* Read a large bit field from an arbitrary bit boundary. */
562 static inline u64 get_big_field(const u8 *memory, u64 offset, u8 size)
564 const void *addr = memory + offset / BITS_PER_BYTE;
566 return (get_unaligned_le64(addr) >> (offset % BITS_PER_BYTE)) & ((1UL << size) - 1);
569 /* Write a large bit field to an arbitrary bit boundary. */
570 static inline void set_big_field(u64 value, u8 *memory, u64 offset, u8 size)
572 void *addr = memory + offset / BITS_PER_BYTE;
573 u8 shift = offset % BITS_PER_BYTE;
574 u64 data = get_unaligned_le64(addr);
576 data &= ~(((1UL << size) - 1) << shift);
577 data |= value << shift;
578 put_unaligned_le64(data, addr);
581 /* Set a sequence of bits to all zeros. */
582 static inline void set_zero(u8 *memory, u64 offset, u32 size)
584 if (size > 0) {
585 u8 *addr = memory + offset / BITS_PER_BYTE;
586 u8 shift = offset % BITS_PER_BYTE;
587 u32 count = size + shift > BITS_PER_BYTE ? (u32) BITS_PER_BYTE - shift : size;
589 *addr++ &= ~(((1 << count) - 1) << shift);
590 for (size -= count; size > BITS_PER_BYTE; size -= BITS_PER_BYTE)
591 *addr++ = 0;
593 if (size > 0)
594 *addr &= 0xFF << size;
599 * Move several bits from a higher to a lower address, moving the lower addressed bits first. The
600 * size and memory offsets are measured in bits.
602 static void move_bits_down(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
604 const u8 *source;
605 u8 *destination;
606 u8 offset;
607 u8 count;
608 u64 field;
610 /* Start by moving one field that ends on a to int boundary. */
611 count = (MAX_BIG_FIELD_BITS - ((to_offset + MAX_BIG_FIELD_BITS) % BITS_PER_TYPE(u32)));
612 field = get_big_field(from, from_offset, count);
613 set_big_field(field, to, to_offset, count);
614 from_offset += count;
615 to_offset += count;
616 size -= count;
618 /* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
619 offset = from_offset % BITS_PER_TYPE(u32);
620 source = from + (from_offset - offset) / BITS_PER_BYTE;
621 destination = to + to_offset / BITS_PER_BYTE;
622 while (size > MAX_BIG_FIELD_BITS) {
623 put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
624 source += sizeof(u32);
625 destination += sizeof(u32);
626 from_offset += BITS_PER_TYPE(u32);
627 to_offset += BITS_PER_TYPE(u32);
628 size -= BITS_PER_TYPE(u32);
631 /* Finish up by moving any remaining bits. */
632 if (size > 0) {
633 field = get_big_field(from, from_offset, size);
634 set_big_field(field, to, to_offset, size);
639 * Move several bits from a lower to a higher address, moving the higher addressed bits first. The
640 * size and memory offsets are measured in bits.
642 static void move_bits_up(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
644 const u8 *source;
645 u8 *destination;
646 u8 offset;
647 u8 count;
648 u64 field;
650 /* Start by moving one field that begins on a destination int boundary. */
651 count = (to_offset + size) % BITS_PER_TYPE(u32);
652 if (count > 0) {
653 size -= count;
654 field = get_big_field(from, from_offset + size, count);
655 set_big_field(field, to, to_offset + size, count);
658 /* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
659 offset = (from_offset + size) % BITS_PER_TYPE(u32);
660 source = from + (from_offset + size - offset) / BITS_PER_BYTE;
661 destination = to + (to_offset + size) / BITS_PER_BYTE;
662 while (size > MAX_BIG_FIELD_BITS) {
663 source -= sizeof(u32);
664 destination -= sizeof(u32);
665 size -= BITS_PER_TYPE(u32);
666 put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
669 /* Finish up by moving any remaining bits. */
670 if (size > 0) {
671 field = get_big_field(from, from_offset, size);
672 set_big_field(field, to, to_offset, size);
677 * Move bits from one field to another. When the fields overlap, behave as if we first move all the
678 * bits from the source to a temporary value, and then move all the bits from the temporary value
679 * to the destination. The size and memory offsets are measured in bits.
681 static void move_bits(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
683 u64 field;
685 /* A small move doesn't require special handling. */
686 if (size <= MAX_BIG_FIELD_BITS) {
687 if (size > 0) {
688 field = get_big_field(from, from_offset, size);
689 set_big_field(field, to, to_offset, size);
692 return;
695 if (from_offset > to_offset)
696 move_bits_down(from, from_offset, to, to_offset, size);
697 else
698 move_bits_up(from, from_offset, to, to_offset, size);
702 * Pack delta lists from a mutable delta index into an immutable delta index page. A range of delta
703 * lists (starting with a specified list index) is copied from the mutable delta index into a
704 * memory page used in the immutable index. The number of lists copied onto the page is returned in
705 * list_count.
707 int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_nonce,
708 u8 *memory, size_t memory_size, u64 virtual_chapter_number,
709 u32 first_list, u32 *list_count)
711 const struct delta_zone *delta_zone;
712 struct delta_list *delta_lists;
713 u32 max_lists;
714 u32 n_lists = 0;
715 u32 offset;
716 u32 i;
717 int free_bits;
718 int bits;
719 struct delta_page_header *header;
721 delta_zone = &delta_index->delta_zones[0];
722 delta_lists = &delta_zone->delta_lists[first_list + 1];
723 max_lists = delta_index->list_count - first_list;
726 * Compute how many lists will fit on the page. Subtract the size of the fixed header, one
727 * delta list offset, and the guard bytes from the page size to determine how much space is
728 * available for delta lists.
730 free_bits = memory_size * BITS_PER_BYTE;
731 free_bits -= get_immutable_header_offset(1);
732 free_bits -= GUARD_BITS;
733 if (free_bits < IMMUTABLE_HEADER_SIZE) {
734 /* This page is too small to store any delta lists. */
735 return vdo_log_error_strerror(UDS_OVERFLOW,
736 "Chapter Index Page of %zu bytes is too small",
737 memory_size);
740 while (n_lists < max_lists) {
741 /* Each list requires a delta list offset and the list data. */
742 bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size;
743 if (bits > free_bits)
744 break;
746 n_lists++;
747 free_bits -= bits;
750 *list_count = n_lists;
752 header = (struct delta_page_header *) memory;
753 put_unaligned_le64(header_nonce, (u8 *) &header->nonce);
754 put_unaligned_le64(virtual_chapter_number,
755 (u8 *) &header->virtual_chapter_number);
756 put_unaligned_le16(first_list, (u8 *) &header->first_list);
757 put_unaligned_le16(n_lists, (u8 *) &header->list_count);
759 /* Construct the delta list offset table. */
760 offset = get_immutable_header_offset(n_lists + 1);
761 set_immutable_start(memory, 0, offset);
762 for (i = 0; i < n_lists; i++) {
763 offset += delta_lists[i].size;
764 set_immutable_start(memory, i + 1, offset);
767 /* Copy the delta list data onto the memory page. */
768 for (i = 0; i < n_lists; i++) {
769 move_bits(delta_zone->memory, delta_lists[i].start, memory,
770 get_immutable_start(memory, i), delta_lists[i].size);
773 /* Set all the bits in the guard bytes. */
774 memset(memory + memory_size - POST_FIELD_GUARD_BYTES, ~0,
775 POST_FIELD_GUARD_BYTES);
776 return UDS_SUCCESS;
779 /* Compute the new offsets of the delta lists. */
780 static void compute_new_list_offsets(struct delta_zone *delta_zone, u32 growing_index,
781 size_t growing_size, size_t used_space)
783 size_t spacing;
784 u32 i;
785 struct delta_list *delta_lists = delta_zone->delta_lists;
786 u32 tail_guard_index = delta_zone->list_count + 1;
788 spacing = (delta_zone->size - used_space) / delta_zone->list_count;
789 delta_zone->new_offsets[0] = 0;
790 for (i = 0; i <= delta_zone->list_count; i++) {
791 delta_zone->new_offsets[i + 1] =
792 (delta_zone->new_offsets[i] +
793 get_delta_list_byte_size(&delta_lists[i]) + spacing);
794 delta_zone->new_offsets[i] *= BITS_PER_BYTE;
795 delta_zone->new_offsets[i] += delta_lists[i].start % BITS_PER_BYTE;
796 if (i == 0)
797 delta_zone->new_offsets[i + 1] -= spacing / 2;
798 if (i + 1 == growing_index)
799 delta_zone->new_offsets[i + 1] += growing_size;
802 delta_zone->new_offsets[tail_guard_index] =
803 (delta_zone->size * BITS_PER_BYTE - delta_lists[tail_guard_index].size);
806 static void rebalance_lists(struct delta_zone *delta_zone)
808 struct delta_list *delta_lists;
809 u32 i;
810 size_t used_space = 0;
812 /* Extend and balance memory to receive the delta lists */
813 delta_lists = delta_zone->delta_lists;
814 for (i = 0; i <= delta_zone->list_count + 1; i++)
815 used_space += get_delta_list_byte_size(&delta_lists[i]);
817 compute_new_list_offsets(delta_zone, 0, 0, used_space);
818 for (i = 1; i <= delta_zone->list_count + 1; i++)
819 delta_lists[i].start = delta_zone->new_offsets[i];
822 /* Start restoring a delta index from multiple input streams. */
823 int uds_start_restoring_delta_index(struct delta_index *delta_index,
824 struct buffered_reader **buffered_readers,
825 unsigned int reader_count)
827 int result;
828 unsigned int zone_count = reader_count;
829 u64 record_count = 0;
830 u64 collision_count = 0;
831 u32 first_list[MAX_ZONES];
832 u32 list_count[MAX_ZONES];
833 unsigned int z;
834 u32 list_next = 0;
835 const struct delta_zone *delta_zone;
837 /* Read and validate each header. */
838 for (z = 0; z < zone_count; z++) {
839 struct delta_index_header header;
840 u8 buffer[sizeof(struct delta_index_header)];
841 size_t offset = 0;
843 result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
844 sizeof(buffer));
845 if (result != UDS_SUCCESS) {
846 return vdo_log_warning_strerror(result,
847 "failed to read delta index header");
850 memcpy(&header.magic, buffer, MAGIC_SIZE);
851 offset += MAGIC_SIZE;
852 decode_u32_le(buffer, &offset, &header.zone_number);
853 decode_u32_le(buffer, &offset, &header.zone_count);
854 decode_u32_le(buffer, &offset, &header.first_list);
855 decode_u32_le(buffer, &offset, &header.list_count);
856 decode_u64_le(buffer, &offset, &header.record_count);
857 decode_u64_le(buffer, &offset, &header.collision_count);
859 result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
860 "%zu bytes decoded of %zu expected", offset,
861 sizeof(struct delta_index_header));
862 if (result != VDO_SUCCESS) {
863 return vdo_log_warning_strerror(result,
864 "failed to read delta index header");
867 if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) {
868 return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
869 "delta index file has bad magic number");
872 if (zone_count != header.zone_count) {
873 return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
874 "delta index files contain mismatched zone counts (%u,%u)",
875 zone_count, header.zone_count);
878 if (header.zone_number != z) {
879 return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
880 "delta index zone %u found in slot %u",
881 header.zone_number, z);
884 first_list[z] = header.first_list;
885 list_count[z] = header.list_count;
886 record_count += header.record_count;
887 collision_count += header.collision_count;
889 if (first_list[z] != list_next) {
890 return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
891 "delta index file for zone %u starts with list %u instead of list %u",
892 z, first_list[z], list_next);
895 list_next += list_count[z];
898 if (list_next != delta_index->list_count) {
899 return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
900 "delta index files contain %u delta lists instead of %u delta lists",
901 list_next, delta_index->list_count);
904 if (collision_count > record_count) {
905 return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
906 "delta index files contain %llu collisions and %llu records",
907 (unsigned long long) collision_count,
908 (unsigned long long) record_count);
911 uds_reset_delta_index(delta_index);
912 delta_index->delta_zones[0].record_count = record_count;
913 delta_index->delta_zones[0].collision_count = collision_count;
915 /* Read the delta lists and distribute them to the proper zones. */
916 for (z = 0; z < zone_count; z++) {
917 u32 i;
919 delta_index->load_lists[z] = 0;
920 for (i = 0; i < list_count[z]; i++) {
921 u16 delta_list_size;
922 u32 list_number;
923 unsigned int zone_number;
924 u8 size_data[sizeof(u16)];
926 result = uds_read_from_buffered_reader(buffered_readers[z],
927 size_data,
928 sizeof(size_data));
929 if (result != UDS_SUCCESS) {
930 return vdo_log_warning_strerror(result,
931 "failed to read delta index size");
934 delta_list_size = get_unaligned_le16(size_data);
935 if (delta_list_size > 0)
936 delta_index->load_lists[z] += 1;
938 list_number = first_list[z] + i;
939 zone_number = list_number / delta_index->lists_per_zone;
940 delta_zone = &delta_index->delta_zones[zone_number];
941 list_number -= delta_zone->first_list;
942 delta_zone->delta_lists[list_number + 1].size = delta_list_size;
946 /* Prepare each zone to start receiving the delta list data. */
947 for (z = 0; z < delta_index->zone_count; z++)
948 rebalance_lists(&delta_index->delta_zones[z]);
950 return UDS_SUCCESS;
953 static int restore_delta_list_to_zone(struct delta_zone *delta_zone,
954 const struct delta_list_save_info *save_info,
955 const u8 *data)
957 struct delta_list *delta_list;
958 u16 bit_count;
959 u16 byte_count;
960 u32 list_number = save_info->index - delta_zone->first_list;
962 if (list_number >= delta_zone->list_count) {
963 return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
964 "invalid delta list number %u not in range [%u,%u)",
965 save_info->index, delta_zone->first_list,
966 delta_zone->first_list + delta_zone->list_count);
969 delta_list = &delta_zone->delta_lists[list_number + 1];
970 if (delta_list->size == 0) {
971 return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
972 "unexpected delta list number %u",
973 save_info->index);
976 bit_count = delta_list->size + save_info->bit_offset;
977 byte_count = BITS_TO_BYTES(bit_count);
978 if (save_info->byte_count != byte_count) {
979 return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
980 "unexpected delta list size %u != %u",
981 save_info->byte_count, byte_count);
984 move_bits(data, save_info->bit_offset, delta_zone->memory, delta_list->start,
985 delta_list->size);
986 return UDS_SUCCESS;
989 static int restore_delta_list_data(struct delta_index *delta_index, unsigned int load_zone,
990 struct buffered_reader *buffered_reader, u8 *data)
992 int result;
993 struct delta_list_save_info save_info;
994 u8 buffer[sizeof(struct delta_list_save_info)];
995 unsigned int new_zone;
997 result = uds_read_from_buffered_reader(buffered_reader, buffer, sizeof(buffer));
998 if (result != UDS_SUCCESS) {
999 return vdo_log_warning_strerror(result,
1000 "failed to read delta list data");
1003 save_info = (struct delta_list_save_info) {
1004 .tag = buffer[0],
1005 .bit_offset = buffer[1],
1006 .byte_count = get_unaligned_le16(&buffer[2]),
1007 .index = get_unaligned_le32(&buffer[4]),
1010 if ((save_info.bit_offset >= BITS_PER_BYTE) ||
1011 (save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT)) {
1012 return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
1013 "corrupt delta list data");
1016 /* Make sure the data is intended for this delta index. */
1017 if (save_info.tag != delta_index->tag)
1018 return UDS_CORRUPT_DATA;
1020 if (save_info.index >= delta_index->list_count) {
1021 return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
1022 "invalid delta list number %u of %u",
1023 save_info.index,
1024 delta_index->list_count);
1027 result = uds_read_from_buffered_reader(buffered_reader, data,
1028 save_info.byte_count);
1029 if (result != UDS_SUCCESS) {
1030 return vdo_log_warning_strerror(result,
1031 "failed to read delta list data");
1034 delta_index->load_lists[load_zone] -= 1;
1035 new_zone = save_info.index / delta_index->lists_per_zone;
1036 return restore_delta_list_to_zone(&delta_index->delta_zones[new_zone],
1037 &save_info, data);
1040 /* Restore delta lists from saved data. */
1041 int uds_finish_restoring_delta_index(struct delta_index *delta_index,
1042 struct buffered_reader **buffered_readers,
1043 unsigned int reader_count)
1045 int result;
1046 int saved_result = UDS_SUCCESS;
1047 unsigned int z;
1048 u8 *data;
1050 result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data);
1051 if (result != VDO_SUCCESS)
1052 return result;
1054 for (z = 0; z < reader_count; z++) {
1055 while (delta_index->load_lists[z] > 0) {
1056 result = restore_delta_list_data(delta_index, z,
1057 buffered_readers[z], data);
1058 if (result != UDS_SUCCESS) {
1059 saved_result = result;
1060 break;
1065 vdo_free(data);
1066 return saved_result;
1069 int uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
1070 unsigned int reader_count)
1072 int result;
1073 unsigned int z;
1074 u8 buffer[sizeof(struct delta_list_save_info)];
1076 for (z = 0; z < reader_count; z++) {
1077 result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
1078 sizeof(buffer));
1079 if (result != UDS_SUCCESS)
1080 return result;
1082 if (buffer[0] != 'z')
1083 return UDS_CORRUPT_DATA;
1086 return UDS_SUCCESS;
1089 static int flush_delta_list(struct delta_zone *zone, u32 flush_index)
1091 struct delta_list *delta_list;
1092 u8 buffer[sizeof(struct delta_list_save_info)];
1093 int result;
1095 delta_list = &zone->delta_lists[flush_index + 1];
1097 buffer[0] = zone->tag;
1098 buffer[1] = delta_list->start % BITS_PER_BYTE;
1099 put_unaligned_le16(get_delta_list_byte_size(delta_list), &buffer[2]);
1100 put_unaligned_le32(zone->first_list + flush_index, &buffer[4]);
1102 result = uds_write_to_buffered_writer(zone->buffered_writer, buffer,
1103 sizeof(buffer));
1104 if (result != UDS_SUCCESS) {
1105 vdo_log_warning_strerror(result, "failed to write delta list memory");
1106 return result;
1109 result = uds_write_to_buffered_writer(zone->buffered_writer,
1110 zone->memory + get_delta_list_byte_start(delta_list),
1111 get_delta_list_byte_size(delta_list));
1112 if (result != UDS_SUCCESS)
1113 vdo_log_warning_strerror(result, "failed to write delta list memory");
1115 return result;
1118 /* Start saving a delta index zone to a buffered output stream. */
1119 int uds_start_saving_delta_index(const struct delta_index *delta_index,
1120 unsigned int zone_number,
1121 struct buffered_writer *buffered_writer)
1123 int result;
1124 u32 i;
1125 struct delta_zone *delta_zone;
1126 u8 buffer[sizeof(struct delta_index_header)];
1127 size_t offset = 0;
1129 delta_zone = &delta_index->delta_zones[zone_number];
1130 memcpy(buffer, DELTA_INDEX_MAGIC, MAGIC_SIZE);
1131 offset += MAGIC_SIZE;
1132 encode_u32_le(buffer, &offset, zone_number);
1133 encode_u32_le(buffer, &offset, delta_index->zone_count);
1134 encode_u32_le(buffer, &offset, delta_zone->first_list);
1135 encode_u32_le(buffer, &offset, delta_zone->list_count);
1136 encode_u64_le(buffer, &offset, delta_zone->record_count);
1137 encode_u64_le(buffer, &offset, delta_zone->collision_count);
1139 result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
1140 "%zu bytes encoded of %zu expected", offset,
1141 sizeof(struct delta_index_header));
1142 if (result != VDO_SUCCESS)
1143 return result;
1145 result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
1146 if (result != UDS_SUCCESS)
1147 return vdo_log_warning_strerror(result,
1148 "failed to write delta index header");
1150 for (i = 0; i < delta_zone->list_count; i++) {
1151 u8 data[sizeof(u16)];
1152 struct delta_list *delta_list;
1154 delta_list = &delta_zone->delta_lists[i + 1];
1155 put_unaligned_le16(delta_list->size, data);
1156 result = uds_write_to_buffered_writer(buffered_writer, data,
1157 sizeof(data));
1158 if (result != UDS_SUCCESS)
1159 return vdo_log_warning_strerror(result,
1160 "failed to write delta list size");
1163 delta_zone->buffered_writer = buffered_writer;
1164 return UDS_SUCCESS;
1167 int uds_finish_saving_delta_index(const struct delta_index *delta_index,
1168 unsigned int zone_number)
1170 int result;
1171 int first_error = UDS_SUCCESS;
1172 u32 i;
1173 struct delta_zone *delta_zone;
1174 struct delta_list *delta_list;
1176 delta_zone = &delta_index->delta_zones[zone_number];
1177 for (i = 0; i < delta_zone->list_count; i++) {
1178 delta_list = &delta_zone->delta_lists[i + 1];
1179 if (delta_list->size > 0) {
1180 result = flush_delta_list(delta_zone, i);
1181 if ((result != UDS_SUCCESS) && (first_error == UDS_SUCCESS))
1182 first_error = result;
1186 delta_zone->buffered_writer = NULL;
1187 return first_error;
1190 int uds_write_guard_delta_list(struct buffered_writer *buffered_writer)
1192 int result;
1193 u8 buffer[sizeof(struct delta_list_save_info)];
1195 memset(buffer, 0, sizeof(struct delta_list_save_info));
1196 buffer[0] = 'z';
1198 result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer));
1199 if (result != UDS_SUCCESS)
1200 vdo_log_warning_strerror(result, "failed to write guard delta list");
1202 return UDS_SUCCESS;
1205 size_t uds_compute_delta_index_save_bytes(u32 list_count, size_t memory_size)
1207 /* One zone will use at least as much memory as other zone counts. */
1208 return (sizeof(struct delta_index_header) +
1209 list_count * (sizeof(struct delta_list_save_info) + 1) +
1210 get_zone_memory_size(1, memory_size));
1213 static int assert_not_at_end(const struct delta_index_entry *delta_entry)
1215 int result = VDO_ASSERT(!delta_entry->at_end,
1216 "operation is invalid because the list entry is at the end of the delta list");
1217 if (result != VDO_SUCCESS)
1218 result = UDS_BAD_STATE;
1220 return result;
1224 * Prepare to search for an entry in the specified delta list.
1226 * This is always the first function to be called when dealing with delta index entries. It is
1227 * always followed by calls to uds_next_delta_index_entry() to iterate through a delta list. The
1228 * fields of the delta_index_entry argument will be set up for iteration, but will not contain an
1229 * entry from the list.
1231 int uds_start_delta_index_search(const struct delta_index *delta_index, u32 list_number,
1232 u32 key, struct delta_index_entry *delta_entry)
1234 int result;
1235 unsigned int zone_number;
1236 struct delta_zone *delta_zone;
1237 struct delta_list *delta_list;
1239 result = VDO_ASSERT((list_number < delta_index->list_count),
1240 "Delta list number (%u) is out of range (%u)", list_number,
1241 delta_index->list_count);
1242 if (result != VDO_SUCCESS)
1243 return UDS_CORRUPT_DATA;
1245 zone_number = list_number / delta_index->lists_per_zone;
1246 delta_zone = &delta_index->delta_zones[zone_number];
1247 list_number -= delta_zone->first_list;
1248 result = VDO_ASSERT((list_number < delta_zone->list_count),
1249 "Delta list number (%u) is out of range (%u) for zone (%u)",
1250 list_number, delta_zone->list_count, zone_number);
1251 if (result != VDO_SUCCESS)
1252 return UDS_CORRUPT_DATA;
1254 if (delta_index->mutable) {
1255 delta_list = &delta_zone->delta_lists[list_number + 1];
1256 } else {
1257 u32 end_offset;
1260 * Translate the immutable delta list header into a temporary
1261 * full delta list header.
1263 delta_list = &delta_entry->temp_delta_list;
1264 delta_list->start = get_immutable_start(delta_zone->memory, list_number);
1265 end_offset = get_immutable_start(delta_zone->memory, list_number + 1);
1266 delta_list->size = end_offset - delta_list->start;
1267 delta_list->save_key = 0;
1268 delta_list->save_offset = 0;
1271 if (key > delta_list->save_key) {
1272 delta_entry->key = delta_list->save_key;
1273 delta_entry->offset = delta_list->save_offset;
1274 } else {
1275 delta_entry->key = 0;
1276 delta_entry->offset = 0;
1277 if (key == 0) {
1279 * This usually means we're about to walk the entire delta list, so get all
1280 * of it into the CPU cache.
1282 uds_prefetch_range(&delta_zone->memory[delta_list->start / BITS_PER_BYTE],
1283 delta_list->size / BITS_PER_BYTE, false);
1287 delta_entry->at_end = false;
1288 delta_entry->delta_zone = delta_zone;
1289 delta_entry->delta_list = delta_list;
1290 delta_entry->entry_bits = 0;
1291 delta_entry->is_collision = false;
1292 delta_entry->list_number = list_number;
1293 delta_entry->list_overflow = false;
1294 delta_entry->value_bits = delta_zone->value_bits;
1295 return UDS_SUCCESS;
1298 static inline u64 get_delta_entry_offset(const struct delta_index_entry *delta_entry)
1300 return delta_entry->delta_list->start + delta_entry->offset;
1304 * Decode a delta index entry delta value. The delta_index_entry basically describes the previous
1305 * list entry, and has had its offset field changed to point to the subsequent entry. We decode the
1306 * bit stream and update the delta_list_entry to describe the entry.
1308 static inline void decode_delta(struct delta_index_entry *delta_entry)
1310 int key_bits;
1311 u32 delta;
1312 const struct delta_zone *delta_zone = delta_entry->delta_zone;
1313 const u8 *memory = delta_zone->memory;
1314 u64 delta_offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
1315 const u8 *addr = memory + delta_offset / BITS_PER_BYTE;
1316 int offset = delta_offset % BITS_PER_BYTE;
1317 u32 data = get_unaligned_le32(addr) >> offset;
1319 addr += sizeof(u32);
1320 key_bits = delta_zone->min_bits;
1321 delta = data & ((1 << key_bits) - 1);
1322 if (delta >= delta_zone->min_keys) {
1323 data >>= key_bits;
1324 if (data == 0) {
1325 key_bits = sizeof(u32) * BITS_PER_BYTE - offset;
1326 while ((data = get_unaligned_le32(addr)) == 0) {
1327 addr += sizeof(u32);
1328 key_bits += sizeof(u32) * BITS_PER_BYTE;
1331 key_bits += ffs(data);
1332 delta += ((key_bits - delta_zone->min_bits - 1) * delta_zone->incr_keys);
1334 delta_entry->delta = delta;
1335 delta_entry->key += delta;
1337 /* Check for a collision, a delta of zero after the start. */
1338 if (unlikely((delta == 0) && (delta_entry->offset > 0))) {
1339 delta_entry->is_collision = true;
1340 delta_entry->entry_bits = delta_entry->value_bits + key_bits + COLLISION_BITS;
1341 } else {
1342 delta_entry->is_collision = false;
1343 delta_entry->entry_bits = delta_entry->value_bits + key_bits;
1347 noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry)
1349 int result;
1350 const struct delta_list *delta_list;
1351 u32 next_offset;
1352 u16 size;
1354 result = assert_not_at_end(delta_entry);
1355 if (result != UDS_SUCCESS)
1356 return result;
1358 delta_list = delta_entry->delta_list;
1359 delta_entry->offset += delta_entry->entry_bits;
1360 size = delta_list->size;
1361 if (unlikely(delta_entry->offset >= size)) {
1362 delta_entry->at_end = true;
1363 delta_entry->delta = 0;
1364 delta_entry->is_collision = false;
1365 result = VDO_ASSERT((delta_entry->offset == size),
1366 "next offset past end of delta list");
1367 if (result != VDO_SUCCESS)
1368 result = UDS_CORRUPT_DATA;
1370 return result;
1373 decode_delta(delta_entry);
1375 next_offset = delta_entry->offset + delta_entry->entry_bits;
1376 if (next_offset > size) {
1378 * This is not an assertion because uds_validate_chapter_index_page() wants to
1379 * handle this error.
1381 vdo_log_warning("Decoded past the end of the delta list");
1382 return UDS_CORRUPT_DATA;
1385 return UDS_SUCCESS;
1388 int uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry)
1390 int result;
1391 struct delta_list *delta_list = delta_entry->delta_list;
1393 result = VDO_ASSERT(!delta_entry->is_collision, "entry is not a collision");
1394 if (result != VDO_SUCCESS)
1395 return result;
1397 delta_list->save_key = delta_entry->key - delta_entry->delta;
1398 delta_list->save_offset = delta_entry->offset;
1399 return UDS_SUCCESS;
1402 static void set_delta(struct delta_index_entry *delta_entry, u32 delta)
1404 const struct delta_zone *delta_zone = delta_entry->delta_zone;
1405 u32 key_bits = (delta_zone->min_bits +
1406 ((delta_zone->incr_keys - delta_zone->min_keys + delta) /
1407 delta_zone->incr_keys));
1409 delta_entry->delta = delta;
1410 delta_entry->entry_bits = delta_entry->value_bits + key_bits;
1413 static void get_collision_name(const struct delta_index_entry *entry, u8 *name)
1415 u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
1416 const u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
1417 int size = COLLISION_BYTES;
1418 int shift = offset % BITS_PER_BYTE;
1420 while (--size >= 0)
1421 *name++ = get_unaligned_le16(addr++) >> shift;
1424 static void set_collision_name(const struct delta_index_entry *entry, const u8 *name)
1426 u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
1427 u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
1428 int size = COLLISION_BYTES;
1429 int shift = offset % BITS_PER_BYTE;
1430 u16 mask = ~((u16) 0xFF << shift);
1431 u16 data;
1433 while (--size >= 0) {
1434 data = (get_unaligned_le16(addr) & mask) | (*name++ << shift);
1435 put_unaligned_le16(data, addr++);
1439 int uds_get_delta_index_entry(const struct delta_index *delta_index, u32 list_number,
1440 u32 key, const u8 *name,
1441 struct delta_index_entry *delta_entry)
1443 int result;
1445 result = uds_start_delta_index_search(delta_index, list_number, key,
1446 delta_entry);
1447 if (result != UDS_SUCCESS)
1448 return result;
1450 do {
1451 result = uds_next_delta_index_entry(delta_entry);
1452 if (result != UDS_SUCCESS)
1453 return result;
1454 } while (!delta_entry->at_end && (key > delta_entry->key));
1456 result = uds_remember_delta_index_offset(delta_entry);
1457 if (result != UDS_SUCCESS)
1458 return result;
1460 if (!delta_entry->at_end && (key == delta_entry->key)) {
1461 struct delta_index_entry collision_entry = *delta_entry;
1463 for (;;) {
1464 u8 full_name[COLLISION_BYTES];
1466 result = uds_next_delta_index_entry(&collision_entry);
1467 if (result != UDS_SUCCESS)
1468 return result;
1470 if (collision_entry.at_end || !collision_entry.is_collision)
1471 break;
1473 get_collision_name(&collision_entry, full_name);
1474 if (memcmp(full_name, name, COLLISION_BYTES) == 0) {
1475 *delta_entry = collision_entry;
1476 break;
1481 return UDS_SUCCESS;
1484 int uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, u8 *name)
1486 int result;
1488 result = assert_not_at_end(delta_entry);
1489 if (result != UDS_SUCCESS)
1490 return result;
1492 result = VDO_ASSERT(delta_entry->is_collision,
1493 "Cannot get full block name from a non-collision delta index entry");
1494 if (result != VDO_SUCCESS)
1495 return UDS_BAD_STATE;
1497 get_collision_name(delta_entry, name);
1498 return UDS_SUCCESS;
1501 u32 uds_get_delta_entry_value(const struct delta_index_entry *delta_entry)
1503 return get_field(delta_entry->delta_zone->memory,
1504 get_delta_entry_offset(delta_entry), delta_entry->value_bits);
1507 static int assert_mutable_entry(const struct delta_index_entry *delta_entry)
1509 int result = VDO_ASSERT((delta_entry->delta_list != &delta_entry->temp_delta_list),
1510 "delta index is mutable");
1511 if (result != VDO_SUCCESS)
1512 result = UDS_BAD_STATE;
1514 return result;
1517 int uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value)
1519 int result;
1520 u32 value_mask = (1 << delta_entry->value_bits) - 1;
1522 result = assert_mutable_entry(delta_entry);
1523 if (result != UDS_SUCCESS)
1524 return result;
1526 result = assert_not_at_end(delta_entry);
1527 if (result != UDS_SUCCESS)
1528 return result;
1530 result = VDO_ASSERT((value & value_mask) == value,
1531 "Value (%u) being set in a delta index is too large (must fit in %u bits)",
1532 value, delta_entry->value_bits);
1533 if (result != VDO_SUCCESS)
1534 return UDS_INVALID_ARGUMENT;
1536 set_field(value, delta_entry->delta_zone->memory,
1537 get_delta_entry_offset(delta_entry), delta_entry->value_bits);
1538 return UDS_SUCCESS;
1542 * Extend the memory used by the delta lists by adding growing_size bytes before the list indicated
1543 * by growing_index, then rebalancing the lists in the new chunk.
1545 static int extend_delta_zone(struct delta_zone *delta_zone, u32 growing_index,
1546 size_t growing_size)
1548 ktime_t start_time;
1549 ktime_t end_time;
1550 struct delta_list *delta_lists;
1551 u32 i;
1552 size_t used_space;
1555 /* Calculate the amount of space that is or will be in use. */
1556 start_time = current_time_ns(CLOCK_MONOTONIC);
1557 delta_lists = delta_zone->delta_lists;
1558 used_space = growing_size;
1559 for (i = 0; i <= delta_zone->list_count + 1; i++)
1560 used_space += get_delta_list_byte_size(&delta_lists[i]);
1562 if (delta_zone->size < used_space)
1563 return UDS_OVERFLOW;
1565 /* Compute the new offsets of the delta lists. */
1566 compute_new_list_offsets(delta_zone, growing_index, growing_size, used_space);
1569 * When we rebalance the delta list, we will include the end guard list in the rebalancing.
1570 * It contains the end guard data, which must be copied.
1572 rebalance_delta_zone(delta_zone, 1, delta_zone->list_count + 1);
1573 end_time = current_time_ns(CLOCK_MONOTONIC);
1574 delta_zone->rebalance_count++;
1575 delta_zone->rebalance_time += ktime_sub(end_time, start_time);
1576 return UDS_SUCCESS;
1579 static int insert_bits(struct delta_index_entry *delta_entry, u16 size)
1581 u64 free_before;
1582 u64 free_after;
1583 u64 source;
1584 u64 destination;
1585 u32 count;
1586 bool before_flag;
1587 u8 *memory;
1588 struct delta_zone *delta_zone = delta_entry->delta_zone;
1589 struct delta_list *delta_list = delta_entry->delta_list;
1590 /* Compute bits in use before and after the inserted bits. */
1591 u32 total_size = delta_list->size;
1592 u32 before_size = delta_entry->offset;
1593 u32 after_size = total_size - delta_entry->offset;
1595 if (total_size + size > U16_MAX) {
1596 delta_entry->list_overflow = true;
1597 delta_zone->overflow_count++;
1598 return UDS_OVERFLOW;
1601 /* Compute bits available before and after the delta list. */
1602 free_before = (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
1603 free_after = (delta_list[1].start - (delta_list[0].start + delta_list[0].size));
1605 if ((size <= free_before) && (size <= free_after)) {
1607 * We have enough space to use either before or after the list. Select the smaller
1608 * amount of data. If it is exactly the same, try to take from the larger amount of
1609 * free space.
1611 if (before_size < after_size)
1612 before_flag = true;
1613 else if (after_size < before_size)
1614 before_flag = false;
1615 else
1616 before_flag = free_before > free_after;
1617 } else if (size <= free_before) {
1618 /* There is space before but not after. */
1619 before_flag = true;
1620 } else if (size <= free_after) {
1621 /* There is space after but not before. */
1622 before_flag = false;
1623 } else {
1625 * Neither of the surrounding spaces is large enough for this request. Extend
1626 * and/or rebalance the delta list memory choosing to move the least amount of
1627 * data.
1629 int result;
1630 u32 growing_index = delta_entry->list_number + 1;
1632 before_flag = before_size < after_size;
1633 if (!before_flag)
1634 growing_index++;
1635 result = extend_delta_zone(delta_zone, growing_index,
1636 BITS_TO_BYTES(size));
1637 if (result != UDS_SUCCESS)
1638 return result;
1641 delta_list->size += size;
1642 if (before_flag) {
1643 source = delta_list->start;
1644 destination = source - size;
1645 delta_list->start -= size;
1646 count = before_size;
1647 } else {
1648 source = delta_list->start + delta_entry->offset;
1649 destination = source + size;
1650 count = after_size;
1653 memory = delta_zone->memory;
1654 move_bits(memory, source, memory, destination, count);
1655 return UDS_SUCCESS;
1658 static void encode_delta(const struct delta_index_entry *delta_entry)
1660 u32 temp;
1661 u32 t1;
1662 u32 t2;
1663 u64 offset;
1664 const struct delta_zone *delta_zone = delta_entry->delta_zone;
1665 u8 *memory = delta_zone->memory;
1667 offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
1668 if (delta_entry->delta < delta_zone->min_keys) {
1669 set_field(delta_entry->delta, memory, offset, delta_zone->min_bits);
1670 return;
1673 temp = delta_entry->delta - delta_zone->min_keys;
1674 t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys;
1675 t2 = temp / delta_zone->incr_keys;
1676 set_field(t1, memory, offset, delta_zone->min_bits);
1677 set_zero(memory, offset + delta_zone->min_bits, t2);
1678 set_field(1, memory, offset + delta_zone->min_bits + t2, 1);
1681 static void encode_entry(const struct delta_index_entry *delta_entry, u32 value,
1682 const u8 *name)
1684 u8 *memory = delta_entry->delta_zone->memory;
1685 u64 offset = get_delta_entry_offset(delta_entry);
1687 set_field(value, memory, offset, delta_entry->value_bits);
1688 encode_delta(delta_entry);
1689 if (name != NULL)
1690 set_collision_name(delta_entry, name);
1694 * Create a new entry in the delta index. If the entry is a collision, the full 256 bit name must
1695 * be provided.
1697 int uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, u32 value,
1698 const u8 *name)
1700 int result;
1701 struct delta_zone *delta_zone;
1703 result = assert_mutable_entry(delta_entry);
1704 if (result != UDS_SUCCESS)
1705 return result;
1707 if (delta_entry->is_collision) {
1709 * The caller wants us to insert a collision entry onto a collision entry. This
1710 * happens when we find a collision and attempt to add the name again to the index.
1711 * This is normally a fatal error unless we are replaying a closed chapter while we
1712 * are rebuilding a volume index.
1714 return UDS_DUPLICATE_NAME;
1717 if (delta_entry->offset < delta_entry->delta_list->save_offset) {
1719 * The saved entry offset is after the new entry and will no longer be valid, so
1720 * replace it with the insertion point.
1722 result = uds_remember_delta_index_offset(delta_entry);
1723 if (result != UDS_SUCCESS)
1724 return result;
1727 if (name != NULL) {
1728 /* Insert a collision entry which is placed after this entry. */
1729 result = assert_not_at_end(delta_entry);
1730 if (result != UDS_SUCCESS)
1731 return result;
1733 result = VDO_ASSERT((key == delta_entry->key),
1734 "incorrect key for collision entry");
1735 if (result != VDO_SUCCESS)
1736 return result;
1738 delta_entry->offset += delta_entry->entry_bits;
1739 set_delta(delta_entry, 0);
1740 delta_entry->is_collision = true;
1741 delta_entry->entry_bits += COLLISION_BITS;
1742 result = insert_bits(delta_entry, delta_entry->entry_bits);
1743 } else if (delta_entry->at_end) {
1744 /* Insert a new entry at the end of the delta list. */
1745 result = VDO_ASSERT((key >= delta_entry->key), "key past end of list");
1746 if (result != VDO_SUCCESS)
1747 return result;
1749 set_delta(delta_entry, key - delta_entry->key);
1750 delta_entry->key = key;
1751 delta_entry->at_end = false;
1752 result = insert_bits(delta_entry, delta_entry->entry_bits);
1753 } else {
1754 u16 old_entry_size;
1755 u16 additional_size;
1756 struct delta_index_entry next_entry;
1757 u32 next_value;
1760 * Insert a new entry which requires the delta in the following entry to be
1761 * updated.
1763 result = VDO_ASSERT((key < delta_entry->key),
1764 "key precedes following entry");
1765 if (result != VDO_SUCCESS)
1766 return result;
1768 result = VDO_ASSERT((key >= delta_entry->key - delta_entry->delta),
1769 "key effects following entry's delta");
1770 if (result != VDO_SUCCESS)
1771 return result;
1773 old_entry_size = delta_entry->entry_bits;
1774 next_entry = *delta_entry;
1775 next_value = uds_get_delta_entry_value(&next_entry);
1776 set_delta(delta_entry, key - (delta_entry->key - delta_entry->delta));
1777 delta_entry->key = key;
1778 set_delta(&next_entry, next_entry.key - key);
1779 next_entry.offset += delta_entry->entry_bits;
1780 /* The two new entries are always bigger than the single entry being replaced. */
1781 additional_size = (delta_entry->entry_bits +
1782 next_entry.entry_bits - old_entry_size);
1783 result = insert_bits(delta_entry, additional_size);
1784 if (result != UDS_SUCCESS)
1785 return result;
1787 encode_entry(&next_entry, next_value, NULL);
1790 if (result != UDS_SUCCESS)
1791 return result;
1793 encode_entry(delta_entry, value, name);
1794 delta_zone = delta_entry->delta_zone;
1795 delta_zone->record_count++;
1796 delta_zone->collision_count += delta_entry->is_collision ? 1 : 0;
1797 return UDS_SUCCESS;
1800 static void delete_bits(const struct delta_index_entry *delta_entry, int size)
1802 u64 source;
1803 u64 destination;
1804 u32 count;
1805 bool before_flag;
1806 struct delta_list *delta_list = delta_entry->delta_list;
1807 u8 *memory = delta_entry->delta_zone->memory;
1808 /* Compute bits retained before and after the deleted bits. */
1809 u32 total_size = delta_list->size;
1810 u32 before_size = delta_entry->offset;
1811 u32 after_size = total_size - delta_entry->offset - size;
1814 * Determine whether to add to the available space either before or after the delta list.
1815 * We prefer to move the least amount of data. If it is exactly the same, try to add to the
1816 * smaller amount of free space.
1818 if (before_size < after_size) {
1819 before_flag = true;
1820 } else if (after_size < before_size) {
1821 before_flag = false;
1822 } else {
1823 u64 free_before =
1824 (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
1825 u64 free_after =
1826 (delta_list[1].start - (delta_list[0].start + delta_list[0].size));
1828 before_flag = (free_before < free_after);
1831 delta_list->size -= size;
1832 if (before_flag) {
1833 source = delta_list->start;
1834 destination = source + size;
1835 delta_list->start += size;
1836 count = before_size;
1837 } else {
1838 destination = delta_list->start + delta_entry->offset;
1839 source = destination + size;
1840 count = after_size;
1843 move_bits(memory, source, memory, destination, count);
1846 int uds_remove_delta_index_entry(struct delta_index_entry *delta_entry)
1848 int result;
1849 struct delta_index_entry next_entry;
1850 struct delta_zone *delta_zone;
1851 struct delta_list *delta_list;
1853 result = assert_mutable_entry(delta_entry);
1854 if (result != UDS_SUCCESS)
1855 return result;
1857 next_entry = *delta_entry;
1858 result = uds_next_delta_index_entry(&next_entry);
1859 if (result != UDS_SUCCESS)
1860 return result;
1862 delta_zone = delta_entry->delta_zone;
1864 if (delta_entry->is_collision) {
1865 /* This is a collision entry, so just remove it. */
1866 delete_bits(delta_entry, delta_entry->entry_bits);
1867 next_entry.offset = delta_entry->offset;
1868 delta_zone->collision_count -= 1;
1869 } else if (next_entry.at_end) {
1870 /* This entry is at the end of the list, so just remove it. */
1871 delete_bits(delta_entry, delta_entry->entry_bits);
1872 next_entry.key -= delta_entry->delta;
1873 next_entry.offset = delta_entry->offset;
1874 } else {
1875 /* The delta in the next entry needs to be updated. */
1876 u32 next_value = uds_get_delta_entry_value(&next_entry);
1877 u16 old_size = delta_entry->entry_bits + next_entry.entry_bits;
1879 if (next_entry.is_collision) {
1880 next_entry.is_collision = false;
1881 delta_zone->collision_count -= 1;
1884 set_delta(&next_entry, delta_entry->delta + next_entry.delta);
1885 next_entry.offset = delta_entry->offset;
1886 /* The one new entry is always smaller than the two entries being replaced. */
1887 delete_bits(delta_entry, old_size - next_entry.entry_bits);
1888 encode_entry(&next_entry, next_value, NULL);
1891 delta_zone->record_count--;
1892 delta_zone->discard_count++;
1893 *delta_entry = next_entry;
1895 delta_list = delta_entry->delta_list;
1896 if (delta_entry->offset < delta_list->save_offset) {
1897 /* The saved entry offset is no longer valid. */
1898 delta_list->save_key = 0;
1899 delta_list->save_offset = 0;
1902 return UDS_SUCCESS;
1905 void uds_get_delta_index_stats(const struct delta_index *delta_index,
1906 struct delta_index_stats *stats)
1908 unsigned int z;
1909 const struct delta_zone *delta_zone;
1911 memset(stats, 0, sizeof(struct delta_index_stats));
1912 for (z = 0; z < delta_index->zone_count; z++) {
1913 delta_zone = &delta_index->delta_zones[z];
1914 stats->rebalance_time += delta_zone->rebalance_time;
1915 stats->rebalance_count += delta_zone->rebalance_count;
1916 stats->record_count += delta_zone->record_count;
1917 stats->collision_count += delta_zone->collision_count;
1918 stats->discard_count += delta_zone->discard_count;
1919 stats->overflow_count += delta_zone->overflow_count;
1920 stats->list_count += delta_zone->list_count;
1924 size_t uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, u32 payload_bits)
1926 u16 min_bits;
1927 u32 incr_keys;
1928 u32 min_keys;
1930 compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys);
1931 /* On average, each delta is encoded into about min_bits + 1.5 bits. */
1932 return entry_count * (payload_bits + min_bits + 1) + entry_count / 2;
1935 u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
1936 u32 payload_bits, size_t bytes_per_page)
1938 unsigned int bits_per_delta_list;
1939 unsigned int bits_per_page;
1940 size_t bits_per_index;
1942 /* Compute the expected number of bits needed for all the entries. */
1943 bits_per_index = uds_compute_delta_index_size(entry_count, mean_delta,
1944 payload_bits);
1945 bits_per_delta_list = bits_per_index / list_count;
1947 /* Add in the immutable delta list headers. */
1948 bits_per_index += list_count * IMMUTABLE_HEADER_SIZE;
1949 /* Compute the number of usable bits on an immutable index page. */
1950 bits_per_page = ((bytes_per_page - sizeof(struct delta_page_header)) * BITS_PER_BYTE);
1952 * Reduce the bits per page by one immutable delta list header and one delta list to
1953 * account for internal fragmentation.
1955 bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list;
1956 /* Now compute the number of pages needed. */
1957 return DIV_ROUND_UP(bits_per_index, bits_per_page);
1960 void uds_log_delta_index_entry(struct delta_index_entry *delta_entry)
1962 vdo_log_ratelimit(vdo_log_info,
1963 "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s",
1964 delta_entry->list_number, delta_entry->key,
1965 delta_entry->offset, delta_entry->at_end ? " end" : "",
1966 delta_entry->is_collision ? " collision" : "",
1967 delta_entry->delta_list->size,
1968 delta_entry->list_overflow ? " overflow" : "");
1969 delta_entry->list_overflow = false;