1 /* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
4 /* Key assignment policy implementation */
7 * In reiser4 every piece of file system data and meta-data has a key. Keys
8 * are used to store information in and retrieve it from reiser4 internal
9 * tree. In addition to this, keys define _ordering_ of all file system
10 * information: things having close keys are placed into the same or
11 * neighboring (in the tree order) nodes of the tree. As our block allocator
12 * tries to respect tree order (see flush.c), keys also define order in which
13 * things are laid out on the disk, and hence, affect performance directly.
15 * Obviously, assignment of keys to data and meta-data should be consistent
16 * across whole file system. Algorithm that calculates a key for a given piece
17 * of data or meta-data is referred to as "key assignment".
19 * Key assignment is too expensive to be implemented as a plugin (that is,
20 * with an ability to support different key assignment schemas in the same
21 * compiled kernel image). As a compromise, all key-assignment functions and
22 * data-structures are collected in this single file, so that modifications to
23 * key assignment algorithm can be localized. Additional changes may be
24 * required in key.[ch].
26 * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
27 * may guess, there is "Plan B" too.
32 * Additional complication with key assignment implementation is a requirement
33 * to support different key length.
37 * KEY ASSIGNMENT: PLAN A, LONG KEYS.
41 * | 60 | 4 | 7 |1| 56 | 64 | 64 |
42 * +--------------+---+---+-+-------------+------------------+-----------------+
43 * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
44 * +--------------+---+---+-+-------------+------------------+-----------------+
46 * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
48 * dirid objectid of directory this item is for
50 * F fibration, see fs/reiser4/plugin/fibration.[ch]
52 * H 1 if last 8 bytes of the key contain hash,
53 * 0 if last 8 bytes of the key contain prefix-3
55 * prefix-1 first 7 characters of file name.
56 * Padded by zeroes if name is not long enough.
58 * prefix-2 next 8 characters of the file name.
60 * prefix-3 next 8 characters of the file name.
62 * hash hash of the rest of file name (i.e., portion of file
63 * name not included into prefix-1 and prefix-2).
65 * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
66 * in the key. Such file names are called "short". They are distinguished by H
67 * bit set 0 in the key.
69 * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
70 * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
71 * key. Last 8 bytes of the key are occupied by hash of the remaining
72 * characters of the name.
74 * This key assignment reaches following important goals:
76 * (1) directory entries are sorted in approximately lexicographical
79 * (2) collisions (when multiple directory items have the same key), while
80 * principally unavoidable in a tree with fixed length keys, are rare.
84 * | 60 | 4 | 64 | 4 | 60 | 64 |
85 * +--------------+---+-----------------+---+--------------+-----------------+
86 * | locality id | 1 | ordering | 0 | objectid | 0 |
87 * +--------------+---+-----------------+---+--------------+-----------------+
89 * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
91 * locality id object id of a directory where first name was created for
94 * ordering copy of second 8-byte portion of the key of directory
95 * entry for the first name of this object. Ordering has a form
101 * see description of key for directory entry above.
103 * objectid object id for this object
105 * This key assignment policy is designed to keep stat-data in the same order
106 * as corresponding directory items, thus speeding up readdir/stat types of
111 * | 60 | 4 | 64 | 4 | 60 | 64 |
112 * +--------------+---+-----------------+---+--------------+-----------------+
113 * | locality id | 4 | ordering | 0 | objectid | offset |
114 * +--------------+---+-----------------+---+--------------+-----------------+
116 * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
118 * locality id object id of a directory where first name was created for
121 * ordering the same as in the key of stat-data for this object
123 * objectid object id for this object
125 * offset logical offset from the beginning of this file.
129 * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
133 * | 60 | 4 | 7 |1| 56 | 64 |
134 * +--------------+---+---+-+-------------+-----------------+
135 * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
136 * +--------------+---+---+-+-------------+-----------------+
138 * | 8 bytes | 8 bytes | 8 bytes |
140 * dirid objectid of directory this item is for
142 * F fibration, see fs/reiser4/plugin/fibration.[ch]
144 * H 1 if last 8 bytes of the key contain hash,
145 * 0 if last 8 bytes of the key contain prefix-2
147 * prefix-1 first 7 characters of file name.
148 * Padded by zeroes if name is not long enough.
150 * prefix-2 next 8 characters of the file name.
152 * hash hash of the rest of file name (i.e., portion of file
153 * name not included into prefix-1).
155 * File names shorter than 15 (== 7 + 8) characters are completely encoded in
156 * the key. Such file names are called "short". They are distinguished by H
157 * bit set in the key.
159 * Other file names are "long". For long name, H bit is 0, and first 7
160 * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
161 * key are occupied by hash of the remaining characters of the name.
165 * | 60 | 4 | 4 | 60 | 64 |
166 * +--------------+---+---+--------------+-----------------+
167 * | locality id | 1 | 0 | objectid | 0 |
168 * +--------------+---+---+--------------+-----------------+
170 * | 8 bytes | 8 bytes | 8 bytes |
172 * locality id object id of a directory where first name was created for
175 * objectid object id for this object
179 * | 60 | 4 | 4 | 60 | 64 |
180 * +--------------+---+---+--------------+-----------------+
181 * | locality id | 4 | 0 | objectid | offset |
182 * +--------------+---+---+--------------+-----------------+
184 * | 8 bytes | 8 bytes | 8 bytes |
186 * locality id object id of a directory where first name was created for
189 * objectid object id for this object
191 * offset logical offset from the beginning of this file.
205 #include <linux/types.h> /* for __u?? */
206 #include <linux/fs.h> /* for struct super_block, etc */
208 /* bitmask for H bit (see comment at the beginning of this file */
209 static const __u64 longname_mark
= 0x0100000000000000ull
;
210 /* bitmask for F and H portions of the key. */
211 static const __u64 fibration_mask
= 0xff00000000000000ull
;
213 /* return true if name is not completely encoded in @key */
214 int is_longname_key(const reiser4_key
* key
)
218 assert("nikita-2863", key
!= NULL
);
219 if (get_key_type(key
) != KEY_FILE_NAME_MINOR
)
220 reiser4_print_key("oops", key
);
221 assert("nikita-2864", get_key_type(key
) == KEY_FILE_NAME_MINOR
);
223 if (REISER4_LARGE_KEY
)
224 highpart
= get_key_ordering(key
);
226 highpart
= get_key_objectid(key
);
228 return (highpart
& longname_mark
) ? 1 : 0;
231 /* return true if @name is too long to be completely encoded in the key */
232 int is_longname(const char *name UNUSED_ARG
, int len
)
234 if (REISER4_LARGE_KEY
)
240 /* code ascii string into __u64.
242 Put characters of @name into result (@str) one after another starting
243 from @start_idx-th highest (arithmetically) byte. This produces
244 endian-safe encoding. memcpy(2) will not do.
247 static __u64
pack_string(const char *name
/* string to encode */ ,
248 int start_idx
/* highest byte in result from
249 * which to start encoding */ )
255 for (i
= 0; (i
< sizeof str
- start_idx
) && name
[i
]; ++i
) {
257 str
|= (unsigned char)name
[i
];
259 str
<<= (sizeof str
- i
- start_idx
) << 3;
263 /* opposite to pack_string(). Takes value produced by pack_string(), restores
264 * string encoded in it and stores result in @buf */
265 char *reiser4_unpack_string(__u64 value
, char *buf
)
268 *buf
= value
>> (64 - 8);
272 } while (value
!= 0);
277 /* obtain name encoded in @key and store it in @buf */
278 char *extract_name_from_key(const reiser4_key
* key
, char *buf
)
282 assert("nikita-2868", !is_longname_key(key
));
285 if (REISER4_LARGE_KEY
) {
286 c
= reiser4_unpack_string(get_key_ordering(key
) &
288 c
= reiser4_unpack_string(get_key_fulloid(key
), c
);
290 c
= reiser4_unpack_string(get_key_fulloid(key
) &
292 reiser4_unpack_string(get_key_offset(key
), c
);
297 * complete_entry_key - calculate entry key by name
298 * @dir: directory where entry is (or will be) in
299 * @name: name to calculate key of
300 * @len: lenth of name
301 * @result: place to store result in
303 * Sets fields of entry key @result which depend on file name.
304 * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
305 * objectid and offset. Otherwise, objectid and offset are set.
307 void complete_entry_key(const struct inode
*dir
, const char *name
,
308 int len
, reiser4_key
*result
)
310 #if REISER4_LARGE_KEY
315 assert("nikita-1139", dir
!= NULL
);
316 assert("nikita-1142", result
!= NULL
);
317 assert("nikita-2867", strlen(name
) == len
);
320 * key allocation algorithm for directory entries in case of large
323 * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
324 * characters into ordering field of key, next 8 charactes (if any)
325 * into objectid field of key and next 8 ones (of any) into offset
328 * If file name is longer than 23 characters, put first 7 characters
329 * into key's ordering, next 8 to objectid and hash of remaining
330 * characters into offset field.
332 * To distinguish above cases, in latter set up unused high bit in
336 /* [0-6] characters to ordering */
337 ordering
= pack_string(name
, 1);
339 /* [7-14] characters to objectid */
340 objectid
= pack_string(name
+ 7, 0);
343 /* [15-23] characters to offset */
344 offset
= pack_string(name
+ 15, 0);
346 /* note in a key the fact that offset contains
348 ordering
|= longname_mark
;
350 /* offset is the hash of the file name's tail */
351 offset
= inode_hash_plugin(dir
)->hash(name
+ 15,
362 assert("nikita-3480", inode_fibration_plugin(dir
) != NULL
);
363 ordering
|= inode_fibration_plugin(dir
)->fibre(dir
, name
, len
);
365 set_key_ordering(result
, ordering
);
366 set_key_fulloid(result
, objectid
);
367 set_key_offset(result
, offset
);
374 assert("nikita-1139", dir
!= NULL
);
375 assert("nikita-1142", result
!= NULL
);
376 assert("nikita-2867", strlen(name
) == len
);
379 * key allocation algorithm for directory entries in case of not large
382 * If name is not longer than 7 + 8 = 15 characters, put first 7
383 * characters into objectid field of key, next 8 charactes (if any)
384 * into offset field of key
386 * If file name is longer than 15 characters, put first 7 characters
387 * into key's objectid, and hash of remaining characters into offset
390 * To distinguish above cases, in latter set up unused high bit in
394 /* [0-6] characters to objectid */
395 objectid
= pack_string(name
, 1);
398 /* [7-14] characters to offset */
399 offset
= pack_string(name
+ 7, 0);
401 /* note in a key the fact that offset contains hash. */
402 objectid
|= longname_mark
;
404 /* offset is the hash of the file name. */
405 offset
= inode_hash_plugin(dir
)->hash(name
+ 7,
411 assert("nikita-3480", inode_fibration_plugin(dir
) != NULL
);
412 objectid
|= inode_fibration_plugin(dir
)->fibre(dir
, name
, len
);
414 set_key_fulloid(result
, objectid
);
415 set_key_offset(result
, offset
);
417 #endif /* ! REISER4_LARGE_KEY */
420 /* true, if @key is the key of "." */
421 int is_dot_key(const reiser4_key
* key
/* key to check */)
423 assert("nikita-1717", key
!= NULL
);
424 assert("nikita-1718", get_key_type(key
) == KEY_FILE_NAME_MINOR
);
426 (get_key_ordering(key
) == 0ull) &&
427 (get_key_objectid(key
) == 0ull) && (get_key_offset(key
) == 0ull);
430 /* build key for stat-data.
432 return key of stat-data of this object. This should became sd plugin
433 method in the future. For now, let it be here.
436 reiser4_key
*build_sd_key(const struct inode
*target
/* inode of an object */ ,
437 reiser4_key
* result
/* resulting key of @target
440 assert("nikita-261", result
!= NULL
);
442 reiser4_key_init(result
);
443 set_key_locality(result
, reiser4_inode_data(target
)->locality_id
);
444 set_key_ordering(result
, get_inode_ordering(target
));
445 set_key_objectid(result
, get_inode_oid(target
));
446 set_key_type(result
, KEY_SD_MINOR
);
447 set_key_offset(result
, (__u64
) 0);
451 /* encode part of key into &obj_key_id
453 This encodes into @id part of @key sufficient to restore @key later,
454 given that latter is key of object (key of stat-data).
458 int build_obj_key_id(const reiser4_key
* key
/* key to encode */ ,
459 obj_key_id
* id
/* id where key is encoded in */)
461 assert("nikita-1151", key
!= NULL
);
462 assert("nikita-1152", id
!= NULL
);
464 memcpy(id
, key
, sizeof *id
);
468 /* encode reference to @obj in @id.
470 This is like build_obj_key_id() above, but takes inode as parameter. */
471 int build_inode_key_id(const struct inode
*obj
/* object to build key of */ ,
472 obj_key_id
* id
/* result */)
476 assert("nikita-1166", obj
!= NULL
);
477 assert("nikita-1167", id
!= NULL
);
479 build_sd_key(obj
, &sdkey
);
480 build_obj_key_id(&sdkey
, id
);
484 /* decode @id back into @key
486 Restore key of object stat-data from @id. This is dual to
487 build_obj_key_id() above.
489 int extract_key_from_id(const obj_key_id
* id
/* object key id to extract key
491 reiser4_key
* key
/* result */)
493 assert("nikita-1153", id
!= NULL
);
494 assert("nikita-1154", key
!= NULL
);
496 reiser4_key_init(key
);
497 memcpy(key
, id
, sizeof *id
);
501 /* extract objectid of directory from key of directory entry within said
504 oid_t
extract_dir_id_from_key(const reiser4_key
* de_key
/* key of
508 assert("nikita-1314", de_key
!= NULL
);
509 return get_key_locality(de_key
);
512 /* encode into @id key of directory entry.
514 Encode into @id information sufficient to later distinguish directory
515 entries within the same directory. This is not whole key, because all
516 directory entries within directory item share locality which is equal
517 to objectid of their directory.
520 int build_de_id(const struct inode
*dir
/* inode of directory */ ,
521 const struct qstr
*name
/* name to be given to @obj by
522 * directory entry being
524 de_id
* id
/* short key of directory entry */)
528 assert("nikita-1290", dir
!= NULL
);
529 assert("nikita-1292", id
!= NULL
);
531 /* NOTE-NIKITA this is suboptimal. */
532 inode_dir_plugin(dir
)->build_entry_key(dir
, name
, &key
);
533 return build_de_id_by_key(&key
, id
);
536 /* encode into @id key of directory entry.
538 Encode into @id information sufficient to later distinguish directory
539 entries within the same directory. This is not whole key, because all
540 directory entries within directory item share locality which is equal
541 to objectid of their directory.
544 int build_de_id_by_key(const reiser4_key
* entry_key
/* full key of directory
546 de_id
* id
/* short key of directory entry */)
548 memcpy(id
, ((__u64
*) entry_key
) + 1, sizeof *id
);
552 /* restore from @id key of directory entry.
554 Function dual to build_de_id(): given @id and locality, build full
555 key of directory entry within directory item.
558 int extract_key_from_de_id(const oid_t locality
/* locality of directory
560 const de_id
* id
/* directory entry id */ ,
561 reiser4_key
* key
/* result */)
563 /* no need to initialise key here: all fields are overwritten */
564 memcpy(((__u64
*) key
) + 1, id
, sizeof *id
);
565 set_key_locality(key
, locality
);
566 set_key_type(key
, KEY_FILE_NAME_MINOR
);
570 /* compare two &de_id's */
571 cmp_t
de_id_cmp(const de_id
* id1
/* first &de_id to compare */ ,
572 const de_id
* id2
/* second &de_id to compare */)
574 /* NOTE-NIKITA ugly implementation */
578 extract_key_from_de_id((oid_t
) 0, id1
, &k1
);
579 extract_key_from_de_id((oid_t
) 0, id2
, &k2
);
580 return keycmp(&k1
, &k2
);
583 /* compare &de_id with key */
584 cmp_t
de_id_key_cmp(const de_id
* id
/* directory entry id to compare */ ,
585 const reiser4_key
* key
/* key to compare */)
590 k1
= (reiser4_key
*) (((unsigned long)id
) - sizeof key
->el
[0]);
591 result
= KEY_DIFF_EL(k1
, key
, 1);
592 if (result
== EQUAL_TO
) {
593 result
= KEY_DIFF_EL(k1
, key
, 2);
594 if (REISER4_LARGE_KEY
&& result
== EQUAL_TO
)
595 result
= KEY_DIFF_EL(k1
, key
, 3);
601 * return number of bytes necessary to encode @inode identity.
603 int inode_onwire_size(const struct inode
*inode
)
607 result
= dscale_bytes_to_write(get_inode_oid(inode
));
608 result
+= dscale_bytes_to_write(get_inode_locality(inode
));
611 * ordering is large (it usually has highest bits set), so it makes
612 * little sense to dscale it.
614 if (REISER4_LARGE_KEY
)
615 result
+= sizeof(get_inode_ordering(inode
));
620 * encode @inode identity at @start
622 char *build_inode_onwire(const struct inode
*inode
, char *start
)
624 start
+= dscale_write(start
, get_inode_locality(inode
));
625 start
+= dscale_write(start
, get_inode_oid(inode
));
627 if (REISER4_LARGE_KEY
) {
628 put_unaligned(cpu_to_le64(get_inode_ordering(inode
)), (__le64
*)start
);
629 start
+= sizeof(get_inode_ordering(inode
));
635 * extract key that was previously encoded by build_inode_onwire() at @addr
637 char *extract_obj_key_id_from_onwire(char *addr
, obj_key_id
* key_id
)
641 addr
+= dscale_read(addr
, &val
);
642 val
= (val
<< KEY_LOCALITY_SHIFT
) | KEY_SD_MINOR
;
643 put_unaligned(cpu_to_le64(val
), (__le64
*)key_id
->locality
);
644 addr
+= dscale_read(addr
, &val
);
645 put_unaligned(cpu_to_le64(val
), (__le64
*)key_id
->objectid
);
646 #if REISER4_LARGE_KEY
647 memcpy(&key_id
->ordering
, addr
, sizeof key_id
->ordering
);
648 addr
+= sizeof key_id
->ordering
;
654 * skip a key that was previously encoded by build_inode_onwire() at @addr
655 * FIXME: handle IO errors.
657 char * locate_obj_key_id_onwire(char * addr
)
660 addr
+= dscale_bytes_to_read(addr
);
662 addr
+= dscale_bytes_to_read(addr
);
663 #if REISER4_LARGE_KEY
664 addr
+= sizeof ((obj_key_id
*)0)->ordering
;
671 c-indentation-style: "K&R"