1 /* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
4 /* Key assignment policy implementation */
7 * In reiser4 every piece of file system data and meta-data has a key. Keys
8 * are used to store information in and retrieve it from reiser4 internal
9 * tree. In addition to this, keys define _ordering_ of all file system
10 * information: things having close keys are placed into the same or
11 * neighboring (in the tree order) nodes of the tree. As our block allocator
12 * tries to respect tree order (see flush.c), keys also define order in which
13 * things are laid out on the disk, and hence, affect performance directly.
15 * Obviously, assignment of keys to data and meta-data should be consistent
16 * across whole file system. Algorithm that calculates a key for a given piece
17 * of data or meta-data is referred to as "key assignment".
19 * Key assignment is too expensive to be implemented as a plugin (that is,
20 * with an ability to support different key assignment schemas in the same
21 * compiled kernel image). As a compromise, all key-assignment functions and
22 * data-structures are collected in this single file, so that modifications to
23 * key assignment algorithm can be localized. Additional changes may be
24 * required in key.[ch].
26 * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
27 * may guess, there is "Plan B" too.
32 * Additional complication with key assignment implementation is a requirement
33 * to support different key length.
37 * KEY ASSIGNMENT: PLAN A, LONG KEYS.
41 * | 60 | 4 | 7 |1| 56 | 64 | 64 |
42 * +--------------+---+---+-+-------------+------------------+-----------------+
43 * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
44 * +--------------+---+---+-+-------------+------------------+-----------------+
46 * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
48 * dirid objectid of directory this item is for
50 * F fibration, see fs/reiser4/plugin/fibration.[ch]
52 * H 1 if last 8 bytes of the key contain hash,
53 * 0 if last 8 bytes of the key contain prefix-3
55 * prefix-1 first 7 characters of file name.
56 * Padded by zeroes if name is not long enough.
58 * prefix-2 next 8 characters of the file name.
60 * prefix-3 next 8 characters of the file name.
62 * hash hash of the rest of file name (i.e., portion of file
63 * name not included into prefix-1 and prefix-2).
65 * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
66 * in the key. Such file names are called "short". They are distinguished by H
67 * bit set 0 in the key.
69 * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
70 * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
71 * key. Last 8 bytes of the key are occupied by hash of the remaining
72 * characters of the name.
74 * This key assignment reaches following important goals:
76 * (1) directory entries are sorted in approximately lexicographical
79 * (2) collisions (when multiple directory items have the same key), while
80 * principally unavoidable in a tree with fixed length keys, are rare.
84 * | 60 | 4 | 64 | 4 | 60 | 64 |
85 * +--------------+---+-----------------+---+--------------+-----------------+
86 * | locality id | 1 | ordering | 0 | objectid | 0 |
87 * +--------------+---+-----------------+---+--------------+-----------------+
89 * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
91 * locality id object id of a directory where first name was created for
94 * ordering copy of second 8-byte portion of the key of directory
95 * entry for the first name of this object. Ordering has a form
101 * see description of key for directory entry above.
103 * objectid object id for this object
105 * This key assignment policy is designed to keep stat-data in the same order
106 * as corresponding directory items, thus speeding up readdir/stat types of
111 * | 60 | 4 | 64 | 4 | 60 | 64 |
112 * +--------------+---+-----------------+---+--------------+-----------------+
113 * | locality id | 4 | ordering | 0 | objectid | offset |
114 * +--------------+---+-----------------+---+--------------+-----------------+
116 * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
118 * locality id object id of a directory where first name was created for
121 * ordering the same as in the key of stat-data for this object
123 * objectid object id for this object
125 * offset logical offset from the beginning of this file.
129 * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
133 * | 60 | 4 | 7 |1| 56 | 64 |
134 * +--------------+---+---+-+-------------+-----------------+
135 * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
136 * +--------------+---+---+-+-------------+-----------------+
138 * | 8 bytes | 8 bytes | 8 bytes |
140 * dirid objectid of directory this item is for
142 * F fibration, see fs/reiser4/plugin/fibration.[ch]
144 * H 1 if last 8 bytes of the key contain hash,
145 * 0 if last 8 bytes of the key contain prefix-2
147 * prefix-1 first 7 characters of file name.
148 * Padded by zeroes if name is not long enough.
150 * prefix-2 next 8 characters of the file name.
152 * hash hash of the rest of file name (i.e., portion of file
153 * name not included into prefix-1).
155 * File names shorter than 15 (== 7 + 8) characters are completely encoded in
156 * the key. Such file names are called "short". They are distinguished by H
157 * bit set in the key.
159 * Other file names are "long". For long name, H bit is 0, and first 7
160 * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
161 * key are occupied by hash of the remaining characters of the name.
165 * | 60 | 4 | 4 | 60 | 64 |
166 * +--------------+---+---+--------------+-----------------+
167 * | locality id | 1 | 0 | objectid | 0 |
168 * +--------------+---+---+--------------+-----------------+
170 * | 8 bytes | 8 bytes | 8 bytes |
172 * locality id object id of a directory where first name was created for
175 * objectid object id for this object
179 * | 60 | 4 | 4 | 60 | 64 |
180 * +--------------+---+---+--------------+-----------------+
181 * | locality id | 4 | 0 | objectid | offset |
182 * +--------------+---+---+--------------+-----------------+
184 * | 8 bytes | 8 bytes | 8 bytes |
186 * locality id object id of a directory where first name was created for
189 * objectid object id for this object
191 * offset logical offset from the beginning of this file.
205 #include <linux/types.h> /* for __u?? */
206 #include <linux/fs.h> /* for struct super_block, etc */
208 /* bitmask for H bit (see comment at the beginning of this file */
209 static const __u64 longname_mark
= 0x0100000000000000ull
;
210 /* bitmask for F and H portions of the key. */
211 static const __u64 fibration_mask
= 0xff00000000000000ull
;
213 /* return true if name is not completely encoded in @key */
214 int is_longname_key(const reiser4_key
* key
)
218 assert("nikita-2863", key
!= NULL
);
219 if (get_key_type(key
) != KEY_FILE_NAME_MINOR
)
220 reiser4_print_key("oops", key
);
221 assert("nikita-2864", get_key_type(key
) == KEY_FILE_NAME_MINOR
);
223 if (REISER4_LARGE_KEY
)
224 highpart
= get_key_ordering(key
);
226 highpart
= get_key_objectid(key
);
228 return (highpart
& longname_mark
) ? 1 : 0;
231 /* return true if @name is too long to be completely encoded in the key */
232 int is_longname(const char *name UNUSED_ARG
, int len
)
234 if (REISER4_LARGE_KEY
)
240 /* code ascii string into __u64.
242 Put characters of @name into result (@str) one after another starting
243 from @start_idx-th highest (arithmetically) byte. This produces
244 endian-safe encoding. memcpy(2) will not do.
247 static __u64
pack_string(const char *name
/* string to encode */ ,
248 int start_idx
/* highest byte in result from
249 * which to start encoding */ )
255 for (i
= 0; (i
< sizeof str
- start_idx
) && name
[i
]; ++i
) {
257 str
|= (unsigned char)name
[i
];
259 str
<<= (sizeof str
- i
- start_idx
) << 3;
263 /* opposite to pack_string(). Takes value produced by pack_string(), restores
264 * string encoded in it and stores result in @buf */
265 char * reiser4_unpack_string(__u64 value
, char *buf
)
268 *buf
= value
>> (64 - 8);
272 } while (value
!= 0);
277 /* obtain name encoded in @key and store it in @buf */
278 char *extract_name_from_key(const reiser4_key
* key
, char *buf
)
282 assert("nikita-2868", !is_longname_key(key
));
285 if (REISER4_LARGE_KEY
) {
286 c
= reiser4_unpack_string(get_key_ordering(key
) &
288 c
= reiser4_unpack_string(get_key_fulloid(key
), c
);
290 c
= reiser4_unpack_string(get_key_fulloid(key
) &
292 reiser4_unpack_string(get_key_offset(key
), c
);
297 * complete_entry_key - calculate entry key by name
298 * @dir: directory where entry is (or will be) in
299 * @name: name to calculate key of
300 * @len: lenth of name
301 * @result: place to store result in
303 * Sets fields of entry key @result which depend on file name.
304 * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
305 * objectid and offset. Otherwise, objectid and offset are set.
307 void complete_entry_key(const struct inode
*dir
, const char *name
,
308 int len
, reiser4_key
*result
)
310 #if REISER4_LARGE_KEY
315 assert("nikita-1139", dir
!= NULL
);
316 assert("nikita-1142", result
!= NULL
);
317 assert("nikita-2867", strlen(name
) == len
);
320 * key allocation algorithm for directory entries in case of large
323 * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
324 * characters into ordering field of key, next 8 charactes (if any)
325 * into objectid field of key and next 8 ones (of any) into offset
328 * If file name is longer than 23 characters, put first 7 characters
329 * into key's ordering, next 8 to objectid and hash of remaining
330 * characters into offset field.
332 * To distinguish above cases, in latter set up unused high bit in
336 /* [0-6] characters to ordering */
337 ordering
= pack_string(name
, 1);
339 /* [7-14] characters to objectid */
340 objectid
= pack_string(name
+ 7, 0);
343 /* [15-23] characters to offset */
344 offset
= pack_string(name
+ 15, 0);
346 /* note in a key the fact that offset contains hash. */
347 ordering
|= longname_mark
;
349 /* offset is the hash of the file name's tail. */
350 offset
= inode_hash_plugin(dir
)->hash(name
+ 15,
361 assert("nikita-3480", inode_fibration_plugin(dir
) != NULL
);
362 ordering
|= inode_fibration_plugin(dir
)->fibre(dir
, name
, len
);
364 set_key_ordering(result
, ordering
);
365 set_key_fulloid(result
, objectid
);
366 set_key_offset(result
, offset
);
373 assert("nikita-1139", dir
!= NULL
);
374 assert("nikita-1142", result
!= NULL
);
375 assert("nikita-2867", strlen(name
) == len
);
378 * key allocation algorithm for directory entries in case of not large
381 * If name is not longer than 7 + 8 = 15 characters, put first 7
382 * characters into objectid field of key, next 8 charactes (if any)
383 * into offset field of key
385 * If file name is longer than 15 characters, put first 7 characters
386 * into key's objectid, and hash of remaining characters into offset
389 * To distinguish above cases, in latter set up unused high bit in
393 /* [0-6] characters to objectid */
394 objectid
= pack_string(name
, 1);
397 /* [7-14] characters to offset */
398 offset
= pack_string(name
+ 7, 0);
400 /* note in a key the fact that offset contains hash. */
401 objectid
|= longname_mark
;
403 /* offset is the hash of the file name. */
404 offset
= inode_hash_plugin(dir
)->hash(name
+ 7,
410 assert("nikita-3480", inode_fibration_plugin(dir
) != NULL
);
411 objectid
|= inode_fibration_plugin(dir
)->fibre(dir
, name
, len
);
413 set_key_fulloid(result
, objectid
);
414 set_key_offset(result
, offset
);
416 #endif /* ! REISER4_LARGE_KEY */
419 /* true, if @key is the key of "." */
420 int is_dot_key(const reiser4_key
* key
/* key to check */ )
422 assert("nikita-1717", key
!= NULL
);
423 assert("nikita-1718", get_key_type(key
) == KEY_FILE_NAME_MINOR
);
425 (get_key_ordering(key
) == 0ull) &&
426 (get_key_objectid(key
) == 0ull) && (get_key_offset(key
) == 0ull);
429 /* build key for stat-data.
431 return key of stat-data of this object. This should became sd plugin
432 method in the future. For now, let it be here.
435 reiser4_key
*build_sd_key(const struct inode
* target
/* inode of an object */ ,
436 reiser4_key
* result
/* resulting key of @target
439 assert("nikita-261", result
!= NULL
);
441 reiser4_key_init(result
);
442 set_key_locality(result
, reiser4_inode_data(target
)->locality_id
);
443 set_key_ordering(result
, get_inode_ordering(target
));
444 set_key_objectid(result
, get_inode_oid(target
));
445 set_key_type(result
, KEY_SD_MINOR
);
446 set_key_offset(result
, (__u64
) 0);
450 /* encode part of key into &obj_key_id
452 This encodes into @id part of @key sufficient to restore @key later,
453 given that latter is key of object (key of stat-data).
457 int build_obj_key_id(const reiser4_key
* key
/* key to encode */ ,
458 obj_key_id
* id
/* id where key is encoded in */ )
460 assert("nikita-1151", key
!= NULL
);
461 assert("nikita-1152", id
!= NULL
);
463 memcpy(id
, key
, sizeof *id
);
467 /* encode reference to @obj in @id.
469 This is like build_obj_key_id() above, but takes inode as parameter. */
470 int build_inode_key_id(const struct inode
*obj
/* object to build key of */ ,
471 obj_key_id
* id
/* result */ )
475 assert("nikita-1166", obj
!= NULL
);
476 assert("nikita-1167", id
!= NULL
);
478 build_sd_key(obj
, &sdkey
);
479 build_obj_key_id(&sdkey
, id
);
483 /* decode @id back into @key
485 Restore key of object stat-data from @id. This is dual to
486 build_obj_key_id() above.
488 int extract_key_from_id(const obj_key_id
* id
/* object key id to extract key
490 reiser4_key
* key
/* result */ )
492 assert("nikita-1153", id
!= NULL
);
493 assert("nikita-1154", key
!= NULL
);
495 reiser4_key_init(key
);
496 memcpy(key
, id
, sizeof *id
);
500 /* extract objectid of directory from key of directory entry within said
503 oid_t
extract_dir_id_from_key(const reiser4_key
* de_key
/* key of
507 assert("nikita-1314", de_key
!= NULL
);
508 return get_key_locality(de_key
);
511 /* encode into @id key of directory entry.
513 Encode into @id information sufficient to later distinguish directory
514 entries within the same directory. This is not whole key, because all
515 directory entries within directory item share locality which is equal
516 to objectid of their directory.
519 int build_de_id(const struct inode
*dir
/* inode of directory */ ,
520 const struct qstr
*name
/* name to be given to @obj by
521 * directory entry being
523 de_id
* id
/* short key of directory entry */ )
527 assert("nikita-1290", dir
!= NULL
);
528 assert("nikita-1292", id
!= NULL
);
530 /* NOTE-NIKITA this is suboptimal. */
531 inode_dir_plugin(dir
)->build_entry_key(dir
, name
, &key
);
532 return build_de_id_by_key(&key
, id
);
535 /* encode into @id key of directory entry.
537 Encode into @id information sufficient to later distinguish directory
538 entries within the same directory. This is not whole key, because all
539 directory entries within directory item share locality which is equal
540 to objectid of their directory.
543 int build_de_id_by_key(const reiser4_key
* entry_key
/* full key of directory
545 de_id
* id
/* short key of directory entry */ )
547 memcpy(id
, ((__u64
*) entry_key
) + 1, sizeof *id
);
551 /* restore from @id key of directory entry.
553 Function dual to build_de_id(): given @id and locality, build full
554 key of directory entry within directory item.
557 int extract_key_from_de_id(const oid_t locality
/* locality of directory
559 const de_id
* id
/* directory entry id */ ,
560 reiser4_key
* key
/* result */ )
562 /* no need to initialise key here: all fields are overwritten */
563 memcpy(((__u64
*) key
) + 1, id
, sizeof *id
);
564 set_key_locality(key
, locality
);
565 set_key_type(key
, KEY_FILE_NAME_MINOR
);
569 /* compare two &de_id's */
570 cmp_t
de_id_cmp(const de_id
* id1
/* first &de_id to compare */ ,
571 const de_id
* id2
/* second &de_id to compare */ )
573 /* NOTE-NIKITA ugly implementation */
577 extract_key_from_de_id((oid_t
) 0, id1
, &k1
);
578 extract_key_from_de_id((oid_t
) 0, id2
, &k2
);
579 return keycmp(&k1
, &k2
);
582 /* compare &de_id with key */
583 cmp_t
de_id_key_cmp(const de_id
* id
/* directory entry id to compare */ ,
584 const reiser4_key
* key
/* key to compare */ )
589 k1
= (reiser4_key
*) (((unsigned long)id
) - sizeof key
->el
[0]);
590 result
= KEY_DIFF_EL(k1
, key
, 1);
591 if (result
== EQUAL_TO
) {
592 result
= KEY_DIFF_EL(k1
, key
, 2);
593 if (REISER4_LARGE_KEY
&& result
== EQUAL_TO
) {
594 result
= KEY_DIFF_EL(k1
, key
, 3);
601 * return number of bytes necessary to encode @inode identity.
603 int inode_onwire_size(const struct inode
*inode
)
607 result
= dscale_bytes(get_inode_oid(inode
));
608 result
+= dscale_bytes(get_inode_locality(inode
));
611 * ordering is large (it usually has highest bits set), so it makes
612 * little sense to dscale it.
614 if (REISER4_LARGE_KEY
)
615 result
+= sizeof(get_inode_ordering(inode
));
620 * encode @inode identity at @start
622 char *build_inode_onwire(const struct inode
*inode
, char *start
)
624 start
+= dscale_write(start
, get_inode_locality(inode
));
625 start
+= dscale_write(start
, get_inode_oid(inode
));
627 if (REISER4_LARGE_KEY
) {
628 put_unaligned(cpu_to_le64(get_inode_ordering(inode
)), (__le64
*)start
);
629 start
+= sizeof(get_inode_ordering(inode
));
635 * extract key that was previously encoded by build_inode_onwire() at @addr
637 char *extract_obj_key_id_from_onwire(char *addr
, obj_key_id
* key_id
)
641 addr
+= dscale_read(addr
, &val
);
642 val
= (val
<< KEY_LOCALITY_SHIFT
) | KEY_SD_MINOR
;
643 put_unaligned(cpu_to_le64(val
), (__le64
*)key_id
->locality
);
644 addr
+= dscale_read(addr
, &val
);
645 put_unaligned(cpu_to_le64(val
), (__le64
*)key_id
->objectid
);
646 #if REISER4_LARGE_KEY
647 memcpy(&key_id
->ordering
, addr
, sizeof key_id
->ordering
);
648 addr
+= sizeof key_id
->ordering
;
655 c-indentation-style: "K&R"