On Tue, Nov 06, 2007 at 02:33:53AM -0800, akpm@linux-foundation.org wrote:
[mmotm.git] / fs / reiser4 / kassign.c
blob87a04dc849c02c52882bf301e7c57a140f1706fb
1 /* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
2 * reiser4/README */
4 /* Key assignment policy implementation */
6 /*
7 * In reiser4 every piece of file system data and meta-data has a key. Keys
8 * are used to store information in and retrieve it from reiser4 internal
9 * tree. In addition to this, keys define _ordering_ of all file system
10 * information: things having close keys are placed into the same or
11 * neighboring (in the tree order) nodes of the tree. As our block allocator
12 * tries to respect tree order (see flush.c), keys also define order in which
13 * things are laid out on the disk, and hence, affect performance directly.
15 * Obviously, assignment of keys to data and meta-data should be consistent
16 * across whole file system. Algorithm that calculates a key for a given piece
17 * of data or meta-data is referred to as "key assignment".
19 * Key assignment is too expensive to be implemented as a plugin (that is,
20 * with an ability to support different key assignment schemas in the same
21 * compiled kernel image). As a compromise, all key-assignment functions and
22 * data-structures are collected in this single file, so that modifications to
23 * key assignment algorithm can be localized. Additional changes may be
24 * required in key.[ch].
26 * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
27 * may guess, there is "Plan B" too.
32 * Additional complication with key assignment implementation is a requirement
33 * to support different key length.
37 * KEY ASSIGNMENT: PLAN A, LONG KEYS.
39 * DIRECTORY ITEMS
41 * | 60 | 4 | 7 |1| 56 | 64 | 64 |
42 * +--------------+---+---+-+-------------+------------------+-----------------+
43 * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
44 * +--------------+---+---+-+-------------+------------------+-----------------+
45 * | | | | |
46 * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
48 * dirid objectid of directory this item is for
50 * F fibration, see fs/reiser4/plugin/fibration.[ch]
52 * H 1 if last 8 bytes of the key contain hash,
53 * 0 if last 8 bytes of the key contain prefix-3
55 * prefix-1 first 7 characters of file name.
56 * Padded by zeroes if name is not long enough.
58 * prefix-2 next 8 characters of the file name.
60 * prefix-3 next 8 characters of the file name.
62 * hash hash of the rest of file name (i.e., portion of file
63 * name not included into prefix-1 and prefix-2).
65 * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
66 * in the key. Such file names are called "short". They are distinguished by H
67 * bit set 0 in the key.
69 * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
70 * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
71 * key. Last 8 bytes of the key are occupied by hash of the remaining
72 * characters of the name.
74 * This key assignment reaches following important goals:
76 * (1) directory entries are sorted in approximately lexicographical
77 * order.
79 * (2) collisions (when multiple directory items have the same key), while
80 * principally unavoidable in a tree with fixed length keys, are rare.
82 * STAT DATA
84 * | 60 | 4 | 64 | 4 | 60 | 64 |
85 * +--------------+---+-----------------+---+--------------+-----------------+
86 * | locality id | 1 | ordering | 0 | objectid | 0 |
87 * +--------------+---+-----------------+---+--------------+-----------------+
88 * | | | | |
89 * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
91 * locality id object id of a directory where first name was created for
92 * the object
94 * ordering copy of second 8-byte portion of the key of directory
95 * entry for the first name of this object. Ordering has a form
96 * {
97 * fibration :7;
98 * h :1;
99 * prefix1 :56;
101 * see description of key for directory entry above.
103 * objectid object id for this object
105 * This key assignment policy is designed to keep stat-data in the same order
106 * as corresponding directory items, thus speeding up readdir/stat types of
107 * workload.
109 * FILE BODY
111 * | 60 | 4 | 64 | 4 | 60 | 64 |
112 * +--------------+---+-----------------+---+--------------+-----------------+
113 * | locality id | 4 | ordering | 0 | objectid | offset |
114 * +--------------+---+-----------------+---+--------------+-----------------+
115 * | | | | |
116 * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
118 * locality id object id of a directory where first name was created for
119 * the object
121 * ordering the same as in the key of stat-data for this object
123 * objectid object id for this object
125 * offset logical offset from the beginning of this file.
126 * Measured in bytes.
129 * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
131 * DIRECTORY ITEMS
133 * | 60 | 4 | 7 |1| 56 | 64 |
134 * +--------------+---+---+-+-------------+-----------------+
135 * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
136 * +--------------+---+---+-+-------------+-----------------+
137 * | | | |
138 * | 8 bytes | 8 bytes | 8 bytes |
140 * dirid objectid of directory this item is for
142 * F fibration, see fs/reiser4/plugin/fibration.[ch]
144 * H 1 if last 8 bytes of the key contain hash,
145 * 0 if last 8 bytes of the key contain prefix-2
147 * prefix-1 first 7 characters of file name.
148 * Padded by zeroes if name is not long enough.
150 * prefix-2 next 8 characters of the file name.
152 * hash hash of the rest of file name (i.e., portion of file
153 * name not included into prefix-1).
155 * File names shorter than 15 (== 7 + 8) characters are completely encoded in
156 * the key. Such file names are called "short". They are distinguished by H
157 * bit set in the key.
159 * Other file names are "long". For long name, H bit is 0, and first 7
160 * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
161 * key are occupied by hash of the remaining characters of the name.
163 * STAT DATA
165 * | 60 | 4 | 4 | 60 | 64 |
166 * +--------------+---+---+--------------+-----------------+
167 * | locality id | 1 | 0 | objectid | 0 |
168 * +--------------+---+---+--------------+-----------------+
169 * | | | |
170 * | 8 bytes | 8 bytes | 8 bytes |
172 * locality id object id of a directory where first name was created for
173 * the object
175 * objectid object id for this object
177 * FILE BODY
179 * | 60 | 4 | 4 | 60 | 64 |
180 * +--------------+---+---+--------------+-----------------+
181 * | locality id | 4 | 0 | objectid | offset |
182 * +--------------+---+---+--------------+-----------------+
183 * | | | |
184 * | 8 bytes | 8 bytes | 8 bytes |
186 * locality id object id of a directory where first name was created for
187 * the object
189 * objectid object id for this object
191 * offset logical offset from the beginning of this file.
192 * Measured in bytes.
197 #include "debug.h"
198 #include "key.h"
199 #include "kassign.h"
200 #include "vfs_ops.h"
201 #include "inode.h"
202 #include "super.h"
203 #include "dscale.h"
205 #include <linux/types.h> /* for __u?? */
206 #include <linux/fs.h> /* for struct super_block, etc */
208 /* bitmask for H bit (see comment at the beginning of this file */
209 static const __u64 longname_mark = 0x0100000000000000ull;
210 /* bitmask for F and H portions of the key. */
211 static const __u64 fibration_mask = 0xff00000000000000ull;
213 /* return true if name is not completely encoded in @key */
214 int is_longname_key(const reiser4_key * key)
216 __u64 highpart;
218 assert("nikita-2863", key != NULL);
219 if (get_key_type(key) != KEY_FILE_NAME_MINOR)
220 reiser4_print_key("oops", key);
221 assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
223 if (REISER4_LARGE_KEY)
224 highpart = get_key_ordering(key);
225 else
226 highpart = get_key_objectid(key);
228 return (highpart & longname_mark) ? 1 : 0;
231 /* return true if @name is too long to be completely encoded in the key */
232 int is_longname(const char *name UNUSED_ARG, int len)
234 if (REISER4_LARGE_KEY)
235 return len > 23;
236 else
237 return len > 15;
240 /* code ascii string into __u64.
242 Put characters of @name into result (@str) one after another starting
243 from @start_idx-th highest (arithmetically) byte. This produces
244 endian-safe encoding. memcpy(2) will not do.
247 static __u64 pack_string(const char *name /* string to encode */ ,
248 int start_idx /* highest byte in result from
249 * which to start encoding */ )
251 unsigned i;
252 __u64 str;
254 str = 0;
255 for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
256 str <<= 8;
257 str |= (unsigned char)name[i];
259 str <<= (sizeof str - i - start_idx) << 3;
260 return str;
263 /* opposite to pack_string(). Takes value produced by pack_string(), restores
264 * string encoded in it and stores result in @buf */
265 char *reiser4_unpack_string(__u64 value, char *buf)
267 do {
268 *buf = value >> (64 - 8);
269 if (*buf)
270 ++buf;
271 value <<= 8;
272 } while (value != 0);
273 *buf = 0;
274 return buf;
277 /* obtain name encoded in @key and store it in @buf */
278 char *extract_name_from_key(const reiser4_key * key, char *buf)
280 char *c;
282 assert("nikita-2868", !is_longname_key(key));
284 c = buf;
285 if (REISER4_LARGE_KEY) {
286 c = reiser4_unpack_string(get_key_ordering(key) &
287 ~fibration_mask, c);
288 c = reiser4_unpack_string(get_key_fulloid(key), c);
289 } else
290 c = reiser4_unpack_string(get_key_fulloid(key) &
291 ~fibration_mask, c);
292 reiser4_unpack_string(get_key_offset(key), c);
293 return buf;
297 * complete_entry_key - calculate entry key by name
298 * @dir: directory where entry is (or will be) in
299 * @name: name to calculate key of
300 * @len: lenth of name
301 * @result: place to store result in
303 * Sets fields of entry key @result which depend on file name.
304 * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
305 * objectid and offset. Otherwise, objectid and offset are set.
307 void complete_entry_key(const struct inode *dir, const char *name,
308 int len, reiser4_key *result)
310 #if REISER4_LARGE_KEY
311 __u64 ordering;
312 __u64 objectid;
313 __u64 offset;
315 assert("nikita-1139", dir != NULL);
316 assert("nikita-1142", result != NULL);
317 assert("nikita-2867", strlen(name) == len);
320 * key allocation algorithm for directory entries in case of large
321 * keys:
323 * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
324 * characters into ordering field of key, next 8 charactes (if any)
325 * into objectid field of key and next 8 ones (of any) into offset
326 * field of key
328 * If file name is longer than 23 characters, put first 7 characters
329 * into key's ordering, next 8 to objectid and hash of remaining
330 * characters into offset field.
332 * To distinguish above cases, in latter set up unused high bit in
333 * ordering field.
336 /* [0-6] characters to ordering */
337 ordering = pack_string(name, 1);
338 if (len > 7) {
339 /* [7-14] characters to objectid */
340 objectid = pack_string(name + 7, 0);
341 if (len > 15) {
342 if (len <= 23) {
343 /* [15-23] characters to offset */
344 offset = pack_string(name + 15, 0);
345 } else {
346 /* note in a key the fact that offset contains
347 * hash */
348 ordering |= longname_mark;
350 /* offset is the hash of the file name's tail */
351 offset = inode_hash_plugin(dir)->hash(name + 15,
352 len - 15);
354 } else {
355 offset = 0ull;
357 } else {
358 objectid = 0ull;
359 offset = 0ull;
362 assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
363 ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
365 set_key_ordering(result, ordering);
366 set_key_fulloid(result, objectid);
367 set_key_offset(result, offset);
368 return;
370 #else
371 __u64 objectid;
372 __u64 offset;
374 assert("nikita-1139", dir != NULL);
375 assert("nikita-1142", result != NULL);
376 assert("nikita-2867", strlen(name) == len);
379 * key allocation algorithm for directory entries in case of not large
380 * keys:
382 * If name is not longer than 7 + 8 = 15 characters, put first 7
383 * characters into objectid field of key, next 8 charactes (if any)
384 * into offset field of key
386 * If file name is longer than 15 characters, put first 7 characters
387 * into key's objectid, and hash of remaining characters into offset
388 * field.
390 * To distinguish above cases, in latter set up unused high bit in
391 * objectid field.
394 /* [0-6] characters to objectid */
395 objectid = pack_string(name, 1);
396 if (len > 7) {
397 if (len <= 15) {
398 /* [7-14] characters to offset */
399 offset = pack_string(name + 7, 0);
400 } else {
401 /* note in a key the fact that offset contains hash. */
402 objectid |= longname_mark;
404 /* offset is the hash of the file name. */
405 offset = inode_hash_plugin(dir)->hash(name + 7,
406 len - 7);
408 } else
409 offset = 0ull;
411 assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
412 objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
414 set_key_fulloid(result, objectid);
415 set_key_offset(result, offset);
416 return;
417 #endif /* ! REISER4_LARGE_KEY */
420 /* true, if @key is the key of "." */
421 int is_dot_key(const reiser4_key * key/* key to check */)
423 assert("nikita-1717", key != NULL);
424 assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
425 return
426 (get_key_ordering(key) == 0ull) &&
427 (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
430 /* build key for stat-data.
432 return key of stat-data of this object. This should became sd plugin
433 method in the future. For now, let it be here.
436 reiser4_key *build_sd_key(const struct inode *target /* inode of an object */ ,
437 reiser4_key * result /* resulting key of @target
438 stat-data */ )
440 assert("nikita-261", result != NULL);
442 reiser4_key_init(result);
443 set_key_locality(result, reiser4_inode_data(target)->locality_id);
444 set_key_ordering(result, get_inode_ordering(target));
445 set_key_objectid(result, get_inode_oid(target));
446 set_key_type(result, KEY_SD_MINOR);
447 set_key_offset(result, (__u64) 0);
448 return result;
451 /* encode part of key into &obj_key_id
453 This encodes into @id part of @key sufficient to restore @key later,
454 given that latter is key of object (key of stat-data).
456 See &obj_key_id
458 int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
459 obj_key_id * id/* id where key is encoded in */)
461 assert("nikita-1151", key != NULL);
462 assert("nikita-1152", id != NULL);
464 memcpy(id, key, sizeof *id);
465 return 0;
468 /* encode reference to @obj in @id.
470 This is like build_obj_key_id() above, but takes inode as parameter. */
471 int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
472 obj_key_id * id/* result */)
474 reiser4_key sdkey;
476 assert("nikita-1166", obj != NULL);
477 assert("nikita-1167", id != NULL);
479 build_sd_key(obj, &sdkey);
480 build_obj_key_id(&sdkey, id);
481 return 0;
484 /* decode @id back into @key
486 Restore key of object stat-data from @id. This is dual to
487 build_obj_key_id() above.
489 int extract_key_from_id(const obj_key_id * id /* object key id to extract key
490 * from */ ,
491 reiser4_key * key/* result */)
493 assert("nikita-1153", id != NULL);
494 assert("nikita-1154", key != NULL);
496 reiser4_key_init(key);
497 memcpy(key, id, sizeof *id);
498 return 0;
501 /* extract objectid of directory from key of directory entry within said
502 directory.
504 oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
505 * directory
506 * entry */ )
508 assert("nikita-1314", de_key != NULL);
509 return get_key_locality(de_key);
512 /* encode into @id key of directory entry.
514 Encode into @id information sufficient to later distinguish directory
515 entries within the same directory. This is not whole key, because all
516 directory entries within directory item share locality which is equal
517 to objectid of their directory.
520 int build_de_id(const struct inode *dir /* inode of directory */ ,
521 const struct qstr *name /* name to be given to @obj by
522 * directory entry being
523 * constructed */ ,
524 de_id * id/* short key of directory entry */)
526 reiser4_key key;
528 assert("nikita-1290", dir != NULL);
529 assert("nikita-1292", id != NULL);
531 /* NOTE-NIKITA this is suboptimal. */
532 inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
533 return build_de_id_by_key(&key, id);
536 /* encode into @id key of directory entry.
538 Encode into @id information sufficient to later distinguish directory
539 entries within the same directory. This is not whole key, because all
540 directory entries within directory item share locality which is equal
541 to objectid of their directory.
544 int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
545 * entry */ ,
546 de_id * id/* short key of directory entry */)
548 memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
549 return 0;
552 /* restore from @id key of directory entry.
554 Function dual to build_de_id(): given @id and locality, build full
555 key of directory entry within directory item.
558 int extract_key_from_de_id(const oid_t locality /* locality of directory
559 * entry */ ,
560 const de_id * id /* directory entry id */ ,
561 reiser4_key * key/* result */)
563 /* no need to initialise key here: all fields are overwritten */
564 memcpy(((__u64 *) key) + 1, id, sizeof *id);
565 set_key_locality(key, locality);
566 set_key_type(key, KEY_FILE_NAME_MINOR);
567 return 0;
570 /* compare two &de_id's */
571 cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
572 const de_id * id2/* second &de_id to compare */)
574 /* NOTE-NIKITA ugly implementation */
575 reiser4_key k1;
576 reiser4_key k2;
578 extract_key_from_de_id((oid_t) 0, id1, &k1);
579 extract_key_from_de_id((oid_t) 0, id2, &k2);
580 return keycmp(&k1, &k2);
583 /* compare &de_id with key */
584 cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
585 const reiser4_key * key/* key to compare */)
587 cmp_t result;
588 reiser4_key *k1;
590 k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
591 result = KEY_DIFF_EL(k1, key, 1);
592 if (result == EQUAL_TO) {
593 result = KEY_DIFF_EL(k1, key, 2);
594 if (REISER4_LARGE_KEY && result == EQUAL_TO)
595 result = KEY_DIFF_EL(k1, key, 3);
597 return result;
601 * return number of bytes necessary to encode @inode identity.
603 int inode_onwire_size(const struct inode *inode)
605 int result;
607 result = dscale_bytes_to_write(get_inode_oid(inode));
608 result += dscale_bytes_to_write(get_inode_locality(inode));
611 * ordering is large (it usually has highest bits set), so it makes
612 * little sense to dscale it.
614 if (REISER4_LARGE_KEY)
615 result += sizeof(get_inode_ordering(inode));
616 return result;
620 * encode @inode identity at @start
622 char *build_inode_onwire(const struct inode *inode, char *start)
624 start += dscale_write(start, get_inode_locality(inode));
625 start += dscale_write(start, get_inode_oid(inode));
627 if (REISER4_LARGE_KEY) {
628 put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
629 start += sizeof(get_inode_ordering(inode));
631 return start;
635 * extract key that was previously encoded by build_inode_onwire() at @addr
637 char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
639 __u64 val;
641 addr += dscale_read(addr, &val);
642 val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
643 put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
644 addr += dscale_read(addr, &val);
645 put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
646 #if REISER4_LARGE_KEY
647 memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
648 addr += sizeof key_id->ordering;
649 #endif
650 return addr;
654 * skip a key that was previously encoded by build_inode_onwire() at @addr
655 * FIXME: handle IO errors.
657 char * locate_obj_key_id_onwire(char * addr)
659 /* locality */
660 addr += dscale_bytes_to_read(addr);
661 /* objectid */
662 addr += dscale_bytes_to_read(addr);
663 #if REISER4_LARGE_KEY
664 addr += sizeof ((obj_key_id *)0)->ordering;
665 #endif
666 return addr;
669 /* Make Linus happy.
670 Local variables:
671 c-indentation-style: "K&R"
672 mode-name: "LC"
673 c-basic-offset: 8
674 tab-width: 8
675 fill-column: 120
676 End: