revert-mm-fix-blkdev-size-calculation-in-generic_write_checks
[linux-2.6/linux-trees-mm.git] / fs / reiser4 / kassign.c
blob3c8f9f5498daa73417a908a4a57ebb202bd99f31
1 /* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
2 * reiser4/README */
4 /* Key assignment policy implementation */
6 /*
7 * In reiser4 every piece of file system data and meta-data has a key. Keys
8 * are used to store information in and retrieve it from reiser4 internal
9 * tree. In addition to this, keys define _ordering_ of all file system
10 * information: things having close keys are placed into the same or
11 * neighboring (in the tree order) nodes of the tree. As our block allocator
12 * tries to respect tree order (see flush.c), keys also define order in which
13 * things are laid out on the disk, and hence, affect performance directly.
15 * Obviously, assignment of keys to data and meta-data should be consistent
16 * across whole file system. Algorithm that calculates a key for a given piece
17 * of data or meta-data is referred to as "key assignment".
19 * Key assignment is too expensive to be implemented as a plugin (that is,
20 * with an ability to support different key assignment schemas in the same
21 * compiled kernel image). As a compromise, all key-assignment functions and
22 * data-structures are collected in this single file, so that modifications to
23 * key assignment algorithm can be localized. Additional changes may be
24 * required in key.[ch].
26 * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
27 * may guess, there is "Plan B" too.
32 * Additional complication with key assignment implementation is a requirement
33 * to support different key length.
37 * KEY ASSIGNMENT: PLAN A, LONG KEYS.
39 * DIRECTORY ITEMS
41 * | 60 | 4 | 7 |1| 56 | 64 | 64 |
42 * +--------------+---+---+-+-------------+------------------+-----------------+
43 * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
44 * +--------------+---+---+-+-------------+------------------+-----------------+
45 * | | | | |
46 * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
48 * dirid objectid of directory this item is for
50 * F fibration, see fs/reiser4/plugin/fibration.[ch]
52 * H 1 if last 8 bytes of the key contain hash,
53 * 0 if last 8 bytes of the key contain prefix-3
55 * prefix-1 first 7 characters of file name.
56 * Padded by zeroes if name is not long enough.
58 * prefix-2 next 8 characters of the file name.
60 * prefix-3 next 8 characters of the file name.
62 * hash hash of the rest of file name (i.e., portion of file
63 * name not included into prefix-1 and prefix-2).
65 * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
66 * in the key. Such file names are called "short". They are distinguished by H
67 * bit set 0 in the key.
69 * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
70 * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
71 * key. Last 8 bytes of the key are occupied by hash of the remaining
72 * characters of the name.
74 * This key assignment reaches following important goals:
76 * (1) directory entries are sorted in approximately lexicographical
77 * order.
79 * (2) collisions (when multiple directory items have the same key), while
80 * principally unavoidable in a tree with fixed length keys, are rare.
82 * STAT DATA
84 * | 60 | 4 | 64 | 4 | 60 | 64 |
85 * +--------------+---+-----------------+---+--------------+-----------------+
86 * | locality id | 1 | ordering | 0 | objectid | 0 |
87 * +--------------+---+-----------------+---+--------------+-----------------+
88 * | | | | |
89 * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
91 * locality id object id of a directory where first name was created for
92 * the object
94 * ordering copy of second 8-byte portion of the key of directory
95 * entry for the first name of this object. Ordering has a form
96 * {
97 * fibration :7;
98 * h :1;
99 * prefix1 :56;
101 * see description of key for directory entry above.
103 * objectid object id for this object
105 * This key assignment policy is designed to keep stat-data in the same order
106 * as corresponding directory items, thus speeding up readdir/stat types of
107 * workload.
109 * FILE BODY
111 * | 60 | 4 | 64 | 4 | 60 | 64 |
112 * +--------------+---+-----------------+---+--------------+-----------------+
113 * | locality id | 4 | ordering | 0 | objectid | offset |
114 * +--------------+---+-----------------+---+--------------+-----------------+
115 * | | | | |
116 * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
118 * locality id object id of a directory where first name was created for
119 * the object
121 * ordering the same as in the key of stat-data for this object
123 * objectid object id for this object
125 * offset logical offset from the beginning of this file.
126 * Measured in bytes.
129 * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
131 * DIRECTORY ITEMS
133 * | 60 | 4 | 7 |1| 56 | 64 |
134 * +--------------+---+---+-+-------------+-----------------+
135 * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
136 * +--------------+---+---+-+-------------+-----------------+
137 * | | | |
138 * | 8 bytes | 8 bytes | 8 bytes |
140 * dirid objectid of directory this item is for
142 * F fibration, see fs/reiser4/plugin/fibration.[ch]
144 * H 1 if last 8 bytes of the key contain hash,
145 * 0 if last 8 bytes of the key contain prefix-2
147 * prefix-1 first 7 characters of file name.
148 * Padded by zeroes if name is not long enough.
150 * prefix-2 next 8 characters of the file name.
152 * hash hash of the rest of file name (i.e., portion of file
153 * name not included into prefix-1).
155 * File names shorter than 15 (== 7 + 8) characters are completely encoded in
156 * the key. Such file names are called "short". They are distinguished by H
157 * bit set in the key.
159 * Other file names are "long". For long name, H bit is 0, and first 7
160 * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
161 * key are occupied by hash of the remaining characters of the name.
163 * STAT DATA
165 * | 60 | 4 | 4 | 60 | 64 |
166 * +--------------+---+---+--------------+-----------------+
167 * | locality id | 1 | 0 | objectid | 0 |
168 * +--------------+---+---+--------------+-----------------+
169 * | | | |
170 * | 8 bytes | 8 bytes | 8 bytes |
172 * locality id object id of a directory where first name was created for
173 * the object
175 * objectid object id for this object
177 * FILE BODY
179 * | 60 | 4 | 4 | 60 | 64 |
180 * +--------------+---+---+--------------+-----------------+
181 * | locality id | 4 | 0 | objectid | offset |
182 * +--------------+---+---+--------------+-----------------+
183 * | | | |
184 * | 8 bytes | 8 bytes | 8 bytes |
186 * locality id object id of a directory where first name was created for
187 * the object
189 * objectid object id for this object
191 * offset logical offset from the beginning of this file.
192 * Measured in bytes.
197 #include "debug.h"
198 #include "key.h"
199 #include "kassign.h"
200 #include "vfs_ops.h"
201 #include "inode.h"
202 #include "super.h"
203 #include "dscale.h"
205 #include <linux/types.h> /* for __u?? */
206 #include <linux/fs.h> /* for struct super_block, etc */
208 /* bitmask for H bit (see comment at the beginning of this file */
209 static const __u64 longname_mark = 0x0100000000000000ull;
210 /* bitmask for F and H portions of the key. */
211 static const __u64 fibration_mask = 0xff00000000000000ull;
213 /* return true if name is not completely encoded in @key */
214 int is_longname_key(const reiser4_key * key)
216 __u64 highpart;
218 assert("nikita-2863", key != NULL);
219 if (get_key_type(key) != KEY_FILE_NAME_MINOR)
220 reiser4_print_key("oops", key);
221 assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
223 if (REISER4_LARGE_KEY)
224 highpart = get_key_ordering(key);
225 else
226 highpart = get_key_objectid(key);
228 return (highpart & longname_mark) ? 1 : 0;
231 /* return true if @name is too long to be completely encoded in the key */
232 int is_longname(const char *name UNUSED_ARG, int len)
234 if (REISER4_LARGE_KEY)
235 return len > 23;
236 else
237 return len > 15;
240 /* code ascii string into __u64.
242 Put characters of @name into result (@str) one after another starting
243 from @start_idx-th highest (arithmetically) byte. This produces
244 endian-safe encoding. memcpy(2) will not do.
247 static __u64 pack_string(const char *name /* string to encode */ ,
248 int start_idx /* highest byte in result from
249 * which to start encoding */ )
251 unsigned i;
252 __u64 str;
254 str = 0;
255 for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
256 str <<= 8;
257 str |= (unsigned char)name[i];
259 str <<= (sizeof str - i - start_idx) << 3;
260 return str;
263 /* opposite to pack_string(). Takes value produced by pack_string(), restores
264 * string encoded in it and stores result in @buf */
265 char * reiser4_unpack_string(__u64 value, char *buf)
267 do {
268 *buf = value >> (64 - 8);
269 if (*buf)
270 ++buf;
271 value <<= 8;
272 } while (value != 0);
273 *buf = 0;
274 return buf;
277 /* obtain name encoded in @key and store it in @buf */
278 char *extract_name_from_key(const reiser4_key * key, char *buf)
280 char *c;
282 assert("nikita-2868", !is_longname_key(key));
284 c = buf;
285 if (REISER4_LARGE_KEY) {
286 c = reiser4_unpack_string(get_key_ordering(key) &
287 ~fibration_mask, c);
288 c = reiser4_unpack_string(get_key_fulloid(key), c);
289 } else
290 c = reiser4_unpack_string(get_key_fulloid(key) &
291 ~fibration_mask, c);
292 reiser4_unpack_string(get_key_offset(key), c);
293 return buf;
297 * complete_entry_key - calculate entry key by name
298 * @dir: directory where entry is (or will be) in
299 * @name: name to calculate key of
300 * @len: lenth of name
301 * @result: place to store result in
303 * Sets fields of entry key @result which depend on file name.
304 * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
305 * objectid and offset. Otherwise, objectid and offset are set.
307 void complete_entry_key(const struct inode *dir, const char *name,
308 int len, reiser4_key *result)
310 #if REISER4_LARGE_KEY
311 __u64 ordering;
312 __u64 objectid;
313 __u64 offset;
315 assert("nikita-1139", dir != NULL);
316 assert("nikita-1142", result != NULL);
317 assert("nikita-2867", strlen(name) == len);
320 * key allocation algorithm for directory entries in case of large
321 * keys:
323 * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
324 * characters into ordering field of key, next 8 charactes (if any)
325 * into objectid field of key and next 8 ones (of any) into offset
326 * field of key
328 * If file name is longer than 23 characters, put first 7 characters
329 * into key's ordering, next 8 to objectid and hash of remaining
330 * characters into offset field.
332 * To distinguish above cases, in latter set up unused high bit in
333 * ordering field.
336 /* [0-6] characters to ordering */
337 ordering = pack_string(name, 1);
338 if (len > 7) {
339 /* [7-14] characters to objectid */
340 objectid = pack_string(name + 7, 0);
341 if (len > 15) {
342 if (len <= 23) {
343 /* [15-23] characters to offset */
344 offset = pack_string(name + 15, 0);
345 } else {
346 /* note in a key the fact that offset contains hash. */
347 ordering |= longname_mark;
349 /* offset is the hash of the file name's tail. */
350 offset = inode_hash_plugin(dir)->hash(name + 15,
351 len - 15);
353 } else {
354 offset = 0ull;
356 } else {
357 objectid = 0ull;
358 offset = 0ull;
361 assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
362 ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
364 set_key_ordering(result, ordering);
365 set_key_fulloid(result, objectid);
366 set_key_offset(result, offset);
367 return;
369 #else
370 __u64 objectid;
371 __u64 offset;
373 assert("nikita-1139", dir != NULL);
374 assert("nikita-1142", result != NULL);
375 assert("nikita-2867", strlen(name) == len);
378 * key allocation algorithm for directory entries in case of not large
379 * keys:
381 * If name is not longer than 7 + 8 = 15 characters, put first 7
382 * characters into objectid field of key, next 8 charactes (if any)
383 * into offset field of key
385 * If file name is longer than 15 characters, put first 7 characters
386 * into key's objectid, and hash of remaining characters into offset
387 * field.
389 * To distinguish above cases, in latter set up unused high bit in
390 * objectid field.
393 /* [0-6] characters to objectid */
394 objectid = pack_string(name, 1);
395 if (len > 7) {
396 if (len <= 15) {
397 /* [7-14] characters to offset */
398 offset = pack_string(name + 7, 0);
399 } else {
400 /* note in a key the fact that offset contains hash. */
401 objectid |= longname_mark;
403 /* offset is the hash of the file name. */
404 offset = inode_hash_plugin(dir)->hash(name + 7,
405 len - 7);
407 } else
408 offset = 0ull;
410 assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
411 objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
413 set_key_fulloid(result, objectid);
414 set_key_offset(result, offset);
415 return;
416 #endif /* ! REISER4_LARGE_KEY */
419 /* true, if @key is the key of "." */
420 int is_dot_key(const reiser4_key * key /* key to check */ )
422 assert("nikita-1717", key != NULL);
423 assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
424 return
425 (get_key_ordering(key) == 0ull) &&
426 (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
429 /* build key for stat-data.
431 return key of stat-data of this object. This should became sd plugin
432 method in the future. For now, let it be here.
435 reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
436 reiser4_key * result /* resulting key of @target
437 stat-data */ )
439 assert("nikita-261", result != NULL);
441 reiser4_key_init(result);
442 set_key_locality(result, reiser4_inode_data(target)->locality_id);
443 set_key_ordering(result, get_inode_ordering(target));
444 set_key_objectid(result, get_inode_oid(target));
445 set_key_type(result, KEY_SD_MINOR);
446 set_key_offset(result, (__u64) 0);
447 return result;
450 /* encode part of key into &obj_key_id
452 This encodes into @id part of @key sufficient to restore @key later,
453 given that latter is key of object (key of stat-data).
455 See &obj_key_id
457 int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
458 obj_key_id * id /* id where key is encoded in */ )
460 assert("nikita-1151", key != NULL);
461 assert("nikita-1152", id != NULL);
463 memcpy(id, key, sizeof *id);
464 return 0;
467 /* encode reference to @obj in @id.
469 This is like build_obj_key_id() above, but takes inode as parameter. */
470 int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
471 obj_key_id * id /* result */ )
473 reiser4_key sdkey;
475 assert("nikita-1166", obj != NULL);
476 assert("nikita-1167", id != NULL);
478 build_sd_key(obj, &sdkey);
479 build_obj_key_id(&sdkey, id);
480 return 0;
483 /* decode @id back into @key
485 Restore key of object stat-data from @id. This is dual to
486 build_obj_key_id() above.
488 int extract_key_from_id(const obj_key_id * id /* object key id to extract key
489 * from */ ,
490 reiser4_key * key /* result */ )
492 assert("nikita-1153", id != NULL);
493 assert("nikita-1154", key != NULL);
495 reiser4_key_init(key);
496 memcpy(key, id, sizeof *id);
497 return 0;
500 /* extract objectid of directory from key of directory entry within said
501 directory.
503 oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
504 * directory
505 * entry */ )
507 assert("nikita-1314", de_key != NULL);
508 return get_key_locality(de_key);
511 /* encode into @id key of directory entry.
513 Encode into @id information sufficient to later distinguish directory
514 entries within the same directory. This is not whole key, because all
515 directory entries within directory item share locality which is equal
516 to objectid of their directory.
519 int build_de_id(const struct inode *dir /* inode of directory */ ,
520 const struct qstr *name /* name to be given to @obj by
521 * directory entry being
522 * constructed */ ,
523 de_id * id /* short key of directory entry */ )
525 reiser4_key key;
527 assert("nikita-1290", dir != NULL);
528 assert("nikita-1292", id != NULL);
530 /* NOTE-NIKITA this is suboptimal. */
531 inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
532 return build_de_id_by_key(&key, id);
535 /* encode into @id key of directory entry.
537 Encode into @id information sufficient to later distinguish directory
538 entries within the same directory. This is not whole key, because all
539 directory entries within directory item share locality which is equal
540 to objectid of their directory.
543 int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
544 * entry */ ,
545 de_id * id /* short key of directory entry */ )
547 memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
548 return 0;
551 /* restore from @id key of directory entry.
553 Function dual to build_de_id(): given @id and locality, build full
554 key of directory entry within directory item.
557 int extract_key_from_de_id(const oid_t locality /* locality of directory
558 * entry */ ,
559 const de_id * id /* directory entry id */ ,
560 reiser4_key * key /* result */ )
562 /* no need to initialise key here: all fields are overwritten */
563 memcpy(((__u64 *) key) + 1, id, sizeof *id);
564 set_key_locality(key, locality);
565 set_key_type(key, KEY_FILE_NAME_MINOR);
566 return 0;
569 /* compare two &de_id's */
570 cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
571 const de_id * id2 /* second &de_id to compare */ )
573 /* NOTE-NIKITA ugly implementation */
574 reiser4_key k1;
575 reiser4_key k2;
577 extract_key_from_de_id((oid_t) 0, id1, &k1);
578 extract_key_from_de_id((oid_t) 0, id2, &k2);
579 return keycmp(&k1, &k2);
582 /* compare &de_id with key */
583 cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
584 const reiser4_key * key /* key to compare */ )
586 cmp_t result;
587 reiser4_key *k1;
589 k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
590 result = KEY_DIFF_EL(k1, key, 1);
591 if (result == EQUAL_TO) {
592 result = KEY_DIFF_EL(k1, key, 2);
593 if (REISER4_LARGE_KEY && result == EQUAL_TO) {
594 result = KEY_DIFF_EL(k1, key, 3);
597 return result;
601 * return number of bytes necessary to encode @inode identity.
603 int inode_onwire_size(const struct inode *inode)
605 int result;
607 result = dscale_bytes(get_inode_oid(inode));
608 result += dscale_bytes(get_inode_locality(inode));
611 * ordering is large (it usually has highest bits set), so it makes
612 * little sense to dscale it.
614 if (REISER4_LARGE_KEY)
615 result += sizeof(get_inode_ordering(inode));
616 return result;
620 * encode @inode identity at @start
622 char *build_inode_onwire(const struct inode *inode, char *start)
624 start += dscale_write(start, get_inode_locality(inode));
625 start += dscale_write(start, get_inode_oid(inode));
627 if (REISER4_LARGE_KEY) {
628 put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
629 start += sizeof(get_inode_ordering(inode));
631 return start;
635 * extract key that was previously encoded by build_inode_onwire() at @addr
637 char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
639 __u64 val;
641 addr += dscale_read(addr, &val);
642 val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
643 put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
644 addr += dscale_read(addr, &val);
645 put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
646 #if REISER4_LARGE_KEY
647 memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
648 addr += sizeof key_id->ordering;
649 #endif
650 return addr;
653 /* Make Linus happy.
654 Local variables:
655 c-indentation-style: "K&R"
656 mode-name: "LC"
657 c-basic-offset: 8
658 tab-width: 8
659 fill-column: 120
660 End: