On Tue, Nov 06, 2007 at 02:33:53AM -0800, akpm@linux-foundation.org wrote:
[mmotm.git] / fs / reiser4 / tree.h
blob73aa70ab5ab0aa6b443683e6468a19d87f4a5eac
1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
2 * reiser4/README */
4 /* Tree operations. See fs/reiser4/tree.c for comments */
6 #if !defined( __REISER4_TREE_H__ )
7 #define __REISER4_TREE_H__
9 #include "forward.h"
10 #include "debug.h"
11 #include "dformat.h"
12 #include "plugin/node/node.h"
13 #include "plugin/plugin.h"
14 #include "znode.h"
15 #include "tap.h"
17 #include <linux/types.h> /* for __u?? */
18 #include <linux/fs.h> /* for struct super_block */
19 #include <linux/spinlock.h>
20 #include <linux/sched.h> /* for struct task_struct */
22 /* fictive block number never actually used */
23 extern const reiser4_block_nr UBER_TREE_ADDR;
25 /* &cbk_cache_slot - entry in a coord cache.
27 This is entry in a coord_by_key (cbk) cache, represented by
28 &cbk_cache.
31 typedef struct cbk_cache_slot {
32 /* cached node */
33 znode *node;
34 /* linkage to the next cbk cache slot in a LRU order */
35 struct list_head lru;
36 } cbk_cache_slot;
38 /* &cbk_cache - coord cache. This is part of reiser4_tree.
40 cbk_cache is supposed to speed up tree lookups by caching results of recent
41 successful lookups (we don't cache negative results as dentry cache
42 does). Cache consists of relatively small number of entries kept in a LRU
43 order. Each entry (&cbk_cache_slot) contains a pointer to znode, from
44 which we can obtain a range of keys that covered by this znode. Before
45 embarking into real tree traversal we scan cbk_cache slot by slot and for
46 each slot check whether key we are looking for is between minimal and
47 maximal keys for node pointed to by this slot. If no match is found, real
48 tree traversal is performed and if result is successful, appropriate entry
49 is inserted into cache, possibly pulling least recently used entry out of
50 it.
52 Tree spin lock is used to protect coord cache. If contention for this
53 lock proves to be too high, more finer grained locking can be added.
55 Invariants involving parts of this data-type:
57 [cbk-cache-invariant]
59 typedef struct cbk_cache {
60 /* serializator */
61 rwlock_t guard;
62 int nr_slots;
63 /* head of LRU list of cache slots */
64 struct list_head lru;
65 /* actual array of slots */
66 cbk_cache_slot *slot;
67 } cbk_cache;
69 /* level_lookup_result - possible outcome of looking up key at some level.
70 This is used by coord_by_key when traversing tree downward. */
71 typedef enum {
72 /* continue to the next level */
73 LOOKUP_CONT,
74 /* done. Either required item was found, or we can prove it
75 doesn't exist, or some error occurred. */
76 LOOKUP_DONE,
77 /* restart traversal from the root. Infamous "repetition". */
78 LOOKUP_REST
79 } level_lookup_result;
81 /* This is representation of internal reiser4 tree where all file-system
82 data and meta-data are stored. This structure is passed to all tree
83 manipulation functions. It's different from the super block because:
84 we don't want to limit ourselves to strictly one to one mapping
85 between super blocks and trees, and, because they are logically
86 different: there are things in a super block that have no relation to
87 the tree (bitmaps, journalling area, mount options, etc.) and there
88 are things in a tree that bear no relation to the super block, like
89 tree of znodes.
91 At this time, there is only one tree
92 per filesystem, and this struct is part of the super block. We only
93 call the super block the super block for historical reasons (most
94 other filesystems call the per filesystem metadata the super block).
97 struct reiser4_tree {
98 /* block_nr == 0 is fake znode. Write lock it, while changing
99 tree height. */
100 /* disk address of root node of a tree */
101 reiser4_block_nr root_block;
103 /* level of the root node. If this is 1, tree consists of root
104 node only */
105 tree_level height;
108 * this is cached here avoid calling plugins through function
109 * dereference all the time.
111 __u64 estimate_one_insert;
113 /* cache of recent tree lookup results */
114 cbk_cache cbk_cache;
116 /* hash table to look up znodes by block number. */
117 z_hash_table zhash_table;
118 z_hash_table zfake_table;
119 /* hash table to look up jnodes by inode and offset. */
120 j_hash_table jhash_table;
122 /* lock protecting:
123 - parent pointers,
124 - sibling pointers,
125 - znode hash table
126 - coord cache
128 /* NOTE: The "giant" tree lock can be replaced by more spin locks,
129 hoping they will be less contented. We can use one spin lock per one
130 znode hash bucket. With adding of some code complexity, sibling
131 pointers can be protected by both znode spin locks. However it looks
132 more SMP scalable we should test this locking change on n-ways (n >
133 4) SMP machines. Current 4-ways machine test does not show that tree
134 lock is contented and it is a bottleneck (2003.07.25). */
136 rwlock_t tree_lock;
138 /* lock protecting delimiting keys */
139 rwlock_t dk_lock;
141 /* spin lock protecting znode_epoch */
142 spinlock_t epoch_lock;
143 /* version stamp used to mark znode updates. See seal.[ch] for more
144 * information. */
145 __u64 znode_epoch;
147 znode *uber;
148 node_plugin *nplug;
149 struct super_block *super;
150 struct {
151 /* carry flags used for insertion of new nodes */
152 __u32 new_node_flags;
153 /* carry flags used for insertion of new extents */
154 __u32 new_extent_flags;
155 /* carry flags used for paste operations */
156 __u32 paste_flags;
157 /* carry flags used for insert operations */
158 __u32 insert_flags;
159 } carry;
162 extern int reiser4_init_tree(reiser4_tree * tree,
163 const reiser4_block_nr * root_block,
164 tree_level height, node_plugin * default_plugin);
165 extern void reiser4_done_tree(reiser4_tree * tree);
167 /* cbk flags: options for coord_by_key() */
168 typedef enum {
169 /* coord_by_key() is called for insertion. This is necessary because
170 of extents being located at the twig level. For explanation, see
171 comment just above is_next_item_internal().
173 CBK_FOR_INSERT = (1 << 0),
174 /* coord_by_key() is called with key that is known to be unique */
175 CBK_UNIQUE = (1 << 1),
176 /* coord_by_key() can trust delimiting keys. This options is not user
177 accessible. coord_by_key() will set it automatically. It will be
178 only cleared by special-case in extents-on-the-twig-level handling
179 where it is necessary to insert item with a key smaller than
180 leftmost key in a node. This is necessary because of extents being
181 located at the twig level. For explanation, see comment just above
182 is_next_item_internal().
184 CBK_TRUST_DK = (1 << 2),
185 CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
186 CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
187 CBK_DKSET = (1 << 5),
188 CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
189 CBK_IN_CACHE = (1 << 7), /* node is already in cache */
190 CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock in stead of long term
191 * lock */
192 } cbk_flags;
194 /* insertion outcome. IBK = insert by key */
195 typedef enum {
196 IBK_INSERT_OK = 0,
197 IBK_ALREADY_EXISTS = -EEXIST,
198 IBK_IO_ERROR = -EIO,
199 IBK_NO_SPACE = -E_NODE_FULL,
200 IBK_OOM = -ENOMEM
201 } insert_result;
203 #define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
205 typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
206 lock_handle * lh, void *arg);
207 extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
208 lock_handle * lh,
209 tree_iterate_actor_t actor, void *arg,
210 znode_lock_mode mode, int through_units_p);
211 extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
212 znode_lock_request pri, lock_handle * lh);
214 /* return node plugin of @node */
215 static inline node_plugin *node_plugin_by_node(const znode *
216 node /* node to query */ )
218 assert("vs-213", node != NULL);
219 assert("vs-214", znode_is_loaded(node));
221 return node->nplug;
224 /* number of items in @node */
225 static inline pos_in_node_t node_num_items(const znode * node)
227 assert("nikita-2754", znode_is_loaded(node));
228 assert("nikita-2468",
229 node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
231 return node->nr_items;
234 /* Return the number of items at the present node. Asserts coord->node !=
235 NULL. */
236 static inline unsigned coord_num_items(const coord_t * coord)
238 assert("jmacd-9805", coord->node != NULL);
240 return node_num_items(coord->node);
243 /* true if @node is empty */
244 static inline int node_is_empty(const znode * node)
246 return node_num_items(node) == 0;
249 typedef enum {
250 SHIFTED_SOMETHING = 0,
251 SHIFT_NO_SPACE = -E_NODE_FULL,
252 SHIFT_IO_ERROR = -EIO,
253 SHIFT_OOM = -ENOMEM,
254 } shift_result;
256 extern node_plugin *node_plugin_by_coord(const coord_t * coord);
257 extern int is_coord_in_node(const coord_t * coord);
258 extern int key_in_node(const reiser4_key *, const coord_t *);
259 extern void coord_item_move_to(coord_t * coord, int items);
260 extern void coord_unit_move_to(coord_t * coord, int units);
262 /* there are two types of repetitive accesses (ra): intra-syscall
263 (local) and inter-syscall (global). Local ra is used when
264 during single syscall we add/delete several items and units in the
265 same place in a tree. Note that plan-A fragments local ra by
266 separating stat-data and file body in key-space. Global ra is
267 used when user does repetitive modifications in the same place in a
268 tree.
270 Our ra implementation serves following purposes:
271 1 it affects balancing decisions so that next operation in a row
272 can be performed faster;
273 2 it affects lower-level read-ahead in page-cache;
274 3 it allows to avoid unnecessary lookups by maintaining some state
275 across several operations (this is only for local ra);
276 4 it leaves room for lazy-micro-balancing: when we start a sequence of
277 operations they are performed without actually doing any intra-node
278 shifts, until we finish sequence or scope of sequence leaves
279 current node, only then we really pack node (local ra only).
282 /* another thing that can be useful is to keep per-tree and/or
283 per-process cache of recent lookups. This cache can be organised as a
284 list of block numbers of formatted nodes sorted by starting key in
285 this node. Balancings should invalidate appropriate parts of this
286 cache.
289 lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
290 coord_t * coord, lock_handle * handle,
291 znode_lock_mode lock, lookup_bias bias,
292 tree_level lock_level, tree_level stop_level,
293 __u32 flags, ra_info_t *);
295 lookup_result reiser4_object_lookup(struct inode *object,
296 const reiser4_key * key,
297 coord_t * coord,
298 lock_handle * lh,
299 znode_lock_mode lock_mode,
300 lookup_bias bias,
301 tree_level lock_level,
302 tree_level stop_level,
303 __u32 flags, ra_info_t * info);
305 insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
306 reiser4_item_data * data, coord_t * coord,
307 lock_handle * lh,
308 tree_level stop_level, __u32 flags);
309 insert_result insert_by_coord(coord_t * coord,
310 reiser4_item_data * data, const reiser4_key * key,
311 lock_handle * lh, __u32);
312 insert_result insert_extent_by_coord(coord_t * coord,
313 reiser4_item_data * data,
314 const reiser4_key * key, lock_handle * lh);
315 int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
316 const reiser4_key * to_key,
317 reiser4_key * smallest_removed);
318 int kill_node_content(coord_t * from, coord_t * to,
319 const reiser4_key * from_key, const reiser4_key * to_key,
320 reiser4_key * smallest_removed,
321 znode * locked_left_neighbor, struct inode *inode,
322 int truncate);
324 int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
325 reiser4_key * key, lock_handle * lh, cop_insert_flag);
326 int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
327 reiser4_item_data * data, unsigned);
328 int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
329 int find_new_child_ptr(znode * parent, znode * child, znode * left,
330 coord_t * result);
332 int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
333 int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
335 void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
337 extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
338 const reiser4_key *, reiser4_key *,
339 struct inode *, int, int *);
340 extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
341 const reiser4_key *, reiser4_key *,
342 struct inode *, int, int *);
343 extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
344 const reiser4_key * to, struct inode *, int);
346 extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
347 extern int check_tree_pointer(const coord_t * pointer, const znode * child);
348 extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
349 znode * left, coord_t * result);
350 extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
351 extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
352 znode * child);
353 extern znode *child_znode(const coord_t * in_parent, znode * parent,
354 int incore_p, int setup_dkeys_p);
356 extern int cbk_cache_init(cbk_cache * cache);
357 extern void cbk_cache_done(cbk_cache * cache);
358 extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
360 extern char *sprint_address(const reiser4_block_nr * block);
362 #if REISER4_DEBUG
363 extern void print_coord_content(const char *prefix, coord_t * p);
364 extern void reiser4_print_address(const char *prefix,
365 const reiser4_block_nr * block);
366 extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
367 __u32 flags);
368 extern void check_dkeys(znode *node);
369 #else
370 #define print_coord_content(p, c) noop
371 #define reiser4_print_address(p, b) noop
372 #endif
374 extern void forget_znode(lock_handle * handle);
375 extern int deallocate_znode(znode * node);
377 extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
379 /* struct used internally to pack all numerous arguments of tree lookup.
380 Used to avoid passing a lot of arguments to helper functions. */
381 typedef struct cbk_handle {
382 /* tree we are in */
383 reiser4_tree *tree;
384 /* key we are going after */
385 const reiser4_key *key;
386 /* coord we will store result in */
387 coord_t *coord;
388 /* type of lock to take on target node */
389 znode_lock_mode lock_mode;
390 /* lookup bias. See comments at the declaration of lookup_bias */
391 lookup_bias bias;
392 /* lock level: level starting from which tree traversal starts taking
393 * write locks. */
394 tree_level lock_level;
395 /* level where search will stop. Either item will be found between
396 lock_level and stop_level, or CBK_COORD_NOTFOUND will be
397 returned.
399 tree_level stop_level;
400 /* level we are currently at */
401 tree_level level;
402 /* block number of @active node. Tree traversal operates on two
403 nodes: active and parent. */
404 reiser4_block_nr block;
405 /* put here error message to be printed by caller */
406 const char *error;
407 /* result passed back to caller */
408 lookup_result result;
409 /* lock handles for active and parent */
410 lock_handle *parent_lh;
411 lock_handle *active_lh;
412 reiser4_key ld_key;
413 reiser4_key rd_key;
414 /* flags, passed to the cbk routine. Bits of this bitmask are defined
415 in tree.h:cbk_flags enum. */
416 __u32 flags;
417 ra_info_t *ra_info;
418 struct inode *object;
419 } cbk_handle;
421 extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
423 /* eottl.c */
424 extern int handle_eottl(cbk_handle *h, int *outcome);
426 int lookup_multikey(cbk_handle * handle, int nr_keys);
427 int lookup_couple(reiser4_tree * tree,
428 const reiser4_key * key1, const reiser4_key * key2,
429 coord_t * coord1, coord_t * coord2,
430 lock_handle * lh1, lock_handle * lh2,
431 znode_lock_mode lock_mode, lookup_bias bias,
432 tree_level lock_level, tree_level stop_level, __u32 flags,
433 int *result1, int *result2);
435 static inline void read_lock_tree(reiser4_tree *tree)
437 /* check that tree is not locked */
438 assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
439 LOCK_CNT_NIL(read_locked_tree) &&
440 LOCK_CNT_NIL(write_locked_tree)));
441 /* check that spinlocks of lower priorities are not held */
442 assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
443 LOCK_CNT_NIL(rw_locked_dk) &&
444 LOCK_CNT_NIL(spin_locked_stack)));
446 read_lock(&(tree->tree_lock));
448 LOCK_CNT_INC(read_locked_tree);
449 LOCK_CNT_INC(rw_locked_tree);
450 LOCK_CNT_INC(spin_locked);
453 static inline void read_unlock_tree(reiser4_tree *tree)
455 assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
456 assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
457 assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
459 LOCK_CNT_DEC(read_locked_tree);
460 LOCK_CNT_DEC(rw_locked_tree);
461 LOCK_CNT_DEC(spin_locked);
463 read_unlock(&(tree->tree_lock));
466 static inline void write_lock_tree(reiser4_tree *tree)
468 /* check that tree is not locked */
469 assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
470 LOCK_CNT_NIL(read_locked_tree) &&
471 LOCK_CNT_NIL(write_locked_tree)));
472 /* check that spinlocks of lower priorities are not held */
473 assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
474 LOCK_CNT_NIL(rw_locked_dk) &&
475 LOCK_CNT_NIL(spin_locked_stack)));
477 write_lock(&(tree->tree_lock));
479 LOCK_CNT_INC(write_locked_tree);
480 LOCK_CNT_INC(rw_locked_tree);
481 LOCK_CNT_INC(spin_locked);
484 static inline void write_unlock_tree(reiser4_tree *tree)
486 assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
487 assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
488 assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
490 LOCK_CNT_DEC(write_locked_tree);
491 LOCK_CNT_DEC(rw_locked_tree);
492 LOCK_CNT_DEC(spin_locked);
494 write_unlock(&(tree->tree_lock));
497 static inline void read_lock_dk(reiser4_tree *tree)
499 /* check that dk is not locked */
500 assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
501 LOCK_CNT_NIL(read_locked_dk) &&
502 LOCK_CNT_NIL(write_locked_dk)));
503 /* check that spinlocks of lower priorities are not held */
504 assert("", LOCK_CNT_NIL(spin_locked_stack));
506 read_lock(&((tree)->dk_lock));
508 LOCK_CNT_INC(read_locked_dk);
509 LOCK_CNT_INC(rw_locked_dk);
510 LOCK_CNT_INC(spin_locked);
513 static inline void read_unlock_dk(reiser4_tree *tree)
515 assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
516 assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
517 assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
519 LOCK_CNT_DEC(read_locked_dk);
520 LOCK_CNT_DEC(rw_locked_dk);
521 LOCK_CNT_DEC(spin_locked);
523 read_unlock(&(tree->dk_lock));
526 static inline void write_lock_dk(reiser4_tree *tree)
528 /* check that dk is not locked */
529 assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
530 LOCK_CNT_NIL(read_locked_dk) &&
531 LOCK_CNT_NIL(write_locked_dk)));
532 /* check that spinlocks of lower priorities are not held */
533 assert("", LOCK_CNT_NIL(spin_locked_stack));
535 write_lock(&((tree)->dk_lock));
537 LOCK_CNT_INC(write_locked_dk);
538 LOCK_CNT_INC(rw_locked_dk);
539 LOCK_CNT_INC(spin_locked);
542 static inline void write_unlock_dk(reiser4_tree *tree)
544 assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
545 assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
546 assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
548 LOCK_CNT_DEC(write_locked_dk);
549 LOCK_CNT_DEC(rw_locked_dk);
550 LOCK_CNT_DEC(spin_locked);
552 write_unlock(&(tree->dk_lock));
555 /* estimate api. Implementation is in estimate.c */
556 reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
557 reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
558 reiser4_block_nr estimate_insert_flow(tree_level);
559 reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
560 reiser4_block_nr calc_estimate_one_insert(tree_level);
561 reiser4_block_nr estimate_dirty_cluster(struct inode *);
562 reiser4_block_nr estimate_insert_cluster(struct inode *);
563 reiser4_block_nr estimate_update_cluster(struct inode *);
565 /* __REISER4_TREE_H__ */
566 #endif
568 /* Make Linus happy.
569 Local variables:
570 c-indentation-style: "K&R"
571 mode-name: "LC"
572 c-basic-offset: 8
573 tab-width: 8
574 fill-column: 120
575 scroll-step: 1
576 End: