1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
10 #include "plugin/item/item.h"
11 #include "plugin/node/node.h"
12 #include "plugin/plugin.h"
15 #include "block_alloc.h"
16 #include "tree_walk.h"
22 #include <linux/slab.h>
24 static const char *bias_name(lookup_bias bias
);
26 /* tree searching algorithm, intranode searching algorithms are in
31 * The coord by key cache consists of small list of recently accessed nodes
32 * maintained according to the LRU discipline. Before doing real top-to-down
33 * tree traversal this cache is scanned for nodes that can contain key
36 * The efficiency of coord cache depends heavily on locality of reference for
37 * tree accesses. Our user level simulations show reasonably good hit ratios
38 * for coord cache under most loads so far.
41 /* Initialise coord cache slot */
42 static void cbk_cache_init_slot(cbk_cache_slot
*slot
)
44 assert("nikita-345", slot
!= NULL
);
46 INIT_LIST_HEAD(&slot
->lru
);
50 /* Initialize coord cache */
51 int cbk_cache_init(cbk_cache
*cache
/* cache to init */ )
55 assert("nikita-346", cache
!= NULL
);
58 kmalloc(sizeof(cbk_cache_slot
) * cache
->nr_slots
,
59 reiser4_ctx_gfp_mask_get());
60 if (cache
->slot
== NULL
)
61 return RETERR(-ENOMEM
);
63 INIT_LIST_HEAD(&cache
->lru
);
64 for (i
= 0; i
< cache
->nr_slots
; ++i
) {
65 cbk_cache_init_slot(cache
->slot
+ i
);
66 list_add_tail(&((cache
->slot
+ i
)->lru
), &cache
->lru
);
68 rwlock_init(&cache
->guard
);
72 /* free cbk cache data */
73 void cbk_cache_done(cbk_cache
* cache
/* cache to release */ )
75 assert("nikita-2493", cache
!= NULL
);
76 if (cache
->slot
!= NULL
) {
82 /* macro to iterate over all cbk cache slots */
83 #define for_all_slots(cache, slot) \
84 for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
85 &(cache)->lru != &(slot)->lru; \
86 (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
89 /* this function assures that [cbk-cache-invariant] invariant holds */
90 static int cbk_cache_invariant(const cbk_cache
*cache
)
96 if (cache
->nr_slots
== 0)
99 assert("nikita-2469", cache
!= NULL
);
102 read_lock(&((cbk_cache
*)cache
)->guard
);
103 for_all_slots(cache
, slot
) {
104 /* in LRU first go all `used' slots followed by `unused' */
105 if (unused
&& (slot
->node
!= NULL
))
107 if (slot
->node
== NULL
)
110 cbk_cache_slot
*scan
;
112 /* all cached nodes are different */
115 scan
= list_entry(scan
->lru
.next
, cbk_cache_slot
, lru
);
116 if (&cache
->lru
== &scan
->lru
)
118 if (slot
->node
== scan
->node
)
125 read_unlock(&((cbk_cache
*)cache
)->guard
);
131 /* Remove references, if any, to @node from coord cache */
132 void cbk_cache_invalidate(const znode
* node
/* node to remove from cache */ ,
133 reiser4_tree
* tree
/* tree to remove node from */ )
135 cbk_cache_slot
*slot
;
139 assert("nikita-350", node
!= NULL
);
140 assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree
));
142 cache
= &tree
->cbk_cache
;
143 assert("nikita-2470", cbk_cache_invariant(cache
));
145 write_lock(&(cache
->guard
));
146 for (i
= 0, slot
= cache
->slot
; i
< cache
->nr_slots
; ++i
, ++slot
) {
147 if (slot
->node
== node
) {
148 list_move_tail(&slot
->lru
, &cache
->lru
);
153 write_unlock(&(cache
->guard
));
154 assert("nikita-2471", cbk_cache_invariant(cache
));
157 /* add to the cbk-cache in the "tree" information about "node". This
158 can actually be update of existing slot in a cache. */
159 static void cbk_cache_add(const znode
*node
/* node to add to the cache */ )
162 cbk_cache_slot
*slot
;
165 assert("nikita-352", node
!= NULL
);
167 cache
= &znode_get_tree(node
)->cbk_cache
;
168 assert("nikita-2472", cbk_cache_invariant(cache
));
170 if (cache
->nr_slots
== 0)
173 write_lock(&(cache
->guard
));
174 /* find slot to update/add */
175 for (i
= 0, slot
= cache
->slot
; i
< cache
->nr_slots
; ++i
, ++slot
) {
176 /* oops, this node is already in a cache */
177 if (slot
->node
== node
)
180 /* if all slots are used, reuse least recently used one */
181 if (i
== cache
->nr_slots
) {
182 slot
= list_entry(cache
->lru
.prev
, cbk_cache_slot
, lru
);
183 slot
->node
= (znode
*) node
;
185 list_move(&slot
->lru
, &cache
->lru
);
186 write_unlock(&(cache
->guard
));
187 assert("nikita-2473", cbk_cache_invariant(cache
));
190 static int setup_delimiting_keys(cbk_handle
* h
);
191 static lookup_result
coord_by_handle(cbk_handle
* handle
);
192 static lookup_result
traverse_tree(cbk_handle
* h
);
193 static int cbk_cache_search(cbk_handle
* h
);
195 static level_lookup_result
cbk_level_lookup(cbk_handle
* h
);
196 static level_lookup_result
cbk_node_lookup(cbk_handle
* h
);
198 /* helper functions */
200 static void update_stale_dk(reiser4_tree
* tree
, znode
* node
);
202 /* release parent node during traversal */
203 static void put_parent(cbk_handle
* h
);
204 /* check consistency of fields */
205 static int sanity_check(cbk_handle
* h
);
206 /* release resources in handle */
207 static void hput(cbk_handle
* h
);
209 static level_lookup_result
search_to_left(cbk_handle
* h
);
211 /* pack numerous (numberous I should say) arguments of coord_by_key() into
213 static cbk_handle
*cbk_pack(cbk_handle
* handle
,
215 const reiser4_key
* key
,
217 lock_handle
* active_lh
,
218 lock_handle
* parent_lh
,
219 znode_lock_mode lock_mode
,
221 tree_level lock_level
,
222 tree_level stop_level
,
223 __u32 flags
, ra_info_t
* info
)
225 memset(handle
, 0, sizeof *handle
);
229 handle
->lock_mode
= lock_mode
;
231 handle
->lock_level
= lock_level
;
232 handle
->stop_level
= stop_level
;
233 handle
->coord
= coord
;
234 /* set flags. See comment in tree.h:cbk_flags */
235 handle
->flags
= flags
| CBK_TRUST_DK
| CBK_USE_CRABLOCK
;
237 handle
->active_lh
= active_lh
;
238 handle
->parent_lh
= parent_lh
;
239 handle
->ra_info
= info
;
243 /* main tree lookup procedure
245 Check coord cache. If key we are looking for is not found there, call cbk()
246 to do real tree traversal.
248 As we have extents on the twig level, @lock_level and @stop_level can
249 be different from LEAF_LEVEL and each other.
251 Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
252 long term locks) while calling this.
254 lookup_result
coord_by_key(reiser4_tree
* tree
/* tree to perform search
255 * in. Usually this tree is
256 * part of file-system
258 const reiser4_key
* key
/* key to look for */ ,
259 coord_t
* coord
/* where to store found
260 * position in a tree. Fields
261 * in "coord" are only valid if
262 * coord_by_key() returned
263 * "CBK_COORD_FOUND" */ ,
264 lock_handle
* lh
, /* resulting lock handle */
265 znode_lock_mode lock_mode
/* type of lookup we
267 * ZNODE_READ_LOCK here
268 * if you only want to
269 * read item found and
270 * ZNODE_WRITE_LOCK if
273 lookup_bias bias
/* what to return if coord
274 * with exactly the @key is
275 * not in the tree */ ,
276 tree_level lock_level
/* tree level where to start
277 * taking @lock type of
279 tree_level stop_level
/* tree level to stop. Pass
280 * LEAF_LEVEL or TWIG_LEVEL
281 * here Item being looked
282 * for has to be between
284 * @stop_level, inclusive */ ,
285 __u32 flags
/* search flags */ ,
288 /* information about desired tree traversal readahead */
292 lock_handle parent_lh
;
293 lookup_result result
;
298 assert("nikita-3023", reiser4_schedulable());
300 assert("nikita-353", tree
!= NULL
);
301 assert("nikita-354", key
!= NULL
);
302 assert("nikita-355", coord
!= NULL
);
303 assert("nikita-356", (bias
== FIND_EXACT
)
304 || (bias
== FIND_MAX_NOT_MORE_THAN
));
305 assert("nikita-357", stop_level
>= LEAF_LEVEL
);
306 /* no locks can be held during tree traversal */
307 assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
315 lock_mode
, bias
, lock_level
, stop_level
, flags
, info
);
317 result
= coord_by_handle(&handle
);
318 assert("nikita-3247",
319 ergo(!IS_CBKERR(result
), coord
->node
== lh
->node
));
323 /* like coord_by_key(), but starts traversal from vroot of @object rather than
325 lookup_result
reiser4_object_lookup(struct inode
* object
,
326 const reiser4_key
* key
,
329 znode_lock_mode lock_mode
,
331 tree_level lock_level
,
332 tree_level stop_level
, __u32 flags
,
336 lock_handle parent_lh
;
337 lookup_result result
;
342 assert("nikita-3023", reiser4_schedulable());
344 assert("nikita-354", key
!= NULL
);
345 assert("nikita-355", coord
!= NULL
);
346 assert("nikita-356", (bias
== FIND_EXACT
)
347 || (bias
== FIND_MAX_NOT_MORE_THAN
));
348 assert("nikita-357", stop_level
>= LEAF_LEVEL
);
349 /* no locks can be held during tree search by key */
350 assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
353 object
!= NULL
? reiser4_tree_by_inode(object
) : current_tree
,
358 lock_mode
, bias
, lock_level
, stop_level
, flags
, info
);
359 handle
.object
= object
;
361 result
= coord_by_handle(&handle
);
362 assert("nikita-3247",
363 ergo(!IS_CBKERR(result
), coord
->node
== lh
->node
));
367 /* lookup by cbk_handle. Common part of coord_by_key() and
368 reiser4_object_lookup(). */
369 static lookup_result
coord_by_handle(cbk_handle
* handle
)
372 * first check cbk_cache (which is look-aside cache for our tree) and
373 * of this fails, start traversal.
375 /* first check whether "key" is in cache of recent lookups. */
376 if (cbk_cache_search(handle
) == 0)
377 return handle
->result
;
379 return traverse_tree(handle
);
382 /* Execute actor for each item (or unit, depending on @through_units_p),
383 starting from @coord, right-ward, until either:
385 - end of the tree is reached
386 - unformatted node is met
388 - @actor returns 0 or less
390 Error code, or last actor return value is returned.
392 This is used by plugin/dir/hashe_dir.c:reiser4_find_entry() to move through
393 sequence of entries with identical keys and alikes.
395 int reiser4_iterate_tree(reiser4_tree
* tree
/* tree to scan */ ,
396 coord_t
* coord
/* coord to start from */ ,
397 lock_handle
* lh
/* lock handle to start with and to
398 * update along the way */ ,
399 tree_iterate_actor_t actor
/* function to call on each
401 void *arg
/* argument to pass to @actor */ ,
402 znode_lock_mode mode
/* lock mode on scanned nodes */ ,
403 int through_units_p
/* call @actor on each item or on
408 assert("nikita-1143", tree
!= NULL
);
409 assert("nikita-1145", coord
!= NULL
);
410 assert("nikita-1146", lh
!= NULL
);
411 assert("nikita-1147", actor
!= NULL
);
413 result
= zload(coord
->node
);
414 coord_clear_iplug(coord
);
417 if (!coord_is_existing_unit(coord
)) {
421 while ((result
= actor(tree
, coord
, lh
, arg
)) > 0) {
423 if ((through_units_p
&& coord_next_unit(coord
)) ||
424 (!through_units_p
&& coord_next_item(coord
))) {
428 /* move to the next node */
431 reiser4_get_right_neighbor(&couple
,
434 GN_CAN_USE_UPPER_LEVELS
);
438 result
= zload(couple
.node
);
444 coord_init_first_unit(coord
,
447 move_lh(lh
, &couple
);
450 } while (node_is_empty(coord
->node
));
453 assert("nikita-1149", coord_is_existing_unit(coord
));
459 /* return locked uber znode for @tree */
460 int get_uber_znode(reiser4_tree
* tree
, znode_lock_mode mode
,
461 znode_lock_request pri
, lock_handle
* lh
)
465 result
= longterm_lock_znode(lh
, tree
->uber
, mode
, pri
);
469 /* true if @key is strictly within @node
471 we are looking for possibly non-unique key and it is item is at the edge of
472 @node. May be it is in the neighbor.
474 static int znode_contains_key_strict(znode
* node
/* node to check key
477 key
/* key to check */ ,
482 assert("nikita-1760", node
!= NULL
);
483 assert("nikita-1722", key
!= NULL
);
485 if (keyge(key
, &node
->rd_key
))
488 answer
= keycmp(&node
->ld_key
, key
);
491 return answer
!= GREATER_THAN
;
493 return answer
== LESS_THAN
;
497 * Virtual Root (vroot) code.
499 * For given file system object (e.g., regular file or directory) let's
500 * define its "virtual root" as lowest in the tree (that is, furtherest
501 * from the tree root) node such that all body items of said object are
502 * located in a tree rooted at this node.
504 * Once vroot of object is found all tree lookups for items within body of
505 * this object ("object lookups") can be started from its vroot rather
506 * than from real root. This has following advantages:
508 * 1. amount of nodes traversed during lookup (and, hence, amount of
509 * key comparisons made) decreases, and
511 * 2. contention on tree root is decreased. This latter was actually
512 * motivating reason behind vroot, because spin lock of root node,
513 * which is taken when acquiring long-term lock on root node is the
514 * hottest lock in the reiser4.
518 * When vroot of object F is not yet determined, all object lookups start
519 * from the root of the tree. At each tree level during traversal we have
520 * a node N such that a key we are looking for (which is the key inside
521 * object's body) is located within N. In function handle_vroot() called
522 * from cbk_level_lookup() we check whether N is possible vroot for
523 * F. Check is trivial---if neither leftmost nor rightmost item of N
524 * belongs to F (and we already have helpful ->owns_item() method of
525 * object plugin for this), then N is possible vroot of F. This, of
526 * course, relies on the assumption that each object occupies contiguous
527 * range of keys in the tree.
529 * Thus, traversing tree downward and checking each node as we go, we can
530 * find lowest such node, which, by definition, is vroot.
532 * How to track vroot.
534 * Nohow. If actual vroot changes, next object lookup will just restart
535 * from the actual tree root, refreshing object's vroot along the way.
540 * Check whether @node is possible vroot of @object.
542 static void handle_vroot(struct inode
*object
, znode
* node
)
547 fplug
= inode_file_plugin(object
);
548 assert("nikita-3353", fplug
!= NULL
);
549 assert("nikita-3354", fplug
->owns_item
!= NULL
);
551 if (unlikely(node_is_empty(node
)))
554 coord_init_first_unit(&coord
, node
);
556 * if leftmost item of @node belongs to @object, we cannot be sure
557 * that @node is vroot of @object, because, some items of @object are
558 * probably in the sub-tree rooted at the left neighbor of @node.
560 if (fplug
->owns_item(object
, &coord
))
562 coord_init_last_unit(&coord
, node
);
563 /* mutatis mutandis for the rightmost item */
564 if (fplug
->owns_item(object
, &coord
))
566 /* otherwise, @node is possible vroot of @object */
567 inode_set_vroot(object
, node
);
571 * helper function used by traverse tree to start tree traversal not from the
572 * tree root, but from @h->object's vroot, if possible.
574 static int prepare_object_lookup(cbk_handle
* h
)
579 vroot
= inode_get_vroot(h
->object
);
582 * object doesn't have known vroot, start from real tree root.
587 h
->level
= znode_get_level(vroot
);
588 /* take a long-term lock on vroot */
589 h
->result
= longterm_lock_znode(h
->active_lh
, vroot
,
590 cbk_lock_mode(h
->level
, h
),
592 result
= LOOKUP_REST
;
593 if (h
->result
== 0) {
597 isunique
= h
->flags
& CBK_UNIQUE
;
598 /* check that key is inside vroot */
599 read_lock_dk(h
->tree
);
600 inside
= (znode_contains_key_strict(vroot
, h
->key
, isunique
) &&
601 !ZF_ISSET(vroot
, JNODE_HEARD_BANSHEE
));
602 read_unlock_dk(h
->tree
);
604 h
->result
= zload(vroot
);
605 if (h
->result
== 0) {
606 /* search for key in vroot. */
607 result
= cbk_node_lookup(h
);
608 zrelse(vroot
); /*h->active_lh->node); */
609 if (h
->active_lh
->node
!= vroot
) {
610 result
= LOOKUP_REST
;
611 } else if (result
== LOOKUP_CONT
) {
612 move_lh(h
->parent_lh
, h
->active_lh
);
613 h
->flags
&= ~CBK_DKSET
;
621 if (IS_CBKERR(h
->result
) || result
== LOOKUP_REST
)
626 /* main function that handles common parts of tree traversal: starting
627 (fake znode handling), restarts, error handling, completion */
628 static lookup_result
traverse_tree(cbk_handle
* h
/* search handle */ )
634 assert("nikita-365", h
!= NULL
);
635 assert("nikita-366", h
->tree
!= NULL
);
636 assert("nikita-367", h
->key
!= NULL
);
637 assert("nikita-368", h
->coord
!= NULL
);
638 assert("nikita-369", (h
->bias
== FIND_EXACT
)
639 || (h
->bias
== FIND_MAX_NOT_MORE_THAN
));
640 assert("nikita-370", h
->stop_level
>= LEAF_LEVEL
);
641 assert("nikita-2949", !(h
->flags
& CBK_DKSET
));
642 assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
648 /* loop for restarts */
651 assert("nikita-3024", reiser4_schedulable());
653 h
->result
= CBK_COORD_FOUND
;
654 /* connect_znode() needs it */
655 h
->ld_key
= *reiser4_min_key();
656 h
->rd_key
= *reiser4_max_key();
657 h
->flags
|= CBK_DKSET
;
660 if (!vroot_used
&& h
->object
!= NULL
) {
662 done
= prepare_object_lookup(h
);
663 if (done
== LOOKUP_REST
) {
665 } else if (done
== LOOKUP_DONE
)
668 if (h
->parent_lh
->node
== NULL
) {
670 get_uber_znode(h
->tree
, ZNODE_READ_LOCK
, ZNODE_LOCK_LOPRI
,
673 assert("nikita-1637", done
!= -E_DEADLOCK
);
675 h
->block
= h
->tree
->root_block
;
676 h
->level
= h
->tree
->height
;
677 h
->coord
->node
= h
->parent_lh
->node
;
683 /* loop descending a tree */
686 if (unlikely((iterations
> REISER4_CBK_ITERATIONS_LIMIT
) &&
687 IS_POW(iterations
))) {
688 warning("nikita-1481", "Too many iterations: %i",
690 reiser4_print_key("key", h
->key
);
692 } else if (unlikely(iterations
> REISER4_MAX_CBK_ITERATIONS
)) {
694 "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
695 h
->result
= RETERR(-EIO
);
698 switch (cbk_level_lookup(h
)) {
700 move_lh(h
->parent_lh
, h
->active_lh
);
703 wrong_return_value("nikita-372", "cbk_level");
709 /* deadlock avoidance is normal case. */
710 if (h
->result
!= -E_DEADLOCK
)
712 reiser4_preempt_point();
716 /* that's all. The rest is error handling */
717 if (unlikely(h
->error
!= NULL
)) {
718 warning("nikita-373", "%s: level: %i, "
719 "lock_level: %i, stop_level: %i "
720 "lock_mode: %s, bias: %s",
721 h
->error
, h
->level
, h
->lock_level
, h
->stop_level
,
722 lock_mode_name(h
->lock_mode
), bias_name(h
->bias
));
723 reiser4_print_address("block", &h
->block
);
724 reiser4_print_key("key", h
->key
);
725 print_coord_content("coord", h
->coord
);
727 /* `unlikely' error case */
728 if (unlikely(IS_CBKERR(h
->result
))) {
729 /* failure. do cleanup */
732 assert("nikita-1605", WITH_DATA_RET
734 ergo((h
->result
== CBK_COORD_FOUND
) &&
735 (h
->bias
== FIND_EXACT
) &&
736 (!node_is_empty(h
->coord
->node
)),
737 coord_is_existing_item(h
->coord
))));
742 /* find delimiting keys of child
744 Determine left and right delimiting keys for child pointed to by
748 static void find_child_delimiting_keys(znode
* parent
/* parent znode, passed
750 const coord_t
* parent_coord
/* coord where
754 reiser4_key
* ld
/* where to store left
755 * delimiting key */ ,
756 reiser4_key
* rd
/* where to store right
757 * delimiting key */ )
761 assert("nikita-1484", parent
!= NULL
);
762 assert_rw_locked(&(znode_get_tree(parent
)->dk_lock
));
764 coord_dup(&neighbor
, parent_coord
);
766 if (neighbor
.between
== AT_UNIT
)
767 /* imitate item ->lookup() behavior. */
768 neighbor
.between
= AFTER_UNIT
;
770 if (coord_set_to_left(&neighbor
) == 0)
771 unit_key_by_coord(&neighbor
, ld
);
773 assert("nikita-14851", 0);
774 *ld
= *znode_get_ld_key(parent
);
777 coord_dup(&neighbor
, parent_coord
);
778 if (neighbor
.between
== AT_UNIT
)
779 neighbor
.between
= AFTER_UNIT
;
780 if (coord_set_to_right(&neighbor
) == 0)
781 unit_key_by_coord(&neighbor
, rd
);
783 *rd
= *znode_get_rd_key(parent
);
787 * setup delimiting keys for a child
789 * @parent parent node
791 * @coord location in @parent where pointer to @child is
796 set_child_delimiting_keys(znode
* parent
, const coord_t
* coord
, znode
* child
)
800 assert("nikita-2952",
801 znode_get_level(parent
) == znode_get_level(coord
->node
));
803 /* fast check without taking dk lock. This is safe, because
804 * JNODE_DKSET is never cleared once set. */
805 if (!ZF_ISSET(child
, JNODE_DKSET
)) {
806 tree
= znode_get_tree(parent
);
808 if (likely(!ZF_ISSET(child
, JNODE_DKSET
))) {
809 find_child_delimiting_keys(parent
, coord
,
812 ON_DEBUG(child
->ld_key_version
=
813 atomic_inc_return(&delim_key_version
);
814 child
->rd_key_version
=
815 atomic_inc_return(&delim_key_version
););
816 ZF_SET(child
, JNODE_DKSET
);
818 write_unlock_dk(tree
);
824 /* Perform tree lookup at one level. This is called from cbk_traverse()
825 function that drives lookup through tree and calls cbk_node_lookup() to
826 perform lookup within one node.
828 See comments in a code.
830 static level_lookup_result
cbk_level_lookup(cbk_handle
* h
/* search handle */ )
839 assert("nikita-3025", reiser4_schedulable());
841 /* acquire reference to @active node */
843 zget(h
->tree
, &h
->block
, h
->parent_lh
->node
, h
->level
,
844 reiser4_ctx_gfp_mask_get());
846 if (IS_ERR(active
)) {
847 h
->result
= PTR_ERR(active
);
852 h
->result
= longterm_lock_znode(h
->active_lh
,
854 cbk_lock_mode(h
->level
, h
),
856 /* longterm_lock_znode() acquires additional reference to znode (which
857 will be later released by longterm_unlock_znode()). Release
858 reference acquired by zget().
861 if (unlikely(h
->result
!= 0))
862 goto fail_or_restart
;
865 /* if @active is accessed for the first time, setup delimiting keys on
866 it. Delimiting keys are taken from the parent node. See
867 setup_delimiting_keys() for details.
869 if (h
->flags
& CBK_DKSET
) {
870 setdk
= setup_delimiting_keys(h
);
871 h
->flags
&= ~CBK_DKSET
;
875 parent
= h
->parent_lh
->node
;
876 h
->result
= zload(parent
);
877 if (unlikely(h
->result
!= 0))
878 goto fail_or_restart
;
880 if (!ZF_ISSET(active
, JNODE_DKSET
))
881 setdk
= set_child_delimiting_keys(parent
,
884 read_lock_dk(h
->tree
);
885 find_child_delimiting_keys(parent
, h
->coord
, &ldkey
,
887 read_unlock_dk(h
->tree
);
893 /* this is ugly kludge. Reminder: this is necessary, because
894 ->lookup() method returns coord with ->between field probably set
895 to something different from AT_UNIT.
897 h
->coord
->between
= AT_UNIT
;
899 if (znode_just_created(active
) && (h
->coord
->node
!= NULL
)) {
900 write_lock_tree(h
->tree
);
901 /* if we are going to load znode right now, setup
902 ->in_parent: coord where pointer to this node is stored in
905 coord_to_parent_coord(h
->coord
, &active
->in_parent
);
906 write_unlock_tree(h
->tree
);
909 /* check connectedness without holding tree lock---false negatives
910 * will be re-checked by connect_znode(), and false positives are
911 * impossible---@active cannot suddenly turn into unconnected
913 if (!znode_is_connected(active
)) {
914 h
->result
= connect_znode(h
->coord
, active
);
915 if (unlikely(h
->result
!= 0)) {
917 goto fail_or_restart
;
921 jload_prefetch(ZJNODE(active
));
924 update_stale_dk(h
->tree
, active
);
926 /* put_parent() cannot be called earlier, because connect_znode()
927 assumes parent node is referenced; */
930 if ((!znode_contains_key_lock(active
, h
->key
) &&
931 (h
->flags
& CBK_TRUST_DK
))
932 || ZF_ISSET(active
, JNODE_HEARD_BANSHEE
)) {
933 /* 1. key was moved out of this node while this thread was
934 waiting for the lock. Restart. More elaborate solution is
935 to determine where key moved (to the left, or to the right)
936 and try to follow it through sibling pointers.
938 2. or, node itself is going to be removed from the
939 tree. Release lock and restart.
941 h
->result
= -E_REPEAT
;
943 if (h
->result
== -E_REPEAT
)
946 h
->result
= zload_ra(active
, h
->ra_info
);
952 if (sanity_check(h
)) {
957 /* check that key of leftmost item in the @active is the same as in
959 if (ldkeyset
&& !node_is_empty(active
) &&
960 !keyeq(leftmost_key_in_node(active
, &key
), &ldkey
)) {
961 warning("vs-3533", "Keys are inconsistent. Fsck?");
962 reiser4_print_key("inparent", &ldkey
);
963 reiser4_print_key("inchild", &key
);
964 h
->result
= RETERR(-EIO
);
969 if (h
->object
!= NULL
)
970 handle_vroot(h
->object
, active
);
972 ret
= cbk_node_lookup(h
);
974 /* h->active_lh->node might change, but active is yet to be zrelsed */
980 if (h
->result
== -E_DEADLOCK
)
986 /* check left and right delimiting keys of a znode */
987 void check_dkeys(znode
* node
)
992 read_lock_tree(current_tree
);
993 read_lock_dk(current_tree
);
995 assert("vs-1710", znode_is_any_locked(node
));
997 !keygt(znode_get_ld_key(node
), znode_get_rd_key(node
)));
1000 right
= node
->right
;
1002 if (ZF_ISSET(node
, JNODE_LEFT_CONNECTED
) && ZF_ISSET(node
, JNODE_DKSET
)
1003 && left
!= NULL
&& ZF_ISSET(left
, JNODE_DKSET
))
1004 /* check left neighbor. Note that left neighbor is not locked,
1005 so it might get wrong delimiting keys therefore */
1007 (keyeq(znode_get_rd_key(left
), znode_get_ld_key(node
))
1008 || ZF_ISSET(left
, JNODE_HEARD_BANSHEE
)));
1010 if (ZF_ISSET(node
, JNODE_RIGHT_CONNECTED
) && ZF_ISSET(node
, JNODE_DKSET
)
1011 && right
!= NULL
&& ZF_ISSET(right
, JNODE_DKSET
))
1012 /* check right neighbor. Note that right neighbor is not
1013 locked, so it might get wrong delimiting keys therefore */
1015 (keyeq(znode_get_rd_key(node
), znode_get_ld_key(right
))
1016 || ZF_ISSET(right
, JNODE_HEARD_BANSHEE
)));
1018 read_unlock_dk(current_tree
);
1019 read_unlock_tree(current_tree
);
1023 /* true if @key is left delimiting key of @node */
1024 static int key_is_ld(znode
* node
, const reiser4_key
* key
)
1028 assert("nikita-1716", node
!= NULL
);
1029 assert("nikita-1758", key
!= NULL
);
1031 read_lock_dk(znode_get_tree(node
));
1032 assert("nikita-1759", znode_contains_key(node
, key
));
1033 ld
= keyeq(znode_get_ld_key(node
), key
);
1034 read_unlock_dk(znode_get_tree(node
));
1038 /* Process one node during tree traversal.
1040 This is called by cbk_level_lookup(). */
1041 static level_lookup_result
cbk_node_lookup(cbk_handle
* h
/* search handle */ )
1043 /* node plugin of @active */
1045 /* item plugin of item that was found */
1048 lookup_bias node_bias
;
1049 /* node we are operating upon */
1051 /* tree we are searching in */
1056 assert("nikita-379", h
!= NULL
);
1058 active
= h
->active_lh
->node
;
1061 nplug
= active
->nplug
;
1062 assert("nikita-380", nplug
!= NULL
);
1064 ON_DEBUG(check_dkeys(active
));
1066 /* return item from "active" node with maximal key not greater than
1068 node_bias
= h
->bias
;
1069 result
= nplug
->lookup(active
, h
->key
, node_bias
, h
->coord
);
1070 if (unlikely(result
!= NS_FOUND
&& result
!= NS_NOT_FOUND
)) {
1071 /* error occurred */
1075 if (h
->level
== h
->stop_level
) {
1076 /* welcome to the stop level */
1077 assert("nikita-381", h
->coord
->node
== active
);
1078 if (result
== NS_FOUND
) {
1079 /* success of tree lookup */
1080 if (!(h
->flags
& CBK_UNIQUE
)
1081 && key_is_ld(active
, h
->key
)) {
1082 return search_to_left(h
);
1084 h
->result
= CBK_COORD_FOUND
;
1086 h
->result
= CBK_COORD_NOTFOUND
;
1088 if (!(h
->flags
& CBK_IN_CACHE
))
1089 cbk_cache_add(active
);
1093 if (h
->level
> TWIG_LEVEL
&& result
== NS_NOT_FOUND
) {
1094 h
->error
= "not found on internal node";
1099 assert("vs-361", h
->level
> h
->stop_level
);
1101 if (handle_eottl(h
, &result
)) {
1102 assert("vs-1674", (result
== LOOKUP_DONE
||
1103 result
== LOOKUP_REST
));
1107 /* go down to next level */
1108 check_me("vs-12", zload(h
->coord
->node
) == 0);
1109 assert("nikita-2116", item_is_internal(h
->coord
));
1110 iplug
= item_plugin_by_coord(h
->coord
);
1111 iplug
->s
.internal
.down_link(h
->coord
, h
->key
, &h
->block
);
1112 zrelse(h
->coord
->node
);
1114 return LOOKUP_CONT
; /* continue */
1117 /* scan cbk_cache slots looking for a match for @h */
1118 static int cbk_cache_scan_slots(cbk_handle
* h
/* cbk handle */ )
1120 level_lookup_result llr
;
1123 cbk_cache_slot
*slot
;
1127 const reiser4_key
*key
;
1130 assert("nikita-1317", h
!= NULL
);
1131 assert("nikita-1315", h
->tree
!= NULL
);
1132 assert("nikita-1316", h
->key
!= NULL
);
1135 cache
= &tree
->cbk_cache
;
1136 if (cache
->nr_slots
== 0)
1137 /* size of cbk cache was set to 0 by mount time option. */
1138 return RETERR(-ENOENT
);
1140 assert("nikita-2474", cbk_cache_invariant(cache
));
1141 node
= NULL
; /* to keep gcc happy */
1144 isunique
= h
->flags
& CBK_UNIQUE
;
1145 result
= RETERR(-ENOENT
);
1148 * this is time-critical function and dragons had, hence, been settled
1151 * Loop below scans cbk cache slots trying to find matching node with
1152 * suitable range of delimiting keys and located at the h->level.
1154 * Scan is done under cbk cache spin lock that protects slot->node
1155 * pointers. If suitable node is found we want to pin it in
1156 * memory. But slot->node can point to the node with x_count 0
1157 * (unreferenced). Such node can be recycled at any moment, or can
1158 * already be in the process of being recycled (within jput()).
1160 * As we found node in the cbk cache, it means that jput() hasn't yet
1161 * called cbk_cache_invalidate().
1163 * We acquire reference to the node without holding tree lock, and
1164 * later, check node's RIP bit. This avoids races with jput().
1168 read_lock(&((cbk_cache
*)cache
)->guard
);
1170 slot
= list_entry(cache
->lru
.next
, cbk_cache_slot
, lru
);
1171 slot
= list_entry(slot
->lru
.prev
, cbk_cache_slot
, lru
);
1172 BUG_ON(&slot
->lru
!= &cache
->lru
);/*????*/
1175 slot
= list_entry(slot
->lru
.next
, cbk_cache_slot
, lru
);
1177 if (&cache
->lru
!= &slot
->lru
)
1182 if (unlikely(node
== NULL
))
1186 * this is (hopefully) the only place in the code where we are
1187 * working with delimiting keys without holding dk lock. This
1188 * is fine here, because this is only "guess" anyway---keys
1189 * are rechecked under dk lock below.
1191 if (znode_get_level(node
) == level
&&
1192 /* reiser4_min_key < key < reiser4_max_key */
1193 znode_contains_key_strict(node
, key
, isunique
)) {
1196 spin_lock_prefetch(&tree
->tree_lock
);
1200 read_unlock(&((cbk_cache
*)cache
)->guard
);
1202 assert("nikita-2475", cbk_cache_invariant(cache
));
1204 if (unlikely(result
== 0 && ZF_ISSET(node
, JNODE_RIP
)))
1210 h
->result
= CBK_COORD_NOTFOUND
;
1211 return RETERR(-ENOENT
);
1215 longterm_lock_znode(h
->active_lh
, node
, cbk_lock_mode(level
, h
),
1220 result
= zload(node
);
1226 result
= (znode_contains_key_strict(node
, key
, isunique
) &&
1227 !ZF_ISSET(node
, JNODE_HEARD_BANSHEE
));
1228 read_unlock_dk(tree
);
1230 /* do lookup inside node */
1231 llr
= cbk_node_lookup(h
);
1232 /* if cbk_node_lookup() wandered to another node (due to eottl
1233 or non-unique keys), adjust @node */
1234 /*node = h->active_lh->node; */
1236 if (llr
!= LOOKUP_DONE
) {
1237 /* restart or continue on the next level */
1238 result
= RETERR(-ENOENT
);
1239 } else if (IS_CBKERR(h
->result
))
1241 result
= RETERR(-ENOENT
);
1243 /* good. Either item found or definitely not found. */
1246 write_lock(&(cache
->guard
));
1247 if (slot
->node
== h
->active_lh
->node
/*node */ ) {
1248 /* if this node is still in cbk cache---move
1249 its slot to the head of the LRU list. */
1250 list_move(&slot
->lru
, &cache
->lru
);
1252 write_unlock(&(cache
->guard
));
1255 /* race. While this thread was waiting for the lock, node was
1256 rebalanced and item we are looking for, shifted out of it
1257 (if it ever was here).
1259 Continuing scanning is almost hopeless: node key range was
1260 moved to, is almost certainly at the beginning of the LRU
1261 list at this time, because it's hot, but restarting
1262 scanning from the very beginning is complex. Just return,
1263 so that cbk() will be performed. This is not that
1264 important, because such races should be rare. Are they?
1266 result
= RETERR(-ENOENT
); /* -ERAUGHT */
1269 assert("nikita-2476", cbk_cache_invariant(cache
));
1273 /* look for item with given key in the coord cache
1275 This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
1276 which is a small LRU list of znodes accessed lately. For each znode in
1277 znode in this list, it checks whether key we are looking for fits into key
1278 range covered by this node. If so, and in addition, node lies at allowed
1279 level (this is to handle extents on a twig level), node is locked, and
1280 lookup inside it is performed.
1282 we need a measurement of the cost of this cache search compared to the cost
1286 static int cbk_cache_search(cbk_handle
* h
/* cbk handle */ )
1291 /* add CBK_IN_CACHE to the handle flags. This means that
1292 * cbk_node_lookup() assumes that cbk_cache is scanned and would add
1293 * found node to the cache. */
1294 h
->flags
|= CBK_IN_CACHE
;
1295 for (level
= h
->stop_level
; level
<= h
->lock_level
; ++level
) {
1297 result
= cbk_cache_scan_slots(h
);
1299 done_lh(h
->active_lh
);
1300 done_lh(h
->parent_lh
);
1302 assert("nikita-1319", !IS_CBKERR(h
->result
));
1306 h
->flags
&= ~CBK_IN_CACHE
;
1310 /* type of lock we want to obtain during tree traversal. On stop level
1311 we want type of lock user asked for, on upper levels: read lock. */
1312 znode_lock_mode
cbk_lock_mode(tree_level level
, cbk_handle
* h
)
1314 assert("nikita-382", h
!= NULL
);
1316 return (level
<= h
->lock_level
) ? h
->lock_mode
: ZNODE_READ_LOCK
;
1319 /* update outdated delimiting keys */
1320 static void stale_dk(reiser4_tree
* tree
, znode
* node
)
1324 read_lock_tree(tree
);
1325 write_lock_dk(tree
);
1326 right
= node
->right
;
1328 if (ZF_ISSET(node
, JNODE_RIGHT_CONNECTED
) &&
1329 right
&& ZF_ISSET(right
, JNODE_DKSET
) &&
1330 !keyeq(znode_get_rd_key(node
), znode_get_ld_key(right
)))
1331 znode_set_rd_key(node
, znode_get_ld_key(right
));
1333 write_unlock_dk(tree
);
1334 read_unlock_tree(tree
);
1337 /* check for possibly outdated delimiting keys, and update them if
1339 static void update_stale_dk(reiser4_tree
* tree
, znode
* node
)
1344 read_lock_tree(tree
);
1346 rd
= *znode_get_rd_key(node
);
1347 right
= node
->right
;
1348 if (unlikely(ZF_ISSET(node
, JNODE_RIGHT_CONNECTED
) &&
1349 right
&& ZF_ISSET(right
, JNODE_DKSET
) &&
1350 !keyeq(&rd
, znode_get_ld_key(right
)))) {
1351 assert("nikita-38211", ZF_ISSET(node
, JNODE_DKSET
));
1352 read_unlock_dk(tree
);
1353 read_unlock_tree(tree
);
1354 stale_dk(tree
, node
);
1357 read_unlock_dk(tree
);
1358 read_unlock_tree(tree
);
1362 * handle searches a the non-unique key.
1364 * Suppose that we are looking for an item with possibly non-unique key 100.
1366 * Root node contains two pointers: one to a node with left delimiting key 0,
1367 * and another to a node with left delimiting key 100. Item we interested in
1368 * may well happen in the sub-tree rooted at the first pointer.
1370 * To handle this search_to_left() is called when search reaches stop
1371 * level. This function checks it is _possible_ that item we are looking for
1372 * is in the left neighbor (this can be done by comparing delimiting keys) and
1373 * if so, tries to lock left neighbor (this is low priority lock, so it can
1374 * deadlock, tree traversal is just restarted if it did) and then checks
1375 * whether left neighbor actually contains items with our key.
1377 * Note that this is done on the stop level only. It is possible to try such
1378 * left-check on each level, but as duplicate keys are supposed to be rare
1379 * (very unlikely that more than one node is completely filled with items with
1380 * duplicate keys), it sis cheaper to scan to the left on the stop level once.
1383 static level_lookup_result
search_to_left(cbk_handle
* h
/* search handle */ )
1385 level_lookup_result result
;
1392 assert("nikita-1761", h
!= NULL
);
1393 assert("nikita-1762", h
->level
== h
->stop_level
);
1397 node
= h
->active_lh
->node
;
1398 assert("nikita-1763", coord_is_leftmost_unit(coord
));
1401 reiser4_get_left_neighbor(&lh
, node
, (int)h
->lock_mode
,
1402 GN_CAN_USE_UPPER_LEVELS
);
1404 switch (h
->result
) {
1406 result
= LOOKUP_REST
;
1414 h
->result
= zload(neighbor
);
1415 if (h
->result
!= 0) {
1416 result
= LOOKUP_DONE
;
1420 nplug
= neighbor
->nplug
;
1422 coord_init_zero(&crd
);
1424 h
->bias
= FIND_EXACT
;
1426 nplug
->lookup(neighbor
, h
->key
, h
->bias
, &crd
);
1429 if (h
->result
== NS_NOT_FOUND
) {
1430 case -E_NO_NEIGHBOR
:
1431 h
->result
= CBK_COORD_FOUND
;
1432 if (!(h
->flags
& CBK_IN_CACHE
))
1433 cbk_cache_add(node
);
1434 default: /* some other error */
1435 result
= LOOKUP_DONE
;
1436 } else if (h
->result
== NS_FOUND
) {
1437 read_lock_dk(znode_get_tree(neighbor
));
1438 h
->rd_key
= *znode_get_ld_key(node
);
1439 leftmost_key_in_node(neighbor
, &h
->ld_key
);
1440 read_unlock_dk(znode_get_tree(neighbor
));
1441 h
->flags
|= CBK_DKSET
;
1443 h
->block
= *znode_get_block(neighbor
);
1444 /* clear coord -> node so that cbk_level_lookup()
1445 wouldn't overwrite parent hint in neighbor.
1447 Parent hint was set up by
1448 reiser4_get_left_neighbor()
1450 /* FIXME: why do we have to spinlock here? */
1451 write_lock_tree(znode_get_tree(neighbor
));
1452 h
->coord
->node
= NULL
;
1453 write_unlock_tree(znode_get_tree(neighbor
));
1454 result
= LOOKUP_CONT
;
1456 result
= LOOKUP_DONE
;
1458 if (neighbor
!= NULL
)
1466 /* debugging aid: return symbolic name of search bias */
1467 static const char *bias_name(lookup_bias bias
/* bias to get name of */ )
1469 if (bias
== FIND_EXACT
)
1471 else if (bias
== FIND_MAX_NOT_MORE_THAN
)
1472 return "left-slant";
1473 /* else if( bias == RIGHT_SLANT_BIAS ) */
1474 /* return "right-bias"; */
1476 static char buf
[30];
1478 sprintf(buf
, "unknown: %i", bias
);
1484 /* debugging aid: print human readable information about @p */
1485 void print_coord_content(const char *prefix
/* prefix to print */ ,
1486 coord_t
* p
/* coord to print */ )
1491 printk("%s: null\n", prefix
);
1494 if ((p
->node
!= NULL
) && znode_is_loaded(p
->node
)
1495 && coord_is_existing_item(p
))
1496 printk("%s: data: %p, length: %i\n", prefix
,
1497 item_body_by_coord(p
), item_length_by_coord(p
));
1498 if (znode_is_loaded(p
->node
)) {
1499 item_key_by_coord(p
, &key
);
1500 reiser4_print_key(prefix
, &key
);
1504 /* debugging aid: print human readable information about @block */
1505 void reiser4_print_address(const char *prefix
/* prefix to print */ ,
1506 const reiser4_block_nr
* block
/* block number to print */ )
1508 printk("%s: %s\n", prefix
, sprint_address(block
));
1512 /* return string containing human readable representation of @block */
1513 char *sprint_address(const reiser4_block_nr
*
1514 block
/* block number to print */ )
1516 static char address
[30];
1519 sprintf(address
, "null");
1520 else if (reiser4_blocknr_is_fake(block
))
1521 sprintf(address
, "%llx", (unsigned long long)(*block
));
1523 sprintf(address
, "%llu", (unsigned long long)(*block
));
1527 /* release parent node during traversal */
1528 static void put_parent(cbk_handle
* h
/* search handle */ )
1530 assert("nikita-383", h
!= NULL
);
1531 if (h
->parent_lh
->node
!= NULL
) {
1532 longterm_unlock_znode(h
->parent_lh
);
1536 /* helper function used by coord_by_key(): release reference to parent znode
1537 stored in handle before processing its child. */
1538 static void hput(cbk_handle
* h
/* search handle */ )
1540 assert("nikita-385", h
!= NULL
);
1541 done_lh(h
->parent_lh
);
1542 done_lh(h
->active_lh
);
1545 /* Helper function used by cbk(): update delimiting keys of child node (stored
1546 in h->active_lh->node) using key taken from parent on the parent level. */
1547 static int setup_delimiting_keys(cbk_handle
* h
/* search handle */ )
1552 assert("nikita-1088", h
!= NULL
);
1554 active
= h
->active_lh
->node
;
1556 /* fast check without taking dk lock. This is safe, because
1557 * JNODE_DKSET is never cleared once set. */
1558 if (!ZF_ISSET(active
, JNODE_DKSET
)) {
1559 tree
= znode_get_tree(active
);
1560 write_lock_dk(tree
);
1561 if (!ZF_ISSET(active
, JNODE_DKSET
)) {
1562 znode_set_ld_key(active
, &h
->ld_key
);
1563 znode_set_rd_key(active
, &h
->rd_key
);
1564 ZF_SET(active
, JNODE_DKSET
);
1566 write_unlock_dk(tree
);
1572 /* true if @block makes sense for the @tree. Used to detect corrupted node
1575 block_nr_is_correct(reiser4_block_nr
* block
/* block number to check */ ,
1576 reiser4_tree
* tree
/* tree to check against */ )
1578 assert("nikita-757", block
!= NULL
);
1579 assert("nikita-758", tree
!= NULL
);
1581 /* check to see if it exceeds the size of the device. */
1582 return reiser4_blocknr_is_sane_for(tree
->super
, block
);
1585 /* check consistency of fields */
1586 static int sanity_check(cbk_handle
* h
/* search handle */ )
1588 assert("nikita-384", h
!= NULL
);
1590 if (h
->level
< h
->stop_level
) {
1591 h
->error
= "Buried under leaves";
1592 h
->result
= RETERR(-EIO
);
1594 } else if (!block_nr_is_correct(&h
->block
, h
->tree
)) {
1595 h
->error
= "bad block number";
1596 h
->result
= RETERR(-EIO
);
1602 /* Make Linus happy.
1604 c-indentation-style: "K&R"