1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
10 #include "plugin/item/item.h"
11 #include "plugin/node/node.h"
12 #include "plugin/plugin.h"
15 #include "block_alloc.h"
16 #include "tree_walk.h"
22 #include <linux/slab.h>
24 static const char *bias_name(lookup_bias bias
);
26 /* tree searching algorithm, intranode searching algorithms are in
31 * The coord by key cache consists of small list of recently accessed nodes
32 * maintained according to the LRU discipline. Before doing real top-to-down
33 * tree traversal this cache is scanned for nodes that can contain key
36 * The efficiency of coord cache depends heavily on locality of reference for
37 * tree accesses. Our user level simulations show reasonably good hit ratios
38 * for coord cache under most loads so far.
41 /* Initialise coord cache slot */
42 static void cbk_cache_init_slot(cbk_cache_slot
*slot
)
44 assert("nikita-345", slot
!= NULL
);
46 INIT_LIST_HEAD(&slot
->lru
);
50 /* Initialize coord cache */
51 int cbk_cache_init(cbk_cache
* cache
/* cache to init */)
55 assert("nikita-346", cache
!= NULL
);
58 kmalloc(sizeof(cbk_cache_slot
) * cache
->nr_slots
,
59 reiser4_ctx_gfp_mask_get());
60 if (cache
->slot
== NULL
)
61 return RETERR(-ENOMEM
);
63 INIT_LIST_HEAD(&cache
->lru
);
64 for (i
= 0; i
< cache
->nr_slots
; ++i
) {
65 cbk_cache_init_slot(cache
->slot
+ i
);
66 list_add_tail(&((cache
->slot
+ i
)->lru
), &cache
->lru
);
68 rwlock_init(&cache
->guard
);
72 /* free cbk cache data */
73 void cbk_cache_done(cbk_cache
* cache
/* cache to release */)
75 assert("nikita-2493", cache
!= NULL
);
76 if (cache
->slot
!= NULL
) {
82 /* macro to iterate over all cbk cache slots */
83 #define for_all_slots(cache, slot) \
84 for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
85 &(cache)->lru != &(slot)->lru; \
86 (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
89 /* this function assures that [cbk-cache-invariant] invariant holds */
90 static int cbk_cache_invariant(const cbk_cache
* cache
)
96 if (cache
->nr_slots
== 0)
99 assert("nikita-2469", cache
!= NULL
);
102 read_lock(&((cbk_cache
*)cache
)->guard
);
103 for_all_slots(cache
, slot
) {
104 /* in LRU first go all `used' slots followed by `unused' */
105 if (unused
&& (slot
->node
!= NULL
))
107 if (slot
->node
== NULL
)
110 cbk_cache_slot
*scan
;
112 /* all cached nodes are different */
115 scan
= list_entry(scan
->lru
.next
,
116 cbk_cache_slot
, lru
);
117 if (&cache
->lru
== &scan
->lru
)
119 if (slot
->node
== scan
->node
)
126 read_unlock(&((cbk_cache
*)cache
)->guard
);
132 /* Remove references, if any, to @node from coord cache */
133 void cbk_cache_invalidate(const znode
* node
/* node to remove from cache */ ,
134 reiser4_tree
* tree
/* tree to remove node from */)
136 cbk_cache_slot
*slot
;
140 assert("nikita-350", node
!= NULL
);
141 assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree
));
143 cache
= &tree
->cbk_cache
;
144 assert("nikita-2470", cbk_cache_invariant(cache
));
146 write_lock(&(cache
->guard
));
147 for (i
= 0, slot
= cache
->slot
; i
< cache
->nr_slots
; ++i
, ++slot
) {
148 if (slot
->node
== node
) {
149 list_move_tail(&slot
->lru
, &cache
->lru
);
154 write_unlock(&(cache
->guard
));
155 assert("nikita-2471", cbk_cache_invariant(cache
));
158 /* add to the cbk-cache in the "tree" information about "node". This
159 can actually be update of existing slot in a cache. */
160 static void cbk_cache_add(const znode
* node
/* node to add to the cache */)
164 cbk_cache_slot
*slot
;
167 assert("nikita-352", node
!= NULL
);
169 cache
= &znode_get_tree(node
)->cbk_cache
;
170 assert("nikita-2472", cbk_cache_invariant(cache
));
172 if (cache
->nr_slots
== 0)
175 write_lock(&(cache
->guard
));
176 /* find slot to update/add */
177 for (i
= 0, slot
= cache
->slot
; i
< cache
->nr_slots
; ++i
, ++slot
) {
178 /* oops, this node is already in a cache */
179 if (slot
->node
== node
)
182 /* if all slots are used, reuse least recently used one */
183 if (i
== cache
->nr_slots
) {
184 slot
= list_entry(cache
->lru
.prev
, cbk_cache_slot
, lru
);
185 slot
->node
= (znode
*) node
;
187 list_move(&slot
->lru
, &cache
->lru
);
188 write_unlock(&(cache
->guard
));
189 assert("nikita-2473", cbk_cache_invariant(cache
));
192 static int setup_delimiting_keys(cbk_handle
* h
);
193 static lookup_result
coord_by_handle(cbk_handle
* handle
);
194 static lookup_result
traverse_tree(cbk_handle
* h
);
195 static int cbk_cache_search(cbk_handle
* h
);
197 static level_lookup_result
cbk_level_lookup(cbk_handle
* h
);
198 static level_lookup_result
cbk_node_lookup(cbk_handle
* h
);
200 /* helper functions */
202 static void update_stale_dk(reiser4_tree
* tree
, znode
* node
);
204 /* release parent node during traversal */
205 static void put_parent(cbk_handle
* h
);
206 /* check consistency of fields */
207 static int sanity_check(cbk_handle
* h
);
208 /* release resources in handle */
209 static void hput(cbk_handle
* h
);
211 static level_lookup_result
search_to_left(cbk_handle
* h
);
213 /* pack numerous (numberous I should say) arguments of coord_by_key() into
215 static cbk_handle
*cbk_pack(cbk_handle
* handle
,
217 const reiser4_key
* key
,
219 lock_handle
* active_lh
,
220 lock_handle
* parent_lh
,
221 znode_lock_mode lock_mode
,
223 tree_level lock_level
,
224 tree_level stop_level
,
225 __u32 flags
, ra_info_t
*info
)
227 memset(handle
, 0, sizeof *handle
);
231 handle
->lock_mode
= lock_mode
;
233 handle
->lock_level
= lock_level
;
234 handle
->stop_level
= stop_level
;
235 handle
->coord
= coord
;
236 /* set flags. See comment in tree.h:cbk_flags */
237 handle
->flags
= flags
| CBK_TRUST_DK
| CBK_USE_CRABLOCK
;
239 handle
->active_lh
= active_lh
;
240 handle
->parent_lh
= parent_lh
;
241 handle
->ra_info
= info
;
245 /* main tree lookup procedure
247 Check coord cache. If key we are looking for is not found there, call cbk()
248 to do real tree traversal.
250 As we have extents on the twig level, @lock_level and @stop_level can
251 be different from LEAF_LEVEL and each other.
253 Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
254 long term locks) while calling this.
256 lookup_result
coord_by_key(reiser4_tree
* tree
/* tree to perform search
257 * in. Usually this tree is
258 * part of file-system
260 const reiser4_key
* key
/* key to look for */ ,
261 coord_t
*coord
/* where to store found
262 * position in a tree. Fields
263 * in "coord" are only valid if
264 * coord_by_key() returned
265 * "CBK_COORD_FOUND" */ ,
266 lock_handle
* lh
, /* resulting lock handle */
267 znode_lock_mode lock_mode
/* type of lookup we
269 * ZNODE_READ_LOCK here
270 * if you only want to
271 * read item found and
272 * ZNODE_WRITE_LOCK if
275 lookup_bias bias
/* what to return if coord
276 * with exactly the @key is
277 * not in the tree */ ,
278 tree_level lock_level
/* tree level where to start
279 * taking @lock type of
281 tree_level stop_level
/* tree level to stop. Pass
282 * LEAF_LEVEL or TWIG_LEVEL
283 * here Item being looked
284 * for has to be between
286 * @stop_level, inclusive */ ,
287 __u32 flags
/* search flags */ ,
290 /* information about desired tree traversal
295 lock_handle parent_lh
;
296 lookup_result result
;
301 assert("nikita-3023", reiser4_schedulable());
303 assert("nikita-353", tree
!= NULL
);
304 assert("nikita-354", key
!= NULL
);
305 assert("nikita-355", coord
!= NULL
);
306 assert("nikita-356", (bias
== FIND_EXACT
)
307 || (bias
== FIND_MAX_NOT_MORE_THAN
));
308 assert("nikita-357", stop_level
>= LEAF_LEVEL
);
309 /* no locks can be held during tree traversal */
310 assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
318 lock_mode
, bias
, lock_level
, stop_level
, flags
, info
);
320 result
= coord_by_handle(&handle
);
321 assert("nikita-3247",
322 ergo(!IS_CBKERR(result
), coord
->node
== lh
->node
));
326 /* like coord_by_key(), but starts traversal from vroot of @object rather than
328 lookup_result
reiser4_object_lookup(struct inode
*object
,
329 const reiser4_key
* key
,
332 znode_lock_mode lock_mode
,
334 tree_level lock_level
,
335 tree_level stop_level
, __u32 flags
,
339 lock_handle parent_lh
;
340 lookup_result result
;
345 assert("nikita-3023", reiser4_schedulable());
347 assert("nikita-354", key
!= NULL
);
348 assert("nikita-355", coord
!= NULL
);
349 assert("nikita-356", (bias
== FIND_EXACT
)
350 || (bias
== FIND_MAX_NOT_MORE_THAN
));
351 assert("nikita-357", stop_level
>= LEAF_LEVEL
);
352 /* no locks can be held during tree search by key */
353 assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
356 object
!= NULL
? reiser4_tree_by_inode(object
) : current_tree
,
361 lock_mode
, bias
, lock_level
, stop_level
, flags
, info
);
362 handle
.object
= object
;
364 result
= coord_by_handle(&handle
);
365 assert("nikita-3247",
366 ergo(!IS_CBKERR(result
), coord
->node
== lh
->node
));
370 /* lookup by cbk_handle. Common part of coord_by_key() and
371 reiser4_object_lookup(). */
372 static lookup_result
coord_by_handle(cbk_handle
* handle
)
375 * first check cbk_cache (which is look-aside cache for our tree) and
376 * of this fails, start traversal.
378 /* first check whether "key" is in cache of recent lookups. */
379 if (cbk_cache_search(handle
) == 0)
380 return handle
->result
;
382 return traverse_tree(handle
);
385 /* Execute actor for each item (or unit, depending on @through_units_p),
386 starting from @coord, right-ward, until either:
388 - end of the tree is reached
389 - unformatted node is met
391 - @actor returns 0 or less
393 Error code, or last actor return value is returned.
395 This is used by plugin/dir/hashe_dir.c:reiser4_find_entry() to move through
396 sequence of entries with identical keys and alikes.
398 int reiser4_iterate_tree(reiser4_tree
* tree
/* tree to scan */ ,
399 coord_t
*coord
/* coord to start from */ ,
400 lock_handle
* lh
/* lock handle to start with and to
401 * update along the way */ ,
402 tree_iterate_actor_t actor
/* function to call on each
404 void *arg
/* argument to pass to @actor */ ,
405 znode_lock_mode mode
/* lock mode on scanned nodes */ ,
406 int through_units_p
/* call @actor on each item or on
411 assert("nikita-1143", tree
!= NULL
);
412 assert("nikita-1145", coord
!= NULL
);
413 assert("nikita-1146", lh
!= NULL
);
414 assert("nikita-1147", actor
!= NULL
);
416 result
= zload(coord
->node
);
417 coord_clear_iplug(coord
);
420 if (!coord_is_existing_unit(coord
)) {
424 while ((result
= actor(tree
, coord
, lh
, arg
)) > 0) {
426 if ((through_units_p
&& coord_next_unit(coord
)) ||
427 (!through_units_p
&& coord_next_item(coord
))) {
431 /* move to the next node */
434 reiser4_get_right_neighbor(&couple
,
437 GN_CAN_USE_UPPER_LEVELS
);
441 result
= zload(couple
.node
);
447 coord_init_first_unit(coord
,
450 move_lh(lh
, &couple
);
453 } while (node_is_empty(coord
->node
));
456 assert("nikita-1149", coord_is_existing_unit(coord
));
462 /* return locked uber znode for @tree */
463 int get_uber_znode(reiser4_tree
* tree
, znode_lock_mode mode
,
464 znode_lock_request pri
, lock_handle
* lh
)
468 result
= longterm_lock_znode(lh
, tree
->uber
, mode
, pri
);
472 /* true if @key is strictly within @node
474 we are looking for possibly non-unique key and it is item is at the edge of
475 @node. May be it is in the neighbor.
477 static int znode_contains_key_strict(znode
* node
/* node to check key
480 key
/* key to check */ ,
485 assert("nikita-1760", node
!= NULL
);
486 assert("nikita-1722", key
!= NULL
);
488 if (keyge(key
, &node
->rd_key
))
491 answer
= keycmp(&node
->ld_key
, key
);
494 return answer
!= GREATER_THAN
;
496 return answer
== LESS_THAN
;
500 * Virtual Root (vroot) code.
502 * For given file system object (e.g., regular file or directory) let's
503 * define its "virtual root" as lowest in the tree (that is, furtherest
504 * from the tree root) node such that all body items of said object are
505 * located in a tree rooted at this node.
507 * Once vroot of object is found all tree lookups for items within body of
508 * this object ("object lookups") can be started from its vroot rather
509 * than from real root. This has following advantages:
511 * 1. amount of nodes traversed during lookup (and, hence, amount of
512 * key comparisons made) decreases, and
514 * 2. contention on tree root is decreased. This latter was actually
515 * motivating reason behind vroot, because spin lock of root node,
516 * which is taken when acquiring long-term lock on root node is the
517 * hottest lock in the reiser4.
521 * When vroot of object F is not yet determined, all object lookups start
522 * from the root of the tree. At each tree level during traversal we have
523 * a node N such that a key we are looking for (which is the key inside
524 * object's body) is located within N. In function handle_vroot() called
525 * from cbk_level_lookup() we check whether N is possible vroot for
526 * F. Check is trivial---if neither leftmost nor rightmost item of N
527 * belongs to F (and we already have helpful ->owns_item() method of
528 * object plugin for this), then N is possible vroot of F. This, of
529 * course, relies on the assumption that each object occupies contiguous
530 * range of keys in the tree.
532 * Thus, traversing tree downward and checking each node as we go, we can
533 * find lowest such node, which, by definition, is vroot.
535 * How to track vroot.
537 * Nohow. If actual vroot changes, next object lookup will just restart
538 * from the actual tree root, refreshing object's vroot along the way.
543 * Check whether @node is possible vroot of @object.
545 static void handle_vroot(struct inode
*object
, znode
* node
)
550 fplug
= inode_file_plugin(object
);
551 assert("nikita-3353", fplug
!= NULL
);
552 assert("nikita-3354", fplug
->owns_item
!= NULL
);
554 if (unlikely(node_is_empty(node
)))
557 coord_init_first_unit(&coord
, node
);
559 * if leftmost item of @node belongs to @object, we cannot be sure
560 * that @node is vroot of @object, because, some items of @object are
561 * probably in the sub-tree rooted at the left neighbor of @node.
563 if (fplug
->owns_item(object
, &coord
))
565 coord_init_last_unit(&coord
, node
);
566 /* mutatis mutandis for the rightmost item */
567 if (fplug
->owns_item(object
, &coord
))
569 /* otherwise, @node is possible vroot of @object */
570 inode_set_vroot(object
, node
);
574 * helper function used by traverse tree to start tree traversal not from the
575 * tree root, but from @h->object's vroot, if possible.
577 static int prepare_object_lookup(cbk_handle
* h
)
582 vroot
= inode_get_vroot(h
->object
);
585 * object doesn't have known vroot, start from real tree root.
590 h
->level
= znode_get_level(vroot
);
591 /* take a long-term lock on vroot */
592 h
->result
= longterm_lock_znode(h
->active_lh
, vroot
,
593 cbk_lock_mode(h
->level
, h
),
595 result
= LOOKUP_REST
;
596 if (h
->result
== 0) {
600 isunique
= h
->flags
& CBK_UNIQUE
;
601 /* check that key is inside vroot */
602 read_lock_dk(h
->tree
);
603 inside
= (znode_contains_key_strict(vroot
, h
->key
, isunique
) &&
604 !ZF_ISSET(vroot
, JNODE_HEARD_BANSHEE
));
605 read_unlock_dk(h
->tree
);
607 h
->result
= zload(vroot
);
608 if (h
->result
== 0) {
609 /* search for key in vroot. */
610 result
= cbk_node_lookup(h
);
611 zrelse(vroot
); /*h->active_lh->node); */
612 if (h
->active_lh
->node
!= vroot
) {
613 result
= LOOKUP_REST
;
614 } else if (result
== LOOKUP_CONT
) {
615 move_lh(h
->parent_lh
, h
->active_lh
);
616 h
->flags
&= ~CBK_DKSET
;
624 if (IS_CBKERR(h
->result
) || result
== LOOKUP_REST
)
629 /* main function that handles common parts of tree traversal: starting
630 (fake znode handling), restarts, error handling, completion */
631 static lookup_result
traverse_tree(cbk_handle
* h
/* search handle */)
637 assert("nikita-365", h
!= NULL
);
638 assert("nikita-366", h
->tree
!= NULL
);
639 assert("nikita-367", h
->key
!= NULL
);
640 assert("nikita-368", h
->coord
!= NULL
);
641 assert("nikita-369", (h
->bias
== FIND_EXACT
)
642 || (h
->bias
== FIND_MAX_NOT_MORE_THAN
));
643 assert("nikita-370", h
->stop_level
>= LEAF_LEVEL
);
644 assert("nikita-2949", !(h
->flags
& CBK_DKSET
));
645 assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
651 /* loop for restarts */
654 assert("nikita-3024", reiser4_schedulable());
656 h
->result
= CBK_COORD_FOUND
;
657 /* connect_znode() needs it */
658 h
->ld_key
= *reiser4_min_key();
659 h
->rd_key
= *reiser4_max_key();
660 h
->flags
|= CBK_DKSET
;
663 if (!vroot_used
&& h
->object
!= NULL
) {
665 done
= prepare_object_lookup(h
);
666 if (done
== LOOKUP_REST
)
668 else if (done
== LOOKUP_DONE
)
671 if (h
->parent_lh
->node
== NULL
) {
673 get_uber_znode(h
->tree
, ZNODE_READ_LOCK
, ZNODE_LOCK_LOPRI
,
676 assert("nikita-1637", done
!= -E_DEADLOCK
);
678 h
->block
= h
->tree
->root_block
;
679 h
->level
= h
->tree
->height
;
680 h
->coord
->node
= h
->parent_lh
->node
;
686 /* loop descending a tree */
689 if (unlikely((iterations
> REISER4_CBK_ITERATIONS_LIMIT
) &&
690 IS_POW(iterations
))) {
691 warning("nikita-1481", "Too many iterations: %i",
693 reiser4_print_key("key", h
->key
);
695 } else if (unlikely(iterations
> REISER4_MAX_CBK_ITERATIONS
)) {
697 "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
698 h
->result
= RETERR(-EIO
);
701 switch (cbk_level_lookup(h
)) {
703 move_lh(h
->parent_lh
, h
->active_lh
);
706 wrong_return_value("nikita-372", "cbk_level");
712 /* deadlock avoidance is normal case. */
713 if (h
->result
!= -E_DEADLOCK
)
715 reiser4_preempt_point();
719 /* that's all. The rest is error handling */
720 if (unlikely(h
->error
!= NULL
)) {
721 warning("nikita-373", "%s: level: %i, "
722 "lock_level: %i, stop_level: %i "
723 "lock_mode: %s, bias: %s",
724 h
->error
, h
->level
, h
->lock_level
, h
->stop_level
,
725 lock_mode_name(h
->lock_mode
), bias_name(h
->bias
));
726 reiser4_print_address("block", &h
->block
);
727 reiser4_print_key("key", h
->key
);
728 print_coord_content("coord", h
->coord
);
730 /* `unlikely' error case */
731 if (unlikely(IS_CBKERR(h
->result
))) {
732 /* failure. do cleanup */
735 assert("nikita-1605", WITH_DATA_RET
737 ergo((h
->result
== CBK_COORD_FOUND
) &&
738 (h
->bias
== FIND_EXACT
) &&
739 (!node_is_empty(h
->coord
->node
)),
740 coord_is_existing_item(h
->coord
))));
745 /* find delimiting keys of child
747 Determine left and right delimiting keys for child pointed to by
751 static void find_child_delimiting_keys(znode
* parent
/* parent znode, passed
753 const coord_t
*parent_coord
754 /* coord where pointer
757 reiser4_key
* ld
/* where to store left
758 * delimiting key */ ,
759 reiser4_key
* rd
/* where to store right
760 * delimiting key */ )
764 assert("nikita-1484", parent
!= NULL
);
765 assert_rw_locked(&(znode_get_tree(parent
)->dk_lock
));
767 coord_dup(&neighbor
, parent_coord
);
769 if (neighbor
.between
== AT_UNIT
)
770 /* imitate item ->lookup() behavior. */
771 neighbor
.between
= AFTER_UNIT
;
773 if (coord_set_to_left(&neighbor
) == 0)
774 unit_key_by_coord(&neighbor
, ld
);
776 assert("nikita-14851", 0);
777 *ld
= *znode_get_ld_key(parent
);
780 coord_dup(&neighbor
, parent_coord
);
781 if (neighbor
.between
== AT_UNIT
)
782 neighbor
.between
= AFTER_UNIT
;
783 if (coord_set_to_right(&neighbor
) == 0)
784 unit_key_by_coord(&neighbor
, rd
);
786 *rd
= *znode_get_rd_key(parent
);
790 * setup delimiting keys for a child
792 * @parent parent node
794 * @coord location in @parent where pointer to @child is
799 set_child_delimiting_keys(znode
* parent
, const coord_t
*coord
, znode
* child
)
803 assert("nikita-2952",
804 znode_get_level(parent
) == znode_get_level(coord
->node
));
806 /* fast check without taking dk lock. This is safe, because
807 * JNODE_DKSET is never cleared once set. */
808 if (!ZF_ISSET(child
, JNODE_DKSET
)) {
809 tree
= znode_get_tree(parent
);
811 if (likely(!ZF_ISSET(child
, JNODE_DKSET
))) {
812 find_child_delimiting_keys(parent
, coord
,
815 ON_DEBUG(child
->ld_key_version
=
816 atomic_inc_return(&delim_key_version
);
817 child
->rd_key_version
=
818 atomic_inc_return(&delim_key_version
););
819 ZF_SET(child
, JNODE_DKSET
);
821 write_unlock_dk(tree
);
827 /* Perform tree lookup at one level. This is called from cbk_traverse()
828 function that drives lookup through tree and calls cbk_node_lookup() to
829 perform lookup within one node.
831 See comments in a code.
833 static level_lookup_result
cbk_level_lookup(cbk_handle
* h
/* search handle */)
842 assert("nikita-3025", reiser4_schedulable());
844 /* acquire reference to @active node */
846 zget(h
->tree
, &h
->block
, h
->parent_lh
->node
, h
->level
,
847 reiser4_ctx_gfp_mask_get());
849 if (IS_ERR(active
)) {
850 h
->result
= PTR_ERR(active
);
855 h
->result
= longterm_lock_znode(h
->active_lh
,
857 cbk_lock_mode(h
->level
, h
),
859 /* longterm_lock_znode() acquires additional reference to znode (which
860 will be later released by longterm_unlock_znode()). Release
861 reference acquired by zget().
864 if (unlikely(h
->result
!= 0))
865 goto fail_or_restart
;
868 /* if @active is accessed for the first time, setup delimiting keys on
869 it. Delimiting keys are taken from the parent node. See
870 setup_delimiting_keys() for details.
872 if (h
->flags
& CBK_DKSET
) {
873 setdk
= setup_delimiting_keys(h
);
874 h
->flags
&= ~CBK_DKSET
;
878 parent
= h
->parent_lh
->node
;
879 h
->result
= zload(parent
);
880 if (unlikely(h
->result
!= 0))
881 goto fail_or_restart
;
883 if (!ZF_ISSET(active
, JNODE_DKSET
))
884 setdk
= set_child_delimiting_keys(parent
,
887 read_lock_dk(h
->tree
);
888 find_child_delimiting_keys(parent
, h
->coord
, &ldkey
,
890 read_unlock_dk(h
->tree
);
896 /* this is ugly kludge. Reminder: this is necessary, because
897 ->lookup() method returns coord with ->between field probably set
898 to something different from AT_UNIT.
900 h
->coord
->between
= AT_UNIT
;
902 if (znode_just_created(active
) && (h
->coord
->node
!= NULL
)) {
903 write_lock_tree(h
->tree
);
904 /* if we are going to load znode right now, setup
905 ->in_parent: coord where pointer to this node is stored in
908 coord_to_parent_coord(h
->coord
, &active
->in_parent
);
909 write_unlock_tree(h
->tree
);
912 /* check connectedness without holding tree lock---false negatives
913 * will be re-checked by connect_znode(), and false positives are
914 * impossible---@active cannot suddenly turn into unconnected
916 if (!znode_is_connected(active
)) {
917 h
->result
= connect_znode(h
->coord
, active
);
918 if (unlikely(h
->result
!= 0)) {
920 goto fail_or_restart
;
924 jload_prefetch(ZJNODE(active
));
927 update_stale_dk(h
->tree
, active
);
929 /* put_parent() cannot be called earlier, because connect_znode()
930 assumes parent node is referenced; */
933 if ((!znode_contains_key_lock(active
, h
->key
) &&
934 (h
->flags
& CBK_TRUST_DK
))
935 || ZF_ISSET(active
, JNODE_HEARD_BANSHEE
)) {
936 /* 1. key was moved out of this node while this thread was
937 waiting for the lock. Restart. More elaborate solution is
938 to determine where key moved (to the left, or to the right)
939 and try to follow it through sibling pointers.
941 2. or, node itself is going to be removed from the
942 tree. Release lock and restart.
944 h
->result
= -E_REPEAT
;
946 if (h
->result
== -E_REPEAT
)
949 h
->result
= zload_ra(active
, h
->ra_info
);
954 if (sanity_check(h
)) {
959 /* check that key of leftmost item in the @active is the same as in
961 if (ldkeyset
&& !node_is_empty(active
) &&
962 !keyeq(leftmost_key_in_node(active
, &key
), &ldkey
)) {
963 warning("vs-3533", "Keys are inconsistent. Fsck?");
964 reiser4_print_key("inparent", &ldkey
);
965 reiser4_print_key("inchild", &key
);
966 h
->result
= RETERR(-EIO
);
971 if (h
->object
!= NULL
)
972 handle_vroot(h
->object
, active
);
974 ret
= cbk_node_lookup(h
);
976 /* h->active_lh->node might change, but active is yet to be zrelsed */
982 if (h
->result
== -E_DEADLOCK
)
988 /* check left and right delimiting keys of a znode */
989 void check_dkeys(znode
* node
)
994 read_lock_tree(current_tree
);
995 read_lock_dk(current_tree
);
997 assert("vs-1710", znode_is_any_locked(node
));
999 !keygt(znode_get_ld_key(node
), znode_get_rd_key(node
)));
1002 right
= node
->right
;
1004 if (ZF_ISSET(node
, JNODE_LEFT_CONNECTED
) && ZF_ISSET(node
, JNODE_DKSET
)
1005 && left
!= NULL
&& ZF_ISSET(left
, JNODE_DKSET
))
1006 /* check left neighbor. Note that left neighbor is not locked,
1007 so it might get wrong delimiting keys therefore */
1009 (keyeq(znode_get_rd_key(left
), znode_get_ld_key(node
))
1010 || ZF_ISSET(left
, JNODE_HEARD_BANSHEE
)));
1012 if (ZF_ISSET(node
, JNODE_RIGHT_CONNECTED
) && ZF_ISSET(node
, JNODE_DKSET
)
1013 && right
!= NULL
&& ZF_ISSET(right
, JNODE_DKSET
))
1014 /* check right neighbor. Note that right neighbor is not
1015 locked, so it might get wrong delimiting keys therefore */
1017 (keyeq(znode_get_rd_key(node
), znode_get_ld_key(right
))
1018 || ZF_ISSET(right
, JNODE_HEARD_BANSHEE
)));
1020 read_unlock_dk(current_tree
);
1021 read_unlock_tree(current_tree
);
1025 /* true if @key is left delimiting key of @node */
1026 static int key_is_ld(znode
* node
, const reiser4_key
* key
)
1030 assert("nikita-1716", node
!= NULL
);
1031 assert("nikita-1758", key
!= NULL
);
1033 read_lock_dk(znode_get_tree(node
));
1034 assert("nikita-1759", znode_contains_key(node
, key
));
1035 ld
= keyeq(znode_get_ld_key(node
), key
);
1036 read_unlock_dk(znode_get_tree(node
));
1040 /* Process one node during tree traversal.
1042 This is called by cbk_level_lookup(). */
1043 static level_lookup_result
cbk_node_lookup(cbk_handle
* h
/* search handle */)
1045 /* node plugin of @active */
1047 /* item plugin of item that was found */
1050 lookup_bias node_bias
;
1051 /* node we are operating upon */
1053 /* tree we are searching in */
1058 assert("nikita-379", h
!= NULL
);
1060 active
= h
->active_lh
->node
;
1063 nplug
= active
->nplug
;
1064 assert("nikita-380", nplug
!= NULL
);
1066 ON_DEBUG(check_dkeys(active
));
1068 /* return item from "active" node with maximal key not greater than
1070 node_bias
= h
->bias
;
1071 result
= nplug
->lookup(active
, h
->key
, node_bias
, h
->coord
);
1072 if (unlikely(result
!= NS_FOUND
&& result
!= NS_NOT_FOUND
)) {
1073 /* error occurred */
1077 if (h
->level
== h
->stop_level
) {
1078 /* welcome to the stop level */
1079 assert("nikita-381", h
->coord
->node
== active
);
1080 if (result
== NS_FOUND
) {
1081 /* success of tree lookup */
1082 if (!(h
->flags
& CBK_UNIQUE
)
1083 && key_is_ld(active
, h
->key
))
1084 return search_to_left(h
);
1086 h
->result
= CBK_COORD_FOUND
;
1088 h
->result
= CBK_COORD_NOTFOUND
;
1090 if (!(h
->flags
& CBK_IN_CACHE
))
1091 cbk_cache_add(active
);
1095 if (h
->level
> TWIG_LEVEL
&& result
== NS_NOT_FOUND
) {
1096 h
->error
= "not found on internal node";
1101 assert("vs-361", h
->level
> h
->stop_level
);
1103 if (handle_eottl(h
, &result
)) {
1104 assert("vs-1674", (result
== LOOKUP_DONE
||
1105 result
== LOOKUP_REST
));
1109 /* go down to next level */
1110 check_me("vs-12", zload(h
->coord
->node
) == 0);
1111 assert("nikita-2116", item_is_internal(h
->coord
));
1112 iplug
= item_plugin_by_coord(h
->coord
);
1113 iplug
->s
.internal
.down_link(h
->coord
, h
->key
, &h
->block
);
1114 zrelse(h
->coord
->node
);
1116 return LOOKUP_CONT
; /* continue */
1119 /* scan cbk_cache slots looking for a match for @h */
1120 static int cbk_cache_scan_slots(cbk_handle
* h
/* cbk handle */)
1122 level_lookup_result llr
;
1125 cbk_cache_slot
*slot
;
1129 const reiser4_key
*key
;
1132 assert("nikita-1317", h
!= NULL
);
1133 assert("nikita-1315", h
->tree
!= NULL
);
1134 assert("nikita-1316", h
->key
!= NULL
);
1137 cache
= &tree
->cbk_cache
;
1138 if (cache
->nr_slots
== 0)
1139 /* size of cbk cache was set to 0 by mount time option. */
1140 return RETERR(-ENOENT
);
1142 assert("nikita-2474", cbk_cache_invariant(cache
));
1143 node
= NULL
; /* to keep gcc happy */
1146 isunique
= h
->flags
& CBK_UNIQUE
;
1147 result
= RETERR(-ENOENT
);
1150 * this is time-critical function and dragons had, hence, been settled
1153 * Loop below scans cbk cache slots trying to find matching node with
1154 * suitable range of delimiting keys and located at the h->level.
1156 * Scan is done under cbk cache spin lock that protects slot->node
1157 * pointers. If suitable node is found we want to pin it in
1158 * memory. But slot->node can point to the node with x_count 0
1159 * (unreferenced). Such node can be recycled at any moment, or can
1160 * already be in the process of being recycled (within jput()).
1162 * As we found node in the cbk cache, it means that jput() hasn't yet
1163 * called cbk_cache_invalidate().
1165 * We acquire reference to the node without holding tree lock, and
1166 * later, check node's RIP bit. This avoids races with jput().
1170 read_lock(&((cbk_cache
*)cache
)->guard
);
1172 slot
= list_entry(cache
->lru
.next
, cbk_cache_slot
, lru
);
1173 slot
= list_entry(slot
->lru
.prev
, cbk_cache_slot
, lru
);
1174 BUG_ON(&slot
->lru
!= &cache
->lru
);/*????*/
1177 slot
= list_entry(slot
->lru
.next
, cbk_cache_slot
, lru
);
1179 if (&cache
->lru
!= &slot
->lru
)
1184 if (unlikely(node
== NULL
))
1188 * this is (hopefully) the only place in the code where we are
1189 * working with delimiting keys without holding dk lock. This
1190 * is fine here, because this is only "guess" anyway---keys
1191 * are rechecked under dk lock below.
1193 if (znode_get_level(node
) == level
&&
1194 /* reiser4_min_key < key < reiser4_max_key */
1195 znode_contains_key_strict(node
, key
, isunique
)) {
1198 spin_lock_prefetch(&tree
->tree_lock
);
1202 read_unlock(&((cbk_cache
*)cache
)->guard
);
1204 assert("nikita-2475", cbk_cache_invariant(cache
));
1206 if (unlikely(result
== 0 && ZF_ISSET(node
, JNODE_RIP
)))
1212 h
->result
= CBK_COORD_NOTFOUND
;
1213 return RETERR(-ENOENT
);
1217 longterm_lock_znode(h
->active_lh
, node
, cbk_lock_mode(level
, h
),
1222 result
= zload(node
);
1228 result
= (znode_contains_key_strict(node
, key
, isunique
) &&
1229 !ZF_ISSET(node
, JNODE_HEARD_BANSHEE
));
1230 read_unlock_dk(tree
);
1232 /* do lookup inside node */
1233 llr
= cbk_node_lookup(h
);
1234 /* if cbk_node_lookup() wandered to another node (due to eottl
1235 or non-unique keys), adjust @node */
1236 /*node = h->active_lh->node; */
1238 if (llr
!= LOOKUP_DONE
) {
1239 /* restart or continue on the next level */
1240 result
= RETERR(-ENOENT
);
1241 } else if (IS_CBKERR(h
->result
))
1243 result
= RETERR(-ENOENT
);
1245 /* good. Either item found or definitely not found. */
1248 write_lock(&(cache
->guard
));
1249 if (slot
->node
== h
->active_lh
->node
) {
1250 /* if this node is still in cbk cache---move
1251 its slot to the head of the LRU list. */
1252 list_move(&slot
->lru
, &cache
->lru
);
1254 write_unlock(&(cache
->guard
));
1257 /* race. While this thread was waiting for the lock, node was
1258 rebalanced and item we are looking for, shifted out of it
1259 (if it ever was here).
1261 Continuing scanning is almost hopeless: node key range was
1262 moved to, is almost certainly at the beginning of the LRU
1263 list at this time, because it's hot, but restarting
1264 scanning from the very beginning is complex. Just return,
1265 so that cbk() will be performed. This is not that
1266 important, because such races should be rare. Are they?
1268 result
= RETERR(-ENOENT
); /* -ERAUGHT */
1271 assert("nikita-2476", cbk_cache_invariant(cache
));
1275 /* look for item with given key in the coord cache
1277 This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
1278 which is a small LRU list of znodes accessed lately. For each znode in
1279 znode in this list, it checks whether key we are looking for fits into key
1280 range covered by this node. If so, and in addition, node lies at allowed
1281 level (this is to handle extents on a twig level), node is locked, and
1282 lookup inside it is performed.
1284 we need a measurement of the cost of this cache search compared to the cost
1288 static int cbk_cache_search(cbk_handle
* h
/* cbk handle */)
1293 /* add CBK_IN_CACHE to the handle flags. This means that
1294 * cbk_node_lookup() assumes that cbk_cache is scanned and would add
1295 * found node to the cache. */
1296 h
->flags
|= CBK_IN_CACHE
;
1297 for (level
= h
->stop_level
; level
<= h
->lock_level
; ++level
) {
1299 result
= cbk_cache_scan_slots(h
);
1301 done_lh(h
->active_lh
);
1302 done_lh(h
->parent_lh
);
1304 assert("nikita-1319", !IS_CBKERR(h
->result
));
1308 h
->flags
&= ~CBK_IN_CACHE
;
1312 /* type of lock we want to obtain during tree traversal. On stop level
1313 we want type of lock user asked for, on upper levels: read lock. */
1314 znode_lock_mode
cbk_lock_mode(tree_level level
, cbk_handle
* h
)
1316 assert("nikita-382", h
!= NULL
);
1318 return (level
<= h
->lock_level
) ? h
->lock_mode
: ZNODE_READ_LOCK
;
1321 /* update outdated delimiting keys */
1322 static void stale_dk(reiser4_tree
* tree
, znode
* node
)
1326 read_lock_tree(tree
);
1327 write_lock_dk(tree
);
1328 right
= node
->right
;
1330 if (ZF_ISSET(node
, JNODE_RIGHT_CONNECTED
) &&
1331 right
&& ZF_ISSET(right
, JNODE_DKSET
) &&
1332 !keyeq(znode_get_rd_key(node
), znode_get_ld_key(right
)))
1333 znode_set_rd_key(node
, znode_get_ld_key(right
));
1335 write_unlock_dk(tree
);
1336 read_unlock_tree(tree
);
1339 /* check for possibly outdated delimiting keys, and update them if
1341 static void update_stale_dk(reiser4_tree
* tree
, znode
* node
)
1346 read_lock_tree(tree
);
1348 rd
= *znode_get_rd_key(node
);
1349 right
= node
->right
;
1350 if (unlikely(ZF_ISSET(node
, JNODE_RIGHT_CONNECTED
) &&
1351 right
&& ZF_ISSET(right
, JNODE_DKSET
) &&
1352 !keyeq(&rd
, znode_get_ld_key(right
)))) {
1353 assert("nikita-38211", ZF_ISSET(node
, JNODE_DKSET
));
1354 read_unlock_dk(tree
);
1355 read_unlock_tree(tree
);
1356 stale_dk(tree
, node
);
1359 read_unlock_dk(tree
);
1360 read_unlock_tree(tree
);
1364 * handle searches a the non-unique key.
1366 * Suppose that we are looking for an item with possibly non-unique key 100.
1368 * Root node contains two pointers: one to a node with left delimiting key 0,
1369 * and another to a node with left delimiting key 100. Item we interested in
1370 * may well happen in the sub-tree rooted at the first pointer.
1372 * To handle this search_to_left() is called when search reaches stop
1373 * level. This function checks it is _possible_ that item we are looking for
1374 * is in the left neighbor (this can be done by comparing delimiting keys) and
1375 * if so, tries to lock left neighbor (this is low priority lock, so it can
1376 * deadlock, tree traversal is just restarted if it did) and then checks
1377 * whether left neighbor actually contains items with our key.
1379 * Note that this is done on the stop level only. It is possible to try such
1380 * left-check on each level, but as duplicate keys are supposed to be rare
1381 * (very unlikely that more than one node is completely filled with items with
1382 * duplicate keys), it sis cheaper to scan to the left on the stop level once.
1385 static level_lookup_result
search_to_left(cbk_handle
* h
/* search handle */)
1387 level_lookup_result result
;
1394 assert("nikita-1761", h
!= NULL
);
1395 assert("nikita-1762", h
->level
== h
->stop_level
);
1399 node
= h
->active_lh
->node
;
1400 assert("nikita-1763", coord_is_leftmost_unit(coord
));
1403 reiser4_get_left_neighbor(&lh
, node
, (int)h
->lock_mode
,
1404 GN_CAN_USE_UPPER_LEVELS
);
1406 switch (h
->result
) {
1408 result
= LOOKUP_REST
;
1416 h
->result
= zload(neighbor
);
1417 if (h
->result
!= 0) {
1418 result
= LOOKUP_DONE
;
1422 nplug
= neighbor
->nplug
;
1424 coord_init_zero(&crd
);
1426 h
->bias
= FIND_EXACT
;
1428 nplug
->lookup(neighbor
, h
->key
, h
->bias
, &crd
);
1431 if (h
->result
== NS_NOT_FOUND
) {
1432 case -E_NO_NEIGHBOR
:
1433 h
->result
= CBK_COORD_FOUND
;
1434 if (!(h
->flags
& CBK_IN_CACHE
))
1435 cbk_cache_add(node
);
1436 default: /* some other error */
1437 result
= LOOKUP_DONE
;
1438 } else if (h
->result
== NS_FOUND
) {
1439 read_lock_dk(znode_get_tree(neighbor
));
1440 h
->rd_key
= *znode_get_ld_key(node
);
1441 leftmost_key_in_node(neighbor
, &h
->ld_key
);
1442 read_unlock_dk(znode_get_tree(neighbor
));
1443 h
->flags
|= CBK_DKSET
;
1445 h
->block
= *znode_get_block(neighbor
);
1446 /* clear coord->node so that cbk_level_lookup()
1447 wouldn't overwrite parent hint in neighbor.
1449 Parent hint was set up by
1450 reiser4_get_left_neighbor()
1452 /* FIXME: why do we have to spinlock here? */
1453 write_lock_tree(znode_get_tree(neighbor
));
1454 h
->coord
->node
= NULL
;
1455 write_unlock_tree(znode_get_tree(neighbor
));
1456 result
= LOOKUP_CONT
;
1458 result
= LOOKUP_DONE
;
1460 if (neighbor
!= NULL
)
1468 /* debugging aid: return symbolic name of search bias */
1469 static const char *bias_name(lookup_bias bias
/* bias to get name of */)
1471 if (bias
== FIND_EXACT
)
1473 else if (bias
== FIND_MAX_NOT_MORE_THAN
)
1474 return "left-slant";
1475 /* else if( bias == RIGHT_SLANT_BIAS ) */
1476 /* return "right-bias"; */
1478 static char buf
[30];
1480 sprintf(buf
, "unknown: %i", bias
);
1486 /* debugging aid: print human readable information about @p */
1487 void print_coord_content(const char *prefix
/* prefix to print */ ,
1488 coord_t
*p
/* coord to print */)
1493 printk("%s: null\n", prefix
);
1496 if ((p
->node
!= NULL
) && znode_is_loaded(p
->node
)
1497 && coord_is_existing_item(p
))
1498 printk("%s: data: %p, length: %i\n", prefix
,
1499 item_body_by_coord(p
), item_length_by_coord(p
));
1500 if (znode_is_loaded(p
->node
)) {
1501 item_key_by_coord(p
, &key
);
1502 reiser4_print_key(prefix
, &key
);
1506 /* debugging aid: print human readable information about @block */
1507 void reiser4_print_address(const char *prefix
/* prefix to print */ ,
1508 const reiser4_block_nr
* block
/* block number to print */)
1510 printk("%s: %s\n", prefix
, sprint_address(block
));
1514 /* return string containing human readable representation of @block */
1515 char *sprint_address(const reiser4_block_nr
*
1516 block
/* block number to print */)
1518 static char address
[30];
1521 sprintf(address
, "null");
1522 else if (reiser4_blocknr_is_fake(block
))
1523 sprintf(address
, "%llx", (unsigned long long)(*block
));
1525 sprintf(address
, "%llu", (unsigned long long)(*block
));
1529 /* release parent node during traversal */
1530 static void put_parent(cbk_handle
* h
/* search handle */)
1532 assert("nikita-383", h
!= NULL
);
1533 if (h
->parent_lh
->node
!= NULL
)
1534 longterm_unlock_znode(h
->parent_lh
);
1537 /* helper function used by coord_by_key(): release reference to parent znode
1538 stored in handle before processing its child. */
1539 static void hput(cbk_handle
* h
/* search handle */)
1541 assert("nikita-385", h
!= NULL
);
1542 done_lh(h
->parent_lh
);
1543 done_lh(h
->active_lh
);
1546 /* Helper function used by cbk(): update delimiting keys of child node (stored
1547 in h->active_lh->node) using key taken from parent on the parent level. */
1548 static int setup_delimiting_keys(cbk_handle
* h
/* search handle */)
1553 assert("nikita-1088", h
!= NULL
);
1555 active
= h
->active_lh
->node
;
1557 /* fast check without taking dk lock. This is safe, because
1558 * JNODE_DKSET is never cleared once set. */
1559 if (!ZF_ISSET(active
, JNODE_DKSET
)) {
1560 tree
= znode_get_tree(active
);
1561 write_lock_dk(tree
);
1562 if (!ZF_ISSET(active
, JNODE_DKSET
)) {
1563 znode_set_ld_key(active
, &h
->ld_key
);
1564 znode_set_rd_key(active
, &h
->rd_key
);
1565 ZF_SET(active
, JNODE_DKSET
);
1567 write_unlock_dk(tree
);
1573 /* true if @block makes sense for the @tree. Used to detect corrupted node
1576 block_nr_is_correct(reiser4_block_nr
* block
/* block number to check */ ,
1577 reiser4_tree
* tree
/* tree to check against */)
1579 assert("nikita-757", block
!= NULL
);
1580 assert("nikita-758", tree
!= NULL
);
1582 /* check to see if it exceeds the size of the device. */
1583 return reiser4_blocknr_is_sane_for(tree
->super
, block
);
1586 /* check consistency of fields */
1587 static int sanity_check(cbk_handle
* h
/* search handle */)
1589 assert("nikita-384", h
!= NULL
);
1591 if (h
->level
< h
->stop_level
) {
1592 h
->error
= "Buried under leaves";
1593 h
->result
= RETERR(-EIO
);
1595 } else if (!block_nr_is_correct(&h
->block
, h
->tree
)) {
1596 h
->error
= "bad block number";
1597 h
->result
= RETERR(-EIO
);
1603 /* Make Linus happy.
1605 c-indentation-style: "K&R"