1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7 #include "plugin/item/item.h"
8 #include "plugin/node/node.h"
10 #include "block_alloc.h"
11 #include "tree_walk.h"
17 #include <linux/types.h> /* for __u?? */
20 * Extents on the twig level (EOTTL) handling.
22 * EOTTL poses some problems to the tree traversal, that are better explained
25 * Suppose we have block B1 on the twig level with the following items:
27 * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
29 * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
30 * 2. internal item I2 with key (10:0:0:0)
32 * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
33 * then intra-node lookup is done. This lookup finished on the E1, because the
34 * key we are looking for is larger than the key of E1 and is smaller than key
37 * Here search is stuck.
39 * After some thought it is clear what is wrong here: extents on the twig level
40 * break some basic property of the *search* tree (on the pretext, that they
41 * restore property of balanced tree).
43 * Said property is the following: if in the internal node of the search tree
44 * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
45 * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
46 * through the Pointer.
48 * This is not true, when Pointer is Extent-Pointer, simply because extent
49 * cannot expand indefinitely to the right to include any item with
51 * Key1 <= Key <= Key2.
53 * For example, our E1 extent is only responsible for the data with keys
55 * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
59 * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
61 * is orphaned: there is no way to get there from the tree root.
63 * In other words, extent pointers are different than normal child pointers as
64 * far as search tree is concerned, and this creates such problems.
66 * Possible solution for this problem is to insert our item into node pointed
67 * to by I2. There are some problems through:
69 * (1) I2 can be in a different node.
70 * (2) E1 can be immediately followed by another extent E2.
72 * (1) is solved by calling reiser4_get_right_neighbor() and accounting
73 * for locks/coords as necessary.
75 * (2) is more complex. Solution here is to insert new empty leaf node and
76 * insert internal item between E1 and E2 pointing to said leaf node. This is
77 * further complicated by possibility that E2 is in a different node, etc.
81 * (1) if there was internal item I2 immediately on the right of an extent E1
82 * we and we decided to insert new item S1 into node N2 pointed to by I2, then
83 * key of S1 will be less than smallest key in the N2. Normally, search key
84 * checks that key we are looking for is in the range of keys covered by the
85 * node key is being looked in. To work around of this situation, while
86 * preserving useful consistency check new flag CBK_TRUST_DK was added to the
87 * cbk falgs bitmask. This flag is automatically set on entrance to the
88 * coord_by_key() and is only cleared when we are about to enter situation
91 * (2) If extent E1 is immediately followed by another extent E2 and we are
92 * searching for the key that is between E1 and E2 we only have to insert new
93 * empty leaf node when coord_by_key was called for insertion, rather than just
94 * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
95 * the cbk falgs bitmask. This flag is automatically set by coord_by_key calls
96 * performed by insert_by_key() and friends.
98 * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
99 * case it requires modification of node content which is only possible under
100 * write lock. It may well happen that we only have read lock on the node where
101 * new internal pointer is to be inserted (common case: lookup of non-existent
102 * stat-data that fells between two extents). If only read lock is held, tree
103 * traversal is restarted with lock_level modified so that next time we hit
104 * this problem, write lock will be held. Once we have write lock, balancing
109 * is_next_item_internal - check whether next item is internal
110 * @coord: coordinate of extent item in twig node
112 * @lh: twig node lock handle
114 * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
115 * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
116 * to that node, @coord is set to its first unit. If next item is not internal
117 * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
118 * is returned if search restart has to be done.
121 is_next_item_internal(coord_t
*coord
, const reiser4_key
*key
,
128 coord_dup(&next
, coord
);
129 if (coord_next_unit(&next
) == 0) {
130 /* next unit is in this node */
131 if (item_is_internal(&next
)) {
132 coord_dup(coord
, &next
);
135 assert("vs-3", item_is_extent(&next
));
140 * next unit either does not exist or is in right neighbor. If it is in
141 * right neighbor we have to check right delimiting key because
142 * concurrent thread could get their first and insert item with a key
145 read_lock_dk(current_tree
);
146 result
= keycmp(key
, znode_get_rd_key(coord
->node
));
147 read_unlock_dk(current_tree
);
148 assert("vs-6", result
!= EQUAL_TO
);
149 if (result
== GREATER_THAN
)
152 /* lock right neighbor */
154 result
= reiser4_get_right_neighbor(&rn
, coord
->node
,
155 znode_is_wlocked(coord
->node
) ?
156 ZNODE_WRITE_LOCK
: ZNODE_READ_LOCK
,
157 GN_CAN_USE_UPPER_LEVELS
);
158 if (result
== -E_NO_NEIGHBOR
) {
159 /* we are on the rightmost edge of the tree */
165 assert("vs-4", result
< 0);
171 * check whether concurrent thread managed to insert item with a key
174 read_lock_dk(current_tree
);
175 result
= keycmp(key
, znode_get_ld_key(rn
.node
));
176 read_unlock_dk(current_tree
);
177 assert("vs-6", result
!= EQUAL_TO
);
178 if (result
== GREATER_THAN
) {
183 result
= zload(rn
.node
);
185 assert("vs-5", result
< 0);
190 coord_init_first_unit(&next
, rn
.node
);
191 if (item_is_internal(&next
)) {
193 * next unit is in right neighbor and it is an unit of internal
194 * item. Unlock coord->node. Move @lh to right neighbor. @coord
195 * is set to the first unit of right neighbor.
197 coord_dup(coord
, &next
);
205 * next unit is unit of extent item. Return without chaning @lh and
208 assert("vs-6", item_is_extent(&next
));
215 * rd_key - calculate key of an item next to the given one
216 * @coord: position in a node
217 * @key: storage for result key
219 * @coord is set between items or after the last item in a node. Calculate key
220 * of item to the right of @coord.
222 static reiser4_key
*rd_key(const coord_t
*coord
, reiser4_key
*key
)
226 assert("nikita-2281", coord_is_between_items(coord
));
227 coord_dup(&dup
, coord
);
229 if (coord_set_to_right(&dup
) == 0)
230 /* next item is in this node. Return its key. */
231 unit_key_by_coord(&dup
, key
);
234 * next item either does not exist or is in right
235 * neighbor. Return znode's right delimiting key.
237 read_lock_dk(current_tree
);
238 *key
= *znode_get_rd_key(coord
->node
);
239 read_unlock_dk(current_tree
);
245 * add_empty_leaf - insert empty leaf between two extents
246 * @insert_coord: position in twig node between two extents
247 * @lh: twig node lock handle
248 * @key: left delimiting key of new node
249 * @rdkey: right delimiting key of new node
251 * Inserts empty leaf node between two extent items. It is necessary when we
252 * have to insert an item on leaf level between two extents (items on the twig
256 add_empty_leaf(coord_t
*insert_coord
, lock_handle
*lh
,
257 const reiser4_key
*key
, const reiser4_key
*rdkey
)
262 reiser4_item_data
*item
;
263 carry_insert_data
*cdata
;
268 assert("vs-49827", znode_contains_key_lock(insert_coord
->node
, key
));
269 tree
= znode_get_tree(insert_coord
->node
);
270 node
= reiser4_new_node(insert_coord
->node
, LEAF_LEVEL
);
272 return PTR_ERR(node
);
274 /* setup delimiting keys for node being inserted */
276 znode_set_ld_key(node
, key
);
277 znode_set_rd_key(node
, rdkey
);
278 ON_DEBUG(node
->creator
= current
);
279 ON_DEBUG(node
->first_key
= *key
);
280 write_unlock_dk(tree
);
282 ZF_SET(node
, JNODE_ORPHAN
);
285 * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
288 pool
= init_carry_pool(sizeof(*pool
) + 3 * sizeof(*todo
) +
289 sizeof(*item
) + sizeof(*cdata
));
291 return PTR_ERR(pool
);
292 todo
= (carry_level
*) (pool
+ 1);
293 init_carry_level(todo
, pool
);
295 item
= (reiser4_item_data
*) (todo
+ 3);
296 cdata
= (carry_insert_data
*) (item
+ 1);
298 op
= reiser4_post_carry(todo
, COP_INSERT
, insert_coord
->node
, 0);
300 cdata
->coord
= insert_coord
;
303 op
->u
.insert
.d
= cdata
;
304 op
->u
.insert
.type
= COPT_ITEM_DATA
;
305 build_child_ptr_data(node
, item
);
307 /* have @insert_coord to be set at inserted item after
309 todo
->track_type
= CARRY_TRACK_CHANGE
;
312 result
= reiser4_carry(todo
, NULL
);
315 * pin node in memory. This is necessary for
316 * znode_make_dirty() below.
318 result
= zload(node
);
320 lock_handle local_lh
;
323 * if we inserted new child into tree we have
324 * to mark it dirty so that flush will be able
328 result
= longterm_lock_znode(&local_lh
, node
,
332 znode_make_dirty(node
);
335 * when internal item pointing to @node
336 * was inserted into twig node
337 * create_hook_internal did not connect
338 * it properly because its right
339 * neighbor was not known. Do it
342 write_lock_tree(tree
);
343 assert("nikita-3312",
344 znode_is_right_connected(node
));
345 assert("nikita-2984",
346 node
->right
== NULL
);
347 ZF_CLR(node
, JNODE_RIGHT_CONNECTED
);
348 write_unlock_tree(tree
);
350 connect_znode(insert_coord
, node
);
351 ON_DEBUG(if (result
== 0) check_dkeys(node
););
354 move_lh(lh
, &local_lh
);
355 assert("vs-1676", node_is_empty(node
));
356 coord_init_first_unit(insert_coord
,
359 warning("nikita-3136",
360 "Cannot lock child");
367 result
= PTR_ERR(op
);
369 done_carry_pool(pool
);
374 * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
376 * @outcome: flag saying whether search has to restart or is done
378 * Handles search on twig level. If this function completes search itself then
379 * it returns 1. If search has to go one level down then 0 is returned. If
380 * error happens then LOOKUP_DONE is returned via @outcome and error code is saved
383 int handle_eottl(cbk_handle
*h
, int *outcome
)
391 if (h
->level
!= TWIG_LEVEL
||
392 (coord_is_existing_item(coord
) && item_is_internal(coord
))) {
393 /* Continue to traverse tree downward. */
398 * make sure that @h->coord is set to twig node and that it is either
399 * set to extent item or after extent item
401 assert("vs-356", h
->level
== TWIG_LEVEL
);
404 coord_dup(&lcoord
, coord
);
405 check_me("vs-733", coord_set_to_left(&lcoord
) == 0);
406 item_is_extent(&lcoord
);
410 if (*outcome
== NS_FOUND
) {
411 /* we have found desired key on twig level in extent item */
412 h
->result
= CBK_COORD_FOUND
;
413 *outcome
= LOOKUP_DONE
;
417 if (!(h
->flags
& CBK_FOR_INSERT
)) {
418 /* tree traversal is not for insertion. Just return
419 CBK_COORD_NOTFOUND. */
420 h
->result
= CBK_COORD_NOTFOUND
;
421 *outcome
= LOOKUP_DONE
;
425 /* take a look at the item to the right of h -> coord */
426 result
= is_next_item_internal(coord
, h
->key
, h
->active_lh
);
427 if (unlikely(result
< 0)) {
428 h
->error
= "get_right_neighbor failed";
430 *outcome
= LOOKUP_DONE
;
435 * item to the right is also an extent one. Allocate a new node
436 * and insert pointer to it after item h -> coord.
438 * This is a result of extents being located at the twig
439 * level. For explanation, see comment just above
440 * is_next_item_internal().
444 if (cbk_lock_mode(h
->level
, h
) != ZNODE_WRITE_LOCK
) {
446 * we got node read locked, restart coord_by_key to
447 * have write lock on twig level
449 h
->lock_level
= TWIG_LEVEL
;
450 h
->lock_mode
= ZNODE_WRITE_LOCK
;
451 *outcome
= LOOKUP_REST
;
455 loaded
= coord
->node
;
457 add_empty_leaf(coord
, h
->active_lh
, h
->key
,
458 rd_key(coord
, &key
));
460 h
->error
= "could not add empty leaf";
462 *outcome
= LOOKUP_DONE
;
465 /* added empty leaf is locked (h->active_lh), its parent node
466 is unlocked, h->coord is set as EMPTY */
467 assert("vs-13", coord
->between
== EMPTY_NODE
);
468 assert("vs-14", znode_is_write_locked(coord
->node
));
470 WITH_DATA(coord
->node
, node_is_empty(coord
->node
)));
471 assert("vs-16", jnode_is_leaf(ZJNODE(coord
->node
)));
472 assert("vs-17", coord
->node
== h
->active_lh
->node
);
473 *outcome
= LOOKUP_DONE
;
474 h
->result
= CBK_COORD_NOTFOUND
;
476 } else if (result
== 1) {
478 * this is special case mentioned in the comment on
479 * tree.h:cbk_flags. We have found internal item immediately on
480 * the right of extent, and we are going to insert new item
481 * there. Key of item we are going to insert is smaller than
482 * leftmost key in the node pointed to by said internal item
483 * (otherwise search wouldn't come to the extent in the first
486 * This is a result of extents being located at the twig
487 * level. For explanation, see comment just above
488 * is_next_item_internal().
490 h
->flags
&= ~CBK_TRUST_DK
;
492 assert("vs-8", result
== 2);
493 *outcome
= LOOKUP_REST
;
496 assert("vs-362", WITH_DATA(coord
->node
, item_is_internal(coord
)));
502 * c-indentation-style: "K&R"