1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8 #include "plugin/item/item.h"
9 #include "plugin/node/node.h"
11 #include "block_alloc.h"
12 #include "tree_walk.h"
18 #include <linux/types.h> /* for __u?? */
21 * Extents on the twig level (EOTTL) handling.
23 * EOTTL poses some problems to the tree traversal, that are better explained
26 * Suppose we have block B1 on the twig level with the following items:
28 * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
30 * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
31 * 2. internal item I2 with key (10:0:0:0)
33 * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
34 * then intra-node lookup is done. This lookup finished on the E1, because the
35 * key we are looking for is larger than the key of E1 and is smaller than key
38 * Here search is stuck.
40 * After some thought it is clear what is wrong here: extents on the twig level
41 * break some basic property of the *search* tree (on the pretext, that they
42 * restore property of balanced tree).
44 * Said property is the following: if in the internal node of the search tree
45 * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
46 * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
47 * through the Pointer.
49 * This is not true, when Pointer is Extent-Pointer, simply because extent
50 * cannot expand indefinitely to the right to include any item with
52 * Key1 <= Key <= Key2.
54 * For example, our E1 extent is only responsible for the data with keys
56 * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
60 * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
62 * is orphaned: there is no way to get there from the tree root.
64 * In other words, extent pointers are different than normal child pointers as
65 * far as search tree is concerned, and this creates such problems.
67 * Possible solution for this problem is to insert our item into node pointed
68 * to by I2. There are some problems through:
70 * (1) I2 can be in a different node.
71 * (2) E1 can be immediately followed by another extent E2.
73 * (1) is solved by calling reiser4_get_right_neighbor() and accounting
74 * for locks/coords as necessary.
76 * (2) is more complex. Solution here is to insert new empty leaf node and
77 * insert internal item between E1 and E2 pointing to said leaf node. This is
78 * further complicated by possibility that E2 is in a different node, etc.
82 * (1) if there was internal item I2 immediately on the right of an extent E1
83 * we and we decided to insert new item S1 into node N2 pointed to by I2, then
84 * key of S1 will be less than smallest key in the N2. Normally, search key
85 * checks that key we are looking for is in the range of keys covered by the
86 * node key is being looked in. To work around of this situation, while
87 * preserving useful consistency check new flag CBK_TRUST_DK was added to the
88 * cbk falgs bitmask. This flag is automatically set on entrance to the
89 * coord_by_key() and is only cleared when we are about to enter situation
92 * (2) If extent E1 is immediately followed by another extent E2 and we are
93 * searching for the key that is between E1 and E2 we only have to insert new
94 * empty leaf node when coord_by_key was called for insertion, rather than just
95 * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
96 * the cbk falgs bitmask. This flag is automatically set by coord_by_key calls
97 * performed by insert_by_key() and friends.
99 * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
100 * case it requires modification of node content which is only possible under
101 * write lock. It may well happen that we only have read lock on the node where
102 * new internal pointer is to be inserted (common case: lookup of non-existent
103 * stat-data that fells between two extents). If only read lock is held, tree
104 * traversal is restarted with lock_level modified so that next time we hit
105 * this problem, write lock will be held. Once we have write lock, balancing
110 * is_next_item_internal - check whether next item is internal
111 * @coord: coordinate of extent item in twig node
113 * @lh: twig node lock handle
115 * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
116 * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
117 * to that node, @coord is set to its first unit. If next item is not internal
118 * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
119 * is returned if search restart has to be done.
122 is_next_item_internal(coord_t
*coord
, const reiser4_key
* key
,
129 coord_dup(&next
, coord
);
130 if (coord_next_unit(&next
) == 0) {
131 /* next unit is in this node */
132 if (item_is_internal(&next
)) {
133 coord_dup(coord
, &next
);
136 assert("vs-3", item_is_extent(&next
));
141 * next unit either does not exist or is in right neighbor. If it is in
142 * right neighbor we have to check right delimiting key because
143 * concurrent thread could get their first and insert item with a key
146 read_lock_dk(current_tree
);
147 result
= keycmp(key
, znode_get_rd_key(coord
->node
));
148 read_unlock_dk(current_tree
);
149 assert("vs-6", result
!= EQUAL_TO
);
150 if (result
== GREATER_THAN
)
153 /* lock right neighbor */
155 result
= reiser4_get_right_neighbor(&rn
, coord
->node
,
156 znode_is_wlocked(coord
->node
) ?
157 ZNODE_WRITE_LOCK
: ZNODE_READ_LOCK
,
158 GN_CAN_USE_UPPER_LEVELS
);
159 if (result
== -E_NO_NEIGHBOR
) {
160 /* we are on the rightmost edge of the tree */
166 assert("vs-4", result
< 0);
172 * check whether concurrent thread managed to insert item with a key
175 read_lock_dk(current_tree
);
176 result
= keycmp(key
, znode_get_ld_key(rn
.node
));
177 read_unlock_dk(current_tree
);
178 assert("vs-6", result
!= EQUAL_TO
);
179 if (result
== GREATER_THAN
) {
184 result
= zload(rn
.node
);
186 assert("vs-5", result
< 0);
191 coord_init_first_unit(&next
, rn
.node
);
192 if (item_is_internal(&next
)) {
194 * next unit is in right neighbor and it is an unit of internal
195 * item. Unlock coord->node. Move @lh to right neighbor. @coord
196 * is set to the first unit of right neighbor.
198 coord_dup(coord
, &next
);
206 * next unit is unit of extent item. Return without chaning @lh and
209 assert("vs-6", item_is_extent(&next
));
216 * rd_key - calculate key of an item next to the given one
217 * @coord: position in a node
218 * @key: storage for result key
220 * @coord is set between items or after the last item in a node. Calculate key
221 * of item to the right of @coord.
223 static reiser4_key
*rd_key(const coord_t
*coord
, reiser4_key
*key
)
227 assert("nikita-2281", coord_is_between_items(coord
));
228 coord_dup(&dup
, coord
);
230 if (coord_set_to_right(&dup
) == 0)
231 /* next item is in this node. Return its key. */
232 unit_key_by_coord(&dup
, key
);
235 * next item either does not exist or is in right
236 * neighbor. Return znode's right delimiting key.
238 read_lock_dk(current_tree
);
239 *key
= *znode_get_rd_key(coord
->node
);
240 read_unlock_dk(current_tree
);
246 * add_empty_leaf - insert empty leaf between two extents
247 * @insert_coord: position in twig node between two extents
248 * @lh: twig node lock handle
249 * @key: left delimiting key of new node
250 * @rdkey: right delimiting key of new node
252 * Inserts empty leaf node between two extent items. It is necessary when we
253 * have to insert an item on leaf level between two extents (items on the twig
257 add_empty_leaf(coord_t
*insert_coord
, lock_handle
*lh
,
258 const reiser4_key
*key
, const reiser4_key
*rdkey
)
263 reiser4_item_data
*item
;
264 carry_insert_data
*cdata
;
269 assert("vs-49827", znode_contains_key_lock(insert_coord
->node
, key
));
270 tree
= znode_get_tree(insert_coord
->node
);
271 node
= reiser4_new_node(insert_coord
->node
, LEAF_LEVEL
);
273 return PTR_ERR(node
);
275 /* setup delimiting keys for node being inserted */
277 znode_set_ld_key(node
, key
);
278 znode_set_rd_key(node
, rdkey
);
279 ON_DEBUG(node
->creator
= current
);
280 ON_DEBUG(node
->first_key
= *key
);
281 write_unlock_dk(tree
);
283 ZF_SET(node
, JNODE_ORPHAN
);
286 * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
289 pool
= init_carry_pool(sizeof(*pool
) + 3 * sizeof(*todo
) +
290 sizeof(*item
) + sizeof(*cdata
));
292 return PTR_ERR(pool
);
293 todo
= (carry_level
*) (pool
+ 1);
294 init_carry_level(todo
, pool
);
296 item
= (reiser4_item_data
*) (todo
+ 3);
297 cdata
= (carry_insert_data
*) (item
+ 1);
299 op
= reiser4_post_carry(todo
, COP_INSERT
, insert_coord
->node
, 0);
301 cdata
->coord
= insert_coord
;
304 op
->u
.insert
.d
= cdata
;
305 op
->u
.insert
.type
= COPT_ITEM_DATA
;
306 build_child_ptr_data(node
, item
);
308 /* have @insert_coord to be set at inserted item after
310 todo
->track_type
= CARRY_TRACK_CHANGE
;
313 result
= reiser4_carry(todo
, NULL
);
316 * pin node in memory. This is necessary for
317 * znode_make_dirty() below.
319 result
= zload(node
);
321 lock_handle local_lh
;
324 * if we inserted new child into tree we have
325 * to mark it dirty so that flush will be able
329 result
= longterm_lock_znode(&local_lh
, node
,
333 znode_make_dirty(node
);
336 * when internal item pointing to @node
337 * was inserted into twig node
338 * create_hook_internal did not connect
339 * it properly because its right
340 * neighbor was not known. Do it
343 write_lock_tree(tree
);
344 assert("nikita-3312",
345 znode_is_right_connected(node
));
346 assert("nikita-2984",
347 node
->right
== NULL
);
348 ZF_CLR(node
, JNODE_RIGHT_CONNECTED
);
349 write_unlock_tree(tree
);
351 connect_znode(insert_coord
, node
);
352 ON_DEBUG(if (result
== 0) check_dkeys(node
););
355 move_lh(lh
, &local_lh
);
356 assert("vs-1676", node_is_empty(node
));
357 coord_init_first_unit(insert_coord
,
360 warning("nikita-3136",
361 "Cannot lock child");
368 result
= PTR_ERR(op
);
370 done_carry_pool(pool
);
375 * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
377 * @outcome: flag saying whether search has to restart or is done
379 * Handles search on twig level. If this function completes search itself then
380 * it returns 1. If search has to go one level down then 0 is returned. If
381 * error happens then LOOKUP_DONE is returned via @outcome and error code is
382 * saved in @h->result.
384 int handle_eottl(cbk_handle
*h
, int *outcome
)
392 if (h
->level
!= TWIG_LEVEL
||
393 (coord_is_existing_item(coord
) && item_is_internal(coord
))) {
394 /* Continue to traverse tree downward. */
399 * make sure that @h->coord is set to twig node and that it is either
400 * set to extent item or after extent item
402 assert("vs-356", h
->level
== TWIG_LEVEL
);
405 coord_dup(&lcoord
, coord
);
406 check_me("vs-733", coord_set_to_left(&lcoord
) == 0);
407 item_is_extent(&lcoord
);
411 if (*outcome
== NS_FOUND
) {
412 /* we have found desired key on twig level in extent item */
413 h
->result
= CBK_COORD_FOUND
;
414 *outcome
= LOOKUP_DONE
;
418 if (!(h
->flags
& CBK_FOR_INSERT
)) {
419 /* tree traversal is not for insertion. Just return
420 CBK_COORD_NOTFOUND. */
421 h
->result
= CBK_COORD_NOTFOUND
;
422 *outcome
= LOOKUP_DONE
;
426 /* take a look at the item to the right of h -> coord */
427 result
= is_next_item_internal(coord
, h
->key
, h
->active_lh
);
428 if (unlikely(result
< 0)) {
429 h
->error
= "get_right_neighbor failed";
431 *outcome
= LOOKUP_DONE
;
436 * item to the right is also an extent one. Allocate a new node
437 * and insert pointer to it after item h -> coord.
439 * This is a result of extents being located at the twig
440 * level. For explanation, see comment just above
441 * is_next_item_internal().
445 if (cbk_lock_mode(h
->level
, h
) != ZNODE_WRITE_LOCK
) {
447 * we got node read locked, restart coord_by_key to
448 * have write lock on twig level
450 h
->lock_level
= TWIG_LEVEL
;
451 h
->lock_mode
= ZNODE_WRITE_LOCK
;
452 *outcome
= LOOKUP_REST
;
456 loaded
= coord
->node
;
458 add_empty_leaf(coord
, h
->active_lh
, h
->key
,
459 rd_key(coord
, &key
));
461 h
->error
= "could not add empty leaf";
463 *outcome
= LOOKUP_DONE
;
466 /* added empty leaf is locked (h->active_lh), its parent node
467 is unlocked, h->coord is set as EMPTY */
468 assert("vs-13", coord
->between
== EMPTY_NODE
);
469 assert("vs-14", znode_is_write_locked(coord
->node
));
471 WITH_DATA(coord
->node
, node_is_empty(coord
->node
)));
472 assert("vs-16", jnode_is_leaf(ZJNODE(coord
->node
)));
473 assert("vs-17", coord
->node
== h
->active_lh
->node
);
474 *outcome
= LOOKUP_DONE
;
475 h
->result
= CBK_COORD_NOTFOUND
;
477 } else if (result
== 1) {
479 * this is special case mentioned in the comment on
480 * tree.h:cbk_flags. We have found internal item immediately on
481 * the right of extent, and we are going to insert new item
482 * there. Key of item we are going to insert is smaller than
483 * leftmost key in the node pointed to by said internal item
484 * (otherwise search wouldn't come to the extent in the first
487 * This is a result of extents being located at the twig
488 * level. For explanation, see comment just above
489 * is_next_item_internal().
491 h
->flags
&= ~CBK_TRUST_DK
;
493 assert("vs-8", result
== 2);
494 *outcome
= LOOKUP_REST
;
497 assert("vs-362", WITH_DATA(coord
->node
, item_is_internal(coord
)));
503 * c-indentation-style: "K&R"