On Tue, Nov 06, 2007 at 02:33:53AM -0800, akpm@linux-foundation.org wrote:
[mmotm.git] / fs / reiser4 / eottl.c
blob169b8684a33a609934afe3fb0c7614de6a5e5302
1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
2 reiser4/README */
4 #include "forward.h"
5 #include "debug.h"
6 #include "key.h"
7 #include "coord.h"
8 #include "plugin/item/item.h"
9 #include "plugin/node/node.h"
10 #include "znode.h"
11 #include "block_alloc.h"
12 #include "tree_walk.h"
13 #include "tree_mod.h"
14 #include "carry.h"
15 #include "tree.h"
16 #include "super.h"
18 #include <linux/types.h> /* for __u?? */
21 * Extents on the twig level (EOTTL) handling.
23 * EOTTL poses some problems to the tree traversal, that are better explained
24 * by example.
26 * Suppose we have block B1 on the twig level with the following items:
28 * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
29 * offset)
30 * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
31 * 2. internal item I2 with key (10:0:0:0)
33 * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
34 * then intra-node lookup is done. This lookup finished on the E1, because the
35 * key we are looking for is larger than the key of E1 and is smaller than key
36 * the of I2.
38 * Here search is stuck.
40 * After some thought it is clear what is wrong here: extents on the twig level
41 * break some basic property of the *search* tree (on the pretext, that they
42 * restore property of balanced tree).
44 * Said property is the following: if in the internal node of the search tree
45 * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
46 * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
47 * through the Pointer.
49 * This is not true, when Pointer is Extent-Pointer, simply because extent
50 * cannot expand indefinitely to the right to include any item with
52 * Key1 <= Key <= Key2.
54 * For example, our E1 extent is only responsible for the data with keys
56 * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
58 * so, key range
60 * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
62 * is orphaned: there is no way to get there from the tree root.
64 * In other words, extent pointers are different than normal child pointers as
65 * far as search tree is concerned, and this creates such problems.
67 * Possible solution for this problem is to insert our item into node pointed
68 * to by I2. There are some problems through:
70 * (1) I2 can be in a different node.
71 * (2) E1 can be immediately followed by another extent E2.
73 * (1) is solved by calling reiser4_get_right_neighbor() and accounting
74 * for locks/coords as necessary.
76 * (2) is more complex. Solution here is to insert new empty leaf node and
77 * insert internal item between E1 and E2 pointing to said leaf node. This is
78 * further complicated by possibility that E2 is in a different node, etc.
80 * Problems:
82 * (1) if there was internal item I2 immediately on the right of an extent E1
83 * we and we decided to insert new item S1 into node N2 pointed to by I2, then
84 * key of S1 will be less than smallest key in the N2. Normally, search key
85 * checks that key we are looking for is in the range of keys covered by the
86 * node key is being looked in. To work around of this situation, while
87 * preserving useful consistency check new flag CBK_TRUST_DK was added to the
88 * cbk falgs bitmask. This flag is automatically set on entrance to the
89 * coord_by_key() and is only cleared when we are about to enter situation
90 * described above.
92 * (2) If extent E1 is immediately followed by another extent E2 and we are
93 * searching for the key that is between E1 and E2 we only have to insert new
94 * empty leaf node when coord_by_key was called for insertion, rather than just
95 * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
96 * the cbk falgs bitmask. This flag is automatically set by coord_by_key calls
97 * performed by insert_by_key() and friends.
99 * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
100 * case it requires modification of node content which is only possible under
101 * write lock. It may well happen that we only have read lock on the node where
102 * new internal pointer is to be inserted (common case: lookup of non-existent
103 * stat-data that fells between two extents). If only read lock is held, tree
104 * traversal is restarted with lock_level modified so that next time we hit
105 * this problem, write lock will be held. Once we have write lock, balancing
106 * will be performed.
110 * is_next_item_internal - check whether next item is internal
111 * @coord: coordinate of extent item in twig node
112 * @key: search key
113 * @lh: twig node lock handle
115 * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
116 * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
117 * to that node, @coord is set to its first unit. If next item is not internal
118 * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
119 * is returned if search restart has to be done.
121 static int
122 is_next_item_internal(coord_t *coord, const reiser4_key * key,
123 lock_handle * lh)
125 coord_t next;
126 lock_handle rn;
127 int result;
129 coord_dup(&next, coord);
130 if (coord_next_unit(&next) == 0) {
131 /* next unit is in this node */
132 if (item_is_internal(&next)) {
133 coord_dup(coord, &next);
134 return 1;
136 assert("vs-3", item_is_extent(&next));
137 return 0;
141 * next unit either does not exist or is in right neighbor. If it is in
142 * right neighbor we have to check right delimiting key because
143 * concurrent thread could get their first and insert item with a key
144 * smaller than @key
146 read_lock_dk(current_tree);
147 result = keycmp(key, znode_get_rd_key(coord->node));
148 read_unlock_dk(current_tree);
149 assert("vs-6", result != EQUAL_TO);
150 if (result == GREATER_THAN)
151 return 2;
153 /* lock right neighbor */
154 init_lh(&rn);
155 result = reiser4_get_right_neighbor(&rn, coord->node,
156 znode_is_wlocked(coord->node) ?
157 ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
158 GN_CAN_USE_UPPER_LEVELS);
159 if (result == -E_NO_NEIGHBOR) {
160 /* we are on the rightmost edge of the tree */
161 done_lh(&rn);
162 return 0;
165 if (result) {
166 assert("vs-4", result < 0);
167 done_lh(&rn);
168 return result;
172 * check whether concurrent thread managed to insert item with a key
173 * smaller than @key
175 read_lock_dk(current_tree);
176 result = keycmp(key, znode_get_ld_key(rn.node));
177 read_unlock_dk(current_tree);
178 assert("vs-6", result != EQUAL_TO);
179 if (result == GREATER_THAN) {
180 done_lh(&rn);
181 return 2;
184 result = zload(rn.node);
185 if (result) {
186 assert("vs-5", result < 0);
187 done_lh(&rn);
188 return result;
191 coord_init_first_unit(&next, rn.node);
192 if (item_is_internal(&next)) {
194 * next unit is in right neighbor and it is an unit of internal
195 * item. Unlock coord->node. Move @lh to right neighbor. @coord
196 * is set to the first unit of right neighbor.
198 coord_dup(coord, &next);
199 zrelse(rn.node);
200 done_lh(lh);
201 move_lh(lh, &rn);
202 return 1;
206 * next unit is unit of extent item. Return without chaning @lh and
207 * @coord.
209 assert("vs-6", item_is_extent(&next));
210 zrelse(rn.node);
211 done_lh(&rn);
212 return 0;
216 * rd_key - calculate key of an item next to the given one
217 * @coord: position in a node
218 * @key: storage for result key
220 * @coord is set between items or after the last item in a node. Calculate key
221 * of item to the right of @coord.
223 static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
225 coord_t dup;
227 assert("nikita-2281", coord_is_between_items(coord));
228 coord_dup(&dup, coord);
230 if (coord_set_to_right(&dup) == 0)
231 /* next item is in this node. Return its key. */
232 unit_key_by_coord(&dup, key);
233 else {
235 * next item either does not exist or is in right
236 * neighbor. Return znode's right delimiting key.
238 read_lock_dk(current_tree);
239 *key = *znode_get_rd_key(coord->node);
240 read_unlock_dk(current_tree);
242 return key;
246 * add_empty_leaf - insert empty leaf between two extents
247 * @insert_coord: position in twig node between two extents
248 * @lh: twig node lock handle
249 * @key: left delimiting key of new node
250 * @rdkey: right delimiting key of new node
252 * Inserts empty leaf node between two extent items. It is necessary when we
253 * have to insert an item on leaf level between two extents (items on the twig
254 * level).
256 static int
257 add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
258 const reiser4_key *key, const reiser4_key *rdkey)
260 int result;
261 carry_pool *pool;
262 carry_level *todo;
263 reiser4_item_data *item;
264 carry_insert_data *cdata;
265 carry_op *op;
266 znode *node;
267 reiser4_tree *tree;
269 assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
270 tree = znode_get_tree(insert_coord->node);
271 node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
272 if (IS_ERR(node))
273 return PTR_ERR(node);
275 /* setup delimiting keys for node being inserted */
276 write_lock_dk(tree);
277 znode_set_ld_key(node, key);
278 znode_set_rd_key(node, rdkey);
279 ON_DEBUG(node->creator = current);
280 ON_DEBUG(node->first_key = *key);
281 write_unlock_dk(tree);
283 ZF_SET(node, JNODE_ORPHAN);
286 * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
287 * carry_insert_data
289 pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
290 sizeof(*item) + sizeof(*cdata));
291 if (IS_ERR(pool))
292 return PTR_ERR(pool);
293 todo = (carry_level *) (pool + 1);
294 init_carry_level(todo, pool);
296 item = (reiser4_item_data *) (todo + 3);
297 cdata = (carry_insert_data *) (item + 1);
299 op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
300 if (!IS_ERR(op)) {
301 cdata->coord = insert_coord;
302 cdata->key = key;
303 cdata->data = item;
304 op->u.insert.d = cdata;
305 op->u.insert.type = COPT_ITEM_DATA;
306 build_child_ptr_data(node, item);
307 item->arg = NULL;
308 /* have @insert_coord to be set at inserted item after
309 insertion is done */
310 todo->track_type = CARRY_TRACK_CHANGE;
311 todo->tracked = lh;
313 result = reiser4_carry(todo, NULL);
314 if (result == 0) {
316 * pin node in memory. This is necessary for
317 * znode_make_dirty() below.
319 result = zload(node);
320 if (result == 0) {
321 lock_handle local_lh;
324 * if we inserted new child into tree we have
325 * to mark it dirty so that flush will be able
326 * to process it.
328 init_lh(&local_lh);
329 result = longterm_lock_znode(&local_lh, node,
330 ZNODE_WRITE_LOCK,
331 ZNODE_LOCK_LOPRI);
332 if (result == 0) {
333 znode_make_dirty(node);
336 * when internal item pointing to @node
337 * was inserted into twig node
338 * create_hook_internal did not connect
339 * it properly because its right
340 * neighbor was not known. Do it
341 * here
343 write_lock_tree(tree);
344 assert("nikita-3312",
345 znode_is_right_connected(node));
346 assert("nikita-2984",
347 node->right == NULL);
348 ZF_CLR(node, JNODE_RIGHT_CONNECTED);
349 write_unlock_tree(tree);
350 result =
351 connect_znode(insert_coord, node);
352 ON_DEBUG(if (result == 0) check_dkeys(node););
354 done_lh(lh);
355 move_lh(lh, &local_lh);
356 assert("vs-1676", node_is_empty(node));
357 coord_init_first_unit(insert_coord,
358 node);
359 } else {
360 warning("nikita-3136",
361 "Cannot lock child");
363 done_lh(&local_lh);
364 zrelse(node);
367 } else
368 result = PTR_ERR(op);
369 zput(node);
370 done_carry_pool(pool);
371 return result;
375 * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
376 * @h: search handle
377 * @outcome: flag saying whether search has to restart or is done
379 * Handles search on twig level. If this function completes search itself then
380 * it returns 1. If search has to go one level down then 0 is returned. If
381 * error happens then LOOKUP_DONE is returned via @outcome and error code is
382 * saved in @h->result.
384 int handle_eottl(cbk_handle *h, int *outcome)
386 int result;
387 reiser4_key key;
388 coord_t *coord;
390 coord = h->coord;
392 if (h->level != TWIG_LEVEL ||
393 (coord_is_existing_item(coord) && item_is_internal(coord))) {
394 /* Continue to traverse tree downward. */
395 return 0;
399 * make sure that @h->coord is set to twig node and that it is either
400 * set to extent item or after extent item
402 assert("vs-356", h->level == TWIG_LEVEL);
403 assert("vs-357", ({
404 coord_t lcoord;
405 coord_dup(&lcoord, coord);
406 check_me("vs-733", coord_set_to_left(&lcoord) == 0);
407 item_is_extent(&lcoord);
411 if (*outcome == NS_FOUND) {
412 /* we have found desired key on twig level in extent item */
413 h->result = CBK_COORD_FOUND;
414 *outcome = LOOKUP_DONE;
415 return 1;
418 if (!(h->flags & CBK_FOR_INSERT)) {
419 /* tree traversal is not for insertion. Just return
420 CBK_COORD_NOTFOUND. */
421 h->result = CBK_COORD_NOTFOUND;
422 *outcome = LOOKUP_DONE;
423 return 1;
426 /* take a look at the item to the right of h -> coord */
427 result = is_next_item_internal(coord, h->key, h->active_lh);
428 if (unlikely(result < 0)) {
429 h->error = "get_right_neighbor failed";
430 h->result = result;
431 *outcome = LOOKUP_DONE;
432 return 1;
434 if (result == 0) {
436 * item to the right is also an extent one. Allocate a new node
437 * and insert pointer to it after item h -> coord.
439 * This is a result of extents being located at the twig
440 * level. For explanation, see comment just above
441 * is_next_item_internal().
443 znode *loaded;
445 if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
447 * we got node read locked, restart coord_by_key to
448 * have write lock on twig level
450 h->lock_level = TWIG_LEVEL;
451 h->lock_mode = ZNODE_WRITE_LOCK;
452 *outcome = LOOKUP_REST;
453 return 1;
456 loaded = coord->node;
457 result =
458 add_empty_leaf(coord, h->active_lh, h->key,
459 rd_key(coord, &key));
460 if (result) {
461 h->error = "could not add empty leaf";
462 h->result = result;
463 *outcome = LOOKUP_DONE;
464 return 1;
466 /* added empty leaf is locked (h->active_lh), its parent node
467 is unlocked, h->coord is set as EMPTY */
468 assert("vs-13", coord->between == EMPTY_NODE);
469 assert("vs-14", znode_is_write_locked(coord->node));
470 assert("vs-15",
471 WITH_DATA(coord->node, node_is_empty(coord->node)));
472 assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
473 assert("vs-17", coord->node == h->active_lh->node);
474 *outcome = LOOKUP_DONE;
475 h->result = CBK_COORD_NOTFOUND;
476 return 1;
477 } else if (result == 1) {
479 * this is special case mentioned in the comment on
480 * tree.h:cbk_flags. We have found internal item immediately on
481 * the right of extent, and we are going to insert new item
482 * there. Key of item we are going to insert is smaller than
483 * leftmost key in the node pointed to by said internal item
484 * (otherwise search wouldn't come to the extent in the first
485 * place).
487 * This is a result of extents being located at the twig
488 * level. For explanation, see comment just above
489 * is_next_item_internal().
491 h->flags &= ~CBK_TRUST_DK;
492 } else {
493 assert("vs-8", result == 2);
494 *outcome = LOOKUP_REST;
495 return 1;
497 assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
498 return 0;
502 * Local variables:
503 * c-indentation-style: "K&R"
504 * mode-name: "LC"
505 * c-basic-offset: 8
506 * tab-width: 8
507 * fill-column: 120
508 * scroll-step: 1
509 * End: