revert-mm-fix-blkdev-size-calculation-in-generic_write_checks
[linux-2.6/linux-trees-mm.git] / fs / reiser4 / eottl.c
blobf921b19a83433a3cad7feb861ed359498e7f585e
1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3 #include "forward.h"
4 #include "debug.h"
5 #include "key.h"
6 #include "coord.h"
7 #include "plugin/item/item.h"
8 #include "plugin/node/node.h"
9 #include "znode.h"
10 #include "block_alloc.h"
11 #include "tree_walk.h"
12 #include "tree_mod.h"
13 #include "carry.h"
14 #include "tree.h"
15 #include "super.h"
17 #include <linux/types.h> /* for __u?? */
20 * Extents on the twig level (EOTTL) handling.
22 * EOTTL poses some problems to the tree traversal, that are better explained
23 * by example.
25 * Suppose we have block B1 on the twig level with the following items:
27 * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
28 * offset)
29 * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
30 * 2. internal item I2 with key (10:0:0:0)
32 * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
33 * then intra-node lookup is done. This lookup finished on the E1, because the
34 * key we are looking for is larger than the key of E1 and is smaller than key
35 * the of I2.
37 * Here search is stuck.
39 * After some thought it is clear what is wrong here: extents on the twig level
40 * break some basic property of the *search* tree (on the pretext, that they
41 * restore property of balanced tree).
43 * Said property is the following: if in the internal node of the search tree
44 * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
45 * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
46 * through the Pointer.
48 * This is not true, when Pointer is Extent-Pointer, simply because extent
49 * cannot expand indefinitely to the right to include any item with
51 * Key1 <= Key <= Key2.
53 * For example, our E1 extent is only responsible for the data with keys
55 * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
57 * so, key range
59 * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
61 * is orphaned: there is no way to get there from the tree root.
63 * In other words, extent pointers are different than normal child pointers as
64 * far as search tree is concerned, and this creates such problems.
66 * Possible solution for this problem is to insert our item into node pointed
67 * to by I2. There are some problems through:
69 * (1) I2 can be in a different node.
70 * (2) E1 can be immediately followed by another extent E2.
72 * (1) is solved by calling reiser4_get_right_neighbor() and accounting
73 * for locks/coords as necessary.
75 * (2) is more complex. Solution here is to insert new empty leaf node and
76 * insert internal item between E1 and E2 pointing to said leaf node. This is
77 * further complicated by possibility that E2 is in a different node, etc.
79 * Problems:
81 * (1) if there was internal item I2 immediately on the right of an extent E1
82 * we and we decided to insert new item S1 into node N2 pointed to by I2, then
83 * key of S1 will be less than smallest key in the N2. Normally, search key
84 * checks that key we are looking for is in the range of keys covered by the
85 * node key is being looked in. To work around of this situation, while
86 * preserving useful consistency check new flag CBK_TRUST_DK was added to the
87 * cbk falgs bitmask. This flag is automatically set on entrance to the
88 * coord_by_key() and is only cleared when we are about to enter situation
89 * described above.
91 * (2) If extent E1 is immediately followed by another extent E2 and we are
92 * searching for the key that is between E1 and E2 we only have to insert new
93 * empty leaf node when coord_by_key was called for insertion, rather than just
94 * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
95 * the cbk falgs bitmask. This flag is automatically set by coord_by_key calls
96 * performed by insert_by_key() and friends.
98 * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
99 * case it requires modification of node content which is only possible under
100 * write lock. It may well happen that we only have read lock on the node where
101 * new internal pointer is to be inserted (common case: lookup of non-existent
102 * stat-data that fells between two extents). If only read lock is held, tree
103 * traversal is restarted with lock_level modified so that next time we hit
104 * this problem, write lock will be held. Once we have write lock, balancing
105 * will be performed.
109 * is_next_item_internal - check whether next item is internal
110 * @coord: coordinate of extent item in twig node
111 * @key: search key
112 * @lh: twig node lock handle
114 * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
115 * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
116 * to that node, @coord is set to its first unit. If next item is not internal
117 * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
118 * is returned if search restart has to be done.
120 static int
121 is_next_item_internal(coord_t *coord, const reiser4_key *key,
122 lock_handle *lh)
124 coord_t next;
125 lock_handle rn;
126 int result;
128 coord_dup(&next, coord);
129 if (coord_next_unit(&next) == 0) {
130 /* next unit is in this node */
131 if (item_is_internal(&next)) {
132 coord_dup(coord, &next);
133 return 1;
135 assert("vs-3", item_is_extent(&next));
136 return 0;
140 * next unit either does not exist or is in right neighbor. If it is in
141 * right neighbor we have to check right delimiting key because
142 * concurrent thread could get their first and insert item with a key
143 * smaller than @key
145 read_lock_dk(current_tree);
146 result = keycmp(key, znode_get_rd_key(coord->node));
147 read_unlock_dk(current_tree);
148 assert("vs-6", result != EQUAL_TO);
149 if (result == GREATER_THAN)
150 return 2;
152 /* lock right neighbor */
153 init_lh(&rn);
154 result = reiser4_get_right_neighbor(&rn, coord->node,
155 znode_is_wlocked(coord->node) ?
156 ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
157 GN_CAN_USE_UPPER_LEVELS);
158 if (result == -E_NO_NEIGHBOR) {
159 /* we are on the rightmost edge of the tree */
160 done_lh(&rn);
161 return 0;
164 if (result) {
165 assert("vs-4", result < 0);
166 done_lh(&rn);
167 return result;
171 * check whether concurrent thread managed to insert item with a key
172 * smaller than @key
174 read_lock_dk(current_tree);
175 result = keycmp(key, znode_get_ld_key(rn.node));
176 read_unlock_dk(current_tree);
177 assert("vs-6", result != EQUAL_TO);
178 if (result == GREATER_THAN) {
179 done_lh(&rn);
180 return 2;
183 result = zload(rn.node);
184 if (result) {
185 assert("vs-5", result < 0);
186 done_lh(&rn);
187 return result;
190 coord_init_first_unit(&next, rn.node);
191 if (item_is_internal(&next)) {
193 * next unit is in right neighbor and it is an unit of internal
194 * item. Unlock coord->node. Move @lh to right neighbor. @coord
195 * is set to the first unit of right neighbor.
197 coord_dup(coord, &next);
198 zrelse(rn.node);
199 done_lh(lh);
200 move_lh(lh, &rn);
201 return 1;
205 * next unit is unit of extent item. Return without chaning @lh and
206 * @coord.
208 assert("vs-6", item_is_extent(&next));
209 zrelse(rn.node);
210 done_lh(&rn);
211 return 0;
215 * rd_key - calculate key of an item next to the given one
216 * @coord: position in a node
217 * @key: storage for result key
219 * @coord is set between items or after the last item in a node. Calculate key
220 * of item to the right of @coord.
222 static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
224 coord_t dup;
226 assert("nikita-2281", coord_is_between_items(coord));
227 coord_dup(&dup, coord);
229 if (coord_set_to_right(&dup) == 0)
230 /* next item is in this node. Return its key. */
231 unit_key_by_coord(&dup, key);
232 else {
234 * next item either does not exist or is in right
235 * neighbor. Return znode's right delimiting key.
237 read_lock_dk(current_tree);
238 *key = *znode_get_rd_key(coord->node);
239 read_unlock_dk(current_tree);
241 return key;
245 * add_empty_leaf - insert empty leaf between two extents
246 * @insert_coord: position in twig node between two extents
247 * @lh: twig node lock handle
248 * @key: left delimiting key of new node
249 * @rdkey: right delimiting key of new node
251 * Inserts empty leaf node between two extent items. It is necessary when we
252 * have to insert an item on leaf level between two extents (items on the twig
253 * level).
255 static int
256 add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
257 const reiser4_key *key, const reiser4_key *rdkey)
259 int result;
260 carry_pool *pool;
261 carry_level *todo;
262 reiser4_item_data *item;
263 carry_insert_data *cdata;
264 carry_op *op;
265 znode *node;
266 reiser4_tree *tree;
268 assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
269 tree = znode_get_tree(insert_coord->node);
270 node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
271 if (IS_ERR(node))
272 return PTR_ERR(node);
274 /* setup delimiting keys for node being inserted */
275 write_lock_dk(tree);
276 znode_set_ld_key(node, key);
277 znode_set_rd_key(node, rdkey);
278 ON_DEBUG(node->creator = current);
279 ON_DEBUG(node->first_key = *key);
280 write_unlock_dk(tree);
282 ZF_SET(node, JNODE_ORPHAN);
285 * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
286 * carry_insert_data
288 pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
289 sizeof(*item) + sizeof(*cdata));
290 if (IS_ERR(pool))
291 return PTR_ERR(pool);
292 todo = (carry_level *) (pool + 1);
293 init_carry_level(todo, pool);
295 item = (reiser4_item_data *) (todo + 3);
296 cdata = (carry_insert_data *) (item + 1);
298 op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
299 if (!IS_ERR(op)) {
300 cdata->coord = insert_coord;
301 cdata->key = key;
302 cdata->data = item;
303 op->u.insert.d = cdata;
304 op->u.insert.type = COPT_ITEM_DATA;
305 build_child_ptr_data(node, item);
306 item->arg = NULL;
307 /* have @insert_coord to be set at inserted item after
308 insertion is done */
309 todo->track_type = CARRY_TRACK_CHANGE;
310 todo->tracked = lh;
312 result = reiser4_carry(todo, NULL);
313 if (result == 0) {
315 * pin node in memory. This is necessary for
316 * znode_make_dirty() below.
318 result = zload(node);
319 if (result == 0) {
320 lock_handle local_lh;
323 * if we inserted new child into tree we have
324 * to mark it dirty so that flush will be able
325 * to process it.
327 init_lh(&local_lh);
328 result = longterm_lock_znode(&local_lh, node,
329 ZNODE_WRITE_LOCK,
330 ZNODE_LOCK_LOPRI);
331 if (result == 0) {
332 znode_make_dirty(node);
335 * when internal item pointing to @node
336 * was inserted into twig node
337 * create_hook_internal did not connect
338 * it properly because its right
339 * neighbor was not known. Do it
340 * here
342 write_lock_tree(tree);
343 assert("nikita-3312",
344 znode_is_right_connected(node));
345 assert("nikita-2984",
346 node->right == NULL);
347 ZF_CLR(node, JNODE_RIGHT_CONNECTED);
348 write_unlock_tree(tree);
349 result =
350 connect_znode(insert_coord, node);
351 ON_DEBUG(if (result == 0) check_dkeys(node););
353 done_lh(lh);
354 move_lh(lh, &local_lh);
355 assert("vs-1676", node_is_empty(node));
356 coord_init_first_unit(insert_coord,
357 node);
358 } else {
359 warning("nikita-3136",
360 "Cannot lock child");
362 done_lh(&local_lh);
363 zrelse(node);
366 } else
367 result = PTR_ERR(op);
368 zput(node);
369 done_carry_pool(pool);
370 return result;
374 * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
375 * @h: search handle
376 * @outcome: flag saying whether search has to restart or is done
378 * Handles search on twig level. If this function completes search itself then
379 * it returns 1. If search has to go one level down then 0 is returned. If
380 * error happens then LOOKUP_DONE is returned via @outcome and error code is saved
381 * in @h->result.
383 int handle_eottl(cbk_handle *h, int *outcome)
385 int result;
386 reiser4_key key;
387 coord_t *coord;
389 coord = h->coord;
391 if (h->level != TWIG_LEVEL ||
392 (coord_is_existing_item(coord) && item_is_internal(coord))) {
393 /* Continue to traverse tree downward. */
394 return 0;
398 * make sure that @h->coord is set to twig node and that it is either
399 * set to extent item or after extent item
401 assert("vs-356", h->level == TWIG_LEVEL);
402 assert("vs-357", ( {
403 coord_t lcoord;
404 coord_dup(&lcoord, coord);
405 check_me("vs-733", coord_set_to_left(&lcoord) == 0);
406 item_is_extent(&lcoord);
410 if (*outcome == NS_FOUND) {
411 /* we have found desired key on twig level in extent item */
412 h->result = CBK_COORD_FOUND;
413 *outcome = LOOKUP_DONE;
414 return 1;
417 if (!(h->flags & CBK_FOR_INSERT)) {
418 /* tree traversal is not for insertion. Just return
419 CBK_COORD_NOTFOUND. */
420 h->result = CBK_COORD_NOTFOUND;
421 *outcome = LOOKUP_DONE;
422 return 1;
425 /* take a look at the item to the right of h -> coord */
426 result = is_next_item_internal(coord, h->key, h->active_lh);
427 if (unlikely(result < 0)) {
428 h->error = "get_right_neighbor failed";
429 h->result = result;
430 *outcome = LOOKUP_DONE;
431 return 1;
433 if (result == 0) {
435 * item to the right is also an extent one. Allocate a new node
436 * and insert pointer to it after item h -> coord.
438 * This is a result of extents being located at the twig
439 * level. For explanation, see comment just above
440 * is_next_item_internal().
442 znode *loaded;
444 if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
446 * we got node read locked, restart coord_by_key to
447 * have write lock on twig level
449 h->lock_level = TWIG_LEVEL;
450 h->lock_mode = ZNODE_WRITE_LOCK;
451 *outcome = LOOKUP_REST;
452 return 1;
455 loaded = coord->node;
456 result =
457 add_empty_leaf(coord, h->active_lh, h->key,
458 rd_key(coord, &key));
459 if (result) {
460 h->error = "could not add empty leaf";
461 h->result = result;
462 *outcome = LOOKUP_DONE;
463 return 1;
465 /* added empty leaf is locked (h->active_lh), its parent node
466 is unlocked, h->coord is set as EMPTY */
467 assert("vs-13", coord->between == EMPTY_NODE);
468 assert("vs-14", znode_is_write_locked(coord->node));
469 assert("vs-15",
470 WITH_DATA(coord->node, node_is_empty(coord->node)));
471 assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
472 assert("vs-17", coord->node == h->active_lh->node);
473 *outcome = LOOKUP_DONE;
474 h->result = CBK_COORD_NOTFOUND;
475 return 1;
476 } else if (result == 1) {
478 * this is special case mentioned in the comment on
479 * tree.h:cbk_flags. We have found internal item immediately on
480 * the right of extent, and we are going to insert new item
481 * there. Key of item we are going to insert is smaller than
482 * leftmost key in the node pointed to by said internal item
483 * (otherwise search wouldn't come to the extent in the first
484 * place).
486 * This is a result of extents being located at the twig
487 * level. For explanation, see comment just above
488 * is_next_item_internal().
490 h->flags &= ~CBK_TRUST_DK;
491 } else {
492 assert("vs-8", result == 2);
493 *outcome = LOOKUP_REST;
494 return 1;
496 assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
497 return 0;
501 * Local variables:
502 * c-indentation-style: "K&R"
503 * mode-name: "LC"
504 * c-basic-offset: 8
505 * tab-width: 8
506 * fill-column: 120
507 * scroll-step: 1
508 * End: