fs/reiser4/tree.c
1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
2 * reiser4/README */
4 /*
5 * KEYS IN A TREE.
7 * The tree consists of nodes located on the disk. Node in the tree is either
8 * formatted or unformatted. Formatted node is one that has structure
9 * understood by the tree balancing and traversal code. Formatted nodes are
10 * further classified into leaf and internal nodes. The latter distinction is
11 * (almost) of only historical importance: the general structure of leaves and
12 * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
13 * that are part of bodies of ordinary files and attributes.
15 * Each node in the tree spans some interval in the key space. Key ranges for
16 * all nodes in the tree are disjoint. Actually, this only holds in some weak
17 * sense, because of the non-unique keys: intersection of key ranges for
18 * different nodes is either empty, or consists of exactly one key.
20 * Formatted node consists of a sequence of items. Each item spans some
21 * interval in the key space. Key ranges for all items in a tree are disjoint,
22 * modulo non-unique keys again. Items within a node are kept in the order of
23 * their smallest keys.
25 * A particular type of item can be further split into units. A unit is a piece
26 * of an item that can be cut from it and moved into another item of the same
27 * type. Units are used by the balancing code to repack data during balancing.
29 * Unit can be further split into smaller entities (for example, extent unit
30 * represents several pages, and it is natural for extent code to operate on
31 * particular pages and even bytes within one unit), but this is of no
32 * relevance to the generic balancing and lookup code.
34 * Although an item is said to "span" a range or interval of keys, it is not
35 * necessary that item contains piece of data addressable by each and every
36 * key in this range. For example, compound directory item, consisting of
37 * units corresponding to directory entries and keyed by hashes of file names,
38 * looks more like having a "discrete spectrum": only some disjoint keys inside
39 * range occupied by this item really address data.
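 *
 * For instance (numbers purely illustrative, not taken from the code): such an
 * item may span the key interval [1000, 2000] while containing only units
 * keyed by the hashes 1311 and 1729; every other key in that interval
 * addresses no data at all.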
41 * Nonetheless, each item always has a well-defined least (minimal) key, which
42 * is recorded in item header, stored in the node this item is in. Also, item
43 * plugin can optionally define method ->max_key_inside() returning maximal
44 * key that can _possibly_ be located within this item. This method is used
45 * (mainly) to determine when given piece of data should be merged into
46 * existing item, instead of creating a new one. Because of this, even though
47 * ->max_key_inside() can be larger than any key actually located in the item,
48 * intervals
50 * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
52 * are still disjoint for all items within the _same_ node.
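 *
 * Stated as a pseudo-C sketch, reusing the notation of the interval above
 * (keylt() is a real key-comparison helper used later in this file; the
 * quantification over items is informal, not real code):
 *
 *     for each pair of items a, b of one node, a lying to the left of b:
 *             assert(keylt(max_key_inside(a), reiser4_min_key(b)));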
54 * In memory node is represented by znode. It plays several roles:
56 * . something locks are taken on
58 * . something tracked by transaction manager (this is going to change)
60 * . something used to access node data
62 * . something used to maintain tree structure in memory: sibling and
63 * parental linkage.
65 * . something used to organize nodes into "slums"
67 * For more on znodes see znode.[ch]; a rough sketch follows.
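 *
 * Very roughly, and only for orientation (a hand-written sketch; the real
 * definition lives in znode.h and has more fields):
 *
 *     struct znode {
 *             zlock          lock;        // something locks are taken on
 *             ...                         // transaction manager bookkeeping
 *             ...                         // access to loaded node data
 *             znode          *left;       // sibling linkage
 *             znode          *right;
 *             parent_coord_t in_parent;   // cached position in the parent
 *     };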
69 * DELIMITING KEYS
71 * To simplify balancing, allow some flexibility in locking and speed up
72 * important coord cache optimization, we keep delimiting keys of nodes in
73 * memory. Depending on disk format (implemented by appropriate node plugin)
74 * node on disk can record both left and right delimiting key, only one of
75 * them, or none. Still, our balancing and tree traversal code keeps both
76 * delimiting keys for each in-memory node, stored in its znode. When
77 * node is first brought into memory during tree traversal, its left
78 * delimiting key is taken from its parent, and its right delimiting key is
79 * either next key in its parent, or is right delimiting key of parent if
80 * node is the rightmost child of parent.
82 * Physical consistency of delimiting key is protected by special dk
83 * read-write lock. That is, delimiting keys can only be inspected or
84 * modified under this lock. But dk lock is only sufficient for fast
85 * "pessimistic" check, because to simplify code and to decrease lock
86 * contention, balancing (carry) only updates delimiting keys right before
87 * unlocking all locked nodes on the given tree level. For example,
88 * coord-by-key cache scans LRU list of recently accessed znodes. For each
89 * node it first does fast check under dk spin lock. If key looked for is
90 * not between delimiting keys for this node, next node is inspected and so
91 * on. If key is inside of the key range, long term lock is taken on node
92 * and key range is rechecked.
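 *
 * A rough pseudo-C sketch of that two-step check (keyle()/keylt(),
 * read_lock_dk() and znode_get_ld_key()/znode_get_rd_key() are real helpers
 * used later in this file; the LRU scan and error handling are omitted):
 *
 *     read_lock_dk(tree);
 *     maybe = keyle(znode_get_ld_key(node), key) &&
 *             keylt(key, znode_get_rd_key(node));
 *     read_unlock_dk(tree);
 *     if (maybe) {
 *             take long term lock on node;
 *             recheck the delimiting keys under that lock;
 *     }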
94 * COORDINATES
96 * To find something in the tree, you supply a key, and the key is resolved
97 * by coord_by_key() into a coord (coordinate) that is valid as long as the
98 * node the coord points to remains locked. As mentioned above trees
99 * consist of nodes that consist of items that consist of units. A unit is
100 * the smallest and indivisible piece of tree as far as balancing and tree
101 * search are concerned. Each node, item, and unit can be addressed by
102 * giving its level in the tree and the key occupied by this entity. A node
103 * knows what the key ranges are of the items within it, and how to find its
104 * items and invoke their item handlers, but it does not know how to access
105 * individual units within its items except through the item handlers.
106 * coord is a structure containing a pointer to the node, the ordinal number
107 * of the item within this node (a sort of item offset), and the ordinal
108 * number of the unit within this item.
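 *
 * Schematically (simplified; the real coord_t in coord.h also records whether
 * the coord is at, before or after a unit, the "between" position used further
 * down in this file):
 *
 *     struct coord {
 *             znode   *node;       // node this coordinate points into
 *             unsigned item_pos;   // ordinal number of item within node
 *             unsigned unit_pos;   // ordinal number of unit within item
 *     };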
110 * TREE LOOKUP
112 * There are two types of access to the tree: lookup and modification.
114 * Lookup is a search for the key in the tree. Search can look for either
115 * exactly the key given to it, or for the largest key that is not greater
116 * than the key given to it. This distinction is determined by "bias"
117 * parameter of search routine (coord_by_key()). coord_by_key() either
118 * returns error (key is not in the tree, or some kind of external error
119 * occurred), or successfully resolves key into coord.
121 * This resolution is done by traversing tree top-to-bottom from root level
122 * to the desired level. On levels above twig level (level one above the
123 * leaf level) nodes consist exclusively of internal items. Internal item is
124 * nothing more than pointer to the tree node on the child level. On twig
125 * level nodes consist of internal items intermixed with extent
126 * items. Internal items form normal search tree structure used by traversal
127 * to descend through the tree.
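 *
 * One descent step looks roughly like this (pseudo-code only; the real logic
 * is in coord_by_key() and its helpers in search.c, which also handle locking,
 * seals and error cases):
 *
 *     while (znode_get_level(node) > stop_level) {
 *             node->nplug->lookup(node, key, bias, &coord);
 *             item_plugin_by_coord(&coord)->s.internal.down_link(&coord, NULL,
 *                                                                &addr);
 *             node = child znode for block address addr (zget() + zload());
 *     }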
129 * TREE LOOKUP OPTIMIZATIONS
131 * Tree lookup described above is expensive even if all nodes traversed are
132 * already in memory: a binary search has to be performed within each node,
133 * and binary searches are CPU consuming and tend to destroy CPU
134 * caches.
136 * Several optimizations are used to work around this:
138 * . cbk_cache (look-aside cache for tree traversals, see search.c for
139 * details)
141 * . seals (see seal.[ch])
143 * . vroot (see search.c)
145 * General search-by-key is layered thusly:
147 * [check seal, if any] --ok--> done
149 * failed
152 * [vroot defined] --no--> node = tree_root
153 * | |
154 * yes |
155 * | |
156 * V |
157 * node = vroot |
158 * | |
159 * | |
160 * | |
161 * V V
162 * [check cbk_cache for key] --ok--> done
164 * failed
167 * [start tree traversal from node]
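 *
 * The same layering in rough pseudo-C (illustrative only; the seal, vroot and
 * cbk_cache machinery lives in seal.c and search.c):
 *
 *     if (a seal was supplied and it is still valid)
 *             return the sealed coord;
 *     node = (vroot is defined) ? vroot : tree root;
 *     if (cbk_cache lookup of the key, starting from node, succeeds)
 *             return the cached coord;
 *     return result of full tree traversal starting from node;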
171 #include "forward.h"
172 #include "debug.h"
173 #include "dformat.h"
174 #include "key.h"
175 #include "coord.h"
176 #include "plugin/item/static_stat.h"
177 #include "plugin/item/item.h"
178 #include "plugin/node/node.h"
179 #include "plugin/plugin.h"
180 #include "txnmgr.h"
181 #include "jnode.h"
182 #include "znode.h"
183 #include "block_alloc.h"
184 #include "tree_walk.h"
185 #include "carry.h"
186 #include "carry_ops.h"
187 #include "tap.h"
188 #include "tree.h"
189 #include "vfs_ops.h"
190 #include "page_cache.h"
191 #include "super.h"
192 #include "reiser4.h"
193 #include "inode.h"
195 #include <linux/fs.h> /* for struct super_block */
196 #include <linux/spinlock.h>
198 /* Disk address (block number) never ever used for any real tree node. This is
199 used as block number of "uber" znode.
201 Invalid block addresses are 0 by tradition.
204 const reiser4_block_nr UBER_TREE_ADDR = 0ull;
206 #define CUT_TREE_MIN_ITERATIONS 64
208 static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
210 /* return node plugin of coord->node */
211 node_plugin *node_plugin_by_coord(const coord_t * coord)
213 assert("vs-1", coord != NULL);
214 assert("vs-2", coord->node != NULL);
216 return coord->node->nplug;
219 /* insert item into tree. Fields of @coord are updated so that they can be
220 * used by a subsequent insert operation. */
221 insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
222 * into */ ,
223 const reiser4_key * key /* key of new item */ ,
224 reiser4_item_data * data /* parameters for item
225 * creation */ ,
226 coord_t * coord /* resulting insertion coord */ ,
227 lock_handle * lh /* resulting lock
228 * handle */ ,
229 tree_level stop_level /** level where to insert */ ,
230 __u32 flags /* insertion flags */ )
232 int result;
234 assert("nikita-358", tree != NULL);
235 assert("nikita-360", coord != NULL);
237 result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
238 FIND_EXACT, stop_level, stop_level,
239 flags | CBK_FOR_INSERT, NULL /*ra_info */ );
240 switch (result) {
241 default:
242 break;
243 case CBK_COORD_FOUND:
244 result = IBK_ALREADY_EXISTS;
245 break;
246 case CBK_COORD_NOTFOUND:
247 assert("nikita-2017", coord->node != NULL);
248 result = insert_by_coord(coord, data, key, lh, 0 /*flags */ );
249 break;
251 return result;
254 /* insert item by calling carry. Helper function called if short-cut
255 insertion failed */
256 static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */
257 lock_handle * lh, /* lock handle of insertion
258 * node */
259 reiser4_item_data * data, /* parameters of new
260 * item */
261 const reiser4_key * key, /* key of new item */
262 carry_opcode cop, /* carry operation to perform */
263 cop_insert_flag flags
264 /* carry flags */ )
266 int result;
267 carry_pool *pool;
268 carry_level *lowest_level;
269 carry_insert_data *cdata;
270 carry_op *op;
272 assert("umka-314", coord != NULL);
274 /* allocate carry_pool and 3 carry_level-s */
275 pool =
276 init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
277 sizeof(*cdata));
278 if (IS_ERR(pool))
279 return PTR_ERR(pool);
280 lowest_level = (carry_level *) (pool + 1);
281 init_carry_level(lowest_level, pool);
283 op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
284 if (IS_ERR(op) || (op == NULL)) {
285 done_carry_pool(pool);
286 return RETERR(op ? PTR_ERR(op) : -EIO);
288 cdata = (carry_insert_data *) (lowest_level + 3);
289 cdata->coord = coord;
290 cdata->data = data;
291 cdata->key = key;
292 op->u.insert.d = cdata;
293 if (flags == 0)
294 flags = znode_get_tree(coord->node)->carry.insert_flags;
295 op->u.insert.flags = flags;
296 op->u.insert.type = COPT_ITEM_DATA;
297 op->u.insert.child = NULL;
298 if (lh != NULL) {
299 assert("nikita-3245", lh->node == coord->node);
300 lowest_level->track_type = CARRY_TRACK_CHANGE;
301 lowest_level->tracked = lh;
304 result = reiser4_carry(lowest_level, NULL);
305 done_carry_pool(pool);
307 return result;
310 /* form carry queue to perform paste of @data with @key at @coord, and launch
311 its execution by calling carry().
313 Instruct carry to update @lh if, after balancing, the insertion coord moves
314 into a different block.
317 static int paste_with_carry(coord_t * coord, /* coord of paste */
318 lock_handle * lh, /* lock handle of node
319 * where item is
320 * pasted */
321 reiser4_item_data * data, /* parameters of new
322 * item */
323 const reiser4_key * key, /* key of new item */
324 unsigned flags /* paste flags */ )
326 int result;
327 carry_pool *pool;
328 carry_level *lowest_level;
329 carry_insert_data *cdata;
330 carry_op *op;
332 assert("umka-315", coord != NULL);
333 assert("umka-316", key != NULL);
335 pool =
336 init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
337 sizeof(*cdata));
338 if (IS_ERR(pool))
339 return PTR_ERR(pool);
340 lowest_level = (carry_level *) (pool + 1);
341 init_carry_level(lowest_level, pool);
343 op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
344 if (IS_ERR(op) || (op == NULL)) {
345 done_carry_pool(pool);
346 return RETERR(op ? PTR_ERR(op) : -EIO);
348 cdata = (carry_insert_data *) (lowest_level + 3);
349 cdata->coord = coord;
350 cdata->data = data;
351 cdata->key = key;
352 op->u.paste.d = cdata;
353 if (flags == 0)
354 flags = znode_get_tree(coord->node)->carry.paste_flags;
355 op->u.paste.flags = flags;
356 op->u.paste.type = COPT_ITEM_DATA;
357 if (lh != NULL) {
358 lowest_level->track_type = CARRY_TRACK_CHANGE;
359 lowest_level->tracked = lh;
362 result = reiser4_carry(lowest_level, NULL);
363 done_carry_pool(pool);
365 return result;
368 /* insert item at the given coord.
370 First try to skip carry by directly calling the ->create_item() method of the
371 node plugin. If this is impossible (there is not enough free space in the node,
372 or the leftmost item in the node is being created), call insert_with_carry_by_coord()
373 that will do full carry().
376 insert_result insert_by_coord(coord_t * coord /* coord where to
377 * insert. coord->node has
378 * to be write locked by
379 * caller */ ,
380 reiser4_item_data * data /* data to be
381 * inserted */ ,
382 const reiser4_key * key /* key of new item */ ,
383 lock_handle * lh /* lock handle of write
384 * lock on node */ ,
385 __u32 flags /* insertion flags */ )
387 unsigned item_size;
388 int result;
389 znode *node;
391 assert("vs-247", coord != NULL);
392 assert("vs-248", data != NULL);
393 assert("vs-249", data->length >= 0);
394 assert("nikita-1191", znode_is_write_locked(coord->node));
396 node = coord->node;
397 coord_clear_iplug(coord);
398 result = zload(node);
399 if (result != 0)
400 return result;
402 item_size = space_needed(node, NULL, data, 1);
403 if (item_size > znode_free_space(node) &&
404 (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
405 && (flags & COPI_DONT_ALLOCATE)) {
406 /* we are forced to use free space of coord->node and new item
407 does not fit into it.
409 Currently we get here only when we allocate and copy units
410 of extent item from a node to its left neighbor during
411 "squalloc"-ing. If @node (this is left neighbor) does not
412 have enough free space - we do not want to attempt any
413 shifting and allocations because we are in squeezing and
414 everything to the left of @node is tightly packed.
416 result = -E_NODE_FULL;
417 } else if ((item_size <= znode_free_space(node)) &&
418 !coord_is_before_leftmost(coord) &&
419 (node_plugin_by_node(node)->fast_insert != NULL)
420 && node_plugin_by_node(node)->fast_insert(coord)) {
421 /* shortcut insertion without carry() overhead.
423 Only possible if:
425 - there is enough free space
427 - insertion is not into the leftmost position in a node
428 (otherwise it would require updating of delimiting key in a
429 parent)
431 - node plugin agrees with this
434 result =
435 node_plugin_by_node(node)->create_item(coord, key, data,
436 NULL);
437 znode_make_dirty(node);
438 } else {
439 /* otherwise do full-fledged carry(). */
440 result =
441 insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
442 flags);
444 zrelse(node);
445 return result;
448 /* @coord is set to leaf level and @data is to be inserted to twig level */
449 insert_result
450 insert_extent_by_coord(coord_t *
451 coord
452 /* coord where to insert. coord->node has to be write locked by caller */
454 reiser4_item_data * data /* data to be inserted */ ,
455 const reiser4_key * key /* key of new item */ ,
456 lock_handle *
457 lh /* lock handle of write lock on node */ )
459 assert("vs-405", coord != NULL);
460 assert("vs-406", data != NULL);
461 assert("vs-407", data->length > 0);
462 assert("vs-408", znode_is_write_locked(coord->node));
463 assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
465 return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
466 0 /*flags */ );
469 /* Insert into the item at the given coord.
471 First try to skip carry by directly calling ->paste() method of item
472 plugin. If this is impossible (there is not enough free space in the node,
473 or we are pasting into leftmost position in the node), call
474 paste_with_carry() that will do full carry().
477 /* paste_into_item */
478 int insert_into_item(coord_t * coord /* coord of pasting */ ,
479 lock_handle * lh /* lock handle on node involved */ ,
480 const reiser4_key * key /* key of unit being pasted */ ,
481 reiser4_item_data * data /* parameters for new unit */ ,
482 unsigned flags /* insert/paste flags */ )
484 int result;
485 int size_change;
486 node_plugin *nplug;
487 item_plugin *iplug;
489 assert("umka-317", coord != NULL);
490 assert("umka-318", key != NULL);
492 iplug = item_plugin_by_coord(coord);
493 nplug = node_plugin_by_coord(coord);
495 assert("nikita-1480", iplug == data->iplug);
497 size_change = space_needed(coord->node, coord, data, 0);
498 if (size_change > (int)znode_free_space(coord->node) &&
499 (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
500 && (flags & COPI_DONT_ALLOCATE)) {
501 /* we are forced to use free space of coord->node and new data
502 does not fit into it. */
503 return -E_NODE_FULL;
506 /* shortcut paste without carry() overhead.
508 Only possible if:
510 - there is enough free space
512 - paste is not into the leftmost unit in a node (otherwise
513 it would require updating of delimiting key in a parent)
515 - node plugin agrees with this
517 - item plugin agrees with us
519 if (size_change <= (int)znode_free_space(coord->node) &&
520 (coord->item_pos != 0 ||
521 coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
522 coord->unit_pos != 0 && nplug->fast_paste != NULL &&
523 nplug->fast_paste(coord) &&
524 iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
525 if (size_change > 0)
526 nplug->change_item_size(coord, size_change);
527 /* NOTE-NIKITA: huh? where @key is used? */
528 result = iplug->b.paste(coord, data, NULL);
529 if (size_change < 0)
530 nplug->change_item_size(coord, size_change);
531 znode_make_dirty(coord->node);
532 } else
533 /* otherwise do full-fledged carry(). */
534 result = paste_with_carry(coord, lh, data, key, flags);
535 return result;
538 /* this either appends or truncates item @coord */
539 int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
540 reiser4_item_data * data /* parameters of resize */ ,
541 reiser4_key * key /* key of new unit */ ,
542 lock_handle * lh /* lock handle of node
543 * being modified */ ,
544 cop_insert_flag flags /* carry flags */ )
546 int result;
547 znode *node;
549 assert("nikita-362", coord != NULL);
550 assert("nikita-363", data != NULL);
551 assert("vs-245", data->length != 0);
553 node = coord->node;
554 coord_clear_iplug(coord);
555 result = zload(node);
556 if (result != 0)
557 return result;
559 if (data->length < 0)
560 result = node_plugin_by_coord(coord)->shrink_item(coord,
561 -data->length);
562 else
563 result = insert_into_item(coord, lh, key, data, flags);
565 zrelse(node);
566 return result;
569 /* insert flow @f */
570 int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
572 int result;
573 carry_pool *pool;
574 carry_level *lowest_level;
575 reiser4_item_data *data;
576 carry_op *op;
578 pool =
579 init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
580 sizeof(*data));
581 if (IS_ERR(pool))
582 return PTR_ERR(pool);
583 lowest_level = (carry_level *) (pool + 1);
584 init_carry_level(lowest_level, pool);
586 op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
587 0 /* operate directly on coord -> node */ );
588 if (IS_ERR(op) || (op == NULL)) {
589 done_carry_pool(pool);
590 return RETERR(op ? PTR_ERR(op) : -EIO);
593 /* these are permanent during insert_flow */
594 data = (reiser4_item_data *) (lowest_level + 3);
595 data->user = 1;
596 data->iplug = item_plugin_by_id(FORMATTING_ID);
597 data->arg = NULL;
598 /* data.length and data.data will be set before calling paste or
599 insert */
600 data->length = 0;
601 data->data = NULL;
603 op->u.insert_flow.flags = 0;
604 op->u.insert_flow.insert_point = coord;
605 op->u.insert_flow.flow = f;
606 op->u.insert_flow.data = data;
607 op->u.insert_flow.new_nodes = 0;
609 lowest_level->track_type = CARRY_TRACK_CHANGE;
610 lowest_level->tracked = lh;
612 result = reiser4_carry(lowest_level, NULL);
613 done_carry_pool(pool);
615 return result;
618 /* Given a coord in parent node, obtain a znode for the corresponding child */
619 znode *child_znode(const coord_t * parent_coord /* coord of pointer to
620 * child */ ,
621 znode * parent /* parent of child */ ,
622 int incore_p /* if !0 only return child if already in
623 * memory */ ,
624 int setup_dkeys_p /* if !0 update delimiting keys of
625 * child */ )
627 znode *child;
629 assert("nikita-1374", parent_coord != NULL);
630 assert("nikita-1482", parent != NULL);
631 #if REISER4_DEBUG
632 if (setup_dkeys_p)
633 assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
634 #endif
635 assert("nikita-2947", znode_is_any_locked(parent));
637 if (znode_get_level(parent) <= LEAF_LEVEL) {
638 /* trying to get child of leaf node */
639 warning("nikita-1217", "Child of maize?");
640 return ERR_PTR(RETERR(-EIO));
642 if (item_is_internal(parent_coord)) {
643 reiser4_block_nr addr;
644 item_plugin *iplug;
645 reiser4_tree *tree;
647 iplug = item_plugin_by_coord(parent_coord);
648 assert("vs-512", iplug->s.internal.down_link);
649 iplug->s.internal.down_link(parent_coord, NULL, &addr);
651 tree = znode_get_tree(parent);
652 if (incore_p)
653 child = zlook(tree, &addr);
654 else
655 child =
656 zget(tree, &addr, parent,
657 znode_get_level(parent) - 1,
658 reiser4_ctx_gfp_mask_get());
659 if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
660 set_child_delimiting_keys(parent, parent_coord, child);
661 } else {
662 warning("nikita-1483", "Internal item expected");
663 child = ERR_PTR(RETERR(-EIO));
665 return child;
668 /* remove znode from transaction */
669 static void uncapture_znode(znode * node)
671 struct page *page;
673 assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
675 if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
676 int ret;
678 /* An already allocated block goes right to the atom's delete set. */
679 ret =
680 reiser4_dealloc_block(znode_get_block(node), 0,
681 BA_DEFER | BA_FORMATTED);
682 if (ret)
683 warning("zam-942",
684 "can\'t add a block (%llu) number to atom's delete set\n",
685 (unsigned long long)(*znode_get_block(node)));
687 spin_lock_znode(node);
688 /* Here we return flush reserved block which was reserved at the
689 * moment when this allocated node was marked dirty and still
690 * not used by flush in node relocation procedure. */
691 if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
692 txn_atom *atom;
694 atom = jnode_get_atom(ZJNODE(node));
695 assert("zam-939", atom != NULL);
696 spin_unlock_znode(node);
697 flush_reserved2grabbed(atom, (__u64) 1);
698 spin_unlock_atom(atom);
699 } else
700 spin_unlock_znode(node);
701 } else {
702 /* znode has assigned block which is counted as "fake
703 allocated". Return it back to "free blocks". */
704 fake_allocated2free((__u64) 1, BA_FORMATTED);
708 * uncapture page from transaction. There is a possibility of a race
709 * with ->releasepage(): reiser4_releasepage() detaches page from this
710 * jnode and we have nothing to uncapture. To avoid this, get
711 * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
712 * will deal with released page itself.
714 spin_lock_znode(node);
715 page = znode_page(node);
716 if (likely(page != NULL)) {
718 * reiser4_uncapture_page() can only be called when we are sure
719 * that znode is pinned in memory, which we are, because
720 * forget_znode() is only called from longterm_unlock_znode().
722 page_cache_get(page);
723 spin_unlock_znode(node);
724 lock_page(page);
725 reiser4_uncapture_page(page);
726 unlock_page(page);
727 page_cache_release(page);
728 } else {
729 txn_atom *atom;
731 /* handle "flush queued" znodes */
732 while (1) {
733 atom = jnode_get_atom(ZJNODE(node));
734 assert("zam-943", atom != NULL);
736 if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
737 || !atom->nr_running_queues)
738 break;
740 spin_unlock_znode(node);
741 reiser4_atom_wait_event(atom);
742 spin_lock_znode(node);
745 reiser4_uncapture_block(ZJNODE(node));
746 spin_unlock_atom(atom);
747 zput(node);
751 /* This is called from longterm_unlock_znode() when last lock is released from
752 the node that has been removed from the tree. At this point node is removed
753 from sibling list and its lock is invalidated. */
754 void forget_znode(lock_handle * handle)
756 znode *node;
757 reiser4_tree *tree;
759 assert("umka-319", handle != NULL);
761 node = handle->node;
762 tree = znode_get_tree(node);
764 assert("vs-164", znode_is_write_locked(node));
765 assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
766 assert_rw_locked(&(node->lock.guard));
768 /* We assume that this node was detached from its parent before
769 * unlocking, so there is no way to reach this node from the parent through a
770 * down link. The node should have no children and, thereby, can't be
771 * reached from them by their parent pointers. The only way to obtain a
772 * reference to the node is to use sibling pointers from its left and
773 * right neighbors. In the next several lines we remove the node from
774 * the sibling list. */
776 write_lock_tree(tree);
777 sibling_list_remove(node);
778 znode_remove(node, tree);
779 write_unlock_tree(tree);
781 /* Here we set JNODE_DYING and cancel all pending lock requests. It
782 * forces all lock requestor threads to repeat iterations of getting
783 * lock on a child, neighbor or parent node. But, those threads can't
784 * come to this node again, because this node is no longer a child,
785 * neighbor or parent of any other node. This order of znode
786 * invalidation does not allow other threads to waste cpu time in a busy
787 * loop, trying to lock dying object. The exception is in the flush
788 * code when we take node directly from atom's capture list.*/
789 reiser4_invalidate_lock(handle);
790 uncapture_znode(node);
793 /* Check that internal item at @pointer really contains pointer to @child. */
794 int check_tree_pointer(const coord_t * pointer /* would-be pointer to
795 * @child */ ,
796 const znode * child /* child znode */ )
798 assert("nikita-1016", pointer != NULL);
799 assert("nikita-1017", child != NULL);
800 assert("nikita-1018", pointer->node != NULL);
802 assert("nikita-1325", znode_is_any_locked(pointer->node));
804 assert("nikita-2985",
805 znode_get_level(pointer->node) == znode_get_level(child) + 1);
807 coord_clear_iplug((coord_t *) pointer);
809 if (coord_is_existing_unit(pointer)) {
810 item_plugin *iplug;
811 reiser4_block_nr addr;
813 if (item_is_internal(pointer)) {
814 iplug = item_plugin_by_coord(pointer);
815 assert("vs-513", iplug->s.internal.down_link);
816 iplug->s.internal.down_link(pointer, NULL, &addr);
817 /* check that cached value is correct */
818 if (disk_addr_eq(&addr, znode_get_block(child))) {
819 return NS_FOUND;
823 /* warning ("jmacd-1002", "tree pointer incorrect"); */
824 return NS_NOT_FOUND;
827 /* find coord of pointer to new @child in @parent.
829 Find the &coord_t in the @parent where the pointer to a given @child will
830 be.
833 int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
834 znode *
835 child UNUSED_ARG /* child znode, passed locked */ ,
836 znode * left /* left brother of new node */ ,
837 coord_t * result /* where result is stored in */ )
839 int ret;
841 assert("nikita-1486", parent != NULL);
842 assert("nikita-1487", child != NULL);
843 assert("nikita-1488", result != NULL);
845 ret = find_child_ptr(parent, left, result);
846 if (ret != NS_FOUND) {
847 warning("nikita-1489", "Cannot find brother position: %i", ret);
848 return RETERR(-EIO);
849 } else {
850 result->between = AFTER_UNIT;
851 return RETERR(NS_NOT_FOUND);
855 /* find coord of pointer to @child in @parent.
857 Find the &coord_t in the @parent where pointer to a given @child is in.
860 int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
861 znode * child /* child znode, passed locked */ ,
862 coord_t * result /* where result is stored in */ )
864 int lookup_res;
865 node_plugin *nplug;
866 /* left delimiting key of a child */
867 reiser4_key ld;
868 reiser4_tree *tree;
870 assert("nikita-934", parent != NULL);
871 assert("nikita-935", child != NULL);
872 assert("nikita-936", result != NULL);
873 assert("zam-356", znode_is_loaded(parent));
875 coord_init_zero(result);
876 result->node = parent;
878 nplug = parent->nplug;
879 assert("nikita-939", nplug != NULL);
881 tree = znode_get_tree(parent);
882 /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
883 * not aliased to ->in_parent of some znode. Otherwise,
884 * parent_coord_to_coord() below would modify data protected by tree
885 * lock. */
886 read_lock_tree(tree);
887 /* fast path. Try to use cached value. Lock tree to keep
888 node->pos_in_parent and pos->*_blocknr consistent. */
889 if (child->in_parent.item_pos + 1 != 0) {
890 parent_coord_to_coord(&child->in_parent, result);
891 if (check_tree_pointer(result, child) == NS_FOUND) {
892 read_unlock_tree(tree);
893 return NS_FOUND;
896 child->in_parent.item_pos = (unsigned short)~0;
898 read_unlock_tree(tree);
900 /* If the above failed, find some key from @child. We are looking for the
901 least key in a child. */
902 read_lock_dk(tree);
903 ld = *znode_get_ld_key(child);
904 read_unlock_dk(tree);
906 * now, lookup parent with key just found. Note, that left delimiting
907 * key doesn't identify node uniquely, because (in extremely rare
908 * case) two nodes can have equal left delimiting keys, if one of them
909 * is completely filled with directory entries that all happen to be
910 * hash collisions. But we check the block number in check_tree_pointer()
911 * and, so, are safe.
913 lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
914 /* update cached pos_in_node */
915 if (lookup_res == NS_FOUND) {
916 write_lock_tree(tree);
917 coord_to_parent_coord(result, &child->in_parent);
918 write_unlock_tree(tree);
919 lookup_res = check_tree_pointer(result, child);
921 if (lookup_res == NS_NOT_FOUND)
922 lookup_res = find_child_by_addr(parent, child, result);
923 return lookup_res;
926 /* find coord of pointer to @child in @parent by scanning
928 Find the &coord_t in the @parent where pointer to a given @child
929 is in by scanning all internal items in @parent and comparing block
930 numbers in them with that of @child.
933 static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
934 znode * child /* child znode, passed locked */ ,
935 coord_t * result /* where result is stored in */ )
937 int ret;
939 assert("nikita-1320", parent != NULL);
940 assert("nikita-1321", child != NULL);
941 assert("nikita-1322", result != NULL);
943 ret = NS_NOT_FOUND;
945 for_all_units(result, parent) {
946 if (check_tree_pointer(result, child) == NS_FOUND) {
947 write_lock_tree(znode_get_tree(parent));
948 coord_to_parent_coord(result, &child->in_parent);
949 write_unlock_tree(znode_get_tree(parent));
950 ret = NS_FOUND;
951 break;
954 return ret;
957 /* true, if @addr is "unallocated block number", which is just address, with
958 highest bit set. */
959 int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
960 * check */ )
962 assert("nikita-1766", addr != NULL);
963 cassert(sizeof(reiser4_block_nr) == 8);
964 return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
965 REISER4_UNALLOCATED_STATUS_VALUE;
968 /* returns true if removing bytes of given range of key [from_key, to_key]
969 causes removing of whole item @from */
970 static int
971 item_removed_completely(coord_t * from, const reiser4_key * from_key,
972 const reiser4_key * to_key)
974 item_plugin *iplug;
975 reiser4_key key_in_item;
977 assert("umka-325", from != NULL);
978 assert("", item_is_extent(from));
980 /* check first key just in case */
981 item_key_by_coord(from, &key_in_item);
982 if (keygt(from_key, &key_in_item))
983 return 0;
985 /* check last key */
986 iplug = item_plugin_by_coord(from);
987 assert("vs-611", iplug && iplug->s.file.append_key);
989 iplug->s.file.append_key(from, &key_in_item);
990 set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
992 if (keylt(to_key, &key_in_item))
993 /* last byte is not removed */
994 return 0;
995 return 1;
998 /* helper function for prepare_twig_kill(): @left and @right are formatted
999 * neighbors of extent item being completely removed. Load and lock neighbors
1000 * and store lock handles into @kdata for later use by kill_hook_extent() */
1001 static int
1002 prepare_children(znode * left, znode * right, carry_kill_data * kdata)
1004 int result;
1005 int left_loaded;
1006 int right_loaded;
1008 result = 0;
1009 left_loaded = right_loaded = 0;
1011 if (left != NULL) {
1012 result = zload(left);
1013 if (result == 0) {
1014 left_loaded = 1;
1015 result = longterm_lock_znode(kdata->left, left,
1016 ZNODE_READ_LOCK,
1017 ZNODE_LOCK_LOPRI);
1020 if (result == 0 && right != NULL) {
1021 result = zload(right);
1022 if (result == 0) {
1023 right_loaded = 1;
1024 result = longterm_lock_znode(kdata->right, right,
1025 ZNODE_READ_LOCK,
1026 ZNODE_LOCK_HIPRI |
1027 ZNODE_LOCK_NONBLOCK);
1030 if (result != 0) {
1031 done_lh(kdata->left);
1032 done_lh(kdata->right);
1033 if (left_loaded != 0)
1034 zrelse(left);
1035 if (right_loaded != 0)
1036 zrelse(right);
1038 return result;
1041 static void done_children(carry_kill_data * kdata)
1043 if (kdata->left != NULL && kdata->left->node != NULL) {
1044 zrelse(kdata->left->node);
1045 done_lh(kdata->left);
1047 if (kdata->right != NULL && kdata->right->node != NULL) {
1048 zrelse(kdata->right->node);
1049 done_lh(kdata->right);
1053 /* part of cut_node. It is called when cut_node is called to remove or cut part
1054 of an extent item. When the head of that item is removed - we have to update the right
1055 delimiting key of the left neighbor of the extent. When the item is removed completely - we
1056 have to set sibling link between left and right neighbor of removed
1057 extent. This may return -E_DEADLOCK because of trying to get left neighbor
1058 locked. So, the caller should repeat the attempt.
1060 /* Audited by: umka (2002.06.16) */
1061 static int
1062 prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
1064 int result;
1065 reiser4_key key;
1066 lock_handle left_lh;
1067 lock_handle right_lh;
1068 coord_t left_coord;
1069 coord_t *from;
1070 znode *left_child;
1071 znode *right_child;
1072 reiser4_tree *tree;
1073 int left_zloaded_here, right_zloaded_here;
1075 from = kdata->params.from;
1076 assert("umka-326", from != NULL);
1077 assert("umka-327", kdata->params.to != NULL);
1079 /* for one extent item only yet */
1080 assert("vs-591", item_is_extent(from));
1081 assert("vs-592", from->item_pos == kdata->params.to->item_pos);
1083 if ((kdata->params.from_key
1084 && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
1085 || from->unit_pos != 0) {
1086 /* head of item @from is not removed, there is nothing to
1087 worry about */
1088 return 0;
1091 result = 0;
1092 left_zloaded_here = 0;
1093 right_zloaded_here = 0;
1095 left_child = right_child = NULL;
1097 coord_dup(&left_coord, from);
1098 init_lh(&left_lh);
1099 init_lh(&right_lh);
1100 if (coord_prev_unit(&left_coord)) {
1101 /* @from is leftmost item in its node */
1102 if (!locked_left_neighbor) {
1103 result =
1104 reiser4_get_left_neighbor(&left_lh, from->node,
1105 ZNODE_READ_LOCK,
1106 GN_CAN_USE_UPPER_LEVELS);
1107 switch (result) {
1108 case 0:
1109 break;
1110 case -E_NO_NEIGHBOR:
1111 /* there is no formatted node to the left of
1112 from->node */
1113 warning("vs-605",
1114 "extent item has smallest key in "
1115 "the tree and it is about to be removed");
1116 return 0;
1117 case -E_DEADLOCK:
1118 /* need to restart */
1119 default:
1120 return result;
1123 /* we have acquired left neighbor of from->node */
1124 result = zload(left_lh.node);
1125 if (result)
1126 goto done;
1128 locked_left_neighbor = left_lh.node;
1129 } else {
1130 /* squalloc_right_twig_cut should have supplied locked
1131 * left neighbor */
1132 assert("vs-834",
1133 znode_is_write_locked(locked_left_neighbor));
1134 result = zload(locked_left_neighbor);
1135 if (result)
1136 return result;
1139 left_zloaded_here = 1;
1140 coord_init_last_unit(&left_coord, locked_left_neighbor);
1143 if (!item_is_internal(&left_coord)) {
1144 /* what else but extent can be on twig level */
1145 assert("vs-606", item_is_extent(&left_coord));
1147 /* there is no left formatted child */
1148 if (left_zloaded_here)
1149 zrelse(locked_left_neighbor);
1150 done_lh(&left_lh);
1151 return 0;
1154 tree = znode_get_tree(left_coord.node);
1155 left_child = child_znode(&left_coord, left_coord.node, 1, 0);
1157 if (IS_ERR(left_child)) {
1158 result = PTR_ERR(left_child);
1159 goto done;
1162 /* left child is acquired, calculate new right delimiting key for it
1163 and get right child if it is necessary */
1164 if (item_removed_completely
1165 (from, kdata->params.from_key, kdata->params.to_key)) {
1166 /* try to get right child of removed item */
1167 coord_t right_coord;
1169 assert("vs-607",
1170 kdata->params.to->unit_pos ==
1171 coord_last_unit_pos(kdata->params.to));
1172 coord_dup(&right_coord, kdata->params.to);
1173 if (coord_next_unit(&right_coord)) {
1174 /* @to is rightmost unit in the node */
1175 result =
1176 reiser4_get_right_neighbor(&right_lh, from->node,
1177 ZNODE_READ_LOCK,
1178 GN_CAN_USE_UPPER_LEVELS);
1179 switch (result) {
1180 case 0:
1181 result = zload(right_lh.node);
1182 if (result)
1183 goto done;
1185 right_zloaded_here = 1;
1186 coord_init_first_unit(&right_coord,
1187 right_lh.node);
1188 item_key_by_coord(&right_coord, &key);
1189 break;
1191 case -E_NO_NEIGHBOR:
1192 /* there is no formatted node to the right of
1193 from->node */
1194 read_lock_dk(tree);
1195 key = *znode_get_rd_key(from->node);
1196 read_unlock_dk(tree);
1197 right_coord.node = NULL;
1198 result = 0;
1199 break;
1200 default:
1201 /* real error */
1202 goto done;
1204 } else {
1205 /* there is an item to the right of @from - take its key */
1206 item_key_by_coord(&right_coord, &key);
1209 /* try to get right child of @from */
1210 if (right_coord.node && /* there is right neighbor of @from */
1211 item_is_internal(&right_coord)) { /* it is internal item */
1212 right_child = child_znode(&right_coord,
1213 right_coord.node, 1, 0);
1215 if (IS_ERR(right_child)) {
1216 result = PTR_ERR(right_child);
1217 goto done;
1221 /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
1222 update of right delimiting key of left_child */
1223 result = prepare_children(left_child, right_child, kdata);
1224 } else {
1225 /* head of item @to is removed. left_child has to get right delimiting key update. Prepare it for that */
1226 result = prepare_children(left_child, NULL, kdata);
1229 done:
1230 if (right_child)
1231 zput(right_child);
1232 if (right_zloaded_here)
1233 zrelse(right_lh.node);
1234 done_lh(&right_lh);
1236 if (left_child)
1237 zput(left_child);
1238 if (left_zloaded_here)
1239 zrelse(locked_left_neighbor);
1240 done_lh(&left_lh);
1241 return result;
1244 /* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
1245 are to be cut completely */
1246 /* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
1247 int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
1248 const reiser4_key * to_key, /* last key to be removed */
1249 reiser4_key *
1250 smallest_removed /* smallest key actually removed */ )
1252 int result;
1253 carry_pool *pool;
1254 carry_level *lowest_level;
1255 carry_cut_data *cut_data;
1256 carry_op *op;
1258 assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
1260 pool =
1261 init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
1262 sizeof(*cut_data));
1263 if (IS_ERR(pool))
1264 return PTR_ERR(pool);
1265 lowest_level = (carry_level *) (pool + 1);
1266 init_carry_level(lowest_level, pool);
1268 op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
1269 assert("vs-1509", op != 0);
1270 if (IS_ERR(op)) {
1271 done_carry_pool(pool);
1272 return PTR_ERR(op);
1275 cut_data = (carry_cut_data *) (lowest_level + 3);
1276 cut_data->params.from = from;
1277 cut_data->params.to = to;
1278 cut_data->params.from_key = from_key;
1279 cut_data->params.to_key = to_key;
1280 cut_data->params.smallest_removed = smallest_removed;
1282 op->u.cut_or_kill.is_cut = 1;
1283 op->u.cut_or_kill.u.cut = cut_data;
1285 result = reiser4_carry(lowest_level, NULL);
1286 done_carry_pool(pool);
1288 return result;
1291 /* cut part of the node
1293 Cut part or whole content of node.
1295 cut data between @from and @to of @from->node and call carry() to make
1296 corresponding changes in the tree. @from->node may become empty. If so -
1297 pointer to it will be removed. Neighboring nodes are not changed. Smallest
1298 removed key is stored in @smallest_removed
1301 int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
1302 coord_t * to, /* coord of the last unit/item that will be eliminated */
1303 const reiser4_key * from_key, /* first key to be removed */
1304 const reiser4_key * to_key, /* last key to be removed */
1305 reiser4_key * smallest_removed, /* smallest key actually removed */
1306 znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
1307 * locked (in squalloc_right_twig_cut, namely) */
1308 struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
1309 invalidate pages together with item pointing to them */
1310 int truncate)
1311 { /* this call is made for file truncate */
1312 int result;
1313 carry_pool *pool;
1314 carry_level *lowest_level;
1315 carry_kill_data *kdata;
1316 lock_handle *left_child;
1317 lock_handle *right_child;
1318 carry_op *op;
1320 assert("umka-328", from != NULL);
1321 assert("vs-316", !node_is_empty(from->node));
1322 assert("nikita-1812", coord_is_existing_unit(from)
1323 && coord_is_existing_unit(to));
1325 /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
1326 pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
1327 sizeof(carry_kill_data) +
1328 2 * sizeof(lock_handle) +
1329 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
1330 if (IS_ERR(pool))
1331 return PTR_ERR(pool);
1333 lowest_level = (carry_level *) (pool + 1);
1334 init_carry_level(lowest_level, pool);
1336 kdata = (carry_kill_data *) (lowest_level + 3);
1337 left_child = (lock_handle *) (kdata + 1);
1338 right_child = left_child + 1;
1340 init_lh(left_child);
1341 init_lh(right_child);
1343 kdata->params.from = from;
1344 kdata->params.to = to;
1345 kdata->params.from_key = from_key;
1346 kdata->params.to_key = to_key;
1347 kdata->params.smallest_removed = smallest_removed;
1348 kdata->params.truncate = truncate;
1349 kdata->flags = 0;
1350 kdata->inode = inode;
1351 kdata->left = left_child;
1352 kdata->right = right_child;
1353 /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
1354 kdata->buf = (char *)(right_child + 1);
1356 if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
1357 /* left child of extent item may have to get updated right
1358 delimiting key and to get linked with right child of extent
1359 @from if it will be removed completely */
1360 result = prepare_twig_kill(kdata, locked_left_neighbor);
1361 if (result) {
1362 done_children(kdata);
1363 done_carry_pool(pool);
1364 return result;
1368 op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
1369 if (IS_ERR(op) || (op == NULL)) {
1370 done_children(kdata);
1371 done_carry_pool(pool);
1372 return RETERR(op ? PTR_ERR(op) : -EIO);
1375 op->u.cut_or_kill.is_cut = 0;
1376 op->u.cut_or_kill.u.kill = kdata;
1378 result = reiser4_carry(lowest_level, NULL);
1380 done_children(kdata);
1381 done_carry_pool(pool);
1382 return result;
1385 void
1386 fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
1388 if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
1389 pgoff_t start_pg, end_pg;
1391 start_pg = start >> PAGE_CACHE_SHIFT;
1392 end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
1394 if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
1396 * kill up to the page boundary.
1398 assert("vs-123456", start_pg == end_pg);
1399 reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
1400 truncate);
1401 } else if (start_pg != end_pg) {
1403 * page boundary is within killed portion of node.
1405 assert("vs-654321", end_pg - start_pg == 1);
1406 reiser4_invalidate_pages(inode->i_mapping, end_pg,
1407 end_pg - start_pg, 1);
1410 inode_sub_bytes(inode, end - start);
1414 * Delete whole @node from the reiser4 tree without loading it.
1416 * @left: locked left neighbor,
1417 * @node: node to be deleted,
1418 * @smallest_removed: leftmost key of deleted node,
1419 * @object: inode pointer, if we truncate a file body.
1420 * @truncate: true if called for file truncate.
1422 * @return: 0 if success, error code otherwise.
1424 * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
1425 * contains the right value of the smallest removed key from the previous
1426 * cut_worker() iteration. This is needed for proper accounting of
1427 * "i_blocks" and "i_bytes" fields of the @object.
1429 int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
1430 struct inode *object, int truncate)
1432 lock_handle parent_lock;
1433 coord_t cut_from;
1434 coord_t cut_to;
1435 reiser4_tree *tree;
1436 int ret;
1438 assert("zam-937", node != NULL);
1439 assert("zam-933", znode_is_write_locked(node));
1440 assert("zam-999", smallest_removed != NULL);
1442 init_lh(&parent_lock);
1444 ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
1445 if (ret)
1446 return ret;
1448 assert("zam-934", !znode_above_root(parent_lock.node));
1450 ret = zload(parent_lock.node);
1451 if (ret)
1452 goto failed_nozrelse;
1454 ret = find_child_ptr(parent_lock.node, node, &cut_from);
1455 if (ret)
1456 goto failed;
1458 /* decrement child counter and set parent pointer to NULL before
1459 deleting the list from parent node because of checks in
1460 internal_kill_item_hook (we can delete the last item from the parent
1461 node, the parent node is going to be deleted and its c_count should
1462 be zero). */
1464 tree = znode_get_tree(node);
1465 write_lock_tree(tree);
1466 init_parent_coord(&node->in_parent, NULL);
1467 --parent_lock.node->c_count;
1468 write_unlock_tree(tree);
1470 assert("zam-989", item_is_internal(&cut_from));
1472 /* @node should be deleted after unlocking. */
1473 ZF_SET(node, JNODE_HEARD_BANSHEE);
1475 /* remove a pointer from the parent node to the node being deleted. */
1476 coord_dup(&cut_to, &cut_from);
1477 /* FIXME: shouldn't this be kill_node_content */
1478 ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
1479 if (ret)
1480 /* FIXME(Zam): Should we re-connect the node to its parent if
1481 * cut_node fails? */
1482 goto failed;
1485 reiser4_tree *tree = current_tree;
1486 __u64 start_offset = 0, end_offset = 0;
1488 read_lock_tree(tree);
1489 write_lock_dk(tree);
1490 if (object) {
1491 /* We use @smallest_removed and the left delimiting key of
1492 * the current node for @object->i_blocks, i_bytes
1493 * calculation. We assume that the items after the
1494 * *@smallest_removed key have been deleted from the
1495 * file body. */
1496 start_offset = get_key_offset(znode_get_ld_key(node));
1497 end_offset = get_key_offset(smallest_removed);
1500 assert("zam-1021", znode_is_connected(node));
1501 if (node->left)
1502 znode_set_rd_key(node->left, znode_get_rd_key(node));
1504 *smallest_removed = *znode_get_ld_key(node);
1506 write_unlock_dk(tree);
1507 read_unlock_tree(tree);
1509 if (object) {
1510 /* we used to perform actions which are to be performed on items on their removal from the tree in a
1511 special item method - kill_hook. Here, for optimization reasons, we avoid reading the node
1512 containing the item we remove and cannot call the item's kill hook. Instead we call a function which
1513 does exactly the same things as the tail kill hook, on the assumption that the node we avoid reading
1514 contains only one item and that item is a tail one. */
1515 fake_kill_hook_tail(object, start_offset, end_offset,
1516 truncate);
1519 failed:
1520 zrelse(parent_lock.node);
1521 failed_nozrelse:
1522 done_lh(&parent_lock);
1524 return ret;
1527 static int can_delete(const reiser4_key *key, znode *node)
1529 int result;
1531 read_lock_dk(current_tree);
1532 result = keyle(key, znode_get_ld_key(node));
1533 read_unlock_dk(current_tree);
1534 return result;
1538 * This subroutine is not optimal, but its implementation seems to
1539 * be easier.
1541 * @tap: the point deletion process begins from,
1542 * @from_key: the beginning of the deleted key range,
1543 * @to_key: the end of the deleted key range,
1544 * @smallest_removed: the smallest removed key,
1545 * @truncate: true if called for file truncate.
1546 * @progress: return true if a progress in file items deletions was made,
1547 * @smallest_removed value is actual in that case.
1549 * @return: 0 if success, error code otherwise, -E_REPEAT means that long
1550 * reiser4_cut_tree operation was interrupted for allowing atom commit.
1553 cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
1554 const reiser4_key * to_key,
1555 reiser4_key * smallest_removed, struct inode *object,
1556 int truncate, int *progress)
1558 lock_handle next_node_lock;
1559 coord_t left_coord;
1560 int result;
1562 assert("zam-931", tap->coord->node != NULL);
1563 assert("zam-932", znode_is_write_locked(tap->coord->node));
1565 *progress = 0;
1566 init_lh(&next_node_lock);
1568 while (1) {
1569 znode *node; /* node from which items are cut */
1570 node_plugin *nplug; /* node plugin for @node */
1572 node = tap->coord->node;
1574 /* Move next_node_lock to the next node on the left. */
1575 result =
1576 reiser4_get_left_neighbor(&next_node_lock, node,
1577 ZNODE_WRITE_LOCK,
1578 GN_CAN_USE_UPPER_LEVELS);
1579 if (result != 0 && result != -E_NO_NEIGHBOR)
1580 break;
1581 /* Check whether we can delete the node as a whole. */
1582 if (*progress && znode_get_level(node) == LEAF_LEVEL &&
1583 can_delete(from_key, node)) {
1584 result = reiser4_delete_node(node, smallest_removed,
1585 object, truncate);
1586 } else {
1587 result = reiser4_tap_load(tap);
1588 if (result)
1589 return result;
1591 /* Prepare the second (right) point for cut_node() */
1592 if (*progress)
1593 coord_init_last_unit(tap->coord, node);
1595 else if (item_plugin_by_coord(tap->coord)->b.lookup ==
1596 NULL)
1597 /* set rightmost unit for the items without lookup method */
1598 tap->coord->unit_pos =
1599 coord_last_unit_pos(tap->coord);
1601 nplug = node->nplug;
1603 assert("vs-686", nplug);
1604 assert("vs-687", nplug->lookup);
1606 /* left_coord is leftmost unit cut from @node */
1607 result = nplug->lookup(node, from_key,
1608 FIND_MAX_NOT_MORE_THAN,
1609 &left_coord);
1611 if (IS_CBKERR(result))
1612 break;
1614 /* adjust coordinates so that they are set to existing units */
1615 if (coord_set_to_right(&left_coord)
1616 || coord_set_to_left(tap->coord)) {
1617 result = 0;
1618 break;
1621 if (coord_compare(&left_coord, tap->coord) ==
1622 COORD_CMP_ON_RIGHT) {
1623 /* keys from @from_key to @to_key are not in the tree */
1624 result = 0;
1625 break;
1628 if (left_coord.item_pos != tap->coord->item_pos) {
1629 /* do not allow cutting more than one item. This was added to solve the problem of truncating
1630 partially converted files. If a file is partially converted there may exist a twig node
1631 containing both internal items (pointing to leaf nodes with formatting items)
1632 and an extent item. We do not want to kill internal items at the twig node here
1633 because cut_tree_worker assumes killing them from the leaf level */
1634 coord_dup(&left_coord, tap->coord);
1635 assert("vs-1652",
1636 coord_is_existing_unit(&left_coord));
1637 left_coord.unit_pos = 0;
1640 /* cut data from one node */
1641 // *smallest_removed = *reiser4_min_key();
1642 result =
1643 kill_node_content(&left_coord, tap->coord, from_key,
1644 to_key, smallest_removed,
1645 next_node_lock.node, object,
1646 truncate);
1647 reiser4_tap_relse(tap);
1649 if (result)
1650 break;
1652 ++(*progress);
1654 /* Check whether all items with keys >= from_key were removed
1655 * from the tree. */
1656 if (keyle(smallest_removed, from_key))
1657 /* result = 0; */
1658 break;
1660 if (next_node_lock.node == NULL)
1661 break;
1663 result = reiser4_tap_move(tap, &next_node_lock);
1664 done_lh(&next_node_lock);
1665 if (result)
1666 break;
1668 /* Break long reiser4_cut_tree operation (deletion of a large
1669 file) if atom requires commit. */
1670 if (*progress > CUT_TREE_MIN_ITERATIONS
1671 && current_atom_should_commit()) {
1672 result = -E_REPEAT;
1673 break;
1676 done_lh(&next_node_lock);
1677 // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key()));
1678 return result;
1681 /* there is a fundamental problem with optimizing deletes: VFS does it
1682 one file at a time. Another problem is that if an item can be
1683 anything, then deleting items must be done one at a time. It just
1684 seems clean to write this to specify a from and a to key, and cut
1685 everything between them though. */
1687 /* use this function with care if deleting more than what is part of a single file. */
1688 /* do not use this when cutting a single item, it is suboptimal for that */
1690 /* You are encouraged to write plugin specific versions of this. It
1691 cannot be optimal for all plugins because it works item at a time,
1692 and some plugins could sometimes work node at a time. Regular files
1693 however are not optimizable to work node at a time because of
1694 extents needing to free the blocks they point to.
1696 Optimizations compared to v3 code:
1698 It does not balance (that task is left to memory pressure code).
1700 Nodes are deleted only if empty.
1702 Uses extents.
1704 Performs read-ahead of formatted nodes whose contents are part of
1705 the deletion.
1709 * Delete everything from the reiser4 tree between two keys: @from_key and
1710 * @to_key.
1712 * @from_key: the beginning of the deleted key range,
1713 * @to_key: the end of the deleted key range,
1714 * @smallest_removed: the smallest removed key,
1715 * @object: owner of cutting items.
1716 * @truncate: true if called for file truncate.
1717 * @progress: return true if a progress in file items deletions was made,
1718 * @smallest_removed value is actual in that case.
1720 * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
1721 * operation was interrupted for allowing atom commit.
1724 int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
1725 const reiser4_key * to_key,
1726 reiser4_key * smallest_removed_p,
1727 struct inode *object, int truncate, int *progress)
1729 lock_handle lock;
1730 int result;
1731 tap_t tap;
1732 coord_t right_coord;
1733 reiser4_key smallest_removed;
1734 int (*cut_tree_worker) (tap_t *, const reiser4_key *,
1735 const reiser4_key *, reiser4_key *,
1736 struct inode *, int, int *);
1737 STORE_COUNTERS;
1739 assert("umka-329", tree != NULL);
1740 assert("umka-330", from_key != NULL);
1741 assert("umka-331", to_key != NULL);
1742 assert("zam-936", keyle(from_key, to_key));
1744 if (smallest_removed_p == NULL)
1745 smallest_removed_p = &smallest_removed;
1747 init_lh(&lock);
1749 do {
1750 /* Find rightmost item to cut away from the tree. */
1751 result = reiser4_object_lookup(object, to_key, &right_coord,
1752 &lock, ZNODE_WRITE_LOCK,
1753 FIND_MAX_NOT_MORE_THAN,
1754 TWIG_LEVEL, LEAF_LEVEL,
1755 CBK_UNIQUE, NULL /*ra_info */);
1756 if (result != CBK_COORD_FOUND)
1757 break;
1758 if (object == NULL
1759 || inode_file_plugin(object)->cut_tree_worker == NULL)
1760 cut_tree_worker = cut_tree_worker_common;
1761 else
1762 cut_tree_worker =
1763 inode_file_plugin(object)->cut_tree_worker;
1764 reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
1765 result =
1766 cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
1767 object, truncate, progress);
1768 reiser4_tap_done(&tap);
1770 reiser4_preempt_point();
1772 } while (0);
1774 done_lh(&lock);
1776 if (result) {
1777 switch (result) {
1778 case -E_NO_NEIGHBOR:
1779 result = 0;
1780 break;
1781 case -E_DEADLOCK:
1782 result = -E_REPEAT;
1783 case -E_REPEAT:
1784 case -ENOMEM:
1785 case -ENOENT:
1786 break;
1787 default:
1788 warning("nikita-2861", "failure: %i", result);
1792 CHECK_COUNTERS;
1793 return result;
1796 /* repeat reiser4_cut_tree_object until everything is deleted.
1797 * Unlike cut_file_items, it does not end the current transaction if -E_REPEAT
1798 * is returned by reiser4_cut_tree_object. */
1799 int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
1800 const reiser4_key * to, struct inode *inode, int truncate)
1802 int result;
1803 int progress;
1805 do {
1806 result = reiser4_cut_tree_object(tree, from, to, NULL,
1807 inode, truncate, &progress);
1808 } while (result == -E_REPEAT);
1810 return result;
1813 /* finishing reiser4 initialization */
1814 int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being
1815 * initialized */ ,
1816 const reiser4_block_nr * root_block /* address of a root block
1817 * on a disk */ ,
1818 tree_level height /* height of a tree */ ,
1819 node_plugin * nplug /* default node plugin */ )
1821 int result;
1823 assert("nikita-306", tree != NULL);
1824 assert("nikita-307", root_block != NULL);
1825 assert("nikita-308", height > 0);
1826 assert("nikita-309", nplug != NULL);
1827 assert("zam-587", tree->super != NULL);
1829 tree->root_block = *root_block;
1830 tree->height = height;
1831 tree->estimate_one_insert = calc_estimate_one_insert(height);
1832 tree->nplug = nplug;
1834 tree->znode_epoch = 1ull;
1836 cbk_cache_init(&tree->cbk_cache);
1838 result = znodes_tree_init(tree);
1839 if (result == 0)
1840 result = jnodes_tree_init(tree);
1841 if (result == 0) {
1842 tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
1843 reiser4_ctx_gfp_mask_get());
1844 if (IS_ERR(tree->uber)) {
1845 result = PTR_ERR(tree->uber);
1846 tree->uber = NULL;
1849 return result;
1852 /* release resources associated with @tree */
1853 void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
1855 if (tree == NULL)
1856 return;
1858 if (tree->uber != NULL) {
1859 zput(tree->uber);
1860 tree->uber = NULL;
1862 znodes_tree_done(tree);
1863 jnodes_tree_done(tree);
1864 cbk_cache_done(&tree->cbk_cache);
1867 /* Make Linus happy.
1868 Local variables:
1869 c-indentation-style: "K&R"
1870 mode-name: "LC"
1871 c-basic-offset: 8
1872 tab-width: 8
1873 fill-column: 120
1874 scroll-step: 1
1875 End: