1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
7 * The tree consists of nodes located on the disk. Node in the tree is either
8 * formatted or unformatted. Formatted node is one that has structure
9 * understood by the tree balancing and traversal code. Formatted nodes are
10 * further classified into leaf and internal nodes. The latter distinction is
11 * (almost) of only historical importance: general structure of leaves and
12 * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
13 * that are part of bodies of ordinary files and attributes.
15 * Each node in the tree spans some interval in the key space. Key ranges for
16 * all nodes in the tree are disjoint. Actually, this only holds in some weak
17 * sense, because of the non-unique keys: intersection of key ranges for
18 * different nodes is either empty, or consists of exactly one key.
20 * Formatted node consists of a sequence of items. Each item spans some
21 * interval in key space. Key ranges for all items in a tree are disjoint,
22 * modulo non-unique keys again. Items within nodes are ordered in the key
23 * order of the smallest key in a item.
25 * Particular type of item can be further split into units. Unit is piece of
26 * item that can be cut from item and moved into another item of the same
27 * type. Units are used by balancing code to repack data during balancing.
29 * Unit can be further split into smaller entities (for example, extent unit
30 * represents several pages, and it is natural for extent code to operate on
31 * particular pages and even bytes within one unit), but this is of no
32 * relevance to the generic balancing and lookup code.
34 * Although item is said to "span" range or interval of keys, it is not
35 * necessary that item contains piece of data addressable by each and every
36 * key in this range. For example, compound directory item, consisting of
37 * units corresponding to directory entries and keyed by hashes of file names,
38 * looks more as having "discrete spectrum": only some disjoint keys inside
39 * range occupied by this item really address data.
41 * Nonetheless, each item always has well-defined least (minimal) key, that
42 * is recorded in item header, stored in the node this item is in. Also, item
43 * plugin can optionally define method ->max_key_inside() returning maximal
44 * key that can _possibly_ be located within this item. This method is used
45 * (mainly) to determine when given piece of data should be merged into
46 * existing item, instead of creating a new one. Because of this, even though
47 * ->max_key_inside() can be larger than any key actually located in the item,
50 * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
52 * are still disjoint for all items within the _same_ node.
54 * In memory node is represented by znode. It plays several roles:
56 * . something locks are taken on
58 * . something tracked by transaction manager (this is going to change)
60 * . something used to access node data
62 * . something used to maintain tree structure in memory: sibling and
65 * . something used to organize nodes into "slums"
67 * More on znodes see in znode.[ch]
71 * To simplify balancing, allow some flexibility in locking and speed up
72 * important coord cache optimization, we keep delimiting keys of nodes in
73 * memory. Depending on disk format (implemented by appropriate node plugin)
74 * node on disk can record both left and right delimiting key, only one of
75 * them, or none. Still, our balancing and tree traversal code keep both
76 * delimiting keys for a node that is in memory stored in the znode. When
77 * node is first brought into memory during tree traversal, its left
78 * delimiting key is taken from its parent, and its right delimiting key is
79 * either next key in its parent, or is right delimiting key of parent if
80 * node is the rightmost child of parent.
82 * Physical consistency of delimiting key is protected by special dk
83 * read-write lock. That is, delimiting keys can only be inspected or
84 * modified under this lock. But dk lock is only sufficient for fast
85 * "pessimistic" check, because to simplify code and to decrease lock
86 * contention, balancing (carry) only updates delimiting keys right before
87 * unlocking all locked nodes on the given tree level. For example,
88 * coord-by-key cache scans LRU list of recently accessed znodes. For each
89 * node it first does fast check under dk spin lock. If key looked for is
90 * not between delimiting keys for this node, next node is inspected and so
91 * on. If key is inside of the key range, long term lock is taken on node
92 * and key range is rechecked.
96 * To find something in the tree, you supply a key, and the key is resolved
97 * by coord_by_key() into a coord (coordinate) that is valid as long as the
98 * node the coord points to remains locked. As mentioned above trees
99 * consist of nodes that consist of items that consist of units. A unit is
100 * the smallest and indivisible piece of tree as far as balancing and tree
101 * search are concerned. Each node, item, and unit can be addressed by
102 * giving its level in the tree and the key occupied by this entity. A node
103 * knows what the key ranges are of the items within it, and how to find its
104 * items and invoke their item handlers, but it does not know how to access
105 * individual units within its items except through the item handlers.
106 * coord is a structure containing a pointer to the node, the ordinal number
107 * of the item within this node (a sort of item offset), and the ordinal
108 * number of the unit within this item.
112 * There are two types of access to the tree: lookup and modification.
114 * Lookup is a search for the key in the tree. Search can look for either
115 * exactly the key given to it, or for the largest key that is not greater
116 * than the key given to it. This distinction is determined by "bias"
117 * parameter of search routine (coord_by_key()). coord_by_key() either
118 * returns error (key is not in the tree, or some kind of external error
119 * occurred), or successfully resolves key into coord.
121 * This resolution is done by traversing tree top-to-bottom from root level
122 * to the desired level. On levels above twig level (level one above the
123 * leaf level) nodes consist exclusively of internal items. Internal item is
124 * nothing more than pointer to the tree node on the child level. On twig
125 * level nodes consist of internal items intermixed with extent
126 * items. Internal items form normal search tree structure used by traversal
127 * to descend through the tree.
129 * TREE LOOKUP OPTIMIZATIONS
131 * Tree lookup described above is expensive even if all nodes traversed are
132 * already in the memory: for each node binary search within it has to be
133 * performed and binary searches are CPU consuming and tend to destroy CPU
136 * Several optimizations are used to work around this:
138 * . cbk_cache (look-aside cache for tree traversals, see search.c for
141 * . seals (see seal.[ch])
143 * . vroot (see search.c)
145 * General search-by-key is layered thusly:
147 * [check seal, if any] --ok--> done
152 * [vroot defined] --no--> node = tree_root
162 * [check cbk_cache for key] --ok--> done
167 * [start tree traversal from node]
176 #include "plugin/item/static_stat.h"
177 #include "plugin/item/item.h"
178 #include "plugin/node/node.h"
179 #include "plugin/plugin.h"
183 #include "block_alloc.h"
184 #include "tree_walk.h"
186 #include "carry_ops.h"
190 #include "page_cache.h"
195 #include <linux/fs.h> /* for struct super_block */
196 #include <linux/spinlock.h>
198 /* Disk address (block number) never ever used for any real tree node. This is
199 used as block number of "uber" znode.
201 Invalid block addresses are 0 by tradition.
204 const reiser4_block_nr UBER_TREE_ADDR
= 0ull;
206 #define CUT_TREE_MIN_ITERATIONS 64
208 static int find_child_by_addr(znode
* parent
, znode
* child
, coord_t
*result
);
210 /* return node plugin of coord->node */
211 node_plugin
*node_plugin_by_coord(const coord_t
*coord
)
213 assert("vs-1", coord
!= NULL
);
214 assert("vs-2", coord
->node
!= NULL
);
216 return coord
->node
->nplug
;
219 /* insert item into tree. Fields of @coord are updated so that they can be
220 * used by consequent insert operation. */
221 insert_result
insert_by_key(reiser4_tree
* tree
/* tree to insert new item
223 const reiser4_key
* key
/* key of new item */ ,
224 reiser4_item_data
* data
/* parameters for item
226 coord_t
*coord
/* resulting insertion coord */ ,
227 lock_handle
* lh
/* resulting lock
229 tree_level stop_level
/* level where to insert */ ,
230 __u32 flags
/* insertion flags */)
234 assert("nikita-358", tree
!= NULL
);
235 assert("nikita-360", coord
!= NULL
);
237 result
= coord_by_key(tree
, key
, coord
, lh
, ZNODE_WRITE_LOCK
,
238 FIND_EXACT
, stop_level
, stop_level
,
239 flags
| CBK_FOR_INSERT
, NULL
/*ra_info */);
243 case CBK_COORD_FOUND
:
244 result
= IBK_ALREADY_EXISTS
;
246 case CBK_COORD_NOTFOUND
:
247 assert("nikita-2017", coord
->node
!= NULL
);
248 result
= insert_by_coord(coord
, data
, key
, lh
, 0/*flags */);
254 /* insert item by calling carry. Helper function called if short-cut
256 static insert_result
insert_with_carry_by_coord(coord_t
*coord
,
257 /* coord where to insert */
259 /* lock handle of insertion node */
260 reiser4_item_data
* data
,
261 /* parameters of new item */
262 const reiser4_key
* key
,
263 /* key of new item */
265 /* carry operation to perform */
266 cop_insert_flag flags
271 carry_level
*lowest_level
;
272 carry_insert_data
*cdata
;
275 assert("umka-314", coord
!= NULL
);
277 /* allocate carry_pool and 3 carry_level-s */
279 init_carry_pool(sizeof(*pool
) + 3 * sizeof(*lowest_level
) +
282 return PTR_ERR(pool
);
283 lowest_level
= (carry_level
*) (pool
+ 1);
284 init_carry_level(lowest_level
, pool
);
286 op
= reiser4_post_carry(lowest_level
, cop
, coord
->node
, 0);
287 if (IS_ERR(op
) || (op
== NULL
)) {
288 done_carry_pool(pool
);
289 return RETERR(op
? PTR_ERR(op
) : -EIO
);
291 cdata
= (carry_insert_data
*) (lowest_level
+ 3);
292 cdata
->coord
= coord
;
295 op
->u
.insert
.d
= cdata
;
297 flags
= znode_get_tree(coord
->node
)->carry
.insert_flags
;
298 op
->u
.insert
.flags
= flags
;
299 op
->u
.insert
.type
= COPT_ITEM_DATA
;
300 op
->u
.insert
.child
= NULL
;
302 assert("nikita-3245", lh
->node
== coord
->node
);
303 lowest_level
->track_type
= CARRY_TRACK_CHANGE
;
304 lowest_level
->tracked
= lh
;
307 result
= reiser4_carry(lowest_level
, NULL
);
308 done_carry_pool(pool
);
313 /* form carry queue to perform paste of @data with @key at @coord, and launch
314 its execution by calling carry().
316 Instruct carry to update @lh it after balancing insertion coord moves into
320 static int paste_with_carry(coord_t
*coord
, /* coord of paste */
321 lock_handle
* lh
, /* lock handle of node
324 reiser4_item_data
* data
, /* parameters of new
326 const reiser4_key
* key
, /* key of new item */
327 unsigned flags
/* paste flags */)
331 carry_level
*lowest_level
;
332 carry_insert_data
*cdata
;
335 assert("umka-315", coord
!= NULL
);
336 assert("umka-316", key
!= NULL
);
339 init_carry_pool(sizeof(*pool
) + 3 * sizeof(*lowest_level
) +
342 return PTR_ERR(pool
);
343 lowest_level
= (carry_level
*) (pool
+ 1);
344 init_carry_level(lowest_level
, pool
);
346 op
= reiser4_post_carry(lowest_level
, COP_PASTE
, coord
->node
, 0);
347 if (IS_ERR(op
) || (op
== NULL
)) {
348 done_carry_pool(pool
);
349 return RETERR(op
? PTR_ERR(op
) : -EIO
);
351 cdata
= (carry_insert_data
*) (lowest_level
+ 3);
352 cdata
->coord
= coord
;
355 op
->u
.paste
.d
= cdata
;
357 flags
= znode_get_tree(coord
->node
)->carry
.paste_flags
;
358 op
->u
.paste
.flags
= flags
;
359 op
->u
.paste
.type
= COPT_ITEM_DATA
;
361 lowest_level
->track_type
= CARRY_TRACK_CHANGE
;
362 lowest_level
->tracked
= lh
;
365 result
= reiser4_carry(lowest_level
, NULL
);
366 done_carry_pool(pool
);
371 /* insert item at the given coord.
373 First try to skip carry by directly calling ->create_item() method of node
374 plugin. If this is impossible (there is not enough free space in the node,
375 or leftmost item in the node is created), call insert_with_carry_by_coord()
376 that will do full carry().
379 insert_result
insert_by_coord(coord_t
*coord
/* coord where to
380 * insert. coord->node has
381 * to be write locked by
383 reiser4_item_data
* data
/* data to be
385 const reiser4_key
* key
/* key of new item */ ,
386 lock_handle
* lh
/* lock handle of write
388 __u32 flags
/* insertion flags */)
394 assert("vs-247", coord
!= NULL
);
395 assert("vs-248", data
!= NULL
);
396 assert("vs-249", data
->length
>= 0);
397 assert("nikita-1191", znode_is_write_locked(coord
->node
));
400 coord_clear_iplug(coord
);
401 result
= zload(node
);
405 item_size
= space_needed(node
, NULL
, data
, 1);
406 if (item_size
> znode_free_space(node
) &&
407 (flags
& COPI_DONT_SHIFT_LEFT
) && (flags
& COPI_DONT_SHIFT_RIGHT
)
408 && (flags
& COPI_DONT_ALLOCATE
)) {
409 /* we are forced to use free space of coord->node and new item
410 does not fit into it.
412 Currently we get here only when we allocate and copy units
413 of extent item from a node to its left neighbor during
414 "squalloc"-ing. If @node (this is left neighbor) does not
415 have enough free space - we do not want to attempt any
416 shifting and allocations because we are in squeezing and
417 everything to the left of @node is tightly packed.
419 result
= -E_NODE_FULL
;
420 } else if ((item_size
<= znode_free_space(node
)) &&
421 !coord_is_before_leftmost(coord
) &&
422 (node_plugin_by_node(node
)->fast_insert
!= NULL
)
423 && node_plugin_by_node(node
)->fast_insert(coord
)) {
424 /* shortcut insertion without carry() overhead.
428 - there is enough free space
430 - insertion is not into the leftmost position in a node
431 (otherwise it would require updating of delimiting key in a
434 - node plugin agrees with this
438 node_plugin_by_node(node
)->create_item(coord
, key
, data
,
440 znode_make_dirty(node
);
442 /* otherwise do full-fledged carry(). */
444 insert_with_carry_by_coord(coord
, lh
, data
, key
, COP_INSERT
,
451 /* @coord is set to leaf level and @data is to be inserted to twig level */
453 insert_extent_by_coord(coord_t
*coord
, /* coord where to insert.
454 * coord->node has to be write
455 * locked by caller */
456 reiser4_item_data
*data
,/* data to be inserted */
457 const reiser4_key
*key
, /* key of new item */
458 lock_handle
*lh
/* lock handle of write lock
461 assert("vs-405", coord
!= NULL
);
462 assert("vs-406", data
!= NULL
);
463 assert("vs-407", data
->length
> 0);
464 assert("vs-408", znode_is_write_locked(coord
->node
));
465 assert("vs-409", znode_get_level(coord
->node
) == LEAF_LEVEL
);
467 return insert_with_carry_by_coord(coord
, lh
, data
, key
, COP_EXTENT
,
471 /* Insert into the item at the given coord.
473 First try to skip carry by directly calling ->paste() method of item
474 plugin. If this is impossible (there is not enough free space in the node,
475 or we are pasting into leftmost position in the node), call
476 paste_with_carry() that will do full carry().
479 /* paste_into_item */
480 int insert_into_item(coord_t
* coord
/* coord of pasting */ ,
481 lock_handle
* lh
/* lock handle on node involved */ ,
482 const reiser4_key
* key
/* key of unit being pasted */ ,
483 reiser4_item_data
* data
/* parameters for new unit */ ,
484 unsigned flags
/* insert/paste flags */ )
491 assert("umka-317", coord
!= NULL
);
492 assert("umka-318", key
!= NULL
);
494 iplug
= item_plugin_by_coord(coord
);
495 nplug
= node_plugin_by_coord(coord
);
497 assert("nikita-1480", iplug
== data
->iplug
);
499 size_change
= space_needed(coord
->node
, coord
, data
, 0);
500 if (size_change
> (int)znode_free_space(coord
->node
) &&
501 (flags
& COPI_DONT_SHIFT_LEFT
) && (flags
& COPI_DONT_SHIFT_RIGHT
)
502 && (flags
& COPI_DONT_ALLOCATE
)) {
503 /* we are forced to use free space of coord->node and new data
504 does not fit into it. */
508 /* shortcut paste without carry() overhead.
512 - there is enough free space
514 - paste is not into the leftmost unit in a node (otherwise
515 it would require updating of delimiting key in a parent)
517 - node plugin agrees with this
519 - item plugin agrees with us
521 if (size_change
<= (int)znode_free_space(coord
->node
) &&
522 (coord
->item_pos
!= 0 ||
523 coord
->unit_pos
!= 0 || coord
->between
== AFTER_UNIT
) &&
524 coord
->unit_pos
!= 0 && nplug
->fast_paste
!= NULL
&&
525 nplug
->fast_paste(coord
) &&
526 iplug
->b
.fast_paste
!= NULL
&& iplug
->b
.fast_paste(coord
)) {
528 nplug
->change_item_size(coord
, size_change
);
529 /* NOTE-NIKITA: huh? where @key is used? */
530 result
= iplug
->b
.paste(coord
, data
, NULL
);
532 nplug
->change_item_size(coord
, size_change
);
533 znode_make_dirty(coord
->node
);
535 /* otherwise do full-fledged carry(). */
536 result
= paste_with_carry(coord
, lh
, data
, key
, flags
);
540 /* this either appends or truncates item @coord */
541 int reiser4_resize_item(coord_t
* coord
/* coord of item being resized */ ,
542 reiser4_item_data
* data
/* parameters of resize */ ,
543 reiser4_key
* key
/* key of new unit */ ,
544 lock_handle
* lh
/* lock handle of node
545 * being modified */ ,
546 cop_insert_flag flags
/* carry flags */ )
551 assert("nikita-362", coord
!= NULL
);
552 assert("nikita-363", data
!= NULL
);
553 assert("vs-245", data
->length
!= 0);
556 coord_clear_iplug(coord
);
557 result
= zload(node
);
561 if (data
->length
< 0)
562 result
= node_plugin_by_coord(coord
)->shrink_item(coord
,
565 result
= insert_into_item(coord
, lh
, key
, data
, flags
);
572 int reiser4_insert_flow(coord_t
* coord
, lock_handle
* lh
, flow_t
* f
)
576 carry_level
*lowest_level
;
577 reiser4_item_data
*data
;
581 init_carry_pool(sizeof(*pool
) + 3 * sizeof(*lowest_level
) +
584 return PTR_ERR(pool
);
585 lowest_level
= (carry_level
*) (pool
+ 1);
586 init_carry_level(lowest_level
, pool
);
588 op
= reiser4_post_carry(lowest_level
, COP_INSERT_FLOW
, coord
->node
,
589 0 /* operate directly on coord -> node */ );
590 if (IS_ERR(op
) || (op
== NULL
)) {
591 done_carry_pool(pool
);
592 return RETERR(op
? PTR_ERR(op
) : -EIO
);
595 /* these are permanent during insert_flow */
596 data
= (reiser4_item_data
*) (lowest_level
+ 3);
598 data
->iplug
= item_plugin_by_id(FORMATTING_ID
);
600 /* data.length and data.data will be set before calling paste or
605 op
->u
.insert_flow
.flags
= 0;
606 op
->u
.insert_flow
.insert_point
= coord
;
607 op
->u
.insert_flow
.flow
= f
;
608 op
->u
.insert_flow
.data
= data
;
609 op
->u
.insert_flow
.new_nodes
= 0;
611 lowest_level
->track_type
= CARRY_TRACK_CHANGE
;
612 lowest_level
->tracked
= lh
;
614 result
= reiser4_carry(lowest_level
, NULL
);
615 done_carry_pool(pool
);
620 /* Given a coord in parent node, obtain a znode for the corresponding child */
621 znode
*child_znode(const coord_t
* parent_coord
/* coord of pointer to
623 znode
* parent
/* parent of child */ ,
624 int incore_p
/* if !0 only return child if already in
626 int setup_dkeys_p
/* if !0 update delimiting keys of
631 assert("nikita-1374", parent_coord
!= NULL
);
632 assert("nikita-1482", parent
!= NULL
);
635 assert_rw_not_locked(&(znode_get_tree(parent
)->dk_lock
));
637 assert("nikita-2947", znode_is_any_locked(parent
));
639 if (znode_get_level(parent
) <= LEAF_LEVEL
) {
640 /* trying to get child of leaf node */
641 warning("nikita-1217", "Child of maize?");
642 return ERR_PTR(RETERR(-EIO
));
644 if (item_is_internal(parent_coord
)) {
645 reiser4_block_nr addr
;
649 iplug
= item_plugin_by_coord(parent_coord
);
650 assert("vs-512", iplug
->s
.internal
.down_link
);
651 iplug
->s
.internal
.down_link(parent_coord
, NULL
, &addr
);
653 tree
= znode_get_tree(parent
);
655 child
= zlook(tree
, &addr
);
658 zget(tree
, &addr
, parent
,
659 znode_get_level(parent
) - 1,
660 reiser4_ctx_gfp_mask_get());
661 if ((child
!= NULL
) && !IS_ERR(child
) && setup_dkeys_p
)
662 set_child_delimiting_keys(parent
, parent_coord
, child
);
664 warning("nikita-1483", "Internal item expected");
665 child
= ERR_PTR(RETERR(-EIO
));
670 /* remove znode from transaction */
671 static void uncapture_znode(znode
* node
)
675 assert("zam-1001", ZF_ISSET(node
, JNODE_HEARD_BANSHEE
));
677 if (!reiser4_blocknr_is_fake(znode_get_block(node
))) {
680 /* An already allocated block goes right to the atom's delete set. */
682 reiser4_dealloc_block(znode_get_block(node
), 0,
683 BA_DEFER
| BA_FORMATTED
);
686 "can\'t add a block (%llu) number to atom's delete set\n",
687 (unsigned long long)(*znode_get_block(node
)));
689 spin_lock_znode(node
);
690 /* Here we return flush reserved block which was reserved at the
691 * moment when this allocated node was marked dirty and still
692 * not used by flush in node relocation procedure. */
693 if (ZF_ISSET(node
, JNODE_FLUSH_RESERVED
)) {
696 atom
= jnode_get_atom(ZJNODE(node
));
697 assert("zam-939", atom
!= NULL
);
698 spin_unlock_znode(node
);
699 flush_reserved2grabbed(atom
, (__u64
) 1);
700 spin_unlock_atom(atom
);
702 spin_unlock_znode(node
);
704 /* znode has assigned block which is counted as "fake
705 allocated". Return it back to "free blocks") */
706 fake_allocated2free((__u64
) 1, BA_FORMATTED
);
710 * uncapture page from transaction. There is a possibility of a race
711 * with ->releasepage(): reiser4_releasepage() detaches page from this
712 * jnode and we have nothing to uncapture. To avoid this, get
713 * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
714 * will deal with released page itself.
716 spin_lock_znode(node
);
717 page
= znode_page(node
);
718 if (likely(page
!= NULL
)) {
720 * reiser4_uncapture_page() can only be called when we are sure
721 * that znode is pinned in memory, which we are, because
722 * forget_znode() is only called from longterm_unlock_znode().
724 page_cache_get(page
);
725 spin_unlock_znode(node
);
727 reiser4_uncapture_page(page
);
729 page_cache_release(page
);
733 /* handle "flush queued" znodes */
735 atom
= jnode_get_atom(ZJNODE(node
));
736 assert("zam-943", atom
!= NULL
);
738 if (!ZF_ISSET(node
, JNODE_FLUSH_QUEUED
)
739 || !atom
->nr_running_queues
)
742 spin_unlock_znode(node
);
743 reiser4_atom_wait_event(atom
);
744 spin_lock_znode(node
);
747 reiser4_uncapture_block(ZJNODE(node
));
748 spin_unlock_atom(atom
);
753 /* This is called from longterm_unlock_znode() when last lock is released from
754 the node that has been removed from the tree. At this point node is removed
755 from sibling list and its lock is invalidated. */
756 void forget_znode(lock_handle
* handle
)
761 assert("umka-319", handle
!= NULL
);
764 tree
= znode_get_tree(node
);
766 assert("vs-164", znode_is_write_locked(node
));
767 assert("nikita-1280", ZF_ISSET(node
, JNODE_HEARD_BANSHEE
));
768 assert_rw_locked(&(node
->lock
.guard
));
770 /* We assume that this node was detached from its parent before
771 * unlocking, it gives no way to reach this node from parent through a
772 * down link. The node should have no children and, thereby, can't be
773 * reached from them by their parent pointers. The only way to obtain a
774 * reference to the node is to use sibling pointers from its left and
775 * right neighbors. In the next several lines we remove the node from
776 * the sibling list. */
778 write_lock_tree(tree
);
779 sibling_list_remove(node
);
780 znode_remove(node
, tree
);
781 write_unlock_tree(tree
);
783 /* Here we set JNODE_DYING and cancel all pending lock requests. It
784 * forces all lock requestor threads to repeat iterations of getting
785 * lock on a child, neighbor or parent node. But, those threads can't
786 * come to this node again, because this node is no longer a child,
787 * neighbor or parent of any other node. This order of znode
788 * invalidation does not allow other threads to waste cpu time is a busy
789 * loop, trying to lock dying object. The exception is in the flush
790 * code when we take node directly from atom's capture list.*/
791 reiser4_invalidate_lock(handle
);
792 uncapture_znode(node
);
795 /* Check that internal item at @pointer really contains pointer to @child. */
796 int check_tree_pointer(const coord_t
* pointer
/* would-be pointer to
798 const znode
* child
/* child znode */ )
800 assert("nikita-1016", pointer
!= NULL
);
801 assert("nikita-1017", child
!= NULL
);
802 assert("nikita-1018", pointer
->node
!= NULL
);
804 assert("nikita-1325", znode_is_any_locked(pointer
->node
));
806 assert("nikita-2985",
807 znode_get_level(pointer
->node
) == znode_get_level(child
) + 1);
809 coord_clear_iplug((coord_t
*) pointer
);
811 if (coord_is_existing_unit(pointer
)) {
813 reiser4_block_nr addr
;
815 if (item_is_internal(pointer
)) {
816 iplug
= item_plugin_by_coord(pointer
);
817 assert("vs-513", iplug
->s
.internal
.down_link
);
818 iplug
->s
.internal
.down_link(pointer
, NULL
, &addr
);
819 /* check that cached value is correct */
820 if (disk_addr_eq(&addr
, znode_get_block(child
))) {
825 /* warning ("jmacd-1002", "tree pointer incorrect"); */
829 /* find coord of pointer to new @child in @parent.
831 Find the &coord_t in the @parent where pointer to a given @child will
835 int find_new_child_ptr(znode
* parent
/* parent znode, passed locked */ ,
837 child UNUSED_ARG
/* child znode, passed locked */ ,
838 znode
* left
/* left brother of new node */ ,
839 coord_t
* result
/* where result is stored in */ )
843 assert("nikita-1486", parent
!= NULL
);
844 assert("nikita-1487", child
!= NULL
);
845 assert("nikita-1488", result
!= NULL
);
847 ret
= find_child_ptr(parent
, left
, result
);
848 if (ret
!= NS_FOUND
) {
849 warning("nikita-1489", "Cannot find brother position: %i", ret
);
852 result
->between
= AFTER_UNIT
;
853 return RETERR(NS_NOT_FOUND
);
857 /* find coord of pointer to @child in @parent.
859 Find the &coord_t in the @parent where pointer to a given @child is in.
862 int find_child_ptr(znode
* parent
/* parent znode, passed locked */ ,
863 znode
* child
/* child znode, passed locked */ ,
864 coord_t
* result
/* where result is stored in */ )
868 /* left delimiting key of a child */
872 assert("nikita-934", parent
!= NULL
);
873 assert("nikita-935", child
!= NULL
);
874 assert("nikita-936", result
!= NULL
);
875 assert("zam-356", znode_is_loaded(parent
));
877 coord_init_zero(result
);
878 result
->node
= parent
;
880 nplug
= parent
->nplug
;
881 assert("nikita-939", nplug
!= NULL
);
883 tree
= znode_get_tree(parent
);
884 /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
885 * not aliased to ->in_parent of some znode. Otherwise,
886 * parent_coord_to_coord() below would modify data protected by tree
888 read_lock_tree(tree
);
889 /* fast path. Try to use cached value. Lock tree to keep
890 node->pos_in_parent and pos->*_blocknr consistent. */
891 if (child
->in_parent
.item_pos
+ 1 != 0) {
892 parent_coord_to_coord(&child
->in_parent
, result
);
893 if (check_tree_pointer(result
, child
) == NS_FOUND
) {
894 read_unlock_tree(tree
);
898 child
->in_parent
.item_pos
= (unsigned short)~0;
900 read_unlock_tree(tree
);
902 /* is above failed, find some key from @child. We are looking for the
903 least key in a child. */
905 ld
= *znode_get_ld_key(child
);
906 read_unlock_dk(tree
);
908 * now, lookup parent with key just found. Note, that left delimiting
909 * key doesn't identify node uniquely, because (in extremely rare
910 * case) two nodes can have equal left delimiting keys, if one of them
911 * is completely filled with directory entries that all happened to be
912 * hash collision. But, we check block number in check_tree_pointer()
915 lookup_res
= nplug
->lookup(parent
, &ld
, FIND_EXACT
, result
);
916 /* update cached pos_in_node */
917 if (lookup_res
== NS_FOUND
) {
918 write_lock_tree(tree
);
919 coord_to_parent_coord(result
, &child
->in_parent
);
920 write_unlock_tree(tree
);
921 lookup_res
= check_tree_pointer(result
, child
);
923 if (lookup_res
== NS_NOT_FOUND
)
924 lookup_res
= find_child_by_addr(parent
, child
, result
);
928 /* find coord of pointer to @child in @parent by scanning
930 Find the &coord_t in the @parent where pointer to a given @child
931 is in by scanning all internal items in @parent and comparing block
932 numbers in them with that of @child.
935 static int find_child_by_addr(znode
* parent
/* parent znode, passed locked */ ,
936 znode
* child
/* child znode, passed locked */ ,
937 coord_t
* result
/* where result is stored in */ )
941 assert("nikita-1320", parent
!= NULL
);
942 assert("nikita-1321", child
!= NULL
);
943 assert("nikita-1322", result
!= NULL
);
947 for_all_units(result
, parent
) {
948 if (check_tree_pointer(result
, child
) == NS_FOUND
) {
949 write_lock_tree(znode_get_tree(parent
));
950 coord_to_parent_coord(result
, &child
->in_parent
);
951 write_unlock_tree(znode_get_tree(parent
));
959 /* true, if @addr is "unallocated block number", which is just address, with
961 int is_disk_addr_unallocated(const reiser4_block_nr
* addr
/* address to
964 assert("nikita-1766", addr
!= NULL
);
965 cassert(sizeof(reiser4_block_nr
) == 8);
966 return (*addr
& REISER4_BLOCKNR_STATUS_BIT_MASK
) ==
967 REISER4_UNALLOCATED_STATUS_VALUE
;
970 /* returns true if removing bytes of given range of key [from_key, to_key]
971 causes removing of whole item @from */
973 item_removed_completely(coord_t
* from
, const reiser4_key
* from_key
,
974 const reiser4_key
* to_key
)
977 reiser4_key key_in_item
;
979 assert("umka-325", from
!= NULL
);
980 assert("", item_is_extent(from
));
982 /* check first key just for case */
983 item_key_by_coord(from
, &key_in_item
);
984 if (keygt(from_key
, &key_in_item
))
988 iplug
= item_plugin_by_coord(from
);
989 assert("vs-611", iplug
&& iplug
->s
.file
.append_key
);
991 iplug
->s
.file
.append_key(from
, &key_in_item
);
992 set_key_offset(&key_in_item
, get_key_offset(&key_in_item
) - 1);
994 if (keylt(to_key
, &key_in_item
))
995 /* last byte is not removed */
1000 /* helper function for prepare_twig_kill(): @left and @right are formatted
1001 * neighbors of extent item being completely removed. Load and lock neighbors
1002 * and store lock handles into @cdata for later use by kill_hook_extent() */
1004 prepare_children(znode
* left
, znode
* right
, carry_kill_data
* kdata
)
1011 left_loaded
= right_loaded
= 0;
1014 result
= zload(left
);
1017 result
= longterm_lock_znode(kdata
->left
, left
,
1022 if (result
== 0 && right
!= NULL
) {
1023 result
= zload(right
);
1026 result
= longterm_lock_znode(kdata
->right
, right
,
1029 ZNODE_LOCK_NONBLOCK
);
1033 done_lh(kdata
->left
);
1034 done_lh(kdata
->right
);
1035 if (left_loaded
!= 0)
1037 if (right_loaded
!= 0)
1043 static void done_children(carry_kill_data
* kdata
)
1045 if (kdata
->left
!= NULL
&& kdata
->left
->node
!= NULL
) {
1046 zrelse(kdata
->left
->node
);
1047 done_lh(kdata
->left
);
1049 if (kdata
->right
!= NULL
&& kdata
->right
->node
!= NULL
) {
1050 zrelse(kdata
->right
->node
);
1051 done_lh(kdata
->right
);
1055 /* part of cut_node. It is called when cut_node is called to remove or cut part
1056 of extent item. When head of that item is removed - we have to update right
1057 delimiting of left neighbor of extent. When item is removed completely - we
1058 have to set sibling link between left and right neighbor of removed
1059 extent. This may return -E_DEADLOCK because of trying to get left neighbor
1060 locked. So, caller should repeat an attempt
1062 /* Audited by: umka (2002.06.16) */
1064 prepare_twig_kill(carry_kill_data
* kdata
, znode
* locked_left_neighbor
)
1068 lock_handle left_lh
;
1069 lock_handle right_lh
;
1075 int left_zloaded_here
, right_zloaded_here
;
1077 from
= kdata
->params
.from
;
1078 assert("umka-326", from
!= NULL
);
1079 assert("umka-327", kdata
->params
.to
!= NULL
);
1081 /* for one extent item only yet */
1082 assert("vs-591", item_is_extent(from
));
1083 assert("vs-592", from
->item_pos
== kdata
->params
.to
->item_pos
);
1085 if ((kdata
->params
.from_key
1086 && keygt(kdata
->params
.from_key
, item_key_by_coord(from
, &key
)))
1087 || from
->unit_pos
!= 0) {
1088 /* head of item @from is not removed, there is nothing to
1094 left_zloaded_here
= 0;
1095 right_zloaded_here
= 0;
1097 left_child
= right_child
= NULL
;
1099 coord_dup(&left_coord
, from
);
1102 if (coord_prev_unit(&left_coord
)) {
1103 /* @from is leftmost item in its node */
1104 if (!locked_left_neighbor
) {
1106 reiser4_get_left_neighbor(&left_lh
, from
->node
,
1108 GN_CAN_USE_UPPER_LEVELS
);
1112 case -E_NO_NEIGHBOR
:
1113 /* there is no formatted node to the left of
1116 "extent item has smallest key in "
1117 "the tree and it is about to be removed");
1120 /* need to restart */
1125 /* we have acquired left neighbor of from->node */
1126 result
= zload(left_lh
.node
);
1130 locked_left_neighbor
= left_lh
.node
;
1132 /* squalloc_right_twig_cut should have supplied locked
1135 znode_is_write_locked(locked_left_neighbor
));
1136 result
= zload(locked_left_neighbor
);
1141 left_zloaded_here
= 1;
1142 coord_init_last_unit(&left_coord
, locked_left_neighbor
);
1145 if (!item_is_internal(&left_coord
)) {
1146 /* what else but extent can be on twig level */
1147 assert("vs-606", item_is_extent(&left_coord
));
1149 /* there is no left formatted child */
1150 if (left_zloaded_here
)
1151 zrelse(locked_left_neighbor
);
1156 tree
= znode_get_tree(left_coord
.node
);
1157 left_child
= child_znode(&left_coord
, left_coord
.node
, 1, 0);
1159 if (IS_ERR(left_child
)) {
1160 result
= PTR_ERR(left_child
);
1164 /* left child is acquired, calculate new right delimiting key for it
1165 and get right child if it is necessary */
1166 if (item_removed_completely
1167 (from
, kdata
->params
.from_key
, kdata
->params
.to_key
)) {
1168 /* try to get right child of removed item */
1169 coord_t right_coord
;
1172 kdata
->params
.to
->unit_pos
==
1173 coord_last_unit_pos(kdata
->params
.to
));
1174 coord_dup(&right_coord
, kdata
->params
.to
);
1175 if (coord_next_unit(&right_coord
)) {
1176 /* @to is rightmost unit in the node */
1178 reiser4_get_right_neighbor(&right_lh
, from
->node
,
1180 GN_CAN_USE_UPPER_LEVELS
);
1183 result
= zload(right_lh
.node
);
1187 right_zloaded_here
= 1;
1188 coord_init_first_unit(&right_coord
,
1190 item_key_by_coord(&right_coord
, &key
);
1193 case -E_NO_NEIGHBOR
:
1194 /* there is no formatted node to the right of
1197 key
= *znode_get_rd_key(from
->node
);
1198 read_unlock_dk(tree
);
1199 right_coord
.node
= NULL
;
1207 /* there is an item to the right of @from - take its key */
1208 item_key_by_coord(&right_coord
, &key
);
1211 /* try to get right child of @from */
1212 if (right_coord
.node
&& /* there is right neighbor of @from */
1213 item_is_internal(&right_coord
)) { /* it is internal item */
1214 right_child
= child_znode(&right_coord
,
1215 right_coord
.node
, 1, 0);
1217 if (IS_ERR(right_child
)) {
1218 result
= PTR_ERR(right_child
);
1223 /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
1224 update of right delimiting key of left_child */
1225 result
= prepare_children(left_child
, right_child
, kdata
);
1227 /* head of item @to is removed. left_child has to get right delimiting key update. Prepare it for that */
1228 result
= prepare_children(left_child
, NULL
, kdata
);
1234 if (right_zloaded_here
)
1235 zrelse(right_lh
.node
);
1240 if (left_zloaded_here
)
1241 zrelse(locked_left_neighbor
);
1246 /* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
1247 are to be cut completely */
1248 /* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
1249 int cut_node_content(coord_t
* from
, coord_t
* to
, const reiser4_key
* from_key
, /* first key to be removed */
1250 const reiser4_key
* to_key
, /* last key to be removed */
1252 smallest_removed
/* smallest key actually removed */ )
1256 carry_level
*lowest_level
;
1257 carry_cut_data
*cut_data
;
1260 assert("vs-1715", coord_compare(from
, to
) != COORD_CMP_ON_RIGHT
);
1263 init_carry_pool(sizeof(*pool
) + 3 * sizeof(*lowest_level
) +
1266 return PTR_ERR(pool
);
1267 lowest_level
= (carry_level
*) (pool
+ 1);
1268 init_carry_level(lowest_level
, pool
);
1270 op
= reiser4_post_carry(lowest_level
, COP_CUT
, from
->node
, 0);
1271 assert("vs-1509", op
!= 0);
1273 done_carry_pool(pool
);
1277 cut_data
= (carry_cut_data
*) (lowest_level
+ 3);
1278 cut_data
->params
.from
= from
;
1279 cut_data
->params
.to
= to
;
1280 cut_data
->params
.from_key
= from_key
;
1281 cut_data
->params
.to_key
= to_key
;
1282 cut_data
->params
.smallest_removed
= smallest_removed
;
1284 op
->u
.cut_or_kill
.is_cut
= 1;
1285 op
->u
.cut_or_kill
.u
.cut
= cut_data
;
1287 result
= reiser4_carry(lowest_level
, NULL
);
1288 done_carry_pool(pool
);
1293 /* cut part of the node
1295 Cut part or whole content of node.
1297 cut data between @from and @to of @from->node and call carry() to make
1298 corresponding changes in the tree. @from->node may become empty. If so -
1299 pointer to it will be removed. Neighboring nodes are not changed. Smallest
1300 removed key is stored in @smallest_removed
1303 int kill_node_content(coord_t
* from
, /* coord of the first unit/item that will be eliminated */
1304 coord_t
* to
, /* coord of the last unit/item that will be eliminated */
1305 const reiser4_key
* from_key
, /* first key to be removed */
1306 const reiser4_key
* to_key
, /* last key to be removed */
1307 reiser4_key
* smallest_removed
, /* smallest key actually removed */
1308 znode
* locked_left_neighbor
, /* this is set when kill_node_content is called with left neighbor
1309 * locked (in squalloc_right_twig_cut, namely) */
1310 struct inode
*inode
, /* inode of file whose item (or its part) is to be killed. This is necessary to
1311 invalidate pages together with item pointing to them */
1313 { /* this call is made for file truncate) */
1316 carry_level
*lowest_level
;
1317 carry_kill_data
*kdata
;
1318 lock_handle
*left_child
;
1319 lock_handle
*right_child
;
1322 assert("umka-328", from
!= NULL
);
1323 assert("vs-316", !node_is_empty(from
->node
));
1324 assert("nikita-1812", coord_is_existing_unit(from
)
1325 && coord_is_existing_unit(to
));
1327 /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
1328 pool
= init_carry_pool(sizeof(*pool
) + 3 * sizeof(*lowest_level
) +
1329 sizeof(carry_kill_data
) +
1330 2 * sizeof(lock_handle
) +
1331 5 * sizeof(reiser4_key
) + 2 * sizeof(coord_t
));
1333 return PTR_ERR(pool
);
1335 lowest_level
= (carry_level
*) (pool
+ 1);
1336 init_carry_level(lowest_level
, pool
);
1338 kdata
= (carry_kill_data
*) (lowest_level
+ 3);
1339 left_child
= (lock_handle
*) (kdata
+ 1);
1340 right_child
= left_child
+ 1;
1342 init_lh(left_child
);
1343 init_lh(right_child
);
1345 kdata
->params
.from
= from
;
1346 kdata
->params
.to
= to
;
1347 kdata
->params
.from_key
= from_key
;
1348 kdata
->params
.to_key
= to_key
;
1349 kdata
->params
.smallest_removed
= smallest_removed
;
1350 kdata
->params
.truncate
= truncate
;
1352 kdata
->inode
= inode
;
1353 kdata
->left
= left_child
;
1354 kdata
->right
= right_child
;
1355 /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
1356 kdata
->buf
= (char *)(right_child
+ 1);
1358 if (znode_get_level(from
->node
) == TWIG_LEVEL
&& item_is_extent(from
)) {
1359 /* left child of extent item may have to get updated right
1360 delimiting key and to get linked with right child of extent
1361 @from if it will be removed completely */
1362 result
= prepare_twig_kill(kdata
, locked_left_neighbor
);
1364 done_children(kdata
);
1365 done_carry_pool(pool
);
1370 op
= reiser4_post_carry(lowest_level
, COP_CUT
, from
->node
, 0);
1371 if (IS_ERR(op
) || (op
== NULL
)) {
1372 done_children(kdata
);
1373 done_carry_pool(pool
);
1374 return RETERR(op
? PTR_ERR(op
) : -EIO
);
1377 op
->u
.cut_or_kill
.is_cut
= 0;
1378 op
->u
.cut_or_kill
.u
.kill
= kdata
;
1380 result
= reiser4_carry(lowest_level
, NULL
);
1382 done_children(kdata
);
1383 done_carry_pool(pool
);
/*
 * fake_kill_hook_tail(): emulate the effects of the tail-item kill hook for
 * a byte range [start, end) of @inode whose node was cut without being read
 * (callers avoid loading the node; see the comment near the call site in
 * reiser4_delete_node()).  If the file has been mmapped
 * (REISER4_HAS_MMAP flag), the page(s) covering the killed range are
 * invalidated; in all cases the killed byte count is subtracted from the
 * inode via inode_sub_bytes().
 *
 * NOTE(review): garbled extraction -- return type, braces and several
 * original lines (including the trailing arguments of the first
 * reiser4_invalidate_pages() call) are missing; code left byte-identical.
 * The role of @truncate is not visible in this fragment -- presumably it is
 * forwarded to the invalidation; confirm against the full source.
 */
1388 fake_kill_hook_tail(struct inode
*inode
, loff_t start
, loff_t end
, int truncate
)
1390 if (reiser4_inode_get_flag(inode
, REISER4_HAS_MMAP
)) {
1391 pgoff_t start_pg
, end_pg
;
1393 start_pg
= start
>> PAGE_CACHE_SHIFT
;
1394 end_pg
= (end
- 1) >> PAGE_CACHE_SHIFT
;
1396 if ((start
& (PAGE_CACHE_SIZE
- 1)) == 0) {
1398 * kill up to the page boundary.
1400 assert("vs-123456", start_pg
== end_pg
);
1401 reiser4_invalidate_pages(inode
->i_mapping
, start_pg
, 1,
1403 } else if (start_pg
!= end_pg
) {
1405 * page boundary is within killed portion of node.
1407 assert("vs-654321", end_pg
- start_pg
== 1);
1408 reiser4_invalidate_pages(inode
->i_mapping
, end_pg
,
1409 end_pg
- start_pg
, 1);
1412 inode_sub_bytes(inode
, end
- start
);
1416 * Delete whole @node from the reiser4 tree without loading it.
1418 * @left: locked left neighbor,
1419 * @node: node to be deleted,
1420 * @smallest_removed: leftmost key of deleted node,
1421 * @object: inode pointer, if we truncate a file body.
1422 * @truncate: true if called for file truncate.
1424 * @return: 0 if success, error code otherwise.
1426 * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
1427 * contains the right value of the smallest removed key from the previous
1428 * cut_worker() iteration. This is needed for proper accounting of
1429 * "i_blocks" and "i_bytes" fields of the @object.
1431 int reiser4_delete_node(znode
* node
, reiser4_key
* smallest_removed
,
1432 struct inode
*object
, int truncate
)
1434 lock_handle parent_lock
;
1440 assert("zam-937", node
!= NULL
);
1441 assert("zam-933", znode_is_write_locked(node
));
1442 assert("zam-999", smallest_removed
!= NULL
);
1444 init_lh(&parent_lock
);
1446 ret
= reiser4_get_parent(&parent_lock
, node
, ZNODE_WRITE_LOCK
);
1450 assert("zam-934", !znode_above_root(parent_lock
.node
));
1452 ret
= zload(parent_lock
.node
);
1454 goto failed_nozrelse
;
1456 ret
= find_child_ptr(parent_lock
.node
, node
, &cut_from
);
1460 /* decrement child counter and set parent pointer to NULL before
1461 deleting the list from parent node because of checks in
1462 internal_kill_item_hook (we can delete the last item from the parent
1463 node, the parent node is going to be deleted and its c_count should
1466 tree
= znode_get_tree(node
);
1467 write_lock_tree(tree
);
1468 init_parent_coord(&node
->in_parent
, NULL
);
1469 --parent_lock
.node
->c_count
;
1470 write_unlock_tree(tree
);
1472 assert("zam-989", item_is_internal(&cut_from
));
1474 /* @node should be deleted after unlocking. */
1475 ZF_SET(node
, JNODE_HEARD_BANSHEE
);
1477 /* remove a pointer from the parent node to the node being deleted. */
1478 coord_dup(&cut_to
, &cut_from
);
1479 /* FIXME: shouldn't this be kill_node_content */
1480 ret
= cut_node_content(&cut_from
, &cut_to
, NULL
, NULL
, NULL
);
1482 /* FIXME(Zam): Should we re-connect the node to its parent if
1483 * cut_node fails? */
1487 reiser4_tree
*tree
= current_tree
;
1488 __u64 start_offset
= 0, end_offset
= 0;
1490 read_lock_tree(tree
);
1491 write_lock_dk(tree
);
1493 /* We use @smallest_removed and the left delimiting of
1494 * the current node for @object->i_blocks, i_bytes
1495 * calculation. We assume that the items after the
1496 * *@smallest_removed key have been deleted from the
1498 start_offset
= get_key_offset(znode_get_ld_key(node
));
1499 end_offset
= get_key_offset(smallest_removed
);
1502 assert("zam-1021", znode_is_connected(node
));
1504 znode_set_rd_key(node
->left
, znode_get_rd_key(node
));
1506 *smallest_removed
= *znode_get_ld_key(node
);
1508 write_unlock_dk(tree
);
1509 read_unlock_tree(tree
);
1512 /* we used to perform actions which are to be performed on items on their removal from tree in
1513 special item method - kill_hook. Here for optimization reasons we avoid reading node
1514 containing item we remove and can not call item's kill hook. Instead we call function which
1515 does exactly the same things as tail kill hook on the assumption that the node we avoid reading
1516 contains only one item and that item is a tail one. */
1517 fake_kill_hook_tail(object
, start_offset
, end_offset
,
1522 zrelse(parent_lock
.node
);
1524 done_lh(&parent_lock
);
/*
 * can_delete(): return non-zero when @key compares less-than-or-equal to the
 * left delimiting key of @node, i.e. the entire key range of @node falls
 * inside the range being cut, so the whole node may be deleted without
 * inspecting its items.  The dk read-lock is held around the key read.
 *
 * NOTE(review): garbled extraction -- braces, the declaration of "result"
 * and the return statement were dropped; code left byte-identical.
 */
1529 static int can_delete(const reiser4_key
*key
, znode
*node
)
1533 read_lock_dk(current_tree
);
1534 result
= keyle(key
, znode_get_ld_key(node
));
1535 read_unlock_dk(current_tree
);
1540 * This subroutine is not optimal but implementation seems to
1543 * @tap: the point deletion process begins from,
1544 * @from_key: the beginning of the deleted key range,
1545 * @to_key: the end of the deleted key range,
1546 * @smallest_removed: the smallest removed key,
1547 * @truncate: true if called for file truncate.
1548 * @progress: return true if a progress in file items deletions was made,
1549 * @smallest_removed value is actual in that case.
1551 * @return: 0 if success, error code otherwise, -E_REPEAT means that long
1552 * reiser4_cut_tree operation was interrupted for allowing atom commit.
1555 cut_tree_worker_common(tap_t
* tap
, const reiser4_key
* from_key
,
1556 const reiser4_key
* to_key
,
1557 reiser4_key
* smallest_removed
, struct inode
*object
,
1558 int truncate
, int *progress
)
1560 lock_handle next_node_lock
;
1564 assert("zam-931", tap
->coord
->node
!= NULL
);
1565 assert("zam-932", znode_is_write_locked(tap
->coord
->node
));
1568 init_lh(&next_node_lock
);
1571 znode
*node
; /* node from which items are cut */
1572 node_plugin
*nplug
; /* node plugin for @node */
1574 node
= tap
->coord
->node
;
1576 /* Move next_node_lock to the next node on the left. */
1578 reiser4_get_left_neighbor(&next_node_lock
, node
,
1580 GN_CAN_USE_UPPER_LEVELS
);
1581 if (result
!= 0 && result
!= -E_NO_NEIGHBOR
)
1583 /* Check can we delete the node as a whole. */
1584 if (*progress
&& znode_get_level(node
) == LEAF_LEVEL
&&
1585 can_delete(from_key
, node
)) {
1586 result
= reiser4_delete_node(node
, smallest_removed
,
1589 result
= reiser4_tap_load(tap
);
1593 /* Prepare the second (right) point for cut_node() */
1595 coord_init_last_unit(tap
->coord
, node
);
1597 else if (item_plugin_by_coord(tap
->coord
)->b
.lookup
==
1599 /* set rightmost unit for the items without lookup method */
1600 tap
->coord
->unit_pos
=
1601 coord_last_unit_pos(tap
->coord
);
1603 nplug
= node
->nplug
;
1605 assert("vs-686", nplug
);
1606 assert("vs-687", nplug
->lookup
);
1608 /* left_coord is leftmost unit cut from @node */
1609 result
= nplug
->lookup(node
, from_key
,
1610 FIND_MAX_NOT_MORE_THAN
,
1613 if (IS_CBKERR(result
))
1616 /* adjust coordinates so that they are set to existing units */
1617 if (coord_set_to_right(&left_coord
)
1618 || coord_set_to_left(tap
->coord
)) {
1623 if (coord_compare(&left_coord
, tap
->coord
) ==
1624 COORD_CMP_ON_RIGHT
) {
1625 /* keys from @from_key to @to_key are not in the tree */
1630 if (left_coord
.item_pos
!= tap
->coord
->item_pos
) {
1631 /* do not allow to cut more than one item. It is added to solve problem of truncating
1632 partially converted files. If file is partially converted there may exist a twig node
1633 containing both internal item or items pointing to leaf nodes with formatting items
1634 and extent item. We do not want to kill internal items being at twig node here
1635 because cut_tree_worker assumes killing them from leaf level */
1636 coord_dup(&left_coord
, tap
->coord
);
1638 coord_is_existing_unit(&left_coord
));
1639 left_coord
.unit_pos
= 0;
1642 /* cut data from one node */
1643 /* *smallest_removed = *reiser4_min_key(); */
1645 kill_node_content(&left_coord
, tap
->coord
, from_key
,
1646 to_key
, smallest_removed
,
1647 next_node_lock
.node
, object
,
1649 reiser4_tap_relse(tap
);
1656 /* Check whether all items with keys >= from_key were removed
1658 if (keyle(smallest_removed
, from_key
))
1662 if (next_node_lock
.node
== NULL
)
1665 result
= reiser4_tap_move(tap
, &next_node_lock
);
1666 done_lh(&next_node_lock
);
1670 /* Break long reiser4_cut_tree operation (deletion of a large
1671 file) if atom requires commit. */
1672 if (*progress
> CUT_TREE_MIN_ITERATIONS
1673 && current_atom_should_commit()) {
1678 done_lh(&next_node_lock
);
1679 /* assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key())); */
1683 /* there is a fundamental problem with optimizing deletes: VFS does it
1684 one file at a time. Another problem is that if an item can be
1685 anything, then deleting items must be done one at a time. It just
1686 seems clean to write this to specify a from and a to key, and cut
1687 everything between them though. */
1689 /* use this function with care if deleting more than what is part of a single file. */
1690 /* do not use this when cutting a single item, it is suboptimal for that */
1692 /* You are encouraged to write plugin specific versions of this. It
1693 cannot be optimal for all plugins because it works item at a time,
1694 and some plugins could sometimes work node at a time. Regular files
1695 however are not optimizable to work node at a time because of
1696 extents needing to free the blocks they point to.
1698 Optimizations compared to v3 code:
1700 It does not balance (that task is left to memory pressure code).
1702 Nodes are deleted only if empty.
1706 Performs read-ahead of formatted nodes whose contents are part of
1711 * Delete everything from the reiser4 tree between two keys: @from_key and
1714 * @from_key: the beginning of the deleted key range,
1715 * @to_key: the end of the deleted key range,
1716 * @smallest_removed: the smallest removed key,
1717 * @object: owner of cutting items.
1718 * @truncate: true if called for file truncate.
1719 * @progress: return true if a progress in file items deletions was made,
1720 * @smallest_removed value is actual in that case.
1722 * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
1723 * operation was interrupted for allowing atom commit.
1726 int reiser4_cut_tree_object(reiser4_tree
* tree
, const reiser4_key
* from_key
,
1727 const reiser4_key
* to_key
,
1728 reiser4_key
* smallest_removed_p
,
1729 struct inode
*object
, int truncate
, int *progress
)
1734 coord_t right_coord
;
1735 reiser4_key smallest_removed
;
1736 int (*cut_tree_worker
) (tap_t
*, const reiser4_key
*,
1737 const reiser4_key
*, reiser4_key
*,
1738 struct inode
*, int, int *);
1741 assert("umka-329", tree
!= NULL
);
1742 assert("umka-330", from_key
!= NULL
);
1743 assert("umka-331", to_key
!= NULL
);
1744 assert("zam-936", keyle(from_key
, to_key
));
1746 if (smallest_removed_p
== NULL
)
1747 smallest_removed_p
= &smallest_removed
;
1752 /* Find rightmost item to cut away from the tree. */
1753 result
= reiser4_object_lookup(object
, to_key
, &right_coord
,
1754 &lock
, ZNODE_WRITE_LOCK
,
1755 FIND_MAX_NOT_MORE_THAN
,
1756 TWIG_LEVEL
, LEAF_LEVEL
,
1757 CBK_UNIQUE
, NULL
/*ra_info */);
1758 if (result
!= CBK_COORD_FOUND
)
1761 || inode_file_plugin(object
)->cut_tree_worker
== NULL
)
1762 cut_tree_worker
= cut_tree_worker_common
;
1765 inode_file_plugin(object
)->cut_tree_worker
;
1766 reiser4_tap_init(&tap
, &right_coord
, &lock
, ZNODE_WRITE_LOCK
);
1768 cut_tree_worker(&tap
, from_key
, to_key
, smallest_removed_p
,
1769 object
, truncate
, progress
);
1770 reiser4_tap_done(&tap
);
1772 reiser4_preempt_point();
1780 case -E_NO_NEIGHBOR
:
1790 warning("nikita-2861", "failure: %i", result
);
1798 /* repeat reiser4_cut_tree_object until everything is deleted.
1799 * unlike cut_file_items, it does not end current transaction if -E_REPEAT
1800 * is returned by cut_tree_object. */
/*
 * reiser4_cut_tree(): delete keys [from, to] by repeatedly invoking
 * reiser4_cut_tree_object() until it stops returning -E_REPEAT
 * (-E_REPEAT signals that a long cut operation was interrupted to allow
 * the atom to commit).  No smallest-removed key is reported (NULL passed).
 *
 * NOTE(review): garbled extraction -- braces, the declarations of "result"
 * and "progress", the "do {" line and the final return are missing; code
 * left byte-identical.
 */
1801 int reiser4_cut_tree(reiser4_tree
* tree
, const reiser4_key
* from
,
1802 const reiser4_key
* to
, struct inode
*inode
, int truncate
)
1808 result
= reiser4_cut_tree_object(tree
, from
, to
, NULL
,
1809 inode
, truncate
, &progress
);
1810 } while (result
== -E_REPEAT
);
1815 /* finishing reiser4 initialization */
1816 int reiser4_init_tree(reiser4_tree
* tree
/* pointer to structure being
1818 const reiser4_block_nr
* root_block
/* address of a root block
1820 tree_level height
/* height of a tree */ ,
1821 node_plugin
* nplug
/* default node plugin */ )
1825 assert("nikita-306", tree
!= NULL
);
1826 assert("nikita-307", root_block
!= NULL
);
1827 assert("nikita-308", height
> 0);
1828 assert("nikita-309", nplug
!= NULL
);
1829 assert("zam-587", tree
->super
!= NULL
);
1831 tree
->root_block
= *root_block
;
1832 tree
->height
= height
;
1833 tree
->estimate_one_insert
= calc_estimate_one_insert(height
);
1834 tree
->nplug
= nplug
;
1836 tree
->znode_epoch
= 1ull;
1838 cbk_cache_init(&tree
->cbk_cache
);
1840 result
= znodes_tree_init(tree
);
1842 result
= jnodes_tree_init(tree
);
1844 tree
->uber
= zget(tree
, &UBER_TREE_ADDR
, NULL
, 0,
1845 reiser4_ctx_gfp_mask_get());
1846 if (IS_ERR(tree
->uber
)) {
1847 result
= PTR_ERR(tree
->uber
);
1854 /* release resources associated with @tree */
/*
 * reiser4_done_tree(): release per-tree resources -- inverse of
 * reiser4_init_tree().  Tears down the znode table, the jnode table and the
 * cbk cache.  The branch guarded by "tree->uber != NULL" presumably drops
 * the uber-znode reference taken in reiser4_init_tree(); its body was lost
 * in this extraction -- confirm against the full source.
 *
 * NOTE(review): garbled extraction -- braces and several original lines are
 * missing; code left byte-identical.
 */
1855 void reiser4_done_tree(reiser4_tree
* tree
/* tree to release */ )
1860 if (tree
->uber
!= NULL
) {
1864 znodes_tree_done(tree
);
1865 jnodes_tree_done(tree
);
1866 cbk_cache_done(&tree
->cbk_cache
);
1869 /* Make Linus happy.
1871 c-indentation-style: "K&R"