1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
7 * The tree consists of nodes located on the disk. Node in the tree is either
8 * formatted or unformatted. Formatted node is one that has structure
9 * understood by the tree balancing and traversal code. Formatted nodes are
10 * further classified into leaf and internal nodes. Latter distinctions is
11 * (almost) of only historical importance: general structure of leaves and
12 * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
13 * that are part of bodies of ordinary files and attributes.
15 * Each node in the tree spawns some interval in the key space. Key ranges for
16 * all nodes in the tree are disjoint. Actually, this only holds in some weak
17 * sense, because of the non-unique keys: intersection of key ranges for
18 * different nodes is either empty, or consists of exactly one key.
20 * Formatted node consists of a sequence of items. Each item spawns some
21 * interval in key space. Key ranges for all items in a tree are disjoint,
22 * modulo non-unique keys again. Items within nodes are ordered in the key
23 * order of the smallest key in a item.
25 * Particular type of item can be further split into units. Unit is piece of
26 * item that can be cut from item and moved into another item of the same
27 * time. Units are used by balancing code to repack data during balancing.
29 * Unit can be further split into smaller entities (for example, extent unit
30 * represents several pages, and it is natural for extent code to operate on
31 * particular pages and even bytes within one unit), but this is of no
32 * relevance to the generic balancing and lookup code.
34 * Although item is said to "spawn" range or interval of keys, it is not
35 * necessary that item contains piece of data addressable by each and every
36 * key in this range. For example, compound directory item, consisting of
37 * units corresponding to directory entries and keyed by hashes of file names,
38 * looks more as having "discrete spectrum": only some disjoint keys inside
39 * range occupied by this item really address data.
41 * No than less, each item always has well-defined least (minimal) key, that
42 * is recorded in item header, stored in the node this item is in. Also, item
43 * plugin can optionally define method ->max_key_inside() returning maximal
44 * key that can _possibly_ be located within this item. This method is used
45 * (mainly) to determine when given piece of data should be merged into
46 * existing item, in stead of creating new one. Because of this, even though
47 * ->max_key_inside() can be larger that any key actually located in the item,
50 * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
52 * are still disjoint for all items within the _same_ node.
54 * In memory node is represented by znode. It plays several roles:
56 * . something locks are taken on
58 * . something tracked by transaction manager (this is going to change)
60 * . something used to access node data
62 * . something used to maintain tree structure in memory: sibling and
65 * . something used to organize nodes into "slums"
67 * More on znodes see in znode.[ch]
71 * To simplify balancing, allow some flexibility in locking and speed up
72 * important coord cache optimization, we keep delimiting keys of nodes in
73 * memory. Depending on disk format (implemented by appropriate node plugin)
74 * node on disk can record both left and right delimiting key, only one of
75 * them, or none. Still, our balancing and tree traversal code keep both
76 * delimiting keys for a node that is in memory stored in the znode. When
77 * node is first brought into memory during tree traversal, its left
78 * delimiting key is taken from its parent, and its right delimiting key is
79 * either next key in its parent, or is right delimiting key of parent if
80 * node is the rightmost child of parent.
82 * Physical consistency of delimiting key is protected by special dk
83 * read-write lock. That is, delimiting keys can only be inspected or
84 * modified under this lock. But dk lock is only sufficient for fast
85 * "pessimistic" check, because to simplify code and to decrease lock
86 * contention, balancing (carry) only updates delimiting keys right before
87 * unlocking all locked nodes on the given tree level. For example,
88 * coord-by-key cache scans LRU list of recently accessed znodes. For each
89 * node it first does fast check under dk spin lock. If key looked for is
90 * not between delimiting keys for this node, next node is inspected and so
91 * on. If key is inside of the key range, long term lock is taken on node
92 * and key range is rechecked.
96 * To find something in the tree, you supply a key, and the key is resolved
97 * by coord_by_key() into a coord (coordinate) that is valid as long as the
98 * node the coord points to remains locked. As mentioned above trees
99 * consist of nodes that consist of items that consist of units. A unit is
100 * the smallest and indivisible piece of tree as far as balancing and tree
101 * search are concerned. Each node, item, and unit can be addressed by
102 * giving its level in the tree and the key occupied by this entity. A node
103 * knows what the key ranges are of the items within it, and how to find its
104 * items and invoke their item handlers, but it does not know how to access
105 * individual units within its items except through the item handlers.
106 * coord is a structure containing a pointer to the node, the ordinal number
107 * of the item within this node (a sort of item offset), and the ordinal
108 * number of the unit within this item.
112 * There are two types of access to the tree: lookup and modification.
114 * Lookup is a search for the key in the tree. Search can look for either
115 * exactly the key given to it, or for the largest key that is not greater
116 * than the key given to it. This distinction is determined by "bias"
117 * parameter of search routine (coord_by_key()). coord_by_key() either
118 * returns error (key is not in the tree, or some kind of external error
119 * occurred), or successfully resolves key into coord.
121 * This resolution is done by traversing tree top-to-bottom from root level
122 * to the desired level. On levels above twig level (level one above the
123 * leaf level) nodes consist exclusively of internal items. Internal item is
124 * nothing more than pointer to the tree node on the child level. On twig
125 * level nodes consist of internal items intermixed with extent
126 * items. Internal items form normal search tree structure used by traversal
127 * to descent through the tree.
129 * TREE LOOKUP OPTIMIZATIONS
131 * Tree lookup described above is expensive even if all nodes traversed are
132 * already in the memory: for each node binary search within it has to be
133 * performed and binary searches are CPU consuming and tend to destroy CPU
136 * Several optimizations are used to work around this:
138 * . cbk_cache (look-aside cache for tree traversals, see search.c for
141 * . seals (see seal.[ch])
143 * . vroot (see search.c)
145 * General search-by-key is layered thusly:
147 * [check seal, if any] --ok--> done
152 * [vroot defined] --no--> node = tree_root
162 * [check cbk_cache for key] --ok--> done
167 * [start tree traversal from node]
176 #include "plugin/item/static_stat.h"
177 #include "plugin/item/item.h"
178 #include "plugin/node/node.h"
179 #include "plugin/plugin.h"
183 #include "block_alloc.h"
184 #include "tree_walk.h"
186 #include "carry_ops.h"
190 #include "page_cache.h"
195 #include <linux/fs.h> /* for struct super_block */
196 #include <linux/spinlock.h>
198 /* Disk address (block number) never ever used for any real tree node. This is
199 used as block number of "uber" znode.
201 Invalid block addresses are 0 by tradition.
204 const reiser4_block_nr UBER_TREE_ADDR
= 0ull;
206 #define CUT_TREE_MIN_ITERATIONS 64
208 static int find_child_by_addr(znode
* parent
, znode
* child
, coord_t
* result
);
210 /* return node plugin of coord->node */
211 node_plugin
*node_plugin_by_coord(const coord_t
* coord
)
213 assert("vs-1", coord
!= NULL
);
214 assert("vs-2", coord
->node
!= NULL
);
216 return coord
->node
->nplug
;
219 /* insert item into tree. Fields of @coord are updated so that they can be
220 * used by consequent insert operation. */
221 insert_result
insert_by_key(reiser4_tree
* tree
/* tree to insert new item
223 const reiser4_key
* key
/* key of new item */ ,
224 reiser4_item_data
* data
/* parameters for item
226 coord_t
* coord
/* resulting insertion coord */ ,
227 lock_handle
* lh
/* resulting lock
229 tree_level stop_level
/** level where to insert */ ,
230 __u32 flags
/* insertion flags */ )
234 assert("nikita-358", tree
!= NULL
);
235 assert("nikita-360", coord
!= NULL
);
237 result
= coord_by_key(tree
, key
, coord
, lh
, ZNODE_WRITE_LOCK
,
238 FIND_EXACT
, stop_level
, stop_level
,
239 flags
| CBK_FOR_INSERT
, NULL
/*ra_info */ );
243 case CBK_COORD_FOUND
:
244 result
= IBK_ALREADY_EXISTS
;
246 case CBK_COORD_NOTFOUND
:
247 assert("nikita-2017", coord
->node
!= NULL
);
248 result
= insert_by_coord(coord
, data
, key
, lh
, 0 /*flags */ );
254 /* insert item by calling carry. Helper function called if short-cut
256 static insert_result
insert_with_carry_by_coord(coord_t
* coord
, /* coord where to insert */
257 lock_handle
* lh
, /* lock handle of insertion
259 reiser4_item_data
* data
, /* parameters of new
261 const reiser4_key
* key
, /* key of new item */
262 carry_opcode cop
, /* carry operation to perform */
263 cop_insert_flag flags
268 carry_level
*lowest_level
;
269 carry_insert_data
*cdata
;
272 assert("umka-314", coord
!= NULL
);
274 /* allocate carry_pool and 3 carry_level-s */
276 init_carry_pool(sizeof(*pool
) + 3 * sizeof(*lowest_level
) +
279 return PTR_ERR(pool
);
280 lowest_level
= (carry_level
*) (pool
+ 1);
281 init_carry_level(lowest_level
, pool
);
283 op
= reiser4_post_carry(lowest_level
, cop
, coord
->node
, 0);
284 if (IS_ERR(op
) || (op
== NULL
)) {
285 done_carry_pool(pool
);
286 return RETERR(op
? PTR_ERR(op
) : -EIO
);
288 cdata
= (carry_insert_data
*) (lowest_level
+ 3);
289 cdata
->coord
= coord
;
292 op
->u
.insert
.d
= cdata
;
294 flags
= znode_get_tree(coord
->node
)->carry
.insert_flags
;
295 op
->u
.insert
.flags
= flags
;
296 op
->u
.insert
.type
= COPT_ITEM_DATA
;
297 op
->u
.insert
.child
= NULL
;
299 assert("nikita-3245", lh
->node
== coord
->node
);
300 lowest_level
->track_type
= CARRY_TRACK_CHANGE
;
301 lowest_level
->tracked
= lh
;
304 result
= reiser4_carry(lowest_level
, NULL
);
305 done_carry_pool(pool
);
310 /* form carry queue to perform paste of @data with @key at @coord, and launch
311 its execution by calling carry().
313 Instruct carry to update @lh it after balancing insertion coord moves into
317 static int paste_with_carry(coord_t
* coord
, /* coord of paste */
318 lock_handle
* lh
, /* lock handle of node
321 reiser4_item_data
* data
, /* parameters of new
323 const reiser4_key
* key
, /* key of new item */
324 unsigned flags
/* paste flags */ )
328 carry_level
*lowest_level
;
329 carry_insert_data
*cdata
;
332 assert("umka-315", coord
!= NULL
);
333 assert("umka-316", key
!= NULL
);
336 init_carry_pool(sizeof(*pool
) + 3 * sizeof(*lowest_level
) +
339 return PTR_ERR(pool
);
340 lowest_level
= (carry_level
*) (pool
+ 1);
341 init_carry_level(lowest_level
, pool
);
343 op
= reiser4_post_carry(lowest_level
, COP_PASTE
, coord
->node
, 0);
344 if (IS_ERR(op
) || (op
== NULL
)) {
345 done_carry_pool(pool
);
346 return RETERR(op
? PTR_ERR(op
) : -EIO
);
348 cdata
= (carry_insert_data
*) (lowest_level
+ 3);
349 cdata
->coord
= coord
;
352 op
->u
.paste
.d
= cdata
;
354 flags
= znode_get_tree(coord
->node
)->carry
.paste_flags
;
355 op
->u
.paste
.flags
= flags
;
356 op
->u
.paste
.type
= COPT_ITEM_DATA
;
358 lowest_level
->track_type
= CARRY_TRACK_CHANGE
;
359 lowest_level
->tracked
= lh
;
362 result
= reiser4_carry(lowest_level
, NULL
);
363 done_carry_pool(pool
);
368 /* insert item at the given coord.
370 First try to skip carry by directly calling ->create_item() method of node
371 plugin. If this is impossible (there is not enough free space in the node,
372 or leftmost item in the node is created), call insert_with_carry_by_coord()
373 that will do full carry().
376 insert_result
insert_by_coord(coord_t
* coord
/* coord where to
377 * insert. coord->node has
378 * to be write locked by
380 reiser4_item_data
* data
/* data to be
382 const reiser4_key
* key
/* key of new item */ ,
383 lock_handle
* lh
/* lock handle of write
385 __u32 flags
/* insertion flags */ )
391 assert("vs-247", coord
!= NULL
);
392 assert("vs-248", data
!= NULL
);
393 assert("vs-249", data
->length
>= 0);
394 assert("nikita-1191", znode_is_write_locked(coord
->node
));
397 coord_clear_iplug(coord
);
398 result
= zload(node
);
402 item_size
= space_needed(node
, NULL
, data
, 1);
403 if (item_size
> znode_free_space(node
) &&
404 (flags
& COPI_DONT_SHIFT_LEFT
) && (flags
& COPI_DONT_SHIFT_RIGHT
)
405 && (flags
& COPI_DONT_ALLOCATE
)) {
406 /* we are forced to use free space of coord->node and new item
407 does not fit into it.
409 Currently we get here only when we allocate and copy units
410 of extent item from a node to its left neighbor during
411 "squalloc"-ing. If @node (this is left neighbor) does not
412 have enough free space - we do not want to attempt any
413 shifting and allocations because we are in squeezing and
414 everything to the left of @node is tightly packed.
416 result
= -E_NODE_FULL
;
417 } else if ((item_size
<= znode_free_space(node
)) &&
418 !coord_is_before_leftmost(coord
) &&
419 (node_plugin_by_node(node
)->fast_insert
!= NULL
)
420 && node_plugin_by_node(node
)->fast_insert(coord
)) {
421 /* shortcut insertion without carry() overhead.
425 - there is enough free space
427 - insertion is not into the leftmost position in a node
428 (otherwise it would require updating of delimiting key in a
431 - node plugin agrees with this
435 node_plugin_by_node(node
)->create_item(coord
, key
, data
,
437 znode_make_dirty(node
);
439 /* otherwise do full-fledged carry(). */
441 insert_with_carry_by_coord(coord
, lh
, data
, key
, COP_INSERT
,
448 /* @coord is set to leaf level and @data is to be inserted to twig level */
450 insert_extent_by_coord(coord_t
*
452 /* coord where to insert. coord->node * has to be write * locked by caller */
454 reiser4_item_data
* data
/* data to be inserted */ ,
455 const reiser4_key
* key
/* key of new item */ ,
457 lh
/* lock handle of write lock on * node */ )
459 assert("vs-405", coord
!= NULL
);
460 assert("vs-406", data
!= NULL
);
461 assert("vs-407", data
->length
> 0);
462 assert("vs-408", znode_is_write_locked(coord
->node
));
463 assert("vs-409", znode_get_level(coord
->node
) == LEAF_LEVEL
);
465 return insert_with_carry_by_coord(coord
, lh
, data
, key
, COP_EXTENT
,
469 /* Insert into the item at the given coord.
471 First try to skip carry by directly calling ->paste() method of item
472 plugin. If this is impossible (there is not enough free space in the node,
473 or we are pasting into leftmost position in the node), call
474 paste_with_carry() that will do full carry().
477 /* paste_into_item */
478 int insert_into_item(coord_t
* coord
/* coord of pasting */ ,
479 lock_handle
* lh
/* lock handle on node involved */ ,
480 const reiser4_key
* key
/* key of unit being pasted */ ,
481 reiser4_item_data
* data
/* parameters for new unit */ ,
482 unsigned flags
/* insert/paste flags */ )
489 assert("umka-317", coord
!= NULL
);
490 assert("umka-318", key
!= NULL
);
492 iplug
= item_plugin_by_coord(coord
);
493 nplug
= node_plugin_by_coord(coord
);
495 assert("nikita-1480", iplug
== data
->iplug
);
497 size_change
= space_needed(coord
->node
, coord
, data
, 0);
498 if (size_change
> (int)znode_free_space(coord
->node
) &&
499 (flags
& COPI_DONT_SHIFT_LEFT
) && (flags
& COPI_DONT_SHIFT_RIGHT
)
500 && (flags
& COPI_DONT_ALLOCATE
)) {
501 /* we are forced to use free space of coord->node and new data
502 does not fit into it. */
506 /* shortcut paste without carry() overhead.
510 - there is enough free space
512 - paste is not into the leftmost unit in a node (otherwise
513 it would require updating of delimiting key in a parent)
515 - node plugin agrees with this
517 - item plugin agrees with us
519 if (size_change
<= (int)znode_free_space(coord
->node
) &&
520 (coord
->item_pos
!= 0 ||
521 coord
->unit_pos
!= 0 || coord
->between
== AFTER_UNIT
) &&
522 coord
->unit_pos
!= 0 && nplug
->fast_paste
!= NULL
&&
523 nplug
->fast_paste(coord
) &&
524 iplug
->b
.fast_paste
!= NULL
&& iplug
->b
.fast_paste(coord
)) {
526 nplug
->change_item_size(coord
, size_change
);
527 /* NOTE-NIKITA: huh? where @key is used? */
528 result
= iplug
->b
.paste(coord
, data
, NULL
);
530 nplug
->change_item_size(coord
, size_change
);
531 znode_make_dirty(coord
->node
);
533 /* otherwise do full-fledged carry(). */
534 result
= paste_with_carry(coord
, lh
, data
, key
, flags
);
538 /* this either appends or truncates item @coord */
539 int reiser4_resize_item(coord_t
* coord
/* coord of item being resized */ ,
540 reiser4_item_data
* data
/* parameters of resize */ ,
541 reiser4_key
* key
/* key of new unit */ ,
542 lock_handle
* lh
/* lock handle of node
543 * being modified */ ,
544 cop_insert_flag flags
/* carry flags */ )
549 assert("nikita-362", coord
!= NULL
);
550 assert("nikita-363", data
!= NULL
);
551 assert("vs-245", data
->length
!= 0);
554 coord_clear_iplug(coord
);
555 result
= zload(node
);
559 if (data
->length
< 0)
560 result
= node_plugin_by_coord(coord
)->shrink_item(coord
,
563 result
= insert_into_item(coord
, lh
, key
, data
, flags
);
570 int reiser4_insert_flow(coord_t
* coord
, lock_handle
* lh
, flow_t
* f
)
574 carry_level
*lowest_level
;
575 reiser4_item_data
*data
;
579 init_carry_pool(sizeof(*pool
) + 3 * sizeof(*lowest_level
) +
582 return PTR_ERR(pool
);
583 lowest_level
= (carry_level
*) (pool
+ 1);
584 init_carry_level(lowest_level
, pool
);
586 op
= reiser4_post_carry(lowest_level
, COP_INSERT_FLOW
, coord
->node
,
587 0 /* operate directly on coord -> node */ );
588 if (IS_ERR(op
) || (op
== NULL
)) {
589 done_carry_pool(pool
);
590 return RETERR(op
? PTR_ERR(op
) : -EIO
);
593 /* these are permanent during insert_flow */
594 data
= (reiser4_item_data
*) (lowest_level
+ 3);
596 data
->iplug
= item_plugin_by_id(FORMATTING_ID
);
598 /* data.length and data.data will be set before calling paste or
603 op
->u
.insert_flow
.flags
= 0;
604 op
->u
.insert_flow
.insert_point
= coord
;
605 op
->u
.insert_flow
.flow
= f
;
606 op
->u
.insert_flow
.data
= data
;
607 op
->u
.insert_flow
.new_nodes
= 0;
609 lowest_level
->track_type
= CARRY_TRACK_CHANGE
;
610 lowest_level
->tracked
= lh
;
612 result
= reiser4_carry(lowest_level
, NULL
);
613 done_carry_pool(pool
);
618 /* Given a coord in parent node, obtain a znode for the corresponding child */
619 znode
*child_znode(const coord_t
* parent_coord
/* coord of pointer to
621 znode
* parent
/* parent of child */ ,
622 int incore_p
/* if !0 only return child if already in
624 int setup_dkeys_p
/* if !0 update delimiting keys of
629 assert("nikita-1374", parent_coord
!= NULL
);
630 assert("nikita-1482", parent
!= NULL
);
633 assert_rw_not_locked(&(znode_get_tree(parent
)->dk_lock
));
635 assert("nikita-2947", znode_is_any_locked(parent
));
637 if (znode_get_level(parent
) <= LEAF_LEVEL
) {
638 /* trying to get child of leaf node */
639 warning("nikita-1217", "Child of maize?");
640 return ERR_PTR(RETERR(-EIO
));
642 if (item_is_internal(parent_coord
)) {
643 reiser4_block_nr addr
;
647 iplug
= item_plugin_by_coord(parent_coord
);
648 assert("vs-512", iplug
->s
.internal
.down_link
);
649 iplug
->s
.internal
.down_link(parent_coord
, NULL
, &addr
);
651 tree
= znode_get_tree(parent
);
653 child
= zlook(tree
, &addr
);
656 zget(tree
, &addr
, parent
,
657 znode_get_level(parent
) - 1,
658 reiser4_ctx_gfp_mask_get());
659 if ((child
!= NULL
) && !IS_ERR(child
) && setup_dkeys_p
)
660 set_child_delimiting_keys(parent
, parent_coord
, child
);
662 warning("nikita-1483", "Internal item expected");
663 child
= ERR_PTR(RETERR(-EIO
));
668 /* remove znode from transaction */
669 static void uncapture_znode(znode
* node
)
673 assert("zam-1001", ZF_ISSET(node
, JNODE_HEARD_BANSHEE
));
675 if (!reiser4_blocknr_is_fake(znode_get_block(node
))) {
678 /* An already allocated block goes right to the atom's delete set. */
680 reiser4_dealloc_block(znode_get_block(node
), 0,
681 BA_DEFER
| BA_FORMATTED
);
684 "can\'t add a block (%llu) number to atom's delete set\n",
685 (unsigned long long)(*znode_get_block(node
)));
687 spin_lock_znode(node
);
688 /* Here we return flush reserved block which was reserved at the
689 * moment when this allocated node was marked dirty and still
690 * not used by flush in node relocation procedure. */
691 if (ZF_ISSET(node
, JNODE_FLUSH_RESERVED
)) {
694 atom
= jnode_get_atom(ZJNODE(node
));
695 assert("zam-939", atom
!= NULL
);
696 spin_unlock_znode(node
);
697 flush_reserved2grabbed(atom
, (__u64
) 1);
698 spin_unlock_atom(atom
);
700 spin_unlock_znode(node
);
702 /* znode has assigned block which is counted as "fake
703 allocated". Return it back to "free blocks") */
704 fake_allocated2free((__u64
) 1, BA_FORMATTED
);
708 * uncapture page from transaction. There is a possibility of a race
709 * with ->releasepage(): reiser4_releasepage() detaches page from this
710 * jnode and we have nothing to uncapture. To avoid this, get
711 * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
712 * will deal with released page itself.
714 spin_lock_znode(node
);
715 page
= znode_page(node
);
716 if (likely(page
!= NULL
)) {
718 * reiser4_uncapture_page() can only be called when we are sure
719 * that znode is pinned in memory, which we are, because
720 * forget_znode() is only called from longterm_unlock_znode().
722 page_cache_get(page
);
723 spin_unlock_znode(node
);
725 reiser4_uncapture_page(page
);
727 page_cache_release(page
);
731 /* handle "flush queued" znodes */
733 atom
= jnode_get_atom(ZJNODE(node
));
734 assert("zam-943", atom
!= NULL
);
736 if (!ZF_ISSET(node
, JNODE_FLUSH_QUEUED
)
737 || !atom
->nr_running_queues
)
740 spin_unlock_znode(node
);
741 reiser4_atom_wait_event(atom
);
742 spin_lock_znode(node
);
745 reiser4_uncapture_block(ZJNODE(node
));
746 spin_unlock_atom(atom
);
751 /* This is called from longterm_unlock_znode() when last lock is released from
752 the node that has been removed from the tree. At this point node is removed
753 from sibling list and its lock is invalidated. */
754 void forget_znode(lock_handle
* handle
)
759 assert("umka-319", handle
!= NULL
);
762 tree
= znode_get_tree(node
);
764 assert("vs-164", znode_is_write_locked(node
));
765 assert("nikita-1280", ZF_ISSET(node
, JNODE_HEARD_BANSHEE
));
766 assert_rw_locked(&(node
->lock
.guard
));
768 /* We assume that this node was detached from its parent before
769 * unlocking, it gives no way to reach this node from parent through a
770 * down link. The node should have no children and, thereby, can't be
771 * reached from them by their parent pointers. The only way to obtain a
772 * reference to the node is to use sibling pointers from its left and
773 * right neighbors. In the next several lines we remove the node from
774 * the sibling list. */
776 write_lock_tree(tree
);
777 sibling_list_remove(node
);
778 znode_remove(node
, tree
);
779 write_unlock_tree(tree
);
781 /* Here we set JNODE_DYING and cancel all pending lock requests. It
782 * forces all lock requestor threads to repeat iterations of getting
783 * lock on a child, neighbor or parent node. But, those threads can't
784 * come to this node again, because this node is no longer a child,
785 * neighbor or parent of any other node. This order of znode
786 * invalidation does not allow other threads to waste cpu time is a busy
787 * loop, trying to lock dying object. The exception is in the flush
788 * code when we take node directly from atom's capture list.*/
789 reiser4_invalidate_lock(handle
);
790 uncapture_znode(node
);
793 /* Check that internal item at @pointer really contains pointer to @child. */
794 int check_tree_pointer(const coord_t
* pointer
/* would-be pointer to
796 const znode
* child
/* child znode */ )
798 assert("nikita-1016", pointer
!= NULL
);
799 assert("nikita-1017", child
!= NULL
);
800 assert("nikita-1018", pointer
->node
!= NULL
);
802 assert("nikita-1325", znode_is_any_locked(pointer
->node
));
804 assert("nikita-2985",
805 znode_get_level(pointer
->node
) == znode_get_level(child
) + 1);
807 coord_clear_iplug((coord_t
*) pointer
);
809 if (coord_is_existing_unit(pointer
)) {
811 reiser4_block_nr addr
;
813 if (item_is_internal(pointer
)) {
814 iplug
= item_plugin_by_coord(pointer
);
815 assert("vs-513", iplug
->s
.internal
.down_link
);
816 iplug
->s
.internal
.down_link(pointer
, NULL
, &addr
);
817 /* check that cached value is correct */
818 if (disk_addr_eq(&addr
, znode_get_block(child
))) {
823 /* warning ("jmacd-1002", "tree pointer incorrect"); */
827 /* find coord of pointer to new @child in @parent.
829 Find the &coord_t in the @parent where pointer to a given @child will
833 int find_new_child_ptr(znode
* parent
/* parent znode, passed locked */ ,
835 child UNUSED_ARG
/* child znode, passed locked */ ,
836 znode
* left
/* left brother of new node */ ,
837 coord_t
* result
/* where result is stored in */ )
841 assert("nikita-1486", parent
!= NULL
);
842 assert("nikita-1487", child
!= NULL
);
843 assert("nikita-1488", result
!= NULL
);
845 ret
= find_child_ptr(parent
, left
, result
);
846 if (ret
!= NS_FOUND
) {
847 warning("nikita-1489", "Cannot find brother position: %i", ret
);
850 result
->between
= AFTER_UNIT
;
851 return RETERR(NS_NOT_FOUND
);
855 /* find coord of pointer to @child in @parent.
857 Find the &coord_t in the @parent where pointer to a given @child is in.
860 int find_child_ptr(znode
* parent
/* parent znode, passed locked */ ,
861 znode
* child
/* child znode, passed locked */ ,
862 coord_t
* result
/* where result is stored in */ )
866 /* left delimiting key of a child */
870 assert("nikita-934", parent
!= NULL
);
871 assert("nikita-935", child
!= NULL
);
872 assert("nikita-936", result
!= NULL
);
873 assert("zam-356", znode_is_loaded(parent
));
875 coord_init_zero(result
);
876 result
->node
= parent
;
878 nplug
= parent
->nplug
;
879 assert("nikita-939", nplug
!= NULL
);
881 tree
= znode_get_tree(parent
);
882 /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
883 * not aliased to ->in_parent of some znode. Otherwise,
884 * parent_coord_to_coord() below would modify data protected by tree
886 read_lock_tree(tree
);
887 /* fast path. Try to use cached value. Lock tree to keep
888 node->pos_in_parent and pos->*_blocknr consistent. */
889 if (child
->in_parent
.item_pos
+ 1 != 0) {
890 parent_coord_to_coord(&child
->in_parent
, result
);
891 if (check_tree_pointer(result
, child
) == NS_FOUND
) {
892 read_unlock_tree(tree
);
896 child
->in_parent
.item_pos
= (unsigned short)~0;
898 read_unlock_tree(tree
);
900 /* is above failed, find some key from @child. We are looking for the
901 least key in a child. */
903 ld
= *znode_get_ld_key(child
);
904 read_unlock_dk(tree
);
906 * now, lookup parent with key just found. Note, that left delimiting
907 * key doesn't identify node uniquely, because (in extremely rare
908 * case) two nodes can have equal left delimiting keys, if one of them
909 * is completely filled with directory entries that all happened to be
910 * hash collision. But, we check block number in check_tree_pointer()
913 lookup_res
= nplug
->lookup(parent
, &ld
, FIND_EXACT
, result
);
914 /* update cached pos_in_node */
915 if (lookup_res
== NS_FOUND
) {
916 write_lock_tree(tree
);
917 coord_to_parent_coord(result
, &child
->in_parent
);
918 write_unlock_tree(tree
);
919 lookup_res
= check_tree_pointer(result
, child
);
921 if (lookup_res
== NS_NOT_FOUND
)
922 lookup_res
= find_child_by_addr(parent
, child
, result
);
926 /* find coord of pointer to @child in @parent by scanning
928 Find the &coord_t in the @parent where pointer to a given @child
929 is in by scanning all internal items in @parent and comparing block
930 numbers in them with that of @child.
933 static int find_child_by_addr(znode
* parent
/* parent znode, passed locked */ ,
934 znode
* child
/* child znode, passed locked */ ,
935 coord_t
* result
/* where result is stored in */ )
939 assert("nikita-1320", parent
!= NULL
);
940 assert("nikita-1321", child
!= NULL
);
941 assert("nikita-1322", result
!= NULL
);
945 for_all_units(result
, parent
) {
946 if (check_tree_pointer(result
, child
) == NS_FOUND
) {
947 write_lock_tree(znode_get_tree(parent
));
948 coord_to_parent_coord(result
, &child
->in_parent
);
949 write_unlock_tree(znode_get_tree(parent
));
957 /* true, if @addr is "unallocated block number", which is just address, with
959 int is_disk_addr_unallocated(const reiser4_block_nr
* addr
/* address to
962 assert("nikita-1766", addr
!= NULL
);
963 cassert(sizeof(reiser4_block_nr
) == 8);
964 return (*addr
& REISER4_BLOCKNR_STATUS_BIT_MASK
) ==
965 REISER4_UNALLOCATED_STATUS_VALUE
;
968 /* returns true if removing bytes of given range of key [from_key, to_key]
969 causes removing of whole item @from */
971 item_removed_completely(coord_t
* from
, const reiser4_key
* from_key
,
972 const reiser4_key
* to_key
)
975 reiser4_key key_in_item
;
977 assert("umka-325", from
!= NULL
);
978 assert("", item_is_extent(from
));
980 /* check first key just for case */
981 item_key_by_coord(from
, &key_in_item
);
982 if (keygt(from_key
, &key_in_item
))
986 iplug
= item_plugin_by_coord(from
);
987 assert("vs-611", iplug
&& iplug
->s
.file
.append_key
);
989 iplug
->s
.file
.append_key(from
, &key_in_item
);
990 set_key_offset(&key_in_item
, get_key_offset(&key_in_item
) - 1);
992 if (keylt(to_key
, &key_in_item
))
993 /* last byte is not removed */
998 /* helper function for prepare_twig_kill(): @left and @right are formatted
999 * neighbors of extent item being completely removed. Load and lock neighbors
1000 * and store lock handles into @cdata for later use by kill_hook_extent() */
1002 prepare_children(znode
* left
, znode
* right
, carry_kill_data
* kdata
)
1009 left_loaded
= right_loaded
= 0;
1012 result
= zload(left
);
1015 result
= longterm_lock_znode(kdata
->left
, left
,
1020 if (result
== 0 && right
!= NULL
) {
1021 result
= zload(right
);
1024 result
= longterm_lock_znode(kdata
->right
, right
,
1027 ZNODE_LOCK_NONBLOCK
);
1031 done_lh(kdata
->left
);
1032 done_lh(kdata
->right
);
1033 if (left_loaded
!= 0)
1035 if (right_loaded
!= 0)
1041 static void done_children(carry_kill_data
* kdata
)
1043 if (kdata
->left
!= NULL
&& kdata
->left
->node
!= NULL
) {
1044 zrelse(kdata
->left
->node
);
1045 done_lh(kdata
->left
);
1047 if (kdata
->right
!= NULL
&& kdata
->right
->node
!= NULL
) {
1048 zrelse(kdata
->right
->node
);
1049 done_lh(kdata
->right
);
1053 /* part of cut_node. It is called when cut_node is called to remove or cut part
1054 of extent item. When head of that item is removed - we have to update right
1055 delimiting of left neighbor of extent. When item is removed completely - we
1056 have to set sibling link between left and right neighbor of removed
1057 extent. This may return -E_DEADLOCK because of trying to get left neighbor
1058 locked. So, caller should repeat an attempt
1060 /* Audited by: umka (2002.06.16) */
1062 prepare_twig_kill(carry_kill_data
* kdata
, znode
* locked_left_neighbor
)
1066 lock_handle left_lh
;
1067 lock_handle right_lh
;
1073 int left_zloaded_here
, right_zloaded_here
;
1075 from
= kdata
->params
.from
;
1076 assert("umka-326", from
!= NULL
);
1077 assert("umka-327", kdata
->params
.to
!= NULL
);
1079 /* for one extent item only yet */
1080 assert("vs-591", item_is_extent(from
));
1081 assert("vs-592", from
->item_pos
== kdata
->params
.to
->item_pos
);
1083 if ((kdata
->params
.from_key
1084 && keygt(kdata
->params
.from_key
, item_key_by_coord(from
, &key
)))
1085 || from
->unit_pos
!= 0) {
1086 /* head of item @from is not removed, there is nothing to
1092 left_zloaded_here
= 0;
1093 right_zloaded_here
= 0;
1095 left_child
= right_child
= NULL
;
1097 coord_dup(&left_coord
, from
);
1100 if (coord_prev_unit(&left_coord
)) {
1101 /* @from is leftmost item in its node */
1102 if (!locked_left_neighbor
) {
1104 reiser4_get_left_neighbor(&left_lh
, from
->node
,
1106 GN_CAN_USE_UPPER_LEVELS
);
1110 case -E_NO_NEIGHBOR
:
1111 /* there is no formatted node to the left of
1114 "extent item has smallest key in "
1115 "the tree and it is about to be removed");
1118 /* need to restart */
1123 /* we have acquired left neighbor of from->node */
1124 result
= zload(left_lh
.node
);
1128 locked_left_neighbor
= left_lh
.node
;
1130 /* squalloc_right_twig_cut should have supplied locked
1133 znode_is_write_locked(locked_left_neighbor
));
1134 result
= zload(locked_left_neighbor
);
1139 left_zloaded_here
= 1;
1140 coord_init_last_unit(&left_coord
, locked_left_neighbor
);
1143 if (!item_is_internal(&left_coord
)) {
1144 /* what else but extent can be on twig level */
1145 assert("vs-606", item_is_extent(&left_coord
));
1147 /* there is no left formatted child */
1148 if (left_zloaded_here
)
1149 zrelse(locked_left_neighbor
);
1154 tree
= znode_get_tree(left_coord
.node
);
1155 left_child
= child_znode(&left_coord
, left_coord
.node
, 1, 0);
1157 if (IS_ERR(left_child
)) {
1158 result
= PTR_ERR(left_child
);
1162 /* left child is acquired, calculate new right delimiting key for it
1163 and get right child if it is necessary */
1164 if (item_removed_completely
1165 (from
, kdata
->params
.from_key
, kdata
->params
.to_key
)) {
1166 /* try to get right child of removed item */
1167 coord_t right_coord
;
1170 kdata
->params
.to
->unit_pos
==
1171 coord_last_unit_pos(kdata
->params
.to
));
1172 coord_dup(&right_coord
, kdata
->params
.to
);
1173 if (coord_next_unit(&right_coord
)) {
1174 /* @to is rightmost unit in the node */
1176 reiser4_get_right_neighbor(&right_lh
, from
->node
,
1178 GN_CAN_USE_UPPER_LEVELS
);
1181 result
= zload(right_lh
.node
);
1185 right_zloaded_here
= 1;
1186 coord_init_first_unit(&right_coord
,
1188 item_key_by_coord(&right_coord
, &key
);
1191 case -E_NO_NEIGHBOR
:
1192 /* there is no formatted node to the right of
1195 key
= *znode_get_rd_key(from
->node
);
1196 read_unlock_dk(tree
);
1197 right_coord
.node
= NULL
;
1205 /* there is an item to the right of @from - take its key */
1206 item_key_by_coord(&right_coord
, &key
);
1209 /* try to get right child of @from */
1210 if (right_coord
.node
&& /* there is right neighbor of @from */
1211 item_is_internal(&right_coord
)) { /* it is internal item */
1212 right_child
= child_znode(&right_coord
,
1213 right_coord
.node
, 1, 0);
1215 if (IS_ERR(right_child
)) {
1216 result
= PTR_ERR(right_child
);
1221 /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
1222 update of right delimiting key of left_child */
1223 result
= prepare_children(left_child
, right_child
, kdata
);
1225 /* head of item @to is removed. left_child has to get right delimting key update. Prepare it for that */
1226 result
= prepare_children(left_child
, NULL
, kdata
);
1232 if (right_zloaded_here
)
1233 zrelse(right_lh
.node
);
1238 if (left_zloaded_here
)
1239 zrelse(locked_left_neighbor
);
1244 /* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
1245 are to be cut completely */
1246 /* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
1247 int cut_node_content(coord_t
* from
, coord_t
* to
, const reiser4_key
* from_key
, /* first key to be removed */
1248 const reiser4_key
* to_key
, /* last key to be removed */
1250 smallest_removed
/* smallest key actually removed */ )
1254 carry_level
*lowest_level
;
1255 carry_cut_data
*cut_data
;
1258 assert("vs-1715", coord_compare(from
, to
) != COORD_CMP_ON_RIGHT
);
1261 init_carry_pool(sizeof(*pool
) + 3 * sizeof(*lowest_level
) +
1264 return PTR_ERR(pool
);
1265 lowest_level
= (carry_level
*) (pool
+ 1);
1266 init_carry_level(lowest_level
, pool
);
1268 op
= reiser4_post_carry(lowest_level
, COP_CUT
, from
->node
, 0);
1269 assert("vs-1509", op
!= 0);
1271 done_carry_pool(pool
);
1275 cut_data
= (carry_cut_data
*) (lowest_level
+ 3);
1276 cut_data
->params
.from
= from
;
1277 cut_data
->params
.to
= to
;
1278 cut_data
->params
.from_key
= from_key
;
1279 cut_data
->params
.to_key
= to_key
;
1280 cut_data
->params
.smallest_removed
= smallest_removed
;
1282 op
->u
.cut_or_kill
.is_cut
= 1;
1283 op
->u
.cut_or_kill
.u
.cut
= cut_data
;
1285 result
= reiser4_carry(lowest_level
, NULL
);
1286 done_carry_pool(pool
);
1291 /* cut part of the node
1293 Cut part or whole content of node.
1295 cut data between @from and @to of @from->node and call carry() to make
1296 corresponding changes in the tree. @from->node may become empty. If so -
1297 pointer to it will be removed. Neighboring nodes are not changed. Smallest
1298 removed key is stored in @smallest_removed
1301 int kill_node_content(coord_t
* from
, /* coord of the first unit/item that will be eliminated */
1302 coord_t
* to
, /* coord of the last unit/item that will be eliminated */
1303 const reiser4_key
* from_key
, /* first key to be removed */
1304 const reiser4_key
* to_key
, /* last key to be removed */
1305 reiser4_key
* smallest_removed
, /* smallest key actually removed */
1306 znode
* locked_left_neighbor
, /* this is set when kill_node_content is called with left neighbor
1307 * locked (in squalloc_right_twig_cut, namely) */
1308 struct inode
*inode
, /* inode of file whose item (or its part) is to be killed. This is necessary to
1309 invalidate pages together with item pointing to them */
1311 { /* this call is made for file truncate) */
1314 carry_level
*lowest_level
;
1315 carry_kill_data
*kdata
;
1316 lock_handle
*left_child
;
1317 lock_handle
*right_child
;
1320 assert("umka-328", from
!= NULL
);
1321 assert("vs-316", !node_is_empty(from
->node
));
1322 assert("nikita-1812", coord_is_existing_unit(from
)
1323 && coord_is_existing_unit(to
));
1325 /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
1326 pool
= init_carry_pool(sizeof(*pool
) + 3 * sizeof(*lowest_level
) +
1327 sizeof(carry_kill_data
) +
1328 2 * sizeof(lock_handle
) +
1329 5 * sizeof(reiser4_key
) + 2 * sizeof(coord_t
));
1331 return PTR_ERR(pool
);
1333 lowest_level
= (carry_level
*) (pool
+ 1);
1334 init_carry_level(lowest_level
, pool
);
1336 kdata
= (carry_kill_data
*) (lowest_level
+ 3);
1337 left_child
= (lock_handle
*) (kdata
+ 1);
1338 right_child
= left_child
+ 1;
1340 init_lh(left_child
);
1341 init_lh(right_child
);
1343 kdata
->params
.from
= from
;
1344 kdata
->params
.to
= to
;
1345 kdata
->params
.from_key
= from_key
;
1346 kdata
->params
.to_key
= to_key
;
1347 kdata
->params
.smallest_removed
= smallest_removed
;
1348 kdata
->params
.truncate
= truncate
;
1350 kdata
->inode
= inode
;
1351 kdata
->left
= left_child
;
1352 kdata
->right
= right_child
;
1353 /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
1354 kdata
->buf
= (char *)(right_child
+ 1);
1356 if (znode_get_level(from
->node
) == TWIG_LEVEL
&& item_is_extent(from
)) {
1357 /* left child of extent item may have to get updated right
1358 delimiting key and to get linked with right child of extent
1359 @from if it will be removed completely */
1360 result
= prepare_twig_kill(kdata
, locked_left_neighbor
);
1362 done_children(kdata
);
1363 done_carry_pool(pool
);
1368 op
= reiser4_post_carry(lowest_level
, COP_CUT
, from
->node
, 0);
1369 if (IS_ERR(op
) || (op
== NULL
)) {
1370 done_children(kdata
);
1371 done_carry_pool(pool
);
1372 return RETERR(op
? PTR_ERR(op
) : -EIO
);
1375 op
->u
.cut_or_kill
.is_cut
= 0;
1376 op
->u
.cut_or_kill
.u
.kill
= kdata
;
1378 result
= reiser4_carry(lowest_level
, NULL
);
1380 done_children(kdata
);
1381 done_carry_pool(pool
);
1386 fake_kill_hook_tail(struct inode
*inode
, loff_t start
, loff_t end
, int truncate
)
1388 if (reiser4_inode_get_flag(inode
, REISER4_HAS_MMAP
)) {
1389 pgoff_t start_pg
, end_pg
;
1391 start_pg
= start
>> PAGE_CACHE_SHIFT
;
1392 end_pg
= (end
- 1) >> PAGE_CACHE_SHIFT
;
1394 if ((start
& (PAGE_CACHE_SIZE
- 1)) == 0) {
1396 * kill up to the page boundary.
1398 assert("vs-123456", start_pg
== end_pg
);
1399 reiser4_invalidate_pages(inode
->i_mapping
, start_pg
, 1,
1401 } else if (start_pg
!= end_pg
) {
1403 * page boundary is within killed portion of node.
1405 assert("vs-654321", end_pg
- start_pg
== 1);
1406 reiser4_invalidate_pages(inode
->i_mapping
, end_pg
,
1407 end_pg
- start_pg
, 1);
1410 inode_sub_bytes(inode
, end
- start
);
1414 * Delete whole @node from the reiser4 tree without loading it.
1416 * @left: locked left neighbor,
1417 * @node: node to be deleted,
1418 * @smallest_removed: leftmost key of deleted node,
1419 * @object: inode pointer, if we truncate a file body.
1420 * @truncate: true if called for file truncate.
1422 * @return: 0 if success, error code otherwise.
1424 * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
1425 * contains the right value of the smallest removed key from the previous
1426 * cut_worker() iteration. This is needed for proper accounting of
1427 * "i_blocks" and "i_bytes" fields of the @object.
1429 int reiser4_delete_node(znode
* node
, reiser4_key
* smallest_removed
,
1430 struct inode
*object
, int truncate
)
1432 lock_handle parent_lock
;
1438 assert("zam-937", node
!= NULL
);
1439 assert("zam-933", znode_is_write_locked(node
));
1440 assert("zam-999", smallest_removed
!= NULL
);
1442 init_lh(&parent_lock
);
1444 ret
= reiser4_get_parent(&parent_lock
, node
, ZNODE_WRITE_LOCK
);
1448 assert("zam-934", !znode_above_root(parent_lock
.node
));
1450 ret
= zload(parent_lock
.node
);
1452 goto failed_nozrelse
;
1454 ret
= find_child_ptr(parent_lock
.node
, node
, &cut_from
);
1458 /* decrement child counter and set parent pointer to NULL before
1459 deleting the list from parent node because of checks in
1460 internal_kill_item_hook (we can delete the last item from the parent
1461 node, the parent node is going to be deleted and its c_count should
1464 tree
= znode_get_tree(node
);
1465 write_lock_tree(tree
);
1466 init_parent_coord(&node
->in_parent
, NULL
);
1467 --parent_lock
.node
->c_count
;
1468 write_unlock_tree(tree
);
1470 assert("zam-989", item_is_internal(&cut_from
));
1472 /* @node should be deleted after unlocking. */
1473 ZF_SET(node
, JNODE_HEARD_BANSHEE
);
1475 /* remove a pointer from the parent node to the node being deleted. */
1476 coord_dup(&cut_to
, &cut_from
);
1477 /* FIXME: shouldn't this be kill_node_content */
1478 ret
= cut_node_content(&cut_from
, &cut_to
, NULL
, NULL
, NULL
);
1480 /* FIXME(Zam): Should we re-connect the node to its parent if
1481 * cut_node fails? */
1485 reiser4_tree
*tree
= current_tree
;
1486 __u64 start_offset
= 0, end_offset
= 0;
1488 read_lock_tree(tree
);
1489 write_lock_dk(tree
);
1491 /* We use @smallest_removed and the left delimiting of
1492 * the current node for @object->i_blocks, i_bytes
1493 * calculation. We assume that the items after the
1494 * *@smallest_removed key have been deleted from the
1496 start_offset
= get_key_offset(znode_get_ld_key(node
));
1497 end_offset
= get_key_offset(smallest_removed
);
1500 assert("zam-1021", znode_is_connected(node
));
1502 znode_set_rd_key(node
->left
, znode_get_rd_key(node
));
1504 *smallest_removed
= *znode_get_ld_key(node
);
1506 write_unlock_dk(tree
);
1507 read_unlock_tree(tree
);
1510 /* we used to perform actions which are to be performed on items on their removal from tree in
1511 special item method - kill_hook. Here for optimization reasons we avoid reading node
1512 containing item we remove and can not call item's kill hook. Instead we call function which
1513 does exactly the same things as tail kill hook in assumption that node we avoid reading
1514 contains only one item and that item is a tail one. */
1515 fake_kill_hook_tail(object
, start_offset
, end_offset
,
1520 zrelse(parent_lock
.node
);
1522 done_lh(&parent_lock
);
1527 static int can_delete(const reiser4_key
*key
, znode
*node
)
1531 read_lock_dk(current_tree
);
1532 result
= keyle(key
, znode_get_ld_key(node
));
1533 read_unlock_dk(current_tree
);
1538 * This subroutine is not optimal but implementation seems to
1541 * @tap: the point deletion process begins from,
1542 * @from_key: the beginning of the deleted key range,
1543 * @to_key: the end of the deleted key range,
1544 * @smallest_removed: the smallest removed key,
1545 * @truncate: true if called for file truncate.
1546 * @progress: return true if a progress in file items deletions was made,
1547 * @smallest_removed value is actual in that case.
1549 * @return: 0 if success, error code otherwise, -E_REPEAT means that long
1550 * reiser4_cut_tree operation was interrupted for allowing atom commit.
1553 cut_tree_worker_common(tap_t
* tap
, const reiser4_key
* from_key
,
1554 const reiser4_key
* to_key
,
1555 reiser4_key
* smallest_removed
, struct inode
*object
,
1556 int truncate
, int *progress
)
1558 lock_handle next_node_lock
;
1562 assert("zam-931", tap
->coord
->node
!= NULL
);
1563 assert("zam-932", znode_is_write_locked(tap
->coord
->node
));
1566 init_lh(&next_node_lock
);
1569 znode
*node
; /* node from which items are cut */
1570 node_plugin
*nplug
; /* node plugin for @node */
1572 node
= tap
->coord
->node
;
1574 /* Move next_node_lock to the next node on the left. */
1576 reiser4_get_left_neighbor(&next_node_lock
, node
,
1578 GN_CAN_USE_UPPER_LEVELS
);
1579 if (result
!= 0 && result
!= -E_NO_NEIGHBOR
)
1581 /* Check can we delete the node as a whole. */
1582 if (*progress
&& znode_get_level(node
) == LEAF_LEVEL
&&
1583 can_delete(from_key
, node
)) {
1584 result
= reiser4_delete_node(node
, smallest_removed
,
1587 result
= reiser4_tap_load(tap
);
1591 /* Prepare the second (right) point for cut_node() */
1593 coord_init_last_unit(tap
->coord
, node
);
1595 else if (item_plugin_by_coord(tap
->coord
)->b
.lookup
==
1597 /* set rightmost unit for the items without lookup method */
1598 tap
->coord
->unit_pos
=
1599 coord_last_unit_pos(tap
->coord
);
1601 nplug
= node
->nplug
;
1603 assert("vs-686", nplug
);
1604 assert("vs-687", nplug
->lookup
);
1606 /* left_coord is leftmost unit cut from @node */
1607 result
= nplug
->lookup(node
, from_key
,
1608 FIND_MAX_NOT_MORE_THAN
,
1611 if (IS_CBKERR(result
))
1614 /* adjust coordinates so that they are set to existing units */
1615 if (coord_set_to_right(&left_coord
)
1616 || coord_set_to_left(tap
->coord
)) {
1621 if (coord_compare(&left_coord
, tap
->coord
) ==
1622 COORD_CMP_ON_RIGHT
) {
1623 /* keys from @from_key to @to_key are not in the tree */
1628 if (left_coord
.item_pos
!= tap
->coord
->item_pos
) {
1629 /* do not allow to cut more than one item. It is added to solve problem of truncating
1630 partially converted files. If file is partially converted there may exist a twig node
1631 containing both internal item or items pointing to leaf nodes with formatting items
1632 and extent item. We do not want to kill internal items being at twig node here
1633 because cut_tree_worker assumes killing them from level level */
1634 coord_dup(&left_coord
, tap
->coord
);
1636 coord_is_existing_unit(&left_coord
));
1637 left_coord
.unit_pos
= 0;
1640 /* cut data from one node */
1641 // *smallest_removed = *reiser4_min_key();
1643 kill_node_content(&left_coord
, tap
->coord
, from_key
,
1644 to_key
, smallest_removed
,
1645 next_node_lock
.node
, object
,
1647 reiser4_tap_relse(tap
);
1654 /* Check whether all items with keys >= from_key were removed
1656 if (keyle(smallest_removed
, from_key
))
1660 if (next_node_lock
.node
== NULL
)
1663 result
= reiser4_tap_move(tap
, &next_node_lock
);
1664 done_lh(&next_node_lock
);
1668 /* Break long reiser4_cut_tree operation (deletion of a large
1669 file) if atom requires commit. */
1670 if (*progress
> CUT_TREE_MIN_ITERATIONS
1671 && current_atom_should_commit()) {
1676 done_lh(&next_node_lock
);
1677 // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key()));
1681 /* there is a fundamental problem with optimizing deletes: VFS does it
1682 one file at a time. Another problem is that if an item can be
1683 anything, then deleting items must be done one at a time. It just
1684 seems clean to writes this to specify a from and a to key, and cut
1685 everything between them though. */
1687 /* use this function with care if deleting more than what is part of a single file. */
1688 /* do not use this when cutting a single item, it is suboptimal for that */
1690 /* You are encouraged to write plugin specific versions of this. It
1691 cannot be optimal for all plugins because it works item at a time,
1692 and some plugins could sometimes work node at a time. Regular files
1693 however are not optimizable to work node at a time because of
1694 extents needing to free the blocks they point to.
1696 Optimizations compared to v3 code:
1698 It does not balance (that task is left to memory pressure code).
1700 Nodes are deleted only if empty.
1704 Performs read-ahead of formatted nodes whose contents are part of
1709 * Delete everything from the reiser4 tree between two keys: @from_key and
1712 * @from_key: the beginning of the deleted key range,
1713 * @to_key: the end of the deleted key range,
1714 * @smallest_removed: the smallest removed key,
1715 * @object: owner of cutting items.
1716 * @truncate: true if called for file truncate.
1717 * @progress: return true if a progress in file items deletions was made,
1718 * @smallest_removed value is actual in that case.
1720 * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
1721 * operation was interrupted for allowing atom commit .
1724 int reiser4_cut_tree_object(reiser4_tree
* tree
, const reiser4_key
* from_key
,
1725 const reiser4_key
* to_key
,
1726 reiser4_key
* smallest_removed_p
,
1727 struct inode
*object
, int truncate
, int *progress
)
1732 coord_t right_coord
;
1733 reiser4_key smallest_removed
;
1734 int (*cut_tree_worker
) (tap_t
*, const reiser4_key
*,
1735 const reiser4_key
*, reiser4_key
*,
1736 struct inode
*, int, int *);
1739 assert("umka-329", tree
!= NULL
);
1740 assert("umka-330", from_key
!= NULL
);
1741 assert("umka-331", to_key
!= NULL
);
1742 assert("zam-936", keyle(from_key
, to_key
));
1744 if (smallest_removed_p
== NULL
)
1745 smallest_removed_p
= &smallest_removed
;
1750 /* Find rightmost item to cut away from the tree. */
1751 result
= reiser4_object_lookup(object
, to_key
, &right_coord
,
1752 &lock
, ZNODE_WRITE_LOCK
,
1753 FIND_MAX_NOT_MORE_THAN
,
1754 TWIG_LEVEL
, LEAF_LEVEL
,
1755 CBK_UNIQUE
, NULL
/*ra_info */);
1756 if (result
!= CBK_COORD_FOUND
)
1759 || inode_file_plugin(object
)->cut_tree_worker
== NULL
)
1760 cut_tree_worker
= cut_tree_worker_common
;
1763 inode_file_plugin(object
)->cut_tree_worker
;
1764 reiser4_tap_init(&tap
, &right_coord
, &lock
, ZNODE_WRITE_LOCK
);
1766 cut_tree_worker(&tap
, from_key
, to_key
, smallest_removed_p
,
1767 object
, truncate
, progress
);
1768 reiser4_tap_done(&tap
);
1770 reiser4_preempt_point();
1778 case -E_NO_NEIGHBOR
:
1788 warning("nikita-2861", "failure: %i", result
);
1796 /* repeat reiser4_cut_tree_object until everything is deleted.
1797 * unlike cut_file_items, it does not end current transaction if -E_REPEAT
1798 * is returned by cut_tree_object. */
1799 int reiser4_cut_tree(reiser4_tree
* tree
, const reiser4_key
* from
,
1800 const reiser4_key
* to
, struct inode
*inode
, int truncate
)
1806 result
= reiser4_cut_tree_object(tree
, from
, to
, NULL
,
1807 inode
, truncate
, &progress
);
1808 } while (result
== -E_REPEAT
);
1813 /* finishing reiser4 initialization */
1814 int reiser4_init_tree(reiser4_tree
* tree
/* pointer to structure being
1816 const reiser4_block_nr
* root_block
/* address of a root block
1818 tree_level height
/* height of a tree */ ,
1819 node_plugin
* nplug
/* default node plugin */ )
1823 assert("nikita-306", tree
!= NULL
);
1824 assert("nikita-307", root_block
!= NULL
);
1825 assert("nikita-308", height
> 0);
1826 assert("nikita-309", nplug
!= NULL
);
1827 assert("zam-587", tree
->super
!= NULL
);
1829 tree
->root_block
= *root_block
;
1830 tree
->height
= height
;
1831 tree
->estimate_one_insert
= calc_estimate_one_insert(height
);
1832 tree
->nplug
= nplug
;
1834 tree
->znode_epoch
= 1ull;
1836 cbk_cache_init(&tree
->cbk_cache
);
1838 result
= znodes_tree_init(tree
);
1840 result
= jnodes_tree_init(tree
);
1842 tree
->uber
= zget(tree
, &UBER_TREE_ADDR
, NULL
, 0,
1843 reiser4_ctx_gfp_mask_get());
1844 if (IS_ERR(tree
->uber
)) {
1845 result
= PTR_ERR(tree
->uber
);
1852 /* release resources associated with @tree */
1853 void reiser4_done_tree(reiser4_tree
* tree
/* tree to release */ )
1858 if (tree
->uber
!= NULL
) {
1862 znodes_tree_done(tree
);
1863 jnodes_tree_done(tree
);
1864 cbk_cache_done(&tree
->cbk_cache
);
1867 /* Make Linus happy.
1869 c-indentation-style: "K&R"