1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4 /* The design document for this file is at http://www.namesys.com/v4/v4.html. */
11 #include "plugin/item/item.h"
12 #include "plugin/plugin.h"
13 #include "plugin/object.h"
17 #include "block_alloc.h"
18 #include "tree_walk.h"
23 #include "page_cache.h"
31 #include <asm/atomic.h>
32 #include <linux/fs.h> /* for struct super_block */
33 #include <linux/mm.h> /* for struct page */
34 #include <linux/bio.h> /* for struct bio */
35 #include <linux/pagemap.h>
36 #include <linux/blkdev.h>
38 /* IMPLEMENTATION NOTES */
40 /* PARENT-FIRST: Some terminology: A parent-first traversal is a way of
41 assigning a total order to the nodes of the tree in which the parent is
42 placed before its children, which are ordered (recursively) in left-to-right
43 order. When we speak of a "parent-first preceder", it describes the node that
44 "came before in forward parent-first order". When we speak of a "parent-first
45 follower", it describes the node that "comes next in parent-first order"
46 (alternatively the node that "came before in reverse parent-first order").
The following pseudo-code prints the nodes of a tree in forward parent-first
order:

void parent_first (node)
{
	print_node (node);
	if (node->level > leaf) {
		for (i = 0; i < num_children; i += 1) {
			parent_first (node->child[i]);
		}
	}
}
*/
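/* For example, in a small tree with root R, internal nodes A and B, and
   leaves (a1, a2) under A and (b1, b2) under B, forward parent-first order
   is: R, A, a1, a2, B, b1, b2. */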
62 /* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block
63 allocation so that a left-to-right scan of the tree's data (i.e., the leaves
64 in left-to-right order) can be accomplished with sequential reads, which
65 results in reading nodes in their parent-first order. This is a
66 read-optimization aspect of the flush algorithm, and there is also a
67 write-optimization aspect, which is that we wish to make large sequential
68 writes to the disk by allocating or reallocating blocks so that they can be
69 written in sequence. Sometimes the read-optimization and write-optimization
70 goals conflict with each other, as we discuss in more detail below.
73 /* STATE BITS: The flush code revolves around the state of the jnodes it covers.
74 Here are the relevant jnode->state bits and their relevance to flush:
76 JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be
77 written it must be allocated first. In order to be considered allocated,
78 the jnode must have exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These
79 two bits are exclusive, and all dirtied jnodes eventually have one of these
80 bits set during each transaction.
82 JNODE_CREATED: The node was freshly created in its transaction and has no
83 previous block address, so it is unconditionally assigned to be relocated,
84 although this is mainly for code-convenience. It is not being 'relocated'
85 from anything, but in almost every regard it is treated as part of the
86 relocate set. The JNODE_CREATED bit remains set even after JNODE_RELOC is
87 set, so the actual relocate can be distinguished from the
88 created-and-allocated set easily: relocate-set members (belonging to the
89 preserve-set) have (JNODE_RELOC) set and created-set members which have no
90 previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
92 JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm
93 made the decision to maintain the pre-existing location for this node and
94 it will be written to the wandered-log.
96 JNODE_RELOC: The flush algorithm made the decision to relocate this block
97 (if it was not created, see note above). A block with JNODE_RELOC set is
98 eligible for early-flushing and may be submitted during flush_empty_queues.
99 When the JNODE_RELOC bit is set on a znode, the parent node's internal item
100 is modified and the znode is rehashed.
102 JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm
103 scans the node and calls the plugin->f.squeeze() method for its items. This
104 is how disk clusters of cryptcompress objects are updated. Also, if the
105 leftmost point found by the flush scan has this flag set (it races with
106 write(), a rare case), the flush algorithm decides to pass it to
107 squalloc() in spite of its flushprepped status, for squeezing, not for allocation.
110 JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode
111 into its flush queue. This means the jnode is not on any clean or dirty
112 list, instead it is moved to one of the flush queue (see flush_queue.h)
113 object's private lists. This prevents multiple concurrent flushes from
114 attempting to start flushing from the same node.
116 (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
117 squeeze-and-allocate on a node while its children are actively being
118 squeezed and allocated. This flag was created to avoid submitting a write
119 request for a node while its children are still being allocated and
120 squeezed. Then the flush queue was re-implemented to allow an unlimited number
121 of nodes to be queued. This flag support was commented out in the source code because
122 we decided that there was no reason to submit queued nodes before
123 jnode_flush() finishes. However, current code calls fq_write() during a
124 slum traversal and may submit "busy nodes" to disk. Probably we can
125 re-enable the JNODE_FLUSH_BUSY bit support in future.
127 With these state bits, we describe a test used frequently in the code below,
128 jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()).
129 The test for "flushprepped" returns true if any of the following are true:
131 - The node is not dirty
132 - The node has JNODE_RELOC set
133 - The node has JNODE_OVRWR set
135 If either the node is not dirty or it has already been processed by flush
136 (and assigned JNODE_OVRWR or JNODE_RELOC), then it is prepped. If
137 jnode_is_flushprepped() returns true then flush has no work to do on that node.
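/* A minimal sketch of the "flushprepped" test described above. The real
   jnode_is_flushprepped() and jnode_check_flushprepped() are defined
   elsewhere; this illustrative helper only restates the three conditions: */
static inline int flushprepped_sketch(jnode * node)
{
	return !JF_ISSET(node, JNODE_DIRTY) ||
	    JF_ISSET(node, JNODE_RELOC) || JF_ISSET(node, JNODE_OVRWR);
}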
140 /* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
141 flushprepped twice (unless an explicit call to flush_unprep is made as
142 described in detail below). For example a node is dirtied, allocated, and
143 then early-flushed to disk and set clean. Before the transaction commits, the
144 page is dirtied again and, due to memory pressure, the node is flushed again.
145 The flush algorithm will not relocate the node to a new disk location, it
146 will simply write it to the same, previously relocated position again.
149 /* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm
150 where we start at a leaf node and allocate in parent-first order by iterating
151 to the right. At each step of the iteration, we check for the right neighbor.
152 Before advancing to the right neighbor, we check if the current position and
153 the right neighbor share the same parent. If they do not share the same
154 parent, the parent is allocated before the right neighbor.
156 This process goes recursively up the tree, squeezing nodes level by level, as
157 long as the right neighbor and the current position have different parents,
158 then it allocates the right-neighbors-with-different-parents on the way back
159 down. This process is described in more detail in
160 flush_squalloc_changed_ancestor and the recursive function
161 squalloc_one_changed_ancestor. But the purpose here is not so much to discuss
162 the specifics of the bottom-up approach as to contrast the bottom-up and top-down approaches.
165 The top-down algorithm was implemented earlier (April-May 2002). In the
166 top-down approach, we find a starting point by scanning left along each level
167 past dirty nodes, then going up and repeating the process until the left node
168 and the parent node are clean. We then perform a parent-first traversal from
169 the starting point, which makes allocating in parent-first order trivial.
170 After one subtree has been allocated in this manner, we move to the right,
171 try moving upward, then repeat the parent-first traversal.
173 Both approaches have problems that need to be addressed. Both are
174 approximately the same amount of code, but the bottom-up approach has
175 advantages in the order it acquires locks which, at the very least, make it
176 the better approach. At first glance each one makes the other one look
177 simpler, so it is important to remember a few of the problems with each one.
179 Main problem with the top-down approach: When you encounter a clean child
180 during the parent-first traversal, what do you do? You would like to avoid
181 searching through a large tree of nodes just to find a few dirty leaves at
182 the bottom, and there is not an obvious solution. One of the advantages of
183 the top-down approach is that during the parent-first traversal you check
184 every child of a parent to see if it is dirty. In this way, the top-down
185 approach easily handles the main problem of the bottom-up approach:
186 unallocated children.
188 The unallocated children problem is that before writing a node to disk we
189 must make sure that all of its children are allocated. Otherwise, writing
190 the node means extra I/O because the node will have to be written again when
191 the child is finally allocated.
193 WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs,
194 this should not cause any file system corruption, it only degrades I/O
195 performance because a node may be written when it is sure to be written at
196 least one more time in the same transaction when the remaining children are
197 allocated. What follows is a description of how we will solve the problem.
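/* A rough pseudo-code sketch of the bottom-up iteration described at the top
   of this comment (names are illustrative; the real work is done by
   squalloc() and the squalloc_*_changed_ancestor routines):

   while (flush position is valid) {
	   right = right_neighbor(position);
	   if (!same_parent(position, right))
		   squeeze and allocate the changed ancestors of right;
	   advance position to right;
   }
*/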
200 /* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node,
201 then proceeding in parent first order, allocate some of its left-children,
202 then encounter a clean child in the middle of the parent. We do not allocate
203 the clean child, but there may remain unallocated (dirty) children to the
204 right of the clean child. If we were to stop flushing at this moment and
205 write everything to disk, the parent might still contain unallocated children.
208 We could try to allocate all the descendants of every node that we allocate,
209 but this is not necessary. Doing so could result in allocating the entire
210 tree: if the root node is allocated then every unallocated node would have to
211 be allocated before flushing. Actually, we do not have to write a node just
212 because we allocate it. It is possible to allocate but not write a node
213 during flush, when it still has unallocated children. However, this approach
214 is probably not optimal for the following reason.
216 The flush algorithm is designed to allocate nodes in parent-first order in an
217 attempt to optimize reads that occur in the same order. Thus we are
218 read-optimizing for a left-to-right scan through all the leaves in the
219 system, and we are hoping to write-optimize at the same time because those
220 nodes will be written together in batch. What happens, however, if we assign
221 a block number to a node in its read-optimized order but then avoid writing
222 it because it has unallocated children? In that situation, we lose out on the
223 write-optimization aspect because a node will have to be written again to
224 its location on the device, later, which likely means seeking back to that
227 So there are tradeoffs. We can choose either:
229 A. Allocate all unallocated children to preserve both write-optimization and
230 read-optimization, but this is not always desirable because it may mean
231 having to allocate and flush very many nodes at once.
233 B. Defer writing nodes with unallocated children, keep their read-optimized
234 locations, but sacrifice write-optimization because those nodes will be
237 C. Defer writing nodes with unallocated children, but do not keep their
238 read-optimized locations. Instead, choose to write-optimize them later, when
239 they are written. To facilitate this, we "undo" the read-optimized allocation
240 that was given to the node so that later it can be write-optimized, thus
241 "unpreparing" the flush decision. This is a case where we disturb the
242 FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a call to
243 flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
244 if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate
245 its block location, and set the JNODE_CREATED bit, effectively setting the
246 node back to an unallocated state.
248 We will take the following approach in v4.0: for twig nodes we will always
249 finish allocating unallocated children (A). For nodes with (level > TWIG)
250 we will defer writing and choose write-optimization (C).
252 To summarize, there are several parts to a solution that avoids the problem
253 with unallocated children:
255 FIXME-ZAM: Still, no approach has been implemented to eliminate the
256 "UNALLOCATED CHILDREN" problem, because an experiment
257 showed that we have only 1-2 nodes with unallocated children for thousands of
258 written nodes. The experiment was simple, like copying/deleting the Linux kernel
259 sources. However, the problem can arise in more complex tests. I think we have
260 jnode_io_hook to insert a check for unallocated children and see what kind of
263 1. When flush reaches a stopping point (e.g. a clean node) it should continue
264 calling squeeze-and-allocate on any remaining unallocated children.
265 FIXME: Difficulty to implement: should be simple -- amounts to adding a while
266 loop to jnode_flush, see comments in that function.
268 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes
269 may still have unallocated children. If the twig level has unallocated
270 children it is an assertion failure. If a higher-level node has unallocated
271 children, then it should be explicitly de-allocated by a call to
273 FIXME: Difficulty to implement: should be simple.
275 3. (CPU-Optimization) Checking whether a node has unallocated children may
276 consume more CPU cycles than we would like, and it is possible (but medium
277 complexity) to optimize this somewhat in the case where large sub-trees are
278 flushed. The following observation helps: if both the left- and
279 right-neighbor of a node are processed by the flush algorithm then the node
280 itself is guaranteed to have all of its children allocated. However, the cost
281 of this check may not be so expensive after all: it is not needed for leaves
282 and flush can guarantee this property for twigs. That leaves only (level >
283 TWIG) nodes that have to be checked, so this optimization only helps if at
284 least three (level > TWIG) nodes are flushed in one pass, and the savings
285 will be very small unless there are many more (level > TWIG) nodes. But if
286 there are many (level > TWIG) nodes then the number of blocks being written
287 will be very large, so the savings may be insignificant. That said, the idea
288 is to maintain both the left and right edges of nodes that are processed in
289 flush. When flush_empty_queue() is called, a relatively simple test will
290 tell whether the (level > TWIG) node is on the edge. If it is on the edge,
291 the slow check is necessary, but if it is in the interior then it can be
292 assumed to have all of its children allocated. FIXME: medium complexity to
293 implement, but simple to verify given that we must have a slow check anyway.
295 4. (Optional) This part is optional, not for v4.0--flush should work
296 independently of whether this option is used or not. Called RAPID_SCAN, the
297 idea is to amend the left-scan operation to take unallocated children into
298 account. Normally, the left-scan operation goes left as long as adjacent
299 nodes are dirty up until some large maximum value (FLUSH_SCAN_MAXNODES) at
300 which point it stops and begins flushing. But scan-left may stop at a
301 position where there are unallocated children to the left with the same
302 parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops
303 after FLUSH_RELOCATE_THRESHOLD, which is much smaller than
304 FLUSH_SCAN_MAXNODES, then proceeds with a rapid scan. The rapid scan skips
305 all the interior children of a node--if the leftmost child of a twig is
306 dirty, check its left neighbor (the rightmost child of the twig to the left).
307 If the left neighbor of the leftmost child is also dirty, then continue the
308 scan at the left twig and repeat. This option will cause flush to allocate
309 more twigs in a single pass, but it also has the potential to write many more
310 nodes than would otherwise be written without the RAPID_SCAN option.
311 RAPID_SCAN was partially implemented, code removed August 12, 2002 by JMACD.
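/* A hedged sketch of the "unprep" step from option (C) above, assuming the
   JF_SET()/JF_CLR() jnode flag helpers from jnode.h. The real flush_unprep()
   path (including the non-deferred block deallocation) is not shown; this
   only illustrates the state-bit changes described earlier: */
static inline void unprep_sketch(jnode * node)
{
	if (JF_ISSET(node, JNODE_OVRWR)) {
		/* wandered node: unset the overwrite bit */
		JF_CLR(node, JNODE_OVRWR);
	} else if (JF_ISSET(node, JNODE_RELOC)) {
		/* relocated node: unset the relocate bit; its block is
		   deallocated (non-deferred) elsewhere, and marking it
		   created returns it to an effectively unallocated state */
		JF_CLR(node, JNODE_RELOC);
		JF_SET(node, JNODE_CREATED);
	}
}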
314 /* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that
315 the starting point for flush is a leaf node, but actually the flush code
316 cares very little about whether or not this is true. It is possible that all
317 the leaf nodes are flushed and dirty parent nodes still remain, in which case
318 jnode_flush() is called on a non-leaf argument. Flush doesn't care--it treats
319 the argument node as if it were a leaf, even when it is not. This is a simple
320 approach, and there may be a more optimal policy but until a problem with
321 this approach is discovered, simplest is probably best.
323 NOTE: In this case, the ordering produced by flush is parent-first only if
324 you ignore the leaves. This is done as a matter of simplicity and there is
325 only one (shaky) justification. When an atom commits, it flushes all leaf
326 level nodes first, followed by twigs, and so on. With flushing done in this
327 order, if flush is eventually called on a non-leaf node it means that
328 (somehow) we reached a point where all leaves are clean and only internal
329 nodes need to be flushed. If that is the case, then it means there were no
330 leaves that were the parent-first preceder/follower of the parent. This is
331 expected to be a rare case, which is why we do nothing special about it.
332 However, memory pressure may pass an internal node to flush when there are
333 still dirty leaf nodes that need to be flushed, which could prove our
334 original assumptions "inoperative". If this needs to be fixed, then
335 scan_left/right should have special checks for the non-leaf levels. For
336 example, instead of passing from a node to the left neighbor, it should pass
337 from the node to the left neighbor's rightmost descendant (if dirty).
341 /* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB
342 chunks, dirtying everything and putting it into a transaction. We tell the
343 allocator to allocate the blocks as far as possible towards one end of the
344 logical device--the left (starting) end of the device if we are walking from
345 left to right, the right end of the device if we are walking from right to
346 left. We then make passes in alternating directions, and as we do this the
347 device becomes sorted such that tree order and block number order fully
350 Resizing is done by shifting everything either all the way to the left or all
351 the way to the right, and then reporting the last block.
354 /* RELOCATE DECISIONS: The code makes a decision to relocate in several places.
355 This describes the policy from the highest level:
357 The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive
358 nodes on the leaf level during flush-scan (right, left), then we
359 unconditionally decide to relocate leaf nodes.
361 Otherwise, there are two contexts in which we make a decision to relocate:
363 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
364 During the initial stages of flush, after scan-right completes, we want to
365 ask the question: should we relocate this leaf node and thus dirty the parent
366 node? Then, if the node is a leftmost child, its parent is its own parent-first
367 preceder, so we repeat the question at the next level up, and so on. In
368 these cases we are moving in the reverse parent-first direction.
370 There is another case which is considered the reverse direction, which comes
371 at the end of a twig in reverse_relocate_end_of_twig(). As we finish
372 processing a twig we may reach a point where there is a clean twig to the
373 right with a dirty leftmost child. In this case, we may wish to relocate the
374 child by testing if it should be relocated relative to its parent.
376 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done
377 in allocate_znode. What distinguishes the forward parent-first case from the
378 reverse parent-first case is that the preceder has already been allocated in
379 the forward case, whereas in the reverse case we don't know what the preceder
380 is until we finish "going in reverse". That simplifies the forward case
381 considerably, and there we actually use the block allocator to determine
382 whether, e.g., a block closer to the preceder is available.
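/* A minimal sketch, assuming only the policy described above, of the reverse
   parent-first "close enough" distance test; the real check (with its
   assertions) is reverse_relocate_if_close_enough() further below: */
static inline int relocate_distance_sketch(reiser4_block_nr pblk,
					   reiser4_block_nr nblk,
					   reiser4_block_nr max_dist)
{
	reiser4_block_nr dist = (pblk > nblk) ? (pblk - nblk) : (nblk - pblk);

	/* within max_dist of the preceder: keep the old location (overwrite
	   set) rather than relocating */
	return dist <= max_dist;
}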
385 /* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is,
386 once we finish scan-left and find a starting point, if the parent's left
387 neighbor is dirty then squeeze the parent's left neighbor and the parent.
388 This may change the flush-starting-node's parent. Repeat until the child's
389 parent is stable. If the child is a leftmost child, repeat this left-edge
390 squeezing operation at the next level up. Note that we cannot allocate
391 extents during this or they will be out of parent-first order. There are also
392 some difficult coordinate maintenance issues. We can't do a tree search to
393 find coordinates again (because we hold locks), we have to determine them
394 from the two nodes being squeezed. Looks difficult, but has potential to
395 increase space utilization. */
/* Flush-scan helper functions. */
static void scan_init(flush_scan * scan);
static void scan_done(flush_scan * scan);

/* Flush-scan algorithm. */
static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
		     unsigned limit);
static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
static int scan_common(flush_scan * scan, flush_scan * other);
static int scan_formatted(flush_scan * scan);
static int scan_unformatted(flush_scan * scan, flush_scan * other);
static int scan_by_coord(flush_scan * scan);

/* Initial flush-point ancestor allocation. */
static int alloc_pos_and_ancestors(flush_pos_t *pos);
static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos);
static int set_preceder(const coord_t *coord_in, flush_pos_t *pos);

/* Main flush algorithm.
   Note on abbreviation: "squeeze and allocate" == "squalloc". */
static int squalloc(flush_pos_t *pos);

/* Flush squeeze implementation. */
static int squeeze_right_non_twig(znode * left, znode * right);
static int shift_one_internal_unit(znode * left, znode * right);

/* Flush reverse parent-first relocation routines. */
static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
					    const reiser4_block_nr * nblk);
static int reverse_relocate_test(jnode * node, const coord_t *parent_coord,
				 flush_pos_t *pos);
static int reverse_relocate_check_dirty_parent(jnode * node,
					       const coord_t *parent_coord,
					       flush_pos_t *pos);

/* Flush allocate write-queueing functions: */
static int allocate_znode(znode * node, const coord_t *parent_coord,
			  flush_pos_t *pos);
static int allocate_znode_update(znode * node, const coord_t *parent_coord,
				 flush_pos_t *pos);
static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);

/* Flush helper functions: */
static int jnode_lock_parent_coord(jnode * node,
				   coord_t *coord,
				   lock_handle * parent_lh,
				   load_count * parent_zh,
				   znode_lock_mode mode, int try);
static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
			    znode_lock_mode mode, int check_dirty, int expected);
static int znode_same_parents(znode * a, znode * b);

static int znode_check_flushprepped(znode * node)
{
	return jnode_check_flushprepped(ZJNODE(node));
}

/* Flush position functions */
static void pos_init(flush_pos_t *pos);
static int pos_valid(flush_pos_t *pos);
static void pos_done(flush_pos_t *pos);
static int pos_stop(flush_pos_t *pos);
460 /* check that @org is first jnode extent unit, if extent is unallocated,
461 * because all jnodes of unallocated extent are dirty and of the same atom. */
462 #define checkchild(scan) \
463 assert("nikita-3435", \
464 ergo(scan->direction == LEFT_SIDE && \
465 (scan->parent_coord.node->level == TWIG_LEVEL) && \
466 jnode_is_unformatted(scan->node) && \
467 extent_is_unallocated(&scan->parent_coord), \
468 extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
/* This flush_cnt variable is used to track the number of concurrent flush
   operations, useful for debugging. It is initialized in txnmgr.c out of
   laziness (because flush has no static initializer function...) */
ON_DEBUG(atomic_t flush_cnt;)

/* check fs backing device for write congestion */
static int check_write_congestion(void)
{
	struct super_block *sb;
	struct backing_dev_info *bdi;

	sb = reiser4_get_current_sb();
	bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
	return bdi_write_congested(bdi);
}
/* conditionally write flush queue */
static int write_prepped_nodes(flush_pos_t *pos)
{
	int ret;

	assert("zam-831", pos);
	assert("zam-832", pos->fq);

	if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
		return 0;

	if (check_write_congestion())
		return 0;

	ret = reiser4_write_fq(pos->fq, pos->nr_written,
			       WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
	return ret;
}
506 /* Proper release all flush pos. resources then move flush position to new
508 static void move_flush_pos(flush_pos_t
*pos
, lock_handle
* new_lock
,
509 load_count
* new_load
, const coord_t
*new_coord
)
511 assert("zam-857", new_lock
->node
== new_load
->node
);
514 assert("zam-858", new_coord
->node
== new_lock
->node
);
515 coord_dup(&pos
->coord
, new_coord
);
517 coord_init_first_unit(&pos
->coord
, new_lock
->node
);
525 move_load_count(&pos
->load
, new_load
);
527 move_lh(&pos
->lock
, new_lock
);
/* delete an empty node whose link from the parent still exists. */
static int delete_empty_node(znode * node)
{
	reiser4_key smallest_removed;

	assert("zam-1019", node != NULL);
	assert("zam-1020", node_is_empty(node));
	assert("zam-1023", znode_is_wlocked(node));

	return reiser4_delete_node(node, &smallest_removed, NULL, 1);
}
542 /* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
543 static int prepare_flush_pos(flush_pos_t
*pos
, jnode
* org
)
550 init_load_count(&load
);
552 if (jnode_is_znode(org
)) {
553 ret
= longterm_lock_znode(&lock
, JZNODE(org
),
554 ZNODE_WRITE_LOCK
, ZNODE_LOCK_HIPRI
);
558 ret
= incr_load_count_znode(&load
, JZNODE(org
));
563 (jnode_get_level(org
) ==
564 LEAF_LEVEL
) ? POS_ON_LEAF
: POS_ON_INTERNAL
;
565 move_flush_pos(pos
, &lock
, &load
, NULL
);
567 coord_t parent_coord
;
568 ret
= jnode_lock_parent_coord(org
, &parent_coord
, &lock
,
569 &load
, ZNODE_WRITE_LOCK
, 0);
572 if (!item_is_extent(&parent_coord
)) {
573 /* file was converted to tail, org became HB, we found
579 pos
->state
= POS_ON_EPOINT
;
580 move_flush_pos(pos
, &lock
, &load
, &parent_coord
);
581 pos
->child
= jref(org
);
582 if (extent_is_unallocated(&parent_coord
)
583 && extent_unit_index(&parent_coord
) != index_jnode(org
)) {
584 /* @org is not first child of its parent unit. This may
585 happen because the longterm lock of its parent node was
586 released between scan_left and scan_right. For now,
587 work around this by having flush repeat. */
593 done_load_count(&load
);
598 /* TODO LIST (no particular order): */
599 /* I have labelled most of the legitimate FIXME comments in this file with
600 letters to indicate which issue they relate to. There are a few miscellaneous
601 FIXMEs with specific names mentioned instead that need to be
602 inspected/resolved. */
603 /* B. There is an issue described in reverse_relocate_test having to do with an
604 imprecise is_preceder? check for partially-dirty extents. The
605 code that sets preceder hints and computes the preceder is basically
606 untested. Careful testing needs to be done that preceder calculations are
607 done correctly, since if it doesn't affect correctness we will not catch this
608 stuff during regular testing. */
609 /* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of
610 these are considered expected but unlikely conditions. Flush currently
611 returns 0 (i.e., success but no progress, i.e., restart) whenever it receives
612 any of these in jnode_flush(). Many of the calls that may produce one of
613 these return values (i.e., longterm_lock_znode, reiser4_get_parent,
614 reiser4_get_neighbor, ...) check some of these values themselves and, for
615 instance, stop flushing instead of resulting in a restart. If any of these
616 results are true error conditions then flush will go into a busy-loop, as we
617 noticed during testing when a corrupt tree caused find_child_ptr to return
618 ENOENT. It needs careful thought and testing of corner conditions.
620 /* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a
621 created block is assigned a block number then early-flushed to disk. It is
622 dirtied again and flush is called again. Concurrently, that block is deleted,
623 and the de-allocation of its block number does not need to be deferred, since
624 it is not part of the preserve set (i.e., it didn't exist before the
625 transaction). I think there may be a race condition where flush writes the
626 dirty, created block after the non-deferred deallocated block number is
627 re-allocated, making it possible to write deleted data on top of non-deleted
628 data. It's just a theory, but it needs to be thought out. */
629 /* F. bio_alloc() failure is not handled gracefully. */
630 /* G. Unallocated children. */
631 /* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
633 /* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
635 /* JNODE_FLUSH: MAIN ENTRY POINT */
636 /* This is the main entry point for flushing a jnode and its dirty neighborhood
637 (dirty neighborhood is named "slum"). Jnode_flush() is called if reiser4 has
638 to write dirty blocks to disk, it happens when Linux VM decides to reduce
639 number of dirty pages or as a part of transaction commit.
641 Our objective here is to prep and flush the slum the jnode belongs to. We
642 want to squish the slum together, and allocate the nodes in it as we squish
643 because allocation of children affects squishing of parents.
645 The "argument" @node tells flush where to start. From there, flush finds the
646 left edge of the slum, and calls squalloc (in which nodes are squeezed and
647 allocated). To find a "better place" to start squalloc first we perform a
650 Flush-scanning may be performed in both left and right directions, but for
651 different purposes. When scanning to the left, we are searching for a node
652 that precedes a sequence of parent-first-ordered nodes which we will then
653 flush in parent-first order. During flush-scanning, we also take the
654 opportunity to count the number of consecutive leaf nodes. If this number is
655 past some threshold (FLUSH_RELOCATE_THRESHOLD), then we make a decision to
656 reallocate leaf nodes (thus favoring write-optimization).
658 Since the flush argument node can be anywhere in a sequence of dirty leaves,
659 there may also be dirty nodes to the right of the argument. If the scan-left
660 operation does not count at least FLUSH_RELOCATE_THRESHOLD nodes then we
661 follow it with a right-scan operation to see whether there are, in fact,
662 enough nodes to meet the relocate threshold. Each right- and left-scan
663 operation uses a single flush_scan object.
665 After left-scan and possibly right-scan, we prepare a flush_position object
666 with the starting flush point or parent coordinate, which was determined
669 Next we call the main flush routine, squalloc, which iterates along the leaf
670 level, squeezing and allocating nodes (and placing them into the flush
673 After squalloc returns we take extra steps to ensure that all the children
674 of the final twig node are allocated--this involves repeating squalloc
675 until we finish at a twig with no unallocated children.
677 Finally, we call flush_empty_queue to submit write-requests to disk. If we
678 encounter any above-twig nodes during flush_empty_queue that still have
679 unallocated children, we flush_unprep them.
681 Flush treats several "failure" cases as non-failures, essentially causing
682 them to start over. E_DEADLOCK is one example.
683 FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should probably be handled
684 properly rather than restarting, but there are a bunch of cases to audit.
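/* Illustrative arithmetic for the relocate decision described above: if the
   relocate threshold were 64 and scan-left counted 40 dirty leaves, scan-right
   would only need to find 64 - 40 = 24 more nodes before the leaf_relocate
   policy is chosen (see the todo = relocate_threshold - left_scan->count
   computation in jnode_flush() below). The value 64 is only an example; the
   threshold is a per-superblock flush parameter. */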
688 jnode_flush(jnode * node, long nr_to_write, long *nr_written,
689	    flush_queue_t *fq, int flags)
692 flush_scan
*right_scan
;
693 flush_scan
*left_scan
;
694 flush_pos_t
*flush_pos
;
696 struct super_block
*sb
;
697 reiser4_super_info_data
*sbinfo
;
698 jnode
*leftmost_in_slum
= NULL
;
700 assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
701 assert("nikita-3022", reiser4_schedulable());
703 assert("nikita-3185",
704 get_current_super_private()->delete_mutex_owner
!= current
);
706 /* allocate right_scan, left_scan and flush_pos */
708 kmalloc(2 * sizeof(*right_scan
) + sizeof(*flush_pos
),
709 reiser4_ctx_gfp_mask_get());
710 if (right_scan
== NULL
)
711 return RETERR(-ENOMEM
);
712 left_scan
= right_scan
+ 1;
713 flush_pos
= (flush_pos_t
*) (left_scan
+ 1);
715 sb
= reiser4_get_current_sb();
716 sbinfo
= get_super_private(sb
);
718 /* Flush-concurrency debug code */
720 atomic_inc(&flush_cnt
);
723 reiser4_enter_flush(sb
);
725 /* Initialize a flush position. */
728 flush_pos
->nr_written
= nr_written
;
730 flush_pos
->flags
= flags
;
731 flush_pos
->nr_to_write
= nr_to_write
;
733 scan_init(right_scan
);
734 scan_init(left_scan
);
736 /* First scan left and remember the leftmost scan position. If the
737 leftmost position is unformatted we remember its parent_coord. We
738 scan until counting FLUSH_SCAN_MAXNODES.
740 If starting @node is unformatted, at the beginning of left scan its
741 parent (twig level node, containing extent item) will be long term
742 locked and lock handle will be stored in the
743 @right_scan->parent_lock. This lock is used to start the rightward
744 scan without redoing the tree traversal (necessary to find parent)
745 and, hence, is kept during leftward scan. As a result, we have to
746 use try-lock when taking long term locks during the leftward scan.
748 ret
= scan_left(left_scan
, right_scan
,
749 node
, sbinfo
->flush
.scan_maxnodes
);
753 leftmost_in_slum
= jref(left_scan
->node
);
754 scan_done(left_scan
);
756 /* Then possibly go right to decide if we will use a policy of
757 relocating leaves. This is only done if we did not scan past (and
758 count) enough nodes during the leftward scan. If we do scan right,
759 we only care to go far enough to establish that at least
760 FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The scan
761 limit is the difference between left_scan.count and the threshold. */
763 todo
= sbinfo
->flush
.relocate_threshold
- left_scan
->count
;
764 /* scan right is inherently deadlock prone, because we are
765 * (potentially) holding a lock on the twig node at this moment.
766 * FIXME: this is incorrect comment: lock is not held */
768 ret
= scan_right(right_scan
, node
, (unsigned)todo
);
773 /* Only the right-scan count is needed, release any rightward locks
775 scan_done(right_scan
);
777 /* ... and the answer is: we should relocate leaf nodes if at least
778 FLUSH_RELOCATE_THRESHOLD nodes were found. */
779 flush_pos
->leaf_relocate
= JF_ISSET(node
, JNODE_REPACK
) ||
780 (left_scan
->count
+ right_scan
->count
>=
781 sbinfo
->flush
.relocate_threshold
);
783 /* Funny business here. We set the 'point' in the flush_position
784 prior to starting squalloc regardless of whether the first point is
785 formatted or unformatted. Without this there would be an invariant,
786 in the rest of the code, that if the flush_position is unformatted
787 then flush_position->point is NULL and
788 flush_position->parent_{lock,coord} is set, and if the flush_position
789 is formatted then flush_position->point is non-NULL and no parent
792 This seems lazy, but it makes the initial calls to
793 reverse_relocate_test (which asks "is the pos->point the leftmost
794 child of its parent") much easier because we know the first child
795 already. Nothing is broken by this, but the reasoning is subtle.
796 Holding an extra reference on a jnode during flush can cause us to
797 see nodes with HEARD_BANSHEE during squalloc, because nodes are not
798 removed from sibling lists until they have zero reference count.
799 Flush would never observe a HEARD_BANSHEE node on the left-edge of
800 flush, nodes are only deleted to the right. So if nothing is broken,
803 NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
804 point and in any moment, because of the concurrent file system
805 activity (for example, truncate). */
807 /* Check jnode state after flush_scan completed. Having a lock on this
808 node or its parent (in case of unformatted) helps us in case of
809 concurrent flushing. */
810 if (jnode_check_flushprepped(leftmost_in_slum
)
811 && !jnode_convertible(leftmost_in_slum
)) {
816 /* Now setup flush_pos using scan_left's endpoint. */
817 ret
= prepare_flush_pos(flush_pos
, leftmost_in_slum
);
821 if (znode_get_level(flush_pos
->coord
.node
) == LEAF_LEVEL
822 && node_is_empty(flush_pos
->coord
.node
)) {
823 znode
*empty
= flush_pos
->coord
.node
;
825 assert("zam-1022", !ZF_ISSET(empty
, JNODE_HEARD_BANSHEE
));
826 ret
= delete_empty_node(empty
);
830 if (jnode_check_flushprepped(leftmost_in_slum
)
831 && !jnode_convertible(leftmost_in_slum
)) {
836 /* Set pos->preceder and (re)allocate pos and its ancestors if it is
838 ret
= alloc_pos_and_ancestors(flush_pos
);
842 /* Do the main rightward-bottom-up squeeze and allocate loop. */
843 ret
= squalloc(flush_pos
);
848 /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated
849 children. First, the pos_stop() and pos_valid() routines should be
850 modified so that pos_stop() sets a flush_position->stop flag to 1
851 without releasing the current position immediately--instead release
852 it in pos_done(). This is a better implementation than the current
855 It is not clear that all fields of the flush_position should not be
856 released, but at the very least the parent_lock, parent_coord, and
857 parent_load should remain held because they hold the last twig
858 when pos_stop() is called.
860 When we reach this point in the code, if the parent_coord is set to
861 after the last item then we know that flush reached the end of a twig
862 (and according to the new flush queueing design, we will return now).
863 If parent_coord is not past the last item, we should check if the
864 current twig has any unallocated children to the right (we are not
865 concerned with unallocated children to the left--in that case the
866 twig itself should not have been allocated). If the twig has
867 unallocated children to the right, set the parent_coord to that
868 position and then repeat the call to squalloc.
870 Testing for unallocated children may be defined in two ways: if any
871 internal item has a fake block number, it is unallocated; if any
872 extent item is unallocated then all of its children are unallocated.
873 But there is a more aggressive approach: if there are any dirty
874 children of the twig to the right of the current position, we may
875 wish to relocate those nodes now. Checking for potential relocation
876 is more expensive as it requires knowing whether there are any dirty
877 children that are not unallocated. The extent_needs_allocation should
878 be used after setting the correct preceder.
880 When we reach the end of a twig at this point in the code, if the
881 flush can continue (when the queue is ready) it will need some
882 information on the future starting point. That should be stored away
883 in the flush_handle using a seal, I believe. Holding a jref() on the
884 future starting point may break other code that deletes that node.
887 /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is
888 called above the twig level. If the VM calls flush above the twig
889 level, do nothing and return (but figure out why this happens). The
890 txnmgr should be modified to only flush its leaf-level dirty list.
891 This will do all the necessary squeeze and allocate steps but leave
892 unallocated branches and possibly unallocated twigs (when the twig's
893 leftmost child is not dirty). After flushing the leaf level, the
894 remaining unallocated nodes should be given write-optimized
895 locations. (Possibly, the remaining unallocated twigs should be
896 allocated just before their leftmost child.)
899 /* Any failure reaches this point. */
908 /* FIXME(C): Except for E_DEADLOCK, these should probably be
909 handled properly in each case. They already are handled in
911 /* Something bad happened, but difficult to avoid... Try again!
916 if (leftmost_in_slum
)
917 jput(leftmost_in_slum
);
920 scan_done(left_scan
);
921 scan_done(right_scan
);
924 ON_DEBUG(atomic_dec(&flush_cnt
));
926 reiser4_leave_flush(sb
);
931 /* The reiser4 flush subsystem can be put into "rapid flush mode", which means
932  * that the flusher should submit all prepped nodes immediately, without keeping
933  * them in flush queues for a long time. The reason for rapid flush mode is to free
934 * memory as fast as possible. */
936 #if REISER4_USE_RAPID_FLUSH
939 * submit all prepped nodes if rapid flush mode is set,
940 * turn rapid flush mode off.
943 static int rapid_flush(flush_pos_t
*pos
)
945 if (!wbq_available())
948 return write_prepped_nodes(pos
);
953 #define rapid_flush(pos) (0)
955 #endif /* REISER4_USE_RAPID_FLUSH */
957 static jnode
*find_flush_start_jnode(jnode
*start
, txn_atom
* atom
,
958 flush_queue_t
*fq
, int *nr_queued
,
964 spin_lock_jnode(start
);
965 if (!jnode_is_flushprepped(start
)) {
966 assert("zam-1056", start
->atom
== atom
);
970 spin_unlock_jnode(start
);
973 * In this loop we process all already prepped (RELOC or OVRWR) and
974 * dirtied again nodes. The atom spin lock is not released until all
975 dirty nodes are processed or a not-prepped node is found in the atom's dirty
978 while ((node
= find_first_dirty_jnode(atom
, flags
))) {
979 spin_lock_jnode(node
);
981 assert("zam-881", JF_ISSET(node
, JNODE_DIRTY
));
982 assert("zam-898", !JF_ISSET(node
, JNODE_OVRWR
));
984 if (JF_ISSET(node
, JNODE_WRITEBACK
)) {
985 /* move node to the end of atom's writeback list */
986 list_move_tail(&node
->capture_link
, ATOM_WB_LIST(atom
));
989 * jnode is not necessarily on dirty list: if it was
990 * dirtied when it was on flush queue - it does not get
991 * moved to dirty list
993 ON_DEBUG(count_jnode(atom
, node
, NODE_LIST(node
),
996 } else if (jnode_is_znode(node
)
997 && znode_above_root(JZNODE(node
))) {
999 * A special case for znode-above-root. The above-root
1000 * (fake) znode is captured and dirtied when the tree
1001 * height changes or when the root node is relocated.
1002 * This causes atoms to fuse so that changes at the root
1003 * are serialized. However, this node is never flushed.
1004 * This special case used to be in lock.c to prevent the
1005 * above-root node from ever being captured, but now
1006 * that it is captured we simply prevent it from
1007 * flushing. The log-writer code relies on this to
1008 * properly log superblock modifications of the tree
1011 jnode_make_wander_nolock(node
);
1012 } else if (JF_ISSET(node
, JNODE_RELOC
)) {
1013 queue_jnode(fq
, node
);
1018 spin_unlock_jnode(node
);
1023 /* Flush some nodes of current atom, usually slum, return -E_REPEAT if there are
1024 * more nodes to flush, return 0 if the atom's dirty lists are empty and keep the current
1025 * atom locked, return other errors as they are. */
1027 flush_current_atom(int flags
, long nr_to_write
, long *nr_submitted
,
1028 txn_atom
** atom
, jnode
*start
)
1030 reiser4_super_info_data
*sinfo
= get_current_super_private();
1031 flush_queue_t
*fq
= NULL
;
1036 assert("zam-889", atom
!= NULL
&& *atom
!= NULL
);
1037 assert_spin_locked(&((*atom
)->alock
));
1038 assert("zam-892", get_current_context()->trans
->atom
== *atom
);
1040 nr_to_write
= LONG_MAX
;
1042 ret
= reiser4_fq_by_atom(*atom
, &fq
);
1043 if (ret
!= -E_REPEAT
)
1045 *atom
= get_current_atom_locked();
1050 assert_spin_locked(&((*atom
)->alock
));
1052 /* parallel flushers limit */
1053 if (sinfo
->tmgr
.atom_max_flushers
!= 0) {
1054 while ((*atom
)->nr_flushers
>= sinfo
->tmgr
.atom_max_flushers
) {
1055 /* A reiser4_atom_send_event() call is inside
1056 reiser4_fq_put_nolock() which is called when flush is
1057 finished and nr_flushers is decremented. */
1058 reiser4_atom_wait_event(*atom
);
1059 *atom
= get_current_atom_locked();
1063 /* count ourself as a flusher */
1064 (*atom
)->nr_flushers
++;
1066 writeout_mode_enable();
1069 node
= find_flush_start_jnode(start
, *atom
, fq
, &nr_queued
, flags
);
1072 if (nr_queued
== 0) {
1073 (*atom
)->nr_flushers
--;
1074 reiser4_fq_put_nolock(fq
);
1075 reiser4_atom_send_event(*atom
);
1076 /* current atom remains locked */
1077 writeout_mode_disable();
1080 spin_unlock_atom(*atom
);
1083 BUG_ON((*atom
)->super
!= node
->tree
->super
);
1084 spin_unlock_atom(*atom
);
1085 spin_unlock_jnode(node
);
1086 BUG_ON(nr_to_write
== 0);
1087 ret
= jnode_flush(node
, nr_to_write
, nr_submitted
, fq
, flags
);
1092 reiser4_write_fq(fq
, nr_submitted
,
1093 WRITEOUT_SINGLE_STREAM
| WRITEOUT_FOR_PAGE_RECLAIM
);
1095 *atom
= get_current_atom_locked();
1096 (*atom
)->nr_flushers
--;
1097 reiser4_fq_put_nolock(fq
);
1098 reiser4_atom_send_event(*atom
);
1099 spin_unlock_atom(*atom
);
1101 writeout_mode_disable();
1109 /* REVERSE PARENT-FIRST RELOCATION POLICIES */
1111 /* This implements the is-it-close-enough-to-its-preceder? test for relocation
1112 in the reverse parent-first relocate context. Here all we know is the
1113 preceder and the block number. Since we are going in reverse, the preceder
1114 may still be relocated as well, so we can't ask the block allocator "is there
1115 a closer block available to relocate?" here. In the _forward_ parent-first
1116 relocate context (not here) we actually call the block allocator to try and
1117 find a closer location. */
1119 reverse_relocate_if_close_enough(const reiser4_block_nr
* pblk
,
1120 const reiser4_block_nr
* nblk
)
1122 reiser4_block_nr dist
;
1124 assert("jmacd-7710", *pblk
!= 0 && *nblk
!= 0);
1125 assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk
));
1126 assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk
));
1128 /* Distance is the absolute value. */
1129 dist
= (*pblk
> *nblk
) ? (*pblk
- *nblk
) : (*nblk
- *pblk
);
1131 /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from
1132 its preceder block, do not relocate. */
1133 if (dist
<= get_current_super_private()->flush
.relocate_distance
)
1139 /* This function is a predicate that tests for relocation. Always called in the
1140 reverse-parent-first context, when we are asking whether the current node
1141 should be relocated in order to expand the flush by dirtying the parent level
1142 (and thus proceeding to flush that level). When traversing in the forward
1143 parent-first direction (not here), relocation decisions are handled in two
1144 places: allocate_znode() and extent_needs_allocation(). */
1146 reverse_relocate_test(jnode
* node
, const coord_t
*parent_coord
,
1149 reiser4_block_nr pblk
= 0;
1150 reiser4_block_nr nblk
= 0;
1152 assert("jmacd-8989", !jnode_is_root(node
));
1155 * This function is called only from the
1156 * reverse_relocate_check_dirty_parent() and only if the parent
1157 * node is clean. This implies that the parent has the real (i.e., not
1158 * fake) block number, and, so does the child, because otherwise the
1159 * parent would be dirty.
1162 /* New nodes are treated as if they are being relocated. */
1163 if (JF_ISSET(node
, JNODE_CREATED
) ||
1164 (pos
->leaf_relocate
&& jnode_get_level(node
) == LEAF_LEVEL
))
1167 /* Find the preceder. FIXME(B): When the child is an unformatted,
1168 previously existing node, the coord may be leftmost even though the
1169 child is not the parent-first preceder of the parent. If the first
1170 dirty node appears somewhere in the middle of the first extent unit,
1171 this preceder calculation is wrong.
1172 Needs more logic in here. */
1173 if (coord_is_leftmost_unit(parent_coord
)) {
1174 pblk
= *znode_get_block(parent_coord
->node
);
1176 pblk
= pos
->preceder
.blk
;
1178 check_preceder(pblk
);
1180 /* If (pblk == 0) then the preceder isn't allocated or isn't known:
1185 nblk
= *jnode_get_block(node
);
1187 if (reiser4_blocknr_is_fake(&nblk
))
1188 /* child is unallocated, mark parent dirty */
1191 return reverse_relocate_if_close_enough(&pblk
, &nblk
);
1194 /* This function calls reverse_relocate_test to make a reverse-parent-first
1195 relocation decision and then, if yes, it marks the parent dirty. */
1197 reverse_relocate_check_dirty_parent(jnode
* node
, const coord_t
*parent_coord
,
1202 if (!JF_ISSET(ZJNODE(parent_coord
->node
), JNODE_DIRTY
)) {
1204 ret
= reverse_relocate_test(node
, parent_coord
, pos
);
1209 if parent is already relocated - we do not want to grab space,
1214 grabbed
= get_current_context()->grabbed_blocks
;
1215 if (reiser4_grab_space_force((__u64
) 1, BA_RESERVED
) !=
1217 reiser4_panic("umka-1250",
1218 "No space left during flush.");
1220 assert("jmacd-18923",
1221 znode_is_write_locked(parent_coord
->node
));
1222 znode_make_dirty(parent_coord
->node
);
1223 grabbed2free_mark(grabbed
);
1230 /* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE
1231 FORWARD PARENT-FIRST LOOP BEGINS) */
1233 /* Get the leftmost child for given coord. */
1234 static int get_leftmost_child_of_unit(const coord_t
*coord
, jnode
** child
)
1238 ret
= item_utmost_child(coord
, LEFT_SIDE
, child
);
1244 return PTR_ERR(*child
);
1249 /* This step occurs after the left- and right-scans are completed, before
1250 starting the forward parent-first traversal. Here we attempt to allocate
1251 ancestors of the starting flush point, which means continuing in the reverse
1252 parent-first direction to the parent, grandparent, and so on (as long as the
1253 child is a leftmost child). This routine calls a recursive process,
1254 alloc_one_ancestor, which does the real work, except there is special-case
1255 handling here for the first ancestor, which may be a twig. At each level
1256 (here and alloc_one_ancestor), we check for relocation and then, if the child
1257 is a leftmost child, repeat at the next level. On the way back down (the
1258 recursion), we allocate the ancestors in parent-first order. */
1259 static int alloc_pos_and_ancestors(flush_pos_t
*pos
)
1266 if (znode_check_flushprepped(pos
->lock
.node
))
1269 coord_init_invalid(&pcoord
, NULL
);
1271 init_load_count(&pload
);
1273 if (pos
->state
== POS_ON_EPOINT
) {
1274 /* a special case for pos on twig level, where we already have
1275 a lock on parent node. */
1276 /* The parent may not be dirty, in which case we should decide
1277 whether to relocate the child now. If decision is made to
1278 relocate the child, the parent is marked dirty. */
1280 reverse_relocate_check_dirty_parent(pos
->child
, &pos
->coord
,
1285 /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
1286 is leftmost) and the leaf/child, so recursion is not needed.
1287 Levels above the twig will be allocated for
1288 write-optimization before the transaction commits. */
1290 /* Do the recursive step, allocating zero or more of our
1292 ret
= alloc_one_ancestor(&pos
->coord
, pos
);
1295 if (!znode_is_root(pos
->lock
.node
)) {
1296 /* all formatted nodes except tree root */
1298 reiser4_get_parent(&plock
, pos
->lock
.node
,
1303 ret
= incr_load_count_znode(&pload
, plock
.node
);
1308 find_child_ptr(plock
.node
, pos
->lock
.node
, &pcoord
);
1313 reverse_relocate_check_dirty_parent(ZJNODE
1320 ret
= alloc_one_ancestor(&pcoord
, pos
);
1325 ret
= allocate_znode(pos
->lock
.node
, &pcoord
, pos
);
1328 done_load_count(&pload
);
1333 /* This is the recursive step described in alloc_pos_and_ancestors, above.
1334 Ignoring the call to set_preceder, which is the next function described, this
1335 checks if the child is a leftmost child and returns if it is not. If the
1336 child is a leftmost child it checks for relocation, possibly dirtying the
1337 parent. Then it performs the recursive step. */
1338 static int alloc_one_ancestor(const coord_t
*coord
, flush_pos_t
*pos
)
1345 /* As we ascend at the left-edge of the region to flush, take this
1346 opportunity at the twig level to find our parent-first preceder
1347 unless we have already set it. */
1348 if (pos
->preceder
.blk
== 0) {
1349 ret
= set_preceder(coord
, pos
);
1354 /* If the ancestor is clean or already allocated, or if the child is not
1355 a leftmost child, stop going up, even leaving coord->node not
1357 if (znode_check_flushprepped(coord
->node
)
1358 || !coord_is_leftmost_unit(coord
))
1362 init_load_count(&aload
);
1363 coord_init_invalid(&acoord
, NULL
);
1365 /* Only ascend to the next level if it is a leftmost child, but
1366 write-lock the parent in case we will relocate the child. */
1367 if (!znode_is_root(coord
->node
)) {
1370 jnode_lock_parent_coord(ZJNODE(coord
->node
), &acoord
,
1371 &alock
, &aload
, ZNODE_WRITE_LOCK
,
1374 /* FIXME(C): check EINVAL, E_DEADLOCK */
1379 reverse_relocate_check_dirty_parent(ZJNODE(coord
->node
),
1384 /* Recursive call. */
1385 if (!znode_check_flushprepped(acoord
.node
)) {
1386 ret
= alloc_one_ancestor(&acoord
, pos
);
1392 /* Note: we call allocate with the parent write-locked (except at the
1393 root) in case we relocate the child, in which case it will modify the
1394 parent during this call. */
1395 ret
= allocate_znode(coord
->node
, &acoord
, pos
);
1398 done_load_count(&aload
);
1403 /* During the reverse parent-first alloc_pos_and_ancestors process described
1404 above there is a call to this function at the twig level. During
1405 alloc_pos_and_ancestors we may ask: should this node be relocated (in reverse
1406 parent-first context)? We repeat this process as long as the child is the
1407 leftmost child, eventually reaching an ancestor of the flush point that is
1408 not a leftmost child. The preceder of that ancestor, which is not a leftmost
1409 child, is actually on the leaf level. The preceder of that block is the
1410 left-neighbor of the flush point. The preceder of that block is the rightmost
1411 child of the twig on the left. So, when alloc_pos_and_ancestors passes upward
1412 through the twig level, it stops momentarily to remember the block of the
1413 rightmost child of the twig on the left and sets it to the flush_position's
1416 There is one other place where we may set the flush_position's preceder hint,
1417 which is during scan-left.
1419 static int set_preceder(const coord_t
*coord_in
, flush_pos_t
*pos
)
1423 lock_handle left_lock
;
1424 load_count left_load
;
1426 coord_dup(&coord
, coord_in
);
1428 init_lh(&left_lock
);
1429 init_load_count(&left_load
);
1431 /* FIXME(B): Same FIXME as in "Find the preceder" in
1432 reverse_relocate_test. coord_is_leftmost_unit is not the right test
1433 if the unformatted child is in the middle of the first extent unit.*/
1434 if (!coord_is_leftmost_unit(&coord
)) {
1435 coord_prev_unit(&coord
);
1438 reiser4_get_left_neighbor(&left_lock
, coord
.node
,
1439 ZNODE_READ_LOCK
, GN_SAME_ATOM
);
1441 /* If we fail for any reason it doesn't matter because
1442 the preceder is only a hint. We are low-priority at
1443 this point, so this must be the case. */
1444 if (ret
== -E_REPEAT
|| ret
== -E_NO_NEIGHBOR
||
1445 ret
== -ENOENT
|| ret
== -EINVAL
1446 || ret
== -E_DEADLOCK
)
1451 ret
= incr_load_count_znode(&left_load
, left_lock
.node
);
1455 coord_init_last_unit(&coord
, left_lock
.node
);
1459 item_utmost_child_real_block(&coord
, RIGHT_SIDE
,
1460 &pos
->preceder
.blk
);
1462 check_preceder(pos
->preceder
.blk
);
1463 done_load_count(&left_load
);
1464 done_lh(&left_lock
);
1468 /* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
1470 /* This procedure implements the outer loop of the flush algorithm. To put this
1471 in context, here is the general list of steps taken by the flush routine as a whole:
1474 1. Scan-left
1475 2. Scan-right (maybe)
1476 3. Allocate initial flush position and its ancestors
1477 4. <handle extents>
1478 5. <squeeze and next position and its ancestors to-the-right,
1479 then update position to-the-right>
1480 6. <repeat from #4 until flush is stopped>
1482 This procedure implements the loop in steps 4 through 6 in the above listing.
1484 Step 4: if the current flush position is an extent item (position on the twig
1485 level), it allocates the extent (allocate_extent_item_in_place) then shifts
1486 to the next coordinate. If the next coordinate's leftmost child needs
1487 flushprep, we will continue. If the next coordinate is an internal item, we
1488 descend back to the leaf level, otherwise we repeat a step #4 (labeled
1489 ALLOC_EXTENTS below). If the "next coordinate" brings us past the end of the
1490 twig level, then we call reverse_relocate_end_of_twig to possibly dirty the
1491 next (right) twig, prior to step #5 which moves to the right.
1493 Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up
1494 the tree to allocate any ancestors of the next-right flush position that are
1495 not also ancestors of the current position. Those ancestors (in top-down
1496 order) are the next in parent-first order. We squeeze adjacent nodes on the
1497 way up until the right node and current node share the same parent, then
1498 allocate on the way back down. Finally, this step sets the flush position to
1499 the next-right node. Then repeat steps 4 and 5.
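/* A minimal pseudo-code sketch of steps 4 through 6, in the same spirit as
   the parent_first() example near the top of this file. The control flow here
   is a simplified placeholder, not the real API; the real dispatch is
   table-driven (see flush_pos_handlers[] further below) and includes error
   handling:

	while (pos_valid(pos)) {
		if (pos is an extent item on the twig level)
			allocate extents and advance pos;              step 4
		else
			squeeze/allocate toward the right neighbor,
			processing its changed ancestors first;        step 5
		move pos to the right and repeat;                      step 6
	}
*/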
/* squalloc_right_twig helper function: cut a range of extent items from node
   to->node, from the beginning of the node up to coord @to. */
static int squalloc_right_twig_cut(coord_t *to, reiser4_key * to_key,
				   znode * left)
{
	coord_t from;
	reiser4_key from_key;

	coord_init_first_unit(&from, to->node);
	item_key_by_coord(&from, &from_key);

	return cut_node_content(&from, to, &from_key, to_key, NULL);
}
/* Copy as much of the leading extents from @right to @left, allocating
   unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
   SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
   internal item it calls shift_one_internal_unit and may then return
   SUBTREE_MOVED. */
static int squeeze_right_twig(znode * left, znode * right, flush_pos_t *pos)
{
	int ret = SUBTREE_MOVED;
	coord_t coord;		/* used to iterate over items */
	reiser4_key stop_key;

	assert("jmacd-2008", !node_is_empty(right));
	coord_init_first_unit(&coord, right);

	/* FIXME: can be optimized to cut once */
	while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
		ON_DEBUG(void *vp);

		assert("vs-1468", coord_is_leftmost_unit(&coord));
		ON_DEBUG(vp = shift_check_prepare(left, coord.node));

		/* stop_key is used to find what was copied and what to cut */
		stop_key = *reiser4_min_key();
		ret = squalloc_extent(left, &coord, pos, &stop_key);
		if (ret != SQUEEZE_CONTINUE) {
			ON_DEBUG(kfree(vp));
			break;
		}

		assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));

		/* Helper function to do the cutting. */
		set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
		check_me("vs-1466",
			 squalloc_right_twig_cut(&coord, &stop_key, left) == 0);

		ON_DEBUG(shift_check(vp, left, coord.node));
	}

	if (node_is_empty(coord.node))
		ret = SQUEEZE_SOURCE_EMPTY;

	if (ret == SQUEEZE_TARGET_FULL)
		goto out;

	if (node_is_empty(right)) {
		/* The whole right node was copied into @left. */
		assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
		goto out;
	}

	coord_init_first_unit(&coord, right);

	if (!item_is_internal(&coord)) {
		/* we do not want to squeeze anything else to left neighbor
		   because "slum" is over */
		ret = SQUEEZE_TARGET_FULL;
		goto out;
	}
	assert("jmacd-433", item_is_internal(&coord));

	/* Shift an internal unit. The child must be allocated before shifting
	   any more extents, so we stop here. */
	ret = shift_one_internal_unit(left, right);

 out:
	assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
	       || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);

	if (ret == SQUEEZE_TARGET_FULL) {
		/* We submit prepped nodes here and expect that this @left twig
		 * will not be modified again during this jnode_flush() call. */
		int ret1;

		/* NOTE: seems like io is done under long term locks. */
		ret1 = write_prepped_nodes(pos);
		if (ret1 < 0)
			return ret1;
	}

	return ret;
}
#if REISER4_DEBUG
static void item_convert_invariant(flush_pos_t *pos)
{
	assert("edward-1225", coord_is_existing_item(&pos->coord));
	if (chaining_data_present(pos)) {
		item_plugin *iplug = item_convert_plug(pos);

		assert("edward-1000",
		       iplug == item_plugin_by_coord(&pos->coord));
		assert("edward-1001", iplug->f.convert != NULL);
	} else
		assert("edward-1226", pos->child == NULL);
}
#else
#define item_convert_invariant(pos) noop
#endif
/* Scan node items starting from the first one and apply for each
   item its flush ->convert() method (if any). This method may
   resize/kill the item, so the tree will be changed. */
static int convert_node(flush_pos_t *pos, znode * node)
{
	int ret = 0;
	item_plugin *iplug;

	assert("edward-304", pos != NULL);
	assert("edward-305", pos->child == NULL);
	assert("edward-475", znode_convertible(node));
	assert("edward-669", znode_is_wlocked(node));
	assert("edward-1210", !node_is_empty(node));

	if (znode_get_level(node) != LEAF_LEVEL)
		/* unsupported */
		goto exit;

	coord_init_first_unit(&pos->coord, node);

	while (1) {
		ret = 0;
		coord_set_to_left(&pos->coord);
		item_convert_invariant(pos);

		iplug = item_plugin_by_coord(&pos->coord);
		assert("edward-844", iplug != NULL);

		if (iplug->f.convert) {
			ret = iplug->f.convert(pos);
			if (ret)
				goto exit;
		}
		assert("edward-307", pos->child == NULL);

		if (coord_next_item(&pos->coord)) {
			/* node is over */

			if (!chaining_data_present(pos))
				/* finished this node */
				break;
			if (should_chain_next_node(pos)) {
				/* go to next node */
				move_chaining_data(pos, 0/* to next node */);
				break;
			}
			/* repeat this node */
			move_chaining_data(pos, 1/* this node */);
			continue;
		}
		/* Node is not over.
		   Check if there is attached convert data.
		   If so roll one item position back and repeat
		   on this node. */
		if (chaining_data_present(pos)) {
			if (iplug != item_plugin_by_coord(&pos->coord))
				set_item_convert_count(pos, 0);

			ret = coord_prev_item(&pos->coord);
			assert("edward-1003", !ret);

			move_chaining_data(pos, 1/* this node */);
		}
	}
	JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
	znode_make_dirty(node);
 exit:
	assert("edward-1004", !ret);
	return ret;
}
/* Squeeze and allocate the right neighbor. This is called after @left and
   its current children have been squeezed and allocated already. This
   procedure's job is to squeeze and allocate items from @right to @left.

   If at the leaf level, use the shift_everything_left memcpy-optimized
   version of shifting (squeeze_right_leaf).

   If at the twig level, extents are allocated as they are shifted from @right
   to @left (squalloc_right_twig).

   At any other level, shift one internal item and return to the caller
   (squalloc_parent_first) so that the shifted-subtree can be processed in
   parent-first order.

   When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
   returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
   returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
   is returned. */
static int squeeze_right_neighbor(flush_pos_t *pos, znode * left,
				  znode * right)
{
	int ret;

	/* FIXME it is possible to see empty hasn't-heard-banshee node in a
	 * tree owing to error (for example, ENOSPC) in write */
	/* assert("jmacd-9321", !node_is_empty(left)); */
	assert("jmacd-9322", !node_is_empty(right));
	assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));

	switch (znode_get_level(left)) {
	case TWIG_LEVEL:
		/* Shift with extent allocating until either an internal item
		   is encountered or everything is shifted or no free space
		   is left in @left */
		ret = squeeze_right_twig(left, right, pos);
		break;

	default:
		/* All other levels can use shift_everything until we implement
		   per-item flush plugins. */
		ret = squeeze_right_non_twig(left, right);
		break;
	}

	assert("jmacd-2011", (ret < 0 || ret == SQUEEZE_SOURCE_EMPTY
			      || ret == SQUEEZE_TARGET_FULL
			      || ret == SUBTREE_MOVED));
	return ret;
}
static int squeeze_right_twig_and_advance_coord(flush_pos_t *pos,
						znode * right)
{
	int ret;

	ret = squeeze_right_twig(pos->lock.node, right, pos);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		coord_init_after_last_item(&pos->coord, pos->lock.node);
		return ret;
	}

	coord_init_last_unit(&pos->coord, pos->lock.node);
	return 0;
}
/* forward declaration */
static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);

/* do a fast check for "same parents" condition before calling
 * squalloc_upper_levels() */
static inline int check_parents_and_squalloc_upper_levels(flush_pos_t *pos,
							   znode * left,
							   znode * right)
{
	if (znode_same_parents(left, right))
		return 0;

	return squalloc_upper_levels(pos, left, right);
}
/* Check whether the parent of given @right node needs to be processed
   ((re)allocated) prior to processing of the child. If @left and @right do
   not share the same parent, the parent of @right comes after @left but
   before @right in parent-first order, so we have to (re)allocate it before
   @right gets (re)allocated. */
static int squalloc_upper_levels(flush_pos_t *pos, znode * left, znode * right)
{
	int ret;

	lock_handle left_parent_lock;
	lock_handle right_parent_lock;

	load_count left_parent_load;
	load_count right_parent_load;

	init_lh(&left_parent_lock);
	init_lh(&right_parent_lock);

	init_load_count(&left_parent_load);
	init_load_count(&right_parent_load);

	ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
	if (ret)
		goto out;

	ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
	if (ret)
		goto out;

	/* Check for same parents */
	if (left_parent_lock.node == right_parent_lock.node)
		goto out;

	if (znode_check_flushprepped(right_parent_lock.node)) {
		/* Keep parent-first order. In the order, the right parent node
		   stands before the @right node. If it is already allocated,
		   we set the preceder (next block search start point) to its
		   block number, @right node should be allocated after it.

		   However, preceder is set only if the right parent is on twig
		   level. The explanation is the following: new branch nodes
		   are allocated over already allocated children while the tree
		   grows, it is difficult to keep tree ordered, we assume that
		   only leaves and twigs are correctly allocated. So, only
		   twigs are used as a preceder for allocating of the rest of
		   the tree. */
		if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
			pos->preceder.blk =
			    *znode_get_block(right_parent_lock.node);
			check_preceder(pos->preceder.blk);
		}
		goto out;
	}

	ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
	if (ret)
		goto out;

	ret = incr_load_count_znode(&right_parent_load,
				    right_parent_lock.node);
	if (ret)
		goto out;

	ret = squeeze_right_neighbor(pos, left_parent_lock.node,
				     right_parent_lock.node);
	/* We stop if error. We stop if some items/units were shifted (ret == 0)
	 * and thus @right changed its parent. It means we have not processed
	 * right_parent node prior to processing of @right. Positive return
	 * values say that shifting items did not happen because of "empty
	 * source" or "target full" conditions. */
	if (ret <= 0)
		goto out;

	/* parent(@left) and parent(@right) may have different parents also. We
	 * do a recursive call for checking that. */
	ret = check_parents_and_squalloc_upper_levels(pos,
						      left_parent_lock.node,
						      right_parent_lock.node);
	if (ret)
		goto out;

	/* allocate znode when going down */
	ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);

 out:
	done_load_count(&left_parent_load);
	done_load_count(&right_parent_load);

	done_lh(&left_parent_lock);
	done_lh(&right_parent_lock);

	return ret;
}
/* Check the leftmost child "flushprepped" status, also returns true if child
 * node was not found in cache. */
static int leftmost_child_of_unit_check_flushprepped(const coord_t *coord)
{
	int ret;
	int prepped;

	jnode *child;

	ret = get_leftmost_child_of_unit(coord, &child);
	if (ret)
		return ret;

	if (child) {
		prepped = jnode_check_flushprepped(child);
		jput(child);
	} else {
		/* We consider a not existing child as a node to which slum
		   processing should not continue. A node that is not cached
		   is clean, so it is flushprepped. */
		prepped = 1;
	}

	return prepped;
}
/* (re)allocate znode with automated getting parent node */
static int lock_parent_and_allocate_znode(znode * node, flush_pos_t *pos)
{
	int ret;
	lock_handle parent_lock;
	load_count parent_load;
	coord_t pcoord;

	assert("zam-851", znode_is_write_locked(node));

	init_lh(&parent_lock);
	init_load_count(&parent_load);

	ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
	if (ret)
		goto out;

	ret = incr_load_count_znode(&parent_load, parent_lock.node);
	if (ret)
		goto out;

	ret = find_child_ptr(parent_lock.node, node, &pcoord);
	if (ret)
		goto out;

	ret = allocate_znode(node, &pcoord, pos);

 out:
	done_load_count(&parent_load);
	done_lh(&parent_lock);
	return ret;
}
/* Process nodes on leaf level until unformatted node or rightmost node in the
 * slum reached. */
static int handle_pos_on_formatted(flush_pos_t *pos)
{
	int ret;
	lock_handle right_lock;
	load_count right_load;

	init_lh(&right_lock);
	init_load_count(&right_load);

	if (should_convert_node(pos, pos->lock.node)) {
		ret = convert_node(pos, pos->lock.node);
		if (ret)
			goto out;
	}

	while (1) {
		int expected;

		expected = should_convert_next_node(pos);
		ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
				       ZNODE_WRITE_LOCK, !expected, expected);
		if (ret) {
			if (expected)
				warning("edward-1495",
					"Expected neighbor not found (ret = %d). Fsck?",
					ret);
			break;
		}

		/* we don't prep(allocate) nodes for flushing twice. This can be
		 * suboptimal, or it can be optimal. For now we choose to live
		 * with the risk that it will be suboptimal because it would be
		 * quite complex to code it to be smarter. */
		if (znode_check_flushprepped(right_lock.node)
		    && !znode_convertible(right_lock.node)) {
			assert("edward-1005", !should_convert_next_node(pos));
			pos_stop(pos);
			break;
		}

		ret = incr_load_count_znode(&right_load, right_lock.node);
		if (ret)
			break;

		if (should_convert_node(pos, right_lock.node)) {
			ret = convert_node(pos, right_lock.node);
			if (ret)
				break;
			if (node_is_empty(right_lock.node)) {
				/* node became empty after converting, repeat */
				done_load_count(&right_load);
				done_lh(&right_lock);
				continue;
			}
		}

		/* squeeze _before_ going upward. */
		ret = squeeze_right_neighbor(pos, pos->lock.node,
					     right_lock.node);
		if (ret < 0)
			break;

		if (znode_check_flushprepped(right_lock.node)) {
			if (should_convert_next_node(pos)) {
				/* in spite of flushprepped status of the node,
				   its right slum neighbor should be converted*/
				assert("edward-953", convert_data(pos));
				assert("edward-954", item_convert_data(pos));
			} else {
				pos_stop(pos);
				break;
			}
			if (node_is_empty(right_lock.node)) {
				done_load_count(&right_load);
				done_lh(&right_lock);
				continue;
			}
			move_flush_pos(pos, &right_lock, &right_load, NULL);
			continue;
		}

		if (node_is_empty(right_lock.node)) {
			/* repeat if right node was squeezed completely */
			done_load_count(&right_load);
			done_lh(&right_lock);
			continue;
		}

		/* parent(right_lock.node) has to be processed before
		 * (right_lock.node) due to "parent-first" allocation order. */
		ret = check_parents_and_squalloc_upper_levels(pos,
							      pos->lock.node,
							      right_lock.node);
		if (ret)
			break;

		/* (re)allocate _after_ going upward */
		ret = lock_parent_and_allocate_znode(right_lock.node, pos);
		if (ret)
			break;

		if (should_terminate_squalloc(pos)) {
			set_item_convert_count(pos, 0);
			break;
		}

		/* advance the flush position to the right neighbor */
		move_flush_pos(pos, &right_lock, &right_load, NULL);

		ret = rapid_flush(pos);
		if (ret)
			break;
	}
 out:
	check_convert_info(pos);
	done_load_count(&right_load);
	done_lh(&right_lock);

	/* This function indicates via pos whether to stop or go to twig or
	 * continue on current level. */
	return ret;
}
/* Process nodes on leaf level until unformatted node or rightmost node in the
 * slum reached. */
static int handle_pos_on_leaf(flush_pos_t *pos)
{
	int ret;

	assert("zam-845", pos->state == POS_ON_LEAF);

	ret = handle_pos_on_formatted(pos);

	if (ret == -E_NO_NEIGHBOR) {
		/* cannot get right neighbor, go process extents. */
		pos->state = POS_TO_TWIG;
		return 0;
	}

	return ret;
}

/* Process slum on level > 1 */
static int handle_pos_on_internal(flush_pos_t *pos)
{
	assert("zam-850", pos->state == POS_ON_INTERNAL);
	return handle_pos_on_formatted(pos);
}
/* check whether squalloc should stop before processing given extent */
static int squalloc_extent_should_stop(flush_pos_t *pos)
{
	assert("zam-869", item_is_extent(&pos->coord));

	/* pos->child is a jnode handle_pos_on_extent() should start with
	 * instead of the first child of the first extent unit. */
	if (pos->child) {
		int prepped;

		assert("vs-1383", jnode_is_unformatted(pos->child));
		prepped = jnode_check_flushprepped(pos->child);
		pos->pos_in_unit =
		    jnode_get_index(pos->child) -
		    extent_unit_index(&pos->coord);
		assert("vs-1470",
		       pos->pos_in_unit < extent_unit_width(&pos->coord));
		assert("nikita-3434",
		       ergo(extent_is_unallocated(&pos->coord),
			    pos->pos_in_unit == 0));
		jput(pos->child);
		pos->child = NULL;

		return prepped;
	}

	pos->pos_in_unit = 0;
	if (extent_is_unallocated(&pos->coord))
		return 0;

	return leftmost_child_of_unit_check_flushprepped(&pos->coord);
}
/* Handle the case when regular reiser4 tree (znodes connected one to its
 * neighbors by sibling pointers) is interrupted on leaf level by one or more
 * unformatted nodes. By having a lock on twig level and using extent code
 * routines to process unformatted nodes we swim around an irregular part of
 * the reiser4 tree. */
static int handle_pos_on_twig(flush_pos_t *pos)
{
	int ret;

	assert("zam-844", pos->state == POS_ON_EPOINT);
	assert("zam-843", item_is_extent(&pos->coord));

	/* We decide should we continue slum processing with current extent
	   unit: if leftmost child of current extent unit is flushprepped
	   (i.e. clean or already processed by flush) we stop squalloc(). There
	   is a fast check for unallocated extents which we assume contain all
	   not flushprepped nodes. */
	/* FIXME: Here we implement simple check, we are only looking on the
	   leftmost child. */
	ret = squalloc_extent_should_stop(pos);
	if (ret != 0) {
		pos_stop(pos);
		return ret;
	}

	while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
	       && item_is_extent(&pos->coord)) {
		ret = reiser4_alloc_extent(pos);
		if (ret)
			return ret;
		coord_next_unit(&pos->coord);
	}

	if (coord_is_after_rightmost(&pos->coord)) {
		pos->state = POS_END_OF_TWIG;
		return 0;
	}
	if (item_is_internal(&pos->coord)) {
		pos->state = POS_TO_LEAF;
		return 0;
	}

	assert("zam-860", item_is_extent(&pos->coord));

	/* "slum" is over */
	pos->state = POS_INVALID;
	return 0;
}
/* When we are about to return the flush position from twig to leaf level we
 * can process the right twig node or move the position to the leaf. This
 * processes the right twig if it is possible and jumps to leaf level if not. */
static int handle_pos_end_of_twig(flush_pos_t *pos)
{
	int ret;
	lock_handle right_lock;
	load_count right_load;
	coord_t at_right;
	jnode *child = NULL;

	assert("zam-848", pos->state == POS_END_OF_TWIG);
	assert("zam-849", coord_is_after_rightmost(&pos->coord));

	init_lh(&right_lock);
	init_load_count(&right_load);

	/* We get a lock on the right twig node even it is not dirty because
	 * slum continues or discontinues on leaf level not on next twig. This
	 * lock on the right twig is needed for getting its leftmost child. */
	ret = reiser4_get_right_neighbor(&right_lock, pos->lock.node,
					 ZNODE_WRITE_LOCK, GN_SAME_ATOM);
	if (ret)
		goto out;

	ret = incr_load_count_znode(&right_load, right_lock.node);
	if (ret)
		goto out;

	/* right twig could be not dirty */
	if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
		/* If right twig node is dirty we always attempt to squeeze its
		 * content to the left... */
		ret = squeeze_right_twig_and_advance_coord(pos,
							   right_lock.node);
		if (ret <= 0) {
			/* pos->coord is on internal item, go to leaf level, or
			 * we have an error which will be caught in squalloc()
			 */
			pos->state = POS_TO_LEAF;
			goto out;
		}

		/* If right twig was squeezed completely we have to re-lock
		 * right twig. Now it is done through the top-level squalloc
		 * loop. */
		if (node_is_empty(right_lock.node))
			goto out;

		/* ... and prep it if it is not yet prepped */
		if (!znode_check_flushprepped(right_lock.node)) {
			/* As usual, process parent before ... */
			ret = check_parents_and_squalloc_upper_levels(
				pos, pos->lock.node, right_lock.node);
			if (ret)
				goto out;

			/* ... processing the child */
			ret = lock_parent_and_allocate_znode(right_lock.node,
							     pos);
			if (ret)
				goto out;
		}
	} else {
		coord_init_first_unit(&at_right, right_lock.node);

		/* check first child of next twig, should we continue there ? */
		ret = get_leftmost_child_of_unit(&at_right, &child);
		if (ret || child == NULL || jnode_check_flushprepped(child)) {
			pos_stop(pos);
			goto out;
		}

		/* check clean twig for possible relocation */
		if (!znode_check_flushprepped(right_lock.node)) {
			ret = reverse_relocate_check_dirty_parent(child,
								  &at_right,
								  pos);
			if (ret)
				goto out;
			if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
				goto out;
		}
	}

	assert("zam-875", znode_check_flushprepped(right_lock.node));

	/* Update the preceder by a block number of just processed right twig
	 * node. The code above could miss the preceder updating because
	 * allocate_znode() could not be called for this node. */
	pos->preceder.blk = *znode_get_block(right_lock.node);
	check_preceder(pos->preceder.blk);

	coord_init_first_unit(&at_right, right_lock.node);
	assert("zam-868", coord_is_existing_unit(&at_right));

	pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
	move_flush_pos(pos, &right_lock, &right_load, &at_right);

 out:
	done_load_count(&right_load);
	done_lh(&right_lock);

	if (child)
		jput(child);

	return ret;
}
/* Move the pos->lock to leaf node pointed by pos->coord, check should we
 * continue there. */
static int handle_pos_to_leaf(flush_pos_t *pos)
{
	int ret;
	lock_handle child_lock;
	load_count child_load;
	jnode *child;

	assert("zam-846", pos->state == POS_TO_LEAF);
	assert("zam-847", item_is_internal(&pos->coord));

	init_lh(&child_lock);
	init_load_count(&child_load);

	ret = get_leftmost_child_of_unit(&pos->coord, &child);
	if (ret)
		return ret;
	if (child == NULL) {
		pos_stop(pos);
		return 0;
	}

	if (jnode_check_flushprepped(child)) {
		pos->state = POS_INVALID;
		goto out;
	}

	ret = longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
				  ZNODE_LOCK_LOPRI);
	if (ret)
		goto out;

	ret = incr_load_count_znode(&child_load, JZNODE(child));
	if (ret)
		goto out;

	ret = allocate_znode(JZNODE(child), &pos->coord, pos);
	if (ret)
		goto out;

	/* move flush position to leaf level */
	pos->state = POS_ON_LEAF;
	move_flush_pos(pos, &child_lock, &child_load, NULL);

	if (node_is_empty(JZNODE(child))) {
		ret = delete_empty_node(JZNODE(child));
		pos->state = POS_INVALID;
	}
 out:
	done_load_count(&child_load);
	done_lh(&child_lock);
	jput(child);

	return ret;
}
/* move pos from leaf to twig, and move lock from leaf to twig. */
/* Move pos->lock to upper (twig) level */
static int handle_pos_to_twig(flush_pos_t *pos)
{
	int ret;
	lock_handle parent_lock;
	load_count parent_load;
	coord_t pcoord;

	assert("zam-852", pos->state == POS_TO_TWIG);

	init_lh(&parent_lock);
	init_load_count(&parent_load);

	ret = reiser4_get_parent(&parent_lock, pos->lock.node,
				 ZNODE_WRITE_LOCK);
	if (ret)
		goto out;

	ret = incr_load_count_znode(&parent_load, parent_lock.node);
	if (ret)
		goto out;

	ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
	if (ret)
		goto out;

	assert("zam-870", item_is_internal(&pcoord));
	coord_next_item(&pcoord);

	if (coord_is_after_rightmost(&pcoord))
		pos->state = POS_END_OF_TWIG;
	else if (item_is_extent(&pcoord))
		pos->state = POS_ON_EPOINT;
	else {
		/* Here we understand that getting -E_NO_NEIGHBOR in
		 * handle_pos_on_leaf() was because of just reaching the edge
		 * of the slum. */
		pos_stop(pos);
		goto out;
	}

	move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);

 out:
	done_load_count(&parent_load);
	done_lh(&parent_lock);

	return ret;
}
typedef int (*pos_state_handle_t) (flush_pos_t *);
static pos_state_handle_t flush_pos_handlers[] = {
	/* process formatted nodes on leaf level, keep lock on a leaf node */
	[POS_ON_LEAF] = handle_pos_on_leaf,
	/* process unformatted nodes, keep lock on twig node, pos->coord points
	 * to extent currently being processed */
	[POS_ON_EPOINT] = handle_pos_on_twig,
	/* move a lock from leaf node to its parent for further processing of
	   unformatted nodes */
	[POS_TO_TWIG] = handle_pos_to_twig,
	/* move a lock from twig to leaf level when a processing of unformatted
	 * nodes finishes, pos->coord points to the leaf node we jump to */
	[POS_TO_LEAF] = handle_pos_to_leaf,
	/* after processing last extent in the twig node, attempting to shift
	 * items from the twig's right neighbor and process them while shifting*/
	[POS_END_OF_TWIG] = handle_pos_end_of_twig,
	/* process formatted nodes on internal level, keep lock on an internal
	   node */
	[POS_ON_INTERNAL] = handle_pos_on_internal
};
/* Advance flush position horizontally, prepare for flushing ((re)allocate,
 * squeeze, encrypt) nodes and their ancestors in "parent-first" order */
static int squalloc(flush_pos_t *pos)
{
	int ret = 0;

	/* maybe needs to be made a case statement with handle_pos_on_leaf as
	 * first case, for greater CPU efficiency? Measure and see.... -Hans */
	while (pos_valid(pos)) {
		ret = flush_pos_handlers[pos->state] (pos);
		if (ret < 0)
			break;

		ret = rapid_flush(pos);
		if (ret)
			break;
	}

	/* any positive value or -E_NO_NEIGHBOR are legal return codes for
	   handle_pos* routines, -E_NO_NEIGHBOR means that slum edge was
	   reached */
	if (ret > 0 || ret == -E_NO_NEIGHBOR)
		ret = 0;

	return ret;
}
static void update_ldkey(znode * node)
{
	reiser4_key ldkey;

	assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
	if (node_is_empty(node))
		return;

	znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
}

/* this is to be called after calling of shift node's method to shift data from
   @right to @left. It sets left delimiting keys of @left and @right to keys of
   first items of @left and @right correspondingly and sets right delimiting key
   of @left to first key of @right */
static void update_znode_dkeys(znode * left, znode * right)
{
	assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
	assert("vs-1629", (znode_is_write_locked(left) &&
			   znode_is_write_locked(right)));

	/* we need to update left delimiting of left if it was empty before
	   shift */
	update_ldkey(left);
	update_ldkey(right);
	if (node_is_empty(right))
		znode_set_rd_key(left, znode_get_rd_key(right));
	else
		znode_set_rd_key(left, znode_get_ld_key(right));
}
/* try to shift everything from @right to @left. If everything was shifted -
   @right is removed from the tree. Result is the number of bytes shifted. */
static int
shift_everything_left(znode * right, znode * left, carry_level * todo)
{
	coord_t from;
	node_plugin *nplug;
	carry_plugin_info info;

	coord_init_after_last_item(&from, right);

	nplug = node_plugin_by_node(right);
	info.doing = NULL;
	info.todo = todo;
	return nplug->shift(&from, left, SHIFT_LEFT,
			    1 /* delete @right if it becomes empty */ ,
			    1 /* move coord @from to node @left if everything
			         will be shifted */,
			    &info);
}
/* Shift as much as possible from @right to @left using the memcpy-optimized
   shift_everything_left. @left and @right are formatted neighboring nodes on
   leaf level. */
static int squeeze_right_non_twig(znode * left, znode * right)
{
	int ret;
	carry_pool *pool;
	carry_level *todo;

	assert("nikita-2246", znode_get_level(left) == znode_get_level(right));

	if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
	    !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
		return SQUEEZE_TARGET_FULL;

	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
	if (IS_ERR(pool))
		return PTR_ERR(pool);
	todo = (carry_level *) (pool + 1);
	init_carry_level(todo, pool);

	ret = shift_everything_left(right, left, todo);
	if (ret > 0) {
		/* something was shifted */
		reiser4_tree *tree;
		__u64 grabbed;

		znode_make_dirty(left);
		znode_make_dirty(right);

		/* update delimiting keys of nodes which participated in
		   shift. FIXME: it would be better to have this in shift
		   node's operation. But it can not be done there. Nobody
		   remembers why, though */
		tree = znode_get_tree(left);
		write_lock_dk(tree);
		update_znode_dkeys(left, right);
		write_unlock_dk(tree);

		/* Carry is called to update delimiting key and, maybe, to
		   remove empty node. */
		grabbed = get_current_context()->grabbed_blocks;
		ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
		assert("nikita-3003", ret == 0);	/* reserved space is
							   exhausted. Ask Hans. */
		ret = reiser4_carry(todo, NULL/* previous level */);
		grabbed2free_mark(grabbed);
	} else {
		/* Shifting impossible, we return appropriate result code */
		ret =
		    node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
		    SQUEEZE_TARGET_FULL;
	}

	done_carry_pool(pool);

	return ret;
}
static int sibling_link_is_ok(const znode *left, const znode *right)
{
	int result;

	read_lock_tree(znode_get_tree(left));
	result = (left->right == right && left == right->left);
	read_unlock_tree(znode_get_tree(left));
	return result;
}
/* Shift first unit of first item if it is an internal one. Return
   SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
   SUBTREE_MOVED. */
static int shift_one_internal_unit(znode * left, znode * right)
{
	int ret;
	carry_pool *pool;
	carry_level *todo;
	coord_t *coord;
	carry_plugin_info *info;
	int size, moved;

	assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
	assert("nikita-2435", znode_is_write_locked(left));
	assert("nikita-2436", znode_is_write_locked(right));
	assert("nikita-2434", sibling_link_is_ok(left, right));

	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
			       sizeof(*coord) + sizeof(*info)
#if REISER4_DEBUG
			       + sizeof(*coord) + 2 * sizeof(reiser4_key)
#endif
	    );
	if (IS_ERR(pool))
		return PTR_ERR(pool);
	todo = (carry_level *) (pool + 1);
	init_carry_level(todo, pool);

	coord = (coord_t *) (todo + 3);
	coord_init_first_unit(coord, right);
	info = (carry_plugin_info *) (coord + 1);

#if REISER4_DEBUG
	if (!node_is_empty(left)) {
		coord_t *last;
		reiser4_key *right_key;
		reiser4_key *left_key;

		last = (coord_t *) (info + 1);
		right_key = (reiser4_key *) (last + 1);
		left_key = right_key + 1;
		coord_init_last_unit(last, left);

		assert("nikita-2463",
		       keyle(item_key_by_coord(last, left_key),
			     item_key_by_coord(coord, right_key)));
	}
#endif

	assert("jmacd-2007", item_is_internal(coord));

	size = item_length_by_coord(coord);
	info->todo = todo;
	info->doing = NULL;

	ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
					       1
					       /* delete @right if it becomes
						  empty */
					       ,
					       0
					       /* do not move coord @coord to
						  node @left */
					       ,
					       info);

	/* If shift returns positive, then we shifted the item. */
	assert("vs-423", ret <= 0 || size == ret);
	moved = (ret > 0);

	if (moved) {
		/* something was moved */
		reiser4_tree *tree;
		__u64 grabbed;

		znode_make_dirty(left);
		znode_make_dirty(right);
		tree = znode_get_tree(left);
		write_lock_dk(tree);
		update_znode_dkeys(left, right);
		write_unlock_dk(tree);

		/* reserve space for delimiting keys after shifting */
		grabbed = get_current_context()->grabbed_blocks;
		ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
		assert("nikita-3003", ret == 0);	/* reserved space is
							   exhausted. Ask Hans. */

		ret = reiser4_carry(todo, NULL/* previous level */);
		grabbed2free_mark(grabbed);
	}

	done_carry_pool(pool);

	if (ret != 0) {
		/* Shift or carry operation failed. */
		assert("jmacd-7325", ret < 0);
		return ret;
	}

	return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
}
/* Make the final relocate/wander decision during forward parent-first squalloc
   for a znode. For unformatted nodes this is done in
   plugin/item/extent.c:extent_needs_allocation(). */
static int
allocate_znode_loaded(znode * node,
		      const coord_t *parent_coord, flush_pos_t *pos)
{
	int ret;
	reiser4_super_info_data *sbinfo = get_current_super_private();
	/* FIXME(D): We have the node write-locked and should have checked for !
	   allocated() somewhere before reaching this point, but there can be a
	   race, so this assertion is bogus. */
	assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
	assert("jmacd-7988", znode_is_write_locked(node));
	assert("jmacd-7989", coord_is_invalid(parent_coord)
	       || znode_is_write_locked(parent_coord->node));

	if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
	    znode_is_root(node) ||
	    /* We have enough nodes to relocate no matter what. */
	    (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
		/* No need to decide with new nodes, they are treated the same
		   as relocate. If the root node is dirty, relocate. */
		if (pos->preceder.blk == 0) {
			/* preceder is unknown and we have decided to relocate
			   node -- using of default value for search start is
			   better than search from block #0. */
			get_blocknr_hint_default(&pos->preceder.blk);
			check_preceder(pos->preceder.blk);
		}

		goto best_reloc;

	} else if (pos->preceder.blk == 0) {
		/* If we don't know the preceder, leave it where it is. */
		jnode_make_wander(ZJNODE(node));
	} else {
		/* Make a decision based on block distance. */
		reiser4_block_nr dist;
		reiser4_block_nr nblk = *znode_get_block(node);

		assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
		assert("jmacd-6173",
		       !reiser4_blocknr_is_fake(&pos->preceder.blk));
		assert("jmacd-6174", pos->preceder.blk != 0);

		if (pos->preceder.blk == nblk - 1) {
			jnode_make_wander(ZJNODE(node));
		} else {
			dist =
			    (nblk <
			     pos->preceder.blk) ? (pos->preceder.blk -
						   nblk) : (nblk -
							    pos->preceder.blk);

			/* See if we can find a closer block
			   (forward direction only). */
			pos->preceder.max_dist =
			    min((reiser4_block_nr) sbinfo->flush.
				relocate_distance, dist);
			pos->preceder.level = znode_get_level(node);

			ret = allocate_znode_update(node, parent_coord, pos);

			pos->preceder.max_dist = 0;

			if (ret && (ret != -ENOSPC))
				return ret;

			if (ret == 0) {
				/* Got a better allocation. */
				znode_make_reloc(node, pos->fq);
			} else if (dist < sbinfo->flush.relocate_distance) {
				/* The present allocation is good enough. */
				jnode_make_wander(ZJNODE(node));
			} else {
				/* Otherwise, try to relocate to the best
				   position. */
 best_reloc:
				ret = allocate_znode_update(node, parent_coord,
							    pos);
				if (ret != 0)
					return ret;

				/* set JNODE_RELOC bit _after_ node gets
				   allocated */
				znode_make_reloc(node, pos->fq);
			}
		}
	}

	/* This is the new preceder. */
	pos->preceder.blk = *znode_get_block(node);
	check_preceder(pos->preceder.blk);
	pos->alloc_cnt += 1;

	assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));

	return 0;
}
static int
allocate_znode(znode * node, const coord_t *parent_coord, flush_pos_t *pos)
{
	/*
	 * perform znode allocation with znode pinned in memory to avoid races
	 * with asynchronous emergency flush (which plays with
	 * JNODE_FLUSH_RESERVED bit).
	 */
	return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
}
/* A subroutine of allocate_znode, this is called first to see if there is a
   close position to relocate to. It may return ENOSPC if there is no close
   position. If there is no close position it may not relocate. This takes care
   of updating the parent node with the relocated block address. */
static int
allocate_znode_update(znode * node, const coord_t *parent_coord,
		      flush_pos_t *pos)
{
	int ret;
	reiser4_block_nr blk;
	lock_handle uber_lock;
	int flush_reserved_used = 0;
	__u64 grabbed;
	reiser4_context *ctx;
	reiser4_super_info_data *sbinfo;

	init_lh(&uber_lock);

	ctx = get_current_context();
	sbinfo = get_super_private(ctx->super);

	grabbed = ctx->grabbed_blocks;

	/* discard e-flush allocation */
	ret = zload(node);
	if (ret)
		return ret;

	if (ZF_ISSET(node, JNODE_CREATED)) {
		assert("zam-816",
		       reiser4_blocknr_is_fake(znode_get_block(node)));
		pos->preceder.block_stage = BLOCK_UNALLOCATED;
	} else {
		pos->preceder.block_stage = BLOCK_GRABBED;

		/* The disk space for relocating the @node is already reserved
		 * in "flush reserved" counter if @node is leaf, otherwise we
		 * grab space using BA_RESERVED (means grab space from whole
		 * disk not from only 95%). */
		if (znode_get_level(node) == LEAF_LEVEL) {
			/*
			 * earlier (during do_jnode_make_dirty()) we decided
			 * that @node can possibly go into overwrite set and
			 * reserved block for its wandering location.
			 */
			txn_atom *atom = get_current_atom_locked();
			assert("nikita-3449",
			       ZF_ISSET(node, JNODE_FLUSH_RESERVED));
			flush_reserved2grabbed(atom, (__u64) 1);
			spin_unlock_atom(atom);
			/*
			 * we are trying to move node into relocate
			 * set. Allocation of relocated position "uses"
			 * reserved block.
			 */
			ZF_CLR(node, JNODE_FLUSH_RESERVED);
			flush_reserved_used = 1;
		} else {
			ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
			if (ret != 0)
				goto exit;
		}
	}

	/* We may do not use 5% of reserved disk space here and flush will not
	   work. */
	ret = reiser4_alloc_block(&pos->preceder, &blk,
				  BA_FORMATTED | BA_PERMANENT);
	if (ret)
		goto exit;

	if (!ZF_ISSET(node, JNODE_CREATED) &&
	    (ret = reiser4_dealloc_block(znode_get_block(node), 0,
					 BA_DEFER | BA_FORMATTED)))
		goto exit;

	if (likely(!znode_is_root(node))) {
		item_plugin *iplug;

		iplug = item_plugin_by_coord(parent_coord);
		assert("nikita-2954", iplug->f.update != NULL);
		iplug->f.update(parent_coord, &blk);

		znode_make_dirty(parent_coord->node);

	} else {
		reiser4_tree *tree = znode_get_tree(node);
		znode *uber;

		/* We take a longterm lock on the fake node in order to change
		   the root block number. This may cause atom fusion. */
		ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
				     &uber_lock);
		/* The fake node cannot be deleted, and we must have priority
		   here, and may not be confused with ENOSPC. */
		assert("jmacd-74412",
		       ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);

		if (ret)
			goto exit;

		uber = uber_lock.node;

		write_lock_tree(tree);
		tree->root_block = blk;
		write_unlock_tree(tree);

		znode_make_dirty(uber);
	}

	ret = znode_rehash(node, &blk);
 exit:
	if (ret) {
		/* Get flush reserved block back if something fails, because
		 * callers assume that on error block wasn't relocated and its
		 * flush reserved block wasn't used. */
		if (flush_reserved_used) {
			/*
			 * ok, we failed to move node into relocate
			 * set. Restore status quo.
			 */
			grabbed2flush_reserved((__u64) 1);
			ZF_SET(node, JNODE_FLUSH_RESERVED);
		}
	}
	zrelse(node);
	done_lh(&uber_lock);
	grabbed2free_mark(grabbed);
	return ret;
}
/* JNODE INTERFACE */

/* Lock a node (if formatted) and then get its parent locked, set the child's
   coordinate in the parent. If the child is the root node, the above_root
   znode is returned but the coord is not set. This function may cause atom
   fusion, but it is only used for read locks (at this point) and therefore
   fusion only occurs when the parent is already dirty. */
/* Hans adds this note: remember to ask how expensive this operation is vs.
   storing parent pointer in jnodes. */
static int
jnode_lock_parent_coord(jnode * node,
			coord_t *coord,
			lock_handle * parent_lh,
			load_count * parent_zh,
			znode_lock_mode parent_mode, int try)
{
	int ret;

	assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
	assert("edward-54", jnode_is_unformatted(node)
	       || znode_is_any_locked(JZNODE(node)));

	if (!jnode_is_znode(node)) {
		reiser4_key key;
		tree_level stop_level = TWIG_LEVEL;
		lookup_bias bias = FIND_EXACT;

		assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));

		/* The case when node is not znode, but can have parent coord
		   (unformatted node, node which represents cluster page,
		   etc..). Generate a key for the appropriate entry, search
		   in the tree using coord_by_key, which handles locking for
		   us. */

		/*
		 * nothing is locked at this moment, so, nothing prevents
		 * concurrent truncate from removing jnode from inode. To
		 * prevent this spin-lock jnode. jnode can be truncated just
		 * after call to the jnode_build_key(), but this is ok,
		 * because coord_by_key() will just fail to find appropriate
		 * item.
		 */
		spin_lock_jnode(node);
		if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
			jnode_build_key(node, &key);
			ret = 0;
		} else
			ret = RETERR(-ENOENT);
		spin_unlock_jnode(node);

		if (ret != 0)
			return ret;

		if (jnode_is_cluster_page(node))
			stop_level = LEAF_LEVEL;

		assert("jmacd-1812", coord != NULL);

		ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
				   parent_mode, bias, stop_level, stop_level,
				   CBK_UNIQUE, NULL/*ra_info */);
		switch (ret) {
		case CBK_COORD_NOTFOUND:
			assert("edward-1038",
			       ergo(jnode_is_cluster_page(node),
				    JF_ISSET(node, JNODE_HEARD_BANSHEE)));
			if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
				warning("nikita-3177", "Parent not found");
			return ret;
		case CBK_COORD_FOUND:
			if (coord->between != AT_UNIT) {
				/* FIXME: comment needed */
				done_lh(parent_lh);
				if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
					warning("nikita-3178",
						"Found but not happy: %i",
						coord->between);
				}
				return RETERR(-ENOENT);
			}
			ret = incr_load_count_znode(parent_zh, parent_lh->node);
			if (ret != 0)
				return ret;
			/* if (jnode_is_cluster_page(node)) {
			   races with write() are possible
			   check_child_cluster (parent_lh->node);
			   } */
			break;
		default:
			return ret;
		}

	} else {
		int flags;
		znode *z = JZNODE(node);

		/* Formatted node case: */
		assert("jmacd-2061", !znode_is_root(z));

		flags = GN_ALLOW_NOT_CONNECTED;
		if (try)
			flags |= GN_TRY_LOCK;

		ret = reiser4_get_parent_flags(parent_lh, z, parent_mode,
					       flags);
		if (ret != 0)
			/* -E_REPEAT is ok here, it is handled by the caller. */
			return ret;

		/* Make the child's position "hint" up-to-date. (Unless above
		   root, which caller must check.) */
		if (coord != NULL) {

			ret = incr_load_count_znode(parent_zh, parent_lh->node);
			if (ret != 0) {
				warning("jmacd-976812386",
					"incr_load_count_znode failed: %d",
					ret);
				return ret;
			}

			ret = find_child_ptr(parent_lh->node, z, coord);
			if (ret != 0) {
				warning("jmacd-976812",
					"find_child_ptr failed: %d", ret);
				return ret;
			}
		}
	}

	return ret;
}
/* Get the (locked) next neighbor of a znode which is dirty and a member of the
   same atom. If there is no next neighbor or the neighbor is not in memory or
   if there is a neighbor but it is not dirty or not in the same atom,
   -E_NO_NEIGHBOR is returned. In some cases the slum may include nodes which
   are not dirty, if so @check_dirty should be 0 */
static int neighbor_in_slum(znode * node,	/* starting point */
			    lock_handle * lock,	/* lock on starting point */
			    sideof side,	/* left or right direction we
						   seek the next node in */
			    znode_lock_mode mode, /* kind of lock we want */
			    int check_dirty,	/* true if the neighbor should
						   be dirty */
			    int use_upper_levels /* get neighbor by going
						    through upper levels */)
{
	int ret;
	int flags;

	assert("jmacd-6334", znode_is_connected(node));

	flags = GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
	if (use_upper_levels)
		flags |= GN_CAN_USE_UPPER_LEVELS;

	ret = reiser4_get_neighbor(lock, node, mode, flags);
	if (ret) {
		/* May return -ENOENT or -E_NO_NEIGHBOR. */
		/* FIXME(C): check EINVAL, E_DEADLOCK */
		if (ret == -ENOENT)
			ret = RETERR(-E_NO_NEIGHBOR);
		return ret;
	}
	if (!check_dirty)
		return 0;
	/* Check dirty bit of locked znode, no races here */
	if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
		return 0;

	done_lh(lock);
	return RETERR(-E_NO_NEIGHBOR);
}
/* Return true if two znodes have the same parent. This is called with both
   nodes write-locked (for squeezing) so no tree lock is needed. */
static int znode_same_parents(znode * a, znode * b)
{
	int result;

	assert("jmacd-7011", znode_is_write_locked(a));
	assert("jmacd-7012", znode_is_write_locked(b));

	/* We lock the whole tree for this check.... I really don't like whole
	 * tree locks... -Hans */
	read_lock_tree(znode_get_tree(a));
	result = (znode_parent(a) == znode_parent(b));
	read_unlock_tree(znode_get_tree(a));
	return result;
}
/* Initialize the flush_scan data structure. */
static void scan_init(flush_scan * scan)
{
	memset(scan, 0, sizeof(*scan));
	init_lh(&scan->node_lock);
	init_lh(&scan->parent_lock);
	init_load_count(&scan->parent_load);
	init_load_count(&scan->node_load);
	coord_init_invalid(&scan->parent_coord, NULL);
}

/* Release any resources held by the flush scan, e.g. release locks,
   free memory, etc. */
static void scan_done(flush_scan * scan)
{
	done_load_count(&scan->node_load);
	if (scan->node != NULL) {
		jput(scan->node);
		scan->node = NULL;
	}
	done_load_count(&scan->parent_load);
	done_lh(&scan->parent_lock);
	done_lh(&scan->node_lock);
}
/* Returns true if flush scanning is finished. */
int reiser4_scan_finished(flush_scan * scan)
{
	return scan->stop || (scan->direction == RIGHT_SIDE &&
			      scan->count >= scan->max_count);
}

/* Return true if the scan should continue to the @tonode. True if the node
   meets the same_slum_check condition. If not, deref the "left" node and stop
   the scan. */
int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
{
	int go = same_slum_check(scan->node, tonode, 1, 0);

	if (!go) {
		scan->stop = 1;
		jput(tonode);
	}

	return go;
}
/* Set the current scan->node, refcount it, increment count by the @add_count
   (number to count, e.g., skipped unallocated nodes), deref previous current,
   and copy the current parent coordinate. */
int
scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
		 const coord_t *parent)
{
	/* Release the old references, take the new reference. */
	done_load_count(&scan->node_load);

	if (scan->node != NULL)
		jput(scan->node);
	scan->node = node;
	scan->count += add_count;

	/* This next stmt is somewhat inefficient. The reiser4_scan_extent()
	   code could delay this update step until it finishes and update the
	   parent_coord only once. It did that before, but there was a bug and
	   this was the easiest way to make it correct. */
	if (parent != NULL)
		coord_dup(&scan->parent_coord, parent);

	/* Failure may happen at the incr_load_count call, but the caller can
	   assume the reference is safely taken. */
	return incr_load_count_jnode(&scan->node_load, node);
}

/* Return true if scanning in the leftward direction. */
int reiser4_scanning_left(flush_scan * scan)
{
	return scan->direction == LEFT_SIDE;
}
/* Performs leftward scanning starting from either kind of node. Counts the
   starting node. The right-scan object is passed in for the left-scan in order
   to copy the parent of an unformatted starting position. This way we avoid
   searching for the unformatted node's parent when scanning in each direction.
   If we search for the parent once it is set in both scan objects. The limit
   parameter tells flush-scan when to stop.

   Rapid scanning is used only during scan_left, where we are interested in
   finding the 'leftpoint' where we begin flushing. We are interested in
   stopping at the left child of a twig that does not have a dirty left
   neighbour. THIS IS A SPECIAL CASE. The problem is finding a way to flush
   only those nodes without unallocated children, and it is difficult to solve
   in the bottom-up flushing algorithm we are currently using. The problem can
   be solved by scanning left at every level as we go upward, but this would
   basically bring us back to using a top-down allocation strategy, which we
   already tried (see BK history from May 2002), and has a different set of
   problems. The top-down strategy makes avoiding unallocated children easier,
   but makes it difficult to properly flush dirty children with clean parents
   that would otherwise stop the top-down flush, only later to dirty the
   parent once the children are flushed. So we solve the problem in the
   bottom-up algorithm with a special case for twigs and leaves only.

   The first step in solving the problem is this rapid leftward scan. After we
   determine that there are at least enough nodes counted to qualify for
   FLUSH_RELOCATE_THRESHOLD we are no longer interested in the exact count, we
   are only interested in finding the best place to start the flush.

   We could choose one of two possibilities:

   1. Stop at the leftmost child (of a twig) that does not have a dirty left
   neighbor. This requires checking one leaf per rapid-scan twig.

   2. Stop at the leftmost child (of a twig) where there are no dirty children
   of the twig to the left. This requires checking possibly all of the
   in-memory children of each twig during the rapid scan.

   For now we implement the first policy. */
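/* A minimal pseudo-code sketch of policy 1 above (the helpers named here are
   hypothetical, not the real API; the real hook is the commented-out
   rapid-scan call in scan_left() below):

	while (counted nodes >= FLUSH_RELOCATE_THRESHOLD) {
		twig = parent of the current leftpoint candidate;
		leaf = leftmost dirty child of twig;
		if (left neighbour of leaf is clean)
			break;                  leaf is the leftpoint
		candidate = rightmost dirty child of the twig to the left;
	}
*/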
static int
scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
{
	int ret = 0;

	scan->max_count = limit;
	scan->direction = LEFT_SIDE;

	ret = scan_set_current(scan, jref(node), 1, NULL);
	if (ret != 0)
		return ret;

	ret = scan_common(scan, right);
	if (ret != 0)
		return ret;

	/* Before rapid scanning, we need a lock on scan->node so that we can
	   get its parent, only if formatted. */
	if (jnode_is_znode(scan->node)) {
		ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
					  ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
	}

	/* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD)
	 */
	return ret;
}
/* Performs rightward scanning... Does not count the starting node. The limit
   parameter is described in scan_left. If the starting node is unformatted
   then the parent_coord was already set during scan_left. The rapid_after
   parameter is not used during right-scanning.

   scan_right is only called if the scan_left operation does not count at
   least FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit
   parameter is set to the difference between scan-left's count and
   FLUSH_RELOCATE_THRESHOLD, meaning scan-right counts as high as
   FLUSH_RELOCATE_THRESHOLD and then stops. */
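/* For example (the numbers are only illustrative, not the tuned defaults):
   if FLUSH_RELOCATE_THRESHOLD were 64 and scan-left counted 40 nodes, then
   scan_right() would be called with limit = 64 - 40 = 24, so the combined
   count never needs to exceed the threshold. */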
static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
{
	int ret;

	scan->max_count = limit;
	scan->direction = RIGHT_SIDE;

	ret = scan_set_current(scan, jref(node), 0, NULL);
	if (ret != 0)
		return ret;

	return scan_common(scan, NULL);
}
/* Common code to perform left or right scanning. */
static int scan_common(flush_scan * scan, flush_scan * other)
{
	int ret;

	assert("nikita-2376", scan->node != NULL);
	assert("edward-54", jnode_is_unformatted(scan->node)
	       || jnode_is_znode(scan->node));

	/* Special case for starting at an unformatted node. Optimization: we
	   only want to search for the parent (which requires a tree traversal)
	   once. Obviously, we shouldn't have to call it once for the left scan
	   and once for the right scan. For this reason, if we search for the
	   parent during scan-left we then duplicate the coord/lock/load into
	   the scan-right object. */
	if (jnode_is_unformatted(scan->node)) {
		ret = scan_unformatted(scan, other);
		if (ret != 0)
			return ret;
	}

	/* This loop expects to start at a formatted position and performs
	   chaining of formatted regions */
	while (!reiser4_scan_finished(scan)) {

		ret = scan_formatted(scan);
		if (ret != 0)
			return ret;
	}

	return 0;
}
static int scan_unformatted(flush_scan * scan, flush_scan * other)
{
	int ret = 0;
	int try = 0;

	if (!coord_is_invalid(&scan->parent_coord))
		goto scan;

	/* set parent coord from */
	if (!jnode_is_unformatted(scan->node)) {
		/* formatted position */

		lock_handle lock;
		assert("edward-301", jnode_is_znode(scan->node));
		init_lh(&lock);

		/*
		 * when flush starts from unformatted node, first thing it
		 * does is tree traversal to find formatted parent of starting
		 * node. This parent is then kept locked across scans to the
		 * left and to the right. This means that during scan to the
		 * left we cannot take a left-ward lock, because this is
		 * dead-lock prone. So, if we are scanning to the left and
		 * there is already a lock held by this thread,
		 * jnode_lock_parent_coord() should use try-lock.
		 */
		try = reiser4_scanning_left(scan)
		    && !lock_stack_isclean(get_current_lock_stack());
		/* Need the node locked to get the parent lock, We have to
		   take write lock since there is at least one call path
		   where this znode is already write-locked by us. */
		ret =
		    longterm_lock_znode(&lock, JZNODE(scan->node),
					ZNODE_WRITE_LOCK,
					reiser4_scanning_left(scan) ?
					ZNODE_LOCK_LOPRI : ZNODE_LOCK_HIPRI);
		if (ret != 0)
			/* EINVAL or E_DEADLOCK here mean... try again! At this
			   point we've scanned too far and can't back out, just
			   start over. */
			return ret;

		ret = jnode_lock_parent_coord(scan->node,
					      &scan->parent_coord,
					      &scan->parent_lock,
					      &scan->parent_load,
					      ZNODE_WRITE_LOCK, try);

		/* FIXME(C): check EINVAL, E_DEADLOCK */
		done_lh(&lock);
		if (ret == -E_REPEAT) {
			scan->stop = 1;
			return 0;
		}
		if (ret)
			return ret;

	} else {
		/* unformatted position */

		ret =
		    jnode_lock_parent_coord(scan->node, &scan->parent_coord,
					    &scan->parent_lock,
					    &scan->parent_load,
					    ZNODE_WRITE_LOCK, try);

		if (ret == CBK_COORD_NOTFOUND)
			/* FIXME(C): check EINVAL, E_DEADLOCK */
			return ret;

		/* parent was found */
		assert("jmacd-8661", other != NULL);
		/* Duplicate the reference into the other flush_scan. */
		coord_dup(&other->parent_coord, &scan->parent_coord);
		copy_lh(&other->parent_lock, &scan->parent_lock);
		copy_load_count(&other->parent_load, &scan->parent_load);
	}
 scan:
	return scan_by_coord(scan);
}
/* Performs left- or rightward scanning starting from a formatted node. Follow
   left pointers under tree lock as long as:

   - node->left/right is non-NULL
   - node->left/right is connected, dirty
   - node->left/right belongs to the same atom
   - scan has not reached maximum count
*/
static int scan_formatted(flush_scan * scan)
{
	int ret;
	znode *neighbor = NULL;

	assert("jmacd-1401", !reiser4_scan_finished(scan));

	do {
		znode *node = JZNODE(scan->node);

		/* Node should be connected, but if not stop the scan. */
		if (!znode_is_connected(node)) {
			scan->stop = 1;
			break;
		}

		/* Lock the tree, check-for and reference the next sibling. */
		read_lock_tree(znode_get_tree(node));

		/* It may be that a node is inserted or removed between a node
		   and its left sibling while the tree lock is released, but
		   the flush-scan count does not need to be precise. Thus, we
		   release the tree lock as soon as we get the neighboring
		   node. */
		neighbor =
		    reiser4_scanning_left(scan) ? node->left : node->right;
		if (neighbor != NULL)
			zref(neighbor);

		read_unlock_tree(znode_get_tree(node));

		/* If neighbor is NULL at the leaf level, need to check for an
		   unformatted sibling using the parent--break in any case. */
		if (neighbor == NULL)
			break;

		/* Check the condition for going left, break if it is not met.
		   This also releases (jputs) the neighbor if false. */
		if (!reiser4_scan_goto(scan, ZJNODE(neighbor)))
			break;

		/* Advance the flush_scan state to the left, repeat. */
		ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
		if (ret != 0)
			return ret;

	} while (!reiser4_scan_finished(scan));

	/* If neighbor is NULL then we reached the end of a formatted region,
	   or else the sibling is out of memory, now check for an extent to the
	   left (as long as LEAF_LEVEL). */
	if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
	    || reiser4_scan_finished(scan)) {
		scan->stop = 1;
		return 0;
	}
	/* Otherwise, calls scan_by_coord for the right(left)most item of the
	   left(right) neighbor on the parent level, then possibly continue. */

	coord_init_invalid(&scan->parent_coord, NULL);
	return scan_unformatted(scan, NULL);
}
/* This scans adjacent items of the same type and calls scan flush plugin for
   each one. Performs left(right)ward scanning starting from a (possibly)
   unformatted node. If we start from unformatted node, then we continue only
   if the next neighbor is also unformatted. When called from scan_formatted,
   we skip first iteration (to make sure that right(left)most item of the
   left(right) neighbor on the parent level is of the same type and set
   appropriate coord). */
static int scan_by_coord(flush_scan * scan)
{
	int ret = 0;
	int scan_this_coord;
	lock_handle next_lock;
	load_count next_load;
	coord_t next_coord;
	jnode *child;
	item_plugin *iplug;

	init_lh(&next_lock);
	init_load_count(&next_load);
	scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);

	/* set initial item id */
	iplug = item_plugin_by_coord(&scan->parent_coord);

	for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
		if (scan_this_coord) {
			/* Here we expect that unit is scannable. it would not
			 * be so due to race with extent->tail conversion. */
			if (iplug->f.scan == NULL) {
				scan->stop = 1;
				ret = -E_REPEAT;
				/* skip the check at the end. */
				goto race;
			}

			ret = iplug->f.scan(scan);
			if (ret != 0)
				goto exit;

			if (reiser4_scan_finished(scan))
				break;
		} else {
			/* the same race against truncate as above is possible
			 * here, it seems */

			/* NOTE-JMACD: In this case, apply the same end-of-node
			   logic but don't scan the first coordinate. */
			assert("jmacd-1231",
			       item_is_internal(&scan->parent_coord));
		}

		if (iplug->f.utmost_child == NULL
		    || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
			/* stop this coord and continue on parent level */
			ret =
			    scan_set_current(scan,
					     ZJNODE(zref
						    (scan->parent_coord.node)),
					     1, NULL);
			if (ret != 0)
				goto exit;
			break;
		}

		/* Either way, the invariant is that scan->parent_coord is set
		   to the parent of scan->node. Now get the next unit. */
		coord_dup(&next_coord, &scan->parent_coord);
		coord_sideof_unit(&next_coord, scan->direction);

		/* If off-the-end of the twig, try the next twig. */
		if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
			/* We take the write lock because we may start flushing
			 * from this coordinate. */
			ret = neighbor_in_slum(next_coord.node,
					       &next_lock, scan->direction,
					       ZNODE_WRITE_LOCK,
					       1 /* check dirty */,
					       0 /* don't go though upper
						    levels */);
			if (ret == -E_NO_NEIGHBOR) {
				scan->stop = 1;
				ret = 0;
				break;
			}

			if (ret != 0)
				goto exit;

			ret = incr_load_count_znode(&next_load,
						    next_lock.node);
			if (ret != 0)
				goto exit;

			coord_init_sideof_unit(&next_coord, next_lock.node,
					       sideof_reverse(scan->direction));
		}

		iplug = item_plugin_by_coord(&next_coord);

		/* Get the next child. */
		ret =
		    iplug->f.utmost_child(&next_coord,
					  sideof_reverse(scan->direction),
					  &child);
		if (ret != 0)
			goto exit;
		/* If the next child is not in memory, or, item_utmost_child
		   failed (due to race with unlink, most probably), stop
		   scanning. */
		if (child == NULL || IS_ERR(child)) {
			scan->stop = 1;
			break;
		}

		assert("nikita-2374", jnode_is_unformatted(child)
		       || jnode_is_znode(child));

		/* See if it is dirty, part of the same atom. */
		if (!reiser4_scan_goto(scan, child))
			break;

		/* If so, make this child current. */
		ret = scan_set_current(scan, child, 1, &next_coord);
		if (ret != 0)
			goto exit;

		/* Now continue. If formatted we release the parent lock and
		   return, then proceed. */
		if (jnode_is_znode(child))
			break;

		/* Otherwise, repeat the above loop with next_coord. */
		if (next_load.node != NULL) {
			done_lh(&scan->parent_lock);
			move_lh(&scan->parent_lock, &next_lock);
			move_load_count(&scan->parent_load, &next_load);
		}
	}

	assert("jmacd-6233",
	       reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
 exit:
 race:			/* skip the above check */
	if (jnode_is_znode(scan->node)) {
		done_lh(&scan->parent_lock);
		done_load_count(&scan->parent_load);
	}

	done_load_count(&next_load);
	done_lh(&next_lock);
	return ret;
}
/* FLUSH POS HELPERS */

/* Initialize the fields of a flush_position. */
static void pos_init(flush_pos_t *pos)
{
	memset(pos, 0, sizeof *pos);

	pos->state = POS_INVALID;
	coord_init_invalid(&pos->coord, NULL);
	init_lh(&pos->lock);
	init_load_count(&pos->load);

	reiser4_blocknr_hint_init(&pos->preceder);
}

/* The flush loop inside squalloc periodically checks pos_valid to determine
   when "enough flushing" has been performed. This will return true until one
   of the following conditions is met:

   1. the number of flush-queued nodes has reached the kernel-supplied
   "int *nr_to_flush" parameter, meaning we have flushed as many blocks as the
   kernel requested. When flushing to commit, this parameter is NULL.

   2. pos_stop() is called because squalloc discovers that the "next" node in
   the flush order is either non-existent, not dirty, or not in the same atom.
*/
static int pos_valid(flush_pos_t *pos)
{
	return pos->state != POS_INVALID;
}

/* Release any resources of a flush_position. Called when jnode_flush
   finishes. */
static void pos_done(flush_pos_t *pos)
{
	pos_stop(pos);
	reiser4_blocknr_hint_done(&pos->preceder);
	if (convert_data(pos))
		free_convert_data(pos);
}

/* Reset the point and parent. Called during flush subroutines to terminate the
   squalloc loop. */
static int pos_stop(flush_pos_t *pos)
{
	pos->state = POS_INVALID;
	done_lh(&pos->lock);
	done_load_count(&pos->load);
	coord_init_invalid(&pos->coord, NULL);

	if (pos->child) {
		jput(pos->child);
		pos->child = NULL;
	}

	return 0;
}

/* Return the flush_position's block allocator hint. */
reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t *pos)
{
	return &pos->preceder;
}

flush_queue_t *reiser4_pos_fq(flush_pos_t *pos)
{
	return pos->fq;
}

/* Make Linus happy.
   Local variables:
   c-indentation-style: "K&R"
   mode-name: "LC"
   c-basic-offset: 8
   tab-width: 8
   fill-column: 90
   LocalWords:  preceder
   End:
*/