1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4 /* The design document for this file is at http://www.namesys.com/v4/v4.html. */
11 #include "plugin/item/item.h"
12 #include "plugin/plugin.h"
13 #include "plugin/object.h"
17 #include "block_alloc.h"
18 #include "tree_walk.h"
23 #include "page_cache.h"
31 #include <asm/atomic.h>
32 #include <linux/fs.h> /* for struct super_block */
33 #include <linux/mm.h> /* for struct page */
34 #include <linux/bio.h> /* for struct bio */
35 #include <linux/pagemap.h>
36 #include <linux/blkdev.h>
38 /* IMPLEMENTATION NOTES */
40 /* PARENT-FIRST: Some terminology: A parent-first traversal is a way of
41 assigning a total order to the nodes of the tree in which the parent is
42 placed before its children, which are ordered (recursively) in left-to-right
43 order. When we speak of a "parent-first preceder", it describes the node that
44 "came before in forward parent-first order". When we speak of a "parent-first
45 follower", it describes the node that "comes next in parent-first order"
46 (alternatively the node that "came before in reverse parent-first order").
The following pseudo-code prints the nodes of a tree in forward parent-first
order:

void parent_first (node)
{
	print_node (node);
	if (node->level > leaf) {
		for (i = 0; i < num_children; i += 1) {
			parent_first (node->child[i]);
		}
	}
}
*/
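/* For example, in a small tree with root R, internal nodes A and B, and
   leaves (a1, a2) under A and (b1, b2) under B, forward parent-first order
   is: R, A, a1, a2, B, b1, b2. */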
62 /* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block
63 allocation so that a left-to-right scan of the tree's data (i.e., the leaves
64 in left-to-right order) can be accomplished with sequential reads, which
65 results in reading nodes in their parent-first order. This is a
66 read-optimization aspect of the flush algorithm, and there is also a
67 write-optimization aspect, which is that we wish to make large sequential
68 writes to the disk by allocating or reallocating blocks so that they can be
69 written in sequence. Sometimes the read-optimization and write-optimization
70 goals conflict with each other, as we discuss in more detail below.
73 /* STATE BITS: The flush code revolves around the state of the jnodes it covers.
74 Here are the relevant jnode->state bits and their relevance to flush:
76 JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be
77 written it must be allocated first. In order to be considered allocated,
78 the jnode must have exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These
79 two bits are exclusive, and all dirtied jnodes eventually have one of these
80 bits set during each transaction.
82 JNODE_CREATED: The node was freshly created in its transaction and has no
83 previous block address, so it is unconditionally assigned to be relocated,
84 although this is mainly for code-convenience. It is not being 'relocated'
85 from anything, but in almost every regard it is treated as part of the
86 relocate set. The JNODE_CREATED bit remains set even after JNODE_RELOC is
87 set, so the actual relocate can be distinguished from the
88 created-and-allocated set easily: relocate-set members (belonging to the
89 preserve-set) have (JNODE_RELOC) set and created-set members which have no
90 previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
92 JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm
93 made the decision to maintain the pre-existing location for this node and
94 it will be written to the wandered-log.
96 JNODE_RELOC: The flush algorithm made the decision to relocate this block
97 (if it was not created, see note above). A block with JNODE_RELOC set is
98 eligible for early-flushing and may be submitted during flush_empty_queues.
99 When the JNODE_RELOC bit is set on a znode, the parent node's internal item
100 is modified and the znode is rehashed.
102 JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm
103 scans the node and calls the plugin->f.squeeze() method for its items. This
104 is how disk clusters of cryptcompress objects are updated. Also, if the
105 leftmost point found by the flush scan has this flag set (it races with
106 write(), a rare case), the flush algorithm decides to pass it to
107 squalloc() in spite of its flushprepped status, for squeezing, not for allocation.
110 JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode
111 into its flush queue. This means the jnode is not on any clean or dirty
112 list, instead it is moved to one of the flush queue (see flush_queue.h)
113 object's private lists. This prevents multiple concurrent flushes from
114 attempting to start flushing from the same node.
116 (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
117 squeeze-and-allocate on a node while its children are actively being
118 squeezed and allocated. This flag was created to avoid submitting a write
119 request for a node while its children are still being allocated and
120 squeezed. Then the flush queue was re-implemented to allow an unlimited number
121 of nodes to be queued. This flag support was commented out in the source code because
122 we decided that there was no reason to submit queued nodes before
123 jnode_flush() finishes. However, current code calls fq_write() during a
124 slum traversal and may submit "busy nodes" to disk. Probably we can
125 re-enable the JNODE_FLUSH_BUSY bit support in future.
127 With these state bits, we describe a test used frequently in the code below,
128 jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()).
129 The test for "flushprepped" returns true if any of the following are true:
131 - The node is not dirty
132 - The node has JNODE_RELOC set
133 - The node has JNODE_OVRWR set
135 If either the node is not dirty or it has already been processed by flush
136 (and assigned JNODE_OVRWR or JNODE_RELOC), then it is prepped. If
137 jnode_is_flushprepped() returns true then flush has no work to do on that node.
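/* A minimal sketch of the "flushprepped" test described above. The real
   jnode_is_flushprepped() and jnode_check_flushprepped() are defined
   elsewhere; this illustrative helper only restates the three conditions: */
static inline int flushprepped_sketch(jnode * node)
{
	return !JF_ISSET(node, JNODE_DIRTY) ||
	    JF_ISSET(node, JNODE_RELOC) || JF_ISSET(node, JNODE_OVRWR);
}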
140 /* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
141 flushprepped twice (unless an explicit call to flush_unprep is made as
142 described in detail below). For example a node is dirtied, allocated, and
143 then early-flushed to disk and set clean. Before the transaction commits, the
144 page is dirtied again and, due to memory pressure, the node is flushed again.
145 The flush algorithm will not relocate the node to a new disk location, it
146 will simply write it to the same, previously relocated position again.
149 /* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm
150 where we start at a leaf node and allocate in parent-first order by iterating
151 to the right. At each step of the iteration, we check for the right neighbor.
152 Before advancing to the right neighbor, we check if the current position and
153 the right neighbor share the same parent. If they do not share the same
154 parent, the parent is allocated before the right neighbor.
156 This process goes recursively up the tree, squeezing nodes level by level, as
157 long as the right neighbor and the current position have different parents,
158 then it allocates the right-neighbors-with-different-parents on the way back
159 down. This process is described in more detail in
160 flush_squalloc_changed_ancestor and the recursive function
161 squalloc_one_changed_ancestor. But the purpose here is not so much to discuss
162 the specifics of the bottom-up approach as to contrast the bottom-up and top-down approaches.
165 The top-down algorithm was implemented earlier (April-May 2002). In the
166 top-down approach, we find a starting point by scanning left along each level
167 past dirty nodes, then going up and repeating the process until the left node
168 and the parent node are clean. We then perform a parent-first traversal from
169 the starting point, which makes allocating in parent-first order trivial.
170 After one subtree has been allocated in this manner, we move to the right,
171 try moving upward, then repeat the parent-first traversal.
173 Both approaches have problems that need to be addressed. Both are
174 approximately the same amount of code, but the bottom-up approach has
175 advantages in the order it acquires locks which, at the very least, make it
176 the better approach. At first glance each one makes the other one look
177 simpler, so it is important to remember a few of the problems with each one.
179 Main problem with the top-down approach: When you encounter a clean child
180 during the parent-first traversal, what do you do? You would like to avoid
181 searching through a large tree of nodes just to find a few dirty leaves at
182 the bottom, and there is not an obvious solution. One of the advantages of
183 the top-down approach is that during the parent-first traversal you check
184 every child of a parent to see if it is dirty. In this way, the top-down
185 approach easily handles the main problem of the bottom-up approach:
186 unallocated children.
188 The unallocated children problem is that before writing a node to disk we
189 must make sure that all of its children are allocated. Otherwise, writing
190 the node means extra I/O because the node will have to be written again when
191 the child is finally allocated.
193 WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs,
194 this should not cause any file system corruption, it only degrades I/O
195 performance because a node may be written when it is sure to be written at
196 least one more time in the same transaction when the remaining children are
197 allocated. What follows is a description of how we will solve the problem.
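/* A rough pseudo-code sketch of the bottom-up iteration described at the top
   of this comment (names are illustrative; the real work is done by
   squalloc() and the squalloc_*_changed_ancestor routines):

   while (flush position is valid) {
	   right = right_neighbor(position);
	   if (!same_parent(position, right))
		   squeeze and allocate the changed ancestors of right;
	   advance position to right;
   }
*/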
200 /* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node,
201 then proceeding in parent first order, allocate some of its left-children,
202 then encounter a clean child in the middle of the parent. We do not allocate
203 the clean child, but there may remain unallocated (dirty) children to the
204 right of the clean child. If we were to stop flushing at this moment and
205 write everything to disk, the parent might still contain unallocated children.
208 We could try to allocate all the descendants of every node that we allocate,
209 but this is not necessary. Doing so could result in allocating the entire
210 tree: if the root node is allocated then every unallocated node would have to
211 be allocated before flushing. Actually, we do not have to write a node just
212 because we allocate it. It is possible to allocate but not write a node
213 during flush, when it still has unallocated children. However, this approach
214 is probably not optimal for the following reason.
216 The flush algorithm is designed to allocate nodes in parent-first order in an
217 attempt to optimize reads that occur in the same order. Thus we are
218 read-optimizing for a left-to-right scan through all the leaves in the
219 system, and we are hoping to write-optimize at the same time because those
220 nodes will be written together in batch. What happens, however, if we assign
221 a block number to a node in its read-optimized order but then avoid writing
222 it because it has unallocated children? In that situation, we lose out on the
223 write-optimization aspect because a node will have to be written again to
224 its location on the device, later, which likely means seeking back to that
227 So there are tradeoffs. We can choose either:
229 A. Allocate all unallocated children to preserve both write-optimization and
230 read-optimization, but this is not always desirable because it may mean
231 having to allocate and flush very many nodes at once.
233 B. Defer writing nodes with unallocated children, keep their read-optimized
234 locations, but sacrifice write-optimization because those nodes will be
237 C. Defer writing nodes with unallocated children, but do not keep their
238 read-optimized locations. Instead, choose to write-optimize them later, when
239 they are written. To facilitate this, we "undo" the read-optimized allocation
240 that was given to the node so that later it can be write-optimized, thus
241 "unpreparing" the flush decision. This is a case where we disturb the
242 FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a call to
243 flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
244 if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate
245 its block location, and set the JNODE_CREATED bit, effectively setting the
246 node back to an unallocated state.
248 We will take the following approach in v4.0: for twig nodes we will always
249 finish allocating unallocated children (A). For nodes with (level > TWIG)
250 we will defer writing and choose write-optimization (C).
252 To summarize, there are several parts to a solution that avoids the problem
253 with unallocated children:
255 FIXME-ZAM: Still, no approach has been implemented to eliminate the
256 "UNALLOCATED CHILDREN" problem, because an experiment
257 showed that we have only 1-2 nodes with unallocated children for thousands of
258 written nodes. The experiment was simple, like copying/deleting the Linux kernel
259 sources. However, the problem can arise in more complex tests. I think we have
260 jnode_io_hook to insert a check for unallocated children and see what kind of
263 1. When flush reaches a stopping point (e.g. a clean node) it should continue
264 calling squeeze-and-allocate on any remaining unallocated children.
265 FIXME: Difficulty to implement: should be simple -- amounts to adding a while
266 loop to jnode_flush, see comments in that function.
268 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes
269 may still have unallocated children. If the twig level has unallocated
270 children it is an assertion failure. If a higher-level node has unallocated
271 children, then it should be explicitly de-allocated by a call to
273 FIXME: Difficulty to implement: should be simple.
275 3. (CPU-Optimization) Checking whether a node has unallocated children may
276 consume more CPU cycles than we would like, and it is possible (but medium
277 complexity) to optimize this somewhat in the case where large sub-trees are
278 flushed. The following observation helps: if both the left- and
279 right-neighbor of a node are processed by the flush algorithm then the node
280 itself is guaranteed to have all of its children allocated. However, the cost
281 of this check may not be so expensive after all: it is not needed for leaves
282 and flush can guarantee this property for twigs. That leaves only (level >
283 TWIG) nodes that have to be checked, so this optimization only helps if at
284 least three (level > TWIG) nodes are flushed in one pass, and the savings
285 will be very small unless there are many more (level > TWIG) nodes. But if
286 there are many (level > TWIG) nodes then the number of blocks being written
287 will be very large, so the savings may be insignificant. That said, the idea
288 is to maintain both the left and right edges of nodes that are processed in
289 flush. When flush_empty_queue() is called, a relatively simple test will
290 tell whether the (level > TWIG) node is on the edge. If it is on the edge,
291 the slow check is necessary, but if it is in the interior then it can be
292 assumed to have all of its children allocated. FIXME: medium complexity to
293 implement, but simple to verify given that we must have a slow check anyway.
295 4. (Optional) This part is optional, not for v4.0--flush should work
296 independently of whether this option is used or not. Called RAPID_SCAN, the
297 idea is to amend the left-scan operation to take unallocated children into
298 account. Normally, the left-scan operation goes left as long as adjacent
299 nodes are dirty up until some large maximum value (FLUSH_SCAN_MAXNODES) at
300 which point it stops and begins flushing. But scan-left may stop at a
301 position where there are unallocated children to the left with the same
302 parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops
303 after FLUSH_RELOCATE_THRESHOLD, which is much smaller than
304 FLUSH_SCAN_MAXNODES, then proceeds with a rapid scan. The rapid scan skips
305 all the interior children of a node--if the leftmost child of a twig is
306 dirty, check its left neighbor (the rightmost child of the twig to the left).
307 If the left neighbor of the leftmost child is also dirty, then continue the
308 scan at the left twig and repeat. This option will cause flush to allocate
309 more twigs in a single pass, but it also has the potential to write many more
310 nodes than would otherwise be written without the RAPID_SCAN option.
311 RAPID_SCAN was partially implemented, code removed August 12, 2002 by JMACD.
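/* A hedged sketch of the "unprep" step from option (C) above, assuming the
   JF_SET()/JF_CLR() jnode flag helpers from jnode.h. The real flush_unprep()
   path (including the non-deferred block deallocation) is not shown; this
   only illustrates the state-bit changes described earlier: */
static inline void unprep_sketch(jnode * node)
{
	if (JF_ISSET(node, JNODE_OVRWR)) {
		/* wandered node: unset the overwrite bit */
		JF_CLR(node, JNODE_OVRWR);
	} else if (JF_ISSET(node, JNODE_RELOC)) {
		/* relocated node: unset the relocate bit; its block is
		   deallocated (non-deferred) elsewhere, and marking it
		   created returns it to an effectively unallocated state */
		JF_CLR(node, JNODE_RELOC);
		JF_SET(node, JNODE_CREATED);
	}
}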
314 /* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that
315 the starting point for flush is a leaf node, but actually the flush code
316 cares very little about whether or not this is true. It is possible that all
317 the leaf nodes are flushed and dirty parent nodes still remain, in which case
318 jnode_flush() is called on a non-leaf argument. Flush doesn't care--it treats
319 the argument node as if it were a leaf, even when it is not. This is a simple
320 approach, and there may be a more optimal policy but until a problem with
321 this approach is discovered, simplest is probably best.
323 NOTE: In this case, the ordering produced by flush is parent-first only if
324 you ignore the leaves. This is done as a matter of simplicity and there is
325 only one (shaky) justification. When an atom commits, it flushes all leaf
326 level nodes first, followed by twigs, and so on. With flushing done in this
327 order, if flush is eventually called on a non-leaf node it means that
328 (somehow) we reached a point where all leaves are clean and only internal
329 nodes need to be flushed. If that is the case, then it means there were no
330 leaves that were the parent-first preceder/follower of the parent. This is
331 expected to be a rare case, which is why we do nothing special about it.
332 However, memory pressure may pass an internal node to flush when there are
333 still dirty leaf nodes that need to be flushed, which could prove our
334 original assumptions "inoperative". If this needs to be fixed, then
335 scan_left/right should have special checks for the non-leaf levels. For
336 example, instead of passing from a node to the left neighbor, it should pass
337 from the node to the left neighbor's rightmost descendant (if dirty).
341 /* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB
342 chunks, dirtying everything and putting it into a transaction. We tell the
343 allocator to allocate the blocks as far as possible towards one end of the
344 logical device--the left (starting) end of the device if we are walking from
345 left to right, the right end of the device if we are walking from right to
346 left. We then make passes in alternating directions, and as we do this the
347 device becomes sorted such that tree order and block number order fully
350 Resizing is done by shifting everything either all the way to the left or all
351 the way to the right, and then reporting the last block.
354 /* RELOCATE DECISIONS: The code makes a decision to relocate in several places.
355 This describes the policy from the highest level:
357 The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive
358 nodes on the leaf level during flush-scan (right, left), then we
359 unconditionally decide to relocate leaf nodes.
361 Otherwise, there are two contexts in which we make a decision to relocate:
363 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
364 During the initial stages of flush, after scan-right completes, we want to
365 ask the question: should we relocate this leaf node and thus dirty the parent
366 node? Then, if the node is a leftmost child, its parent is its own parent-first
367 preceder, so we repeat the question at the next level up, and so on. In
368 these cases we are moving in the reverse parent-first direction.
370 There is another case which is considered the reverse direction, which comes
371 at the end of a twig in reverse_relocate_end_of_twig(). As we finish
372 processing a twig we may reach a point where there is a clean twig to the
373 right with a dirty leftmost child. In this case, we may wish to relocate the
374 child by testing if it should be relocated relative to its parent.
376 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done
377 in allocate_znode. What distinguishes the forward parent-first case from the
378 reverse parent-first case is that the preceder has already been allocated in
379 the forward case, whereas in the reverse case we don't know what the preceder
380 is until we finish "going in reverse". That simplifies the forward case
381 considerably, and there we actually use the block allocator to determine
382 whether, e.g., a block closer to the preceder is available.
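/* A minimal sketch, assuming only the policy described above, of the reverse
   parent-first "close enough" distance test; the real check (with its
   assertions) is reverse_relocate_if_close_enough() further below: */
static inline int relocate_distance_sketch(reiser4_block_nr pblk,
					   reiser4_block_nr nblk,
					   reiser4_block_nr max_dist)
{
	reiser4_block_nr dist = (pblk > nblk) ? (pblk - nblk) : (nblk - pblk);

	/* within max_dist of the preceder: keep the old location (overwrite
	   set) rather than relocating */
	return dist <= max_dist;
}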
385 /* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is,
386 once we finish scan-left and find a starting point, if the parent's left
387 neighbor is dirty then squeeze the parent's left neighbor and the parent.
388 This may change the flush-starting-node's parent. Repeat until the child's
389 parent is stable. If the child is a leftmost child, repeat this left-edge
390 squeezing operation at the next level up. Note that we cannot allocate
391 extents during this or they will be out of parent-first order. There are also
392 some difficult coordinate maintenance issues. We can't do a tree search to
393 find coordinates again (because we hold locks), we have to determine them
394 from the two nodes being squeezed. Looks difficult, but has potential to
395 increase space utilization. */
/* Flush-scan helper functions. */
static void scan_init(flush_scan * scan);
static void scan_done(flush_scan * scan);

/* Flush-scan algorithm. */
static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
		     unsigned limit);
static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
static int scan_common(flush_scan * scan, flush_scan * other);
static int scan_formatted(flush_scan * scan);
static int scan_unformatted(flush_scan * scan, flush_scan * other);
static int scan_by_coord(flush_scan * scan);

/* Initial flush-point ancestor allocation. */
static int alloc_pos_and_ancestors(flush_pos_t *pos);
static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos);
static int set_preceder(const coord_t *coord_in, flush_pos_t *pos);

/* Main flush algorithm.
   Note on abbreviation: "squeeze and allocate" == "squalloc". */
static int squalloc(flush_pos_t *pos);

/* Flush squeeze implementation. */
static int squeeze_right_non_twig(znode * left, znode * right);
static int shift_one_internal_unit(znode * left, znode * right);

/* Flush reverse parent-first relocation routines. */
static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
					    const reiser4_block_nr * nblk);
static int reverse_relocate_test(jnode * node, const coord_t *parent_coord,
				 flush_pos_t *pos);
static int reverse_relocate_check_dirty_parent(jnode * node,
					       const coord_t *parent_coord,
					       flush_pos_t *pos);

/* Flush allocate write-queueing functions: */
static int allocate_znode(znode * node, const coord_t *parent_coord,
			  flush_pos_t *pos);
static int allocate_znode_update(znode * node, const coord_t *parent_coord,
				 flush_pos_t *pos);
static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);

/* Flush helper functions: */
static int jnode_lock_parent_coord(jnode * node,
				   coord_t *coord,
				   lock_handle * parent_lh,
				   load_count * parent_zh,
				   znode_lock_mode mode, int try);
static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
			    znode_lock_mode mode, int check_dirty, int expected);
static int znode_same_parents(znode * a, znode * b);

static int znode_check_flushprepped(znode * node)
{
	return jnode_check_flushprepped(ZJNODE(node));
}

/* Flush position functions */
static void pos_init(flush_pos_t *pos);
static int pos_valid(flush_pos_t *pos);
static void pos_done(flush_pos_t *pos);
static int pos_stop(flush_pos_t *pos);
460 /* check that @org is first jnode extent unit, if extent is unallocated,
461 * because all jnodes of unallocated extent are dirty and of the same atom. */
462 #define checkchild(scan) \
463 assert("nikita-3435", \
464 ergo(scan->direction == LEFT_SIDE && \
465 (scan->parent_coord.node->level == TWIG_LEVEL) && \
466 jnode_is_unformatted(scan->node) && \
467 extent_is_unallocated(&scan->parent_coord), \
468 extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
/* This flush_cnt variable is used to track the number of concurrent flush
   operations, useful for debugging. It is initialized in txnmgr.c out of
   laziness (because flush has no static initializer function...) */
ON_DEBUG(atomic_t flush_cnt;)

/* check fs backing device for write congestion */
static int check_write_congestion(void)
{
	struct super_block *sb;
	struct backing_dev_info *bdi;

	sb = reiser4_get_current_sb();
	bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
	return bdi_write_congested(bdi);
}
/* conditionally write flush queue */
static int write_prepped_nodes(flush_pos_t *pos)
{
	int ret;

	assert("zam-831", pos);
	assert("zam-832", pos->fq);

	if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
		return 0;

	if (check_write_congestion())
		return 0;

	ret = reiser4_write_fq(pos->fq, pos->nr_written,
			       WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
	return ret;
}
506 /* Proper release all flush pos. resources then move flush position to new
508 static void move_flush_pos(flush_pos_t
*pos
, lock_handle
* new_lock
,
509 load_count
* new_load
, const coord_t
*new_coord
)
511 assert("zam-857", new_lock
->node
== new_load
->node
);
514 assert("zam-858", new_coord
->node
== new_lock
->node
);
515 coord_dup(&pos
->coord
, new_coord
);
517 coord_init_first_unit(&pos
->coord
, new_lock
->node
);
525 move_load_count(&pos
->load
, new_load
);
527 move_lh(&pos
->lock
, new_lock
);
/* delete an empty node whose link from the parent still exists. */
static int delete_empty_node(znode * node)
{
	reiser4_key smallest_removed;

	assert("zam-1019", node != NULL);
	assert("zam-1020", node_is_empty(node));
	assert("zam-1023", znode_is_wlocked(node));

	return reiser4_delete_node(node, &smallest_removed, NULL, 1);
}
542 /* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
543 static int prepare_flush_pos(flush_pos_t
*pos
, jnode
* org
)
550 init_load_count(&load
);
552 if (jnode_is_znode(org
)) {
553 ret
= longterm_lock_znode(&lock
, JZNODE(org
),
554 ZNODE_WRITE_LOCK
, ZNODE_LOCK_HIPRI
);
558 ret
= incr_load_count_znode(&load
, JZNODE(org
));
563 (jnode_get_level(org
) ==
564 LEAF_LEVEL
) ? POS_ON_LEAF
: POS_ON_INTERNAL
;
565 move_flush_pos(pos
, &lock
, &load
, NULL
);
567 coord_t parent_coord
;
568 ret
= jnode_lock_parent_coord(org
, &parent_coord
, &lock
,
569 &load
, ZNODE_WRITE_LOCK
, 0);
572 if (!item_is_extent(&parent_coord
)) {
573 /* file was converted to tail, org became HB, we found
579 pos
->state
= POS_ON_EPOINT
;
580 move_flush_pos(pos
, &lock
, &load
, &parent_coord
);
581 pos
->child
= jref(org
);
582 if (extent_is_unallocated(&parent_coord
)
583 && extent_unit_index(&parent_coord
) != index_jnode(org
)) {
584 /* @org is not first child of its parent unit. This may
585 happen because the longterm lock of its parent node was
586 released between scan_left and scan_right. For now,
587 work around this by having flush repeat. */
593 done_load_count(&load
);
598 /* TODO LIST (no particular order): */
599 /* I have labelled most of the legitimate FIXME comments in this file with
600 letters to indicate which issue they relate to. There are a few miscellaneous
601 FIXMEs with specific names mentioned instead that need to be
602 inspected/resolved. */
603 /* B. There is an issue described in reverse_relocate_test having to do with an
604 imprecise is_preceder? check for partially-dirty extents. The
605 code that sets preceder hints and computes the preceder is basically
606 untested. Careful testing needs to be done that preceder calculations are
607 done correctly, since if it doesn't affect correctness we will not catch this
608 stuff during regular testing. */
609 /* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of
610 these are considered expected but unlikely conditions. Flush currently
611 returns 0 (i.e., success but no progress, i.e., restart) whenever it receives
612 any of these in jnode_flush(). Many of the calls that may produce one of
613 these return values (i.e., longterm_lock_znode, reiser4_get_parent,
614 reiser4_get_neighbor, ...) check some of these values themselves and, for
615 instance, stop flushing instead of resulting in a restart. If any of these
616 results are true error conditions then flush will go into a busy-loop, as we
617 noticed during testing when a corrupt tree caused find_child_ptr to return
618 ENOENT. It needs careful thought and testing of corner conditions.
620 /* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a
621 created block is assigned a block number then early-flushed to disk. It is
622 dirtied again and flush is called again. Concurrently, that block is deleted,
623 and the de-allocation of its block number does not need to be deferred, since
624 it is not part of the preserve set (i.e., it didn't exist before the
625 transaction). I think there may be a race condition where flush writes the
626 dirty, created block after the non-deferred deallocated block number is
627 re-allocated, making it possible to write deleted data on top of non-deleted
628 data. It's just a theory, but it needs to be thought out. */
629 /* F. bio_alloc() failure is not handled gracefully. */
630 /* G. Unallocated children. */
631 /* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
633 /* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
635 /* JNODE_FLUSH: MAIN ENTRY POINT */
636 /* This is the main entry point for flushing a jnode and its dirty neighborhood
637 (dirty neighborhood is named "slum"). Jnode_flush() is called if reiser4 has
638 to write dirty blocks to disk, it happens when Linux VM decides to reduce
639 number of dirty pages or as a part of transaction commit.
641 Our objective here is to prep and flush the slum the jnode belongs to. We
642 want to squish the slum together, and allocate the nodes in it as we squish
643 because allocation of children affects squishing of parents.
645 The "argument" @node tells flush where to start. From there, flush finds the
646 left edge of the slum, and calls squalloc (in which nodes are squeezed and
647 allocated). To find a "better place" to start squalloc first we perform a
650 Flush-scanning may be performed in both left and right directions, but for
651 different purposes. When scanning to the left, we are searching for a node
652 that precedes a sequence of parent-first-ordered nodes which we will then
653 flush in parent-first order. During flush-scanning, we also take the
654 opportunity to count the number of consecutive leaf nodes. If this number is
655 past some threshold (FLUSH_RELOCATE_THRESHOLD), then we make a decision to
656 reallocate leaf nodes (thus favoring write-optimization).
658 Since the flush argument node can be anywhere in a sequence of dirty leaves,
659 there may also be dirty nodes to the right of the argument. If the scan-left
660 operation does not count at least FLUSH_RELOCATE_THRESHOLD nodes then we
661 follow it with a right-scan operation to see whether there are, in fact,
662 enough nodes to meet the relocate threshold. Each right- and left-scan
663 operation uses a single flush_scan object.
665 After left-scan and possibly right-scan, we prepare a flush_position object
666 with the starting flush point or parent coordinate, which was determined
669 Next we call the main flush routine, squalloc, which iterates along the leaf
670 level, squeezing and allocating nodes (and placing them into the flush
673 After squalloc returns we take extra steps to ensure that all the children
674 of the final twig node are allocated--this involves repeating squalloc
675 until we finish at a twig with no unallocated children.
677 Finally, we call flush_empty_queue to submit write-requests to disk. If we
678 encounter any above-twig nodes during flush_empty_queue that still have
679 unallocated children, we flush_unprep them.
681 Flush treats several "failure" cases as non-failures, essentially causing
682 them to start over. E_DEADLOCK is one example.
683 FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should probably be handled
684 properly rather than restarting, but there are a bunch of cases to audit.
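/* Illustrative arithmetic for the relocate decision described above: if the
   relocate threshold were 64 and scan-left counted 40 dirty leaves, scan-right
   would only need to find 64 - 40 = 24 more nodes before the leaf_relocate
   policy is chosen (see the todo = relocate_threshold - left_scan->count
   computation in jnode_flush() below). The value 64 is only an example; the
   threshold is a per-superblock flush parameter. */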
688 jnode_flush(jnode * node, long nr_to_write, long *nr_written,
689	    flush_queue_t *fq, int flags)
692 flush_scan
*right_scan
;
693 flush_scan
*left_scan
;
694 flush_pos_t
*flush_pos
;
696 struct super_block
*sb
;
697 reiser4_super_info_data
*sbinfo
;
698 jnode
*leftmost_in_slum
= NULL
;
700 assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
701 assert("nikita-3022", reiser4_schedulable());
703 assert("nikita-3185",
704 get_current_super_private()->delete_mutex_owner
!= current
);
706 /* allocate right_scan, left_scan and flush_pos */
708 kmalloc(2 * sizeof(*right_scan
) + sizeof(*flush_pos
),
709 reiser4_ctx_gfp_mask_get());
710 if (right_scan
== NULL
)
711 return RETERR(-ENOMEM
);
712 left_scan
= right_scan
+ 1;
713 flush_pos
= (flush_pos_t
*) (left_scan
+ 1);
715 sb
= reiser4_get_current_sb();
716 sbinfo
= get_super_private(sb
);
718 /* Flush-concurrency debug code */
720 atomic_inc(&flush_cnt
);
723 reiser4_enter_flush(sb
);
725 /* Initialize a flush position. */
728 flush_pos
->nr_written
= nr_written
;
730 flush_pos
->flags
= flags
;
731 flush_pos
->nr_to_write
= nr_to_write
;
733 scan_init(right_scan
);
734 scan_init(left_scan
);
736 /* First scan left and remember the leftmost scan position. If the
737 leftmost position is unformatted we remember its parent_coord. We
738 scan until counting FLUSH_SCAN_MAXNODES.
740 If starting @node is unformatted, at the beginning of left scan its
741 parent (twig level node, containing extent item) will be long term
742 locked and lock handle will be stored in the
743 @right_scan->parent_lock. This lock is used to start the rightward
744 scan without redoing the tree traversal (necessary to find parent)
745 and, hence, is kept during leftward scan. As a result, we have to
746 use try-lock when taking long term locks during the leftward scan.
748 ret
= scan_left(left_scan
, right_scan
,
749 node
, sbinfo
->flush
.scan_maxnodes
);
753 leftmost_in_slum
= jref(left_scan
->node
);
754 scan_done(left_scan
);
756 /* Then possibly go right to decide if we will use a policy of
757 relocating leaves. This is only done if we did not scan past (and
758 count) enough nodes during the leftward scan. If we do scan right,
759 we only care to go far enough to establish that at least
760 FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The scan
761 limit is the difference between left_scan.count and the threshold. */
763 todo
= sbinfo
->flush
.relocate_threshold
- left_scan
->count
;
764 /* scan right is inherently deadlock prone, because we are
765 * (potentially) holding a lock on the twig node at this moment.
766 * FIXME: this is incorrect comment: lock is not held */
768 ret
= scan_right(right_scan
, node
, (unsigned)todo
);
773 /* Only the right-scan count is needed, release any rightward locks
775 scan_done(right_scan
);
777 /* ... and the answer is: we should relocate leaf nodes if at least
778 FLUSH_RELOCATE_THRESHOLD nodes were found. */
779 flush_pos
->leaf_relocate
= JF_ISSET(node
, JNODE_REPACK
) ||
780 (left_scan
->count
+ right_scan
->count
>=
781 sbinfo
->flush
.relocate_threshold
);
783 /* Funny business here. We set the 'point' in the flush_position
784 prior to starting squalloc regardless of whether the first point is
785 formatted or unformatted. Without this there would be an invariant,
786 in the rest of the code, that if the flush_position is unformatted
787 then flush_position->point is NULL and
788 flush_position->parent_{lock,coord} is set, and if the flush_position
789 is formatted then flush_position->point is non-NULL and no parent
792 This seems lazy, but it makes the initial calls to
793 reverse_relocate_test (which asks "is the pos->point the leftmost
794 child of its parent") much easier because we know the first child
795 already. Nothing is broken by this, but the reasoning is subtle.
796 Holding an extra reference on a jnode during flush can cause us to
797 see nodes with HEARD_BANSHEE during squalloc, because nodes are not
798 removed from sibling lists until they have zero reference count.
799 Flush would never observe a HEARD_BANSHEE node on the left-edge of
800 flush, nodes are only deleted to the right. So if nothing is broken,
803 NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
804 point and in any moment, because of the concurrent file system
805 activity (for example, truncate). */
807 /* Check jnode state after flush_scan completed. Having a lock on this
808 node or its parent (in case of unformatted) helps us in case of
809 concurrent flushing. */
810 if (jnode_check_flushprepped(leftmost_in_slum
)
811 && !jnode_convertible(leftmost_in_slum
)) {
816 /* Now setup flush_pos using scan_left's endpoint. */
817 ret
= prepare_flush_pos(flush_pos
, leftmost_in_slum
);
821 if (znode_get_level(flush_pos
->coord
.node
) == LEAF_LEVEL
822 && node_is_empty(flush_pos
->coord
.node
)) {
823 znode
*empty
= flush_pos
->coord
.node
;
825 assert("zam-1022", !ZF_ISSET(empty
, JNODE_HEARD_BANSHEE
));
826 ret
= delete_empty_node(empty
);
830 if (jnode_check_flushprepped(leftmost_in_slum
)
831 && !jnode_convertible(leftmost_in_slum
)) {
836 /* Set pos->preceder and (re)allocate pos and its ancestors if it is
838 ret
= alloc_pos_and_ancestors(flush_pos
);
842 /* Do the main rightward-bottom-up squeeze and allocate loop. */
843 ret
= squalloc(flush_pos
);
848 /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated
849 children. First, the pos_stop() and pos_valid() routines should be
850 modified so that pos_stop() sets a flush_position->stop flag to 1
851 without releasing the current position immediately--instead release
852 it in pos_done(). This is a better implementation than the current
855 It is not clear that all fields of the flush_position should not be
856 released, but at the very least the parent_lock, parent_coord, and
857 parent_load should remain held because they hold the last twig
858 when pos_stop() is called.
860 When we reach this point in the code, if the parent_coord is set to
861 after the last item then we know that flush reached the end of a twig
862 (and according to the new flush queueing design, we will return now).
863 If parent_coord is not past the last item, we should check if the
864 current twig has any unallocated children to the right (we are not
865 concerned with unallocated children to the left--in that case the
866 twig itself should not have been allocated). If the twig has
867 unallocated children to the right, set the parent_coord to that
868 position and then repeat the call to squalloc.
870 Testing for unallocated children may be defined in two ways: if any
871 internal item has a fake block number, it is unallocated; if any
872 extent item is unallocated then all of its children are unallocated.
873 But there is a more aggressive approach: if there are any dirty
874 children of the twig to the right of the current position, we may
875 wish to relocate those nodes now. Checking for potential relocation
876 is more expensive as it requires knowing whether there are any dirty
877 children that are not unallocated. The extent_needs_allocation should
878 be used after setting the correct preceder.
880 When we reach the end of a twig at this point in the code, if the
881 flush can continue (when the queue is ready) it will need some
882 information on the future starting point. That should be stored away
883 in the flush_handle using a seal, I believe. Holding a jref() on the
884 future starting point may break other code that deletes that node.
887 /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is
888 called above the twig level. If the VM calls flush above the twig
889 level, do nothing and return (but figure out why this happens). The
890 txnmgr should be modified to only flush its leaf-level dirty list.
891 This will do all the necessary squeeze and allocate steps but leave
892 unallocated branches and possibly unallocated twigs (when the twig's
893 leftmost child is not dirty). After flushing the leaf level, the
894 remaining unallocated nodes should be given write-optimized
895 locations. (Possibly, the remaining unallocated twigs should be
896 allocated just before their leftmost child.)
899 /* Any failure reaches this point. */
908 /* FIXME(C): Except for E_DEADLOCK, these should probably be
909 handled properly in each case. They already are handled in
911 /* Something bad happened, but difficult to avoid... Try again!
916 if (leftmost_in_slum
)
917 jput(leftmost_in_slum
);
920 scan_done(left_scan
);
921 scan_done(right_scan
);
924 ON_DEBUG(atomic_dec(&flush_cnt
));
926 reiser4_leave_flush(sb
);
931 /* The reiser4 flush subsystem can be put into "rapid flush mode", which means
932  * that the flusher should submit all prepped nodes immediately, without keeping
933  * them in flush queues for a long time. The reason for rapid flush mode is to free
934 * memory as fast as possible. */
936 #if REISER4_USE_RAPID_FLUSH
939 * submit all prepped nodes if rapid flush mode is set,
940 * turn rapid flush mode off.
943 static int rapid_flush(flush_pos_t
*pos
)
945 if (!wbq_available())
948 return write_prepped_nodes(pos
);
953 #define rapid_flush(pos) (0)
955 #endif /* REISER4_USE_RAPID_FLUSH */
957 static jnode
*find_flush_start_jnode(jnode
*start
, txn_atom
* atom
,
958 flush_queue_t
*fq
, int *nr_queued
,
964 spin_lock_jnode(start
);
965 if (!jnode_is_flushprepped(start
)) {
966 assert("zam-1056", start
->atom
== atom
);
970 spin_unlock_jnode(start
);
973 * In this loop we process all already prepped (RELOC or OVRWR) and
974 * dirtied again nodes. The atom spin lock is not released until all
975 dirty nodes are processed or a not-prepped node is found in the atom's dirty
978 while ((node
= find_first_dirty_jnode(atom
, flags
))) {
979 spin_lock_jnode(node
);
981 assert("zam-881", JF_ISSET(node
, JNODE_DIRTY
));
982 assert("zam-898", !JF_ISSET(node
, JNODE_OVRWR
));
984 if (JF_ISSET(node
, JNODE_WRITEBACK
)) {
985 /* move node to the end of atom's writeback list */
986 list_move_tail(&node
->capture_link
, ATOM_WB_LIST(atom
));
989 * jnode is not necessarily on dirty list: if it was
990 * dirtied when it was on flush queue - it does not get
991 * moved to dirty list
993 ON_DEBUG(count_jnode(atom
, node
, NODE_LIST(node
),
996 } else if (jnode_is_znode(node
)
997 && znode_above_root(JZNODE(node
))) {
999 * A special case for znode-above-root. The above-root
1000 * (fake) znode is captured and dirtied when the tree
1001 * height changes or when the root node is relocated.
1002 * This causes atoms to fuse so that changes at the root
1003 * are serialized. However, this node is never flushed.
1004 * This special case used to be in lock.c to prevent the
1005 * above-root node from ever being captured, but now
1006 * that it is captured we simply prevent it from
1007 * flushing. The log-writer code relies on this to
1008 * properly log superblock modifications of the tree
1011 jnode_make_wander_nolock(node
);
1012 } else if (JF_ISSET(node
, JNODE_RELOC
)) {
1013 queue_jnode(fq
, node
);
1018 spin_unlock_jnode(node
);
1023 /* Flush some nodes of current atom, usually slum, return -E_REPEAT if there are
1024 * more nodes to flush, return 0 if the atom's dirty lists are empty and keep the current
1025 * atom locked, return other errors as they are. */
1027 flush_current_atom(int flags
, long nr_to_write
, long *nr_submitted
,
1028 txn_atom
** atom
, jnode
*start
)
1030 reiser4_super_info_data
*sinfo
= get_current_super_private();
1031 flush_queue_t
*fq
= NULL
;
1036 assert("zam-889", atom
!= NULL
&& *atom
!= NULL
);
1037 assert_spin_locked(&((*atom
)->alock
));
1038 assert("zam-892", get_current_context()->trans
->atom
== *atom
);
1040 nr_to_write
= LONG_MAX
;
1042 ret
= reiser4_fq_by_atom(*atom
, &fq
);
1043 if (ret
!= -E_REPEAT
)
1045 *atom
= get_current_atom_locked();
1050 assert_spin_locked(&((*atom
)->alock
));
1052 /* parallel flushers limit */
1053 if (sinfo
->tmgr
.atom_max_flushers
!= 0) {
1054 while ((*atom
)->nr_flushers
>= sinfo
->tmgr
.atom_max_flushers
) {
1055 /* A reiser4_atom_send_event() call is inside
1056 reiser4_fq_put_nolock() which is called when flush is
1057 finished and nr_flushers is decremented. */
1058 reiser4_atom_wait_event(*atom
);
1059 *atom
= get_current_atom_locked();
1063 /* count ourself as a flusher */
1064 (*atom
)->nr_flushers
++;
1066 writeout_mode_enable();
1069 node
= find_flush_start_jnode(start
, *atom
, fq
, &nr_queued
, flags
);
1072 if (nr_queued
== 0) {
1073 (*atom
)->nr_flushers
--;
1074 reiser4_fq_put_nolock(fq
);
1075 reiser4_atom_send_event(*atom
);
1076 /* current atom remains locked */
1077 writeout_mode_disable();
1080 spin_unlock_atom(*atom
);
1083 BUG_ON((*atom
)->super
!= node
->tree
->super
);
1084 spin_unlock_atom(*atom
);
1085 spin_unlock_jnode(node
);
1086 BUG_ON(nr_to_write
== 0);
1087 ret
= jnode_flush(node
, nr_to_write
, nr_submitted
, fq
, flags
);
1092 reiser4_write_fq(fq
, nr_submitted
,
1093 WRITEOUT_SINGLE_STREAM
| WRITEOUT_FOR_PAGE_RECLAIM
);
1095 *atom
= get_current_atom_locked();
1096 (*atom
)->nr_flushers
--;
1097 reiser4_fq_put_nolock(fq
);
1098 reiser4_atom_send_event(*atom
);
1099 spin_unlock_atom(*atom
);
1101 writeout_mode_disable();
1109 /* REVERSE PARENT-FIRST RELOCATION POLICIES */
1111 /* This implements the is-it-close-enough-to-its-preceder? test for relocation
1112 in the reverse parent-first relocate context. Here all we know is the
1113 preceder and the block number. Since we are going in reverse, the preceder
1114 may still be relocated as well, so we can't ask the block allocator "is there
1115 a closer block available to relocate?" here. In the _forward_ parent-first
1116 relocate context (not here) we actually call the block allocator to try and
1117 find a closer location. */
1119 reverse_relocate_if_close_enough(const reiser4_block_nr
* pblk
,
1120 const reiser4_block_nr
* nblk
)
1122 reiser4_block_nr dist
;
1124 assert("jmacd-7710", *pblk
!= 0 && *nblk
!= 0);
1125 assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk
));
1126 assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk
));
1128 /* Distance is the absolute value. */
1129 dist
= (*pblk
> *nblk
) ? (*pblk
- *nblk
) : (*nblk
- *pblk
);
1131 /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from
1132 its preceder block, do not relocate. */
1133 if (dist
<= get_current_super_private()->flush
.relocate_distance
)
1139 /* This function is a predicate that tests for relocation. Always called in the
1140 reverse-parent-first context, when we are asking whether the current node
1141 should be relocated in order to expand the flush by dirtying the parent level
1142 (and thus proceeding to flush that level). When traversing in the forward
1143 parent-first direction (not here), relocation decisions are handled in two
1144 places: allocate_znode() and extent_needs_allocation(). */
1146 reverse_relocate_test(jnode
* node
, const coord_t
*parent_coord
,
1149 reiser4_block_nr pblk
= 0;
1150 reiser4_block_nr nblk
= 0;
1152 assert("jmacd-8989", !jnode_is_root(node
));
1155 * This function is called only from the
1156 * reverse_relocate_check_dirty_parent() and only if the parent
1157 * node is clean. This implies that the parent has the real (i.e., not
1158 * fake) block number, and, so does the child, because otherwise the
1159 * parent would be dirty.
1162 /* New nodes are treated as if they are being relocated. */
1163 if (JF_ISSET(node
, JNODE_CREATED
) ||
1164 (pos
->leaf_relocate
&& jnode_get_level(node
) == LEAF_LEVEL
))
1167 /* Find the preceder. FIXME(B): When the child is an unformatted,
1168 previously existing node, the coord may be leftmost even though the
1169 child is not the parent-first preceder of the parent. If the first
1170 dirty node appears somewhere in the middle of the first extent unit,
1171 this preceder calculation is wrong.
1172 Needs more logic in here. */
1173 if (coord_is_leftmost_unit(parent_coord
)) {
1174 pblk
= *znode_get_block(parent_coord
->node
);
1176 pblk
= pos
->preceder
.blk
;
1178 check_preceder(pblk
);
1180 /* If (pblk == 0) then the preceder isn't allocated or isn't known:
1185 nblk
= *jnode_get_block(node
);
1187 if (reiser4_blocknr_is_fake(&nblk
))
1188 /* child is unallocated, mark parent dirty */
1191 return reverse_relocate_if_close_enough(&pblk
, &nblk
);
1194 /* This function calls reverse_relocate_test to make a reverse-parent-first
1195 relocation decision and then, if yes, it marks the parent dirty. */
1197 reverse_relocate_check_dirty_parent(jnode
* node
, const coord_t
*parent_coord
,
1202 if (!JF_ISSET(ZJNODE(parent_coord
->node
), JNODE_DIRTY
)) {
1204 ret
= reverse_relocate_test(node
, parent_coord
, pos
);
1209 if parent is already relocated - we do not want to grab space,
1214 grabbed
= get_current_context()->grabbed_blocks
;
1215 if (reiser4_grab_space_force((__u64
) 1, BA_RESERVED
) !=
1217 reiser4_panic("umka-1250",
1218 "No space left during flush.");
1220 assert("jmacd-18923",
1221 znode_is_write_locked(parent_coord
->node
));
1222 znode_make_dirty(parent_coord
->node
);
1223 grabbed2free_mark(grabbed
);
1230 /* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE
1231 FORWARD PARENT-FIRST LOOP BEGINS) */
1233 /* Get the leftmost child for given coord. */
1234 static int get_leftmost_child_of_unit(const coord_t
*coord
, jnode
** child
)
1238 ret
= item_utmost_child(coord
, LEFT_SIDE
, child
);
1244 return PTR_ERR(*child
);
1249 /* This step occurs after the left- and right-scans are completed, before
1250 starting the forward parent-first traversal. Here we attempt to allocate
1251 ancestors of the starting flush point, which means continuing in the reverse
1252 parent-first direction to the parent, grandparent, and so on (as long as the
1253 child is a leftmost child). This routine calls a recursive process,
1254 alloc_one_ancestor, which does the real work, except there is special-case
1255 handling here for the first ancestor, which may be a twig. At each level
1256 (here and alloc_one_ancestor), we check for relocation and then, if the child
1257 is a leftmost child, repeat at the next level. On the way back down (the
1258 recursion), we allocate the ancestors in parent-first order. */
1259 static int alloc_pos_and_ancestors(flush_pos_t
*pos
)
1266 if (znode_check_flushprepped(pos
->lock
.node
))
1269 coord_init_invalid(&pcoord
, NULL
);
1271 init_load_count(&pload
);
1273 if (pos
->state
== POS_ON_EPOINT
) {
1274 /* a special case for pos on twig level, where we already have
1275 a lock on parent node. */
1276 /* The parent may not be dirty, in which case we should decide
1277 whether to relocate the child now. If decision is made to
1278 relocate the child, the parent is marked dirty. */
1280 reverse_relocate_check_dirty_parent(pos
->child
, &pos
->coord
,
1285 /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
1286 is leftmost) and the leaf/child, so recursion is not needed.
1287 Levels above the twig will be allocated for
1288 write-optimization before the transaction commits. */
1290 /* Do the recursive step, allocating zero or more of our
1292 ret
= alloc_one_ancestor(&pos
->coord
, pos
);
1295 if (!znode_is_root(pos
->lock
.node
)) {
1296 /* all formatted nodes except tree root */
1298 reiser4_get_parent(&plock
, pos
->lock
.node
,
1303 ret
= incr_load_count_znode(&pload
, plock
.node
);
1308 find_child_ptr(plock
.node
, pos
->lock
.node
, &pcoord
);
1313 reverse_relocate_check_dirty_parent(ZJNODE
1320 ret
= alloc_one_ancestor(&pcoord
, pos
);
1325 ret
= allocate_znode(pos
->lock
.node
, &pcoord
, pos
);
1328 done_load_count(&pload
);
1333 /* This is the recursive step described in alloc_pos_and_ancestors, above.
1334 Ignoring the call to set_preceder, which is the next function described, this
1335 checks if the child is a leftmost child and returns if it is not. If the
1336 child is a leftmost child it checks for relocation, possibly dirtying the
1337 parent. Then it performs the recursive step. */
1338 static int alloc_one_ancestor(const coord_t
*coord
, flush_pos_t
*pos
)
1345 /* As we ascend at the left-edge of the region to flush, take this
1346 opportunity at the twig level to find our parent-first preceder
1347 unless we have already set it. */
1348 if (pos
->preceder
.blk
== 0) {
1349 ret
= set_preceder(coord
, pos
);
1354 /* If the ancestor is clean or already allocated, or if the child is not
1355 a leftmost child, stop going up, even leaving coord->node not
1357 if (znode_check_flushprepped(coord
->node
)
1358 || !coord_is_leftmost_unit(coord
))
1362 init_load_count(&aload
);
1363 coord_init_invalid(&acoord
, NULL
);
1365 /* Only ascend to the next level if it is a leftmost child, but
1366 write-lock the parent in case we will relocate the child. */
1367 if (!znode_is_root(coord
->node
)) {
1370 jnode_lock_parent_coord(ZJNODE(coord
->node
), &acoord
,
1371 &alock
, &aload
, ZNODE_WRITE_LOCK
,
1374 /* FIXME(C): check EINVAL, E_DEADLOCK */
1379 reverse_relocate_check_dirty_parent(ZJNODE(coord
->node
),
1384 /* Recursive call. */
1385 if (!znode_check_flushprepped(acoord
.node
)) {
1386 ret
= alloc_one_ancestor(&acoord
, pos
);
1392 /* Note: we call allocate with the parent write-locked (except at the
1393 root) in case we relocate the child, in which case it will modify the
1394 parent during this call. */
1395 ret
= allocate_znode(coord
->node
, &acoord
, pos
);
1398 done_load_count(&aload
);
1403 /* During the reverse parent-first alloc_pos_and_ancestors process described
1404 above there is a call to this function at the twig level. During
1405 alloc_pos_and_ancestors we may ask: should this node be relocated (in reverse
1406 parent-first context)? We repeat this process as long as the child is the
1407 leftmost child, eventually reaching an ancestor of the flush point that is
1408 not a leftmost child. The preceder of that ancestor, which is not a leftmost
1409 child, is actually on the leaf level. The preceder of that block is the
1410 left-neighbor of the flush point. The preceder of that block is the rightmost
1411 child of the twig on the left. So, when alloc_pos_and_ancestors passes upward
1412 through the twig level, it stops momentarily to remember the block of the
1413 rightmost child of the twig on the left and sets it to the flush_position's
1416 There is one other place where we may set the flush_position's preceder hint,
1417 which is during scan-left.
1419 static int set_preceder(const coord_t
*coord_in
, flush_pos_t
*pos
)
1423 lock_handle left_lock
;
1424 load_count left_load
;
1426 coord_dup(&coord
, coord_in
);
1428 init_lh(&left_lock
);
1429 init_load_count(&left_load
);
1431 /* FIXME(B): Same FIXME as in "Find the preceder" in
1432 reverse_relocate_test. coord_is_leftmost_unit is not the right test
1433 if the unformatted child is in the middle of the first extent unit.*/
1434 if (!coord_is_leftmost_unit(&coord
)) {
1435 coord_prev_unit(&coord
);
1438 reiser4_get_left_neighbor(&left_lock
, coord
.node
,
1439 ZNODE_READ_LOCK
, GN_SAME_ATOM
);
1441 /* If we fail for any reason it doesn't matter because
1442 the preceder is only a hint. We are low-priority at
1443 this point, so this must be the case. */
1444 if (ret
== -E_REPEAT
|| ret
== -E_NO_NEIGHBOR
||
1445 ret
== -ENOENT
|| ret
== -EINVAL
1446 || ret
== -E_DEADLOCK
)
1451 ret
= incr_load_count_znode(&left_load
, left_lock
.node
);
1455 coord_init_last_unit(&coord
, left_lock
.node
);
1459 item_utmost_child_real_block(&coord
, RIGHT_SIDE
,
1460 &pos
->preceder
.blk
);
1462 check_preceder(pos
->preceder
.blk
);
1463 done_load_count(&left_load
);
1464 done_lh(&left_lock
);
1468 /* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
1470 /* This procedure implements the outer loop of the flush algorithm. To put this
1471 in context, here is the general list of steps taken by the flush routine as a whole:
1474 1. Scan-left
1475 2. Scan-right (maybe)
1476 3. Allocate initial flush position and its ancestors
1477 4. <handle extents>
1478 5. <squeeze and next position and its ancestors to-the-right,
1479 then update position to-the-right>
1480 6. <repeat from #4 until flush is stopped>
1482 This procedure implements the loop in steps 4 through 6 in the above listing.
1484 Step 4: if the current flush position is an extent item (position on the twig
1485 level), it allocates the extent (allocate_extent_item_in_place) then shifts
1486 to the next coordinate. If the next coordinate's leftmost child needs
1487 flushprep, we will continue. If the next coordinate is an internal item, we
1488 descend back to the leaf level, otherwise we repeat a step #4 (labeled
1489 ALLOC_EXTENTS below). If the "next coordinate" brings us past the end of the
1490 twig level, then we call reverse_relocate_end_of_twig to possibly dirty the
1491 next (right) twig, prior to step #5 which moves to the right.
1493 Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up
1494 the tree to allocate any ancestors of the next-right flush position that are
1495 not also ancestors of the current position. Those ancestors (in top-down
1496 order) are the next in parent-first order. We squeeze adjacent nodes on the
1497 way up until the right node and current node share the same parent, then
1498 allocate on the way back down. Finally, this step sets the flush position to
1499 the next-right node. Then repeat steps 4 and 5.
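/* A minimal pseudo-code sketch of steps 4 through 6, in the same spirit as
   the parent_first() example near the top of this file. The control flow here
   is a simplified placeholder, not the real API; the real dispatch is
   table-driven (see flush_pos_handlers[] further below) and includes error
   handling:

	while (pos_valid(pos)) {
		if (pos is an extent item on the twig level)
			allocate extents and advance pos;              step 4
		else
			squeeze/allocate toward the right neighbor,
			processing its changed ancestors first;        step 5
		move pos to the right and repeat;                      step 6
	}
*/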
/* squalloc_right_twig helper function: cut a range of extent items from node
   to->node, from the beginning of the node up to coord @to. */
static int squalloc_right_twig_cut(coord_t *to, reiser4_key * to_key,
				   znode * left)
{
	coord_t from;
	reiser4_key from_key;

	coord_init_first_unit(&from, to->node);
	item_key_by_coord(&from, &from_key);

	return cut_node_content(&from, to, &from_key, to_key, NULL);
}
/* Copy as much of the leading extents from @right to @left, allocating
   unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
   SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
   internal item it calls shift_one_internal_unit and may then return
   SUBTREE_MOVED. */
static int squeeze_right_twig(znode * left, znode * right, flush_pos_t *pos)
{
	int ret = SUBTREE_MOVED;
	coord_t coord;		/* used to iterate over items */
	reiser4_key stop_key;

	assert("jmacd-2008", !node_is_empty(right));
	coord_init_first_unit(&coord, right);

	/* FIXME: can be optimized to cut once */
	while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
		ON_DEBUG(void *vp);

		assert("vs-1468", coord_is_leftmost_unit(&coord));
		ON_DEBUG(vp = shift_check_prepare(left, coord.node));

		/* stop_key is used to find what was copied and what to cut */
		stop_key = *reiser4_min_key();
		ret = squalloc_extent(left, &coord, pos, &stop_key);
		if (ret != SQUEEZE_CONTINUE) {
			ON_DEBUG(kfree(vp));
			break;
		}

		assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));

		/* Helper function to do the cutting. */
		set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
		check_me("vs-1466",
			 squalloc_right_twig_cut(&coord, &stop_key, left) == 0);

		ON_DEBUG(shift_check(vp, left, coord.node));
	}

	if (node_is_empty(coord.node))
		ret = SQUEEZE_SOURCE_EMPTY;

	if (ret == SQUEEZE_TARGET_FULL)
		goto out;

	if (node_is_empty(right)) {
		/* The whole right node was copied into @left. */
		assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
		goto out;
	}

	coord_init_first_unit(&coord, right);

	if (!item_is_internal(&coord)) {
		/* we do not want to squeeze anything else to left neighbor
		   because "slum" is over */
		ret = SQUEEZE_TARGET_FULL;
		goto out;
	}
	assert("jmacd-433", item_is_internal(&coord));

	/* Shift an internal unit. The child must be allocated before shifting
	   any more extents, so we stop here. */
	ret = shift_one_internal_unit(left, right);

 out:
	assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
	       || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);

	if (ret == SQUEEZE_TARGET_FULL) {
		/* We submit prepped nodes here and expect that this @left twig
		 * will not be modified again during this jnode_flush() call. */
		int ret1;

		/* NOTE: seems like io is done under long term locks. */
		ret1 = write_prepped_nodes(pos);
		if (ret1 < 0)
			return ret1;
	}

	return ret;
}
#if REISER4_DEBUG
static void item_convert_invariant(flush_pos_t *pos)
{
	assert("edward-1225", coord_is_existing_item(&pos->coord));
	if (chaining_data_present(pos)) {
		item_plugin *iplug = item_convert_plug(pos);

		assert("edward-1000",
		       iplug == item_plugin_by_coord(&pos->coord));
		assert("edward-1001", iplug->f.convert != NULL);
	} else
		assert("edward-1226", pos->child == NULL);
}
#else
#define item_convert_invariant(pos) noop
#endif
/* Scan node items starting from the first one and apply for each
   item its flush ->convert() method (if any). This method may
   resize/kill the item, so the tree will be changed. */
static int convert_node(flush_pos_t *pos, znode * node)
{
	int ret = 0;
	item_plugin *iplug;

	assert("edward-304", pos != NULL);
	assert("edward-305", pos->child == NULL);
	assert("edward-475", znode_convertible(node));
	assert("edward-669", znode_is_wlocked(node));
	assert("edward-1210", !node_is_empty(node));

	if (znode_get_level(node) != LEAF_LEVEL)
		/* unsupported */
		goto exit;

	coord_init_first_unit(&pos->coord, node);

	while (1) {
		ret = 0;
		coord_set_to_left(&pos->coord);
		item_convert_invariant(pos);

		iplug = item_plugin_by_coord(&pos->coord);
		assert("edward-844", iplug != NULL);

		if (iplug->f.convert) {
			ret = iplug->f.convert(pos);
			if (ret)
				goto exit;
		}
		assert("edward-307", pos->child == NULL);

		if (coord_next_item(&pos->coord)) {
			/* node is over */

			if (!chaining_data_present(pos))
				/* finished this node */
				break;
			if (should_chain_next_node(pos)) {
				/* go to next node */
				move_chaining_data(pos, 0/* to next node */);
				break;
			}
			/* repeat this node */
			move_chaining_data(pos, 1/* this node */);
			continue;
		}
		/* Node is not over.
		   Check if there is attached convert data.
		   If so roll one item position back and repeat
		   on this node. */
		if (chaining_data_present(pos)) {
			if (iplug != item_plugin_by_coord(&pos->coord))
				set_item_convert_count(pos, 0);

			ret = coord_prev_item(&pos->coord);
			assert("edward-1003", !ret);

			move_chaining_data(pos, 1/* this node */);
		}
	}
	JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
	znode_make_dirty(node);
 exit:
	assert("edward-1004", !ret);
	return ret;
}
/* Squeeze and allocate the right neighbor. This is called after @left and
   its current children have been squeezed and allocated already. This
   procedure's job is to squeeze and allocate items from @right to @left.

   If at the leaf level, use the shift_everything_left memcpy-optimized
   version of shifting (squeeze_right_leaf).

   If at the twig level, extents are allocated as they are shifted from @right
   to @left (squalloc_right_twig).

   At any other level, shift one internal item and return to the caller
   (squalloc_parent_first) so that the shifted-subtree can be processed in
   parent-first order.

   When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
   returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
   returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
   is returned. */
static int squeeze_right_neighbor(flush_pos_t *pos, znode * left,
				  znode * right)
{
	int ret;

	/* FIXME it is possible to see empty hasn't-heard-banshee node in a
	 * tree owing to error (for example, ENOSPC) in write */
	/* assert("jmacd-9321", !node_is_empty(left)); */
	assert("jmacd-9322", !node_is_empty(right));
	assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));

	switch (znode_get_level(left)) {
	case TWIG_LEVEL:
		/* Shift with extent allocating until either an internal item
		   is encountered or everything is shifted or no free space
		   is left in @left */
		ret = squeeze_right_twig(left, right, pos);
		break;

	default:
		/* All other levels can use shift_everything until we implement
		   per-item flush plugins. */
		ret = squeeze_right_non_twig(left, right);
		break;
	}

	assert("jmacd-2011", (ret < 0 || ret == SQUEEZE_SOURCE_EMPTY
			      || ret == SQUEEZE_TARGET_FULL
			      || ret == SUBTREE_MOVED));
	return ret;
}
static int squeeze_right_twig_and_advance_coord(flush_pos_t *pos,
						znode * right)
{
	int ret;

	ret = squeeze_right_twig(pos->lock.node, right, pos);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		coord_init_after_last_item(&pos->coord, pos->lock.node);
		return ret;
	}

	coord_init_last_unit(&pos->coord, pos->lock.node);
	return 0;
}
/* forward declaration */
static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);

/* do a fast check for "same parents" condition before calling
 * squalloc_upper_levels() */
static inline int check_parents_and_squalloc_upper_levels(flush_pos_t *pos,
							   znode * left,
							   znode * right)
{
	if (znode_same_parents(left, right))
		return 0;

	return squalloc_upper_levels(pos, left, right);
}
/* Check whether the parent of given @right node needs to be processed
   ((re)allocated) prior to processing of the child. If @left and @right do
   not share the same parent, the parent of @right comes after @left but
   before @right in parent-first order, so we have to (re)allocate it before
   @right gets (re)allocated. */
static int squalloc_upper_levels(flush_pos_t *pos, znode * left, znode * right)
{
	int ret;

	lock_handle left_parent_lock;
	lock_handle right_parent_lock;

	load_count left_parent_load;
	load_count right_parent_load;

	init_lh(&left_parent_lock);
	init_lh(&right_parent_lock);

	init_load_count(&left_parent_load);
	init_load_count(&right_parent_load);

	ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
	if (ret)
		goto out;

	ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
	if (ret)
		goto out;

	/* Check for same parents */
	if (left_parent_lock.node == right_parent_lock.node)
		goto out;

	if (znode_check_flushprepped(right_parent_lock.node)) {
		/* Keep parent-first order. In the order, the right parent node
		   stands before the @right node. If it is already allocated,
		   we set the preceder (next block search start point) to its
		   block number, @right node should be allocated after it.

		   However, preceder is set only if the right parent is on twig
		   level. The explanation is the following: new branch nodes
		   are allocated over already allocated children while the tree
		   grows, it is difficult to keep tree ordered, we assume that
		   only leaves and twigs are correctly allocated. So, only
		   twigs are used as a preceder for allocating of the rest of
		   the tree. */
		if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
			pos->preceder.blk =
			    *znode_get_block(right_parent_lock.node);
			check_preceder(pos->preceder.blk);
		}
		goto out;
	}

	ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
	if (ret)
		goto out;

	ret = incr_load_count_znode(&right_parent_load,
				    right_parent_lock.node);
	if (ret)
		goto out;

	ret = squeeze_right_neighbor(pos, left_parent_lock.node,
				     right_parent_lock.node);
	/* We stop if error. We stop if some items/units were shifted (ret == 0)
	 * and thus @right changed its parent. It means we have not processed
	 * right_parent node prior to processing of @right. Positive return
	 * values say that shifting items did not happen because of "empty
	 * source" or "target full" conditions. */
	if (ret <= 0)
		goto out;

	/* parent(@left) and parent(@right) may have different parents also. We
	 * do a recursive call for checking that. */
	ret = check_parents_and_squalloc_upper_levels(pos,
						      left_parent_lock.node,
						      right_parent_lock.node);
	if (ret)
		goto out;

	/* allocate znode when going down */
	ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);

 out:
	done_load_count(&left_parent_load);
	done_load_count(&right_parent_load);

	done_lh(&left_parent_lock);
	done_lh(&right_parent_lock);

	return ret;
}
/* Check the leftmost child "flushprepped" status, also returns true if child
 * node was not found in cache. */
static int leftmost_child_of_unit_check_flushprepped(const coord_t *coord)
{
	int ret;
	int prepped;

	jnode *child;

	ret = get_leftmost_child_of_unit(coord, &child);
	if (ret)
		return ret;

	if (child) {
		prepped = jnode_check_flushprepped(child);
		jput(child);
	} else {
		/* We consider a not existing child as a node to which slum
		   processing should not continue. A node that is not cached
		   is clean, so it is flushprepped. */
		prepped = 1;
	}

	return prepped;
}
/* (re)allocate znode with automated getting parent node */
static int lock_parent_and_allocate_znode(znode * node, flush_pos_t *pos)
{
	int ret;
	lock_handle parent_lock;
	load_count parent_load;
	coord_t pcoord;

	assert("zam-851", znode_is_write_locked(node));

	init_lh(&parent_lock);
	init_load_count(&parent_load);

	ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
	if (ret)
		goto out;

	ret = incr_load_count_znode(&parent_load, parent_lock.node);
	if (ret)
		goto out;

	ret = find_child_ptr(parent_lock.node, node, &pcoord);
	if (ret)
		goto out;

	ret = allocate_znode(node, &pcoord, pos);

 out:
	done_load_count(&parent_load);
	done_lh(&parent_lock);
	return ret;
}
/* Process nodes on leaf level until unformatted node or rightmost node in the
 * slum reached. */
static int handle_pos_on_formatted(flush_pos_t *pos)
{
	int ret;
	lock_handle right_lock;
	load_count right_load;

	init_lh(&right_lock);
	init_load_count(&right_load);

	if (should_convert_node(pos, pos->lock.node)) {
		ret = convert_node(pos, pos->lock.node);
		if (ret)
			goto out;
	}

	while (1) {
		int expected;

		expected = should_convert_next_node(pos);
		ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
				       ZNODE_WRITE_LOCK, !expected, expected);
		if (ret) {
			if (expected)
				warning("edward-1495",
					"Expected neighbor not found (ret = %d). Fsck?",
					ret);
			break;
		}

		/* we don't prep(allocate) nodes for flushing twice. This can be
		 * suboptimal, or it can be optimal. For now we choose to live
		 * with the risk that it will be suboptimal because it would be
		 * quite complex to code it to be smarter. */
		if (znode_check_flushprepped(right_lock.node)
		    && !znode_convertible(right_lock.node)) {
			assert("edward-1005", !should_convert_next_node(pos));
			pos_stop(pos);
			break;
		}

		ret = incr_load_count_znode(&right_load, right_lock.node);
		if (ret)
			break;

		if (should_convert_node(pos, right_lock.node)) {
			ret = convert_node(pos, right_lock.node);
			if (ret)
				break;
			if (node_is_empty(right_lock.node)) {
				/* node became empty after converting, repeat */
				done_load_count(&right_load);
				done_lh(&right_lock);
				continue;
			}
		}

		/* squeeze _before_ going upward. */
		ret = squeeze_right_neighbor(pos, pos->lock.node,
					     right_lock.node);
		if (ret < 0)
			break;

		if (znode_check_flushprepped(right_lock.node)) {
			if (should_convert_next_node(pos)) {
				/* in spite of flushprepped status of the node,
				   its right slum neighbor should be converted*/
				assert("edward-953", convert_data(pos));
				assert("edward-954", item_convert_data(pos));
			} else {
				pos_stop(pos);
				break;
			}
			if (node_is_empty(right_lock.node)) {
				done_load_count(&right_load);
				done_lh(&right_lock);
				continue;
			}
			move_flush_pos(pos, &right_lock, &right_load, NULL);
			continue;
		}

		if (node_is_empty(right_lock.node)) {
			/* repeat if right node was squeezed completely */
			done_load_count(&right_load);
			done_lh(&right_lock);
			continue;
		}

		/* parent(right_lock.node) has to be processed before
		 * (right_lock.node) due to "parent-first" allocation order. */
		ret = check_parents_and_squalloc_upper_levels(pos,
							      pos->lock.node,
							      right_lock.node);
		if (ret)
			break;

		/* (re)allocate _after_ going upward */
		ret = lock_parent_and_allocate_znode(right_lock.node, pos);
		if (ret)
			break;

		if (should_terminate_squalloc(pos)) {
			set_item_convert_count(pos, 0);
			break;
		}

		/* advance the flush position to the right neighbor */
		move_flush_pos(pos, &right_lock, &right_load, NULL);

		ret = rapid_flush(pos);
		if (ret)
			break;
	}
 out:
	check_convert_info(pos);
	done_load_count(&right_load);
	done_lh(&right_lock);

	/* This function indicates via pos whether to stop or go to twig or
	 * continue on current level. */
	return ret;
}
/* Process nodes on leaf level until unformatted node or rightmost node in the
 * slum reached. */
static int handle_pos_on_leaf(flush_pos_t *pos)
{
	int ret;

	assert("zam-845", pos->state == POS_ON_LEAF);

	ret = handle_pos_on_formatted(pos);

	if (ret == -E_NO_NEIGHBOR) {
		/* cannot get right neighbor, go process extents. */
		pos->state = POS_TO_TWIG;
		return 0;
	}

	return ret;
}

/* Process slum on level > 1 */
static int handle_pos_on_internal(flush_pos_t *pos)
{
	assert("zam-850", pos->state == POS_ON_INTERNAL);
	return handle_pos_on_formatted(pos);
}
/* check whether squalloc should stop before processing given extent */
static int squalloc_extent_should_stop(flush_pos_t *pos)
{
	assert("zam-869", item_is_extent(&pos->coord));

	/* pos->child is a jnode handle_pos_on_extent() should start with
	 * instead of the first child of the first extent unit. */
	if (pos->child) {
		int prepped;

		assert("vs-1383", jnode_is_unformatted(pos->child));
		prepped = jnode_check_flushprepped(pos->child);
		pos->pos_in_unit =
		    jnode_get_index(pos->child) -
		    extent_unit_index(&pos->coord);
		assert("vs-1470",
		       pos->pos_in_unit < extent_unit_width(&pos->coord));
		assert("nikita-3434",
		       ergo(extent_is_unallocated(&pos->coord),
			    pos->pos_in_unit == 0));
		jput(pos->child);
		pos->child = NULL;

		return prepped;
	}

	pos->pos_in_unit = 0;
	if (extent_is_unallocated(&pos->coord))
		return 0;

	return leftmost_child_of_unit_check_flushprepped(&pos->coord);
}
/* Handle the case when regular reiser4 tree (znodes connected one to its
 * neighbors by sibling pointers) is interrupted on leaf level by one or more
 * unformatted nodes. By having a lock on twig level and using extent code
 * routines to process unformatted nodes we swim around an irregular part of
 * the reiser4 tree. */
static int handle_pos_on_twig(flush_pos_t *pos)
{
	int ret;

	assert("zam-844", pos->state == POS_ON_EPOINT);
	assert("zam-843", item_is_extent(&pos->coord));

	/* We decide should we continue slum processing with current extent
	   unit: if leftmost child of current extent unit is flushprepped
	   (i.e. clean or already processed by flush) we stop squalloc(). There
	   is a fast check for unallocated extents which we assume contain all
	   not flushprepped nodes. */
	/* FIXME: Here we implement simple check, we are only looking on the
	   leftmost child. */
	ret = squalloc_extent_should_stop(pos);
	if (ret != 0) {
		pos_stop(pos);
		return ret;
	}

	while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
	       && item_is_extent(&pos->coord)) {
		ret = reiser4_alloc_extent(pos);
		if (ret)
			return ret;
		coord_next_unit(&pos->coord);
	}

	if (coord_is_after_rightmost(&pos->coord)) {
		pos->state = POS_END_OF_TWIG;
		return 0;
	}
	if (item_is_internal(&pos->coord)) {
		pos->state = POS_TO_LEAF;
		return 0;
	}

	assert("zam-860", item_is_extent(&pos->coord));

	/* "slum" is over */
	pos->state = POS_INVALID;
	return 0;
}
/* When we are about to return the flush position from twig to leaf level we
 * can process the right twig node or move the position to the leaf. This
 * processes the right twig if it is possible and jumps to leaf level if not. */
static int handle_pos_end_of_twig(flush_pos_t *pos)
{
	int ret;
	lock_handle right_lock;
	load_count right_load;
	coord_t at_right;
	jnode *child = NULL;

	assert("zam-848", pos->state == POS_END_OF_TWIG);
	assert("zam-849", coord_is_after_rightmost(&pos->coord));

	init_lh(&right_lock);
	init_load_count(&right_load);

	/* We get a lock on the right twig node even it is not dirty because
	 * slum continues or discontinues on leaf level not on next twig. This
	 * lock on the right twig is needed for getting its leftmost child. */
	ret = reiser4_get_right_neighbor(&right_lock, pos->lock.node,
					 ZNODE_WRITE_LOCK, GN_SAME_ATOM);
	if (ret)
		goto out;

	ret = incr_load_count_znode(&right_load, right_lock.node);
	if (ret)
		goto out;

	/* right twig could be not dirty */
	if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
		/* If right twig node is dirty we always attempt to squeeze its
		 * content to the left... */
		ret = squeeze_right_twig_and_advance_coord(pos,
							   right_lock.node);
		if (ret <= 0) {
			/* pos->coord is on internal item, go to leaf level, or
			 * we have an error which will be caught in squalloc()
			 */
			pos->state = POS_TO_LEAF;
			goto out;
		}

		/* If right twig was squeezed completely we have to re-lock
		 * right twig. Now it is done through the top-level squalloc
		 * loop. */
		if (node_is_empty(right_lock.node))
			goto out;

		/* ... and prep it if it is not yet prepped */
		if (!znode_check_flushprepped(right_lock.node)) {
			/* As usual, process parent before ... */
			ret = check_parents_and_squalloc_upper_levels(
				pos, pos->lock.node, right_lock.node);
			if (ret)
				goto out;

			/* ... processing the child */
			ret = lock_parent_and_allocate_znode(right_lock.node,
							     pos);
			if (ret)
				goto out;
		}
	} else {
		coord_init_first_unit(&at_right, right_lock.node);

		/* check first child of next twig, should we continue there ? */
		ret = get_leftmost_child_of_unit(&at_right, &child);
		if (ret || child == NULL || jnode_check_flushprepped(child)) {
			pos_stop(pos);
			goto out;
		}

		/* check clean twig for possible relocation */
		if (!znode_check_flushprepped(right_lock.node)) {
			ret = reverse_relocate_check_dirty_parent(child,
								  &at_right,
								  pos);
			if (ret)
				goto out;
			if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
				goto out;
		}
	}

	assert("zam-875", znode_check_flushprepped(right_lock.node));

	/* Update the preceder by a block number of just processed right twig
	 * node. The code above could miss the preceder updating because
	 * allocate_znode() could not be called for this node. */
	pos->preceder.blk = *znode_get_block(right_lock.node);
	check_preceder(pos->preceder.blk);

	coord_init_first_unit(&at_right, right_lock.node);
	assert("zam-868", coord_is_existing_unit(&at_right));

	pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
	move_flush_pos(pos, &right_lock, &right_load, &at_right);

 out:
	done_load_count(&right_load);
	done_lh(&right_lock);

	if (child)
		jput(child);

	return ret;
}
/* Move the pos->lock to leaf node pointed by pos->coord, check should we
 * continue there. */
static int handle_pos_to_leaf(flush_pos_t *pos)
{
	int ret;
	lock_handle child_lock;
	load_count child_load;
	jnode *child;

	assert("zam-846", pos->state == POS_TO_LEAF);
	assert("zam-847", item_is_internal(&pos->coord));

	init_lh(&child_lock);
	init_load_count(&child_load);

	ret = get_leftmost_child_of_unit(&pos->coord, &child);
	if (ret)
		return ret;
	if (child == NULL) {
		pos_stop(pos);
		return 0;
	}

	if (jnode_check_flushprepped(child)) {
		pos->state = POS_INVALID;
		goto out;
	}

	ret = longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
				  ZNODE_LOCK_LOPRI);
	if (ret)
		goto out;

	ret = incr_load_count_znode(&child_load, JZNODE(child));
	if (ret)
		goto out;

	ret = allocate_znode(JZNODE(child), &pos->coord, pos);
	if (ret)
		goto out;

	/* move flush position to leaf level */
	pos->state = POS_ON_LEAF;
	move_flush_pos(pos, &child_lock, &child_load, NULL);

	if (node_is_empty(JZNODE(child))) {
		ret = delete_empty_node(JZNODE(child));
		pos->state = POS_INVALID;
	}
 out:
	done_load_count(&child_load);
	done_lh(&child_lock);
	jput(child);

	return ret;
}
/* move pos from leaf to twig, and move lock from leaf to twig. */
/* Move pos->lock to upper (twig) level */
static int handle_pos_to_twig(flush_pos_t *pos)
{
	int ret;
	lock_handle parent_lock;
	load_count parent_load;
	coord_t pcoord;

	assert("zam-852", pos->state == POS_TO_TWIG);

	init_lh(&parent_lock);
	init_load_count(&parent_load);

	ret = reiser4_get_parent(&parent_lock, pos->lock.node,
				 ZNODE_WRITE_LOCK);
	if (ret)
		goto out;

	ret = incr_load_count_znode(&parent_load, parent_lock.node);
	if (ret)
		goto out;

	ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
	if (ret)
		goto out;

	assert("zam-870", item_is_internal(&pcoord));
	coord_next_item(&pcoord);

	if (coord_is_after_rightmost(&pcoord))
		pos->state = POS_END_OF_TWIG;
	else if (item_is_extent(&pcoord))
		pos->state = POS_ON_EPOINT;
	else {
		/* Here we understand that getting -E_NO_NEIGHBOR in
		 * handle_pos_on_leaf() was because of just reaching the edge
		 * of the slum. */
		pos_stop(pos);
		goto out;
	}

	move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);

 out:
	done_load_count(&parent_load);
	done_lh(&parent_lock);

	return ret;
}
typedef int (*pos_state_handle_t) (flush_pos_t *);
static pos_state_handle_t flush_pos_handlers[] = {
	/* process formatted nodes on leaf level, keep lock on a leaf node */
	[POS_ON_LEAF] = handle_pos_on_leaf,
	/* process unformatted nodes, keep lock on twig node, pos->coord points
	 * to extent currently being processed */
	[POS_ON_EPOINT] = handle_pos_on_twig,
	/* move a lock from leaf node to its parent for further processing of
	   unformatted nodes */
	[POS_TO_TWIG] = handle_pos_to_twig,
	/* move a lock from twig to leaf level when a processing of unformatted
	 * nodes finishes, pos->coord points to the leaf node we jump to */
	[POS_TO_LEAF] = handle_pos_to_leaf,
	/* after processing last extent in the twig node, attempting to shift
	 * items from the twig's right neighbor and process them while shifting*/
	[POS_END_OF_TWIG] = handle_pos_end_of_twig,
	/* process formatted nodes on internal level, keep lock on an internal
	   node */
	[POS_ON_INTERNAL] = handle_pos_on_internal
};
/* Advance flush position horizontally, prepare for flushing ((re)allocate,
 * squeeze, encrypt) nodes and their ancestors in "parent-first" order */
static int squalloc(flush_pos_t *pos)
{
	int ret = 0;

	/* maybe needs to be made a case statement with handle_pos_on_leaf as
	 * first case, for greater CPU efficiency? Measure and see.... -Hans */
	while (pos_valid(pos)) {
		ret = flush_pos_handlers[pos->state] (pos);
		if (ret < 0)
			break;

		ret = rapid_flush(pos);
		if (ret)
			break;
	}

	/* any positive value or -E_NO_NEIGHBOR are legal return codes for
	   handle_pos* routines, -E_NO_NEIGHBOR means that slum edge was
	   reached */
	if (ret > 0 || ret == -E_NO_NEIGHBOR)
		ret = 0;

	return ret;
}
static void update_ldkey(znode * node)
{
	reiser4_key ldkey;

	assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
	if (node_is_empty(node))
		return;

	znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
}

/* this is to be called after calling of shift node's method to shift data from
   @right to @left. It sets left delimiting keys of @left and @right to keys of
   first items of @left and @right correspondingly and sets right delimiting key
   of @left to first key of @right */
static void update_znode_dkeys(znode * left, znode * right)
{
	assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
	assert("vs-1629", (znode_is_write_locked(left) &&
			   znode_is_write_locked(right)));

	/* we need to update left delimiting of left if it was empty before
	   shift */
	update_ldkey(left);
	update_ldkey(right);
	if (node_is_empty(right))
		znode_set_rd_key(left, znode_get_rd_key(right));
	else
		znode_set_rd_key(left, znode_get_ld_key(right));
}
/* try to shift everything from @right to @left. If everything was shifted -
   @right is removed from the tree. Result is the number of bytes shifted. */
static int
shift_everything_left(znode * right, znode * left, carry_level * todo)
{
	coord_t from;
	node_plugin *nplug;
	carry_plugin_info info;

	coord_init_after_last_item(&from, right);

	nplug = node_plugin_by_node(right);
	info.doing = NULL;
	info.todo = todo;
	return nplug->shift(&from, left, SHIFT_LEFT,
			    1 /* delete @right if it becomes empty */ ,
			    1 /* move coord @from to node @left if everything
			         will be shifted */,
			    &info);
}
/* Shift as much as possible from @right to @left using the memcpy-optimized
   shift_everything_left. @left and @right are formatted neighboring nodes on
   leaf level. */
static int squeeze_right_non_twig(znode * left, znode * right)
{
	int ret;
	carry_pool *pool;
	carry_level *todo;

	assert("nikita-2246", znode_get_level(left) == znode_get_level(right));

	if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
	    !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
		return SQUEEZE_TARGET_FULL;

	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
	if (IS_ERR(pool))
		return PTR_ERR(pool);
	todo = (carry_level *) (pool + 1);
	init_carry_level(todo, pool);

	ret = shift_everything_left(right, left, todo);
	if (ret > 0) {
		/* something was shifted */
		reiser4_tree *tree;
		__u64 grabbed;

		znode_make_dirty(left);
		znode_make_dirty(right);

		/* update delimiting keys of nodes which participated in
		   shift. FIXME: it would be better to have this in shift
		   node's operation. But it can not be done there. Nobody
		   remembers why, though */
		tree = znode_get_tree(left);
		write_lock_dk(tree);
		update_znode_dkeys(left, right);
		write_unlock_dk(tree);

		/* Carry is called to update delimiting key and, maybe, to
		   remove empty node. */
		grabbed = get_current_context()->grabbed_blocks;
		ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
		assert("nikita-3003", ret == 0);	/* reserved space is
							   exhausted. Ask Hans. */
		ret = reiser4_carry(todo, NULL/* previous level */);
		grabbed2free_mark(grabbed);
	} else {
		/* Shifting impossible, we return appropriate result code */
		ret =
		    node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
		    SQUEEZE_TARGET_FULL;
	}

	done_carry_pool(pool);

	return ret;
}
static int sibling_link_is_ok(const znode *left, const znode *right)
{
	int result;

	read_lock_tree(znode_get_tree(left));
	result = (left->right == right && left == right->left);
	read_unlock_tree(znode_get_tree(left));
	return result;
}
/* Shift first unit of first item if it is an internal one. Return
   SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
   SUBTREE_MOVED. */
static int shift_one_internal_unit(znode * left, znode * right)
{
	int ret;
	carry_pool *pool;
	carry_level *todo;
	coord_t *coord;
	carry_plugin_info *info;
	int size, moved;

	assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
	assert("nikita-2435", znode_is_write_locked(left));
	assert("nikita-2436", znode_is_write_locked(right));
	assert("nikita-2434", sibling_link_is_ok(left, right));

	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
			       sizeof(*coord) + sizeof(*info)
#if REISER4_DEBUG
			       + sizeof(*coord) + 2 * sizeof(reiser4_key)
#endif
	    );
	if (IS_ERR(pool))
		return PTR_ERR(pool);
	todo = (carry_level *) (pool + 1);
	init_carry_level(todo, pool);

	coord = (coord_t *) (todo + 3);
	coord_init_first_unit(coord, right);
	info = (carry_plugin_info *) (coord + 1);

#if REISER4_DEBUG
	if (!node_is_empty(left)) {
		coord_t *last;
		reiser4_key *right_key;
		reiser4_key *left_key;

		last = (coord_t *) (info + 1);
		right_key = (reiser4_key *) (last + 1);
		left_key = right_key + 1;
		coord_init_last_unit(last, left);

		assert("nikita-2463",
		       keyle(item_key_by_coord(last, left_key),
			     item_key_by_coord(coord, right_key)));
	}
#endif

	assert("jmacd-2007", item_is_internal(coord));

	size = item_length_by_coord(coord);
	info->todo = todo;
	info->doing = NULL;

	ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
					       1
					       /* delete @right if it becomes
						  empty */
					       ,
					       0
					       /* do not move coord @coord to
						  node @left */
					       ,
					       info);

	/* If shift returns positive, then we shifted the item. */
	assert("vs-423", ret <= 0 || size == ret);
	moved = (ret > 0);

	if (moved) {
		/* something was moved */
		reiser4_tree *tree;
		__u64 grabbed;

		znode_make_dirty(left);
		znode_make_dirty(right);
		tree = znode_get_tree(left);
		write_lock_dk(tree);
		update_znode_dkeys(left, right);
		write_unlock_dk(tree);

		/* reserve space for delimiting keys after shifting */
		grabbed = get_current_context()->grabbed_blocks;
		ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
		assert("nikita-3003", ret == 0);	/* reserved space is
							   exhausted. Ask Hans. */

		ret = reiser4_carry(todo, NULL/* previous level */);
		grabbed2free_mark(grabbed);
	}

	done_carry_pool(pool);

	if (ret != 0) {
		/* Shift or carry operation failed. */
		assert("jmacd-7325", ret < 0);
		return ret;
	}

	return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
}
/* Make the final relocate/wander decision during forward parent-first squalloc
   for a znode. For unformatted nodes this is done in
   plugin/item/extent.c:extent_needs_allocation(). */
static int
allocate_znode_loaded(znode * node,
		      const coord_t *parent_coord, flush_pos_t *pos)
{
	int ret;
	reiser4_super_info_data *sbinfo = get_current_super_private();
	/* FIXME(D): We have the node write-locked and should have checked for !
	   allocated() somewhere before reaching this point, but there can be a
	   race, so this assertion is bogus. */
	assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
	assert("jmacd-7988", znode_is_write_locked(node));
	assert("jmacd-7989", coord_is_invalid(parent_coord)
	       || znode_is_write_locked(parent_coord->node));

	if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
	    znode_is_root(node) ||
	    /* We have enough nodes to relocate no matter what. */
	    (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
		/* No need to decide with new nodes, they are treated the same
		   as relocate. If the root node is dirty, relocate. */
		if (pos->preceder.blk == 0) {
			/* preceder is unknown and we have decided to relocate
			   node -- using of default value for search start is
			   better than search from block #0. */
			get_blocknr_hint_default(&pos->preceder.blk);
			check_preceder(pos->preceder.blk);
		}

		goto best_reloc;

	} else if (pos->preceder.blk == 0) {
		/* If we don't know the preceder, leave it where it is. */
		jnode_make_wander(ZJNODE(node));
	} else {
		/* Make a decision based on block distance. */
		reiser4_block_nr dist;
		reiser4_block_nr nblk = *znode_get_block(node);

		assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
		assert("jmacd-6173",
		       !reiser4_blocknr_is_fake(&pos->preceder.blk));
		assert("jmacd-6174", pos->preceder.blk != 0);

		if (pos->preceder.blk == nblk - 1) {
			jnode_make_wander(ZJNODE(node));
		} else {
			dist =
			    (nblk <
			     pos->preceder.blk) ? (pos->preceder.blk -
						   nblk) : (nblk -
							    pos->preceder.blk);

			/* See if we can find a closer block
			   (forward direction only). */
			pos->preceder.max_dist =
			    min((reiser4_block_nr) sbinfo->flush.
				relocate_distance, dist);
			pos->preceder.level = znode_get_level(node);

			ret = allocate_znode_update(node, parent_coord, pos);

			pos->preceder.max_dist = 0;

			if (ret && (ret != -ENOSPC))
				return ret;

			if (ret == 0) {
				/* Got a better allocation. */
				znode_make_reloc(node, pos->fq);
			} else if (dist < sbinfo->flush.relocate_distance) {
				/* The present allocation is good enough. */
				jnode_make_wander(ZJNODE(node));
			} else {
				/* Otherwise, try to relocate to the best
				   position. */
 best_reloc:
				ret = allocate_znode_update(node, parent_coord,
							    pos);
				if (ret != 0)
					return ret;

				/* set JNODE_RELOC bit _after_ node gets
				   allocated */
				znode_make_reloc(node, pos->fq);
			}
		}
	}

	/* This is the new preceder. */
	pos->preceder.blk = *znode_get_block(node);
	check_preceder(pos->preceder.blk);
	pos->alloc_cnt += 1;

	assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));

	return 0;
}
static int
allocate_znode(znode * node, const coord_t *parent_coord, flush_pos_t *pos)
{
	/*
	 * perform znode allocation with znode pinned in memory to avoid races
	 * with asynchronous emergency flush (which plays with
	 * JNODE_FLUSH_RESERVED bit).
	 */
	return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
}
/* A subroutine of allocate_znode, this is called first to see if there is a
   close position to relocate to. It may return ENOSPC if there is no close
   position. If there is no close position it may not relocate. This takes care
   of updating the parent node with the relocated block address. */
static int
allocate_znode_update(znode * node, const coord_t *parent_coord,
		      flush_pos_t *pos)
{
	int ret;
	reiser4_block_nr blk;
	lock_handle uber_lock;
	int flush_reserved_used = 0;
	__u64 grabbed;
	reiser4_context *ctx;
	reiser4_super_info_data *sbinfo;

	init_lh(&uber_lock);

	ctx = get_current_context();
	sbinfo = get_super_private(ctx->super);

	grabbed = ctx->grabbed_blocks;

	/* discard e-flush allocation */
	ret = zload(node);
	if (ret)
		return ret;

	if (ZF_ISSET(node, JNODE_CREATED)) {
		assert("zam-816",
		       reiser4_blocknr_is_fake(znode_get_block(node)));
		pos->preceder.block_stage = BLOCK_UNALLOCATED;
	} else {
		pos->preceder.block_stage = BLOCK_GRABBED;

		/* The disk space for relocating the @node is already reserved
		 * in "flush reserved" counter if @node is leaf, otherwise we
		 * grab space using BA_RESERVED (means grab space from whole
		 * disk not from only 95%). */
		if (znode_get_level(node) == LEAF_LEVEL) {
			/*
			 * earlier (during do_jnode_make_dirty()) we decided
			 * that @node can possibly go into overwrite set and
			 * reserved block for its wandering location.
			 */
			txn_atom *atom = get_current_atom_locked();
			assert("nikita-3449",
			       ZF_ISSET(node, JNODE_FLUSH_RESERVED));
			flush_reserved2grabbed(atom, (__u64) 1);
			spin_unlock_atom(atom);
			/*
			 * we are trying to move node into relocate
			 * set. Allocation of relocated position "uses"
			 * reserved block.
			 */
			ZF_CLR(node, JNODE_FLUSH_RESERVED);
			flush_reserved_used = 1;
		} else {
			ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
			if (ret != 0)
				goto exit;
		}
	}

	/* We may do not use 5% of reserved disk space here and flush will not
	   work. */
	ret = reiser4_alloc_block(&pos->preceder, &blk,
				  BA_FORMATTED | BA_PERMANENT);
	if (ret)
		goto exit;

	if (!ZF_ISSET(node, JNODE_CREATED) &&
	    (ret = reiser4_dealloc_block(znode_get_block(node), 0,
					 BA_DEFER | BA_FORMATTED)))
		goto exit;

	if (likely(!znode_is_root(node))) {
		item_plugin *iplug;

		iplug = item_plugin_by_coord(parent_coord);
		assert("nikita-2954", iplug->f.update != NULL);
		iplug->f.update(parent_coord, &blk);

		znode_make_dirty(parent_coord->node);

	} else {
		reiser4_tree *tree = znode_get_tree(node);
		znode *uber;

		/* We take a longterm lock on the fake node in order to change
		   the root block number. This may cause atom fusion. */
		ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
				     &uber_lock);
		/* The fake node cannot be deleted, and we must have priority
		   here, and may not be confused with ENOSPC. */
		assert("jmacd-74412",
		       ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);

		if (ret)
			goto exit;

		uber = uber_lock.node;

		write_lock_tree(tree);
		tree->root_block = blk;
		write_unlock_tree(tree);

		znode_make_dirty(uber);
	}

	ret = znode_rehash(node, &blk);
 exit:
	if (ret) {
		/* Get flush reserved block back if something fails, because
		 * callers assume that on error block wasn't relocated and its
		 * flush reserved block wasn't used. */
		if (flush_reserved_used) {
			/*
			 * ok, we failed to move node into relocate
			 * set. Restore status quo.
			 */
			grabbed2flush_reserved((__u64) 1);
			ZF_SET(node, JNODE_FLUSH_RESERVED);
		}
	}
	zrelse(node);
	done_lh(&uber_lock);
	grabbed2free_mark(grabbed);
	return ret;
}
/* JNODE INTERFACE */

/* Lock a node (if formatted) and then get its parent locked, set the child's
   coordinate in the parent. If the child is the root node, the above_root
   znode is returned but the coord is not set. This function may cause atom
   fusion, but it is only used for read locks (at this point) and therefore
   fusion only occurs when the parent is already dirty. */
/* Hans adds this note: remember to ask how expensive this operation is vs.
   storing parent pointer in jnodes. */
static int
jnode_lock_parent_coord(jnode * node,
			coord_t *coord,
			lock_handle * parent_lh,
			load_count * parent_zh,
			znode_lock_mode parent_mode, int try)
{
	int ret;

	assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
	assert("edward-54", jnode_is_unformatted(node)
	       || znode_is_any_locked(JZNODE(node)));

	if (!jnode_is_znode(node)) {
		reiser4_key key;
		tree_level stop_level = TWIG_LEVEL;
		lookup_bias bias = FIND_EXACT;

		assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));

		/* The case when node is not znode, but can have parent coord
		   (unformatted node, node which represents cluster page,
		   etc..). Generate a key for the appropriate entry, search
		   in the tree using coord_by_key, which handles locking for
		   us. */

		/*
		 * nothing is locked at this moment, so, nothing prevents
		 * concurrent truncate from removing jnode from inode. To
		 * prevent this spin-lock jnode. jnode can be truncated just
		 * after call to the jnode_build_key(), but this is ok,
		 * because coord_by_key() will just fail to find appropriate
		 * item.
		 */
		spin_lock_jnode(node);
		if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
			jnode_build_key(node, &key);
			ret = 0;
		} else
			ret = RETERR(-ENOENT);
		spin_unlock_jnode(node);

		if (ret != 0)
			return ret;

		if (jnode_is_cluster_page(node))
			stop_level = LEAF_LEVEL;

		assert("jmacd-1812", coord != NULL);

		ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
				   parent_mode, bias, stop_level, stop_level,
				   CBK_UNIQUE, NULL/*ra_info */);
		switch (ret) {
		case CBK_COORD_NOTFOUND:
			assert("edward-1038",
			       ergo(jnode_is_cluster_page(node),
				    JF_ISSET(node, JNODE_HEARD_BANSHEE)));
			if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
				warning("nikita-3177", "Parent not found");
			return ret;
		case CBK_COORD_FOUND:
			if (coord->between != AT_UNIT) {
				/* FIXME: comment needed */
				done_lh(parent_lh);
				if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
					warning("nikita-3178",
						"Found but not happy: %i",
						coord->between);
				}
				return RETERR(-ENOENT);
			}
			ret = incr_load_count_znode(parent_zh, parent_lh->node);
			if (ret != 0)
				return ret;
			/* if (jnode_is_cluster_page(node)) {
			   races with write() are possible
			   check_child_cluster (parent_lh->node);
			   } */
			break;
		default:
			return ret;
		}

	} else {
		int flags;
		znode *z = JZNODE(node);

		/* Formatted node case: */
		assert("jmacd-2061", !znode_is_root(z));

		flags = GN_ALLOW_NOT_CONNECTED;
		if (try)
			flags |= GN_TRY_LOCK;

		ret = reiser4_get_parent_flags(parent_lh, z, parent_mode,
					       flags);
		if (ret != 0)
			/* -E_REPEAT is ok here, it is handled by the caller. */
			return ret;

		/* Make the child's position "hint" up-to-date. (Unless above
		   root, which caller must check.) */
		if (coord != NULL) {

			ret = incr_load_count_znode(parent_zh, parent_lh->node);
			if (ret != 0) {
				warning("jmacd-976812386",
					"incr_load_count_znode failed: %d",
					ret);
				return ret;
			}

			ret = find_child_ptr(parent_lh->node, z, coord);
			if (ret != 0) {
				warning("jmacd-976812",
					"find_child_ptr failed: %d", ret);
				return ret;
			}
		}
	}

	return ret;
}
/* Get the (locked) next neighbor of a znode which is dirty and a member of the
   same atom. If there is no next neighbor or the neighbor is not in memory or
   if there is a neighbor but it is not dirty or not in the same atom,
   -E_NO_NEIGHBOR is returned. In some cases the slum may include nodes which
   are not dirty, if so @check_dirty should be 0 */
static int neighbor_in_slum(znode * node,	/* starting point */
			    lock_handle * lock,	/* lock on starting point */
			    sideof side,	/* left or right direction we
						   seek the next node in */
			    znode_lock_mode mode, /* kind of lock we want */
			    int check_dirty,	/* true if the neighbor should
						   be dirty */
			    int use_upper_levels /* get neighbor by going
						    through upper levels */)
{
	int ret;
	int flags;

	assert("jmacd-6334", znode_is_connected(node));

	flags = GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
	if (use_upper_levels)
		flags |= GN_CAN_USE_UPPER_LEVELS;

	ret = reiser4_get_neighbor(lock, node, mode, flags);
	if (ret) {
		/* May return -ENOENT or -E_NO_NEIGHBOR. */
		/* FIXME(C): check EINVAL, E_DEADLOCK */
		if (ret == -ENOENT)
			ret = RETERR(-E_NO_NEIGHBOR);
		return ret;
	}
	if (!check_dirty)
		return 0;
	/* Check dirty bit of locked znode, no races here */
	if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
		return 0;

	done_lh(lock);
	return RETERR(-E_NO_NEIGHBOR);
}
/* Return true if two znodes have the same parent. This is called with both
   nodes write-locked (for squeezing) so no tree lock is needed. */
static int znode_same_parents(znode * a, znode * b)
{
	int result;

	assert("jmacd-7011", znode_is_write_locked(a));
	assert("jmacd-7012", znode_is_write_locked(b));

	/* We lock the whole tree for this check.... I really don't like whole
	 * tree locks... -Hans */
	read_lock_tree(znode_get_tree(a));
	result = (znode_parent(a) == znode_parent(b));
	read_unlock_tree(znode_get_tree(a));
	return result;
}
/* Initialize the flush_scan data structure. */
static void scan_init(flush_scan * scan)
{
	memset(scan, 0, sizeof(*scan));
	init_lh(&scan->node_lock);
	init_lh(&scan->parent_lock);
	init_load_count(&scan->parent_load);
	init_load_count(&scan->node_load);
	coord_init_invalid(&scan->parent_coord, NULL);
}

/* Release any resources held by the flush scan, e.g. release locks,
   free memory, etc. */
static void scan_done(flush_scan * scan)
{
	done_load_count(&scan->node_load);
	if (scan->node != NULL) {
		jput(scan->node);
		scan->node = NULL;
	}
	done_load_count(&scan->parent_load);
	done_lh(&scan->parent_lock);
	done_lh(&scan->node_lock);
}
/* Returns true if flush scanning is finished. */
int reiser4_scan_finished(flush_scan * scan)
{
	return scan->stop || (scan->direction == RIGHT_SIDE &&
			      scan->count >= scan->max_count);
}

/* Return true if the scan should continue to the @tonode. True if the node
   meets the same_slum_check condition. If not, deref the "left" node and stop
   the scan. */
int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
{
	int go = same_slum_check(scan->node, tonode, 1, 0);

	if (!go) {
		scan->stop = 1;
		jput(tonode);
	}

	return go;
}
/* Set the current scan->node, refcount it, increment count by the @add_count
   (number to count, e.g., skipped unallocated nodes), deref previous current,
   and copy the current parent coordinate. */
int
scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
		 const coord_t *parent)
{
	/* Release the old references, take the new reference. */
	done_load_count(&scan->node_load);

	if (scan->node != NULL)
		jput(scan->node);
	scan->node = node;
	scan->count += add_count;

	/* This next stmt is somewhat inefficient. The reiser4_scan_extent()
	   code could delay this update step until it finishes and update the
	   parent_coord only once. It did that before, but there was a bug and
	   this was the easiest way to make it correct. */
	if (parent != NULL)
		coord_dup(&scan->parent_coord, parent);

	/* Failure may happen at the incr_load_count call, but the caller can
	   assume the reference is safely taken. */
	return incr_load_count_jnode(&scan->node_load, node);
}

/* Return true if scanning in the leftward direction. */
int reiser4_scanning_left(flush_scan * scan)
{
	return scan->direction == LEFT_SIDE;
}
/* Performs leftward scanning starting from either kind of node. Counts the
   starting node. The right-scan object is passed in for the left-scan in order
   to copy the parent of an unformatted starting position. This way we avoid
   searching for the unformatted node's parent when scanning in each direction.
   If we search for the parent once it is set in both scan objects. The limit
   parameter tells flush-scan when to stop.

   Rapid scanning is used only during scan_left, where we are interested in
   finding the 'leftpoint' where we begin flushing. We are interested in
   stopping at the left child of a twig that does not have a dirty left
   neighbour. THIS IS A SPECIAL CASE. The problem is finding a way to flush
   only those nodes without unallocated children, and it is difficult to solve
   in the bottom-up flushing algorithm we are currently using. The problem can
   be solved by scanning left at every level as we go upward, but this would
   basically bring us back to using a top-down allocation strategy, which we
   already tried (see BK history from May 2002), and has a different set of
   problems. The top-down strategy makes avoiding unallocated children easier,
   but makes it difficult to properly flush dirty children with clean parents
   that would otherwise stop the top-down flush, only later to dirty the
   parent once the children are flushed. So we solve the problem in the
   bottom-up algorithm with a special case for twigs and leaves only.

   The first step in solving the problem is this rapid leftward scan. After we
   determine that there are at least enough nodes counted to qualify for
   FLUSH_RELOCATE_THRESHOLD we are no longer interested in the exact count, we
   are only interested in finding the best place to start the flush.

   We could choose one of two possibilities:

   1. Stop at the leftmost child (of a twig) that does not have a dirty left
   neighbor. This requires checking one leaf per rapid-scan twig.

   2. Stop at the leftmost child (of a twig) where there are no dirty children
   of the twig to the left. This requires checking possibly all of the
   in-memory children of each twig during the rapid scan.

   For now we implement the first policy. */
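/* A minimal pseudo-code sketch of policy 1 above (the helpers named here are
   hypothetical, not the real API; the real hook is the commented-out
   rapid-scan call in scan_left() below):

	while (counted nodes >= FLUSH_RELOCATE_THRESHOLD) {
		twig = parent of the current leftpoint candidate;
		leaf = leftmost dirty child of twig;
		if (left neighbour of leaf is clean)
			break;                  leaf is the leftpoint
		candidate = rightmost dirty child of the twig to the left;
	}
*/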
static int
scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
{
	int ret = 0;

	scan->max_count = limit;
	scan->direction = LEFT_SIDE;

	ret = scan_set_current(scan, jref(node), 1, NULL);
	if (ret != 0)
		return ret;

	ret = scan_common(scan, right);
	if (ret != 0)
		return ret;

	/* Before rapid scanning, we need a lock on scan->node so that we can
	   get its parent, only if formatted. */
	if (jnode_is_znode(scan->node)) {
		ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
					  ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
	}

	/* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD)
	 */
	return ret;
}
/* Performs rightward scanning... Does not count the starting node. The limit
   parameter is described in scan_left. If the starting node is unformatted
   then the parent_coord was already set during scan_left. The rapid_after
   parameter is not used during right-scanning.

   scan_right is only called if the scan_left operation does not count at
   least FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit
   parameter is set to the difference between scan-left's count and
   FLUSH_RELOCATE_THRESHOLD, meaning scan-right counts as high as
   FLUSH_RELOCATE_THRESHOLD and then stops. */
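/* For example (the numbers are only illustrative, not the tuned defaults):
   if FLUSH_RELOCATE_THRESHOLD were 64 and scan-left counted 40 nodes, then
   scan_right() would be called with limit = 64 - 40 = 24, so the combined
   count never needs to exceed the threshold. */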
static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
{
	int ret;

	scan->max_count = limit;
	scan->direction = RIGHT_SIDE;

	ret = scan_set_current(scan, jref(node), 0, NULL);
	if (ret != 0)
		return ret;

	return scan_common(scan, NULL);
}
/* Common code to perform left or right scanning. */
static int scan_common(flush_scan * scan, flush_scan * other)
{
	int ret;

	assert("nikita-2376", scan->node != NULL);
	assert("edward-54", jnode_is_unformatted(scan->node)
	       || jnode_is_znode(scan->node));

	/* Special case for starting at an unformatted node. Optimization: we
	   only want to search for the parent (which requires a tree traversal)
	   once. Obviously, we shouldn't have to call it once for the left scan
	   and once for the right scan. For this reason, if we search for the
	   parent during scan-left we then duplicate the coord/lock/load into
	   the scan-right object. */
	if (jnode_is_unformatted(scan->node)) {
		ret = scan_unformatted(scan, other);
		if (ret != 0)
			return ret;
	}

	/* This loop expects to start at a formatted position and performs
	   chaining of formatted regions */
	while (!reiser4_scan_finished(scan)) {

		ret = scan_formatted(scan);
		if (ret != 0)
			return ret;
	}

	return 0;
}
static int scan_unformatted(flush_scan * scan, flush_scan * other)
{
	int ret = 0;
	int try = 0;

	if (!coord_is_invalid(&scan->parent_coord))
		goto scan;

	/* set parent coord from */
	if (!jnode_is_unformatted(scan->node)) {
		/* formatted position */

		lock_handle lock;
		assert("edward-301", jnode_is_znode(scan->node));
		init_lh(&lock);

		/*
		 * when flush starts from unformatted node, first thing it
		 * does is tree traversal to find formatted parent of starting
		 * node. This parent is then kept locked across scans to the
		 * left and to the right. This means that during scan to the
		 * left we cannot take a left-ward lock, because this is
		 * dead-lock prone. So, if we are scanning to the left and
		 * there is already a lock held by this thread,
		 * jnode_lock_parent_coord() should use try-lock.
		 */
		try = reiser4_scanning_left(scan)
		    && !lock_stack_isclean(get_current_lock_stack());
		/* Need the node locked to get the parent lock, We have to
		   take write lock since there is at least one call path
		   where this znode is already write-locked by us. */
		ret =
		    longterm_lock_znode(&lock, JZNODE(scan->node),
					ZNODE_WRITE_LOCK,
					reiser4_scanning_left(scan) ?
					ZNODE_LOCK_LOPRI : ZNODE_LOCK_HIPRI);
		if (ret != 0)
			/* EINVAL or E_DEADLOCK here mean... try again! At this
			   point we've scanned too far and can't back out, just
			   start over. */
			return ret;

		ret = jnode_lock_parent_coord(scan->node,
					      &scan->parent_coord,
					      &scan->parent_lock,
					      &scan->parent_load,
					      ZNODE_WRITE_LOCK, try);

		/* FIXME(C): check EINVAL, E_DEADLOCK */
		done_lh(&lock);
		if (ret == -E_REPEAT) {
			scan->stop = 1;
			return 0;
		}
		if (ret)
			return ret;

	} else {
		/* unformatted position */

		ret =
		    jnode_lock_parent_coord(scan->node, &scan->parent_coord,
					    &scan->parent_lock,
					    &scan->parent_load,
					    ZNODE_WRITE_LOCK, try);

		if (ret == CBK_COORD_NOTFOUND)
			/* FIXME(C): check EINVAL, E_DEADLOCK */
			return ret;

		/* parent was found */
		assert("jmacd-8661", other != NULL);
		/* Duplicate the reference into the other flush_scan. */
		coord_dup(&other->parent_coord, &scan->parent_coord);
		copy_lh(&other->parent_lock, &scan->parent_lock);
		copy_load_count(&other->parent_load, &scan->parent_load);
	}
 scan:
	return scan_by_coord(scan);
}
/* Performs left- or rightward scanning starting from a formatted node. Follow
   left pointers under tree lock as long as:

   - node->left/right is non-NULL
   - node->left/right is connected, dirty
   - node->left/right belongs to the same atom
   - scan has not reached maximum count
*/
static int scan_formatted(flush_scan * scan)
{
	int ret;
	znode *neighbor = NULL;

	assert("jmacd-1401", !reiser4_scan_finished(scan));

	do {
		znode *node = JZNODE(scan->node);

		/* Node should be connected, but if not stop the scan. */
		if (!znode_is_connected(node)) {
			scan->stop = 1;
			break;
		}

		/* Lock the tree, check-for and reference the next sibling. */
		read_lock_tree(znode_get_tree(node));

		/* It may be that a node is inserted or removed between a node
		   and its left sibling while the tree lock is released, but
		   the flush-scan count does not need to be precise. Thus, we
		   release the tree lock as soon as we get the neighboring
		   node. */
		neighbor =
		    reiser4_scanning_left(scan) ? node->left : node->right;
		if (neighbor != NULL)
			zref(neighbor);

		read_unlock_tree(znode_get_tree(node));

		/* If neighbor is NULL at the leaf level, need to check for an
		   unformatted sibling using the parent--break in any case. */
		if (neighbor == NULL)
			break;

		/* Check the condition for going left, break if it is not met.
		   This also releases (jputs) the neighbor if false. */
		if (!reiser4_scan_goto(scan, ZJNODE(neighbor)))
			break;

		/* Advance the flush_scan state to the left, repeat. */
		ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
		if (ret != 0)
			return ret;

	} while (!reiser4_scan_finished(scan));

	/* If neighbor is NULL then we reached the end of a formatted region,
	   or else the sibling is out of memory, now check for an extent to the
	   left (as long as LEAF_LEVEL). */
	if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
	    || reiser4_scan_finished(scan)) {
		scan->stop = 1;
		return 0;
	}
	/* Otherwise, calls scan_by_coord for the right(left)most item of the
	   left(right) neighbor on the parent level, then possibly continue. */

	coord_init_invalid(&scan->parent_coord, NULL);
	return scan_unformatted(scan, NULL);
}
/* This scans adjacent items of the same type and calls scan flush plugin for
   each one. Performs left(right)ward scanning starting from a (possibly)
   unformatted node. If we start from unformatted node, then we continue only
   if the next neighbor is also unformatted. When called from scan_formatted,
   we skip first iteration (to make sure that right(left)most item of the
   left(right) neighbor on the parent level is of the same type and set
   appropriate coord). */
static int scan_by_coord(flush_scan * scan)
{
	int ret = 0;
	int scan_this_coord;
	lock_handle next_lock;
	load_count next_load;
	coord_t next_coord;
	jnode *child;
	item_plugin *iplug;

	init_lh(&next_lock);
	init_load_count(&next_load);
	scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);

	/* set initial item id */
	iplug = item_plugin_by_coord(&scan->parent_coord);

	for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
		if (scan_this_coord) {
			/* Here we expect that unit is scannable. it would not
			 * be so due to race with extent->tail conversion. */
			if (iplug->f.scan == NULL) {
				scan->stop = 1;
				ret = -E_REPEAT;
				/* skip the check at the end. */
				goto race;
			}

			ret = iplug->f.scan(scan);
			if (ret != 0)
				goto exit;

			if (reiser4_scan_finished(scan))
				break;
		} else {
			/* the same race against truncate as above is possible
			 * here, it seems */

			/* NOTE-JMACD: In this case, apply the same end-of-node
			   logic but don't scan the first coordinate. */
			assert("jmacd-1231",
			       item_is_internal(&scan->parent_coord));
		}

		if (iplug->f.utmost_child == NULL
		    || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
			/* stop this coord and continue on parent level */
			ret =
			    scan_set_current(scan,
					     ZJNODE(zref
						    (scan->parent_coord.node)),
					     1, NULL);
			if (ret != 0)
				goto exit;
			break;
		}

		/* Either way, the invariant is that scan->parent_coord is set
		   to the parent of scan->node. Now get the next unit. */
		coord_dup(&next_coord, &scan->parent_coord);
		coord_sideof_unit(&next_coord, scan->direction);

		/* If off-the-end of the twig, try the next twig. */
		if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
			/* We take the write lock because we may start flushing
			 * from this coordinate. */
			ret = neighbor_in_slum(next_coord.node,
					       &next_lock, scan->direction,
					       ZNODE_WRITE_LOCK,
					       1 /* check dirty */,
					       0 /* don't go though upper
						    levels */);
			if (ret == -E_NO_NEIGHBOR) {
				scan->stop = 1;
				ret = 0;
				break;
			}

			if (ret != 0)
				goto exit;

			ret = incr_load_count_znode(&next_load,
						    next_lock.node);
			if (ret != 0)
				goto exit;

			coord_init_sideof_unit(&next_coord, next_lock.node,
					       sideof_reverse(scan->direction));
		}

		iplug = item_plugin_by_coord(&next_coord);

		/* Get the next child. */
		ret =
		    iplug->f.utmost_child(&next_coord,
					  sideof_reverse(scan->direction),
					  &child);
		if (ret != 0)
			goto exit;
		/* If the next child is not in memory, or, item_utmost_child
		   failed (due to race with unlink, most probably), stop
		   scanning. */
		if (child == NULL || IS_ERR(child)) {
			scan->stop = 1;
			break;
		}

		assert("nikita-2374", jnode_is_unformatted(child)
		       || jnode_is_znode(child));

		/* See if it is dirty, part of the same atom. */
		if (!reiser4_scan_goto(scan, child))
			break;

		/* If so, make this child current. */
		ret = scan_set_current(scan, child, 1, &next_coord);
		if (ret != 0)
			goto exit;

		/* Now continue. If formatted we release the parent lock and
		   return, then proceed. */
		if (jnode_is_znode(child))
			break;

		/* Otherwise, repeat the above loop with next_coord. */
		if (next_load.node != NULL) {
			done_lh(&scan->parent_lock);
			move_lh(&scan->parent_lock, &next_lock);
			move_load_count(&scan->parent_load, &next_load);
		}
	}

	assert("jmacd-6233",
	       reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
 exit:
 race:			/* skip the above check */
	if (jnode_is_znode(scan->node)) {
		done_lh(&scan->parent_lock);
		done_load_count(&scan->parent_load);
	}

	done_load_count(&next_load);
	done_lh(&next_lock);
	return ret;
}
/* FLUSH POS HELPERS */

/* Initialize the fields of a flush_position. */
static void pos_init(flush_pos_t *pos)
{
	memset(pos, 0, sizeof *pos);

	pos->state = POS_INVALID;
	coord_init_invalid(&pos->coord, NULL);
	init_lh(&pos->lock);
	init_load_count(&pos->load);

	reiser4_blocknr_hint_init(&pos->preceder);
}

/* The flush loop inside squalloc periodically checks pos_valid to determine
   when "enough flushing" has been performed. This will return true until one
   of the following conditions is met:

   1. the number of flush-queued nodes has reached the kernel-supplied
   "int *nr_to_flush" parameter, meaning we have flushed as many blocks as the
   kernel requested. When flushing to commit, this parameter is NULL.

   2. pos_stop() is called because squalloc discovers that the "next" node in
   the flush order is either non-existent, not dirty, or not in the same atom.
*/
static int pos_valid(flush_pos_t *pos)
{
	return pos->state != POS_INVALID;
}

/* Release any resources of a flush_position. Called when jnode_flush
   finishes. */
static void pos_done(flush_pos_t *pos)
{
	pos_stop(pos);
	reiser4_blocknr_hint_done(&pos->preceder);
	if (convert_data(pos))
		free_convert_data(pos);
}

/* Reset the point and parent. Called during flush subroutines to terminate the
   squalloc loop. */
static int pos_stop(flush_pos_t *pos)
{
	pos->state = POS_INVALID;
	done_lh(&pos->lock);
	done_load_count(&pos->load);
	coord_init_invalid(&pos->coord, NULL);

	if (pos->child) {
		jput(pos->child);
		pos->child = NULL;
	}

	return 0;
}

/* Return the flush_position's block allocator hint. */
reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t *pos)
{
	return &pos->preceder;
}

flush_queue_t *reiser4_pos_fq(flush_pos_t *pos)
{
	return pos->fq;
}

/* Make Linus happy.
   Local variables:
   c-indentation-style: "K&R"
   mode-name: "LC"
   c-basic-offset: 8
   tab-width: 8
   fill-column: 90
   LocalWords:  preceder
   End:
*/