On Tue, Nov 06, 2007 at 02:33:53AM -0800, akpm@linux-foundation.org wrote:
mmotm.git: fs/reiser4/wander.c
/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
 * reiser4/README */

/* Reiser4 Wandering Log */

/* You should read http://www.namesys.com/txn-doc.html

   That describes how filesystem operations are performed as atomic
   transactions, and how we try to arrange it so that we can write most of the
   data only once while performing the operation atomically.

   For the purposes of this code, it is enough for it to understand that it
   has been told a given block should be written either once, or twice (if
   twice then once to the wandered location and once to the real location).

   This code guarantees that those blocks that are defined to be part of an
   atom either all take effect or none of them take effect.

   The "relocate set" of nodes are submitted to write by the jnode_flush()
   routine, and the "overwrite set" is submitted by reiser4_write_logs().
   This is because with the overwrite set we seek to optimize writes, and
   with the relocate set we seek to cause disk order to correlate with the
   "parent first order" (preorder).

   reiser4_write_logs() allocates and writes wandered blocks and maintains
   additional on-disk structures of the atom as wander records (each wander
   record occupies one block) for storing of the "wandered map" (a table which
   contains a relation between wandered and real block numbers) and other
   information which might be needed at transaction recovery time.

   The wander records are unidirectionally linked into a circle: each wander
   record contains a block number of the next wander record, the last wander
   record points to the first one.

   One wander record (named "tx head" in this file) has a format which is
   different from the other wander records. The "tx head" has a reference to
   the "tx head" block of the previously committed atom. Also, "tx head"
   contains fs information (the free blocks counter, and the oid allocator
   state) which is logged in a special way.

   There are two journal control blocks, named journal header and journal
   footer, which have fixed on-disk locations. The journal header has a
   reference to the "tx head" block of the last committed atom. The journal
   footer points to the "tx head" of the last flushed atom. The atom is
   "played" when all blocks from its overwrite set are written to disk the
   second time (i.e. written to their real locations).

   NOTE: People who know reiserfs internals and its journal structure might be
   confused by these terms journal footer and journal header. There is a table
   with terms of similar semantics in reiserfs (reiser3) and reiser4:

   REISER3 TERM        |  REISER4 TERM         |  DESCRIPTION
   --------------------+-----------------------+----------------------------
   commit record       |  journal header       |  atomic write of this record
                       |                       |  ends transaction commit
   --------------------+-----------------------+----------------------------
   journal header      |  journal footer       |  atomic write of this record
                       |                       |  ends post-commit writes.
                       |                       |  After a successful write of
                       |                       |  this record, journal blocks
                       |                       |  (in reiser3) or wandered
                       |                       |  blocks/records (in reiser4)
                       |                       |  are free for re-use.
   --------------------+-----------------------+----------------------------

   The atom commit process is the following:

   1. The overwrite set is taken from atom's clean list, and its size is
      counted.

   2. The number of necessary wander records (including tx head) is
      calculated, and the wander record blocks are allocated.

   3. Allocate wandered blocks and populate wander records by wandered map.

   4. Submit write requests for wander records and wandered blocks.

   5. Wait until submitted write requests complete.

   6. Update journal header: change the pointer to the block number of just
      written tx head, submit an i/o for modified journal header block and
      wait for i/o completion.

   NOTE: The special logging for bitmap blocks and some reiser4 super block
   fields makes the processes of atom commit, flush and recovery a bit more
   complex (see comments in the source code for details).

   The atom playing process is the following:

   1. Write atom's overwrite set in-place.

   2. Wait on i/o.

   3. Update journal footer: change the pointer to the block number of the tx
      head block of the atom we are currently flushing, submit an i/o, wait on
      i/o completion.

   4. Free disk space which was used for wandered blocks and wander records.

   After the freeing of wandered blocks and wander records the journal footer
   points to an on-disk structure which might be overwritten soon. Neither the
   log writer nor the journal recovery procedure uses that pointer for
   accessing the data. When the journal recovery procedure finds the oldest
   transaction it compares the journal footer pointer value with the "prev_tx"
   pointer value in the tx head; if the values are equal, the oldest unflushed
   transaction has been found.

   NOTE on disk space leakage: information about which blocks and how many
   blocks are allocated for wandered blocks and wander records is not written
   to disk, because of the special logging for bitmaps and some super block
   counters. After a system crash reiser4 does not remember those allocations,
   thus there is no disk space leakage of this kind.
*/

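/* For orientation, a sketch of the on-disk records implied by the accessors
 * used in this file. The authoritative layouts live in wander.h; the field
 * names below are taken from the code here, while the types and the field
 * ordering are illustrative assumptions only:
 *
 *	struct journal_header {
 *		d64 last_committed_tx;	(tx head of the last committed atom)
 *	};
 *	struct journal_footer {
 *		d64 last_flushed_tx;	(tx head of the last played atom)
 *		d64 free_blocks;	(specially logged sb counters)
 *		d64 nr_files;
 *		d64 next_oid;
 *	};
 *	struct tx_header {
 *		char magic[TX_HEADER_MAGIC_SIZE];
 *		d32 total;		(number of wander records, tx head included)
 *		d64 prev_tx;		(tx head of the previously committed atom)
 *		d64 next_block;		(next wander record in the circle)
 *		d64 free_blocks;	(specially logged sb counters)
 *		d64 nr_files;
 *		d64 next_oid;
 *	};
 *	struct wander_record_header {
 *		char magic[WANDER_RECORD_MAGIC_SIZE];
 *		d32 total;		(same as in tx head)
 *		d32 serial;		(position of this record in the chain)
 *		d64 next_block;		(next wander record in the circle)
 *	};
 *	struct wander_entry {
 *		d64 original;		(real, in-place block number)
 *		d64 wandered;		(temporary wandered block number)
 *	};
 */
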
/* Special logging of reiser4 super block fields. */

/* There are some reiser4 super block fields (free block count and OID
   allocator state (number of files and next free OID)) which are logged
   separately from the super block to avoid unnecessary atom fusion.

   So the reiser4 super block need not be captured by a transaction that
   allocates/deallocates disk blocks or creates/deletes file objects.
   Moreover, the reiser4 on-disk super block is not touched when such a
   transaction is committed and flushed. Those "counters logged specially" are
   logged in "tx head" blocks and in the journal footer block.

   A step-by-step description of special logging:

   0. The per-atom information about deleted or created files and allocated or
   freed blocks is collected during the transaction. The atom's
   ->nr_objects_created and ->nr_objects_deleted are for object
   deletion/creation tracking, the numbers of allocated and freed blocks are
   calculated using atom's delete set and atom's capture list -- all new and
   relocated nodes should be on atom's clean list and should have JNODE_RELOC
   bit set.

   1. The "logged specially" reiser4 super block fields have their "committed"
   versions in the reiser4 in-memory super block. They get modified only at
   atom commit time. The atom's commit thread has exclusive access to those
   "committed" fields because the log writer implementation supports only one
   atom commit at a time (there is a per-fs "commit" mutex). At that time
   "committed" counters are modified using per-atom information collected
   during the transaction. These counters are stored on disk as a part of the
   tx head block when the atom is committed.

   2. When the atom is flushed the value of the free block counter and the OID
   allocator state get written to the journal footer block. A special journal
   procedure (journal_recover_sb_data()) takes those values from the journal
   footer and updates the reiser4 in-memory super block.

   NOTE: That means free block count and OID allocator state are logged
   separately from the reiser4 super block regardless of the fact that the
   reiser4 super block has fields to store both the free block counter and the
   OID allocator.

   Writing the whole super block at commit time would require knowing the true
   values of all its fields without changes made by not yet committed
   transactions. That would be possible by keeping a "committed" version of
   the super block, the way the reiser4 bitmap blocks have "committed" and
   "working" versions. However, another scheme was implemented which stores
   the specially logged values in the unused free space inside the transaction
   head block. In my opinion it has the advantage of not writing the whole
   super block when only part of it was modified. */

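/* A condensed sketch of that flow, using the function names defined below:

   commit: "committed" counters -> tx head block      (format_tx_head())
   flush:  "committed" counters -> journal footer     (format_journal_footer())
   mount:  journal footer -> in-memory super block    (reiser4_journal_recover_sb_data())
*/
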
#include "debug.h"
#include "dformat.h"
#include "txnmgr.h"
#include "jnode.h"
#include "znode.h"
#include "block_alloc.h"
#include "page_cache.h"
#include "wander.h"
#include "reiser4.h"
#include "super.h"
#include "vfs_ops.h"
#include "writeout.h"
#include "inode.h"
#include "entd.h"

#include <linux/types.h>
#include <linux/fs.h>		/* for struct super_block */
#include <linux/mm.h>		/* for struct page */
#include <linux/pagemap.h>
#include <linux/bio.h>		/* for struct bio */
#include <linux/blkdev.h>

static int write_jnodes_to_disk_extent(
	jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);

/* The commit_handle is a container for objects needed at atom commit time */
struct commit_handle {
	/* A pointer to atom's list of OVRWR nodes */
	struct list_head *overwrite_set;
	/* atom's overwrite set size */
	int overwrite_set_size;
	/* jnodes for wander record blocks */
	struct list_head tx_list;
	/* number of wander records */
	__u32 tx_size;
	/* 'committed' sb counters are saved here until atom is completely
	   flushed */
	__u64 free_blocks;
	__u64 nr_files;
	__u64 next_oid;
	/* A pointer to the atom which is being committed */
	txn_atom *atom;
	/* A pointer to current super block */
	struct super_block *super;
	/* The counter of modified bitmaps */
	reiser4_block_nr nr_bitmap;
};

static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
{
	memset(ch, 0, sizeof(struct commit_handle));
	INIT_LIST_HEAD(&ch->tx_list);

	ch->atom = atom;
	ch->super = reiser4_get_current_sb();
}

static void done_commit_handle(struct commit_handle *ch)
{
	assert("zam-690", list_empty(&ch->tx_list));
}

static inline int reiser4_use_write_barrier(struct super_block * s)
{
	return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
}

static void disable_write_barrier(struct super_block * s)
{
	notice("zam-1055", "%s does not support write barriers,"
	       " using synchronous write instead.", s->s_id);
	set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
}

/* fill journal header block data */
static void format_journal_header(struct commit_handle *ch)
{
	struct reiser4_super_info_data *sbinfo;
	struct journal_header *header;
	jnode *txhead;

	sbinfo = get_super_private(ch->super);
	assert("zam-479", sbinfo != NULL);
	assert("zam-480", sbinfo->journal_header != NULL);

	txhead = list_entry(ch->tx_list.next, jnode, capture_link);

	jload(sbinfo->journal_header);

	header = (struct journal_header *)jdata(sbinfo->journal_header);
	assert("zam-484", header != NULL);

	put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
		      &header->last_committed_tx);

	jrelse(sbinfo->journal_header);
}

/* fill journal footer block data */
static void format_journal_footer(struct commit_handle *ch)
{
	struct reiser4_super_info_data *sbinfo;
	struct journal_footer *footer;
	jnode *tx_head;

	sbinfo = get_super_private(ch->super);

	tx_head = list_entry(ch->tx_list.next, jnode, capture_link);

	assert("zam-493", sbinfo != NULL);
	assert("zam-494", sbinfo->journal_header != NULL);

	check_me("zam-691", jload(sbinfo->journal_footer) == 0);

	footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
	assert("zam-495", footer != NULL);

	put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
		      &footer->last_flushed_tx);
	put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);

	put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
	put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);

	jrelse(sbinfo->journal_footer);
}

/* wander record capacity depends on current block size */
static int wander_record_capacity(const struct super_block *super)
{
	return (super->s_blocksize -
		sizeof(struct wander_record_header)) /
	    sizeof(struct wander_entry);
}

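/* Worked example (illustrative sizes only, the real ones come from wander.h):
   with 4096-byte blocks, a 32-byte wander_record_header and 16-byte
   wander_entry pairs, one wander record holds (4096 - 32) / 16 = 254
   entries. */
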
/* Fill first wander record (tx head) in accordance with supplied data */
static void format_tx_head(struct commit_handle *ch)
{
	jnode *tx_head;
	jnode *next;
	struct tx_header *header;

	tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
	assert("zam-692", &ch->tx_list != &tx_head->capture_link);

	next = list_entry(tx_head->capture_link.next, jnode, capture_link);
	if (&ch->tx_list == &next->capture_link)
		next = tx_head;

	header = (struct tx_header *)jdata(tx_head);

	assert("zam-460", header != NULL);
	assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));

	memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
	memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);

	put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
	put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
		      &header->prev_tx);
	put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
	put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
	put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
	put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
}

/* prepare ordinary wander record block (fill all service fields) */
static void
format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
{
	struct wander_record_header *LRH;
	jnode *next;

	assert("zam-464", node != NULL);

	LRH = (struct wander_record_header *)jdata(node);
	next = list_entry(node->capture_link.next, jnode, capture_link);

	if (&ch->tx_list == &next->capture_link)
		next = list_entry(ch->tx_list.next, jnode, capture_link);

	assert("zam-465", LRH != NULL);
	assert("zam-463",
	       ch->super->s_blocksize > sizeof(struct wander_record_header));

	memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
	memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);

	put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
	put_unaligned(cpu_to_le32(serial), &LRH->serial);
	put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
}

/* add one wandered map entry to formatted wander record */
static void
store_entry(jnode * node, int index, const reiser4_block_nr * a,
	    const reiser4_block_nr * b)
{
	char *data;
	struct wander_entry *pairs;

	data = jdata(node);
	assert("zam-451", data != NULL);

	pairs =
	    (struct wander_entry *)(data + sizeof(struct wander_record_header));

	put_unaligned(cpu_to_le64(*a), &pairs[index].original);
	put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
}

/* currently, wander records contain only the wandered map, whose size depends
   on the overwrite set size */
static void get_tx_size(struct commit_handle *ch)
{
	assert("zam-440", ch->overwrite_set_size != 0);
	assert("zam-695", ch->tx_size == 0);

	/* count all ordinary wander records
	   (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add
	   one for tx head block */
	ch->tx_size =
	    (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
	    2;
}

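/* Worked example: with an overwrite set of 1000 blocks and a record capacity
   of 254 entries (the illustrative numbers above), tx_size is
   (1000 - 1) / 254 + 2 = 5: four ordinary wander records plus the tx head. */
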
/* A special structure for use in store_wmap_actor() for saving its state
   between calls */
struct store_wmap_params {
	jnode *cur;		/* jnode of current wander record to fill */
	int idx;		/* free element index in wander record */
	int capacity;		/* capacity */

#if REISER4_DEBUG
	struct list_head *tx_list;
#endif
};

/* an actor for use in the blocknr_set_iterator routine which populates the
   list of pre-formatted wander records with wandered map info */
static int
store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
		 const reiser4_block_nr * b, void *data)
{
	struct store_wmap_params *params = data;

	if (params->idx >= params->capacity) {
		/* a new wander record should be taken from the tx_list */
		params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
		assert("zam-454",
		       params->tx_list != &params->cur->capture_link);

		params->idx = 0;
	}

	store_entry(params->cur, params->idx, a, b);
	params->idx++;

	return 0;
}

/* This function is called after the relocate set gets written to disk, the
   overwrite set is written to wandered locations and all wander records are
   written as well. The updated journal header block contains a pointer (block
   number) to the first wander record of the just written transaction */
static int update_journal_header(struct commit_handle *ch, int use_barrier)
{
	struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
	jnode *jh = sbinfo->journal_header;
	jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
	int ret;

	format_journal_header(ch);

	ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
					  use_barrier ? WRITEOUT_BARRIER : 0);
	if (ret)
		return ret;

	/* blk_run_address_space(sbinfo->fake->i_mapping);
	 * blk_run_queues(); */

	ret = jwait_io(jh, WRITE);

	if (ret)
		return ret;

	sbinfo->last_committed_tx = *jnode_get_block(head);

	return 0;
}

/* This function is called after write-back is finished. We update journal
   footer block and free blocks which were occupied by wandered blocks and
   transaction wander records */
static int update_journal_footer(struct commit_handle *ch, int use_barrier)
{
	reiser4_super_info_data *sbinfo = get_super_private(ch->super);

	jnode *jf = sbinfo->journal_footer;

	int ret;

	format_journal_footer(ch);

	ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
					  use_barrier ? WRITEOUT_BARRIER : 0);
	if (ret)
		return ret;

	/* blk_run_address_space(sbinfo->fake->i_mapping);
	 * blk_run_queue(); */

	ret = jwait_io(jf, WRITE);
	if (ret)
		return ret;

	return 0;
}

/* free block numbers of wander records of a transaction that has already been
   written in place */
static void dealloc_tx_list(struct commit_handle *ch)
{
	while (!list_empty(&ch->tx_list)) {
		jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);

		list_del(&cur->capture_link);
		ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
		reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
				      BA_FORMATTED);

		unpin_jnode_data(cur);
		reiser4_drop_io_head(cur);
	}
}

/* An actor for use in the blocknr_set_iterator() routine which frees wandered
   blocks from atom's overwrite set. */
static int
dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
		   const reiser4_block_nr * a UNUSED_ARG,
		   const reiser4_block_nr * b, void *data UNUSED_ARG)
{
	assert("zam-499", b != NULL);
	assert("zam-500", *b != 0);
	assert("zam-501", !reiser4_blocknr_is_fake(b));

	reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
	return 0;
}

/* free wandered block locations of a transaction that has already been
   written in place */
static void dealloc_wmap(struct commit_handle *ch)
{
	assert("zam-696", ch->atom != NULL);

	blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
			     dealloc_wmap_actor, NULL, 1);
}

/* helper function for alloc_wandered_blocks() which refills the set of block
   numbers needed for wandered blocks */
static int
get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
{
	reiser4_blocknr_hint hint;
	int ret;

	reiser4_block_nr wide_len = count;

	/* FIXME-ZAM: A special policy needed for allocation of wandered blocks
	   ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
	   reserved allocation area so as to get the best qualities of fixed
	   journals? */
	reiser4_blocknr_hint_init(&hint);
	hint.block_stage = BLOCK_GRABBED;

	ret = reiser4_alloc_blocks(&hint, start, &wide_len,
				   BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
	*len = (int)wide_len;

	return ret;
}

/*
 * roll back changes made before issuing BIO in the case of IO error.
 */
static void undo_bio(struct bio *bio)
{
	int i;

	for (i = 0; i < bio->bi_vcnt; ++i) {
		struct page *pg;
		jnode *node;

		pg = bio->bi_io_vec[i].bv_page;
		end_page_writeback(pg);
		node = jprivate(pg);
		spin_lock_jnode(node);
		JF_CLR(node, JNODE_WRITEBACK);
		JF_SET(node, JNODE_DIRTY);
		spin_unlock_jnode(node);
	}
	bio_put(bio);
}

/* put overwrite set back to atom's clean list */
static void put_overwrite_set(struct commit_handle *ch)
{
	jnode *cur;

	list_for_each_entry(cur, ch->overwrite_set, capture_link)
		jrelse_tail(cur);
}

/* Count overwrite set size, grab disk space for wandered blocks allocation.
   Since we have a separate list for atom's overwrite set we just scan the
   list, counting bitmap and other non-leaf nodes whose wandered block
   allocation we have to grab space for. */
static int get_overwrite_set(struct commit_handle *ch)
{
	int ret;
	jnode *cur;
	__u64 nr_not_leaves = 0;
#if REISER4_DEBUG
	__u64 nr_formatted_leaves = 0;
	__u64 nr_unformatted_leaves = 0;
#endif

	assert("zam-697", ch->overwrite_set_size == 0);

	ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
	cur = list_entry(ch->overwrite_set->next, jnode, capture_link);

	while (ch->overwrite_set != &cur->capture_link) {
		jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);

		/* Count bitmap nodes to get correct statistics of how many
		 * blocks were cleared by the transaction commit. */
		if (jnode_get_type(cur) == JNODE_BITMAP)
			ch->nr_bitmap++;

		assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
		       || jnode_get_type(cur) == JNODE_BITMAP);

		if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
			/* we replace fake znode by another (real)
			   znode which is suggested by disk_layout
			   plugin */

			/* FIXME: it looks like fake znode should be
			   replaced by jnode supplied by
			   disk_layout. */

			struct super_block *s = reiser4_get_current_sb();
			reiser4_super_info_data *sbinfo =
			    get_current_super_private();

			if (sbinfo->df_plug->log_super) {
				jnode *sj = sbinfo->df_plug->log_super(s);

				assert("zam-593", sj != NULL);

				if (IS_ERR(sj))
					return PTR_ERR(sj);

				spin_lock_jnode(sj);
				JF_SET(sj, JNODE_OVRWR);
				insert_into_atom_ovrwr_list(ch->atom, sj);
				spin_unlock_jnode(sj);

				/* jload it as the rest of overwrite set */
				jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);

				ch->overwrite_set_size++;
			}
			spin_lock_jnode(cur);
			reiser4_uncapture_block(cur);
			jput(cur);

		} else {
			int ret;

			ch->overwrite_set_size++;
			ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
			if (ret)
				reiser4_panic("zam-783",
					      "cannot load e-flushed jnode back (ret = %d)\n",
					      ret);
		}

		/* Count non-leaves here because we have to grab disk space
		 * for wandered blocks. They were not counted as "flush
		 * reserved". Counting should be done _after_ nodes are pinned
		 * into memory by jload(). */
		if (!jnode_is_leaf(cur))
			nr_not_leaves++;
		else {
#if REISER4_DEBUG
			/* at this point @cur either has JNODE_FLUSH_RESERVED
			 * or is eflushed. Locking is not strong enough to
			 * write an assertion checking for this. */
			if (jnode_is_znode(cur))
				nr_formatted_leaves++;
			else
				nr_unformatted_leaves++;
#endif
			JF_CLR(cur, JNODE_FLUSH_RESERVED);
		}

		cur = next;
	}

	/* Grab space for writing (wandered blocks) of non-leaves found in
	 * overwrite set. */
	ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
	if (ret)
		return ret;

	/* Disk space for allocation of wandered blocks of leaf nodes already
	 * reserved as "flush reserved", move it to grabbed space counter. */
	spin_lock_atom(ch->atom);
	assert("zam-940",
	       nr_formatted_leaves + nr_unformatted_leaves <=
	       ch->atom->flush_reserved);
	flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
	spin_unlock_atom(ch->atom);

	return ch->overwrite_set_size;
}

/**
 * write_jnodes_to_disk_extent - submit write request
 * @first: first jnode of the list
 * @nr: number of jnodes on the list
 * @block_p: starting block number of the disk extent to write to
 * @fq: flush queue to attach the i/o to, or NULL
 * @flags: used to decide whether page is to get PG_reclaim flag
 *
 * Submits a write request for @nr jnodes beginning from the @first, other
 * jnodes are after the @first on the double-linked "capture" list. All jnodes
 * will be written to the disk region of @nr blocks starting with @block_p
 * block number. If @fq is not NULL it means that waiting for i/o completion
 * will be done more efficiently by using flush_queue_t objects.
 * This function is the one which writes a list of jnodes in batch mode. It
 * does all the low-level work such as bio construction and page state
 * manipulation.
 *
 * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
 * aggregated in this function instead of being left to the layers below
 *
 * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to
 * that? Why is that layer needed? Why can BIOs not be constructed here?
 */
static int write_jnodes_to_disk_extent(
	jnode *first, int nr, const reiser4_block_nr *block_p,
	flush_queue_t *fq, int flags)
{
	struct super_block *super = reiser4_get_current_sb();
	int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
	int max_blocks;
	jnode *cur = first;
	reiser4_block_nr block;

	assert("zam-571", first != NULL);
	assert("zam-572", block_p != NULL);
	assert("zam-570", nr > 0);

	block = *block_p;
	max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);

	while (nr > 0) {
		struct bio *bio;
		int nr_blocks = min(nr, max_blocks);
		int i;
		int nr_used;

		bio = bio_alloc(GFP_NOIO, nr_blocks);
		if (!bio)
			return RETERR(-ENOMEM);

		bio->bi_bdev = super->s_bdev;
		bio->bi_sector = block * (super->s_blocksize >> 9);
		for (nr_used = 0, i = 0; i < nr_blocks; i++) {
			struct page *pg;

			pg = jnode_page(cur);
			assert("zam-573", pg != NULL);

			page_cache_get(pg);

			lock_and_wait_page_writeback(pg);

			if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
				/*
				 * underlying device is satiated. Stop adding
				 * pages to the bio.
				 */
				unlock_page(pg);
				page_cache_release(pg);
				break;
			}

			spin_lock_jnode(cur);
			assert("nikita-3166",
			       pg->mapping == jnode_get_mapping(cur));
			assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
#if REISER4_DEBUG
			spin_lock(&cur->load);
			assert("nikita-3165", !jnode_is_releasable(cur));
			spin_unlock(&cur->load);
#endif
			JF_SET(cur, JNODE_WRITEBACK);
			JF_CLR(cur, JNODE_DIRTY);
			ON_DEBUG(cur->written++);
			spin_unlock_jnode(cur);

			ClearPageError(pg);
			set_page_writeback(pg);

			if (get_current_context()->entd) {
				/* this is ent thread */
				entd_context *ent = get_entd_context(super);
				struct wbq *rq, *next;

				spin_lock(&ent->guard);

				if (pg == ent->cur_request->page) {
					/*
					 * entd is called for this page. This
					 * request is not in the todo list
					 */
					ent->cur_request->written = 1;
				} else {
					/*
					 * if we have written a page for which
					 * writepage is called - move the
					 * request to another list.
					 */
					list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
						assert("", rq->magic == WBQ_MAGIC);
						if (pg == rq->page) {
							/*
							 * remove request from
							 * entd's queue, but do
							 * not wake up a thread
							 * which put this
							 * request
							 */
							list_del_init(&rq->link);
							ent->nr_todo_reqs --;
							list_add_tail(&rq->link, &ent->done_list);
							ent->nr_done_reqs ++;
							rq->written = 1;
							break;
						}
					}
				}
				spin_unlock(&ent->guard);
			}

			clear_page_dirty_for_io(pg);

			unlock_page(pg);

			cur = list_entry(cur->capture_link.next, jnode, capture_link);
			nr_used++;
		}
		if (nr_used > 0) {
			assert("nikita-3453",
			       bio->bi_size == super->s_blocksize * nr_used);
			assert("nikita-3454", bio->bi_vcnt == nr_used);

			/* Check if we are allowed to write at all */
			if (super->s_flags & MS_RDONLY)
				undo_bio(bio);
			else {
				int not_supported;

				add_fq_to_bio(fq, bio);
				bio_get(bio);
				reiser4_submit_bio(write_op, bio);
				not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
				bio_put(bio);
				if (not_supported)
					return -EOPNOTSUPP;
			}

			block += nr_used - 1;
			update_blocknr_hint_default(super, &block);
			block += 1;
		} else {
			bio_put(bio);
		}
		nr -= nr_used;
	}

	return 0;
}

/* This is a procedure which recovers contiguous sequences of disk block
   numbers in the given list of j-nodes and submits write requests on a
   per-sequence basis */
int
write_jnode_list(struct list_head *head, flush_queue_t *fq,
		 long *nr_submitted, int flags)
{
	int ret;
	jnode *beg = list_entry(head->next, jnode, capture_link);

	while (head != &beg->capture_link) {
		int nr = 1;
		jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);

		while (head != &cur->capture_link) {
			if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
				break;
			++nr;
			cur = list_entry(cur->capture_link.next, jnode, capture_link);
		}

		ret = write_jnodes_to_disk_extent(
			beg, nr, jnode_get_block(beg), fq, flags);
		if (ret)
			return ret;

		if (nr_submitted)
			*nr_submitted += nr;

		beg = cur;
	}

	return 0;
}

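/* Example (hypothetical block numbers): a list of jnodes mapped to blocks
   100, 101, 102 and 200 results in two calls to write_jnodes_to_disk_extent():
   one for the three-block extent starting at block 100 and one for the single
   block at 200. */
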
/* add given wandered mapping to atom's wandered map */
static int
add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
{
	int ret;
	blocknr_set_entry *new_bsep = NULL;
	reiser4_block_nr block;

	txn_atom *atom;

	assert("zam-568", block_p != NULL);
	block = *block_p;
	assert("zam-569", len > 0);

	while ((len--) > 0) {
		do {
			atom = get_current_atom_locked();
			assert("zam-536",
			       !reiser4_blocknr_is_fake(jnode_get_block(cur)));
			ret =
			    blocknr_set_add_pair(atom, &atom->wandered_map,
						 &new_bsep,
						 jnode_get_block(cur), &block);
		} while (ret == -E_REPEAT);

		if (ret) {
			/* deallocate blocks which were not added to wandered
			   map */
			reiser4_block_nr wide_len = len;

			reiser4_dealloc_blocks(&block, &wide_len,
					       BLOCK_NOT_COUNTED,
					       BA_FORMATTED
					       /* formatted, without defer */ );

			return ret;
		}

		spin_unlock_atom(atom);

		cur = list_entry(cur->capture_link.next, jnode, capture_link);
		++block;
	}

	return 0;
}

/* Allocate wandered blocks for the current atom's OVERWRITE SET and
   immediately submit IO for the allocated blocks. We assume that the current
   atom is in a stage when atom fusion is impossible, so working with the atom
   unlocked is safe. */
static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
{
	reiser4_block_nr block;

	int rest;
	int len;
	int ret;

	jnode *cur;

	assert("zam-534", ch->overwrite_set_size > 0);

	rest = ch->overwrite_set_size;

	cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
	while (ch->overwrite_set != &cur->capture_link) {
		assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));

		ret = get_more_wandered_blocks(rest, &block, &len);
		if (ret)
			return ret;

		rest -= len;

		ret = add_region_to_wmap(cur, len, &block);
		if (ret)
			return ret;

		ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
		if (ret)
			return ret;

		while ((len--) > 0) {
			assert("zam-604",
			       ch->overwrite_set != &cur->capture_link);
			cur = list_entry(cur->capture_link.next, jnode, capture_link);
		}
	}

	return 0;
}

/* allocate given number of nodes over the journal area and link them into a
   list, return pointer to the first jnode in the list */
static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
{
	reiser4_blocknr_hint hint;
	reiser4_block_nr allocated = 0;
	reiser4_block_nr first, len;
	jnode *cur;
	jnode *txhead;
	int ret;
	reiser4_context *ctx;
	reiser4_super_info_data *sbinfo;

	assert("zam-698", ch->tx_size > 0);
	assert("zam-699", list_empty_careful(&ch->tx_list));

	ctx = get_current_context();
	sbinfo = get_super_private(ctx->super);

	while (allocated < (unsigned)ch->tx_size) {
		len = (ch->tx_size - allocated);

		reiser4_blocknr_hint_init(&hint);

		hint.block_stage = BLOCK_GRABBED;

		/* FIXME: there should be some block allocation policy for
		   nodes which contain wander records */

		/* We assume that disk space for wandered record blocks can be
		 * taken from reserved area. */
		ret = reiser4_alloc_blocks(&hint, &first, &len,
					   BA_FORMATTED | BA_RESERVED |
					   BA_USE_DEFAULT_SEARCH_START);
		reiser4_blocknr_hint_done(&hint);

		if (ret)
			return ret;

		allocated += len;

		/* create jnodes for all wander records */
		while (len--) {
			cur = reiser4_alloc_io_head(&first);

			if (cur == NULL) {
				ret = RETERR(-ENOMEM);
				goto free_not_assigned;
			}

			ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());

			if (ret != 0) {
				jfree(cur);
				goto free_not_assigned;
			}

			pin_jnode_data(cur);

			list_add_tail(&cur->capture_link, &ch->tx_list);

			first++;
		}
	}

	{ /* format an on-disk linked list of wander records */
		int serial = 1;

		txhead = list_entry(ch->tx_list.next, jnode, capture_link);
		format_tx_head(ch);

		cur = list_entry(txhead->capture_link.next, jnode, capture_link);
		while (&ch->tx_list != &cur->capture_link) {
			format_wander_record(ch, cur, serial++);
			cur = list_entry(cur->capture_link.next, jnode, capture_link);
		}
	}

	{ /* Fill wander records with Wandered Set */
		struct store_wmap_params params;
		txn_atom *atom;

		params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);

		params.idx = 0;
		params.capacity =
		    wander_record_capacity(reiser4_get_current_sb());

		atom = get_current_atom_locked();
		blocknr_set_iterator(atom, &atom->wandered_map,
				     &store_wmap_actor, &params, 0);
		spin_unlock_atom(atom);
	}

	{ /* relse all jnodes from tx_list */
		cur = list_entry(ch->tx_list.next, jnode, capture_link);
		while (&ch->tx_list != &cur->capture_link) {
			jrelse(cur);
			cur = list_entry(cur->capture_link.next, jnode, capture_link);
		}
	}

	ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);

	return ret;

free_not_assigned:
	/* We deallocate blocks not yet assigned to jnodes on tx_list. The
	   caller takes care of invalidating the tx list */
	reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);

	return ret;
}

static int commit_tx(struct commit_handle *ch)
{
	flush_queue_t *fq;
	int barrier;
	int ret;

	/* Grab more space for wandered records. */
	ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
	if (ret)
		return ret;

	fq = get_fq_for_current_atom();
	if (IS_ERR(fq))
		return PTR_ERR(fq);

	spin_unlock_atom(fq->atom);
	do {
		ret = alloc_wandered_blocks(ch, fq);
		if (ret)
			break;
		ret = alloc_tx(ch, fq);
		if (ret)
			break;
	} while (0);

	reiser4_fq_put(fq);
	if (ret)
		return ret;
repeat_wo_barrier:
	barrier = reiser4_use_write_barrier(ch->super);
	if (!barrier) {
		ret = current_atom_finish_all_fq();
		if (ret)
			return ret;
	}
	ret = update_journal_header(ch, barrier);
	if (barrier) {
		if (ret) {
			if (ret == -EOPNOTSUPP) {
				disable_write_barrier(ch->super);
				goto repeat_wo_barrier;
			}
			return ret;
		}
		ret = current_atom_finish_all_fq();
	}
	return ret;
}

static int write_tx_back(struct commit_handle * ch)
{
	flush_queue_t *fq;
	int ret;
	int barrier;

	reiser4_post_commit_hook();
	fq = get_fq_for_current_atom();
	if (IS_ERR(fq))
		return PTR_ERR(fq);
	spin_unlock_atom(fq->atom);
	ret = write_jnode_list(
		ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
	reiser4_fq_put(fq);
	if (ret)
		return ret;
repeat_wo_barrier:
	barrier = reiser4_use_write_barrier(ch->super);
	if (!barrier) {
		ret = current_atom_finish_all_fq();
		if (ret)
			return ret;
	}
	ret = update_journal_footer(ch, barrier);
	if (barrier) {
		if (ret) {
			if (ret == -EOPNOTSUPP) {
				disable_write_barrier(ch->super);
				goto repeat_wo_barrier;
			}
			return ret;
		}
		ret = current_atom_finish_all_fq();
	}
	if (ret)
		return ret;
	reiser4_post_write_back_hook();
	return 0;
}

/* We assume that at this moment all captured blocks are marked as RELOC or
   WANDER (belong to Relocate or Overwrite set), and all nodes from the
   Relocate set are submitted to write.
 */

int reiser4_write_logs(long *nr_submitted)
{
	txn_atom *atom;
	struct super_block *super = reiser4_get_current_sb();
	reiser4_super_info_data *sbinfo = get_super_private(super);
	struct commit_handle ch;
	int ret;

	writeout_mode_enable();

	/* block allocator may add j-nodes to the clean_list */
	ret = reiser4_pre_commit_hook();
	if (ret)
		return ret;

	/* No locks are required if we take atom which stage >=
	 * ASTAGE_PRE_COMMIT */
	atom = get_current_context()->trans->atom;
	assert("zam-965", atom != NULL);

	/* relocate set is on the atom->clean_nodes list after
	 * current_atom_complete_writes() finishes. It can be safely
	 * uncaptured after commit_mutex is locked, because any atom that
	 * captures these nodes is guaranteed to commit after current one.
	 *
	 * This can only be done after reiser4_pre_commit_hook(), because it
	 * is where early flushed jnodes with CREATED bit are transferred to
	 * the overwrite list. */
	reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
	spin_lock_atom(atom);
	/* There might be waiters for the relocate nodes which we have
	 * released, wake them up. */
	reiser4_atom_send_event(atom);
	spin_unlock_atom(atom);

	if (REISER4_DEBUG) {
		int level;

		for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
			assert("nikita-3352",
			       list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
	}

	sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
	sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;

	init_commit_handle(&ch, atom);

	ch.free_blocks = sbinfo->blocks_free_committed;
	ch.nr_files = sbinfo->nr_files_committed;
	/* ZAM-FIXME-HANS: email me what the contention level is for the super
	 * lock. */
	ch.next_oid = oid_next(super);

	/* count overwrite set and place it in a separate list */
	ret = get_overwrite_set(&ch);

	if (ret <= 0) {
		/* It is possible that overwrite set is empty here, it means
		   all captured nodes are clean */
		goto up_and_ret;
	}

	/* Inform the caller about what number of dirty pages will be
	 * submitted to disk. */
	*nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;

	/* count all records needed for storing of the wandered set */
	get_tx_size(&ch);

	ret = commit_tx(&ch);
	if (ret)
		goto up_and_ret;

	spin_lock_atom(atom);
	reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
	spin_unlock_atom(atom);

	ret = write_tx_back(&ch);
	reiser4_post_write_back_hook();

up_and_ret:
	if (ret) {
		/* there could be fq attached to current atom; the only way to
		   remove them is: */
		current_atom_finish_all_fq();
	}

	/* free blocks of flushed transaction */
	dealloc_tx_list(&ch);
	dealloc_wmap(&ch);

	put_overwrite_set(&ch);

	done_commit_handle(&ch);

	writeout_mode_disable();

	return ret;
}

/* consistency checks for journal data/control blocks: header, footer, log
   records, transactions head blocks. All functions return zero on success. */

static int check_journal_header(const jnode * node UNUSED_ARG)
{
	/* FIXME: journal header has no magic field yet. */
	return 0;
}

/* wait for write completion for all jnodes from given list */
static int wait_on_jnode_list(struct list_head *head)
{
	jnode *scan;
	int ret = 0;

	list_for_each_entry(scan, head, capture_link) {
		struct page *pg = jnode_page(scan);

		if (pg) {
			if (PageWriteback(pg))
				wait_on_page_writeback(pg);

			if (PageError(pg))
				ret++;
		}
	}

	return ret;
}

static int check_journal_footer(const jnode * node UNUSED_ARG)
{
	/* FIXME: journal footer has no magic field yet. */
	return 0;
}

static int check_tx_head(const jnode * node)
{
	struct tx_header *header = (struct tx_header *)jdata(node);

	if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
		warning("zam-627", "tx head at block %s corrupted\n",
			sprint_address(jnode_get_block(node)));
		return RETERR(-EIO);
	}

	return 0;
}

static int check_wander_record(const jnode * node)
{
	struct wander_record_header *RH =
	    (struct wander_record_header *)jdata(node);

	if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
	    0) {
		warning("zam-628", "wander record at block %s corrupted\n",
			sprint_address(jnode_get_block(node)));
		return RETERR(-EIO);
	}

	return 0;
}

/* fill the commit_handle structure with everything needed for
   update_journal_footer() */
static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
{
	struct tx_header *TXH;
	int ret;

	ret = jload(tx_head);
	if (ret)
		return ret;

	TXH = (struct tx_header *)jdata(tx_head);

	ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
	ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
	ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));

	jrelse(tx_head);

	list_add(&tx_head->capture_link, &ch->tx_list);

	return 0;
}

/* replay one transaction: restore and write overwrite set in place */
static int replay_transaction(const struct super_block *s,
			      jnode * tx_head,
			      const reiser4_block_nr * log_rec_block_p,
			      const reiser4_block_nr * end_block,
			      unsigned int nr_wander_records)
{
	reiser4_block_nr log_rec_block = *log_rec_block_p;
	struct commit_handle ch;
	LIST_HEAD(overwrite_set);
	jnode *log;
	int ret;

	init_commit_handle(&ch, NULL);
	ch.overwrite_set = &overwrite_set;

	restore_commit_handle(&ch, tx_head);

	while (log_rec_block != *end_block) {
		struct wander_record_header *header;
		struct wander_entry *entry;

		int i;

		if (nr_wander_records == 0) {
			warning("zam-631",
				"number of wander records in the linked list"
				" greater than number stored in tx head.\n");
			ret = RETERR(-EIO);
			goto free_ow_set;
		}

		log = reiser4_alloc_io_head(&log_rec_block);
		if (log == NULL)
			return RETERR(-ENOMEM);

		ret = jload(log);
		if (ret < 0) {
			reiser4_drop_io_head(log);
			return ret;
		}

		ret = check_wander_record(log);
		if (ret) {
			jrelse(log);
			reiser4_drop_io_head(log);
			return ret;
		}

		header = (struct wander_record_header *)jdata(log);
		log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));

		entry = (struct wander_entry *)(header + 1);

		/* restore overwrite set from wander record content */
		for (i = 0; i < wander_record_capacity(s); i++) {
			reiser4_block_nr block;
			jnode *node;

			block = le64_to_cpu(get_unaligned(&entry->wandered));
			if (block == 0)
				break;

			node = reiser4_alloc_io_head(&block);
			if (node == NULL) {
				ret = RETERR(-ENOMEM);
				/*
				 * FIXME-VS:???
				 */
				jrelse(log);
				reiser4_drop_io_head(log);
				goto free_ow_set;
			}

			ret = jload(node);

			if (ret < 0) {
				reiser4_drop_io_head(node);
				/*
				 * FIXME-VS:???
				 */
				jrelse(log);
				reiser4_drop_io_head(log);
				goto free_ow_set;
			}

			block = le64_to_cpu(get_unaligned(&entry->original));

			assert("zam-603", block != 0);

			jnode_set_block(node, &block);

			list_add_tail(&node->capture_link, ch.overwrite_set);

			++entry;
		}

		jrelse(log);
		reiser4_drop_io_head(log);

		--nr_wander_records;
	}

	if (nr_wander_records != 0) {
		warning("zam-632", "number of wander records in the linked list"
			" less than number stored in tx head.\n");
		ret = RETERR(-EIO);
		goto free_ow_set;
	}

	{ /* write wandered set in place */
		write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
		ret = wait_on_jnode_list(ch.overwrite_set);

		if (ret) {
			ret = RETERR(-EIO);
			goto free_ow_set;
		}
	}

	ret = update_journal_footer(&ch, 0);

free_ow_set:

	while (!list_empty(ch.overwrite_set)) {
		jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);

		list_del_init(&cur->capture_link);
		jrelse(cur);
		reiser4_drop_io_head(cur);
	}

	list_del_init(&tx_head->capture_link);

	done_commit_handle(&ch);

	return ret;
}

/* find oldest committed and not played transaction and play it. The
 * transaction was committed and the journal header block was updated, but the
 * process of writing the atom's overwrite set in place and updating the
 * journal footer block was not completed. This function completes the process
 * by recovering the atom's overwrite set from its wandered locations, writing
 * it in place, and updating the journal footer. */
static int replay_oldest_transaction(struct super_block *s)
{
	reiser4_super_info_data *sbinfo = get_super_private(s);
	jnode *jf = sbinfo->journal_footer;
	unsigned int total;
	struct journal_footer *F;
	struct tx_header *T;

	reiser4_block_nr prev_tx;
	reiser4_block_nr last_flushed_tx;
	reiser4_block_nr log_rec_block = 0;

	jnode *tx_head;

	int ret;

	if ((ret = jload(jf)) < 0)
		return ret;

	F = (struct journal_footer *)jdata(jf);

	last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));

	jrelse(jf);

	if (sbinfo->last_committed_tx == last_flushed_tx) {
		/* all transactions are replayed */
		return 0;
	}

	prev_tx = sbinfo->last_committed_tx;

	/* searching for oldest not flushed transaction */
	while (1) {
		tx_head = reiser4_alloc_io_head(&prev_tx);
		if (!tx_head)
			return RETERR(-ENOMEM);

		ret = jload(tx_head);
		if (ret < 0) {
			reiser4_drop_io_head(tx_head);
			return ret;
		}

		ret = check_tx_head(tx_head);
		if (ret) {
			jrelse(tx_head);
			reiser4_drop_io_head(tx_head);
			return ret;
		}

		T = (struct tx_header *)jdata(tx_head);

		prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));

		if (prev_tx == last_flushed_tx)
			break;

		jrelse(tx_head);
		reiser4_drop_io_head(tx_head);
	}

	total = le32_to_cpu(get_unaligned(&T->total));
	log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));

	pin_jnode_data(tx_head);
	jrelse(tx_head);

	ret =
	    replay_transaction(s, tx_head, &log_rec_block,
			       jnode_get_block(tx_head), total - 1);

	unpin_jnode_data(tx_head);
	reiser4_drop_io_head(tx_head);

	if (ret)
		return ret;
	return -E_REPEAT;
}

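/* Example (hypothetical tx head blocks): if the journal header points to tx
   head C and the journal footer to tx head A, and the prev_tx chain is
   C -> B -> A, the loop above stops at B (its prev_tx equals the last flushed
   tx), B is replayed and the journal footer is advanced; the caller repeats
   on -E_REPEAT until C has been replayed as well. */
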
/* The current reiser4 journal implementation was optimized not to capture the
   super block when only certain super block fields are modified. Currently,
   that set is (<free block count>, <OID allocator>). These fields are logged
   in a special way which includes storing them in each transaction head block
   at atom commit time and writing that information to the journal footer
   block at atom flush time. For getting the info from the journal footer
   block into the in-memory super block there is a special function
   reiser4_journal_recover_sb_data() which should be called after the disk
   format plugin re-reads the super block after journal replaying.
*/

/* get the information from the journal footer into the in-memory super
   block */
int reiser4_journal_recover_sb_data(struct super_block *s)
{
	reiser4_super_info_data *sbinfo = get_super_private(s);
	struct journal_footer *jf;
	int ret;

	assert("zam-673", sbinfo->journal_footer != NULL);

	ret = jload(sbinfo->journal_footer);
	if (ret != 0)
		return ret;

	ret = check_journal_footer(sbinfo->journal_footer);
	if (ret != 0)
		goto out;

	jf = (struct journal_footer *)jdata(sbinfo->journal_footer);

	/* was there at least one flushed transaction? */
	if (jf->last_flushed_tx) {

		/* restore free block counter logged in this transaction */
		reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));

		/* restore oid allocator state */
		oid_init_allocator(s,
				   le64_to_cpu(get_unaligned(&jf->nr_files)),
				   le64_to_cpu(get_unaligned(&jf->next_oid)));
	}
out:
	jrelse(sbinfo->journal_footer);
	return ret;
}

/* reiser4 journal replay procedure */
int reiser4_journal_replay(struct super_block *s)
{
	reiser4_super_info_data *sbinfo = get_super_private(s);
	jnode *jh, *jf;
	struct journal_header *header;
	int nr_tx_replayed = 0;
	int ret;

	assert("zam-582", sbinfo != NULL);

	jh = sbinfo->journal_header;
	jf = sbinfo->journal_footer;

	if (!jh || !jf) {
		/* it is possible that disk layout does not support journal
		   structures, we just warn about this */
		warning("zam-583",
			"journal control blocks were not loaded by disk layout plugin. "
			"journal replaying is not possible.\n");
		return 0;
	}

	/* Take free block count from journal footer block. The free block
	   counter value corresponds to the last flushed transaction state */
	ret = jload(jf);
	if (ret < 0)
		return ret;

	ret = check_journal_footer(jf);
	if (ret) {
		jrelse(jf);
		return ret;
	}

	jrelse(jf);

	/* store last committed transaction info in reiser4 in-memory super
	   block */
	ret = jload(jh);
	if (ret < 0)
		return ret;

	ret = check_journal_header(jh);
	if (ret) {
		jrelse(jh);
		return ret;
	}

	header = (struct journal_header *)jdata(jh);
	sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));

	jrelse(jh);

	/* replay committed transactions */
	while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
		nr_tx_replayed++;

	return ret;
}

/* load journal control block (either journal header or journal footer block) */
static int
load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
{
	int ret;

	*node = reiser4_alloc_io_head(block);
	if (!(*node))
		return RETERR(-ENOMEM);

	ret = jload(*node);

	if (ret) {
		reiser4_drop_io_head(*node);
		*node = NULL;
		return ret;
	}

	pin_jnode_data(*node);
	jrelse(*node);

	return 0;
}

/* unload journal header or footer and free jnode */
static void unload_journal_control_block(jnode ** node)
{
	if (*node) {
		unpin_jnode_data(*node);
		reiser4_drop_io_head(*node);
		*node = NULL;
	}
}

/* release journal control blocks */
void reiser4_done_journal_info(struct super_block *s)
{
	reiser4_super_info_data *sbinfo = get_super_private(s);

	assert("zam-476", sbinfo != NULL);

	unload_journal_control_block(&sbinfo->journal_header);
	unload_journal_control_block(&sbinfo->journal_footer);
	rcu_barrier();
}

/* load journal control blocks */
int reiser4_init_journal_info(struct super_block *s)
{
	reiser4_super_info_data *sbinfo = get_super_private(s);
	journal_location *loc;
	int ret;

	loc = &sbinfo->jloc;

	assert("zam-651", loc != NULL);
	assert("zam-652", loc->header != 0);
	assert("zam-653", loc->footer != 0);

	ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);

	if (ret)
		return ret;

	ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);

	if (ret) {
		unload_journal_control_block(&sbinfo->journal_header);
	}

	return ret;
}

/* Make Linus happy.
   Local variables:
   c-indentation-style: "K&R"
   mode-name: "LC"
   c-basic-offset: 8
   tab-width: 8
   fill-column: 80
   End:
*/