revert-mm-fix-blkdev-size-calculation-in-generic_write_checks
[linux-2.6/linux-trees-mm.git] / fs / reiser4 / wander.c
blob6d1d1d97823f84a7c41bb6c720c8d42f54e949d0
1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
2 * reiser4/README */
4 /* Reiser4 Wandering Log */
6 /* You should read http://www.namesys.com/txn-doc.html
8 That describes how filesystem operations are performed as atomic
9 transactions, and how we try to arrange it so that we can write most of the
10 data only once while performing the operation atomically.
12 For the purposes of this code, it is enough for it to understand that it
13 has been told a given block should be written either once, or twice (if
14 twice then once to the wandered location and once to the real location).
16 This code guarantees that those blocks that are defined to be part of an
17 atom either all take effect or none of them take effect.
19 Relocate set nodes are submitted to write by the jnode_flush() routine, and
20 the overwrite set is submitted by reiser4_write_log(). This is because with
21 the overwrite set we seek to optimize writes, and with the relocate set we
22 seek to cause disk order to correlate with the parent first pre-order.
24 reiser4_write_log() allocates and writes wandered blocks and maintains
25 additional on-disk structures of the atom as wander records (each wander
26 record occupies one block) for storing of the "wandered map" (a table which
27 contains a relation between wandered and real block numbers) and other
28 information which might be needed at transaction recovery time.
30 The wander records are unidirectionally linked into a circle: each wander
31 record contains a block number of the next wander record, the last wander
32 record points to the first one.
34 One wander record (named "tx head" in this file) has a format which is
35 different from the other wander records. The "tx head" has a reference to the
36 "tx head" block of the previously committed atom. Also, "tx head" contains
37 fs information (the free blocks counter, and the oid allocator state) which
   is logged in a special way.
40 There are two journal control blocks, named journal header and journal
41 footer which have fixed on-disk locations. The journal header has a
42 reference to the "tx head" block of the last committed atom. The journal
43 footer points to the "tx head" of the last flushed atom. The atom is
44 "played" when all blocks from its overwrite set are written to disk the
45 second time (i.e. written to their real locations).
47 NOTE: People who know reiserfs internals and its journal structure might be
48 confused with these terms journal footer and journal header. There is a table
49 with terms of similar semantics in reiserfs (reiser3) and reiser4:
51 REISER3 TERM | REISER4 TERM | DESCRIPTION
52 --------------------+-----------------------+----------------------------
53 commit record | journal header | atomic write of this record
54 | | ends transaction commit
55 --------------------+-----------------------+----------------------------
56 journal header | journal footer | atomic write of this record
57 | | ends post-commit writes.
58 | | After successful
59 | | writing of this journal
60 | | blocks (in reiser3) or
61 | | wandered blocks/records are
62 | | free for re-use.
63 --------------------+-----------------------+----------------------------
65 The atom commit process is the following:
67 1. The overwrite set is taken from atom's clean list, and its size is
68 counted.
70 2. The number of necessary wander records (including tx head) is calculated,
71 and the wander record blocks are allocated.
73 3. Allocate wandered blocks and populate wander records by wandered map.
75 4. submit write requests for wander records and wandered blocks.
77 5. wait until submitted write requests complete.
79 6. update journal header: change the pointer to the block number of just
80 written tx head, submit an i/o for modified journal header block and wait
81 for i/o completion.
83 NOTE: The special logging for bitmap blocks and some reiser4 super block
84 fields makes processes of atom commit, flush and recovering a bit more
85 complex (see comments in the source code for details).
87 The atom playing process is the following:
89 1. Write atom's overwrite set in-place.
91 2. Wait on i/o.
93 3. Update journal footer: change the pointer to block number of tx head
94 block of the atom we currently flushing, submit an i/o, wait on i/o
95 completion.
97 4. Free disk space which was used for wandered blocks and wander records.
99 After the freeing of wandered blocks and wander records we have that journal
100 footer points to the on-disk structure which might be overwritten soon.
101 Neither the log writer nor the journal recovery procedure use that pointer
102 for accessing the data. When the journal recovery procedure finds the oldest
103 transaction it compares the journal footer pointer value with the "prev_tx"
104 pointer value in tx head, if values are equal the oldest not flushed
105 transaction is found.
107 NOTE on disk space leakage: the information about of what blocks and how many
108 blocks are allocated for wandered blocks, wandered records is not written to
109 the disk because of special logging for bitmaps and some super blocks
   counters. After a system crash reiser4 does not remember those
   allocations, thus there is no such kind of disk space leakage.
114 /* Special logging of reiser4 super block fields. */
116 /* There are some reiser4 super block fields (free block count and OID allocator
117 state (number of files and next free OID) which are logged separately from
118 super block to avoid unnecessary atom fusion.
120 So, the reiser4 super block can be not captured by a transaction with
121 allocates/deallocates disk blocks or create/delete file objects. Moreover,
122 the reiser4 on-disk super block is not touched when such a transaction is
123 committed and flushed. Those "counters logged specially" are logged in "tx
124 head" blocks and in the journal footer block.
126 A step-by-step description of special logging:
128 0. The per-atom information about deleted or created files and allocated or
129 freed blocks is collected during the transaction. The atom's
130 ->nr_objects_created and ->nr_objects_deleted are for object
131 deletion/creation tracking, the numbers of allocated and freed blocks are
132 calculated using atom's delete set and atom's capture list -- all new and
133 relocated nodes should be on atom's clean list and should have JNODE_RELOC
134 bit set.
136 1. The "logged specially" reiser4 super block fields have their "committed"
137 versions in the reiser4 in-memory super block. They get modified only at
138 atom commit time. The atom's commit thread has an exclusive access to those
139 "committed" fields because the log writer implementation supports only one
140 atom commit a time (there is a per-fs "commit" mutex). At
141 that time "committed" counters are modified using per-atom information
142 collected during the transaction. These counters are stored on disk as a
143 part of tx head block when atom is committed.
145 2. When the atom is flushed the value of the free block counter and the OID
146 allocator state get written to the journal footer block. A special journal
147 procedure (journal_recover_sb_data()) takes those values from the journal
148 footer and updates the reiser4 in-memory super block.
150 NOTE: That means free block count and OID allocator state are logged
151 separately from the reiser4 super block regardless of the fact that the
152 reiser4 super block has fields to store both the free block counter and the
153 OID allocator.
155 Writing the whole super block at commit time requires knowing true values of
156 all its fields without changes made by not yet committed transactions. It is
157 possible by having their "committed" version of the super block like the
158 reiser4 bitmap blocks have "committed" and "working" versions. However,
159 another scheme was implemented which stores special logged values in the
160 unused free space inside transaction head block. In my opinion it has an
161 advantage of not writing whole super block when only part of it was
162 modified. */
164 #include "debug.h"
165 #include "dformat.h"
166 #include "txnmgr.h"
167 #include "jnode.h"
168 #include "znode.h"
169 #include "block_alloc.h"
170 #include "page_cache.h"
171 #include "wander.h"
172 #include "reiser4.h"
173 #include "super.h"
174 #include "vfs_ops.h"
175 #include "writeout.h"
176 #include "inode.h"
177 #include "entd.h"
179 #include <linux/types.h>
180 #include <linux/fs.h> /* for struct super_block */
181 #include <linux/mm.h> /* for struct page */
182 #include <linux/pagemap.h>
183 #include <linux/bio.h> /* for struct bio */
184 #include <linux/blkdev.h>
186 static int write_jnodes_to_disk_extent(
187 jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
189 /* The commit_handle is a container for objects needed at atom commit time */
190 struct commit_handle {
191 /* A pointer to atom's list of OVRWR nodes */
192 struct list_head *overwrite_set;
193 /* atom's overwrite set size */
194 int overwrite_set_size;
195 /* jnodes for wander record blocks */
196 struct list_head tx_list;
197 /* number of wander records */
198 __u32 tx_size;
199 /* 'committed' sb counters are saved here until atom is completely
200 flushed */
201 __u64 free_blocks;
202 __u64 nr_files;
203 __u64 next_oid;
204 /* A pointer to the atom which is being committed */
205 txn_atom *atom;
206 /* A pointer to current super block */
207 struct super_block *super;
208 /* The counter of modified bitmaps */
209 reiser4_block_nr nr_bitmap;
212 static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
214 memset(ch, 0, sizeof(struct commit_handle));
215 INIT_LIST_HEAD(&ch->tx_list);
217 ch->atom = atom;
218 ch->super = reiser4_get_current_sb();
221 static void done_commit_handle(struct commit_handle *ch)
223 assert("zam-690", list_empty(&ch->tx_list));
226 static inline int reiser4_use_write_barrier(struct super_block * s)
228 return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
231 static void disable_write_barrier(struct super_block * s)
233 notice("zam-1055", "%s does not support write barriers,"
234 " using synchronous write instead.", s->s_id);
235 set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
238 /* fill journal header block data */
239 static void format_journal_header(struct commit_handle *ch)
241 struct reiser4_super_info_data *sbinfo;
242 struct journal_header *header;
243 jnode *txhead;
245 sbinfo = get_super_private(ch->super);
246 assert("zam-479", sbinfo != NULL);
247 assert("zam-480", sbinfo->journal_header != NULL);
249 txhead = list_entry(ch->tx_list.next, jnode, capture_link);
251 jload(sbinfo->journal_header);
253 header = (struct journal_header *)jdata(sbinfo->journal_header);
254 assert("zam-484", header != NULL);
256 put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
257 &header->last_committed_tx);
259 jrelse(sbinfo->journal_header);
262 /* fill journal footer block data */
263 static void format_journal_footer(struct commit_handle *ch)
265 struct reiser4_super_info_data *sbinfo;
266 struct journal_footer *footer;
267 jnode *tx_head;
269 sbinfo = get_super_private(ch->super);
271 tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
273 assert("zam-493", sbinfo != NULL);
274 assert("zam-494", sbinfo->journal_header != NULL);
276 check_me("zam-691", jload(sbinfo->journal_footer) == 0);
278 footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
279 assert("zam-495", footer != NULL);
281 put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
282 &footer->last_flushed_tx);
283 put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
285 put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
286 put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
288 jrelse(sbinfo->journal_footer);
291 /* wander record capacity depends on current block size */
292 static int wander_record_capacity(const struct super_block *super)
294 return (super->s_blocksize -
295 sizeof(struct wander_record_header)) /
296 sizeof(struct wander_entry);
299 /* Fill first wander record (tx head) in accordance with supplied given data */
300 static void format_tx_head(struct commit_handle *ch)
302 jnode *tx_head;
303 jnode *next;
304 struct tx_header *header;
306 tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
307 assert("zam-692", &ch->tx_list != &tx_head->capture_link);
309 next = list_entry(tx_head->capture_link.next, jnode, capture_link);
310 if (&ch->tx_list == &next->capture_link)
311 next = tx_head;
313 header = (struct tx_header *)jdata(tx_head);
315 assert("zam-460", header != NULL);
316 assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
318 memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
319 memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
321 put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
322 put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
323 &header->prev_tx);
324 put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
325 put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
326 put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
327 put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
330 /* prepare ordinary wander record block (fill all service fields) */
331 static void
332 format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
334 struct wander_record_header *LRH;
335 jnode *next;
337 assert("zam-464", node != NULL);
339 LRH = (struct wander_record_header *)jdata(node);
340 next = list_entry(node->capture_link.next, jnode, capture_link);
342 if (&ch->tx_list == &next->capture_link)
343 next = list_entry(ch->tx_list.next, jnode, capture_link);
345 assert("zam-465", LRH != NULL);
346 assert("zam-463",
347 ch->super->s_blocksize > sizeof(struct wander_record_header));
349 memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
350 memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
352 put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
353 put_unaligned(cpu_to_le32(serial), &LRH->serial);
354 put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
357 /* add one wandered map entry to formatted wander record */
358 static void
359 store_entry(jnode * node, int index, const reiser4_block_nr * a,
360 const reiser4_block_nr * b)
362 char *data;
363 struct wander_entry *pairs;
365 data = jdata(node);
366 assert("zam-451", data != NULL);
368 pairs =
369 (struct wander_entry *)(data + sizeof(struct wander_record_header));
371 put_unaligned(cpu_to_le64(*a), &pairs[index].original);
372 put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
375 /* currently, wander records contains contain only wandered map, which depend on
376 overwrite set size */
377 static void get_tx_size(struct commit_handle *ch)
379 assert("zam-440", ch->overwrite_set_size != 0);
380 assert("zam-695", ch->tx_size == 0);
382 /* count all ordinary wander records
383 (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
384 for tx head block */
385 ch->tx_size =
386 (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
390 /* A special structure for using in store_wmap_actor() for saving its state
391 between calls */
392 struct store_wmap_params {
393 jnode *cur; /* jnode of current wander record to fill */
394 int idx; /* free element index in wander record */
395 int capacity; /* capacity */
397 #if REISER4_DEBUG
398 struct list_head *tx_list;
399 #endif
402 /* an actor for use in blocknr_set_iterator routine which populates the list
403 of pre-formatted wander records by wandered map info */
404 static int
405 store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
406 const reiser4_block_nr * b, void *data)
408 struct store_wmap_params *params = data;
410 if (params->idx >= params->capacity) {
411 /* a new wander record should be taken from the tx_list */
412 params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
413 assert("zam-454",
414 params->tx_list != &params->cur->capture_link);
416 params->idx = 0;
419 store_entry(params->cur, params->idx, a, b);
420 params->idx++;
422 return 0;
425 /* This function is called after Relocate set gets written to disk, Overwrite
426 set is written to wandered locations and all wander records are written
427 also. Updated journal header blocks contains a pointer (block number) to
428 first wander record of the just written transaction */
429 static int update_journal_header(struct commit_handle *ch, int use_barrier)
431 struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
432 jnode *jh = sbinfo->journal_header;
433 jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
434 int ret;
436 format_journal_header(ch);
438 ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
439 use_barrier ? WRITEOUT_BARRIER : 0);
440 if (ret)
441 return ret;
443 // blk_run_address_space(sbinfo->fake->i_mapping);
444 /*blk_run_queues(); */
446 ret = jwait_io(jh, WRITE);
448 if (ret)
449 return ret;
451 sbinfo->last_committed_tx = *jnode_get_block(head);
453 return 0;
456 /* This function is called after write-back is finished. We update journal
457 footer block and free blocks which were occupied by wandered blocks and
458 transaction wander records */
459 static int update_journal_footer(struct commit_handle *ch, int use_barrier)
461 reiser4_super_info_data *sbinfo = get_super_private(ch->super);
463 jnode *jf = sbinfo->journal_footer;
465 int ret;
467 format_journal_footer(ch);
469 ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
470 use_barrier ? WRITEOUT_BARRIER : 0);
471 if (ret)
472 return ret;
474 // blk_run_address_space(sbinfo->fake->i_mapping);
475 /*blk_run_queue(); */
477 ret = jwait_io(jf, WRITE);
478 if (ret)
479 return ret;
481 return 0;
484 /* free block numbers of wander records of already written in place transaction */
485 static void dealloc_tx_list(struct commit_handle *ch)
487 while (!list_empty(&ch->tx_list)) {
488 jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
489 list_del(&cur->capture_link);
490 ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
491 reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
492 BA_FORMATTED);
494 unpin_jnode_data(cur);
495 reiser4_drop_io_head(cur);
499 /* An actor for use in block_nr_iterator() routine which frees wandered blocks
500 from atom's overwrite set. */
501 static int
502 dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
503 const reiser4_block_nr * a UNUSED_ARG,
504 const reiser4_block_nr * b, void *data UNUSED_ARG)
507 assert("zam-499", b != NULL);
508 assert("zam-500", *b != 0);
509 assert("zam-501", !reiser4_blocknr_is_fake(b));
511 reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
512 return 0;
515 /* free wandered block locations of already written in place transaction */
516 static void dealloc_wmap(struct commit_handle *ch)
518 assert("zam-696", ch->atom != NULL);
520 blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
521 dealloc_wmap_actor, NULL, 1);
524 /* helper function for alloc wandered blocks, which refill set of block
525 numbers needed for wandered blocks */
526 static int
527 get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
529 reiser4_blocknr_hint hint;
530 int ret;
532 reiser4_block_nr wide_len = count;
534 /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
535 ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
536 reserved allocation area so as to get the best qualities of fixed
537 journals? */
538 reiser4_blocknr_hint_init(&hint);
539 hint.block_stage = BLOCK_GRABBED;
541 ret = reiser4_alloc_blocks(&hint, start, &wide_len,
542 BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
543 *len = (int)wide_len;
545 return ret;
549 * roll back changes made before issuing BIO in the case of IO error.
551 static void undo_bio(struct bio *bio)
553 int i;
555 for (i = 0; i < bio->bi_vcnt; ++i) {
556 struct page *pg;
557 jnode *node;
559 pg = bio->bi_io_vec[i].bv_page;
560 end_page_writeback(pg);
561 node = jprivate(pg);
562 spin_lock_jnode(node);
563 JF_CLR(node, JNODE_WRITEBACK);
564 JF_SET(node, JNODE_DIRTY);
565 spin_unlock_jnode(node);
567 bio_put(bio);
570 /* put overwrite set back to atom's clean list */
571 static void put_overwrite_set(struct commit_handle *ch)
573 jnode *cur;
575 list_for_each_entry(cur, ch->overwrite_set, capture_link)
576 jrelse_tail(cur);
579 /* Count overwrite set size, grab disk space for wandered blocks allocation.
580 Since we have a separate list for atom's overwrite set we just scan the list,
581 count bitmap and other not leaf nodes which wandered blocks allocation we
582 have to grab space for. */
583 static int get_overwrite_set(struct commit_handle *ch)
585 int ret;
586 jnode *cur;
587 __u64 nr_not_leaves = 0;
588 #if REISER4_DEBUG
589 __u64 nr_formatted_leaves = 0;
590 __u64 nr_unformatted_leaves = 0;
591 #endif
593 assert("zam-697", ch->overwrite_set_size == 0);
595 ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
596 cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
598 while (ch->overwrite_set != &cur->capture_link) {
599 jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
601 /* Count bitmap locks for getting correct statistics what number
602 * of blocks were cleared by the transaction commit. */
603 if (jnode_get_type(cur) == JNODE_BITMAP)
604 ch->nr_bitmap++;
606 assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
607 || jnode_get_type(cur) == JNODE_BITMAP);
609 if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
610 /* we replace fake znode by another (real)
611 znode which is suggested by disk_layout
612 plugin */
614 /* FIXME: it looks like fake znode should be
615 replaced by jnode supplied by
616 disk_layout. */
618 struct super_block *s = reiser4_get_current_sb();
619 reiser4_super_info_data *sbinfo =
620 get_current_super_private();
622 if (sbinfo->df_plug->log_super) {
623 jnode *sj = sbinfo->df_plug->log_super(s);
625 assert("zam-593", sj != NULL);
627 if (IS_ERR(sj))
628 return PTR_ERR(sj);
630 spin_lock_jnode(sj);
631 JF_SET(sj, JNODE_OVRWR);
632 insert_into_atom_ovrwr_list(ch->atom, sj);
633 spin_unlock_jnode(sj);
635 /* jload it as the rest of overwrite set */
636 jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
638 ch->overwrite_set_size++;
640 spin_lock_jnode(cur);
641 reiser4_uncapture_block(cur);
642 jput(cur);
644 } else {
645 int ret;
646 ch->overwrite_set_size++;
647 ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
648 if (ret)
649 reiser4_panic("zam-783",
650 "cannot load e-flushed jnode back (ret = %d)\n",
651 ret);
654 /* Count not leaves here because we have to grab disk space
655 * for wandered blocks. They were not counted as "flush
656 * reserved". Counting should be done _after_ nodes are pinned
657 * into memory by jload(). */
658 if (!jnode_is_leaf(cur))
659 nr_not_leaves++;
660 else {
661 #if REISER4_DEBUG
662 /* at this point @cur either has JNODE_FLUSH_RESERVED
663 * or is eflushed. Locking is not strong enough to
664 * write an assertion checking for this. */
665 if (jnode_is_znode(cur))
666 nr_formatted_leaves++;
667 else
668 nr_unformatted_leaves++;
669 #endif
670 JF_CLR(cur, JNODE_FLUSH_RESERVED);
673 cur = next;
676 /* Grab space for writing (wandered blocks) of not leaves found in
677 * overwrite set. */
678 ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
679 if (ret)
680 return ret;
682 /* Disk space for allocation of wandered blocks of leaf nodes already
683 * reserved as "flush reserved", move it to grabbed space counter. */
684 spin_lock_atom(ch->atom);
685 assert("zam-940",
686 nr_formatted_leaves + nr_unformatted_leaves <=
687 ch->atom->flush_reserved);
688 flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
689 spin_unlock_atom(ch->atom);
691 return ch->overwrite_set_size;
695 * write_jnodes_to_disk_extent - submit write request
696 * @head:
697 * @first: first jnode of the list
698 * @nr: number of jnodes on the list
699 * @block_p:
700 * @fq:
701 * @flags: used to decide whether page is to get PG_reclaim flag
703 * Submits a write request for @nr jnodes beginning from the @first, other
704 * jnodes are after the @first on the double-linked "capture" list. All jnodes
705 * will be written to the disk region of @nr blocks starting with @block_p block
706 * number. If @fq is not NULL it means that waiting for i/o completion will be
707 * done more efficiently by using flush_queue_t objects.
708 * This function is the one which writes list of jnodes in batch mode. It does
709 * all low-level things as bio construction and page states manipulation.
711 * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
712 * aggregated in this function instead of being left to the layers below
714 * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
715 * Why that layer needed? Why BIOs cannot be constructed here?
717 static int write_jnodes_to_disk_extent(
718 jnode *first, int nr, const reiser4_block_nr *block_p,
719 flush_queue_t *fq, int flags)
721 struct super_block *super = reiser4_get_current_sb();
722 int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
723 int max_blocks;
724 jnode *cur = first;
725 reiser4_block_nr block;
727 assert("zam-571", first != NULL);
728 assert("zam-572", block_p != NULL);
729 assert("zam-570", nr > 0);
731 block = *block_p;
732 max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
734 while (nr > 0) {
735 struct bio *bio;
736 int nr_blocks = min(nr, max_blocks);
737 int i;
738 int nr_used;
740 bio = bio_alloc(GFP_NOIO, nr_blocks);
741 if (!bio)
742 return RETERR(-ENOMEM);
744 bio->bi_bdev = super->s_bdev;
745 bio->bi_sector = block * (super->s_blocksize >> 9);
746 for (nr_used = 0, i = 0; i < nr_blocks; i++) {
747 struct page *pg;
749 pg = jnode_page(cur);
750 assert("zam-573", pg != NULL);
752 page_cache_get(pg);
754 lock_and_wait_page_writeback(pg);
756 if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
758 * underlying device is satiated. Stop adding
759 * pages to the bio.
761 unlock_page(pg);
762 page_cache_release(pg);
763 break;
766 spin_lock_jnode(cur);
767 assert("nikita-3166",
768 pg->mapping == jnode_get_mapping(cur));
769 assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
770 #if REISER4_DEBUG
771 spin_lock(&cur->load);
772 assert("nikita-3165", !jnode_is_releasable(cur));
773 spin_unlock(&cur->load);
774 #endif
775 JF_SET(cur, JNODE_WRITEBACK);
776 JF_CLR(cur, JNODE_DIRTY);
777 ON_DEBUG(cur->written++);
778 spin_unlock_jnode(cur);
780 ClearPageError(pg);
781 set_page_writeback(pg);
783 if (get_current_context()->entd) {
784 /* this is ent thread */
785 entd_context *ent = get_entd_context(super);
786 struct wbq *rq, *next;
788 spin_lock(&ent->guard);
790 if (pg == ent->cur_request->page) {
792 * entd is called for this page. This
793 * request is not in th etodo list
795 ent->cur_request->written = 1;
796 } else {
798 * if we have written a page for which writepage
799 * is called for - move request to another list.
801 list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
802 assert("", rq->magic == WBQ_MAGIC);
803 if (pg == rq->page) {
805 * remove request from
806 * entd's queue, but do
807 * not wake up a thread
808 * which put this
809 * request
811 list_del_init(&rq->link);
812 ent->nr_todo_reqs --;
813 list_add_tail(&rq->link, &ent->done_list);
814 ent->nr_done_reqs ++;
815 rq->written = 1;
816 break;
820 spin_unlock(&ent->guard);
823 clear_page_dirty_for_io(pg);
825 unlock_page(pg);
827 cur = list_entry(cur->capture_link.next, jnode, capture_link);
828 nr_used++;
830 if (nr_used > 0) {
831 assert("nikita-3453",
832 bio->bi_size == super->s_blocksize * nr_used);
833 assert("nikita-3454", bio->bi_vcnt == nr_used);
835 /* Check if we are allowed to write at all */
836 if (super->s_flags & MS_RDONLY)
837 undo_bio(bio);
838 else {
839 int not_supported;
841 add_fq_to_bio(fq, bio);
842 bio_get(bio);
843 reiser4_submit_bio(write_op, bio);
844 not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
845 bio_put(bio);
846 if (not_supported)
847 return -EOPNOTSUPP;
850 block += nr_used - 1;
851 update_blocknr_hint_default(super, &block);
852 block += 1;
853 } else {
854 bio_put(bio);
856 nr -= nr_used;
859 return 0;
862 /* This is a procedure which recovers a contiguous sequences of disk block
863 numbers in the given list of j-nodes and submits write requests on this
864 per-sequence basis */
866 write_jnode_list(struct list_head *head, flush_queue_t *fq,
867 long *nr_submitted, int flags)
869 int ret;
870 jnode *beg = list_entry(head->next, jnode, capture_link);
872 while (head != &beg->capture_link) {
873 int nr = 1;
874 jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
876 while (head != &cur->capture_link) {
877 if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
878 break;
879 ++nr;
880 cur = list_entry(cur->capture_link.next, jnode, capture_link);
883 ret = write_jnodes_to_disk_extent(
884 beg, nr, jnode_get_block(beg), fq, flags);
885 if (ret)
886 return ret;
888 if (nr_submitted)
889 *nr_submitted += nr;
891 beg = cur;
894 return 0;
897 /* add given wandered mapping to atom's wandered map */
898 static int
899 add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
901 int ret;
902 blocknr_set_entry *new_bsep = NULL;
903 reiser4_block_nr block;
905 txn_atom *atom;
907 assert("zam-568", block_p != NULL);
908 block = *block_p;
909 assert("zam-569", len > 0);
911 while ((len--) > 0) {
912 do {
913 atom = get_current_atom_locked();
914 assert("zam-536",
915 !reiser4_blocknr_is_fake(jnode_get_block(cur)));
916 ret =
917 blocknr_set_add_pair(atom, &atom->wandered_map,
918 &new_bsep,
919 jnode_get_block(cur), &block);
920 } while (ret == -E_REPEAT);
922 if (ret) {
923 /* deallocate blocks which were not added to wandered
924 map */
925 reiser4_block_nr wide_len = len;
927 reiser4_dealloc_blocks(&block, &wide_len,
928 BLOCK_NOT_COUNTED,
929 BA_FORMATTED
930 /* formatted, without defer */ );
932 return ret;
935 spin_unlock_atom(atom);
937 cur = list_entry(cur->capture_link.next, jnode, capture_link);
938 ++block;
941 return 0;
944 /* Allocate wandered blocks for current atom's OVERWRITE SET and immediately
945 submit IO for allocated blocks. We assume that current atom is in a stage
946 when any atom fusion is impossible and atom is unlocked and it is safe. */
947 static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
949 reiser4_block_nr block;
951 int rest;
952 int len;
953 int ret;
955 jnode *cur;
957 assert("zam-534", ch->overwrite_set_size > 0);
959 rest = ch->overwrite_set_size;
961 cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
962 while (ch->overwrite_set != &cur->capture_link) {
963 assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
965 ret = get_more_wandered_blocks(rest, &block, &len);
966 if (ret)
967 return ret;
969 rest -= len;
971 ret = add_region_to_wmap(cur, len, &block);
972 if (ret)
973 return ret;
975 ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
976 if (ret)
977 return ret;
979 while ((len--) > 0) {
980 assert("zam-604",
981 ch->overwrite_set != &cur->capture_link);
982 cur = list_entry(cur->capture_link.next, jnode, capture_link);
986 return 0;
989 /* allocate given number of nodes over the journal area and link them into a
990 list, return pointer to the first jnode in the list */
991 static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
993 reiser4_blocknr_hint hint;
994 reiser4_block_nr allocated = 0;
995 reiser4_block_nr first, len;
996 jnode *cur;
997 jnode *txhead;
998 int ret;
999 reiser4_context *ctx;
1000 reiser4_super_info_data *sbinfo;
1002 assert("zam-698", ch->tx_size > 0);
1003 assert("zam-699", list_empty_careful(&ch->tx_list));
1005 ctx = get_current_context();
1006 sbinfo = get_super_private(ctx->super);
1008 while (allocated < (unsigned)ch->tx_size) {
1009 len = (ch->tx_size - allocated);
1011 reiser4_blocknr_hint_init(&hint);
1013 hint.block_stage = BLOCK_GRABBED;
1015 /* FIXME: there should be some block allocation policy for
1016 nodes which contain wander records */
1018 /* We assume that disk space for wandered record blocks can be
1019 * taken from reserved area. */
1020 ret = reiser4_alloc_blocks(&hint, &first, &len,
1021 BA_FORMATTED | BA_RESERVED |
1022 BA_USE_DEFAULT_SEARCH_START);
1023 reiser4_blocknr_hint_done(&hint);
1025 if (ret)
1026 return ret;
1028 allocated += len;
1030 /* create jnodes for all wander records */
1031 while (len--) {
1032 cur = reiser4_alloc_io_head(&first);
1034 if (cur == NULL) {
1035 ret = RETERR(-ENOMEM);
1036 goto free_not_assigned;
1039 ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
1041 if (ret != 0) {
1042 jfree(cur);
1043 goto free_not_assigned;
1046 pin_jnode_data(cur);
1048 list_add_tail(&cur->capture_link, &ch->tx_list);
1050 first++;
1054 { /* format a on-disk linked list of wander records */
1055 int serial = 1;
1057 txhead = list_entry(ch->tx_list.next, jnode, capture_link);
1058 format_tx_head(ch);
1060 cur = list_entry(txhead->capture_link.next, jnode, capture_link);
1061 while (&ch->tx_list != &cur->capture_link) {
1062 format_wander_record(ch, cur, serial++);
1063 cur = list_entry(cur->capture_link.next, jnode, capture_link);
1067 { /* Fill wander records with Wandered Set */
1068 struct store_wmap_params params;
1069 txn_atom *atom;
1071 params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
1073 params.idx = 0;
1074 params.capacity =
1075 wander_record_capacity(reiser4_get_current_sb());
1077 atom = get_current_atom_locked();
1078 blocknr_set_iterator(atom, &atom->wandered_map,
1079 &store_wmap_actor, &params, 0);
1080 spin_unlock_atom(atom);
1083 { /* relse all jnodes from tx_list */
1084 cur = list_entry(ch->tx_list.next, jnode, capture_link);
1085 while (&ch->tx_list != &cur->capture_link) {
1086 jrelse(cur);
1087 cur = list_entry(cur->capture_link.next, jnode, capture_link);
1091 ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
1093 return ret;
1095 free_not_assigned:
1096 /* We deallocate blocks not yet assigned to jnodes on tx_list. The
1097 caller takes care about invalidating of tx list */
1098 reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
1100 return ret;
1103 static int commit_tx(struct commit_handle *ch)
1105 flush_queue_t *fq;
1106 int barrier;
1107 int ret;
1109 /* Grab more space for wandered records. */
1110 ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
1111 if (ret)
1112 return ret;
1114 fq = get_fq_for_current_atom();
1115 if (IS_ERR(fq))
1116 return PTR_ERR(fq);
1118 spin_unlock_atom(fq->atom);
1119 do {
1120 ret = alloc_wandered_blocks(ch, fq);
1121 if (ret)
1122 break;
1123 ret = alloc_tx(ch, fq);
1124 if (ret)
1125 break;
1126 } while (0);
1128 reiser4_fq_put(fq);
1129 if (ret)
1130 return ret;
1131 repeat_wo_barrier:
1132 barrier = reiser4_use_write_barrier(ch->super);
1133 if (!barrier) {
1134 ret = current_atom_finish_all_fq();
1135 if (ret)
1136 return ret;
1138 ret = update_journal_header(ch, barrier);
1139 if (barrier) {
1140 if (ret) {
1141 if (ret == -EOPNOTSUPP) {
1142 disable_write_barrier(ch->super);
1143 goto repeat_wo_barrier;
1145 return ret;
1147 ret = current_atom_finish_all_fq();
1149 return ret;
1152 static int write_tx_back(struct commit_handle * ch)
1154 flush_queue_t *fq;
1155 int ret;
1156 int barrier;
1158 reiser4_post_commit_hook();
1159 fq = get_fq_for_current_atom();
1160 if (IS_ERR(fq))
1161 return PTR_ERR(fq);
1162 spin_unlock_atom(fq->atom);
1163 ret = write_jnode_list(
1164 ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
1165 reiser4_fq_put(fq);
1166 if (ret)
1167 return ret;
1168 repeat_wo_barrier:
1169 barrier = reiser4_use_write_barrier(ch->super);
1170 if (!barrier) {
1171 ret = current_atom_finish_all_fq();
1172 if (ret)
1173 return ret;
1175 ret = update_journal_footer(ch, barrier);
1176 if (barrier) {
1177 if (ret) {
1178 if (ret == -EOPNOTSUPP) {
1179 disable_write_barrier(ch->super);
1180 goto repeat_wo_barrier;
1182 return ret;
1184 ret = current_atom_finish_all_fq();
1186 if (ret)
1187 return ret;
1188 reiser4_post_write_back_hook();
1189 return 0;
1192 /* We assume that at this moment all captured blocks are marked as RELOC or
1193 WANDER (belong to Relocate o Overwrite set), all nodes from Relocate set
1194 are submitted to write.
1197 int reiser4_write_logs(long *nr_submitted)
1199 txn_atom *atom;
1200 struct super_block *super = reiser4_get_current_sb();
1201 reiser4_super_info_data *sbinfo = get_super_private(super);
1202 struct commit_handle ch;
1203 int ret;
1205 writeout_mode_enable();
1207 /* block allocator may add j-nodes to the clean_list */
1208 ret = reiser4_pre_commit_hook();
1209 if (ret)
1210 return ret;
1212 /* No locks are required if we take atom which stage >=
1213 * ASTAGE_PRE_COMMIT */
1214 atom = get_current_context()->trans->atom;
1215 assert("zam-965", atom != NULL);
1217 /* relocate set is on the atom->clean_nodes list after
1218 * current_atom_complete_writes() finishes. It can be safely
1219 * uncaptured after commit_mutex is locked, because any atom that
1220 * captures these nodes is guaranteed to commit after current one.
1222 * This can only be done after reiser4_pre_commit_hook(), because it is where
1223 * early flushed jnodes with CREATED bit are transferred to the
1224 * overwrite list. */
1225 reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
1226 spin_lock_atom(atom);
1227 /* There might be waiters for the relocate nodes which we have
1228 * released, wake them up. */
1229 reiser4_atom_send_event(atom);
1230 spin_unlock_atom(atom);
1232 if (REISER4_DEBUG) {
1233 int level;
1235 for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
1236 assert("nikita-3352",
1237 list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
1240 sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
1241 sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
1243 init_commit_handle(&ch, atom);
1245 ch.free_blocks = sbinfo->blocks_free_committed;
1246 ch.nr_files = sbinfo->nr_files_committed;
1247 /* ZAM-FIXME-HANS: email me what the contention level is for the super
1248 * lock. */
1249 ch.next_oid = oid_next(super);
1251 /* count overwrite set and place it in a separate list */
1252 ret = get_overwrite_set(&ch);
1254 if (ret <= 0) {
1255 /* It is possible that overwrite set is empty here, it means
1256 all captured nodes are clean */
1257 goto up_and_ret;
1260 /* Inform the caller about what number of dirty pages will be
1261 * submitted to disk. */
1262 *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
1264 /* count all records needed for storing of the wandered set */
1265 get_tx_size(&ch);
1267 ret = commit_tx(&ch);
1268 if (ret)
1269 goto up_and_ret;
1271 spin_lock_atom(atom);
1272 reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
1273 spin_unlock_atom(atom);
1275 ret = write_tx_back(&ch);
1276 reiser4_post_write_back_hook();
1278 up_and_ret:
1279 if (ret) {
1280 /* there could be fq attached to current atom; the only way to
1281 remove them is: */
1282 current_atom_finish_all_fq();
1285 /* free blocks of flushed transaction */
1286 dealloc_tx_list(&ch);
1287 dealloc_wmap(&ch);
1289 put_overwrite_set(&ch);
1291 done_commit_handle(&ch);
1293 writeout_mode_disable();
1295 return ret;
1298 /* consistency checks for journal data/control blocks: header, footer, log
1299 records, transactions head blocks. All functions return zero on success. */
1301 static int check_journal_header(const jnode * node UNUSED_ARG)
1303 /* FIXME: journal header has no magic field yet. */
1304 return 0;
1307 /* wait for write completion for all jnodes from given list */
1308 static int wait_on_jnode_list(struct list_head *head)
1310 jnode *scan;
1311 int ret = 0;
1313 list_for_each_entry(scan, head, capture_link) {
1314 struct page *pg = jnode_page(scan);
1316 if (pg) {
1317 if (PageWriteback(pg))
1318 wait_on_page_writeback(pg);
1320 if (PageError(pg))
1321 ret++;
1325 return ret;
1328 static int check_journal_footer(const jnode * node UNUSED_ARG)
1330 /* FIXME: journal footer has no magic field yet. */
1331 return 0;
1334 static int check_tx_head(const jnode * node)
1336 struct tx_header *header = (struct tx_header *)jdata(node);
1338 if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
1339 warning("zam-627", "tx head at block %s corrupted\n",
1340 sprint_address(jnode_get_block(node)));
1341 return RETERR(-EIO);
1344 return 0;
1347 static int check_wander_record(const jnode * node)
1349 struct wander_record_header *RH =
1350 (struct wander_record_header *)jdata(node);
1352 if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
1353 0) {
1354 warning("zam-628", "wander record at block %s corrupted\n",
1355 sprint_address(jnode_get_block(node)));
1356 return RETERR(-EIO);
1359 return 0;
1362 /* fill commit_handler structure by everything what is needed for update_journal_footer */
1363 static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
1365 struct tx_header *TXH;
1366 int ret;
1368 ret = jload(tx_head);
1369 if (ret)
1370 return ret;
1372 TXH = (struct tx_header *)jdata(tx_head);
1374 ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
1375 ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
1376 ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
1378 jrelse(tx_head);
1380 list_add(&tx_head->capture_link, &ch->tx_list);
1382 return 0;
1385 /* replay one transaction: restore and write overwrite set in place */
1386 static int replay_transaction(const struct super_block *s,
1387 jnode * tx_head,
1388 const reiser4_block_nr * log_rec_block_p,
1389 const reiser4_block_nr * end_block,
1390 unsigned int nr_wander_records)
1392 reiser4_block_nr log_rec_block = *log_rec_block_p;
1393 struct commit_handle ch;
1394 LIST_HEAD(overwrite_set);
1395 jnode *log;
1396 int ret;
1398 init_commit_handle(&ch, NULL);
1399 ch.overwrite_set = &overwrite_set;
1401 restore_commit_handle(&ch, tx_head);
1403 while (log_rec_block != *end_block) {
1404 struct wander_record_header *header;
1405 struct wander_entry *entry;
1407 int i;
1409 if (nr_wander_records == 0) {
1410 warning("zam-631",
1411 "number of wander records in the linked list"
1412 " greater than number stored in tx head.\n");
1413 ret = RETERR(-EIO);
1414 goto free_ow_set;
1417 log = reiser4_alloc_io_head(&log_rec_block);
1418 if (log == NULL)
1419 return RETERR(-ENOMEM);
1421 ret = jload(log);
1422 if (ret < 0) {
1423 reiser4_drop_io_head(log);
1424 return ret;
1427 ret = check_wander_record(log);
1428 if (ret) {
1429 jrelse(log);
1430 reiser4_drop_io_head(log);
1431 return ret;
1434 header = (struct wander_record_header *)jdata(log);
1435 log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
1437 entry = (struct wander_entry *)(header + 1);
1439 /* restore overwrite set from wander record content */
1440 for (i = 0; i < wander_record_capacity(s); i++) {
1441 reiser4_block_nr block;
1442 jnode *node;
1444 block = le64_to_cpu(get_unaligned(&entry->wandered));
1445 if (block == 0)
1446 break;
1448 node = reiser4_alloc_io_head(&block);
1449 if (node == NULL) {
1450 ret = RETERR(-ENOMEM);
1452 * FIXME-VS:???
1454 jrelse(log);
1455 reiser4_drop_io_head(log);
1456 goto free_ow_set;
1459 ret = jload(node);
1461 if (ret < 0) {
1462 reiser4_drop_io_head(node);
1464 * FIXME-VS:???
1466 jrelse(log);
1467 reiser4_drop_io_head(log);
1468 goto free_ow_set;
1471 block = le64_to_cpu(get_unaligned(&entry->original));
1473 assert("zam-603", block != 0);
1475 jnode_set_block(node, &block);
1477 list_add_tail(&node->capture_link, ch.overwrite_set);
1479 ++entry;
1482 jrelse(log);
1483 reiser4_drop_io_head(log);
1485 --nr_wander_records;
1488 if (nr_wander_records != 0) {
1489 warning("zam-632", "number of wander records in the linked list"
1490 " less than number stored in tx head.\n");
1491 ret = RETERR(-EIO);
1492 goto free_ow_set;
1495 { /* write wandered set in place */
1496 write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
1497 ret = wait_on_jnode_list(ch.overwrite_set);
1499 if (ret) {
1500 ret = RETERR(-EIO);
1501 goto free_ow_set;
1505 ret = update_journal_footer(&ch, 0);
1507 free_ow_set:
1509 while (!list_empty(ch.overwrite_set)) {
1510 jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
1511 list_del_init(&cur->capture_link);
1512 jrelse(cur);
1513 reiser4_drop_io_head(cur);
1516 list_del_init(&tx_head->capture_link);
1518 done_commit_handle(&ch);
1520 return ret;
1523 /* find oldest committed and not played transaction and play it. The transaction
1524 * was committed and journal header block was updated but the blocks from the
1525 * process of writing the atom's overwrite set in-place and updating of journal
1526 * footer block were not completed. This function completes the process by
1527 * recovering the atom's overwrite set from their wandered locations and writes
1528 * them in-place and updating the journal footer. */
1529 static int replay_oldest_transaction(struct super_block *s)
1531 reiser4_super_info_data *sbinfo = get_super_private(s);
1532 jnode *jf = sbinfo->journal_footer;
1533 unsigned int total;
1534 struct journal_footer *F;
1535 struct tx_header *T;
1537 reiser4_block_nr prev_tx;
1538 reiser4_block_nr last_flushed_tx;
1539 reiser4_block_nr log_rec_block = 0;
1541 jnode *tx_head;
1543 int ret;
1545 if ((ret = jload(jf)) < 0)
1546 return ret;
1548 F = (struct journal_footer *)jdata(jf);
1550 last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
1552 jrelse(jf);
1554 if (sbinfo->last_committed_tx == last_flushed_tx) {
1555 /* all transactions are replayed */
1556 return 0;
1559 prev_tx = sbinfo->last_committed_tx;
1561 /* searching for oldest not flushed transaction */
1562 while (1) {
1563 tx_head = reiser4_alloc_io_head(&prev_tx);
1564 if (!tx_head)
1565 return RETERR(-ENOMEM);
1567 ret = jload(tx_head);
1568 if (ret < 0) {
1569 reiser4_drop_io_head(tx_head);
1570 return ret;
1573 ret = check_tx_head(tx_head);
1574 if (ret) {
1575 jrelse(tx_head);
1576 reiser4_drop_io_head(tx_head);
1577 return ret;
1580 T = (struct tx_header *)jdata(tx_head);
1582 prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
1584 if (prev_tx == last_flushed_tx)
1585 break;
1587 jrelse(tx_head);
1588 reiser4_drop_io_head(tx_head);
1591 total = le32_to_cpu(get_unaligned(&T->total));
1592 log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
1594 pin_jnode_data(tx_head);
1595 jrelse(tx_head);
1597 ret =
1598 replay_transaction(s, tx_head, &log_rec_block,
1599 jnode_get_block(tx_head), total - 1);
1601 unpin_jnode_data(tx_head);
1602 reiser4_drop_io_head(tx_head);
1604 if (ret)
1605 return ret;
1606 return -E_REPEAT;
/* The reiser4 journal current implementation was optimized to not to capture
   super block if certain super block fields are modified. Currently, the set
   is (<free block count>, <OID allocator>). These fields are logged in a
   special way which includes storing them in each transaction head block at
   atom commit time and writing that information to the journal footer block at
   atom flush time. For getting this info from the journal footer block to the
   in-memory super block there is a special function
   reiser4_journal_recover_sb_data() which should be called after the disk
   format plugin re-reads the super block after journal replaying.
*/
1620 /* get the information from journal footer in-memory super block */
1621 int reiser4_journal_recover_sb_data(struct super_block *s)
1623 reiser4_super_info_data *sbinfo = get_super_private(s);
1624 struct journal_footer *jf;
1625 int ret;
1627 assert("zam-673", sbinfo->journal_footer != NULL);
1629 ret = jload(sbinfo->journal_footer);
1630 if (ret != 0)
1631 return ret;
1633 ret = check_journal_footer(sbinfo->journal_footer);
1634 if (ret != 0)
1635 goto out;
1637 jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
1639 /* was there at least one flushed transaction? */
1640 if (jf->last_flushed_tx) {
1642 /* restore free block counter logged in this transaction */
1643 reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
1645 /* restore oid allocator state */
1646 oid_init_allocator(s,
1647 le64_to_cpu(get_unaligned(&jf->nr_files)),
1648 le64_to_cpu(get_unaligned(&jf->next_oid)));
1650 out:
1651 jrelse(sbinfo->journal_footer);
1652 return ret;
1655 /* reiser4 replay journal procedure */
1656 int reiser4_journal_replay(struct super_block *s)
1658 reiser4_super_info_data *sbinfo = get_super_private(s);
1659 jnode *jh, *jf;
1660 struct journal_header *header;
1661 int nr_tx_replayed = 0;
1662 int ret;
1664 assert("zam-582", sbinfo != NULL);
1666 jh = sbinfo->journal_header;
1667 jf = sbinfo->journal_footer;
1669 if (!jh || !jf) {
1670 /* it is possible that disk layout does not support journal
1671 structures, we just warn about this */
1672 warning("zam-583",
1673 "journal control blocks were not loaded by disk layout plugin. "
1674 "journal replaying is not possible.\n");
1675 return 0;
1678 /* Take free block count from journal footer block. The free block
1679 counter value corresponds the last flushed transaction state */
1680 ret = jload(jf);
1681 if (ret < 0)
1682 return ret;
1684 ret = check_journal_footer(jf);
1685 if (ret) {
1686 jrelse(jf);
1687 return ret;
1690 jrelse(jf);
1692 /* store last committed transaction info in reiser4 in-memory super
1693 block */
1694 ret = jload(jh);
1695 if (ret < 0)
1696 return ret;
1698 ret = check_journal_header(jh);
1699 if (ret) {
1700 jrelse(jh);
1701 return ret;
1704 header = (struct journal_header *)jdata(jh);
1705 sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
1707 jrelse(jh);
1709 /* replay committed transactions */
1710 while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
1711 nr_tx_replayed++;
1713 return ret;
1716 /* load journal control block (either journal header or journal footer block) */
1717 static int
1718 load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
1720 int ret;
1722 *node = reiser4_alloc_io_head(block);
1723 if (!(*node))
1724 return RETERR(-ENOMEM);
1726 ret = jload(*node);
1728 if (ret) {
1729 reiser4_drop_io_head(*node);
1730 *node = NULL;
1731 return ret;
1734 pin_jnode_data(*node);
1735 jrelse(*node);
1737 return 0;
1740 /* unload journal header or footer and free jnode */
1741 static void unload_journal_control_block(jnode ** node)
1743 if (*node) {
1744 unpin_jnode_data(*node);
1745 reiser4_drop_io_head(*node);
1746 *node = NULL;
1750 /* release journal control blocks */
1751 void reiser4_done_journal_info(struct super_block *s)
1753 reiser4_super_info_data *sbinfo = get_super_private(s);
1755 assert("zam-476", sbinfo != NULL);
1757 unload_journal_control_block(&sbinfo->journal_header);
1758 unload_journal_control_block(&sbinfo->journal_footer);
1759 rcu_barrier();
1762 /* load journal control blocks */
1763 int reiser4_init_journal_info(struct super_block *s)
1765 reiser4_super_info_data *sbinfo = get_super_private(s);
1766 journal_location *loc;
1767 int ret;
1769 loc = &sbinfo->jloc;
1771 assert("zam-651", loc != NULL);
1772 assert("zam-652", loc->header != 0);
1773 assert("zam-653", loc->footer != 0);
1775 ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
1777 if (ret)
1778 return ret;
1780 ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
1782 if (ret) {
1783 unload_journal_control_block(&sbinfo->journal_header);
1786 return ret;
/* Make Linus happy.
   Local variables:
   c-indentation-style: "K&R"
   mode-name: "LC"
   c-basic-offset: 8
   tab-width: 8
   fill-column: 80
   End:
*/