revert-mm-fix-blkdev-size-calculation-in-generic_write_checks
[linux-2.6/linux-trees-mm.git] / fs / reiser4 / wander.c
blob6d1d1d97823f84a7c41bb6c720c8d42f54e949d0
1 /* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
2 * reiser4/README */
4 /* Reiser4 Wandering Log */
6 /* You should read http://www.namesys.com/txn-doc.html
8 That describes how filesystem operations are performed as atomic
9 transactions, and how we try to arrange it so that we can write most of the
10 data only once while performing the operation atomically.
12 For the purposes of this code, it is enough for it to understand that it
13 has been told a given block should be written either once, or twice (if
14 twice then once to the wandered location and once to the real location).
16 This code guarantees that those blocks that are defined to be part of an
17 atom either all take effect or none of them take effect.
19 Relocate set nodes are submitted to write by the jnode_flush() routine, and
20 the overwrite set is submitted by reiser4_write_log(). This is because with
21 the overwrite set we seek to optimize writes, and with the relocate set we
22 seek to cause disk order to correlate with the parent first pre-order.
24 reiser4_write_log() allocates and writes wandered blocks and maintains
25 additional on-disk structures of the atom as wander records (each wander
26 record occupies one block) for storing of the "wandered map" (a table which
27 contains a relation between wandered and real block numbers) and other
28 information which might be needed at transaction recovery time.
30 The wander records are unidirectionally linked into a circle: each wander
31 record contains a block number of the next wander record, the last wander
32 record points to the first one.
34 One wander record (named "tx head" in this file) has a format which is
35 different from the other wander records. The "tx head" has a reference to the
36 "tx head" block of the previously committed atom. Also, "tx head" contains
37 fs information (the free blocks counter, and the oid allocator state) which
   is logged in a special way.
40 There are two journal control blocks, named journal header and journal
41 footer which have fixed on-disk locations. The journal header has a
42 reference to the "tx head" block of the last committed atom. The journal
43 footer points to the "tx head" of the last flushed atom. The atom is
44 "played" when all blocks from its overwrite set are written to disk the
45 second time (i.e. written to their real locations).
47 NOTE: People who know reiserfs internals and its journal structure might be
48 confused with these terms journal footer and journal header. There is a table
49 with terms of similar semantics in reiserfs (reiser3) and reiser4:
51 REISER3 TERM | REISER4 TERM | DESCRIPTION
52 --------------------+-----------------------+----------------------------
53 commit record | journal header | atomic write of this record
54 | | ends transaction commit
55 --------------------+-----------------------+----------------------------
56 journal header | journal footer | atomic write of this record
57 | | ends post-commit writes.
58 | | After successful
59 | | writing of this journal
60 | | blocks (in reiser3) or
61 | | wandered blocks/records are
62 | | free for re-use.
63 --------------------+-----------------------+----------------------------
65 The atom commit process is the following:
67 1. The overwrite set is taken from atom's clean list, and its size is
68 counted.
70 2. The number of necessary wander records (including tx head) is calculated,
71 and the wander record blocks are allocated.
73 3. Allocate wandered blocks and populate wander records by wandered map.
75 4. submit write requests for wander records and wandered blocks.
77 5. wait until submitted write requests complete.
79 6. update journal header: change the pointer to the block number of just
80 written tx head, submit an i/o for modified journal header block and wait
81 for i/o completion.
83 NOTE: The special logging for bitmap blocks and some reiser4 super block
84 fields makes processes of atom commit, flush and recovering a bit more
85 complex (see comments in the source code for details).
87 The atom playing process is the following:
89 1. Write atom's overwrite set in-place.
91 2. Wait on i/o.
93 3. Update journal footer: change the pointer to block number of tx head
94 block of the atom we currently flushing, submit an i/o, wait on i/o
95 completion.
97 4. Free disk space which was used for wandered blocks and wander records.
99 After the freeing of wandered blocks and wander records we have that journal
100 footer points to the on-disk structure which might be overwritten soon.
101 Neither the log writer nor the journal recovery procedure use that pointer
102 for accessing the data. When the journal recovery procedure finds the oldest
103 transaction it compares the journal footer pointer value with the "prev_tx"
104 pointer value in tx head, if values are equal the oldest not flushed
105 transaction is found.
107 NOTE on disk space leakage: the information about of what blocks and how many
108 blocks are allocated for wandered blocks, wandered records is not written to
109 the disk because of special logging for bitmaps and some super blocks
   counters. After a system crash reiser4 does not remember those
   allocations, thus there is no such kind of disk space leakage.
114 /* Special logging of reiser4 super block fields. */
116 /* There are some reiser4 super block fields (free block count and OID allocator
117 state (number of files and next free OID) which are logged separately from
118 super block to avoid unnecessary atom fusion.
120 So, the reiser4 super block can be not captured by a transaction with
121 allocates/deallocates disk blocks or create/delete file objects. Moreover,
122 the reiser4 on-disk super block is not touched when such a transaction is
123 committed and flushed. Those "counters logged specially" are logged in "tx
124 head" blocks and in the journal footer block.
126 A step-by-step description of special logging:
128 0. The per-atom information about deleted or created files and allocated or
129 freed blocks is collected during the transaction. The atom's
130 ->nr_objects_created and ->nr_objects_deleted are for object
131 deletion/creation tracking, the numbers of allocated and freed blocks are
132 calculated using atom's delete set and atom's capture list -- all new and
133 relocated nodes should be on atom's clean list and should have JNODE_RELOC
134 bit set.
136 1. The "logged specially" reiser4 super block fields have their "committed"
137 versions in the reiser4 in-memory super block. They get modified only at
138 atom commit time. The atom's commit thread has an exclusive access to those
139 "committed" fields because the log writer implementation supports only one
140 atom commit a time (there is a per-fs "commit" mutex). At
141 that time "committed" counters are modified using per-atom information
142 collected during the transaction. These counters are stored on disk as a
143 part of tx head block when atom is committed.
145 2. When the atom is flushed the value of the free block counter and the OID
146 allocator state get written to the journal footer block. A special journal
147 procedure (journal_recover_sb_data()) takes those values from the journal
148 footer and updates the reiser4 in-memory super block.
150 NOTE: That means free block count and OID allocator state are logged
151 separately from the reiser4 super block regardless of the fact that the
152 reiser4 super block has fields to store both the free block counter and the
153 OID allocator.
155 Writing the whole super block at commit time requires knowing true values of
156 all its fields without changes made by not yet committed transactions. It is
157 possible by having their "committed" version of the super block like the
158 reiser4 bitmap blocks have "committed" and "working" versions. However,
159 another scheme was implemented which stores special logged values in the
160 unused free space inside transaction head block. In my opinion it has an
161 advantage of not writing whole super block when only part of it was
162 modified. */
164 #include "debug.h"
165 #include "dformat.h"
166 #include "txnmgr.h"
167 #include "jnode.h"
168 #include "znode.h"
169 #include "block_alloc.h"
170 #include "page_cache.h"
171 #include "wander.h"
172 #include "reiser4.h"
173 #include "super.h"
174 #include "vfs_ops.h"
175 #include "writeout.h"
176 #include "inode.h"
177 #include "entd.h"
179 #include <linux/types.h>
180 #include <linux/fs.h> /* for struct super_block */
181 #include <linux/mm.h> /* for struct page */
182 #include <linux/pagemap.h>
183 #include <linux/bio.h> /* for struct bio */
184 #include <linux/blkdev.h>
186 static int write_jnodes_to_disk_extent(
187 jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
189 /* The commit_handle is a container for objects needed at atom commit time */
190 struct commit_handle {
191 /* A pointer to atom's list of OVRWR nodes */
192 struct list_head *overwrite_set;
193 /* atom's overwrite set size */
194 int overwrite_set_size;
195 /* jnodes for wander record blocks */
196 struct list_head tx_list;
197 /* number of wander records */
198 __u32 tx_size;
199 /* 'committed' sb counters are saved here until atom is completely
200 flushed */
201 __u64 free_blocks;
202 __u64 nr_files;
203 __u64 next_oid;
204 /* A pointer to the atom which is being committed */
205 txn_atom *atom;
206 /* A pointer to current super block */
207 struct super_block *super;
208 /* The counter of modified bitmaps */
209 reiser4_block_nr nr_bitmap;
212 static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
214 memset(ch, 0, sizeof(struct commit_handle));
215 INIT_LIST_HEAD(&ch->tx_list);
217 ch->atom = atom;
218 ch->super = reiser4_get_current_sb();
221 static void done_commit_handle(struct commit_handle *ch)
223 assert("zam-690", list_empty(&ch->tx_list));
226 static inline int reiser4_use_write_barrier(struct super_block * s)
228 return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
231 static void disable_write_barrier(struct super_block * s)
233 notice("zam-1055", "%s does not support write barriers,"
234 " using synchronous write instead.", s->s_id);
235 set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
238 /* fill journal header block data */
239 static void format_journal_header(struct commit_handle *ch)
241 struct reiser4_super_info_data *sbinfo;
242 struct journal_header *header;
243 jnode *txhead;
245 sbinfo = get_super_private(ch->super);
246 assert("zam-479", sbinfo != NULL);
247 assert("zam-480", sbinfo->journal_header != NULL);
249 txhead = list_entry(ch->tx_list.next, jnode, capture_link);
251 jload(sbinfo->journal_header);
253 header = (struct journal_header *)jdata(sbinfo->journal_header);
254 assert("zam-484", header != NULL);
256 put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
257 &header->last_committed_tx);
259 jrelse(sbinfo->journal_header);
262 /* fill journal footer block data */
263 static void format_journal_footer(struct commit_handle *ch)
265 struct reiser4_super_info_data *sbinfo;
266 struct journal_footer *footer;
267 jnode *tx_head;
269 sbinfo = get_super_private(ch->super);
271 tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
273 assert("zam-493", sbinfo != NULL);
274 assert("zam-494", sbinfo->journal_header != NULL);
276 check_me("zam-691", jload(sbinfo->journal_footer) == 0);
278 footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
279 assert("zam-495", footer != NULL);
281 put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
282 &footer->last_flushed_tx);
283 put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
285 put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
286 put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
288 jrelse(sbinfo->journal_footer);
291 /* wander record capacity depends on current block size */
292 static int wander_record_capacity(const struct super_block *super)
294 return (super->s_blocksize -
295 sizeof(struct wander_record_header)) /
296 sizeof(struct wander_entry);
299 /* Fill first wander record (tx head) in accordance with supplied given data */
300 static void format_tx_head(struct commit_handle *ch)
302 jnode *tx_head;
303 jnode *next;
304 struct tx_header *header;
306 tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
307 assert("zam-692", &ch->tx_list != &tx_head->capture_link);
309 next = list_entry(tx_head->capture_link.next, jnode, capture_link);
310 if (&ch->tx_list == &next->capture_link)
311 next = tx_head;
313 header = (struct tx_header *)jdata(tx_head);
315 assert("zam-460", header != NULL);
316 assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
318 memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
319 memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
321 put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
322 put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
323 &header->prev_tx);
324 put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
325 put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
326 put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
327 put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
330 /* prepare ordinary wander record block (fill all service fields) */
331 static void
332 format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
334 struct wander_record_header *LRH;
335 jnode *next;
337 assert("zam-464", node != NULL);
339 LRH = (struct wander_record_header *)jdata(node);
340 next = list_entry(node->capture_link.next, jnode, capture_link);
342 if (&ch->tx_list == &next->capture_link)
343 next = list_entry(ch->tx_list.next, jnode, capture_link);
345 assert("zam-465", LRH != NULL);
346 assert("zam-463",
347 ch->super->s_blocksize > sizeof(struct wander_record_header));
349 memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
350 memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
352 put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
353 put_unaligned(cpu_to_le32(serial), &LRH->serial);
354 put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
357 /* add one wandered map entry to formatted wander record */
358 static void
359 store_entry(jnode * node, int index, const reiser4_block_nr * a,
360 const reiser4_block_nr * b)
362 char *data;
363 struct wander_entry *pairs;
365 data = jdata(node);
366 assert("zam-451", data != NULL);
368 pairs =
369 (struct wander_entry *)(data + sizeof(struct wander_record_header));
371 put_unaligned(cpu_to_le64(*a), &pairs[index].original);
372 put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
375 /* currently, wander records contains contain only wandered map, which depend on
376 overwrite set size */
377 static void get_tx_size(struct commit_handle *ch)
379 assert("zam-440", ch->overwrite_set_size != 0);
380 assert("zam-695", ch->tx_size == 0);
382 /* count all ordinary wander records
383 (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
384 for tx head block */
385 ch->tx_size =
386 (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
390 /* A special structure for using in store_wmap_actor() for saving its state
391 between calls */
392 struct store_wmap_params {
393 jnode *cur; /* jnode of current wander record to fill */
394 int idx; /* free element index in wander record */
395 int capacity; /* capacity */
397 #if REISER4_DEBUG
398 struct list_head *tx_list;
399 #endif
402 /* an actor for use in blocknr_set_iterator routine which populates the list
403 of pre-formatted wander records by wandered map info */
404 static int
405 store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
406 const reiser4_block_nr * b, void *data)
408 struct store_wmap_params *params = data;
410 if (params->idx >= params->capacity) {
411 /* a new wander record should be taken from the tx_list */
412 params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
413 assert("zam-454",
414 params->tx_list != &params->cur->capture_link);
416 params->idx = 0;
419 store_entry(params->cur, params->idx, a, b);
420 params->idx++;
422 return 0;
425 /* This function is called after Relocate set gets written to disk, Overwrite
426 set is written to wandered locations and all wander records are written
427 also. Updated journal header blocks contains a pointer (block number) to
428 first wander record of the just written transaction */
429 static int update_journal_header(struct commit_handle *ch, int use_barrier)
431 struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
432 jnode *jh = sbinfo->journal_header;
433 jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
434 int ret;
436 format_journal_header(ch);
438 ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
439 use_barrier ? WRITEOUT_BARRIER : 0);
440 if (ret)
441 return ret;
443 // blk_run_address_space(sbinfo->fake->i_mapping);
444 /*blk_run_queues(); */
446 ret = jwait_io(jh, WRITE);
448 if (ret)
449 return ret;
451 sbinfo->last_committed_tx = *jnode_get_block(head);
453 return 0;
456 /* This function is called after write-back is finished. We update journal
457 footer block and free blocks which were occupied by wandered blocks and
458 transaction wander records */
459 static int update_journal_footer(struct commit_handle *ch, int use_barrier)
461 reiser4_super_info_data *sbinfo = get_super_private(ch->super);
463 jnode *jf = sbinfo->journal_footer;
465 int ret;
467 format_journal_footer(ch);
469 ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
470 use_barrier ? WRITEOUT_BARRIER : 0);
471 if (ret)
472 return ret;
474 // blk_run_address_space(sbinfo->fake->i_mapping);
475 /*blk_run_queue(); */
477 ret = jwait_io(jf, WRITE);
478 if (ret)
479 return ret;
481 return 0;
484 /* free block numbers of wander records of already written in place transaction */
485 static void dealloc_tx_list(struct commit_handle *ch)
487 while (!list_empty(&ch->tx_list)) {
488 jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
489 list_del(&cur->capture_link);
490 ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
491 reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
492 BA_FORMATTED);
494 unpin_jnode_data(cur);
495 reiser4_drop_io_head(cur);
499 /* An actor for use in block_nr_iterator() routine which frees wandered blocks
500 from atom's overwrite set. */
501 static int
502 dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
503 const reiser4_block_nr * a UNUSED_ARG,
504 const reiser4_block_nr * b, void *data UNUSED_ARG)
507 assert("zam-499", b != NULL);
508 assert("zam-500", *b != 0);
509 assert("zam-501", !reiser4_blocknr_is_fake(b));
511 reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
512 return 0;
515 /* free wandered block locations of already written in place transaction */
516 static void dealloc_wmap(struct commit_handle *ch)
518 assert("zam-696", ch->atom != NULL);
520 blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
521 dealloc_wmap_actor, NULL, 1);
524 /* helper function for alloc wandered blocks, which refill set of block
525 numbers needed for wandered blocks */
526 static int
527 get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
529 reiser4_blocknr_hint hint;
530 int ret;
532 reiser4_block_nr wide_len = count;
534 /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
535 ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
536 reserved allocation area so as to get the best qualities of fixed
537 journals? */
538 reiser4_blocknr_hint_init(&hint);
539 hint.block_stage = BLOCK_GRABBED;
541 ret = reiser4_alloc_blocks(&hint, start, &wide_len,
542 BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
543 *len = (int)wide_len;
545 return ret;
549 * roll back changes made before issuing BIO in the case of IO error.
551 static void undo_bio(struct bio *bio)
553 int i;
555 for (i = 0; i < bio->bi_vcnt; ++i) {
556 struct page *pg;
557 jnode *node;
559 pg = bio->bi_io_vec[i].bv_page;
560 end_page_writeback(pg);
561 node = jprivate(pg);
562 spin_lock_jnode(node);
563 JF_CLR(node, JNODE_WRITEBACK);
564 JF_SET(node, JNODE_DIRTY);
565 spin_unlock_jnode(node);
567 bio_put(bio);
570 /* put overwrite set back to atom's clean list */
571 static void put_overwrite_set(struct commit_handle *ch)
573 jnode *cur;
575 list_for_each_entry(cur, ch->overwrite_set, capture_link)
576 jrelse_tail(cur);
579 /* Count overwrite set size, grab disk space for wandered blocks allocation.
580 Since we have a separate list for atom's overwrite set we just scan the list,
581 count bitmap and other not leaf nodes which wandered blocks allocation we
582 have to grab space for. */
583 static int get_overwrite_set(struct commit_handle *ch)
585 int ret;
586 jnode *cur;
587 __u64 nr_not_leaves = 0;
588 #if REISER4_DEBUG
589 __u64 nr_formatted_leaves = 0;
590 __u64 nr_unformatted_leaves = 0;
591 #endif
593 assert("zam-697", ch->overwrite_set_size == 0);
595 ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
596 cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
598 while (ch->overwrite_set != &cur->capture_link) {
599 jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
601 /* Count bitmap locks for getting correct statistics what number
602 * of blocks were cleared by the transaction commit. */
603 if (jnode_get_type(cur) == JNODE_BITMAP)
604 ch->nr_bitmap++;
606 assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
607 || jnode_get_type(cur) == JNODE_BITMAP);
609 if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
610 /* we replace fake znode by another (real)
611 znode which is suggested by disk_layout
612 plugin */
614 /* FIXME: it looks like fake znode should be
615 replaced by jnode supplied by
616 disk_layout. */
618 struct super_block *s = reiser4_get_current_sb();
619 reiser4_super_info_data *sbinfo =
620 get_current_super_private();
622 if (sbinfo->df_plug->log_super) {
623 jnode *sj = sbinfo->df_plug->log_super(s);
625 assert("zam-593", sj != NULL);
627 if (IS_ERR(sj))
628 return PTR_ERR(sj);
630 spin_lock_jnode(sj);
631 JF_SET(sj, JNODE_OVRWR);
632 insert_into_atom_ovrwr_list(ch->atom, sj);
633 spin_unlock_jnode(sj);
635 /* jload it as the rest of overwrite set */
636 jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
638 ch->overwrite_set_size++;
640 spin_lock_jnode(cur);
641 reiser4_uncapture_block(cur);
642 jput(cur);
644 } else {
645 int ret;
646 ch->overwrite_set_size++;
647 ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
648 if (ret)
649 reiser4_panic("zam-783",
650 "cannot load e-flushed jnode back (ret = %d)\n",
651 ret);
654 /* Count not leaves here because we have to grab disk space
655 * for wandered blocks. They were not counted as "flush
656 * reserved". Counting should be done _after_ nodes are pinned
657 * into memory by jload(). */
658 if (!jnode_is_leaf(cur))
659 nr_not_leaves++;
660 else {
661 #if REISER4_DEBUG
662 /* at this point @cur either has JNODE_FLUSH_RESERVED
663 * or is eflushed. Locking is not strong enough to
664 * write an assertion checking for this. */
665 if (jnode_is_znode(cur))
666 nr_formatted_leaves++;
667 else
668 nr_unformatted_leaves++;
669 #endif
670 JF_CLR(cur, JNODE_FLUSH_RESERVED);
673 cur = next;
676 /* Grab space for writing (wandered blocks) of not leaves found in
677 * overwrite set. */
678 ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
679 if (ret)
680 return ret;
682 /* Disk space for allocation of wandered blocks of leaf nodes already
683 * reserved as "flush reserved", move it to grabbed space counter. */
684 spin_lock_atom(ch->atom);
685 assert("zam-940",
686 nr_formatted_leaves + nr_unformatted_leaves <=
687 ch->atom->flush_reserved);
688 flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
689 spin_unlock_atom(ch->atom);
691 return ch->overwrite_set_size;
695 * write_jnodes_to_disk_extent - submit write request
696 * @head:
697 * @first: first jnode of the list
698 * @nr: number of jnodes on the list
699 * @block_p:
700 * @fq:
701 * @flags: used to decide whether page is to get PG_reclaim flag
703 * Submits a write request for @nr jnodes beginning from the @first, other
704 * jnodes are after the @first on the double-linked "capture" list. All jnodes
705 * will be written to the disk region of @nr blocks starting with @block_p block
706 * number. If @fq is not NULL it means that waiting for i/o completion will be
707 * done more efficiently by using flush_queue_t objects.
708 * This function is the one which writes list of jnodes in batch mode. It does
709 * all low-level things as bio construction and page states manipulation.
711 * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
712 * aggregated in this function instead of being left to the layers below
714 * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
715 * Why that layer needed? Why BIOs cannot be constructed here?
717 static int write_jnodes_to_disk_extent(
718 jnode *first, int nr, const reiser4_block_nr *block_p,
719 flush_queue_t *fq, int flags)
721 struct super_block *super = reiser4_get_current_sb();
722 int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
723 int max_blocks;
724 jnode *cur = first;
725 reiser4_block_nr block;
727 assert("zam-571", first != NULL);
728 assert("zam-572", block_p != NULL);
729 assert("zam-570", nr > 0);
731 block = *block_p;
732 max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
734 while (nr > 0) {
735 struct bio *bio;
736 int nr_blocks = min(nr, max_blocks);
737 int i;
738 int nr_used;
740 bio = bio_alloc(GFP_NOIO, nr_blocks);
741 if (!bio)
742 return RETERR(-ENOMEM);
744 bio->bi_bdev = super->s_bdev;
745 bio->bi_sector = block * (super->s_blocksize >> 9);
746 for (nr_used = 0, i = 0; i < nr_blocks; i++) {
747 struct page *pg;
749 pg = jnode_page(cur);
750 assert("zam-573", pg != NULL);
752 page_cache_get(pg);
754 lock_and_wait_page_writeback(pg);
756 if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
758 * underlying device is satiated. Stop adding
759 * pages to the bio.
761 unlock_page(pg);
762 page_cache_release(pg);
763 break;
766 spin_lock_jnode(cur);
767 assert("nikita-3166",
768 pg->mapping == jnode_get_mapping(cur));
769 assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
770 #if REISER4_DEBUG
771 spin_lock(&cur->load);
772 assert("nikita-3165", !jnode_is_releasable(cur));
773 spin_unlock(&cur->load);
774 #endif
775 JF_SET(cur, JNODE_WRITEBACK);
776 JF_CLR(cur, JNODE_DIRTY);
777 ON_DEBUG(cur->written++);
778 spin_unlock_jnode(cur);
780 ClearPageError(pg);
781 set_page_writeback(pg);
783 if (get_current_context()->entd) {
784 /* this is ent thread */
785 entd_context *ent = get_entd_context(super);
786 struct wbq *rq, *next;
788 spin_lock(&ent->guard);
790 if (pg == ent->cur_request->page) {
792 * entd is called for this page. This
793 * request is not in th etodo list
795 ent->cur_request->written = 1;
796 } else {
798 * if we have written a page for which writepage
799 * is called for - move request to another list.
801 list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
802 assert("", rq->magic == WBQ_MAGIC);
803 if (pg == rq->page) {
805 * remove request from
806 * entd's queue, but do
807 * not wake up a thread
808 * which put this
809 * request
811 list_del_init(&rq->link);
812 ent->nr_todo_reqs --;
813 list_add_tail(&rq->link, &ent->done_list);
814 ent->nr_done_reqs ++;
815 rq->written = 1;
816 break;
820 spin_unlock(&ent->guard);
823 clear_page_dirty_for_io(pg);
825 unlock_page(pg);
827 cur = list_entry(cur->capture_link.next, jnode, capture_link);
828 nr_used++;
830 if (nr_used > 0) {
831 assert("nikita-3453",
832 bio->bi_size == super->s_blocksize * nr_used);
833 assert("nikita-3454", bio->bi_vcnt == nr_used);
835 /* Check if we are allowed to write at all */
836 if (super->s_flags & MS_RDONLY)
837 undo_bio(bio);
838 else {
839 int not_supported;
841 add_fq_to_bio(fq, bio);
842 bio_get(bio);
843 reiser4_submit_bio(write_op, bio);
844 not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
845 bio_put(bio);
846 if (not_supported)
847 return -EOPNOTSUPP;
850 block += nr_used - 1;
851 update_blocknr_hint_default(super, &block);
852 block += 1;
853 } else {
854 bio_put(bio);
856 nr -= nr_used;
859 return 0;
862 /* This is a procedure which recovers a contiguous sequences of disk block
863 numbers in the given list of j-nodes and submits write requests on this
864 per-sequence basis */
866 write_jnode_list(struct list_head *head, flush_queue_t *fq,
867 long *nr_submitted, int flags)
869 int ret;
870 jnode *beg = list_entry(head->next, jnode, capture_link);
872 while (head != &beg->capture_link) {
873 int nr = 1;
874 jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
876 while (head != &cur->capture_link) {
877 if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
878 break;
879 ++nr;
880 cur = list_entry(cur->capture_link.next, jnode, capture_link);
883 ret = write_jnodes_to_disk_extent(
884 beg, nr, jnode_get_block(beg), fq, flags);
885 if (ret)
886 return ret;
888 if (nr_submitted)
889 *nr_submitted += nr;
891 beg = cur;
894 return 0;
897 /* add given wandered mapping to atom's wandered map */
898 static int
899 add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
901 int ret;
902 blocknr_set_entry *new_bsep = NULL;
903 reiser4_block_nr block;
905 txn_atom *atom;
907 assert("zam-568", block_p != NULL);
908 block = *block_p;
909 assert("zam-569", len > 0);
911 while ((len--) > 0) {
912 do {
913 atom = get_current_atom_locked();
914 assert("zam-536",
915 !reiser4_blocknr_is_fake(jnode_get_block(cur)));
916 ret =
917 blocknr_set_add_pair(atom, &atom->wandered_map,
918 &new_bsep,
919 jnode_get_block(cur), &block);
920 } while (ret == -E_REPEAT);
922 if (ret) {
923 /* deallocate blocks which were not added to wandered
924 map */
925 reiser4_block_nr wide_len = len;
927 reiser4_dealloc_blocks(&block, &wide_len,
928 BLOCK_NOT_COUNTED,
929 BA_FORMATTED
930 /* formatted, without defer */ );
932 return ret;
935 spin_unlock_atom(atom);
937 cur = list_entry(cur->capture_link.next, jnode, capture_link);
938 ++block;
941 return 0;
944 /* Allocate wandered blocks for current atom's OVERWRITE SET and immediately
945 submit IO for allocated blocks. We assume that current atom is in a stage
946 when any atom fusion is impossible and atom is unlocked and it is safe. */
947 static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
949 reiser4_block_nr block;
951 int rest;
952 int len;
953 int ret;
955 jnode *cur;
957 assert("zam-534", ch->overwrite_set_size > 0);
959 rest = ch->overwrite_set_size;
961 cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
962 while (ch->overwrite_set != &cur->capture_link) {
963 assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
965 ret = get_more_wandered_blocks(rest, &block, &len);
966 if (ret)
967 return ret;
969 rest -= len;
971 ret = add_region_to_wmap(cur, len, &block);
972 if (ret)
973 return ret;
975 ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
976 if (ret)
977 return ret;
979 while ((len--) > 0) {
980 assert("zam-604",
981 ch->overwrite_set != &cur->capture_link);
982 cur = list_entry(cur->capture_link.next, jnode, capture_link);
986 return 0;
989 /* allocate given number of nodes over the journal area and link them into a
990 list, return pointer to the first jnode in the list */
991 static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
993 reiser4_blocknr_hint hint;
994 reiser4_block_nr allocated = 0;
995 reiser4_block_nr first, len;
996 jnode *cur;
997 jnode *txhead;
998 int ret;
999 reiser4_context *ctx;
1000 reiser4_super_info_data *sbinfo;
1002 assert("zam-698", ch->tx_size > 0);
1003 assert("zam-699", list_empty_careful(&ch->tx_list));
1005 ctx = get_current_context();
1006 sbinfo = get_super_private(ctx->super);
1008 while (allocated < (unsigned)ch->tx_size) {
1009 len = (ch->tx_size - allocated);
1011 reiser4_blocknr_hint_init(&hint);
1013 hint.block_stage = BLOCK_GRABBED;
1015 /* FIXME: there should be some block allocation policy for
1016 nodes which contain wander records */
1018 /* We assume that disk space for wandered record blocks can be
1019 * taken from reserved area. */
1020 ret = reiser4_alloc_blocks(&hint, &first, &len,
1021 BA_FORMATTED | BA_RESERVED |
1022 BA_USE_DEFAULT_SEARCH_START);
1023 reiser4_blocknr_hint_done(&hint);
1025 if (ret)
1026 return ret;
1028 allocated += len;
1030 /* create jnodes for all wander records */
1031 while (len--) {
1032 cur = reiser4_alloc_io_head(&first);
1034 if (cur == NULL) {
1035 ret = RETERR(-ENOMEM);
1036 goto free_not_assigned;
1039 ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
1041 if (ret != 0) {
1042 jfree(cur);
1043 goto free_not_assigned;
1046 pin_jnode_data(cur);
1048 list_add_tail(&cur->capture_link, &ch->tx_list);
1050 first++;
1054 { /* format a on-disk linked list of wander records */
1055 int serial = 1;
1057 txhead = list_entry(ch->tx_list.next, jnode, capture_link);
1058 format_tx_head(ch);
1060 cur = list_entry(txhead->capture_link.next, jnode, capture_link);
1061 while (&ch->tx_list != &cur->capture_link) {
1062 format_wander_record(ch, cur, serial++);
1063 cur = list_entry(cur->capture_link.next, jnode, capture_link);
1067 { /* Fill wander records with Wandered Set */
1068 struct store_wmap_params params;
1069 txn_atom *atom;
1071 params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
1073 params.idx = 0;
1074 params.capacity =
1075 wander_record_capacity(reiser4_get_current_sb());
1077 atom = get_current_atom_locked();
1078 blocknr_set_iterator(atom, &atom->wandered_map,
1079 &store_wmap_actor, &params, 0);
1080 spin_unlock_atom(atom);
1083 { /* relse all jnodes from tx_list */
1084 cur = list_entry(ch->tx_list.next, jnode, capture_link);
1085 while (&ch->tx_list != &cur->capture_link) {
1086 jrelse(cur);
1087 cur = list_entry(cur->capture_link.next, jnode, capture_link);
1091 ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
1093 return ret;
1095 free_not_assigned:
1096 /* We deallocate blocks not yet assigned to jnodes on tx_list. The
1097 caller takes care about invalidating of tx list */
1098 reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
1100 return ret;
1103 static int commit_tx(struct commit_handle *ch)
1105 flush_queue_t *fq;
1106 int barrier;
1107 int ret;
1109 /* Grab more space for wandered records. */
1110 ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
1111 if (ret)
1112 return ret;
1114 fq = get_fq_for_current_atom();
1115 if (IS_ERR(fq))
1116 return PTR_ERR(fq);
1118 spin_unlock_atom(fq->atom);
1119 do {
1120 ret = alloc_wandered_blocks(ch, fq);
1121 if (ret)
1122 break;
1123 ret = alloc_tx(ch, fq);
1124 if (ret)
1125 break;
1126 } while (0);
1128 reiser4_fq_put(fq);
1129 if (ret)
1130 return ret;
1131 repeat_wo_barrier:
1132 barrier = reiser4_use_write_barrier(ch->super);
1133 if (!barrier) {
1134 ret = current_atom_finish_all_fq();
1135 if (ret)
1136 return ret;
1138 ret = update_journal_header(ch, barrier);
1139 if (barrier) {
1140 if (ret) {
1141 if (ret == -EOPNOTSUPP) {
1142 disable_write_barrier(ch->super);
1143 goto repeat_wo_barrier;
1145 return ret;
1147 ret = current_atom_finish_all_fq();
1149 return ret;
1152 static int write_tx_back(struct commit_handle * ch)
1154 flush_queue_t *fq;
1155 int ret;
1156 int barrier;
1158 reiser4_post_commit_hook();
1159 fq = get_fq_for_current_atom();
1160 if (IS_ERR(fq))
1161 return PTR_ERR(fq);
1162 spin_unlock_atom(fq->atom);
1163 ret = write_jnode_list(
1164 ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
1165 reiser4_fq_put(fq);
1166 if (ret)
1167 return ret;
1168 repeat_wo_barrier:
1169 barrier = reiser4_use_write_barrier(ch->super);
1170 if (!barrier) {
1171 ret = current_atom_finish_all_fq();
1172 if (ret)
1173 return ret;
1175 ret = update_journal_footer(ch, barrier);
1176 if (barrier) {
1177 if (ret) {
1178 if (ret == -EOPNOTSUPP) {
1179 disable_write_barrier(ch->super);
1180 goto repeat_wo_barrier;
1182 return ret;
1184 ret = current_atom_finish_all_fq();
1186 if (ret)
1187 return ret;
1188 reiser4_post_write_back_hook();
1189 return 0;
1192 /* We assume that at this moment all captured blocks are marked as RELOC or
1193 WANDER (belong to Relocate o Overwrite set), all nodes from Relocate set
1194 are submitted to write.
1197 int reiser4_write_logs(long *nr_submitted)
1199 txn_atom *atom;
1200 struct super_block *super = reiser4_get_current_sb();
1201 reiser4_super_info_data *sbinfo = get_super_private(super);
1202 struct commit_handle ch;
1203 int ret;
1205 writeout_mode_enable();
1207 /* block allocator may add j-nodes to the clean_list */
1208 ret = reiser4_pre_commit_hook();
1209 if (ret)
1210 return ret;
1212 /* No locks are required if we take atom which stage >=
1213 * ASTAGE_PRE_COMMIT */
1214 atom = get_current_context()->trans->atom;
1215 assert("zam-965", atom != NULL);
1217 /* relocate set is on the atom->clean_nodes list after
1218 * current_atom_complete_writes() finishes. It can be safely
1219 * uncaptured after commit_mutex is locked, because any atom that
1220 * captures these nodes is guaranteed to commit after current one.
1222 * This can only be done after reiser4_pre_commit_hook(), because it is where
1223 * early flushed jnodes with CREATED bit are transferred to the
1224 * overwrite list. */
1225 reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
1226 spin_lock_atom(atom);
1227 /* There might be waiters for the relocate nodes which we have
1228 * released, wake them up. */
1229 reiser4_atom_send_event(atom);
1230 spin_unlock_atom(atom);
1232 if (REISER4_DEBUG) {
1233 int level;
1235 for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
1236 assert("nikita-3352",
1237 list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
1240 sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
1241 sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
1243 init_commit_handle(&ch, atom);
1245 ch.free_blocks = sbinfo->blocks_free_committed;
1246 ch.nr_files = sbinfo->nr_files_committed;
1247 /* ZAM-FIXME-HANS: email me what the contention level is for the super
1248 * lock. */
1249 ch.next_oid = oid_next(super);
1251 /* count overwrite set and place it in a separate list */
1252 ret = get_overwrite_set(&ch);
1254 if (ret <= 0) {
1255 /* It is possible that overwrite set is empty here, it means
1256 all captured nodes are clean */
1257 goto up_and_ret;
1260 /* Inform the caller about what number of dirty pages will be
1261 * submitted to disk. */
1262 *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
1264 /* count all records needed for storing of the wandered set */
1265 get_tx_size(&ch);
1267 ret = commit_tx(&ch);
1268 if (ret)
1269 goto up_and_ret;
1271 spin_lock_atom(atom);
1272 reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
1273 spin_unlock_atom(atom);
1275 ret = write_tx_back(&ch);
1276 reiser4_post_write_back_hook();
1278 up_and_ret:
1279 if (ret) {
1280 /* there could be fq attached to current atom; the only way to
1281 remove them is: */
1282 current_atom_finish_all_fq();
1285 /* free blocks of flushed transaction */
1286 dealloc_tx_list(&ch);
1287 dealloc_wmap(&ch);
1289 put_overwrite_set(&ch);
1291 done_commit_handle(&ch);
1293 writeout_mode_disable();
1295 return ret;
1298 /* consistency checks for journal data/control blocks: header, footer, log
1299 records, transactions head blocks. All functions return zero on success. */
1301 static int check_journal_header(const jnode * node UNUSED_ARG)
1303 /* FIXME: journal header has no magic field yet. */
1304 return 0;
1307 /* wait for write completion for all jnodes from given list */
1308 static int wait_on_jnode_list(struct list_head *head)
1310 jnode *scan;
1311 int ret = 0;
1313 list_for_each_entry(scan, head, capture_link) {
1314 struct page *pg = jnode_page(scan);
1316 if (pg) {
1317 if (PageWriteback(pg))
1318 wait_on_page_writeback(pg);
1320 if (PageError(pg))
1321 ret++;
1325 return ret;
1328 static int check_journal_footer(const jnode * node UNUSED_ARG)
1330 /* FIXME: journal footer has no magic field yet. */
1331 return 0;
1334 static int check_tx_head(const jnode * node)
1336 struct tx_header *header = (struct tx_header *)jdata(node);
1338 if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
1339 warning("zam-627", "tx head at block %s corrupted\n",
1340 sprint_address(jnode_get_block(node)));
1341 return RETERR(-EIO);
1344 return 0;
1347 static int check_wander_record(const jnode * node)
1349 struct wander_record_header *RH =
1350 (struct wander_record_header *)jdata(node);
1352 if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
1353 0) {
1354 warning("zam-628", "wander record at block %s corrupted\n",
1355 sprint_address(jnode_get_block(node)));
1356 return RETERR(-EIO);
1359 return 0;
1362 /* fill commit_handler structure by everything what is needed for update_journal_footer */
1363 static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
1365 struct tx_header *TXH;
1366 int ret;
1368 ret = jload(tx_head);
1369 if (ret)
1370 return ret;
1372 TXH = (struct tx_header *)jdata(tx_head);
1374 ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
1375 ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
1376 ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
1378 jrelse(tx_head);
1380 list_add(&tx_head->capture_link, &ch->tx_list);
1382 return 0;
1385 /* replay one transaction: restore and write overwrite set in place */
1386 static int replay_transaction(const struct super_block *s,
1387 jnode * tx_head,
1388 const reiser4_block_nr * log_rec_block_p,
1389 const reiser4_block_nr * end_block,
1390 unsigned int nr_wander_records)
1392 reiser4_block_nr log_rec_block = *log_rec_block_p;
1393 struct commit_handle ch;
1394 LIST_HEAD(overwrite_set);
1395 jnode *log;
1396 int ret;
1398 init_commit_handle(&ch, NULL);
1399 ch.overwrite_set = &overwrite_set;
1401 restore_commit_handle(&ch, tx_head);
1403 while (log_rec_block != *end_block) {
1404 struct wander_record_header *header;
1405 struct wander_entry *entry;
1407 int i;
1409 if (nr_wander_records == 0) {
1410 warning("zam-631",
1411 "number of wander records in the linked list"
1412 " greater than number stored in tx head.\n");
1413 ret = RETERR(-EIO);
1414 goto free_ow_set;
1417 log = reiser4_alloc_io_head(&log_rec_block);
1418 if (log == NULL)
1419 return RETERR(-ENOMEM);
1421 ret = jload(log);
1422 if (ret < 0) {
1423 reiser4_drop_io_head(log);
1424 return ret;
1427 ret = check_wander_record(log);
1428 if (ret) {
1429 jrelse(log);
1430 reiser4_drop_io_head(log);
1431 return ret;
1434 header = (struct wander_record_header *)jdata(log);
1435 log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
1437 entry = (struct wander_entry *)(header + 1);
1439 /* restore overwrite set from wander record content */
1440 for (i = 0; i < wander_record_capacity(s); i++) {
1441 reiser4_block_nr block;
1442 jnode *node;
1444 block = le64_to_cpu(get_unaligned(&entry->wandered));
1445 if (block == 0)
1446 break;
1448 node = reiser4_alloc_io_head(&block);
1449 if (node == NULL) {
1450 ret = RETERR(-ENOMEM);
1452 * FIXME-VS:???
1454 jrelse(log);
1455 reiser4_drop_io_head(log);
1456 goto free_ow_set;
1459 ret = jload(node);
1461 if (ret < 0) {
1462 reiser4_drop_io_head(node);
1464 * FIXME-VS:???
1466 jrelse(log);
1467 reiser4_drop_io_head(log);
1468 goto free_ow_set;
1471 block = le64_to_cpu(get_unaligned(&entry->original));
1473 assert("zam-603", block != 0);
1475 jnode_set_block(node, &block);
1477 list_add_tail(&node->capture_link, ch.overwrite_set);
1479 ++entry;
1482 jrelse(log);
1483 reiser4_drop_io_head(log);
1485 --nr_wander_records;
1488 if (nr_wander_records != 0) {
1489 warning("zam-632", "number of wander records in the linked list"
1490 " less than number stored in tx head.\n");
1491 ret = RETERR(-EIO);
1492 goto free_ow_set;
1495 { /* write wandered set in place */
1496 write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
1497 ret = wait_on_jnode_list(ch.overwrite_set);
1499 if (ret) {
1500 ret = RETERR(-EIO);
1501 goto free_ow_set;
1505 ret = update_journal_footer(&ch, 0);
1507 free_ow_set:
1509 while (!list_empty(ch.overwrite_set)) {
1510 jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
1511 list_del_init(&cur->capture_link);
1512 jrelse(cur);
1513 reiser4_drop_io_head(cur);
1516 list_del_init(&tx_head->capture_link);
1518 done_commit_handle(&ch);
1520 return ret;
1523 /* find oldest committed and not played transaction and play it. The transaction
1524 * was committed and journal header block was updated but the blocks from the
1525 * process of writing the atom's overwrite set in-place and updating of journal
1526 * footer block were not completed. This function completes the process by
1527 * recovering the atom's overwrite set from their wandered locations and writes
1528 * them in-place and updating the journal footer. */
1529 static int replay_oldest_transaction(struct super_block *s)
1531 reiser4_super_info_data *sbinfo = get_super_private(s);
1532 jnode *jf = sbinfo->journal_footer;
1533 unsigned int total;
1534 struct journal_footer *F;
1535 struct tx_header *T;
1537 reiser4_block_nr prev_tx;
1538 reiser4_block_nr last_flushed_tx;
1539 reiser4_block_nr log_rec_block = 0;
1541 jnode *tx_head;
1543 int ret;
1545 if ((ret = jload(jf)) < 0)
1546 return ret;
1548 F = (struct journal_footer *)jdata(jf);
1550 last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
1552 jrelse(jf);
1554 if (sbinfo->last_committed_tx == last_flushed_tx) {
1555 /* all transactions are replayed */
1556 return 0;
1559 prev_tx = sbinfo->last_committed_tx;
1561 /* searching for oldest not flushed transaction */
1562 while (1) {
1563 tx_head = reiser4_alloc_io_head(&prev_tx);
1564 if (!tx_head)
1565 return RETERR(-ENOMEM);
1567 ret = jload(tx_head);
1568 if (ret < 0) {
1569 reiser4_drop_io_head(tx_head);
1570 return ret;
1573 ret = check_tx_head(tx_head);
1574 if (ret) {
1575 jrelse(tx_head);
1576 reiser4_drop_io_head(tx_head);
1577 return ret;
1580 T = (struct tx_header *)jdata(tx_head);
1582 prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
1584 if (prev_tx == last_flushed_tx)
1585 break;
1587 jrelse(tx_head);
1588 reiser4_drop_io_head(tx_head);
1591 total = le32_to_cpu(get_unaligned(&T->total));
1592 log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
1594 pin_jnode_data(tx_head);
1595 jrelse(tx_head);
1597 ret =
1598 replay_transaction(s, tx_head, &log_rec_block,
1599 jnode_get_block(tx_head), total - 1);
1601 unpin_jnode_data(tx_head);
1602 reiser4_drop_io_head(tx_head);
1604 if (ret)
1605 return ret;
1606 return -E_REPEAT;
/* The reiser4 journal current implementation was optimized to not to capture
   super block if certain super block fields are modified. Currently, the set
   is (<free block count>, <OID allocator>). These fields are logged in a
   special way which includes storing them in each transaction head block at
   atom commit time and writing that information to the journal footer block at
   atom flush time. For getting this info from the journal footer block to the
   in-memory super block there is a special function
   reiser4_journal_recover_sb_data() which should be called after the disk
   format plugin re-reads the super block after journal replaying.
*/
1620 /* get the information from journal footer in-memory super block */
1621 int reiser4_journal_recover_sb_data(struct super_block *s)
1623 reiser4_super_info_data *sbinfo = get_super_private(s);
1624 struct journal_footer *jf;
1625 int ret;
1627 assert("zam-673", sbinfo->journal_footer != NULL);
1629 ret = jload(sbinfo->journal_footer);
1630 if (ret != 0)
1631 return ret;
1633 ret = check_journal_footer(sbinfo->journal_footer);
1634 if (ret != 0)
1635 goto out;
1637 jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
1639 /* was there at least one flushed transaction? */
1640 if (jf->last_flushed_tx) {
1642 /* restore free block counter logged in this transaction */
1643 reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
1645 /* restore oid allocator state */
1646 oid_init_allocator(s,
1647 le64_to_cpu(get_unaligned(&jf->nr_files)),
1648 le64_to_cpu(get_unaligned(&jf->next_oid)));
1650 out:
1651 jrelse(sbinfo->journal_footer);
1652 return ret;
1655 /* reiser4 replay journal procedure */
1656 int reiser4_journal_replay(struct super_block *s)
1658 reiser4_super_info_data *sbinfo = get_super_private(s);
1659 jnode *jh, *jf;
1660 struct journal_header *header;
1661 int nr_tx_replayed = 0;
1662 int ret;
1664 assert("zam-582", sbinfo != NULL);
1666 jh = sbinfo->journal_header;
1667 jf = sbinfo->journal_footer;
1669 if (!jh || !jf) {
1670 /* it is possible that disk layout does not support journal
1671 structures, we just warn about this */
1672 warning("zam-583",
1673 "journal control blocks were not loaded by disk layout plugin. "
1674 "journal replaying is not possible.\n");
1675 return 0;
1678 /* Take free block count from journal footer block. The free block
1679 counter value corresponds the last flushed transaction state */
1680 ret = jload(jf);
1681 if (ret < 0)
1682 return ret;
1684 ret = check_journal_footer(jf);
1685 if (ret) {
1686 jrelse(jf);
1687 return ret;
1690 jrelse(jf);
1692 /* store last committed transaction info in reiser4 in-memory super
1693 block */
1694 ret = jload(jh);
1695 if (ret < 0)
1696 return ret;
1698 ret = check_journal_header(jh);
1699 if (ret) {
1700 jrelse(jh);
1701 return ret;
1704 header = (struct journal_header *)jdata(jh);
1705 sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
1707 jrelse(jh);
1709 /* replay committed transactions */
1710 while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
1711 nr_tx_replayed++;
1713 return ret;
1716 /* load journal control block (either journal header or journal footer block) */
1717 static int
1718 load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
1720 int ret;
1722 *node = reiser4_alloc_io_head(block);
1723 if (!(*node))
1724 return RETERR(-ENOMEM);
1726 ret = jload(*node);
1728 if (ret) {
1729 reiser4_drop_io_head(*node);
1730 *node = NULL;
1731 return ret;
1734 pin_jnode_data(*node);
1735 jrelse(*node);
1737 return 0;
1740 /* unload journal header or footer and free jnode */
1741 static void unload_journal_control_block(jnode ** node)
1743 if (*node) {
1744 unpin_jnode_data(*node);
1745 reiser4_drop_io_head(*node);
1746 *node = NULL;
1750 /* release journal control blocks */
1751 void reiser4_done_journal_info(struct super_block *s)
1753 reiser4_super_info_data *sbinfo = get_super_private(s);
1755 assert("zam-476", sbinfo != NULL);
1757 unload_journal_control_block(&sbinfo->journal_header);
1758 unload_journal_control_block(&sbinfo->journal_footer);
1759 rcu_barrier();
1762 /* load journal control blocks */
1763 int reiser4_init_journal_info(struct super_block *s)
1765 reiser4_super_info_data *sbinfo = get_super_private(s);
1766 journal_location *loc;
1767 int ret;
1769 loc = &sbinfo->jloc;
1771 assert("zam-651", loc != NULL);
1772 assert("zam-652", loc->header != 0);
1773 assert("zam-653", loc->footer != 0);
1775 ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
1777 if (ret)
1778 return ret;
1780 ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
1782 if (ret) {
1783 unload_journal_control_block(&sbinfo->journal_header);
1786 return ret;
/* Make Linus happy.
   Local variables:
   c-indentation-style: "K&R"
   mode-name: "LC"
   c-basic-offset: 8
   tab-width: 8
   fill-column: 80
   End:
*/