On Tue, Nov 06, 2007 at 02:33:53AM -0800, akpm@linux-foundation.org wrote:
mmotm.git: fs/reiser4/wander.c
/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
 * reiser4/README */

/* Reiser4 Wandering Log */

/* You should read http://www.namesys.com/txn-doc.html

   That describes how filesystem operations are performed as atomic
   transactions, and how we try to arrange it so that we can write most of the
   data only once while performing the operation atomically.

   For the purposes of this code, it is enough for it to understand that it
   has been told a given block should be written either once, or twice (if
   twice then once to the wandered location and once to the real location).

   This code guarantees that those blocks that are defined to be part of an
   atom either all take effect or none of them take effect.

   The "relocate set" of nodes are submitted to write by the jnode_flush()
   routine, and the "overwrite set" is submitted by reiser4_write_logs().
   This is because with the overwrite set we seek to optimize writes, and
   with the relocate set we seek to cause disk order to correlate with the
   "parent first order" (preorder).

   reiser4_write_logs() allocates and writes wandered blocks and maintains
   additional on-disk structures of the atom as wander records (each wander
   record occupies one block) for storing of the "wandered map" (a table which
   contains a relation between wandered and real block numbers) and other
   information which might be needed at transaction recovery time.

   The wander records are unidirectionally linked into a circle: each wander
   record contains a block number of the next wander record, the last wander
   record points to the first one.

   One wander record (named "tx head" in this file) has a format which is
   different from the other wander records. The "tx head" has a reference to
   the "tx head" block of the previously committed atom. Also, "tx head"
   contains fs information (the free blocks counter, and the oid allocator
   state) which is logged in a special way.

   There are two journal control blocks, named journal header and journal
   footer, which have fixed on-disk locations. The journal header has a
   reference to the "tx head" block of the last committed atom. The journal
   footer points to the "tx head" of the last flushed atom. The atom is
   "played" when all blocks from its overwrite set are written to disk the
   second time (i.e. written to their real locations).

   NOTE: People who know reiserfs internals and its journal structure might be
   confused by these terms journal footer and journal header. There is a table
   with terms of similar semantics in reiserfs (reiser3) and reiser4:

   REISER3 TERM        |  REISER4 TERM         |  DESCRIPTION
   --------------------+-----------------------+----------------------------
   commit record       |  journal header       |  atomic write of this record
                       |                       |  ends transaction commit
   --------------------+-----------------------+----------------------------
   journal header      |  journal footer       |  atomic write of this record
                       |                       |  ends post-commit writes.
                       |                       |  After a successful write of
                       |                       |  this record, journal blocks
                       |                       |  (in reiser3) or wandered
                       |                       |  blocks/records (in reiser4)
                       |                       |  are free for re-use.
   --------------------+-----------------------+----------------------------

   The atom commit process is the following:

   1. The overwrite set is taken from atom's clean list, and its size is
      counted.

   2. The number of necessary wander records (including tx head) is
      calculated, and the wander record blocks are allocated.

   3. Allocate wandered blocks and populate wander records by wandered map.

   4. Submit write requests for wander records and wandered blocks.

   5. Wait until submitted write requests complete.

   6. Update journal header: change the pointer to the block number of just
      written tx head, submit an i/o for modified journal header block and
      wait for i/o completion.

   NOTE: The special logging for bitmap blocks and some reiser4 super block
   fields makes the processes of atom commit, flush and recovery a bit more
   complex (see comments in the source code for details).

   The atom playing process is the following:

   1. Write atom's overwrite set in-place.

   2. Wait on i/o.

   3. Update journal footer: change the pointer to the block number of the tx
      head block of the atom we are currently flushing, submit an i/o, wait on
      i/o completion.

   4. Free disk space which was used for wandered blocks and wander records.

   After the freeing of wandered blocks and wander records the journal footer
   points to an on-disk structure which might be overwritten soon. Neither the
   log writer nor the journal recovery procedure uses that pointer for
   accessing the data. When the journal recovery procedure finds the oldest
   transaction it compares the journal footer pointer value with the "prev_tx"
   pointer value in the tx head; if the values are equal, the oldest unflushed
   transaction has been found.

   NOTE on disk space leakage: information about which blocks and how many
   blocks are allocated for wandered blocks and wander records is not written
   to disk, because of the special logging for bitmaps and some super block
   counters. After a system crash reiser4 does not remember those allocations,
   thus there is no disk space leakage of this kind.
*/

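/* For orientation, a sketch of the on-disk records implied by the accessors
 * used in this file. The authoritative layouts live in wander.h; the field
 * names below are taken from the code here, while the types and the field
 * ordering are illustrative assumptions only:
 *
 *	struct journal_header {
 *		d64 last_committed_tx;	(tx head of the last committed atom)
 *	};
 *	struct journal_footer {
 *		d64 last_flushed_tx;	(tx head of the last played atom)
 *		d64 free_blocks;	(specially logged sb counters)
 *		d64 nr_files;
 *		d64 next_oid;
 *	};
 *	struct tx_header {
 *		char magic[TX_HEADER_MAGIC_SIZE];
 *		d32 total;		(number of wander records, tx head included)
 *		d64 prev_tx;		(tx head of the previously committed atom)
 *		d64 next_block;		(next wander record in the circle)
 *		d64 free_blocks;	(specially logged sb counters)
 *		d64 nr_files;
 *		d64 next_oid;
 *	};
 *	struct wander_record_header {
 *		char magic[WANDER_RECORD_MAGIC_SIZE];
 *		d32 total;		(same as in tx head)
 *		d32 serial;		(position of this record in the chain)
 *		d64 next_block;		(next wander record in the circle)
 *	};
 *	struct wander_entry {
 *		d64 original;		(real, in-place block number)
 *		d64 wandered;		(temporary wandered block number)
 *	};
 */
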
/* Special logging of reiser4 super block fields. */

/* There are some reiser4 super block fields (free block count and OID
   allocator state (number of files and next free OID)) which are logged
   separately from the super block to avoid unnecessary atom fusion.

   So the reiser4 super block need not be captured by a transaction that
   allocates/deallocates disk blocks or creates/deletes file objects.
   Moreover, the reiser4 on-disk super block is not touched when such a
   transaction is committed and flushed. Those "counters logged specially" are
   logged in "tx head" blocks and in the journal footer block.

   A step-by-step description of special logging:

   0. The per-atom information about deleted or created files and allocated or
   freed blocks is collected during the transaction. The atom's
   ->nr_objects_created and ->nr_objects_deleted are for object
   deletion/creation tracking, the numbers of allocated and freed blocks are
   calculated using atom's delete set and atom's capture list -- all new and
   relocated nodes should be on atom's clean list and should have JNODE_RELOC
   bit set.

   1. The "logged specially" reiser4 super block fields have their "committed"
   versions in the reiser4 in-memory super block. They get modified only at
   atom commit time. The atom's commit thread has exclusive access to those
   "committed" fields because the log writer implementation supports only one
   atom commit at a time (there is a per-fs "commit" mutex). At that time
   "committed" counters are modified using per-atom information collected
   during the transaction. These counters are stored on disk as a part of the
   tx head block when the atom is committed.

   2. When the atom is flushed the value of the free block counter and the OID
   allocator state get written to the journal footer block. A special journal
   procedure (journal_recover_sb_data()) takes those values from the journal
   footer and updates the reiser4 in-memory super block.

   NOTE: That means free block count and OID allocator state are logged
   separately from the reiser4 super block regardless of the fact that the
   reiser4 super block has fields to store both the free block counter and the
   OID allocator.

   Writing the whole super block at commit time would require knowing the true
   values of all its fields without changes made by not yet committed
   transactions. That would be possible by keeping a "committed" version of
   the super block, the way the reiser4 bitmap blocks have "committed" and
   "working" versions. However, another scheme was implemented which stores
   the specially logged values in the unused free space inside the transaction
   head block. In my opinion it has the advantage of not writing the whole
   super block when only part of it was modified. */

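/* A condensed sketch of that flow, using the function names defined below:

   commit: "committed" counters -> tx head block      (format_tx_head())
   flush:  "committed" counters -> journal footer     (format_journal_footer())
   mount:  journal footer -> in-memory super block    (reiser4_journal_recover_sb_data())
*/
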
#include "debug.h"
#include "dformat.h"
#include "txnmgr.h"
#include "jnode.h"
#include "znode.h"
#include "block_alloc.h"
#include "page_cache.h"
#include "wander.h"
#include "reiser4.h"
#include "super.h"
#include "vfs_ops.h"
#include "writeout.h"
#include "inode.h"
#include "entd.h"

#include <linux/types.h>
#include <linux/fs.h>		/* for struct super_block */
#include <linux/mm.h>		/* for struct page */
#include <linux/pagemap.h>
#include <linux/bio.h>		/* for struct bio */
#include <linux/blkdev.h>

static int write_jnodes_to_disk_extent(
	jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);

/* The commit_handle is a container for objects needed at atom commit time */
struct commit_handle {
	/* A pointer to atom's list of OVRWR nodes */
	struct list_head *overwrite_set;
	/* atom's overwrite set size */
	int overwrite_set_size;
	/* jnodes for wander record blocks */
	struct list_head tx_list;
	/* number of wander records */
	__u32 tx_size;
	/* 'committed' sb counters are saved here until atom is completely
	   flushed */
	__u64 free_blocks;
	__u64 nr_files;
	__u64 next_oid;
	/* A pointer to the atom which is being committed */
	txn_atom *atom;
	/* A pointer to current super block */
	struct super_block *super;
	/* The counter of modified bitmaps */
	reiser4_block_nr nr_bitmap;
};

static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
{
	memset(ch, 0, sizeof(struct commit_handle));
	INIT_LIST_HEAD(&ch->tx_list);

	ch->atom = atom;
	ch->super = reiser4_get_current_sb();
}

static void done_commit_handle(struct commit_handle *ch)
{
	assert("zam-690", list_empty(&ch->tx_list));
}

static inline int reiser4_use_write_barrier(struct super_block * s)
{
	return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
}

static void disable_write_barrier(struct super_block * s)
{
	notice("zam-1055", "%s does not support write barriers,"
	       " using synchronous write instead.", s->s_id);
	set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
}

/* fill journal header block data */
static void format_journal_header(struct commit_handle *ch)
{
	struct reiser4_super_info_data *sbinfo;
	struct journal_header *header;
	jnode *txhead;

	sbinfo = get_super_private(ch->super);
	assert("zam-479", sbinfo != NULL);
	assert("zam-480", sbinfo->journal_header != NULL);

	txhead = list_entry(ch->tx_list.next, jnode, capture_link);

	jload(sbinfo->journal_header);

	header = (struct journal_header *)jdata(sbinfo->journal_header);
	assert("zam-484", header != NULL);

	put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
		      &header->last_committed_tx);

	jrelse(sbinfo->journal_header);
}

/* fill journal footer block data */
static void format_journal_footer(struct commit_handle *ch)
{
	struct reiser4_super_info_data *sbinfo;
	struct journal_footer *footer;
	jnode *tx_head;

	sbinfo = get_super_private(ch->super);

	tx_head = list_entry(ch->tx_list.next, jnode, capture_link);

	assert("zam-493", sbinfo != NULL);
	assert("zam-494", sbinfo->journal_header != NULL);

	check_me("zam-691", jload(sbinfo->journal_footer) == 0);

	footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
	assert("zam-495", footer != NULL);

	put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
		      &footer->last_flushed_tx);
	put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);

	put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
	put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);

	jrelse(sbinfo->journal_footer);
}

/* wander record capacity depends on current block size */
static int wander_record_capacity(const struct super_block *super)
{
	return (super->s_blocksize -
		sizeof(struct wander_record_header)) /
	    sizeof(struct wander_entry);
}

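/* Worked example (illustrative sizes only, the real ones come from wander.h):
   with 4096-byte blocks, a 32-byte wander_record_header and 16-byte
   wander_entry pairs, one wander record holds (4096 - 32) / 16 = 254
   entries. */
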
/* Fill first wander record (tx head) in accordance with supplied data */
static void format_tx_head(struct commit_handle *ch)
{
	jnode *tx_head;
	jnode *next;
	struct tx_header *header;

	tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
	assert("zam-692", &ch->tx_list != &tx_head->capture_link);

	next = list_entry(tx_head->capture_link.next, jnode, capture_link);
	if (&ch->tx_list == &next->capture_link)
		next = tx_head;

	header = (struct tx_header *)jdata(tx_head);

	assert("zam-460", header != NULL);
	assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));

	memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
	memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);

	put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
	put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
		      &header->prev_tx);
	put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
	put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
	put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
	put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
}

/* prepare ordinary wander record block (fill all service fields) */
static void
format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
{
	struct wander_record_header *LRH;
	jnode *next;

	assert("zam-464", node != NULL);

	LRH = (struct wander_record_header *)jdata(node);
	next = list_entry(node->capture_link.next, jnode, capture_link);

	if (&ch->tx_list == &next->capture_link)
		next = list_entry(ch->tx_list.next, jnode, capture_link);

	assert("zam-465", LRH != NULL);
	assert("zam-463",
	       ch->super->s_blocksize > sizeof(struct wander_record_header));

	memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
	memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);

	put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
	put_unaligned(cpu_to_le32(serial), &LRH->serial);
	put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
}

/* add one wandered map entry to formatted wander record */
static void
store_entry(jnode * node, int index, const reiser4_block_nr * a,
	    const reiser4_block_nr * b)
{
	char *data;
	struct wander_entry *pairs;

	data = jdata(node);
	assert("zam-451", data != NULL);

	pairs =
	    (struct wander_entry *)(data + sizeof(struct wander_record_header));

	put_unaligned(cpu_to_le64(*a), &pairs[index].original);
	put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
}

/* currently, wander records contain only the wandered map, whose size depends
   on the overwrite set size */
static void get_tx_size(struct commit_handle *ch)
{
	assert("zam-440", ch->overwrite_set_size != 0);
	assert("zam-695", ch->tx_size == 0);

	/* count all ordinary wander records
	   (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add
	   one for tx head block */
	ch->tx_size =
	    (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
	    2;
}

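/* Worked example: with an overwrite set of 1000 blocks and a record capacity
   of 254 entries (the illustrative numbers above), tx_size is
   (1000 - 1) / 254 + 2 = 5: four ordinary wander records plus the tx head. */
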
/* A special structure for use in store_wmap_actor() for saving its state
   between calls */
struct store_wmap_params {
	jnode *cur;		/* jnode of current wander record to fill */
	int idx;		/* free element index in wander record */
	int capacity;		/* capacity */

#if REISER4_DEBUG
	struct list_head *tx_list;
#endif
};

/* an actor for use in the blocknr_set_iterator routine which populates the
   list of pre-formatted wander records with wandered map info */
static int
store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
		 const reiser4_block_nr * b, void *data)
{
	struct store_wmap_params *params = data;

	if (params->idx >= params->capacity) {
		/* a new wander record should be taken from the tx_list */
		params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
		assert("zam-454",
		       params->tx_list != &params->cur->capture_link);

		params->idx = 0;
	}

	store_entry(params->cur, params->idx, a, b);
	params->idx++;

	return 0;
}

/* This function is called after the relocate set gets written to disk, the
   overwrite set is written to wandered locations and all wander records are
   written as well. The updated journal header block contains a pointer (block
   number) to the first wander record of the just written transaction */
static int update_journal_header(struct commit_handle *ch, int use_barrier)
{
	struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
	jnode *jh = sbinfo->journal_header;
	jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
	int ret;

	format_journal_header(ch);

	ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
					  use_barrier ? WRITEOUT_BARRIER : 0);
	if (ret)
		return ret;

	/* blk_run_address_space(sbinfo->fake->i_mapping);
	 * blk_run_queues(); */

	ret = jwait_io(jh, WRITE);

	if (ret)
		return ret;

	sbinfo->last_committed_tx = *jnode_get_block(head);

	return 0;
}

/* This function is called after write-back is finished. We update journal
   footer block and free blocks which were occupied by wandered blocks and
   transaction wander records */
static int update_journal_footer(struct commit_handle *ch, int use_barrier)
{
	reiser4_super_info_data *sbinfo = get_super_private(ch->super);

	jnode *jf = sbinfo->journal_footer;

	int ret;

	format_journal_footer(ch);

	ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
					  use_barrier ? WRITEOUT_BARRIER : 0);
	if (ret)
		return ret;

	/* blk_run_address_space(sbinfo->fake->i_mapping);
	 * blk_run_queue(); */

	ret = jwait_io(jf, WRITE);
	if (ret)
		return ret;

	return 0;
}

/* free block numbers of wander records of a transaction that has already been
   written in place */
static void dealloc_tx_list(struct commit_handle *ch)
{
	while (!list_empty(&ch->tx_list)) {
		jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);

		list_del(&cur->capture_link);
		ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
		reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
				      BA_FORMATTED);

		unpin_jnode_data(cur);
		reiser4_drop_io_head(cur);
	}
}

/* An actor for use in the blocknr_set_iterator() routine which frees wandered
   blocks from atom's overwrite set. */
static int
dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
		   const reiser4_block_nr * a UNUSED_ARG,
		   const reiser4_block_nr * b, void *data UNUSED_ARG)
{
	assert("zam-499", b != NULL);
	assert("zam-500", *b != 0);
	assert("zam-501", !reiser4_blocknr_is_fake(b));

	reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
	return 0;
}

/* free wandered block locations of a transaction that has already been
   written in place */
static void dealloc_wmap(struct commit_handle *ch)
{
	assert("zam-696", ch->atom != NULL);

	blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
			     dealloc_wmap_actor, NULL, 1);
}

/* helper function for alloc_wandered_blocks() which refills the set of block
   numbers needed for wandered blocks */
static int
get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
{
	reiser4_blocknr_hint hint;
	int ret;

	reiser4_block_nr wide_len = count;

	/* FIXME-ZAM: A special policy needed for allocation of wandered blocks
	   ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
	   reserved allocation area so as to get the best qualities of fixed
	   journals? */
	reiser4_blocknr_hint_init(&hint);
	hint.block_stage = BLOCK_GRABBED;

	ret = reiser4_alloc_blocks(&hint, start, &wide_len,
				   BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
	*len = (int)wide_len;

	return ret;
}

/*
 * roll back changes made before issuing BIO in the case of IO error.
 */
static void undo_bio(struct bio *bio)
{
	int i;

	for (i = 0; i < bio->bi_vcnt; ++i) {
		struct page *pg;
		jnode *node;

		pg = bio->bi_io_vec[i].bv_page;
		end_page_writeback(pg);
		node = jprivate(pg);
		spin_lock_jnode(node);
		JF_CLR(node, JNODE_WRITEBACK);
		JF_SET(node, JNODE_DIRTY);
		spin_unlock_jnode(node);
	}
	bio_put(bio);
}

/* put overwrite set back to atom's clean list */
static void put_overwrite_set(struct commit_handle *ch)
{
	jnode *cur;

	list_for_each_entry(cur, ch->overwrite_set, capture_link)
		jrelse_tail(cur);
}

/* Count overwrite set size, grab disk space for wandered blocks allocation.
   Since we have a separate list for atom's overwrite set we just scan the
   list, counting bitmap and other non-leaf nodes whose wandered block
   allocation we have to grab space for. */
static int get_overwrite_set(struct commit_handle *ch)
{
	int ret;
	jnode *cur;
	__u64 nr_not_leaves = 0;
#if REISER4_DEBUG
	__u64 nr_formatted_leaves = 0;
	__u64 nr_unformatted_leaves = 0;
#endif

	assert("zam-697", ch->overwrite_set_size == 0);

	ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
	cur = list_entry(ch->overwrite_set->next, jnode, capture_link);

	while (ch->overwrite_set != &cur->capture_link) {
		jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);

		/* Count bitmap nodes to get correct statistics of how many
		 * blocks were cleared by the transaction commit. */
		if (jnode_get_type(cur) == JNODE_BITMAP)
			ch->nr_bitmap++;

		assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
		       || jnode_get_type(cur) == JNODE_BITMAP);

		if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
			/* we replace fake znode by another (real)
			   znode which is suggested by disk_layout
			   plugin */

			/* FIXME: it looks like fake znode should be
			   replaced by jnode supplied by
			   disk_layout. */

			struct super_block *s = reiser4_get_current_sb();
			reiser4_super_info_data *sbinfo =
			    get_current_super_private();

			if (sbinfo->df_plug->log_super) {
				jnode *sj = sbinfo->df_plug->log_super(s);

				assert("zam-593", sj != NULL);

				if (IS_ERR(sj))
					return PTR_ERR(sj);

				spin_lock_jnode(sj);
				JF_SET(sj, JNODE_OVRWR);
				insert_into_atom_ovrwr_list(ch->atom, sj);
				spin_unlock_jnode(sj);

				/* jload it as the rest of overwrite set */
				jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);

				ch->overwrite_set_size++;
			}
			spin_lock_jnode(cur);
			reiser4_uncapture_block(cur);
			jput(cur);

		} else {
			int ret;

			ch->overwrite_set_size++;
			ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
			if (ret)
				reiser4_panic("zam-783",
					      "cannot load e-flushed jnode back (ret = %d)\n",
					      ret);
		}

		/* Count non-leaves here because we have to grab disk space
		 * for wandered blocks. They were not counted as "flush
		 * reserved". Counting should be done _after_ nodes are pinned
		 * into memory by jload(). */
		if (!jnode_is_leaf(cur))
			nr_not_leaves++;
		else {
#if REISER4_DEBUG
			/* at this point @cur either has JNODE_FLUSH_RESERVED
			 * or is eflushed. Locking is not strong enough to
			 * write an assertion checking for this. */
			if (jnode_is_znode(cur))
				nr_formatted_leaves++;
			else
				nr_unformatted_leaves++;
#endif
			JF_CLR(cur, JNODE_FLUSH_RESERVED);
		}

		cur = next;
	}

	/* Grab space for writing (wandered blocks) of non-leaves found in
	 * overwrite set. */
	ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
	if (ret)
		return ret;

	/* Disk space for allocation of wandered blocks of leaf nodes already
	 * reserved as "flush reserved", move it to grabbed space counter. */
	spin_lock_atom(ch->atom);
	assert("zam-940",
	       nr_formatted_leaves + nr_unformatted_leaves <=
	       ch->atom->flush_reserved);
	flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
	spin_unlock_atom(ch->atom);

	return ch->overwrite_set_size;
}

/**
 * write_jnodes_to_disk_extent - submit write request
 * @first: first jnode of the list
 * @nr: number of jnodes on the list
 * @block_p: starting block number of the disk extent to write to
 * @fq: flush queue to attach the i/o to, or NULL
 * @flags: used to decide whether page is to get PG_reclaim flag
 *
 * Submits a write request for @nr jnodes beginning from the @first, other
 * jnodes are after the @first on the double-linked "capture" list. All jnodes
 * will be written to the disk region of @nr blocks starting with @block_p
 * block number. If @fq is not NULL it means that waiting for i/o completion
 * will be done more efficiently by using flush_queue_t objects.
 * This function is the one which writes a list of jnodes in batch mode. It
 * does all the low-level work such as bio construction and page state
 * manipulation.
 *
 * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
 * aggregated in this function instead of being left to the layers below
 *
 * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to
 * that? Why is that layer needed? Why can BIOs not be constructed here?
 */
static int write_jnodes_to_disk_extent(
	jnode *first, int nr, const reiser4_block_nr *block_p,
	flush_queue_t *fq, int flags)
{
	struct super_block *super = reiser4_get_current_sb();
	int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
	int max_blocks;
	jnode *cur = first;
	reiser4_block_nr block;

	assert("zam-571", first != NULL);
	assert("zam-572", block_p != NULL);
	assert("zam-570", nr > 0);

	block = *block_p;
	max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);

	while (nr > 0) {
		struct bio *bio;
		int nr_blocks = min(nr, max_blocks);
		int i;
		int nr_used;

		bio = bio_alloc(GFP_NOIO, nr_blocks);
		if (!bio)
			return RETERR(-ENOMEM);

		bio->bi_bdev = super->s_bdev;
		bio->bi_sector = block * (super->s_blocksize >> 9);
		for (nr_used = 0, i = 0; i < nr_blocks; i++) {
			struct page *pg;

			pg = jnode_page(cur);
			assert("zam-573", pg != NULL);

			page_cache_get(pg);

			lock_and_wait_page_writeback(pg);

			if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
				/*
				 * underlying device is satiated. Stop adding
				 * pages to the bio.
				 */
				unlock_page(pg);
				page_cache_release(pg);
				break;
			}

			spin_lock_jnode(cur);
			assert("nikita-3166",
			       pg->mapping == jnode_get_mapping(cur));
			assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
#if REISER4_DEBUG
			spin_lock(&cur->load);
			assert("nikita-3165", !jnode_is_releasable(cur));
			spin_unlock(&cur->load);
#endif
			JF_SET(cur, JNODE_WRITEBACK);
			JF_CLR(cur, JNODE_DIRTY);
			ON_DEBUG(cur->written++);
			spin_unlock_jnode(cur);

			ClearPageError(pg);
			set_page_writeback(pg);

			if (get_current_context()->entd) {
				/* this is ent thread */
				entd_context *ent = get_entd_context(super);
				struct wbq *rq, *next;

				spin_lock(&ent->guard);

				if (pg == ent->cur_request->page) {
					/*
					 * entd is called for this page. This
					 * request is not in the todo list
					 */
					ent->cur_request->written = 1;
				} else {
					/*
					 * if we have written a page for which
					 * writepage is called - move the
					 * request to another list.
					 */
					list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
						assert("", rq->magic == WBQ_MAGIC);
						if (pg == rq->page) {
							/*
							 * remove request from
							 * entd's queue, but do
							 * not wake up a thread
							 * which put this
							 * request
							 */
							list_del_init(&rq->link);
							ent->nr_todo_reqs --;
							list_add_tail(&rq->link, &ent->done_list);
							ent->nr_done_reqs ++;
							rq->written = 1;
							break;
						}
					}
				}
				spin_unlock(&ent->guard);
			}

			clear_page_dirty_for_io(pg);

			unlock_page(pg);

			cur = list_entry(cur->capture_link.next, jnode, capture_link);
			nr_used++;
		}
		if (nr_used > 0) {
			assert("nikita-3453",
			       bio->bi_size == super->s_blocksize * nr_used);
			assert("nikita-3454", bio->bi_vcnt == nr_used);

			/* Check if we are allowed to write at all */
			if (super->s_flags & MS_RDONLY)
				undo_bio(bio);
			else {
				int not_supported;

				add_fq_to_bio(fq, bio);
				bio_get(bio);
				reiser4_submit_bio(write_op, bio);
				not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
				bio_put(bio);
				if (not_supported)
					return -EOPNOTSUPP;
			}

			block += nr_used - 1;
			update_blocknr_hint_default(super, &block);
			block += 1;
		} else {
			bio_put(bio);
		}
		nr -= nr_used;
	}

	return 0;
}

/* This is a procedure which recovers contiguous sequences of disk block
   numbers in the given list of j-nodes and submits write requests on a
   per-sequence basis */
int
write_jnode_list(struct list_head *head, flush_queue_t *fq,
		 long *nr_submitted, int flags)
{
	int ret;
	jnode *beg = list_entry(head->next, jnode, capture_link);

	while (head != &beg->capture_link) {
		int nr = 1;
		jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);

		while (head != &cur->capture_link) {
			if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
				break;
			++nr;
			cur = list_entry(cur->capture_link.next, jnode, capture_link);
		}

		ret = write_jnodes_to_disk_extent(
			beg, nr, jnode_get_block(beg), fq, flags);
		if (ret)
			return ret;

		if (nr_submitted)
			*nr_submitted += nr;

		beg = cur;
	}

	return 0;
}

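/* Example (hypothetical block numbers): a list of jnodes mapped to blocks
   100, 101, 102 and 200 results in two calls to write_jnodes_to_disk_extent():
   one for the three-block extent starting at block 100 and one for the single
   block at 200. */
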
/* add given wandered mapping to atom's wandered map */
static int
add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
{
	int ret;
	blocknr_set_entry *new_bsep = NULL;
	reiser4_block_nr block;

	txn_atom *atom;

	assert("zam-568", block_p != NULL);
	block = *block_p;
	assert("zam-569", len > 0);

	while ((len--) > 0) {
		do {
			atom = get_current_atom_locked();
			assert("zam-536",
			       !reiser4_blocknr_is_fake(jnode_get_block(cur)));
			ret =
			    blocknr_set_add_pair(atom, &atom->wandered_map,
						 &new_bsep,
						 jnode_get_block(cur), &block);
		} while (ret == -E_REPEAT);

		if (ret) {
			/* deallocate blocks which were not added to wandered
			   map */
			reiser4_block_nr wide_len = len;

			reiser4_dealloc_blocks(&block, &wide_len,
					       BLOCK_NOT_COUNTED,
					       BA_FORMATTED
					       /* formatted, without defer */ );

			return ret;
		}

		spin_unlock_atom(atom);

		cur = list_entry(cur->capture_link.next, jnode, capture_link);
		++block;
	}

	return 0;
}

/* Allocate wandered blocks for the current atom's OVERWRITE SET and
   immediately submit IO for the allocated blocks. We assume that the current
   atom is in a stage when atom fusion is impossible, so working with the atom
   unlocked is safe. */
static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
{
	reiser4_block_nr block;

	int rest;
	int len;
	int ret;

	jnode *cur;

	assert("zam-534", ch->overwrite_set_size > 0);

	rest = ch->overwrite_set_size;

	cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
	while (ch->overwrite_set != &cur->capture_link) {
		assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));

		ret = get_more_wandered_blocks(rest, &block, &len);
		if (ret)
			return ret;

		rest -= len;

		ret = add_region_to_wmap(cur, len, &block);
		if (ret)
			return ret;

		ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
		if (ret)
			return ret;

		while ((len--) > 0) {
			assert("zam-604",
			       ch->overwrite_set != &cur->capture_link);
			cur = list_entry(cur->capture_link.next, jnode, capture_link);
		}
	}

	return 0;
}

/* allocate given number of nodes over the journal area and link them into a
   list, return pointer to the first jnode in the list */
static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
{
	reiser4_blocknr_hint hint;
	reiser4_block_nr allocated = 0;
	reiser4_block_nr first, len;
	jnode *cur;
	jnode *txhead;
	int ret;
	reiser4_context *ctx;
	reiser4_super_info_data *sbinfo;

	assert("zam-698", ch->tx_size > 0);
	assert("zam-699", list_empty_careful(&ch->tx_list));

	ctx = get_current_context();
	sbinfo = get_super_private(ctx->super);

	while (allocated < (unsigned)ch->tx_size) {
		len = (ch->tx_size - allocated);

		reiser4_blocknr_hint_init(&hint);

		hint.block_stage = BLOCK_GRABBED;

		/* FIXME: there should be some block allocation policy for
		   nodes which contain wander records */

		/* We assume that disk space for wandered record blocks can be
		 * taken from reserved area. */
		ret = reiser4_alloc_blocks(&hint, &first, &len,
					   BA_FORMATTED | BA_RESERVED |
					   BA_USE_DEFAULT_SEARCH_START);
		reiser4_blocknr_hint_done(&hint);

		if (ret)
			return ret;

		allocated += len;

		/* create jnodes for all wander records */
		while (len--) {
			cur = reiser4_alloc_io_head(&first);

			if (cur == NULL) {
				ret = RETERR(-ENOMEM);
				goto free_not_assigned;
			}

			ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());

			if (ret != 0) {
				jfree(cur);
				goto free_not_assigned;
			}

			pin_jnode_data(cur);

			list_add_tail(&cur->capture_link, &ch->tx_list);

			first++;
		}
	}

	{ /* format an on-disk linked list of wander records */
		int serial = 1;

		txhead = list_entry(ch->tx_list.next, jnode, capture_link);
		format_tx_head(ch);

		cur = list_entry(txhead->capture_link.next, jnode, capture_link);
		while (&ch->tx_list != &cur->capture_link) {
			format_wander_record(ch, cur, serial++);
			cur = list_entry(cur->capture_link.next, jnode, capture_link);
		}
	}

	{ /* Fill wander records with Wandered Set */
		struct store_wmap_params params;
		txn_atom *atom;

		params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);

		params.idx = 0;
		params.capacity =
		    wander_record_capacity(reiser4_get_current_sb());

		atom = get_current_atom_locked();
		blocknr_set_iterator(atom, &atom->wandered_map,
				     &store_wmap_actor, &params, 0);
		spin_unlock_atom(atom);
	}

	{ /* relse all jnodes from tx_list */
		cur = list_entry(ch->tx_list.next, jnode, capture_link);
		while (&ch->tx_list != &cur->capture_link) {
			jrelse(cur);
			cur = list_entry(cur->capture_link.next, jnode, capture_link);
		}
	}

	ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);

	return ret;

free_not_assigned:
	/* We deallocate blocks not yet assigned to jnodes on tx_list. The
	   caller takes care of invalidating the tx list */
	reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);

	return ret;
}

static int commit_tx(struct commit_handle *ch)
{
	flush_queue_t *fq;
	int barrier;
	int ret;

	/* Grab more space for wandered records. */
	ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
	if (ret)
		return ret;

	fq = get_fq_for_current_atom();
	if (IS_ERR(fq))
		return PTR_ERR(fq);

	spin_unlock_atom(fq->atom);
	do {
		ret = alloc_wandered_blocks(ch, fq);
		if (ret)
			break;
		ret = alloc_tx(ch, fq);
		if (ret)
			break;
	} while (0);

	reiser4_fq_put(fq);
	if (ret)
		return ret;
repeat_wo_barrier:
	barrier = reiser4_use_write_barrier(ch->super);
	if (!barrier) {
		ret = current_atom_finish_all_fq();
		if (ret)
			return ret;
	}
	ret = update_journal_header(ch, barrier);
	if (barrier) {
		if (ret) {
			if (ret == -EOPNOTSUPP) {
				disable_write_barrier(ch->super);
				goto repeat_wo_barrier;
			}
			return ret;
		}
		ret = current_atom_finish_all_fq();
	}
	return ret;
}

static int write_tx_back(struct commit_handle * ch)
{
	flush_queue_t *fq;
	int ret;
	int barrier;

	reiser4_post_commit_hook();
	fq = get_fq_for_current_atom();
	if (IS_ERR(fq))
		return PTR_ERR(fq);
	spin_unlock_atom(fq->atom);
	ret = write_jnode_list(
		ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
	reiser4_fq_put(fq);
	if (ret)
		return ret;
repeat_wo_barrier:
	barrier = reiser4_use_write_barrier(ch->super);
	if (!barrier) {
		ret = current_atom_finish_all_fq();
		if (ret)
			return ret;
	}
	ret = update_journal_footer(ch, barrier);
	if (barrier) {
		if (ret) {
			if (ret == -EOPNOTSUPP) {
				disable_write_barrier(ch->super);
				goto repeat_wo_barrier;
			}
			return ret;
		}
		ret = current_atom_finish_all_fq();
	}
	if (ret)
		return ret;
	reiser4_post_write_back_hook();
	return 0;
}

/* We assume that at this moment all captured blocks are marked as RELOC or
   WANDER (belong to Relocate or Overwrite set), and all nodes from the
   Relocate set are submitted to write.
 */

int reiser4_write_logs(long *nr_submitted)
{
	txn_atom *atom;
	struct super_block *super = reiser4_get_current_sb();
	reiser4_super_info_data *sbinfo = get_super_private(super);
	struct commit_handle ch;
	int ret;

	writeout_mode_enable();

	/* block allocator may add j-nodes to the clean_list */
	ret = reiser4_pre_commit_hook();
	if (ret)
		return ret;

	/* No locks are required if we take atom which stage >=
	 * ASTAGE_PRE_COMMIT */
	atom = get_current_context()->trans->atom;
	assert("zam-965", atom != NULL);

	/* relocate set is on the atom->clean_nodes list after
	 * current_atom_complete_writes() finishes. It can be safely
	 * uncaptured after commit_mutex is locked, because any atom that
	 * captures these nodes is guaranteed to commit after current one.
	 *
	 * This can only be done after reiser4_pre_commit_hook(), because it
	 * is where early flushed jnodes with CREATED bit are transferred to
	 * the overwrite list. */
	reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
	spin_lock_atom(atom);
	/* There might be waiters for the relocate nodes which we have
	 * released, wake them up. */
	reiser4_atom_send_event(atom);
	spin_unlock_atom(atom);

	if (REISER4_DEBUG) {
		int level;

		for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
			assert("nikita-3352",
			       list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
	}

	sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
	sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;

	init_commit_handle(&ch, atom);

	ch.free_blocks = sbinfo->blocks_free_committed;
	ch.nr_files = sbinfo->nr_files_committed;
	/* ZAM-FIXME-HANS: email me what the contention level is for the super
	 * lock. */
	ch.next_oid = oid_next(super);

	/* count overwrite set and place it in a separate list */
	ret = get_overwrite_set(&ch);

	if (ret <= 0) {
		/* It is possible that overwrite set is empty here, it means
		   all captured nodes are clean */
		goto up_and_ret;
	}

	/* Inform the caller about what number of dirty pages will be
	 * submitted to disk. */
	*nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;

	/* count all records needed for storing of the wandered set */
	get_tx_size(&ch);

	ret = commit_tx(&ch);
	if (ret)
		goto up_and_ret;

	spin_lock_atom(atom);
	reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
	spin_unlock_atom(atom);

	ret = write_tx_back(&ch);
	reiser4_post_write_back_hook();

up_and_ret:
	if (ret) {
		/* there could be fq attached to current atom; the only way to
		   remove them is: */
		current_atom_finish_all_fq();
	}

	/* free blocks of flushed transaction */
	dealloc_tx_list(&ch);
	dealloc_wmap(&ch);

	put_overwrite_set(&ch);

	done_commit_handle(&ch);

	writeout_mode_disable();

	return ret;
}

/* consistency checks for journal data/control blocks: header, footer, log
   records, transactions head blocks. All functions return zero on success. */

static int check_journal_header(const jnode * node UNUSED_ARG)
{
	/* FIXME: journal header has no magic field yet. */
	return 0;
}

/* wait for write completion for all jnodes from given list */
static int wait_on_jnode_list(struct list_head *head)
{
	jnode *scan;
	int ret = 0;

	list_for_each_entry(scan, head, capture_link) {
		struct page *pg = jnode_page(scan);

		if (pg) {
			if (PageWriteback(pg))
				wait_on_page_writeback(pg);

			if (PageError(pg))
				ret++;
		}
	}

	return ret;
}

static int check_journal_footer(const jnode * node UNUSED_ARG)
{
	/* FIXME: journal footer has no magic field yet. */
	return 0;
}

static int check_tx_head(const jnode * node)
{
	struct tx_header *header = (struct tx_header *)jdata(node);

	if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
		warning("zam-627", "tx head at block %s corrupted\n",
			sprint_address(jnode_get_block(node)));
		return RETERR(-EIO);
	}

	return 0;
}

static int check_wander_record(const jnode * node)
{
	struct wander_record_header *RH =
	    (struct wander_record_header *)jdata(node);

	if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
	    0) {
		warning("zam-628", "wander record at block %s corrupted\n",
			sprint_address(jnode_get_block(node)));
		return RETERR(-EIO);
	}

	return 0;
}

/* fill the commit_handle structure with everything needed for
   update_journal_footer() */
static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
{
	struct tx_header *TXH;
	int ret;

	ret = jload(tx_head);
	if (ret)
		return ret;

	TXH = (struct tx_header *)jdata(tx_head);

	ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
	ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
	ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));

	jrelse(tx_head);

	list_add(&tx_head->capture_link, &ch->tx_list);

	return 0;
}

/* replay one transaction: restore and write overwrite set in place */
static int replay_transaction(const struct super_block *s,
			      jnode * tx_head,
			      const reiser4_block_nr * log_rec_block_p,
			      const reiser4_block_nr * end_block,
			      unsigned int nr_wander_records)
{
	reiser4_block_nr log_rec_block = *log_rec_block_p;
	struct commit_handle ch;
	LIST_HEAD(overwrite_set);
	jnode *log;
	int ret;

	init_commit_handle(&ch, NULL);
	ch.overwrite_set = &overwrite_set;

	restore_commit_handle(&ch, tx_head);

	while (log_rec_block != *end_block) {
		struct wander_record_header *header;
		struct wander_entry *entry;

		int i;

		if (nr_wander_records == 0) {
			warning("zam-631",
				"number of wander records in the linked list"
				" greater than number stored in tx head.\n");
			ret = RETERR(-EIO);
			goto free_ow_set;
		}

		log = reiser4_alloc_io_head(&log_rec_block);
		if (log == NULL)
			return RETERR(-ENOMEM);

		ret = jload(log);
		if (ret < 0) {
			reiser4_drop_io_head(log);
			return ret;
		}

		ret = check_wander_record(log);
		if (ret) {
			jrelse(log);
			reiser4_drop_io_head(log);
			return ret;
		}

		header = (struct wander_record_header *)jdata(log);
		log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));

		entry = (struct wander_entry *)(header + 1);

		/* restore overwrite set from wander record content */
		for (i = 0; i < wander_record_capacity(s); i++) {
			reiser4_block_nr block;
			jnode *node;

			block = le64_to_cpu(get_unaligned(&entry->wandered));
			if (block == 0)
				break;

			node = reiser4_alloc_io_head(&block);
			if (node == NULL) {
				ret = RETERR(-ENOMEM);
				/*
				 * FIXME-VS:???
				 */
				jrelse(log);
				reiser4_drop_io_head(log);
				goto free_ow_set;
			}

			ret = jload(node);

			if (ret < 0) {
				reiser4_drop_io_head(node);
				/*
				 * FIXME-VS:???
				 */
				jrelse(log);
				reiser4_drop_io_head(log);
				goto free_ow_set;
			}

			block = le64_to_cpu(get_unaligned(&entry->original));

			assert("zam-603", block != 0);

			jnode_set_block(node, &block);

			list_add_tail(&node->capture_link, ch.overwrite_set);

			++entry;
		}

		jrelse(log);
		reiser4_drop_io_head(log);

		--nr_wander_records;
	}

	if (nr_wander_records != 0) {
		warning("zam-632", "number of wander records in the linked list"
			" less than number stored in tx head.\n");
		ret = RETERR(-EIO);
		goto free_ow_set;
	}

	{ /* write wandered set in place */
		write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
		ret = wait_on_jnode_list(ch.overwrite_set);

		if (ret) {
			ret = RETERR(-EIO);
			goto free_ow_set;
		}
	}

	ret = update_journal_footer(&ch, 0);

free_ow_set:

	while (!list_empty(ch.overwrite_set)) {
		jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);

		list_del_init(&cur->capture_link);
		jrelse(cur);
		reiser4_drop_io_head(cur);
	}

	list_del_init(&tx_head->capture_link);

	done_commit_handle(&ch);

	return ret;
}

/* find oldest committed and not played transaction and play it. The
 * transaction was committed and the journal header block was updated, but the
 * process of writing the atom's overwrite set in place and updating the
 * journal footer block was not completed. This function completes the process
 * by recovering the atom's overwrite set from its wandered locations, writing
 * it in place, and updating the journal footer. */
static int replay_oldest_transaction(struct super_block *s)
{
	reiser4_super_info_data *sbinfo = get_super_private(s);
	jnode *jf = sbinfo->journal_footer;
	unsigned int total;
	struct journal_footer *F;
	struct tx_header *T;

	reiser4_block_nr prev_tx;
	reiser4_block_nr last_flushed_tx;
	reiser4_block_nr log_rec_block = 0;

	jnode *tx_head;

	int ret;

	if ((ret = jload(jf)) < 0)
		return ret;

	F = (struct journal_footer *)jdata(jf);

	last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));

	jrelse(jf);

	if (sbinfo->last_committed_tx == last_flushed_tx) {
		/* all transactions are replayed */
		return 0;
	}

	prev_tx = sbinfo->last_committed_tx;

	/* searching for oldest not flushed transaction */
	while (1) {
		tx_head = reiser4_alloc_io_head(&prev_tx);
		if (!tx_head)
			return RETERR(-ENOMEM);

		ret = jload(tx_head);
		if (ret < 0) {
			reiser4_drop_io_head(tx_head);
			return ret;
		}

		ret = check_tx_head(tx_head);
		if (ret) {
			jrelse(tx_head);
			reiser4_drop_io_head(tx_head);
			return ret;
		}

		T = (struct tx_header *)jdata(tx_head);

		prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));

		if (prev_tx == last_flushed_tx)
			break;

		jrelse(tx_head);
		reiser4_drop_io_head(tx_head);
	}

	total = le32_to_cpu(get_unaligned(&T->total));
	log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));

	pin_jnode_data(tx_head);
	jrelse(tx_head);

	ret =
	    replay_transaction(s, tx_head, &log_rec_block,
			       jnode_get_block(tx_head), total - 1);

	unpin_jnode_data(tx_head);
	reiser4_drop_io_head(tx_head);

	if (ret)
		return ret;
	return -E_REPEAT;
}

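/* Example (hypothetical tx head blocks): if the journal header points to tx
   head C and the journal footer to tx head A, and the prev_tx chain is
   C -> B -> A, the loop above stops at B (its prev_tx equals the last flushed
   tx), B is replayed and the journal footer is advanced; the caller repeats
   on -E_REPEAT until C has been replayed as well. */
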
/* The current reiser4 journal implementation was optimized not to capture the
   super block when only certain super block fields are modified. Currently,
   that set is (<free block count>, <OID allocator>). These fields are logged
   in a special way which includes storing them in each transaction head block
   at atom commit time and writing that information to the journal footer
   block at atom flush time. For getting the info from the journal footer
   block into the in-memory super block there is a special function
   reiser4_journal_recover_sb_data() which should be called after the disk
   format plugin re-reads the super block after journal replaying.
*/

/* get the information from the journal footer into the in-memory super
   block */
int reiser4_journal_recover_sb_data(struct super_block *s)
{
	reiser4_super_info_data *sbinfo = get_super_private(s);
	struct journal_footer *jf;
	int ret;

	assert("zam-673", sbinfo->journal_footer != NULL);

	ret = jload(sbinfo->journal_footer);
	if (ret != 0)
		return ret;

	ret = check_journal_footer(sbinfo->journal_footer);
	if (ret != 0)
		goto out;

	jf = (struct journal_footer *)jdata(sbinfo->journal_footer);

	/* was there at least one flushed transaction? */
	if (jf->last_flushed_tx) {

		/* restore free block counter logged in this transaction */
		reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));

		/* restore oid allocator state */
		oid_init_allocator(s,
				   le64_to_cpu(get_unaligned(&jf->nr_files)),
				   le64_to_cpu(get_unaligned(&jf->next_oid)));
	}
out:
	jrelse(sbinfo->journal_footer);
	return ret;
}

/* reiser4 journal replay procedure */
int reiser4_journal_replay(struct super_block *s)
{
	reiser4_super_info_data *sbinfo = get_super_private(s);
	jnode *jh, *jf;
	struct journal_header *header;
	int nr_tx_replayed = 0;
	int ret;

	assert("zam-582", sbinfo != NULL);

	jh = sbinfo->journal_header;
	jf = sbinfo->journal_footer;

	if (!jh || !jf) {
		/* it is possible that disk layout does not support journal
		   structures, we just warn about this */
		warning("zam-583",
			"journal control blocks were not loaded by disk layout plugin. "
			"journal replaying is not possible.\n");
		return 0;
	}

	/* Take free block count from journal footer block. The free block
	   counter value corresponds to the last flushed transaction state */
	ret = jload(jf);
	if (ret < 0)
		return ret;

	ret = check_journal_footer(jf);
	if (ret) {
		jrelse(jf);
		return ret;
	}

	jrelse(jf);

	/* store last committed transaction info in reiser4 in-memory super
	   block */
	ret = jload(jh);
	if (ret < 0)
		return ret;

	ret = check_journal_header(jh);
	if (ret) {
		jrelse(jh);
		return ret;
	}

	header = (struct journal_header *)jdata(jh);
	sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));

	jrelse(jh);

	/* replay committed transactions */
	while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
		nr_tx_replayed++;

	return ret;
}

/* load journal control block (either journal header or journal footer block) */
static int
load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
{
	int ret;

	*node = reiser4_alloc_io_head(block);
	if (!(*node))
		return RETERR(-ENOMEM);

	ret = jload(*node);

	if (ret) {
		reiser4_drop_io_head(*node);
		*node = NULL;
		return ret;
	}

	pin_jnode_data(*node);
	jrelse(*node);

	return 0;
}

/* unload journal header or footer and free jnode */
static void unload_journal_control_block(jnode ** node)
{
	if (*node) {
		unpin_jnode_data(*node);
		reiser4_drop_io_head(*node);
		*node = NULL;
	}
}

/* release journal control blocks */
void reiser4_done_journal_info(struct super_block *s)
{
	reiser4_super_info_data *sbinfo = get_super_private(s);

	assert("zam-476", sbinfo != NULL);

	unload_journal_control_block(&sbinfo->journal_header);
	unload_journal_control_block(&sbinfo->journal_footer);
	rcu_barrier();
}

/* load journal control blocks */
int reiser4_init_journal_info(struct super_block *s)
{
	reiser4_super_info_data *sbinfo = get_super_private(s);
	journal_location *loc;
	int ret;

	loc = &sbinfo->jloc;

	assert("zam-651", loc != NULL);
	assert("zam-652", loc->header != 0);
	assert("zam-653", loc->footer != 0);

	ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);

	if (ret)
		return ret;

	ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);

	if (ret) {
		unload_journal_control_block(&sbinfo->journal_header);
	}

	return ret;
}

/* Make Linus happy.
   Local variables:
   c-indentation-style: "K&R"
   mode-name: "LC"
   c-basic-offset: 8
   tab-width: 8
   fill-column: 80
   End:
*/