Linux 2.6.21
[linux/fpc-iii.git] / fs / jbd / commit.c
blobbe4648bc7a2f8febe584a999040599bac87edfb6
1 /*
2 * linux/fs/jbd/commit.c
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/smp_lock.h>
/*
 * Default I/O completion handler for the temporary BJ_IO buffer_heads.
 *
 * Records the outcome of the I/O in the buffer's uptodate flag and then
 * unlocks the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");

	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);

	unlock_buffer(bh);
}
39 * When an ext3-ordered file is truncated, it is possible that many pages are
40 * not sucessfully freed, because they are attached to a committing transaction.
41 * After the transaction commits, these pages are left on the LRU, with no
42 * ->mapping, and with attached buffers. These pages are trivially reclaimable
43 * by the VM, but their apparent absence upsets the VM accounting, and it makes
44 * the numbers in /proc/meminfo look odd.
46 * So here, we have a buffer which has just come off the forget list. Look to
47 * see if we can strip all buffers from the backing page.
49 * Called under lock_journal(), and possibly under journal_datalist_lock. The
50 * caller provided us with a ref against the buffer, and we drop that here.
52 static void release_buffer_page(struct buffer_head *bh)
54 struct page *page;
56 if (buffer_dirty(bh))
57 goto nope;
58 if (atomic_read(&bh->b_count) != 1)
59 goto nope;
60 page = bh->b_page;
61 if (!page)
62 goto nope;
63 if (page->mapping)
64 goto nope;
66 /* OK, it's a truncated page */
67 if (TestSetPageLocked(page))
68 goto nope;
70 page_cache_get(page);
71 __brelse(bh);
72 try_to_free_buffers(page);
73 unlock_page(page);
74 page_cache_release(page);
75 return;
77 nope:
78 __brelse(bh);
82 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
83 * held. For ranking reasons we must trylock. If we lose, schedule away and
84 * return 0. j_list_lock is dropped in this case.
86 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
88 if (!jbd_trylock_bh_state(bh)) {
89 spin_unlock(&journal->j_list_lock);
90 schedule();
91 return 0;
93 return 1;
96 /* Done it all: now write the commit record. We should have
97 * cleaned up our previous buffers by now, so if we are in abort
98 * mode we can now just skip the rest of the journal write
99 * entirely.
101 * Returns 1 if the journal needs to be aborted or 0 on success
103 static int journal_write_commit_record(journal_t *journal,
104 transaction_t *commit_transaction)
106 struct journal_head *descriptor;
107 struct buffer_head *bh;
108 int i, ret;
109 int barrier_done = 0;
111 if (is_journal_aborted(journal))
112 return 0;
114 descriptor = journal_get_descriptor_buffer(journal);
115 if (!descriptor)
116 return 1;
118 bh = jh2bh(descriptor);
120 /* AKPM: buglet - add `i' to tmp! */
121 for (i = 0; i < bh->b_size; i += 512) {
122 journal_header_t *tmp = (journal_header_t*)bh->b_data;
123 tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
124 tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
125 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
128 JBUFFER_TRACE(descriptor, "write commit block");
129 set_buffer_dirty(bh);
130 if (journal->j_flags & JFS_BARRIER) {
131 set_buffer_ordered(bh);
132 barrier_done = 1;
134 ret = sync_dirty_buffer(bh);
135 /* is it possible for another commit to fail at roughly
136 * the same time as this one? If so, we don't want to
137 * trust the barrier flag in the super, but instead want
138 * to remember if we sent a barrier request
140 if (ret == -EOPNOTSUPP && barrier_done) {
141 char b[BDEVNAME_SIZE];
143 printk(KERN_WARNING
144 "JBD: barrier-based sync failed on %s - "
145 "disabling barriers\n",
146 bdevname(journal->j_dev, b));
147 spin_lock(&journal->j_state_lock);
148 journal->j_flags &= ~JFS_BARRIER;
149 spin_unlock(&journal->j_state_lock);
151 /* And try again, without the barrier */
152 clear_buffer_ordered(bh);
153 set_buffer_uptodate(bh);
154 set_buffer_dirty(bh);
155 ret = sync_dirty_buffer(bh);
157 put_bh(bh); /* One for getblk() */
158 journal_put_journal_head(descriptor);
160 return (ret == -EIO);
163 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
165 int i;
167 for (i = 0; i < bufs; i++) {
168 wbuf[i]->b_end_io = end_buffer_write_sync;
169 /* We use-up our safety reference in submit_bh() */
170 submit_bh(WRITE, wbuf[i]);
175 * Submit all the data buffers to disk
177 static void journal_submit_data_buffers(journal_t *journal,
178 transaction_t *commit_transaction)
180 struct journal_head *jh;
181 struct buffer_head *bh;
182 int locked;
183 int bufs = 0;
184 struct buffer_head **wbuf = journal->j_wbuf;
187 * Whenever we unlock the journal and sleep, things can get added
188 * onto ->t_sync_datalist, so we have to keep looping back to
189 * write_out_data until we *know* that the list is empty.
191 * Cleanup any flushed data buffers from the data list. Even in
192 * abort mode, we want to flush this out as soon as possible.
194 write_out_data:
195 cond_resched();
196 spin_lock(&journal->j_list_lock);
198 while (commit_transaction->t_sync_datalist) {
199 jh = commit_transaction->t_sync_datalist;
200 bh = jh2bh(jh);
201 locked = 0;
203 /* Get reference just to make sure buffer does not disappear
204 * when we are forced to drop various locks */
205 get_bh(bh);
206 /* If the buffer is dirty, we need to submit IO and hence
207 * we need the buffer lock. We try to lock the buffer without
208 * blocking. If we fail, we need to drop j_list_lock and do
209 * blocking lock_buffer().
211 if (buffer_dirty(bh)) {
212 if (test_set_buffer_locked(bh)) {
213 BUFFER_TRACE(bh, "needs blocking lock");
214 spin_unlock(&journal->j_list_lock);
215 /* Write out all data to prevent deadlocks */
216 journal_do_submit_data(wbuf, bufs);
217 bufs = 0;
218 lock_buffer(bh);
219 spin_lock(&journal->j_list_lock);
221 locked = 1;
223 /* We have to get bh_state lock. Again out of order, sigh. */
224 if (!inverted_lock(journal, bh)) {
225 jbd_lock_bh_state(bh);
226 spin_lock(&journal->j_list_lock);
228 /* Someone already cleaned up the buffer? */
229 if (!buffer_jbd(bh)
230 || jh->b_transaction != commit_transaction
231 || jh->b_jlist != BJ_SyncData) {
232 jbd_unlock_bh_state(bh);
233 if (locked)
234 unlock_buffer(bh);
235 BUFFER_TRACE(bh, "already cleaned up");
236 put_bh(bh);
237 continue;
239 if (locked && test_clear_buffer_dirty(bh)) {
240 BUFFER_TRACE(bh, "needs writeout, adding to array");
241 wbuf[bufs++] = bh;
242 __journal_file_buffer(jh, commit_transaction,
243 BJ_Locked);
244 jbd_unlock_bh_state(bh);
245 if (bufs == journal->j_wbufsize) {
246 spin_unlock(&journal->j_list_lock);
247 journal_do_submit_data(wbuf, bufs);
248 bufs = 0;
249 goto write_out_data;
251 } else if (!locked && buffer_locked(bh)) {
252 __journal_file_buffer(jh, commit_transaction,
253 BJ_Locked);
254 jbd_unlock_bh_state(bh);
255 put_bh(bh);
256 } else {
257 BUFFER_TRACE(bh, "writeout complete: unfile");
258 __journal_unfile_buffer(jh);
259 jbd_unlock_bh_state(bh);
260 if (locked)
261 unlock_buffer(bh);
262 journal_remove_journal_head(bh);
263 /* Once for our safety reference, once for
264 * journal_remove_journal_head() */
265 put_bh(bh);
266 put_bh(bh);
269 if (lock_need_resched(&journal->j_list_lock)) {
270 spin_unlock(&journal->j_list_lock);
271 goto write_out_data;
274 spin_unlock(&journal->j_list_lock);
275 journal_do_submit_data(wbuf, bufs);
279 * journal_commit_transaction
281 * The primary function for committing a transaction to the log. This
282 * function is called by the journal thread to begin a complete commit.
284 void journal_commit_transaction(journal_t *journal)
286 transaction_t *commit_transaction;
287 struct journal_head *jh, *new_jh, *descriptor;
288 struct buffer_head **wbuf = journal->j_wbuf;
289 int bufs;
290 int flags;
291 int err;
292 unsigned long blocknr;
293 char *tagp = NULL;
294 journal_header_t *header;
295 journal_block_tag_t *tag = NULL;
296 int space_left = 0;
297 int first_tag = 0;
298 int tag_flag;
299 int i;
302 * First job: lock down the current transaction and wait for
303 * all outstanding updates to complete.
306 #ifdef COMMIT_STATS
307 spin_lock(&journal->j_list_lock);
308 summarise_journal_usage(journal);
309 spin_unlock(&journal->j_list_lock);
310 #endif
312 /* Do we need to erase the effects of a prior journal_flush? */
313 if (journal->j_flags & JFS_FLUSHED) {
314 jbd_debug(3, "super block updated\n");
315 journal_update_superblock(journal, 1);
316 } else {
317 jbd_debug(3, "superblock not updated\n");
320 J_ASSERT(journal->j_running_transaction != NULL);
321 J_ASSERT(journal->j_committing_transaction == NULL);
323 commit_transaction = journal->j_running_transaction;
324 J_ASSERT(commit_transaction->t_state == T_RUNNING);
326 jbd_debug(1, "JBD: starting commit of transaction %d\n",
327 commit_transaction->t_tid);
329 spin_lock(&journal->j_state_lock);
330 commit_transaction->t_state = T_LOCKED;
332 spin_lock(&commit_transaction->t_handle_lock);
333 while (commit_transaction->t_updates) {
334 DEFINE_WAIT(wait);
336 prepare_to_wait(&journal->j_wait_updates, &wait,
337 TASK_UNINTERRUPTIBLE);
338 if (commit_transaction->t_updates) {
339 spin_unlock(&commit_transaction->t_handle_lock);
340 spin_unlock(&journal->j_state_lock);
341 schedule();
342 spin_lock(&journal->j_state_lock);
343 spin_lock(&commit_transaction->t_handle_lock);
345 finish_wait(&journal->j_wait_updates, &wait);
347 spin_unlock(&commit_transaction->t_handle_lock);
349 J_ASSERT (commit_transaction->t_outstanding_credits <=
350 journal->j_max_transaction_buffers);
353 * First thing we are allowed to do is to discard any remaining
354 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
355 * that there are no such buffers: if a large filesystem
356 * operation like a truncate needs to split itself over multiple
357 * transactions, then it may try to do a journal_restart() while
358 * there are still BJ_Reserved buffers outstanding. These must
359 * be released cleanly from the current transaction.
361 * In this case, the filesystem must still reserve write access
362 * again before modifying the buffer in the new transaction, but
363 * we do not require it to remember exactly which old buffers it
364 * has reserved. This is consistent with the existing behaviour
365 * that multiple journal_get_write_access() calls to the same
366 * buffer are perfectly permissible.
368 while (commit_transaction->t_reserved_list) {
369 jh = commit_transaction->t_reserved_list;
370 JBUFFER_TRACE(jh, "reserved, unused: refile");
372 * A journal_get_undo_access()+journal_release_buffer() may
373 * leave undo-committed data.
375 if (jh->b_committed_data) {
376 struct buffer_head *bh = jh2bh(jh);
378 jbd_lock_bh_state(bh);
379 jbd_slab_free(jh->b_committed_data, bh->b_size);
380 jh->b_committed_data = NULL;
381 jbd_unlock_bh_state(bh);
383 journal_refile_buffer(journal, jh);
387 * Now try to drop any written-back buffers from the journal's
388 * checkpoint lists. We do this *before* commit because it potentially
389 * frees some memory
391 spin_lock(&journal->j_list_lock);
392 __journal_clean_checkpoint_list(journal);
393 spin_unlock(&journal->j_list_lock);
395 jbd_debug (3, "JBD: commit phase 1\n");
398 * Switch to a new revoke table.
400 journal_switch_revoke_table(journal);
402 commit_transaction->t_state = T_FLUSH;
403 journal->j_committing_transaction = commit_transaction;
404 journal->j_running_transaction = NULL;
405 commit_transaction->t_log_start = journal->j_head;
406 wake_up(&journal->j_wait_transaction_locked);
407 spin_unlock(&journal->j_state_lock);
409 jbd_debug (3, "JBD: commit phase 2\n");
412 * First, drop modified flag: all accesses to the buffers
413 * will be tracked for a new transaction only -bzzz
415 spin_lock(&journal->j_list_lock);
416 if (commit_transaction->t_buffers) {
417 new_jh = jh = commit_transaction->t_buffers->b_tnext;
418 do {
419 J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
420 new_jh->b_modified == 0);
421 new_jh->b_modified = 0;
422 new_jh = new_jh->b_tnext;
423 } while (new_jh != jh);
425 spin_unlock(&journal->j_list_lock);
428 * Now start flushing things to disk, in the order they appear
429 * on the transaction lists. Data blocks go first.
431 err = 0;
432 journal_submit_data_buffers(journal, commit_transaction);
435 * Wait for all previously submitted IO to complete.
437 spin_lock(&journal->j_list_lock);
438 while (commit_transaction->t_locked_list) {
439 struct buffer_head *bh;
441 jh = commit_transaction->t_locked_list->b_tprev;
442 bh = jh2bh(jh);
443 get_bh(bh);
444 if (buffer_locked(bh)) {
445 spin_unlock(&journal->j_list_lock);
446 wait_on_buffer(bh);
447 if (unlikely(!buffer_uptodate(bh)))
448 err = -EIO;
449 spin_lock(&journal->j_list_lock);
451 if (!inverted_lock(journal, bh)) {
452 put_bh(bh);
453 spin_lock(&journal->j_list_lock);
454 continue;
456 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
457 __journal_unfile_buffer(jh);
458 jbd_unlock_bh_state(bh);
459 journal_remove_journal_head(bh);
460 put_bh(bh);
461 } else {
462 jbd_unlock_bh_state(bh);
464 put_bh(bh);
465 cond_resched_lock(&journal->j_list_lock);
467 spin_unlock(&journal->j_list_lock);
469 if (err)
470 __journal_abort_hard(journal);
472 journal_write_revoke_records(journal, commit_transaction);
474 jbd_debug(3, "JBD: commit phase 2\n");
477 * If we found any dirty or locked buffers, then we should have
478 * looped back up to the write_out_data label. If there weren't
479 * any then journal_clean_data_list should have wiped the list
480 * clean by now, so check that it is in fact empty.
482 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
484 jbd_debug (3, "JBD: commit phase 3\n");
487 * Way to go: we have now written out all of the data for a
488 * transaction! Now comes the tricky part: we need to write out
489 * metadata. Loop over the transaction's entire buffer list:
491 commit_transaction->t_state = T_COMMIT;
493 descriptor = NULL;
494 bufs = 0;
495 while (commit_transaction->t_buffers) {
497 /* Find the next buffer to be journaled... */
499 jh = commit_transaction->t_buffers;
501 /* If we're in abort mode, we just un-journal the buffer and
502 release it for background writing. */
504 if (is_journal_aborted(journal)) {
505 JBUFFER_TRACE(jh, "journal is aborting: refile");
506 journal_refile_buffer(journal, jh);
507 /* If that was the last one, we need to clean up
508 * any descriptor buffers which may have been
509 * already allocated, even if we are now
510 * aborting. */
511 if (!commit_transaction->t_buffers)
512 goto start_journal_io;
513 continue;
516 /* Make sure we have a descriptor block in which to
517 record the metadata buffer. */
519 if (!descriptor) {
520 struct buffer_head *bh;
522 J_ASSERT (bufs == 0);
524 jbd_debug(4, "JBD: get descriptor\n");
526 descriptor = journal_get_descriptor_buffer(journal);
527 if (!descriptor) {
528 __journal_abort_hard(journal);
529 continue;
532 bh = jh2bh(descriptor);
533 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
534 (unsigned long long)bh->b_blocknr, bh->b_data);
535 header = (journal_header_t *)&bh->b_data[0];
536 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
537 header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
538 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
540 tagp = &bh->b_data[sizeof(journal_header_t)];
541 space_left = bh->b_size - sizeof(journal_header_t);
542 first_tag = 1;
543 set_buffer_jwrite(bh);
544 set_buffer_dirty(bh);
545 wbuf[bufs++] = bh;
547 /* Record it so that we can wait for IO
548 completion later */
549 BUFFER_TRACE(bh, "ph3: file as descriptor");
550 journal_file_buffer(descriptor, commit_transaction,
551 BJ_LogCtl);
554 /* Where is the buffer to be written? */
556 err = journal_next_log_block(journal, &blocknr);
557 /* If the block mapping failed, just abandon the buffer
558 and repeat this loop: we'll fall into the
559 refile-on-abort condition above. */
560 if (err) {
561 __journal_abort_hard(journal);
562 continue;
566 * start_this_handle() uses t_outstanding_credits to determine
567 * the free space in the log, but this counter is changed
568 * by journal_next_log_block() also.
570 commit_transaction->t_outstanding_credits--;
572 /* Bump b_count to prevent truncate from stumbling over
573 the shadowed buffer! @@@ This can go if we ever get
574 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
575 atomic_inc(&jh2bh(jh)->b_count);
577 /* Make a temporary IO buffer with which to write it out
578 (this will requeue both the metadata buffer and the
579 temporary IO buffer). new_bh goes on BJ_IO*/
581 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
583 * akpm: journal_write_metadata_buffer() sets
584 * new_bh->b_transaction to commit_transaction.
585 * We need to clean this up before we release new_bh
586 * (which is of type BJ_IO)
588 JBUFFER_TRACE(jh, "ph3: write metadata");
589 flags = journal_write_metadata_buffer(commit_transaction,
590 jh, &new_jh, blocknr);
591 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
592 wbuf[bufs++] = jh2bh(new_jh);
594 /* Record the new block's tag in the current descriptor
595 buffer */
597 tag_flag = 0;
598 if (flags & 1)
599 tag_flag |= JFS_FLAG_ESCAPE;
600 if (!first_tag)
601 tag_flag |= JFS_FLAG_SAME_UUID;
603 tag = (journal_block_tag_t *) tagp;
604 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
605 tag->t_flags = cpu_to_be32(tag_flag);
606 tagp += sizeof(journal_block_tag_t);
607 space_left -= sizeof(journal_block_tag_t);
609 if (first_tag) {
610 memcpy (tagp, journal->j_uuid, 16);
611 tagp += 16;
612 space_left -= 16;
613 first_tag = 0;
616 /* If there's no more to do, or if the descriptor is full,
617 let the IO rip! */
619 if (bufs == journal->j_wbufsize ||
620 commit_transaction->t_buffers == NULL ||
621 space_left < sizeof(journal_block_tag_t) + 16) {
623 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
625 /* Write an end-of-descriptor marker before
626 submitting the IOs. "tag" still points to
627 the last tag we set up. */
629 tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
631 start_journal_io:
632 for (i = 0; i < bufs; i++) {
633 struct buffer_head *bh = wbuf[i];
634 lock_buffer(bh);
635 clear_buffer_dirty(bh);
636 set_buffer_uptodate(bh);
637 bh->b_end_io = journal_end_buffer_io_sync;
638 submit_bh(WRITE, bh);
640 cond_resched();
642 /* Force a new descriptor to be generated next
643 time round the loop. */
644 descriptor = NULL;
645 bufs = 0;
649 /* Lo and behold: we have just managed to send a transaction to
650 the log. Before we can commit it, wait for the IO so far to
651 complete. Control buffers being written are on the
652 transaction's t_log_list queue, and metadata buffers are on
653 the t_iobuf_list queue.
655 Wait for the buffers in reverse order. That way we are
656 less likely to be woken up until all IOs have completed, and
657 so we incur less scheduling load.
660 jbd_debug(3, "JBD: commit phase 4\n");
663 * akpm: these are BJ_IO, and j_list_lock is not needed.
664 * See __journal_try_to_free_buffer.
666 wait_for_iobuf:
667 while (commit_transaction->t_iobuf_list != NULL) {
668 struct buffer_head *bh;
670 jh = commit_transaction->t_iobuf_list->b_tprev;
671 bh = jh2bh(jh);
672 if (buffer_locked(bh)) {
673 wait_on_buffer(bh);
674 goto wait_for_iobuf;
676 if (cond_resched())
677 goto wait_for_iobuf;
679 if (unlikely(!buffer_uptodate(bh)))
680 err = -EIO;
682 clear_buffer_jwrite(bh);
684 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
685 journal_unfile_buffer(journal, jh);
688 * ->t_iobuf_list should contain only dummy buffer_heads
689 * which were created by journal_write_metadata_buffer().
691 BUFFER_TRACE(bh, "dumping temporary bh");
692 journal_put_journal_head(jh);
693 __brelse(bh);
694 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
695 free_buffer_head(bh);
697 /* We also have to unlock and free the corresponding
698 shadowed buffer */
699 jh = commit_transaction->t_shadow_list->b_tprev;
700 bh = jh2bh(jh);
701 clear_bit(BH_JWrite, &bh->b_state);
702 J_ASSERT_BH(bh, buffer_jbddirty(bh));
704 /* The metadata is now released for reuse, but we need
705 to remember it against this transaction so that when
706 we finally commit, we can do any checkpointing
707 required. */
708 JBUFFER_TRACE(jh, "file as BJ_Forget");
709 journal_file_buffer(jh, commit_transaction, BJ_Forget);
710 /* Wake up any transactions which were waiting for this
711 IO to complete */
712 wake_up_bit(&bh->b_state, BH_Unshadow);
713 JBUFFER_TRACE(jh, "brelse shadowed buffer");
714 __brelse(bh);
717 J_ASSERT (commit_transaction->t_shadow_list == NULL);
719 jbd_debug(3, "JBD: commit phase 5\n");
721 /* Here we wait for the revoke record and descriptor record buffers */
722 wait_for_ctlbuf:
723 while (commit_transaction->t_log_list != NULL) {
724 struct buffer_head *bh;
726 jh = commit_transaction->t_log_list->b_tprev;
727 bh = jh2bh(jh);
728 if (buffer_locked(bh)) {
729 wait_on_buffer(bh);
730 goto wait_for_ctlbuf;
732 if (cond_resched())
733 goto wait_for_ctlbuf;
735 if (unlikely(!buffer_uptodate(bh)))
736 err = -EIO;
738 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
739 clear_buffer_jwrite(bh);
740 journal_unfile_buffer(journal, jh);
741 journal_put_journal_head(jh);
742 __brelse(bh); /* One for getblk */
743 /* AKPM: bforget here */
746 jbd_debug(3, "JBD: commit phase 6\n");
748 if (journal_write_commit_record(journal, commit_transaction))
749 err = -EIO;
751 if (err)
752 __journal_abort_hard(journal);
754 /* End of a transaction! Finally, we can do checkpoint
755 processing: any buffers committed as a result of this
756 transaction can be removed from any checkpoint list it was on
757 before. */
759 jbd_debug(3, "JBD: commit phase 7\n");
761 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
762 J_ASSERT(commit_transaction->t_buffers == NULL);
763 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
764 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
765 J_ASSERT(commit_transaction->t_shadow_list == NULL);
766 J_ASSERT(commit_transaction->t_log_list == NULL);
768 restart_loop:
770 * As there are other places (journal_unmap_buffer()) adding buffers
771 * to this list we have to be careful and hold the j_list_lock.
773 spin_lock(&journal->j_list_lock);
774 while (commit_transaction->t_forget) {
775 transaction_t *cp_transaction;
776 struct buffer_head *bh;
778 jh = commit_transaction->t_forget;
779 spin_unlock(&journal->j_list_lock);
780 bh = jh2bh(jh);
781 jbd_lock_bh_state(bh);
782 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
783 jh->b_transaction == journal->j_running_transaction);
786 * If there is undo-protected committed data against
787 * this buffer, then we can remove it now. If it is a
788 * buffer needing such protection, the old frozen_data
789 * field now points to a committed version of the
790 * buffer, so rotate that field to the new committed
791 * data.
793 * Otherwise, we can just throw away the frozen data now.
795 if (jh->b_committed_data) {
796 jbd_slab_free(jh->b_committed_data, bh->b_size);
797 jh->b_committed_data = NULL;
798 if (jh->b_frozen_data) {
799 jh->b_committed_data = jh->b_frozen_data;
800 jh->b_frozen_data = NULL;
802 } else if (jh->b_frozen_data) {
803 jbd_slab_free(jh->b_frozen_data, bh->b_size);
804 jh->b_frozen_data = NULL;
807 spin_lock(&journal->j_list_lock);
808 cp_transaction = jh->b_cp_transaction;
809 if (cp_transaction) {
810 JBUFFER_TRACE(jh, "remove from old cp transaction");
811 __journal_remove_checkpoint(jh);
814 /* Only re-checkpoint the buffer_head if it is marked
815 * dirty. If the buffer was added to the BJ_Forget list
816 * by journal_forget, it may no longer be dirty and
817 * there's no point in keeping a checkpoint record for
818 * it. */
820 /* A buffer which has been freed while still being
821 * journaled by a previous transaction may end up still
822 * being dirty here, but we want to avoid writing back
823 * that buffer in the future now that the last use has
824 * been committed. That's not only a performance gain,
825 * it also stops aliasing problems if the buffer is left
826 * behind for writeback and gets reallocated for another
827 * use in a different page. */
828 if (buffer_freed(bh)) {
829 clear_buffer_freed(bh);
830 clear_buffer_jbddirty(bh);
833 if (buffer_jbddirty(bh)) {
834 JBUFFER_TRACE(jh, "add to new checkpointing trans");
835 __journal_insert_checkpoint(jh, commit_transaction);
836 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
837 __journal_refile_buffer(jh);
838 jbd_unlock_bh_state(bh);
839 } else {
840 J_ASSERT_BH(bh, !buffer_dirty(bh));
841 /* The buffer on BJ_Forget list and not jbddirty means
842 * it has been freed by this transaction and hence it
843 * could not have been reallocated until this
844 * transaction has committed. *BUT* it could be
845 * reallocated once we have written all the data to
846 * disk and before we process the buffer on BJ_Forget
847 * list. */
848 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
849 __journal_refile_buffer(jh);
850 if (!jh->b_transaction) {
851 jbd_unlock_bh_state(bh);
852 /* needs a brelse */
853 journal_remove_journal_head(bh);
854 release_buffer_page(bh);
855 } else
856 jbd_unlock_bh_state(bh);
858 cond_resched_lock(&journal->j_list_lock);
860 spin_unlock(&journal->j_list_lock);
862 * This is a bit sleazy. We borrow j_list_lock to protect
863 * journal->j_committing_transaction in __journal_remove_checkpoint.
864 * Really, __journal_remove_checkpoint should be using j_state_lock but
865 * it's a bit hassle to hold that across __journal_remove_checkpoint
867 spin_lock(&journal->j_state_lock);
868 spin_lock(&journal->j_list_lock);
870 * Now recheck if some buffers did not get attached to the transaction
871 * while the lock was dropped...
873 if (commit_transaction->t_forget) {
874 spin_unlock(&journal->j_list_lock);
875 spin_unlock(&journal->j_state_lock);
876 goto restart_loop;
879 /* Done with this transaction! */
881 jbd_debug(3, "JBD: commit phase 8\n");
883 J_ASSERT(commit_transaction->t_state == T_COMMIT);
885 commit_transaction->t_state = T_FINISHED;
886 J_ASSERT(commit_transaction == journal->j_committing_transaction);
887 journal->j_commit_sequence = commit_transaction->t_tid;
888 journal->j_committing_transaction = NULL;
889 spin_unlock(&journal->j_state_lock);
891 if (commit_transaction->t_checkpoint_list == NULL) {
892 __journal_drop_transaction(journal, commit_transaction);
893 } else {
894 if (journal->j_checkpoint_transactions == NULL) {
895 journal->j_checkpoint_transactions = commit_transaction;
896 commit_transaction->t_cpnext = commit_transaction;
897 commit_transaction->t_cpprev = commit_transaction;
898 } else {
899 commit_transaction->t_cpnext =
900 journal->j_checkpoint_transactions;
901 commit_transaction->t_cpprev =
902 commit_transaction->t_cpnext->t_cpprev;
903 commit_transaction->t_cpnext->t_cpprev =
904 commit_transaction;
905 commit_transaction->t_cpprev->t_cpnext =
906 commit_transaction;
909 spin_unlock(&journal->j_list_lock);
911 jbd_debug(1, "JBD: commit %d complete, head %d\n",
912 journal->j_commit_sequence, journal->j_tail_sequence);
914 wake_up(&journal->j_wait_done_commit);