/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"

#define BTRFS_ROOT_TRANS_TAG 0
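/*
 * transactions are reference counted; the final put removes the transaction
 * from the global trans_list and frees it
 */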
static noinline void put_transaction(struct btrfs_transaction *transaction)
{
	WARN_ON(transaction->use_count == 0);
	transaction->use_count--;
	if (transaction->use_count == 0) {
		list_del_init(&transaction->list);
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
	}
}

static noinline void switch_commit_root(struct btrfs_root *root)
{
	free_extent_buffer(root->commit_root);
	root->commit_root = btrfs_root_node(root);
}
/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;

	cur_trans = root->fs_info->running_transaction;
	if (!cur_trans) {
		cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
					     GFP_NOFS);
		if (!cur_trans)
			return -ENOMEM;
		root->fs_info->generation++;
		cur_trans->num_writers = 1;
		cur_trans->num_joined = 0;
		cur_trans->transid = root->fs_info->generation;
		init_waitqueue_head(&cur_trans->writer_wait);
		init_waitqueue_head(&cur_trans->commit_wait);
		cur_trans->in_commit = 0;
		cur_trans->blocked = 0;
		cur_trans->use_count = 1;
		cur_trans->commit_done = 0;
		cur_trans->start_time = get_seconds();

		cur_trans->delayed_refs.root = RB_ROOT;
		cur_trans->delayed_refs.num_entries = 0;
		cur_trans->delayed_refs.num_heads_ready = 0;
		cur_trans->delayed_refs.num_heads = 0;
		cur_trans->delayed_refs.flushing = 0;
		cur_trans->delayed_refs.run_delayed_start = 0;
		spin_lock_init(&cur_trans->delayed_refs.lock);

		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
		extent_io_tree_init(&cur_trans->dirty_pages,
				     root->fs_info->btree_inode->i_mapping,
				     GFP_NOFS);
		spin_lock(&root->fs_info->new_trans_lock);
		root->fs_info->running_transaction = cur_trans;
		spin_unlock(&root->fs_info->new_trans_lock);
	} else {
		cur_trans->num_writers++;
		cur_trans->num_joined++;
	}

	return 0;
}
/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
{
	if (root->ref_cows && root->last_trans < trans->transid) {
		WARN_ON(root == root->fs_info->extent_root);
		WARN_ON(root->commit_root != root->node);

		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
		root->last_trans = trans->transid;
		btrfs_init_reloc_root(trans, root);
	}
	return 0;
}

int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (!root->ref_cows)
		return 0;

	mutex_lock(&root->fs_info->trans_mutex);
	if (root->last_trans == trans->transid) {
		mutex_unlock(&root->fs_info->trans_mutex);
		return 0;
	}

	record_root_in_trans(trans, root);
	mutex_unlock(&root->fs_info->trans_mutex);
	return 0;
}
/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;

	cur_trans = root->fs_info->running_transaction;
	if (cur_trans && cur_trans->blocked) {
		DEFINE_WAIT(wait);
		cur_trans->use_count++;
		while (1) {
			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (!cur_trans->blocked)
				break;
			mutex_unlock(&root->fs_info->trans_mutex);
			schedule();
			mutex_lock(&root->fs_info->trans_mutex);
		}
		finish_wait(&root->fs_info->transaction_wait, &wait);
		put_transaction(cur_trans);
	}
}
enum btrfs_trans_type {
	TRANS_START,
	TRANS_JOIN,
	TRANS_USERSPACE,
	TRANS_JOIN_NOLOCK,
};

static int may_wait_transaction(struct btrfs_root *root, int type)
{
	if (!root->fs_info->log_root_recovering &&
	    ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
	     type == TRANS_USERSPACE))
		return 1;
	return 0;
}
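/*
 * common helper behind all the btrfs_*_transaction entry points: allocate
 * a handle, join (or create) the running transaction, optionally reserve
 * metadata space for num_items items, and retry via a full commit when the
 * transaction is blocked or the reservation needs more space
 */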
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
						    u64 num_items, int type)
{
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
	int ret;

	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		return ERR_PTR(-EROFS);
again:
	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h)
		return ERR_PTR(-ENOMEM);

	if (type != TRANS_JOIN_NOLOCK)
		mutex_lock(&root->fs_info->trans_mutex);
	if (may_wait_transaction(root, type))
		wait_current_trans(root);

	ret = join_transaction(root);
	if (ret < 0) {
		kmem_cache_free(btrfs_trans_handle_cachep, h);
		if (type != TRANS_JOIN_NOLOCK)
			mutex_unlock(&root->fs_info->trans_mutex);
		return ERR_PTR(ret);
	}

	cur_trans = root->fs_info->running_transaction;
	cur_trans->use_count++;
	if (type != TRANS_JOIN_NOLOCK)
		mutex_unlock(&root->fs_info->trans_mutex);

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
	h->blocks_used = 0;
	h->block_group = 0;
	h->bytes_reserved = 0;
	h->delayed_ref_updates = 0;
	h->block_rsv = NULL;

	smp_mb();
	if (cur_trans->blocked && may_wait_transaction(root, type)) {
		btrfs_commit_transaction(h, root);
		goto again;
	}

	if (num_items > 0) {
		ret = btrfs_trans_reserve_metadata(h, root, num_items);
		if (ret == -EAGAIN) {
			btrfs_commit_transaction(h, root);
			goto again;
		}
		if (ret < 0) {
			btrfs_end_transaction(h, root);
			return ERR_PTR(ret);
		}
	}

	if (type != TRANS_JOIN_NOLOCK)
		mutex_lock(&root->fs_info->trans_mutex);
	record_root_in_trans(h, root);
	if (type != TRANS_JOIN_NOLOCK)
		mutex_unlock(&root->fs_info->trans_mutex);

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
	return h;
}
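/*
 * the wrappers below pick a transaction type for start_transaction.  A
 * typical caller pairs one of them with btrfs_end_transaction():
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	...modify the tree...
 *	btrfs_end_transaction(trans, root);
 */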
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_items)
{
	return start_transaction(root, num_items, TRANS_START);
}

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
						  int num_blocks)
{
	return start_transaction(root, 0, TRANS_JOIN);
}

struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
							 int num_blocks)
{
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
							  int num_blocks)
{
	return start_transaction(r, 0, TRANS_USERSPACE);
}
/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
				    struct btrfs_transaction *commit)
{
	DEFINE_WAIT(wait);
	mutex_lock(&root->fs_info->trans_mutex);
	while (!commit->commit_done) {
		prepare_to_wait(&commit->commit_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (commit->commit_done)
			break;
		mutex_unlock(&root->fs_info->trans_mutex);
		schedule();
		mutex_lock(&root->fs_info->trans_mutex);
	}
	mutex_unlock(&root->fs_info->trans_mutex);
	finish_wait(&commit->commit_wait, &wait);
	return 0;
}
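/*
 * wait for the commit of a specific transid; transid == 0 means the newest
 * transaction that is committing or already committed
 */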
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
	struct btrfs_transaction *cur_trans = NULL, *t;
	int ret;

	mutex_lock(&root->fs_info->trans_mutex);

	ret = 0;
	if (transid) {
		if (transid <= root->fs_info->last_trans_committed)
			goto out_unlock;

		/* find specified transaction */
		list_for_each_entry(t, &root->fs_info->trans_list, list) {
			if (t->transid == transid) {
				cur_trans = t;
				break;
			}
			if (t->transid > transid)
				break;
		}
		ret = -EINVAL;
		if (!cur_trans)
			goto out_unlock;  /* bad transid */
	} else {
		/* find newest transaction that is committing | committed */
		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
					    list) {
			if (t->in_commit) {
				if (t->commit_done)
					goto out_unlock;
				cur_trans = t;
				break;
			}
		}
		if (!cur_trans)
			goto out_unlock;  /* nothing committing|committed */
	}

	cur_trans->use_count++;
	mutex_unlock(&root->fs_info->trans_mutex);

	wait_for_commit(root, cur_trans);

	mutex_lock(&root->fs_info->trans_mutex);
	put_transaction(cur_trans);
	ret = 0;
out_unlock:
	mutex_unlock(&root->fs_info->trans_mutex);
	return ret;
}
/*
 * rate limit against the drop_snapshot code.  This helps to slow down new
 * operations if the drop_snapshot code isn't able to keep up.
 */
static void throttle_on_drops(struct btrfs_root *root)
{
	struct btrfs_fs_info *info = root->fs_info;
	int harder_count = 0;

harder:
	if (atomic_read(&info->throttles)) {
		DEFINE_WAIT(wait);
		int thr;
		thr = atomic_read(&info->throttle_gen);

		do {
			prepare_to_wait(&info->transaction_throttle,
					&wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&info->throttles)) {
				finish_wait(&info->transaction_throttle, &wait);
				break;
			}
			schedule();
			finish_wait(&info->transaction_throttle, &wait);
		} while (thr == atomic_read(&info->throttle_gen));
		harder_count++;

		if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
		    harder_count < 5)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
		    harder_count < 10)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
		    harder_count < 20)
			goto harder;
	}
}
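/*
 * block until the currently committing transaction is unblocked, unless a
 * userspace (ioctl) transaction is holding the fs open
 */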
void btrfs_throttle(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->trans_mutex);
	if (!root->fs_info->open_ioctl_trans)
		wait_current_trans(root);
	mutex_unlock(&root->fs_info->trans_mutex);
}
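/*
 * returns 1 when the global block reserve looks too low and the transaction
 * should be ended so a commit can replenish space
 */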
static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	int ret;
	ret = btrfs_block_rsv_check(trans, root,
				    &root->fs_info->global_block_rsv, 0, 5);
	return ret ? 1 : 0;
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	int updates;

	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
		return 1;

	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
	if (updates)
		btrfs_run_delayed_refs(trans, root, updates);

	return should_end_transaction(trans, root);
}
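/*
 * common tail for ending a transaction handle: flush a batch of delayed
 * refs, release the metadata reservation, drop our writer count and free
 * the handle.  throttle selects the commit/iput behaviour, lock says
 * whether trans_mutex may be taken
 */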
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, int throttle, int lock)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_fs_info *info = root->fs_info;
	int count = 0;

	while (count < 4) {
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;

			/*
			 * do a full flush if the transaction is trying
			 * to close
			 */
			if (trans->transaction->delayed_refs.flushing)
				cur = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
	}

	btrfs_trans_release_metadata(trans, root);

	if (lock && !root->fs_info->open_ioctl_trans &&
	    should_end_transaction(trans, root))
		trans->transaction->blocked = 1;

	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
		if (throttle)
			return btrfs_commit_transaction(trans, root);
		else
			wake_up_process(info->transaction_kthread);
	}

	if (lock)
		mutex_lock(&info->trans_mutex);
	WARN_ON(cur_trans != info->running_transaction);
	WARN_ON(cur_trans->num_writers < 1);
	cur_trans->num_writers--;

	smp_mb();
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
	if (lock)
		mutex_unlock(&info->trans_mutex);

	if (current->journal_info == trans)
		current->journal_info = NULL;
	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (throttle)
		btrfs_run_delayed_iputs(root);

	return 0;
}
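/*
 * the three flavours below differ only in the throttle and lock arguments
 * passed to __btrfs_end_transaction
 */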
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 1);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 0);
}
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are sent to disk but does not wait on them
 */
int btrfs_write_marked_extents(struct btrfs_root *root,
			       struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    mark);
		if (ret)
			break;
		while (start <= end) {
			cond_resched();

			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;

			btree_lock_page_hook(page);
			if (!page->mapping) {
				unlock_page(page);
				page_cache_release(page);
				continue;
			}

			if (PageWriteback(page)) {
				if (PageDirty(page))
					wait_on_page_writeback(page);
				else {
					unlock_page(page);
					page_cache_release(page);
					continue;
				}
			}
			err = write_one_page(page, 0);
			if (err)
				werr = err;
			page_cache_release(page);
		}
	}
	if (err)
		werr = err;
	return werr;
}
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
			      struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    mark);
		if (ret)
			break;

		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
		while (start <= end) {
			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;
			if (PageDirty(page)) {
				btree_lock_page_hook(page);
				wait_on_page_writeback(page);
				err = write_one_page(page, 0);
				if (err)
					werr = err;
			}
			wait_on_page_writeback(page);
			page_cache_release(page);
			cond_resched();
		}
	}
	if (err)
		werr = err;
	return werr;
}
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
				struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int ret2;

	ret = btrfs_write_marked_extents(root, dirty_pages, mark);
	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
	return ret || ret2;
}

int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
}
/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	u64 old_root_used;
	struct btrfs_root *tree_root = root->fs_info->tree_root;

	old_root_used = btrfs_root_used(&root->root_item);
	btrfs_write_dirty_block_groups(trans, root);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
			break;

		btrfs_set_root_node(&root->root_item, root->node);
		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		BUG_ON(ret);

		old_root_used = btrfs_root_used(&root->root_item);
		ret = btrfs_write_dirty_block_groups(trans, root);
		BUG_ON(ret);
	}

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

	return 0;
}
/*
 * update all the cowonly tree roots on disk
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
	struct extent_buffer *eb;
	int ret;

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	eb = btrfs_lock_root_node(fs_info->tree_root);
	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);

		update_cowonly_root(trans, root);
	}

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

	return 0;
}
/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
int btrfs_add_dead_root(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->trans_mutex);
	list_add(&root->root_list, &root->fs_info->dead_roots);
	mutex_unlock(&root->fs_info->trans_mutex);
	return 0;
}
/*
 * update all the dirty fs-tree roots on disk
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
{
	struct btrfs_root *gang[8];
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	int ret;
	int err = 0;

	while (1) {
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);

			btrfs_free_log(trans, root);
			btrfs_update_reloc_root(trans, root);
			btrfs_orphan_commit_root(trans, root);

			if (root->commit_root != root->node) {
				switch_commit_root(root);
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}

			err = btrfs_update_root(trans, fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			if (err)
				break;
		}
	}
	return err;
}
/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret;
	unsigned long nr;

	if (xchg(&root->defrag_running, 1))
		return 0;

	while (1) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_defrag_leaves(trans, root, cacheonly);

		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(info->tree_root, nr);
		cond_resched();

		if (root->fs_info->closing || ret != -EAGAIN)
			break;
	}
	root->defrag_running = 0;
	return ret;
}
/*
 * when dropping snapshots, we generate a ton of delayed refs, and it makes
 * sense not to join the transaction while it is trying to flush the current
 * queue of delayed refs out.
 *
 * This is used by the drop snapshot code only
 */
static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
{
	DEFINE_WAIT(wait);

	mutex_lock(&info->trans_mutex);
	while (info->running_transaction &&
	       info->running_transaction->delayed_refs.flushing) {
		prepare_to_wait(&info->transaction_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		mutex_unlock(&info->trans_mutex);

		schedule();

		mutex_lock(&info->trans_mutex);
		finish_wait(&info->transaction_wait, &wait);
	}
	mutex_unlock(&info->trans_mutex);
	return 0;
}
/*
 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 * all of them
 */
int btrfs_drop_dead_root(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = root->fs_info->tree_root;
	unsigned long nr;
	int ret;

	while (1) {
		/*
		 * we don't want to jump in and create a bunch of
		 * delayed refs if the transaction is starting to close
		 */
		wait_transaction_pre_flush(tree_root->fs_info);
		trans = btrfs_start_transaction(tree_root, 1);

		/*
		 * we've joined a transaction, make sure it isn't
		 * closing right now
		 */
		if (trans->transaction->delayed_refs.flushing) {
			btrfs_end_transaction(trans, tree_root);
			continue;
		}

		ret = btrfs_drop_snapshot(trans, root);
		if (ret != -EAGAIN)
			break;

		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		if (ret)
			break;

		nr = trans->blocks_used;
		ret = btrfs_end_transaction(trans, tree_root);
		BUG_ON(ret);

		btrfs_btree_balance_dirty(tree_root, nr);
		cond_resched();
	}
	BUG_ON(ret);

	ret = btrfs_del_root(trans, tree_root, &root->root_key);
	BUG_ON(ret);

	nr = trans->blocks_used;
	ret = btrfs_end_transaction(trans, tree_root);
	BUG_ON(ret);

	free_extent_buffer(root->node);
	free_extent_buffer(root->commit_root);
	kfree(root);

	btrfs_btree_balance_dirty(tree_root, nr);
	return ret;
}
/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct btrfs_root *parent_root;
	struct inode *parent_inode;
	struct dentry *parent;
	struct dentry *dentry;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	int ret;
	u64 to_reserve = 0;
	u64 index = 0;
	u64 objectid;
	u64 root_flags;

	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
		pending->error = -ENOMEM;
		goto fail;
	}

	ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
	if (ret) {
		pending->error = ret;
		goto fail;
	}

	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
	btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);

	if (to_reserve > 0) {
		ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
					  to_reserve);
		if (ret) {
			pending->error = ret;
			goto fail;
		}
	}

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	trans->block_rsv = &pending->block_rsv;

	dentry = pending->dentry;
	parent = dget_parent(dentry);
	parent_inode = parent->d_inode;
	parent_root = BTRFS_I(parent_inode)->root;
	record_root_in_trans(trans, parent_root);

	/*
	 * insert the directory item
	 */
	ret = btrfs_set_inode_index(parent_inode, &index);
	BUG_ON(ret);
	ret = btrfs_insert_dir_item(trans, parent_root,
				dentry->d_name.name, dentry->d_name.len,
				parent_inode->i_ino, &key,
				BTRFS_FT_DIR, index);
	BUG_ON(ret);

	btrfs_i_size_write(parent_inode, parent_inode->i_size +
					 dentry->d_name.len * 2);
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	BUG_ON(ret);

	record_root_in_trans(trans, root);
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
	btrfs_check_and_init_root_item(new_root_item);

	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

	old = btrfs_lock_root_node(root);
	btrfs_cow_block(trans, root, old, NULL, 0, &old);
	btrfs_set_lock_blocking(old);

	btrfs_copy_root(trans, root, old, &tmp, objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

	btrfs_set_root_node(new_root_item, tmp);
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	BUG_ON(ret);

	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, tree_root, objectid,
				 parent_root->root_key.objectid,
				 parent_inode->i_ino, index,
				 dentry->d_name.name, dentry->d_name.len);
	BUG_ON(ret);
	dput(parent);

	key.offset = (u64)-1;
	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
	BUG_ON(IS_ERR(pending->snap));

	btrfs_reloc_post_snapshot(trans, pending);
	btrfs_orphan_post_snapshot(trans, pending);
fail:
	kfree(new_root_item);
	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
	return 0;
}
/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	list_for_each_entry(pending, head, list) {
		ret = create_pending_snapshot(trans, fs_info, pending);
		BUG_ON(ret);
	}
	return 0;
}
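/*
 * copy the freshly committed tree root and chunk root pointers into the
 * in-memory super block so the next super write lands on the new roots
 */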
static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

	super = &root->fs_info->super_copy;

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
	if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
		super->cache_generation = root_item->generation;
}
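/*
 * read the running transaction's in_commit/blocked flags under
 * new_trans_lock, for callers that only need a point-in-time answer
 */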
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
	spin_lock(&info->new_trans_lock);
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
	spin_unlock(&info->new_trans_lock);
	return ret;
}

int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	int ret = 0;
	spin_lock(&info->new_trans_lock);
	if (info->running_transaction)
		ret = info->running_transaction->blocked;
	spin_unlock(&info->new_trans_lock);
	return ret;
}
/*
 * wait for the current transaction commit to start and block subsequent
 * transaction joins
 */
static void wait_current_trans_commit_start(struct btrfs_root *root,
					    struct btrfs_transaction *trans)
{
	DEFINE_WAIT(wait);

	if (trans->in_commit)
		return;

	while (1) {
		prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (trans->in_commit) {
			finish_wait(&root->fs_info->transaction_blocked_wait,
				    &wait);
			break;
		}
		mutex_unlock(&root->fs_info->trans_mutex);
		schedule();
		mutex_lock(&root->fs_info->trans_mutex);
		finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
	}
}
/*
 * wait for the current transaction to start and then become unblocked.
 * caller holds ref.
 */
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
					 struct btrfs_transaction *trans)
{
	DEFINE_WAIT(wait);

	if (trans->commit_done || (trans->in_commit && !trans->blocked))
		return;

	while (1) {
		prepare_to_wait(&root->fs_info->transaction_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (trans->commit_done ||
		    (trans->in_commit && !trans->blocked)) {
			finish_wait(&root->fs_info->transaction_wait,
				    &wait);
			break;
		}
		mutex_unlock(&root->fs_info->trans_mutex);
		schedule();
		mutex_lock(&root->fs_info->trans_mutex);
		finish_wait(&root->fs_info->transaction_wait,
			    &wait);
	}
}
/*
 * commit transactions asynchronously. once btrfs_commit_transaction_async
 * returns, any subsequent transaction will not be allowed to join.
 */
struct btrfs_async_commit {
	struct btrfs_trans_handle *newtrans;
	struct btrfs_root *root;
	struct delayed_work work;
};

static void do_async_commit(struct work_struct *work)
{
	struct btrfs_async_commit *ac =
		container_of(work, struct btrfs_async_commit, work.work);

	btrfs_commit_transaction(ac->newtrans, ac->root);
	kfree(ac);
}
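/*
 * kick off a commit in a worker and wait only until it has started (and,
 * if wait_for_unblock is set, until new joins are allowed again)
 */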
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   int wait_for_unblock)
{
	struct btrfs_async_commit *ac;
	struct btrfs_transaction *cur_trans;

	ac = kmalloc(sizeof(*ac), GFP_NOFS);
	if (!ac)
		return -ENOMEM;

	INIT_DELAYED_WORK(&ac->work, do_async_commit);
	ac->root = root;
	ac->newtrans = btrfs_join_transaction(root, 0);
	if (IS_ERR(ac->newtrans)) {
		int err = PTR_ERR(ac->newtrans);
		kfree(ac);
		return err;
	}

	/* take transaction reference */
	mutex_lock(&root->fs_info->trans_mutex);
	cur_trans = trans->transaction;
	cur_trans->use_count++;
	mutex_unlock(&root->fs_info->trans_mutex);

	btrfs_end_transaction(trans, root);
	schedule_delayed_work(&ac->work, 0);

	/* wait for transaction to start and unblock */
	mutex_lock(&root->fs_info->trans_mutex);
	if (wait_for_unblock)
		wait_current_trans_commit_start_and_unblock(root, cur_trans);
	else
		wait_current_trans_commit_start(root, cur_trans);
	put_transaction(cur_trans);
	mutex_unlock(&root->fs_info->trans_mutex);

	return 0;
}
/*
 * btrfs_transaction state sequence:
 *
 *	in_commit = 0, blocked = 0  (initial)
 *	in_commit = 1, blocked = 1
 *	blocked = 0
 *	commit_done = 1
 */
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
	unsigned long joined = 0;
	struct btrfs_transaction *cur_trans;
	struct btrfs_transaction *prev_trans = NULL;
	DEFINE_WAIT(wait);
	int ret;
	int should_grow = 0;
	unsigned long now = get_seconds();
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);

	btrfs_run_ordered_operations(root, 0);

	/* make a pass through all the delayed refs we have so far
	 * any runnings procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	btrfs_trans_release_metadata(trans, root);

	cur_trans = trans->transaction;
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
	cur_trans->delayed_refs.flushing = 1;

	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	mutex_lock(&root->fs_info->trans_mutex);
	if (cur_trans->in_commit) {
		cur_trans->use_count++;
		mutex_unlock(&root->fs_info->trans_mutex);
		btrfs_end_transaction(trans, root);

		ret = wait_for_commit(root, cur_trans);
		BUG_ON(ret);

		mutex_lock(&root->fs_info->trans_mutex);
		put_transaction(cur_trans);
		mutex_unlock(&root->fs_info->trans_mutex);

		return 0;
	}

	trans->transaction->in_commit = 1;
	trans->transaction->blocked = 1;
	wake_up(&root->fs_info->transaction_blocked_wait);

	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
			prev_trans->use_count++;
			mutex_unlock(&root->fs_info->trans_mutex);

			wait_for_commit(root, prev_trans);

			mutex_lock(&root->fs_info->trans_mutex);
			put_transaction(prev_trans);
		}
	}

	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

	do {
		int snap_pending = 0;
		joined = cur_trans->num_joined;
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

		WARN_ON(cur_trans != trans->transaction);
		mutex_unlock(&root->fs_info->trans_mutex);

		if (flush_on_commit || snap_pending) {
			btrfs_start_delalloc_inodes(root, 1);
			ret = btrfs_wait_ordered_extents(root, 0, 1);
			BUG_ON(ret);
		}

		/*
		 * rename doesn't use btrfs_join_transaction, so, once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and know for sure that nothing new will be added
		 * to the list
		 */
		btrfs_run_ordered_operations(root, 1);

		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		smp_mb();
		if (cur_trans->num_writers > 1)
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);

		mutex_lock(&root->fs_info->trans_mutex);
		finish_wait(&cur_trans->writer_wait, &wait);
	} while (cur_trans->num_writers > 1 ||
		 (should_grow && cur_trans->num_joined != joined));

	ret = create_pending_snapshots(trans, root->fs_info);
	BUG_ON(ret);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	WARN_ON(cur_trans != trans->transaction);

	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

	ret = commit_fs_roots(trans, root);
	BUG_ON(ret);

	/* commit_fs_roots gets rid of all the tree log roots, it is now
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

	ret = commit_cowonly_roots(trans, root);
	BUG_ON(ret);

	btrfs_prepare_extent_commit(trans, root);

	cur_trans = root->fs_info->running_transaction;
	spin_lock(&root->fs_info->new_trans_lock);
	root->fs_info->running_transaction = NULL;
	spin_unlock(&root->fs_info->new_trans_lock);

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
	switch_commit_root(root->fs_info->tree_root);

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
	switch_commit_root(root->fs_info->chunk_root);

	update_super_roots(root);

	if (!root->fs_info->log_root_recovering) {
		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
	}

	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
	       sizeof(root->fs_info->super_copy));

	trans->transaction->blocked = 0;

	wake_up(&root->fs_info->transaction_wait);

	mutex_unlock(&root->fs_info->trans_mutex);
	ret = btrfs_write_and_wait_transaction(trans, root);
	BUG_ON(ret);
	write_ctree_super(trans, root, 0);

	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

	btrfs_finish_extent_commit(trans, root);

	mutex_lock(&root->fs_info->trans_mutex);

	cur_trans->commit_done = 1;

	root->fs_info->last_trans_committed = cur_trans->transid;

	wake_up(&cur_trans->commit_wait);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	trace_btrfs_transaction_commit(root);

	mutex_unlock(&root->fs_info->trans_mutex);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (current != root->fs_info->transaction_kthread)
		btrfs_run_delayed_iputs(root);

	return ret;
}
/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->trans_mutex);
	list_splice_init(&fs_info->dead_roots, &list);
	mutex_unlock(&fs_info->trans_mutex);

	while (!list_empty(&list)) {
		root = list_entry(list.next, struct btrfs_root, root_list);
		list_del(&root->root_list);

		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
			btrfs_drop_snapshot(root, NULL, 0);
		else
			btrfs_drop_snapshot(root, NULL, 1);
	}
	return 0;
}