/* fs/btrfs/ctree.c (Linux 3.12.39) */
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
                   *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
                   *root, struct btrfs_key *ins_key,
                   struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct extent_buffer *dst,
                          struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct extent_buffer *dst_buf,
                              struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
                    int level, int slot);
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
                                struct extent_buffer *eb);
static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
struct btrfs_path *btrfs_alloc_path(void)
{
        struct btrfs_path *path;
        path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
        return path;
}
/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
        int i;
        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
                if (!p->nodes[i] || !p->locks[i])
                        continue;
                btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
                if (p->locks[i] == BTRFS_READ_LOCK)
                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
                else if (p->locks[i] == BTRFS_WRITE_LOCK)
                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
        }
}
/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
                                        struct extent_buffer *held, int held_rw)
{
        int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* lockdep really cares that we take all of these spinlocks
         * in the right order.  If any of the locks in the path are not
         * currently blocking, it is going to complain.  So, make really
         * really sure by forcing the path to blocking before we clear
         * the path blocking.
         */
        if (held) {
                btrfs_set_lock_blocking_rw(held, held_rw);
                if (held_rw == BTRFS_WRITE_LOCK)
                        held_rw = BTRFS_WRITE_LOCK_BLOCKING;
                else if (held_rw == BTRFS_READ_LOCK)
                        held_rw = BTRFS_READ_LOCK_BLOCKING;
        }
        btrfs_set_path_blocking(p);
#endif

        for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
                if (p->nodes[i] && p->locks[i]) {
                        btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
                        if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
                                p->locks[i] = BTRFS_WRITE_LOCK;
                        else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
                                p->locks[i] = BTRFS_READ_LOCK;
                }
        }

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        if (held)
                btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
        if (!p)
                return;
        btrfs_release_path(p);
        kmem_cache_free(btrfs_path_cachep, p);
}
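/*
 * Illustrative sketch, not part of the original source and kept under
 * #if 0 so it is never compiled: the typical lifecycle a caller might
 * follow with the path helpers above. btrfs_search_slot() is assumed
 * context (declared in ctree.h); error handling is trimmed for brevity.
 */
#if 0
static int example_path_lifecycle(struct btrfs_root *root,
                                  struct btrfs_key *key)
{
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();      /* zeroed from btrfs_path_cachep */
        if (!path)
                return -ENOMEM;
        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
        /* ... inspect path->nodes[0] / path->slots[0] here ... */
        btrfs_free_path(path);          /* drops locks and refs, then frees */
        return ret;
}
#endif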
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
        int i;

        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
                p->slots[i] = 0;
                if (!p->nodes[i])
                        continue;
                if (p->locks[i]) {
                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
                        p->locks[i] = 0;
                }
                free_extent_buffer(p->nodes[i]);
                p->nodes[i] = NULL;
        }
}
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                rcu_read_lock();
                eb = rcu_dereference(root->node);

                /*
                 * RCU really hurts here, we could free up the root node because
                 * it was cow'ed but we may not get the new root node yet so do
                 * the inc_not_zero dance and if it doesn't work then
                 * synchronize_rcu and try again.
                 */
                if (atomic_inc_not_zero(&eb->refs)) {
                        rcu_read_unlock();
                        break;
                }
                rcu_read_unlock();
                synchronize_rcu();
        }
        return eb;
}
/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                eb = btrfs_root_node(root);
                btrfs_tree_lock(eb);
                if (eb == root->node)
                        break;
                btrfs_tree_unlock(eb);
                free_extent_buffer(eb);
        }
        return eb;
}
/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                eb = btrfs_root_node(root);
                btrfs_tree_read_lock(eb);
                if (eb == root->node)
                        break;
                btrfs_tree_read_unlock(eb);
                free_extent_buffer(eb);
        }
        return eb;
}
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
        spin_lock(&root->fs_info->trans_lock);
        if (root->track_dirty && list_empty(&root->dirty_list)) {
                list_add(&root->dirty_list,
                         &root->fs_info->dirty_cowonly_roots);
        }
        spin_unlock(&root->fs_info->trans_lock);
}
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      struct extent_buffer *buf,
                      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
        struct extent_buffer *cow;
        int ret = 0;
        int level;
        struct btrfs_disk_key disk_key;

        WARN_ON(root->ref_cows && trans->transid !=
                root->fs_info->running_transaction->transid);
        WARN_ON(root->ref_cows && trans->transid != root->last_trans);

        level = btrfs_header_level(buf);
        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
                                     new_root_objectid, &disk_key, level,
                                     buf->start, 0);
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        copy_extent_buffer(cow, buf, 0, 0, cow->len);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, new_root_objectid);

        write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
                            BTRFS_FSID_SIZE);

        WARN_ON(btrfs_header_generation(buf) > trans->transid);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
        else
                ret = btrfs_inc_ref(trans, root, cow, 0, 1);

        if (ret)
                return ret;

        btrfs_mark_buffer_dirty(cow);
        *cow_ret = cow;
        return 0;
}
enum mod_log_op {
        MOD_LOG_KEY_REPLACE,
        MOD_LOG_KEY_ADD,
        MOD_LOG_KEY_REMOVE,
        MOD_LOG_KEY_REMOVE_WHILE_FREEING,
        MOD_LOG_KEY_REMOVE_WHILE_MOVING,
        MOD_LOG_MOVE_KEYS,
        MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_move {
        int dst_slot;
        int nr_items;
};

struct tree_mod_root {
        u64 logical;
        u8 level;
};

struct tree_mod_elem {
        struct rb_node node;
        u64 index;              /* shifted logical */
        u64 seq;
        enum mod_log_op op;

        /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
        int slot;

        /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
        u64 generation;

        /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
        struct btrfs_disk_key key;
        u64 blockptr;

        /* this is used for op == MOD_LOG_MOVE_KEYS */
        struct tree_mod_move move;

        /* this is used for op == MOD_LOG_ROOT_REPLACE */
        struct tree_mod_root old_root;
};
static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
        read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
        read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
        write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
        write_unlock(&fs_info->tree_mod_log_lock);
}
/*
 * Increment the upper half of tree_mod_seq, set lower half zero.
 *
 * Must be called with fs_info->tree_mod_seq_lock held.
 */
static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
{
        u64 seq = atomic64_read(&fs_info->tree_mod_seq);
        seq &= 0xffffffff00000000ull;
        seq += 1ull << 32;
        atomic64_set(&fs_info->tree_mod_seq, seq);
        return seq;
}
/*
 * Increment the lower half of tree_mod_seq.
 *
 * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
 * are generated should not technically require a spin lock here. (Rationale:
 * a minor increment that races with a major increment, landing between the
 * major's atomic64_read and atomic64_set calls, doesn't duplicate sequence
 * numbers; it just returns a unique sequence number as usual.) We have
 * decided to leave that requirement in here and rethink it once we notice it
 * really imposes a problem on some workload.
 */
static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
{
        return atomic64_inc_return(&fs_info->tree_mod_seq);
}
/*
 * return the last minor in the previous major tree_mod_seq number
 */
u64 btrfs_tree_mod_seq_prev(u64 seq)
{
        return (seq & 0xffffffff00000000ull) - 1ull;
}
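/*
 * A worked example of the major/minor packing above (illustrative only,
 * never compiled): the major sequence lives in the upper 32 bits, the
 * minor in the lower 32 bits.
 */
#if 0
static void example_tree_mod_seq_math(void)
{
        u64 seq = 0x0000000500000003ull;        /* major 5, minor 3 */
        u64 next_major = (seq & 0xffffffff00000000ull) + (1ull << 32);
        u64 prev = btrfs_tree_mod_seq_prev(seq);
        /*
         * next_major == 0x0000000600000000 (major 6, minor 0) and
         * prev == 0x00000004ffffffff, the last minor of major 4.
         */
}
#endif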
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should set elem->seq to zero before
 * calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
                           struct seq_list *elem)
{
        u64 seq;

        tree_mod_log_write_lock(fs_info);
        spin_lock(&fs_info->tree_mod_seq_lock);
        if (!elem->seq) {
                elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
        }
        seq = btrfs_inc_tree_mod_seq_minor(fs_info);
        spin_unlock(&fs_info->tree_mod_seq_lock);
        tree_mod_log_write_unlock(fs_info);

        return seq;
}
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
                            struct seq_list *elem)
{
        struct rb_root *tm_root;
        struct rb_node *node;
        struct rb_node *next;
        struct seq_list *cur_elem;
        struct tree_mod_elem *tm;
        u64 min_seq = (u64)-1;
        u64 seq_putting = elem->seq;

        if (!seq_putting)
                return;

        spin_lock(&fs_info->tree_mod_seq_lock);
        list_del(&elem->list);
        elem->seq = 0;

        list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
                if (cur_elem->seq < min_seq) {
                        if (seq_putting > cur_elem->seq) {
                                /*
                                 * blocker with lower sequence number exists, we
                                 * cannot remove anything from the log
                                 */
                                spin_unlock(&fs_info->tree_mod_seq_lock);
                                return;
                        }
                        min_seq = cur_elem->seq;
                }
        }
        spin_unlock(&fs_info->tree_mod_seq_lock);

        /*
         * anything that's lower than the lowest existing (read: blocked)
         * sequence number can be removed from the tree.
         */
        tree_mod_log_write_lock(fs_info);
        tm_root = &fs_info->tree_mod_log;
        for (node = rb_first(tm_root); node; node = next) {
                next = rb_next(node);
                tm = container_of(node, struct tree_mod_elem, node);
                if (tm->seq > min_seq)
                        continue;
                rb_erase(node, tm_root);
                kfree(tm);
        }
        tree_mod_log_write_unlock(fs_info);
}
/*
 * key order of the log:
 *       index -> sequence
 *
 * the index is the shifted logical of the *new* root node for root replace
 * operations, or the shifted logical of the affected block for all other
 * operations.
 *
 * Note: must be called with write lock (tree_mod_log_write_lock).
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
        struct rb_root *tm_root;
        struct rb_node **new;
        struct rb_node *parent = NULL;
        struct tree_mod_elem *cur;

        BUG_ON(!tm);

        spin_lock(&fs_info->tree_mod_seq_lock);
        tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
        spin_unlock(&fs_info->tree_mod_seq_lock);

        tm_root = &fs_info->tree_mod_log;
        new = &tm_root->rb_node;
        while (*new) {
                cur = container_of(*new, struct tree_mod_elem, node);
                parent = *new;
                if (cur->index < tm->index)
                        new = &((*new)->rb_left);
                else if (cur->index > tm->index)
                        new = &((*new)->rb_right);
                else if (cur->seq < tm->seq)
                        new = &((*new)->rb_left);
                else if (cur->seq > tm->seq)
                        new = &((*new)->rb_right);
                else
                        return -EEXIST;
        }

        rb_link_node(&tm->node, parent, new);
        rb_insert_color(&tm->node, tm_root);
        return 0;
}
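/*
 * Illustrative only, never compiled: the rb-tree above is keyed by the
 * composite (index, seq) pair; duplicates are rejected with -EEXIST. A
 * hypothetical comparison helper equivalent in spirit to the walk above:
 */
#if 0
static int example_tree_mod_cmp(struct tree_mod_elem *a,
                                struct tree_mod_elem *b)
{
        if (a->index != b->index)
                return a->index < b->index ? -1 : 1;
        if (a->seq != b->seq)
                return a->seq < b->seq ? -1 : 1;
        return 0;       /* equal (index, seq) pairs collide: -EEXIST */
}
#endif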
/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * call tree_mod_log_write_unlock() to release.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
                                    struct extent_buffer *eb) {
        smp_mb();
        if (list_empty(&(fs_info)->tree_mod_seq_list))
                return 1;
        if (eb && btrfs_header_level(eb) == 0)
                return 1;

        tree_mod_log_write_lock(fs_info);
        if (list_empty(&(fs_info)->tree_mod_seq_list)) {
                tree_mod_log_write_unlock(fs_info);
                return 1;
        }

        return 0;
}
/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
                                    struct extent_buffer *eb)
{
        smp_mb();
        if (list_empty(&(fs_info)->tree_mod_seq_list))
                return 0;
        if (eb && btrfs_header_level(eb) == 0)
                return 0;

        return 1;
}
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
                    enum mod_log_op op, gfp_t flags)
{
        struct tree_mod_elem *tm;

        tm = kzalloc(sizeof(*tm), flags);
        if (!tm)
                return NULL;

        tm->index = eb->start >> PAGE_CACHE_SHIFT;
        if (op != MOD_LOG_KEY_ADD) {
                btrfs_node_key(eb, &tm->key, slot);
                tm->blockptr = btrfs_node_blockptr(eb, slot);
        }
        tm->op = op;
        tm->slot = slot;
        tm->generation = btrfs_node_ptr_generation(eb, slot);
        RB_CLEAR_NODE(&tm->node);

        return tm;
}
static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
                        struct extent_buffer *eb, int slot,
                        enum mod_log_op op, gfp_t flags)
{
        struct tree_mod_elem *tm;
        int ret;

        if (!tree_mod_need_log(fs_info, eb))
                return 0;

        tm = alloc_tree_mod_elem(eb, slot, op, flags);
        if (!tm)
                return -ENOMEM;

        if (tree_mod_dont_log(fs_info, eb)) {
                kfree(tm);
                return 0;
        }

        ret = __tree_mod_log_insert(fs_info, tm);
        tree_mod_log_write_unlock(fs_info);
        if (ret)
                kfree(tm);

        return ret;
}
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
                         struct extent_buffer *eb, int dst_slot, int src_slot,
                         int nr_items, gfp_t flags)
{
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int ret = 0;
        int i;
        int locked = 0;

        if (!tree_mod_need_log(fs_info, eb))
                return 0;

        tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
        if (!tm_list)
                return -ENOMEM;

        tm = kzalloc(sizeof(*tm), flags);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
        }

        tm->index = eb->start >> PAGE_CACHE_SHIFT;
        tm->slot = src_slot;
        tm->move.dst_slot = dst_slot;
        tm->move.nr_items = nr_items;
        tm->op = MOD_LOG_MOVE_KEYS;

        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
                    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(fs_info, eb))
                goto free_tms;
        locked = 1;

        /*
         * When we overwrite something during the move, we log these removals.
         * This can only happen when we move towards the beginning of the
         * buffer, i.e. dst_slot < src_slot.
         */
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                ret = __tree_mod_log_insert(fs_info, tm_list[i]);
                if (ret)
                        goto free_tms;
        }

        ret = __tree_mod_log_insert(fs_info, tm);
        if (ret)
                goto free_tms;
        tree_mod_log_write_unlock(fs_info);
        kfree(tm_list);

        return 0;
free_tms:
        for (i = 0; i < nr_items; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                        rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
                tree_mod_log_write_unlock(fs_info);
        kfree(tm_list);
        kfree(tm);

        return ret;
}
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
                       struct tree_mod_elem **tm_list,
                       int nritems)
{
        int i, j;
        int ret;

        for (i = nritems - 1; i >= 0; i--) {
                ret = __tree_mod_log_insert(fs_info, tm_list[i]);
                if (ret) {
                        for (j = nritems - 1; j > i; j--)
                                rb_erase(&tm_list[j]->node,
                                         &fs_info->tree_mod_log);
                        return ret;
                }
        }

        return 0;
}
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
                         struct extent_buffer *old_root,
                         struct extent_buffer *new_root, gfp_t flags,
                         int log_removal)
{
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int ret = 0;
        int i;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (log_removal && btrfs_header_level(old_root) > 0) {
                nritems = btrfs_header_nritems(old_root);
                tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
                                  flags);
                if (!tm_list) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
                for (i = 0; i < nritems; i++) {
                        tm_list[i] = alloc_tree_mod_elem(old_root, i,
                            MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
                        if (!tm_list[i]) {
                                ret = -ENOMEM;
                                goto free_tms;
                        }
                }
        }

        tm = kzalloc(sizeof(*tm), flags);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
        }

        tm->index = new_root->start >> PAGE_CACHE_SHIFT;
        tm->old_root.logical = old_root->start;
        tm->old_root.level = btrfs_header_level(old_root);
        tm->generation = btrfs_header_generation(old_root);
        tm->op = MOD_LOG_ROOT_REPLACE;

        if (tree_mod_dont_log(fs_info, NULL))
                goto free_tms;

        if (tm_list)
                ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
        if (!ret)
                ret = __tree_mod_log_insert(fs_info, tm);

        tree_mod_log_write_unlock(fs_info);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return ret;

free_tms:
        if (tm_list) {
                for (i = 0; i < nritems; i++)
                        kfree(tm_list[i]);
                kfree(tm_list);
        }
        kfree(tm);

        return ret;
}
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
                      int smallest)
{
        struct rb_root *tm_root;
        struct rb_node *node;
        struct tree_mod_elem *cur = NULL;
        struct tree_mod_elem *found = NULL;
        u64 index = start >> PAGE_CACHE_SHIFT;

        tree_mod_log_read_lock(fs_info);
        tm_root = &fs_info->tree_mod_log;
        node = tm_root->rb_node;
        while (node) {
                cur = container_of(node, struct tree_mod_elem, node);
                if (cur->index < index) {
                        node = node->rb_left;
                } else if (cur->index > index) {
                        node = node->rb_right;
                } else if (cur->seq < min_seq) {
                        node = node->rb_left;
                } else if (!smallest) {
                        /* we want the node with the highest seq */
                        if (found)
                                BUG_ON(found->seq > cur->seq);
                        found = cur;
                        node = node->rb_left;
                } else if (cur->seq > min_seq) {
                        /* we want the node with the smallest seq */
                        if (found)
                                BUG_ON(found->seq < cur->seq);
                        found = cur;
                        node = node->rb_right;
                } else {
                        found = cur;
                        break;
                }
        }
        tree_mod_log_read_unlock(fs_info);

        return found;
}
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
                           u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
                     struct extent_buffer *src, unsigned long dst_offset,
                     unsigned long src_offset, int nr_items)
{
        int ret = 0;
        struct tree_mod_elem **tm_list = NULL;
        struct tree_mod_elem **tm_list_add, **tm_list_rem;
        int i;
        int locked = 0;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
                return 0;

        tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
                          GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        tm_list_add = tm_list;
        tm_list_rem = tm_list + nr_items;
        for (i = 0; i < nr_items; i++) {
                tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
                    MOD_LOG_KEY_REMOVE, GFP_NOFS);
                if (!tm_list_rem[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }

                tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
                    MOD_LOG_KEY_ADD, GFP_NOFS);
                if (!tm_list_add[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(fs_info, NULL))
                goto free_tms;
        locked = 1;

        for (i = 0; i < nr_items; i++) {
                ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
                if (ret)
                        goto free_tms;
                ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
                if (ret)
                        goto free_tms;
        }

        tree_mod_log_write_unlock(fs_info);
        kfree(tm_list);

        return 0;

free_tms:
        for (i = 0; i < nr_items * 2; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                        rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
                tree_mod_log_write_unlock(fs_info);
        kfree(tm_list);

        return ret;
}
static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
                     int dst_offset, int src_offset, int nr_items)
{
        int ret;
        ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
                                       nr_items, GFP_NOFS);
        BUG_ON(ret < 0);
}
static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
                          struct extent_buffer *eb, int slot, int atomic)
{
        int ret;

        ret = tree_mod_log_insert_key(fs_info, eb, slot,
                                      MOD_LOG_KEY_REPLACE,
                                      atomic ? GFP_ATOMIC : GFP_NOFS);
        BUG_ON(ret < 0);
}
static noinline int
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int i;
        int ret = 0;

        if (btrfs_header_level(eb) == 0)
                return 0;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        nritems = btrfs_header_nritems(eb);
        tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
                          GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        for (i = 0; i < nritems; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i,
                    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(fs_info, eb))
                goto free_tms;

        ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
        tree_mod_log_write_unlock(fs_info);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return 0;

free_tms:
        for (i = 0; i < nritems; i++)
                kfree(tm_list[i]);
        kfree(tm_list);

        return ret;
}
static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
                              struct extent_buffer *new_root_node,
                              int log_removal)
{
        int ret;
        ret = tree_mod_log_insert_root(root->fs_info, root->node,
                                       new_root_node, GFP_NOFS, log_removal);
        BUG_ON(ret < 0);
}
/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
                              struct extent_buffer *buf)
{
        /*
         * Tree blocks not in reference counted trees and tree roots
         * are never shared. If a block was allocated after the last
         * snapshot and the block was not allocated by tree relocation,
         * we know the block is not shared.
         */
        if (root->ref_cows &&
            buf != root->node && buf != root->commit_root &&
            (btrfs_header_generation(buf) <=
             btrfs_root_last_snapshot(&root->root_item) ||
             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
                return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (root->ref_cows &&
            btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                return 1;
#endif
        return 0;
}
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct extent_buffer *buf,
                                       struct extent_buffer *cow,
                                       int *last_ref)
{
        u64 refs;
        u64 owner;
        u64 flags;
        u64 new_flags = 0;
        int ret;

        /*
         * Backrefs update rules:
         *
         * Always use full backrefs for extent pointers in tree block
         * allocated by tree relocation.
         *
         * If a shared tree block is no longer referenced by its owner
         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
         * use full backrefs for extent pointers in tree block.
         *
         * If a tree block is being relocated
         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
         * use full backrefs for extent pointers in tree block.
         * The reason for this is that some operations (such as drop tree)
         * are only allowed for blocks that use full backrefs.
         */

        if (btrfs_block_can_be_shared(root, buf)) {
                ret = btrfs_lookup_extent_info(trans, root, buf->start,
                                               btrfs_header_level(buf), 1,
                                               &refs, &flags);
                if (ret)
                        return ret;
                if (refs == 0) {
                        ret = -EROFS;
                        btrfs_std_error(root->fs_info, ret);
                        return ret;
                }
        } else {
                refs = 1;
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
                else
                        flags = 0;
        }

        owner = btrfs_header_owner(buf);
        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

        if (refs > 1) {
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
                        ret = btrfs_inc_ref(trans, root, buf, 1, 1);
                        BUG_ON(ret); /* -ENOMEM */

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
                                ret = btrfs_dec_ref(trans, root, buf, 0, 1);
                                BUG_ON(ret); /* -ENOMEM */
                                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                                BUG_ON(ret); /* -ENOMEM */
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
                } else {

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0, 1);
                        BUG_ON(ret); /* -ENOMEM */
                }
                if (new_flags != 0) {
                        int level = btrfs_header_level(buf);

                        ret = btrfs_set_disk_extent_flags(trans, root,
                                                          buf->start,
                                                          buf->len,
                                                          new_flags, level, 0);
                        if (ret)
                                return ret;
                }
        } else {
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0, 1);
                        BUG_ON(ret); /* -ENOMEM */
                        ret = btrfs_dec_ref(trans, root, buf, 1, 1);
                        BUG_ON(ret); /* -ENOMEM */
                }
                clean_tree_block(trans, root, buf);
                *last_ref = 1;
        }
        return 0;
}
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct extent_buffer *buf,
                             struct extent_buffer *parent, int parent_slot,
                             struct extent_buffer **cow_ret,
                             u64 search_start, u64 empty_size)
{
        struct btrfs_disk_key disk_key;
        struct extent_buffer *cow;
        int level, ret;
        int last_ref = 0;
        int unlock_orig = 0;
        u64 parent_start;

        if (*cow_ret == buf)
                unlock_orig = 1;

        btrfs_assert_tree_locked(buf);

        WARN_ON(root->ref_cows && trans->transid !=
                root->fs_info->running_transaction->transid);
        WARN_ON(root->ref_cows && trans->transid != root->last_trans);

        level = btrfs_header_level(buf);

        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
                if (parent)
                        parent_start = parent->start;
                else
                        parent_start = 0;
        } else
                parent_start = 0;

        cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
                                     root->root_key.objectid, &disk_key,
                                     level, search_start, empty_size);
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        /* cow is set to blocking by btrfs_init_new_buffer */

        copy_extent_buffer(cow, buf, 0, 0, cow->len);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, root->root_key.objectid);

        write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
                            BTRFS_FSID_SIZE);

        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                return ret;
        }

        if (root->ref_cows) {
                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
                if (ret)
                        return ret;
        }

        if (buf == root->node) {
                WARN_ON(parent && parent != buf);
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        parent_start = buf->start;
                else
                        parent_start = 0;

                extent_buffer_get(cow);
                tree_mod_log_set_root_pointer(root, cow, 1);
                rcu_assign_pointer(root->node, cow);

                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
                        parent_start = parent->start;
                else
                        parent_start = 0;

                WARN_ON(trans->transid != btrfs_header_generation(parent));
                tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                btrfs_set_node_blockptr(parent, parent_slot,
                                        cow->start);
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                if (last_ref) {
                        ret = tree_mod_log_free_eb(root->fs_info, buf);
                        if (ret) {
                                btrfs_abort_transaction(trans, root, ret);
                                return ret;
                        }
                }
                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
        free_extent_buffer_stale(buf);
        btrfs_mark_buffer_dirty(cow);
        *cow_ret = cow;
        return 0;
}
/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
                           struct extent_buffer *eb_root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        struct tree_mod_elem *found = NULL;
        u64 root_logical = eb_root->start;
        int looped = 0;

        if (!time_seq)
                return NULL;

        /*
         * the very last operation that's logged for a root is the replacement
         * operation (if it is replaced at all). this has the index of the *new*
         * root, making it the very first operation that's logged for this root.
         */
        while (1) {
                tm = tree_mod_log_search_oldest(fs_info, root_logical,
                                                time_seq);
                if (!looped && !tm)
                        return NULL;
                /*
                 * if there are no tree operations for the oldest root, we simply
                 * return it. this should only happen if that (old) root is at
                 * level 0.
                 */
                if (!tm)
                        break;

                /*
                 * if there's an operation that's not a root replacement, we
                 * found the oldest version of our root. normally, we'll find a
                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
                 */
                if (tm->op != MOD_LOG_ROOT_REPLACE)
                        break;

                found = tm;
                root_logical = tm->old_root.logical;
                looped = 1;
        }

        /* if there's no old root to return, return what we found instead */
        if (!found)
                found = tm;

        return found;
}
/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
                      u64 time_seq, struct tree_mod_elem *first_tm)
{
        u32 n;
        struct rb_node *next;
        struct tree_mod_elem *tm = first_tm;
        unsigned long o_dst;
        unsigned long o_src;
        unsigned long p_size = sizeof(struct btrfs_key_ptr);

        n = btrfs_header_nritems(eb);
        tree_mod_log_read_lock(fs_info);
        while (tm && tm->seq >= time_seq) {
                /*
                 * all the operations are recorded with the operator used for
                 * the modification. as we're going backwards, we do the
                 * opposite of each operation here.
                 */
                switch (tm->op) {
                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
                        /* Fallthrough */
                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case MOD_LOG_KEY_REMOVE:
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        n++;
                        break;
                case MOD_LOG_KEY_REPLACE:
                        BUG_ON(tm->slot >= n);
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        break;
                case MOD_LOG_KEY_ADD:
                        /* if a move operation is needed it's in the log */
                        n--;
                        break;
                case MOD_LOG_MOVE_KEYS:
                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
                        memmove_extent_buffer(eb, o_dst, o_src,
                                              tm->move.nr_items * p_size);
                        break;
                case MOD_LOG_ROOT_REPLACE:
                        /*
                         * this operation is special. for roots, this must be
                         * handled explicitly before rewinding.
                         * for non-roots, this operation may exist if the node
                         * was a root: root A -> child B; then A gets empty and
                         * B is promoted to the new root. in the mod log, we'll
                         * have a root-replace operation for B, a tree block
                         * that is not a root. we simply ignore that operation.
                         */
                        break;
                }
                next = rb_next(&tm->node);
                if (!next)
                        break;
                tm = container_of(next, struct tree_mod_elem, node);
                if (tm->index != first_tm->index)
                        break;
        }
        tree_mod_log_read_unlock(fs_info);
        btrfs_set_header_nritems(eb, n);
}
/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                    struct extent_buffer *eb, u64 time_seq)
{
        struct extent_buffer *eb_rewin;
        struct tree_mod_elem *tm;

        if (!time_seq)
                return eb;

        if (btrfs_header_level(eb) == 0)
                return eb;

        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
        if (!tm)
                return eb;

        btrfs_set_path_blocking(path);
        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                BUG_ON(tm->slot != 0);
                eb_rewin = alloc_dummy_extent_buffer(eb->start,
                                                fs_info->tree_root->nodesize);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
                btrfs_set_header_bytenr(eb_rewin, eb->start);
                btrfs_set_header_backref_rev(eb_rewin,
                                             btrfs_header_backref_rev(eb));
                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
        } else {
                eb_rewin = btrfs_clone_extent_buffer(eb);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
        }

        btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
        btrfs_tree_read_unlock_blocking(eb);
        free_extent_buffer(eb);

        extent_buffer_get(eb_rewin);
        btrfs_tree_read_lock(eb_rewin);
        __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
        WARN_ON(btrfs_header_nritems(eb_rewin) >
                BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

        return eb_rewin;
}
/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        struct extent_buffer *eb = NULL;
        struct extent_buffer *eb_root;
        struct extent_buffer *old;
        struct tree_mod_root *old_root = NULL;
        u64 old_generation = 0;
        u64 logical;
        u32 blocksize;

        eb_root = btrfs_read_lock_root_node(root);
        tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
        if (!tm)
                return eb_root;

        if (tm->op == MOD_LOG_ROOT_REPLACE) {
                old_root = &tm->old_root;
                old_generation = tm->generation;
                logical = old_root->logical;
        } else {
                logical = eb_root->start;
        }

        tm = tree_mod_log_search(root->fs_info, logical, time_seq);
        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                blocksize = btrfs_level_size(root, old_root->level);
                old = read_tree_block(root, logical, blocksize, 0);
                if (!old || !extent_buffer_uptodate(old)) {
                        free_extent_buffer(old);
                        pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
                                logical);
                        WARN_ON(1);
                } else {
                        eb = btrfs_clone_extent_buffer(old);
                        free_extent_buffer(old);
                }
        } else if (old_root) {
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                eb = alloc_dummy_extent_buffer(logical, root->nodesize);
        } else {
                btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
                eb = btrfs_clone_extent_buffer(eb_root);
                btrfs_tree_read_unlock_blocking(eb_root);
                free_extent_buffer(eb_root);
        }

        if (!eb)
                return NULL;
        extent_buffer_get(eb);
        btrfs_tree_read_lock(eb);
        if (old_root) {
                btrfs_set_header_bytenr(eb, eb->start);
                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
                btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
                btrfs_set_header_level(eb, old_root->level);
                btrfs_set_header_generation(eb, old_generation);
        }
        if (tm)
                __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
        else
                WARN_ON(btrfs_header_level(eb) != 0);
        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));

        return eb;
}
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        int level;
        struct extent_buffer *eb_root = btrfs_root_node(root);

        tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
        if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
                level = tm->old_root.level;
        } else {
                level = btrfs_header_level(eb_root);
        }
        free_extent_buffer(eb_root);

        return level;
}
static inline int should_cow_block(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct extent_buffer *buf)
{
        /* ensure we can see the force_cow */
        smp_rmb();

        /*
         * We do not need to cow a block if
         * 1) this block is not created or changed in this transaction;
         * 2) this block does not belong to TREE_RELOC tree;
         * 3) the root is not forced COW.
         *
         * What is forced COW:
         *    when we create snapshot during committing the transaction,
         *    after we've finished copying src root, we must COW the shared
         *    block to ensure the metadata consistency.
         */
        if (btrfs_header_generation(buf) == trans->transid &&
            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
            !root->force_cow)
                return 0;
        return 1;
}
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, struct extent_buffer *buf,
                    struct extent_buffer *parent, int parent_slot,
                    struct extent_buffer **cow_ret)
{
        u64 search_start;
        int ret;

        if (trans->transaction != root->fs_info->running_transaction)
                WARN(1, KERN_CRIT "trans %llu running %llu\n",
                       trans->transid,
                       root->fs_info->running_transaction->transid);

        if (trans->transid != root->fs_info->generation)
                WARN(1, KERN_CRIT "trans %llu running %llu\n",
                       trans->transid, root->fs_info->generation);

        if (!should_cow_block(trans, root, buf)) {
                *cow_ret = buf;
                return 0;
        }

        search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

        if (parent)
                btrfs_set_lock_blocking(parent);
        btrfs_set_lock_blocking(buf);

        ret = __btrfs_cow_block(trans, root, buf, parent,
                                 parent_slot, cow_ret, search_start, 0);

        trace_btrfs_cow_block(root, buf, *cow_ret);

        return ret;
}
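/*
 * The search_start computation above masks off the low 30 bits, rounding
 * buf->start down to a 1 GiB boundary as an allocation hint. A worked
 * example with illustrative values, never compiled:
 */
#if 0
static void example_search_start_hint(void)
{
        u64 start = 0x163000000ull;     /* ~5.5 GiB into the address space */
        u64 hint = start & ~((u64)(1024 * 1024 * 1024) - 1);
        /* hint == 0x140000000, i.e. rounded down to the 5 GiB boundary */
}
#endif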
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
        if (blocknr < other && other - (blocknr + blocksize) < 32768)
                return 1;
        if (blocknr > other && blocknr - (other + blocksize) < 32768)
                return 1;
        return 0;
}
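/*
 * Worked example for the heuristic above (illustrative values, never
 * compiled): the gap measured from the end of the first block to the
 * start of the second must be under 32 KiB for the blocks to be "close".
 */
#if 0
static void example_close_blocks(void)
{
        /* gap is 0x1008000 - (0x1000000 + 0x4000) = 16 KiB < 32 KiB */
        int close = close_blocks(0x1000000, 0x1008000, 0x4000); /* == 1 */
}
#endif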
/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
        struct btrfs_key k1;

        btrfs_disk_key_to_cpu(&k1, disk);

        return btrfs_comp_cpu_keys(&k1, k2);
}
/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
        if (k1->objectid > k2->objectid)
                return 1;
        if (k1->objectid < k2->objectid)
                return -1;
        if (k1->type > k2->type)
                return 1;
        if (k1->type < k2->type)
                return -1;
        if (k1->offset > k2->offset)
                return 1;
        if (k1->offset < k2->offset)
                return -1;
        return 0;
}
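/*
 * Illustrative only, never compiled: keys order first by objectid, then
 * type, then offset. With equal objectids the type field breaks the tie,
 * so an INODE_ITEM key sorts before an INODE_REF key for the same inode.
 */
#if 0
static void example_key_ordering(void)
{
        struct btrfs_key a = { .objectid = 256,
                               .type = BTRFS_INODE_ITEM_KEY, .offset = 0 };
        struct btrfs_key b = { .objectid = 256,
                               .type = BTRFS_INODE_REF_KEY, .offset = 0 };
        int cmp = btrfs_comp_cpu_keys(&a, &b);  /* -1: a sorts first */
}
#endif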
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct extent_buffer *parent,
                       int start_slot, u64 *last_ret,
                       struct btrfs_key *progress)
{
        struct extent_buffer *cur;
        u64 blocknr;
        u64 gen;
        u64 search_start = *last_ret;
        u64 last_block = 0;
        u64 other;
        u32 parent_nritems;
        int end_slot;
        int i;
        int err = 0;
        int parent_level;
        int uptodate;
        u32 blocksize;
        int progress_passed = 0;
        struct btrfs_disk_key disk_key;

        parent_level = btrfs_header_level(parent);

        WARN_ON(trans->transaction != root->fs_info->running_transaction);
        WARN_ON(trans->transid != root->fs_info->generation);

        parent_nritems = btrfs_header_nritems(parent);
        blocksize = btrfs_level_size(root, parent_level - 1);
        end_slot = parent_nritems;

        if (parent_nritems == 1)
                return 0;

        btrfs_set_lock_blocking(parent);

        for (i = start_slot; i < end_slot; i++) {
                int close = 1;

                btrfs_node_key(parent, &disk_key, i);
                if (!progress_passed && comp_keys(&disk_key, progress) < 0)
                        continue;

                progress_passed = 1;
                blocknr = btrfs_node_blockptr(parent, i);
                gen = btrfs_node_ptr_generation(parent, i);
                if (last_block == 0)
                        last_block = blocknr;

                if (i > 0) {
                        other = btrfs_node_blockptr(parent, i - 1);
                        close = close_blocks(blocknr, other, blocksize);
                }
                if (!close && i < end_slot - 2) {
                        other = btrfs_node_blockptr(parent, i + 1);
                        close = close_blocks(blocknr, other, blocksize);
                }
                if (close) {
                        last_block = blocknr;
                        continue;
                }

                cur = btrfs_find_tree_block(root, blocknr, blocksize);
                if (cur)
                        uptodate = btrfs_buffer_uptodate(cur, gen, 0);
                else
                        uptodate = 0;
                if (!cur || !uptodate) {
                        if (!cur) {
                                cur = read_tree_block(root, blocknr,
                                                         blocksize, gen);
                                if (!cur || !extent_buffer_uptodate(cur)) {
                                        free_extent_buffer(cur);
                                        return -EIO;
                                }
                        } else if (!uptodate) {
                                err = btrfs_read_buffer(cur, gen);
                                if (err) {
                                        free_extent_buffer(cur);
                                        return err;
                                }
                        }
                }
                if (search_start == 0)
                        search_start = last_block;

                btrfs_tree_lock(cur);
                btrfs_set_lock_blocking(cur);
                err = __btrfs_cow_block(trans, root, cur, parent, i,
                                        &cur, search_start,
                                        min(16 * blocksize,
                                            (end_slot - i) * blocksize));
                if (err) {
                        btrfs_tree_unlock(cur);
                        free_extent_buffer(cur);
                        break;
                }
                search_start = cur->start;
                last_block = cur->start;
                *last_ret = search_start;
                btrfs_tree_unlock(cur);
                free_extent_buffer(cur);
        }
        return err;
}
/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
static inline unsigned int leaf_data_end(struct btrfs_root *root,
                                         struct extent_buffer *leaf)
{
        u32 nr = btrfs_header_nritems(leaf);
        if (nr == 0)
                return BTRFS_LEAF_DATA_SIZE(root);
        return btrfs_item_offset_nr(leaf, nr - 1);
}
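/*
 * Illustrative sketch (not in the original source): item headers grow from
 * the front of a leaf while item data grows back from the end, so the data
 * "stack" stops at the offset of the last item's data:
 *
 *   [header][item 0][item 1] ... free space ... [data 1][data 0]
 *
 * Worked example, assuming a 4096-byte data area and two items with data
 * sizes 100 and 50: data 0 starts at offset 3996, data 1 at 3946, and
 * leaf_data_end() returns 3946.
 */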
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
                                       unsigned long p,
                                       int item_size, struct btrfs_key *key,
                                       int max, int *slot)
{
        int low = 0;
        int high = max;
        int mid;
        int ret;
        struct btrfs_disk_key *tmp = NULL;
        struct btrfs_disk_key unaligned;
        unsigned long offset;
        char *kaddr = NULL;
        unsigned long map_start = 0;
        unsigned long map_len = 0;
        int err;

        while (low < high) {
                mid = (low + high) / 2;
                offset = p + mid * item_size;

                if (!kaddr || offset < map_start ||
                    (offset + sizeof(struct btrfs_disk_key)) >
                    map_start + map_len) {

                        err = map_private_extent_buffer(eb, offset,
                                                sizeof(struct btrfs_disk_key),
                                                &kaddr, &map_start, &map_len);

                        if (!err) {
                                tmp = (struct btrfs_disk_key *)(kaddr + offset -
                                                        map_start);
                        } else {
                                read_extent_buffer(eb, &unaligned,
                                                   offset, sizeof(unaligned));
                                tmp = &unaligned;
                        }

                } else {
                        tmp = (struct btrfs_disk_key *)(kaddr + offset -
                                                        map_start);
                }
                ret = comp_keys(tmp, key);

                if (ret < 0)
                        low = mid + 1;
                else if (ret > 0)
                        high = mid;
                else {
                        *slot = mid;
                        return 0;
                }
        }
        *slot = low;
        return 1;
}
/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                      int level, int *slot)
{
        if (level == 0)
                return generic_bin_search(eb,
                                          offsetof(struct btrfs_leaf, items),
                                          sizeof(struct btrfs_item),
                                          key, btrfs_header_nritems(eb),
                                          slot);
        else
                return generic_bin_search(eb,
                                          offsetof(struct btrfs_node, ptrs),
                                          sizeof(struct btrfs_key_ptr),
                                          key, btrfs_header_nritems(eb),
                                          slot);
}

int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot)
{
        return bin_search(eb, key, level, slot);
}
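/*
 * Illustrative only, never compiled: the return convention of the binary
 * search above. With keys whose objectids are {10, 20, 30} in a buffer,
 * searching for 20 returns 0 with *slot == 1 (exact match); searching for
 * 25 returns 1 with *slot == 2 (the insertion point); searching for 40
 * returns 1 with *slot == 3, i.e. one past the last key.
 */
#if 0
static void example_bin_search(struct extent_buffer *eb, int level)
{
        struct btrfs_key key = { .objectid = 25 };      /* hypothetical */
        int slot;
        int ret = bin_search(eb, &key, level, &slot);
        /* ret == 1, slot == 2: where objectid 25 would be inserted */
}
#endif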
static void root_add_used(struct btrfs_root *root, u32 size)
{
        spin_lock(&root->accounting_lock);
        btrfs_set_root_used(&root->root_item,
                            btrfs_root_used(&root->root_item) + size);
        spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
        spin_lock(&root->accounting_lock);
        btrfs_set_root_used(&root->root_item,
                            btrfs_root_used(&root->root_item) - size);
        spin_unlock(&root->accounting_lock);
}
/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
                                   struct extent_buffer *parent, int slot)
{
        int level = btrfs_header_level(parent);
        struct extent_buffer *eb;

        if (slot < 0)
                return NULL;
        if (slot >= btrfs_header_nritems(parent))
                return NULL;

        BUG_ON(level == 0);

        eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
                             btrfs_level_size(root, level - 1),
                             btrfs_node_ptr_generation(parent, slot));
        if (eb && !extent_buffer_uptodate(eb)) {
                free_extent_buffer(eb);
                eb = NULL;
        }

        return eb;
}
1916 * node level balancing, used to make sure nodes are in proper order for
1917 * item deletion. We balance from the top down, so we have to make sure
1918 * that a deletion won't leave an node completely empty later on.
1920 static noinline int balance_level(struct btrfs_trans_handle *trans,
1921 struct btrfs_root *root,
1922 struct btrfs_path *path, int level)
1924 struct extent_buffer *right = NULL;
1925 struct extent_buffer *mid;
1926 struct extent_buffer *left = NULL;
1927 struct extent_buffer *parent = NULL;
1928 int ret = 0;
1929 int wret;
1930 int pslot;
1931 int orig_slot = path->slots[level];
1932 u64 orig_ptr;
1934 if (level == 0)
1935 return 0;
1937 mid = path->nodes[level];
1939 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1940 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1941 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1943 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1945 if (level < BTRFS_MAX_LEVEL - 1) {
1946 parent = path->nodes[level + 1];
1947 pslot = path->slots[level + 1];
1951 * deal with the case where there is only one pointer in the root
1952 * by promoting the node below to a root
1954 if (!parent) {
1955 struct extent_buffer *child;
1957 if (btrfs_header_nritems(mid) != 1)
1958 return 0;
1960 /* promote the child to a root */
1961 child = read_node_slot(root, mid, 0);
1962 if (!child) {
1963 ret = -EROFS;
1964 btrfs_std_error(root->fs_info, ret);
1965 goto enospc;
1968 btrfs_tree_lock(child);
1969 btrfs_set_lock_blocking(child);
1970 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1971 if (ret) {
1972 btrfs_tree_unlock(child);
1973 free_extent_buffer(child);
1974 goto enospc;
1977 tree_mod_log_set_root_pointer(root, child, 1);
1978 rcu_assign_pointer(root->node, child);
1980 add_root_to_dirty_list(root);
1981 btrfs_tree_unlock(child);
1983 path->locks[level] = 0;
1984 path->nodes[level] = NULL;
1985 clean_tree_block(trans, root, mid);
1986 btrfs_tree_unlock(mid);
1987 /* once for the path */
1988 free_extent_buffer(mid);
1990 root_sub_used(root, mid->len);
1991 btrfs_free_tree_block(trans, root, mid, 0, 1);
1992 /* once for the root ptr */
1993 free_extent_buffer_stale(mid);
1994 return 0;
1996 if (btrfs_header_nritems(mid) >
1997 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1998 return 0;
2000 left = read_node_slot(root, parent, pslot - 1);
2001 if (left) {
2002 btrfs_tree_lock(left);
2003 btrfs_set_lock_blocking(left);
2004 wret = btrfs_cow_block(trans, root, left,
2005 parent, pslot - 1, &left);
2006 if (wret) {
2007 ret = wret;
2008 goto enospc;
2011 right = read_node_slot(root, parent, pslot + 1);
2012 if (right) {
2013 btrfs_tree_lock(right);
2014 btrfs_set_lock_blocking(right);
2015 wret = btrfs_cow_block(trans, root, right,
2016 parent, pslot + 1, &right);
2017 if (wret) {
2018 ret = wret;
2019 goto enospc;
2023 /* first, try to make some room in the middle buffer */
2024 if (left) {
2025 orig_slot += btrfs_header_nritems(left);
2026 wret = push_node_left(trans, root, left, mid, 1);
2027 if (wret < 0)
2028 ret = wret;
2032 * then try to empty the right most buffer into the middle
2034 if (right) {
2035 wret = push_node_left(trans, root, mid, right, 1);
2036 if (wret < 0 && wret != -ENOSPC)
2037 ret = wret;
2038 if (btrfs_header_nritems(right) == 0) {
2039 clean_tree_block(trans, root, right);
2040 btrfs_tree_unlock(right);
2041 del_ptr(root, path, level + 1, pslot + 1);
2042 root_sub_used(root, right->len);
2043 btrfs_free_tree_block(trans, root, right, 0, 1);
2044 free_extent_buffer_stale(right);
2045 right = NULL;
2046 } else {
2047 struct btrfs_disk_key right_key;
2048 btrfs_node_key(right, &right_key, 0);
2049 tree_mod_log_set_node_key(root->fs_info, parent,
2050 pslot + 1, 0);
2051 btrfs_set_node_key(parent, &right_key, pslot + 1);
2052 btrfs_mark_buffer_dirty(parent);
2055 if (btrfs_header_nritems(mid) == 1) {
2056 /*
2057 * we're not allowed to leave a node with one item in the
2058 * tree during a delete. A deletion from lower in the tree
2059 * could try to delete the only pointer in this node.
2060 * So, pull some keys from the left.
2061 * There has to be a left pointer at this point because
2062 * otherwise we would have pulled some pointers from the
2063 * right
2064 */
2065 if (!left) {
2066 ret = -EROFS;
2067 btrfs_std_error(root->fs_info, ret);
2068 goto enospc;
2070 wret = balance_node_right(trans, root, mid, left);
2071 if (wret < 0) {
2072 ret = wret;
2073 goto enospc;
2075 if (wret == 1) {
2076 wret = push_node_left(trans, root, left, mid, 1);
2077 if (wret < 0)
2078 ret = wret;
2079 }
2080 BUG_ON(wret == 1);
2081 }
2082 if (btrfs_header_nritems(mid) == 0) {
2083 clean_tree_block(trans, root, mid);
2084 btrfs_tree_unlock(mid);
2085 del_ptr(root, path, level + 1, pslot);
2086 root_sub_used(root, mid->len);
2087 btrfs_free_tree_block(trans, root, mid, 0, 1);
2088 free_extent_buffer_stale(mid);
2089 mid = NULL;
2090 } else {
2091 /* update the parent key to reflect our changes */
2092 struct btrfs_disk_key mid_key;
2093 btrfs_node_key(mid, &mid_key, 0);
2094 tree_mod_log_set_node_key(root->fs_info, parent,
2095 pslot, 0);
2096 btrfs_set_node_key(parent, &mid_key, pslot);
2097 btrfs_mark_buffer_dirty(parent);
2100 /* update the path */
2101 if (left) {
2102 if (btrfs_header_nritems(left) > orig_slot) {
2103 extent_buffer_get(left);
2104 /* left was locked after cow */
2105 path->nodes[level] = left;
2106 path->slots[level + 1] -= 1;
2107 path->slots[level] = orig_slot;
2108 if (mid) {
2109 btrfs_tree_unlock(mid);
2110 free_extent_buffer(mid);
2112 } else {
2113 orig_slot -= btrfs_header_nritems(left);
2114 path->slots[level] = orig_slot;
2117 /* double check we haven't messed things up */
2118 if (orig_ptr !=
2119 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2120 BUG();
2121 enospc:
2122 if (right) {
2123 btrfs_tree_unlock(right);
2124 free_extent_buffer(right);
2126 if (left) {
2127 if (path->nodes[level] != left)
2128 btrfs_tree_unlock(left);
2129 free_extent_buffer(left);
2130 }
2131 return ret;
2132 }
2134 /* Node balancing for insertion. Here we only split or push nodes around
2135 * when they are completely full. This is also done top down, so we
2136 * have to be pessimistic.
2137 */
2138 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2139 struct btrfs_root *root,
2140 struct btrfs_path *path, int level)
2142 struct extent_buffer *right = NULL;
2143 struct extent_buffer *mid;
2144 struct extent_buffer *left = NULL;
2145 struct extent_buffer *parent = NULL;
2146 int ret = 0;
2147 int wret;
2148 int pslot;
2149 int orig_slot = path->slots[level];
2151 if (level == 0)
2152 return 1;
2154 mid = path->nodes[level];
2155 WARN_ON(btrfs_header_generation(mid) != trans->transid);
2157 if (level < BTRFS_MAX_LEVEL - 1) {
2158 parent = path->nodes[level + 1];
2159 pslot = path->slots[level + 1];
2162 if (!parent)
2163 return 1;
2165 left = read_node_slot(root, parent, pslot - 1);
2167 /* first, try to make some room in the middle buffer */
2168 if (left) {
2169 u32 left_nr;
2171 btrfs_tree_lock(left);
2172 btrfs_set_lock_blocking(left);
2174 left_nr = btrfs_header_nritems(left);
2175 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2176 wret = 1;
2177 } else {
2178 ret = btrfs_cow_block(trans, root, left, parent,
2179 pslot - 1, &left);
2180 if (ret)
2181 wret = 1;
2182 else {
2183 wret = push_node_left(trans, root,
2184 left, mid, 0);
2187 if (wret < 0)
2188 ret = wret;
2189 if (wret == 0) {
2190 struct btrfs_disk_key disk_key;
2191 orig_slot += left_nr;
2192 btrfs_node_key(mid, &disk_key, 0);
2193 tree_mod_log_set_node_key(root->fs_info, parent,
2194 pslot, 0);
2195 btrfs_set_node_key(parent, &disk_key, pslot);
2196 btrfs_mark_buffer_dirty(parent);
2197 if (btrfs_header_nritems(left) > orig_slot) {
2198 path->nodes[level] = left;
2199 path->slots[level + 1] -= 1;
2200 path->slots[level] = orig_slot;
2201 btrfs_tree_unlock(mid);
2202 free_extent_buffer(mid);
2203 } else {
2204 orig_slot -=
2205 btrfs_header_nritems(left);
2206 path->slots[level] = orig_slot;
2207 btrfs_tree_unlock(left);
2208 free_extent_buffer(left);
2210 return 0;
2212 btrfs_tree_unlock(left);
2213 free_extent_buffer(left);
2215 right = read_node_slot(root, parent, pslot + 1);
2217 /*
2218 * then try to empty the right most buffer into the middle
2219 */
2220 if (right) {
2221 u32 right_nr;
2223 btrfs_tree_lock(right);
2224 btrfs_set_lock_blocking(right);
2226 right_nr = btrfs_header_nritems(right);
2227 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2228 wret = 1;
2229 } else {
2230 ret = btrfs_cow_block(trans, root, right,
2231 parent, pslot + 1,
2232 &right);
2233 if (ret)
2234 wret = 1;
2235 else {
2236 wret = balance_node_right(trans, root,
2237 right, mid);
2240 if (wret < 0)
2241 ret = wret;
2242 if (wret == 0) {
2243 struct btrfs_disk_key disk_key;
2245 btrfs_node_key(right, &disk_key, 0);
2246 tree_mod_log_set_node_key(root->fs_info, parent,
2247 pslot + 1, 0);
2248 btrfs_set_node_key(parent, &disk_key, pslot + 1);
2249 btrfs_mark_buffer_dirty(parent);
2251 if (btrfs_header_nritems(mid) <= orig_slot) {
2252 path->nodes[level] = right;
2253 path->slots[level + 1] += 1;
2254 path->slots[level] = orig_slot -
2255 btrfs_header_nritems(mid);
2256 btrfs_tree_unlock(mid);
2257 free_extent_buffer(mid);
2258 } else {
2259 btrfs_tree_unlock(right);
2260 free_extent_buffer(right);
2262 return 0;
2264 btrfs_tree_unlock(right);
2265 free_extent_buffer(right);
2266 }
2267 return 1;
2268 }
2270 /*
2271 * readahead one full node of leaves, finding things that are close
2272 * to the block in 'slot', and triggering ra on them.
2273 */
2274 static void reada_for_search(struct btrfs_root *root,
2275 struct btrfs_path *path,
2276 int level, int slot, u64 objectid)
2278 struct extent_buffer *node;
2279 struct btrfs_disk_key disk_key;
2280 u32 nritems;
2281 u64 search;
2282 u64 target;
2283 u64 nread = 0;
2284 u64 gen;
2285 int direction = path->reada;
2286 struct extent_buffer *eb;
2287 u32 nr;
2288 u32 blocksize;
2289 u32 nscan = 0;
2291 if (level != 1)
2292 return;
2294 if (!path->nodes[level])
2295 return;
2297 node = path->nodes[level];
2299 search = btrfs_node_blockptr(node, slot);
2300 blocksize = btrfs_level_size(root, level - 1);
2301 eb = btrfs_find_tree_block(root, search, blocksize);
2302 if (eb) {
2303 free_extent_buffer(eb);
2304 return;
2307 target = search;
2309 nritems = btrfs_header_nritems(node);
2310 nr = slot;
2312 while (1) {
2313 if (direction < 0) {
2314 if (nr == 0)
2315 break;
2316 nr--;
2317 } else if (direction > 0) {
2318 nr++;
2319 if (nr >= nritems)
2320 break;
2322 if (path->reada < 0 && objectid) {
2323 btrfs_node_key(node, &disk_key, nr);
2324 if (btrfs_disk_key_objectid(&disk_key) != objectid)
2325 break;
2327 search = btrfs_node_blockptr(node, nr);
2328 if ((search <= target && target - search <= 65536) ||
2329 (search > target && search - target <= 65536)) {
2330 gen = btrfs_node_ptr_generation(node, nr);
2331 readahead_tree_block(root, search, blocksize, gen);
2332 nread += blocksize;
2334 nscan++;
2335 if ((nread > 65536 || nscan > 32))
2336 break;
2337 }
2338 }
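/*
 * Editor's note: a minimal sketch (not part of the original file) of the
 * distance test reada_for_search() applies above. The helper name is
 * hypothetical; only the 64KiB window mirrors the code.
 */
static inline int example_within_ra_window(u64 search, u64 target)
{
	/* readahead is only triggered within 64KiB of the target blockptr */
	return (search <= target && target - search <= 65536) ||
	       (search > target && search - target <= 65536);
}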
2340 static noinline void reada_for_balance(struct btrfs_root *root,
2341 struct btrfs_path *path, int level)
2343 int slot;
2344 int nritems;
2345 struct extent_buffer *parent;
2346 struct extent_buffer *eb;
2347 u64 gen;
2348 u64 block1 = 0;
2349 u64 block2 = 0;
2350 int blocksize;
2352 parent = path->nodes[level + 1];
2353 if (!parent)
2354 return;
2356 nritems = btrfs_header_nritems(parent);
2357 slot = path->slots[level + 1];
2358 blocksize = btrfs_level_size(root, level);
2360 if (slot > 0) {
2361 block1 = btrfs_node_blockptr(parent, slot - 1);
2362 gen = btrfs_node_ptr_generation(parent, slot - 1);
2363 eb = btrfs_find_tree_block(root, block1, blocksize);
2364 /*
2365 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2366 * don't want to return EAGAIN here. That will loop
2367 * forever
2368 */
2369 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2370 block1 = 0;
2371 free_extent_buffer(eb);
2373 if (slot + 1 < nritems) {
2374 block2 = btrfs_node_blockptr(parent, slot + 1);
2375 gen = btrfs_node_ptr_generation(parent, slot + 1);
2376 eb = btrfs_find_tree_block(root, block2, blocksize);
2377 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2378 block2 = 0;
2379 free_extent_buffer(eb);
2382 if (block1)
2383 readahead_tree_block(root, block1, blocksize, 0);
2384 if (block2)
2385 readahead_tree_block(root, block2, blocksize, 0);
2386 }
2389 /*
2390 * when we walk down the tree, it is usually safe to unlock the higher layers
2391 * in the tree. The exceptions are when our path goes through slot 0, because
2392 * operations on the tree might require changing key pointers higher up in the
2393 * tree.
2395 * callers might also have set path->keep_locks, which tells this code to keep
2396 * the lock if the path points to the last slot in the block. This is part of
2397 * walking through the tree, and selecting the next slot in the higher block.
2399 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
2400 * if lowest_unlock is 1, level 0 won't be unlocked
2401 */
2402 static noinline void unlock_up(struct btrfs_path *path, int level,
2403 int lowest_unlock, int min_write_lock_level,
2404 int *write_lock_level)
2406 int i;
2407 int skip_level = level;
2408 int no_skips = 0;
2409 struct extent_buffer *t;
2411 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2412 if (!path->nodes[i])
2413 break;
2414 if (!path->locks[i])
2415 break;
2416 if (!no_skips && path->slots[i] == 0) {
2417 skip_level = i + 1;
2418 continue;
2420 if (!no_skips && path->keep_locks) {
2421 u32 nritems;
2422 t = path->nodes[i];
2423 nritems = btrfs_header_nritems(t);
2424 if (nritems < 1 || path->slots[i] >= nritems - 1) {
2425 skip_level = i + 1;
2426 continue;
2429 if (skip_level < i && i >= lowest_unlock)
2430 no_skips = 1;
2432 t = path->nodes[i];
2433 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2434 btrfs_tree_unlock_rw(t, path->locks[i]);
2435 path->locks[i] = 0;
2436 if (write_lock_level &&
2437 i > min_write_lock_level &&
2438 i <= *write_lock_level) {
2439 *write_lock_level = i - 1;
2440 }
2441 }
2442 }
2443 }
2445 /*
2446 * This releases any locks held in the path starting at level and
2447 * going all the way up to the root.
2449 * btrfs_search_slot will keep the lock held on higher nodes in a few
2450 * corner cases, such as COW of the block at slot zero in the node. This
2451 * ignores those rules, and it should only be called when there are no
2452 * more updates to be done higher up in the tree.
2453 */
2454 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2456 int i;
2458 if (path->keep_locks)
2459 return;
2461 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2462 if (!path->nodes[i])
2463 continue;
2464 if (!path->locks[i])
2465 continue;
2466 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2467 path->locks[i] = 0;
2468 }
2469 }
2471 /*
2472 * helper function for btrfs_search_slot. The goal is to find a block
2473 * in cache without setting the path to blocking. If we find the block
2474 * we return zero and the path is unchanged.
2476 * If we can't find the block, we set the path blocking and do some
2477 * reada. -EAGAIN is returned and the search must be repeated.
2478 */
2479 static int
2480 read_block_for_search(struct btrfs_trans_handle *trans,
2481 struct btrfs_root *root, struct btrfs_path *p,
2482 struct extent_buffer **eb_ret, int level, int slot,
2483 struct btrfs_key *key, u64 time_seq)
2485 u64 blocknr;
2486 u64 gen;
2487 u32 blocksize;
2488 struct extent_buffer *b = *eb_ret;
2489 struct extent_buffer *tmp;
2490 int ret;
2492 blocknr = btrfs_node_blockptr(b, slot);
2493 gen = btrfs_node_ptr_generation(b, slot);
2494 blocksize = btrfs_level_size(root, level - 1);
2496 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
2497 if (tmp) {
2498 /* first we do an atomic uptodate check */
2499 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2500 *eb_ret = tmp;
2501 return 0;
2502 }
2504 /* the pages were up to date, but we failed
2505 * the generation number check. Do a full
2506 * read for the generation number that is correct.
2507 * We must do this without dropping locks so
2508 * we can trust our generation number
2509 */
2510 btrfs_set_path_blocking(p);
2512 /* now we're allowed to do a blocking uptodate check */
2513 ret = btrfs_read_buffer(tmp, gen);
2514 if (!ret) {
2515 *eb_ret = tmp;
2516 return 0;
2517 }
2518 free_extent_buffer(tmp);
2519 btrfs_release_path(p);
2520 return -EIO;
2521 }
2523 /*
2524 * reduce lock contention at high levels
2525 * of the btree by dropping locks before
2526 * we read. Don't release the lock on the current
2527 * level because we need to walk this node to figure
2528 * out which blocks to read.
2529 */
2530 btrfs_unlock_up_safe(p, level + 1);
2531 btrfs_set_path_blocking(p);
2533 free_extent_buffer(tmp);
2534 if (p->reada)
2535 reada_for_search(root, p, level, slot, key->objectid);
2537 btrfs_release_path(p);
2539 ret = -EAGAIN;
2540 tmp = read_tree_block(root, blocknr, blocksize, 0);
2541 if (tmp) {
2542 /*
2543 * If the read above didn't mark this buffer up to date,
2544 * it will never end up being up to date. Set ret to EIO now
2545 * and give up so that our caller doesn't loop forever
2546 * on our EAGAINs.
2547 */
2548 if (!btrfs_buffer_uptodate(tmp, 0, 0))
2549 ret = -EIO;
2550 free_extent_buffer(tmp);
2551 }
2552 return ret;
2553 }
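/*
 * Editor's note (sketch, not in the original file): the retry contract
 * described above means a caller must restart its whole walk on -EAGAIN,
 * because the path has been released. btrfs_search_slot() below does
 * exactly this with its "again" label:
 *
 *	err = read_block_for_search(trans, root, p, &b, level, slot, key, 0);
 *	if (err == -EAGAIN)
 *		goto again;	(path released, start from the root)
 *	if (err)
 *		goto done;	(-EIO: the buffer will never become uptodate)
 */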
2555 /*
2556 * helper function for btrfs_search_slot. This does all of the checks
2557 * for node-level blocks and does any balancing required based on
2558 * the ins_len.
2560 * If no extra work was required, zero is returned. If we had to
2561 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2562 * start over
2563 */
2564 static int
2565 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2566 struct btrfs_root *root, struct btrfs_path *p,
2567 struct extent_buffer *b, int level, int ins_len,
2568 int *write_lock_level)
2570 int ret;
2571 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2572 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2573 int sret;
2575 if (*write_lock_level < level + 1) {
2576 *write_lock_level = level + 1;
2577 btrfs_release_path(p);
2578 goto again;
2581 btrfs_set_path_blocking(p);
2582 reada_for_balance(root, p, level);
2583 sret = split_node(trans, root, p, level);
2584 btrfs_clear_path_blocking(p, NULL, 0);
2586 BUG_ON(sret > 0);
2587 if (sret) {
2588 ret = sret;
2589 goto done;
2591 b = p->nodes[level];
2592 } else if (ins_len < 0 && btrfs_header_nritems(b) <
2593 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2594 int sret;
2596 if (*write_lock_level < level + 1) {
2597 *write_lock_level = level + 1;
2598 btrfs_release_path(p);
2599 goto again;
2602 btrfs_set_path_blocking(p);
2603 reada_for_balance(root, p, level);
2604 sret = balance_level(trans, root, p, level);
2605 btrfs_clear_path_blocking(p, NULL, 0);
2607 if (sret) {
2608 ret = sret;
2609 goto done;
2611 b = p->nodes[level];
2612 if (!b) {
2613 btrfs_release_path(p);
2614 goto again;
2615 }
2616 BUG_ON(btrfs_header_nritems(b) == 1);
2617 }
2618 return 0;
2620 again:
2621 ret = -EAGAIN;
2622 done:
2623 return ret;
2624 }
2626 static void key_search_validate(struct extent_buffer *b,
2627 struct btrfs_key *key,
2628 int level)
2630 #ifdef CONFIG_BTRFS_ASSERT
2631 struct btrfs_disk_key disk_key;
2633 btrfs_cpu_key_to_disk(&disk_key, key);
2635 if (level == 0)
2636 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2637 offsetof(struct btrfs_leaf, items[0].key),
2638 sizeof(disk_key)));
2639 else
2640 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2641 offsetof(struct btrfs_node, ptrs[0].key),
2642 sizeof(disk_key)));
2643 #endif
2644 }
2646 static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2647 int level, int *prev_cmp, int *slot)
2649 if (*prev_cmp != 0) {
2650 *prev_cmp = bin_search(b, key, level, slot);
2651 return *prev_cmp;
2654 key_search_validate(b, key, level);
2655 *slot = 0;
2657 return 0;
2658 }
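/*
 * Editor's note (sketch, not in the original file): the *prev_cmp
 * optimization works because once bin_search() has returned 0 at some
 * level, the key being searched for is the lowest key of the chosen
 * subtree, so it must sit at slot 0 of every block further down.
 * key_search_validate() above asserts exactly that when
 * CONFIG_BTRFS_ASSERT is enabled.
 */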
2660 /*
2661 * look for key in the tree. path is filled in with nodes along the way
2662 * if key is found, we return zero and you can find the item in the leaf
2663 * level of the path (level 0)
2665 * If the key isn't found, the path points to the slot where it should
2666 * be inserted, and 1 is returned. If there are other errors during the
2667 * search a negative error number is returned.
2669 * if ins_len > 0, nodes and leaves will be split as we walk down the
2670 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
2671 * possible)
2672 */
2673 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2674 *root, struct btrfs_key *key, struct btrfs_path *p, int
2675 ins_len, int cow)
2677 struct extent_buffer *b;
2678 int slot;
2679 int ret;
2680 int err;
2681 int level;
2682 int lowest_unlock = 1;
2683 int root_lock;
2684 /* everything at write_lock_level or lower must be write locked */
2685 int write_lock_level = 0;
2686 u8 lowest_level = 0;
2687 int min_write_lock_level;
2688 int prev_cmp;
2690 lowest_level = p->lowest_level;
2691 WARN_ON(lowest_level && ins_len > 0);
2692 WARN_ON(p->nodes[0] != NULL);
2694 if (ins_len < 0) {
2695 lowest_unlock = 2;
2697 /* when we are removing items, we might have to go up to level
2698 * two as we update tree pointers Make sure we keep write
2699 * for those levels as well
2700 */
2701 write_lock_level = 2;
2702 } else if (ins_len > 0) {
2703 /*
2704 * for inserting items, make sure we have a write lock on
2705 * level 1 so we can update keys
2706 */
2707 write_lock_level = 1;
2708 }
2710 if (!cow)
2711 write_lock_level = -1;
2713 if (cow && (p->keep_locks || p->lowest_level))
2714 write_lock_level = BTRFS_MAX_LEVEL;
2716 min_write_lock_level = write_lock_level;
2718 again:
2719 prev_cmp = -1;
2720 /*
2721 * we try very hard to do read locks on the root
2722 */
2723 root_lock = BTRFS_READ_LOCK;
2724 level = 0;
2725 if (p->search_commit_root) {
2726 /*
2727 * the commit roots are read only
2728 * so we always do read locks
2729 */
2730 b = root->commit_root;
2731 extent_buffer_get(b);
2732 level = btrfs_header_level(b);
2733 if (!p->skip_locking)
2734 btrfs_tree_read_lock(b);
2735 } else {
2736 if (p->skip_locking) {
2737 b = btrfs_root_node(root);
2738 level = btrfs_header_level(b);
2739 } else {
2740 /* we don't know the level of the root node
2741 * until we actually have it read locked
2742 */
2743 b = btrfs_read_lock_root_node(root);
2744 level = btrfs_header_level(b);
2745 if (level <= write_lock_level) {
2746 /* whoops, must trade for write lock */
2747 btrfs_tree_read_unlock(b);
2748 free_extent_buffer(b);
2749 b = btrfs_lock_root_node(root);
2750 root_lock = BTRFS_WRITE_LOCK;
2752 /* the level might have changed, check again */
2753 level = btrfs_header_level(b);
2754 }
2755 }
2757 p->nodes[level] = b;
2758 if (!p->skip_locking)
2759 p->locks[level] = root_lock;
2761 while (b) {
2762 level = btrfs_header_level(b);
2764 /*
2765 * setup the path here so we can release it under lock
2766 * contention with the cow code
2767 */
2768 if (cow) {
2769 /*
2770 * if we don't really need to cow this block
2771 * then we don't want to set the path blocking,
2772 * so we test it here
2773 */
2774 if (!should_cow_block(trans, root, b))
2775 goto cow_done;
2777 btrfs_set_path_blocking(p);
2779 /*
2780 * must have write locks on this node and the
2781 * parent
2782 */
2783 if (level > write_lock_level ||
2784 (level + 1 > write_lock_level &&
2785 level + 1 < BTRFS_MAX_LEVEL &&
2786 p->nodes[level + 1])) {
2787 write_lock_level = level + 1;
2788 btrfs_release_path(p);
2789 goto again;
2792 err = btrfs_cow_block(trans, root, b,
2793 p->nodes[level + 1],
2794 p->slots[level + 1], &b);
2795 if (err) {
2796 ret = err;
2797 goto done;
2798 }
2799 }
2800 cow_done:
2801 BUG_ON(!cow && ins_len);
2803 p->nodes[level] = b;
2804 btrfs_clear_path_blocking(p, NULL, 0);
2806 /*
2807 * we have a lock on b and as long as we aren't changing
2808 * the tree, there is no way for the items in b to change.
2809 * It is safe to drop the lock on our parent before we
2810 * go through the expensive btree search on b.
2811 *
2812 * If cow is true, then we might be changing slot zero,
2813 * which may require changing the parent. So, we can't
2814 * drop the lock until after we know which slot we're
2815 * operating on.
2816 */
2817 if (!cow)
2818 btrfs_unlock_up_safe(p, level + 1);
2820 ret = key_search(b, key, level, &prev_cmp, &slot);
2822 if (level != 0) {
2823 int dec = 0;
2824 if (ret && slot > 0) {
2825 dec = 1;
2826 slot -= 1;
2828 p->slots[level] = slot;
2829 err = setup_nodes_for_search(trans, root, p, b, level,
2830 ins_len, &write_lock_level);
2831 if (err == -EAGAIN)
2832 goto again;
2833 if (err) {
2834 ret = err;
2835 goto done;
2837 b = p->nodes[level];
2838 slot = p->slots[level];
2840 /*
2841 * slot 0 is special, if we change the key
2842 * we have to update the parent pointer
2843 * which means we must have a write lock
2844 * on the parent
2845 */
2846 if (slot == 0 && cow &&
2847 write_lock_level < level + 1) {
2848 write_lock_level = level + 1;
2849 btrfs_release_path(p);
2850 goto again;
2853 unlock_up(p, level, lowest_unlock,
2854 min_write_lock_level, &write_lock_level);
2856 if (level == lowest_level) {
2857 if (dec)
2858 p->slots[level]++;
2859 goto done;
2862 err = read_block_for_search(trans, root, p,
2863 &b, level, slot, key, 0);
2864 if (err == -EAGAIN)
2865 goto again;
2866 if (err) {
2867 ret = err;
2868 goto done;
2871 if (!p->skip_locking) {
2872 level = btrfs_header_level(b);
2873 if (level <= write_lock_level) {
2874 err = btrfs_try_tree_write_lock(b);
2875 if (!err) {
2876 btrfs_set_path_blocking(p);
2877 btrfs_tree_lock(b);
2878 btrfs_clear_path_blocking(p, b,
2879 BTRFS_WRITE_LOCK);
2881 p->locks[level] = BTRFS_WRITE_LOCK;
2882 } else {
2883 err = btrfs_try_tree_read_lock(b);
2884 if (!err) {
2885 btrfs_set_path_blocking(p);
2886 btrfs_tree_read_lock(b);
2887 btrfs_clear_path_blocking(p, b,
2888 BTRFS_READ_LOCK);
2889 }
2890 p->locks[level] = BTRFS_READ_LOCK;
2891 }
2892 p->nodes[level] = b;
2893 }
2894 } else {
2895 p->slots[level] = slot;
2896 if (ins_len > 0 &&
2897 btrfs_leaf_free_space(root, b) < ins_len) {
2898 if (write_lock_level < 1) {
2899 write_lock_level = 1;
2900 btrfs_release_path(p);
2901 goto again;
2904 btrfs_set_path_blocking(p);
2905 err = split_leaf(trans, root, key,
2906 p, ins_len, ret == 0);
2907 btrfs_clear_path_blocking(p, NULL, 0);
2909 BUG_ON(err > 0);
2910 if (err) {
2911 ret = err;
2912 goto done;
2915 if (!p->search_for_split)
2916 unlock_up(p, level, lowest_unlock,
2917 min_write_lock_level, &write_lock_level);
2918 goto done;
2921 ret = 1;
2922 done:
2923 /*
2924 * we don't really know what they plan on doing with the path
2925 * from here on, so for now just mark it as blocking
2926 */
2927 if (!p->leave_spinning)
2928 btrfs_set_path_blocking(p);
2929 if (ret < 0)
2930 btrfs_release_path(p);
2931 return ret;
2932 }
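/*
 * Editor's note: a minimal, hedged usage sketch of btrfs_search_slot()
 * (not part of the original file). The item type chosen and the error
 * handling are illustrative only.
 */
static int example_lookup(struct btrfs_root *root, u64 objectid, u64 offset)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;	/* illustrative item type */
	key.offset = offset;

	/* read-only lookup: no transaction, ins_len == 0, cow == 0 */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret == 0) {
		/* exact match at path->nodes[0], path->slots[0] */
	} else if (ret == 1) {
		/* not found; path points at the would-be insert position */
		ret = -ENOENT;
	}

	btrfs_free_path(path);
	return ret;
}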
2934 /*
2935 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2936 * current state of the tree together with the operations recorded in the tree
2937 * modification log to search for the key in a previous version of this tree, as
2938 * denoted by the time_seq parameter.
2940 * Naturally, there is no support for insert, delete or cow operations.
2942 * The resulting path and return value will be set up as if we called
2943 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2944 */
2945 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2946 struct btrfs_path *p, u64 time_seq)
2948 struct extent_buffer *b;
2949 int slot;
2950 int ret;
2951 int err;
2952 int level;
2953 int lowest_unlock = 1;
2954 u8 lowest_level = 0;
2955 int prev_cmp = -1;
2957 lowest_level = p->lowest_level;
2958 WARN_ON(p->nodes[0] != NULL);
2960 if (p->search_commit_root) {
2961 BUG_ON(time_seq);
2962 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2965 again:
2966 b = get_old_root(root, time_seq);
2967 level = btrfs_header_level(b);
2968 p->locks[level] = BTRFS_READ_LOCK;
2970 while (b) {
2971 level = btrfs_header_level(b);
2972 p->nodes[level] = b;
2973 btrfs_clear_path_blocking(p, NULL, 0);
2975 /*
2976 * we have a lock on b and as long as we aren't changing
2977 * the tree, there is no way for the items in b to change.
2978 * It is safe to drop the lock on our parent before we
2979 * go through the expensive btree search on b.
2980 */
2981 btrfs_unlock_up_safe(p, level + 1);
2983 /*
2984 * Since we can unwind eb's we want to do a real search every
2985 * time.
2986 */
2987 prev_cmp = -1;
2988 ret = key_search(b, key, level, &prev_cmp, &slot);
2990 if (level != 0) {
2991 int dec = 0;
2992 if (ret && slot > 0) {
2993 dec = 1;
2994 slot -= 1;
2996 p->slots[level] = slot;
2997 unlock_up(p, level, lowest_unlock, 0, NULL);
2999 if (level == lowest_level) {
3000 if (dec)
3001 p->slots[level]++;
3002 goto done;
3005 err = read_block_for_search(NULL, root, p, &b, level,
3006 slot, key, time_seq);
3007 if (err == -EAGAIN)
3008 goto again;
3009 if (err) {
3010 ret = err;
3011 goto done;
3014 level = btrfs_header_level(b);
3015 err = btrfs_try_tree_read_lock(b);
3016 if (!err) {
3017 btrfs_set_path_blocking(p);
3018 btrfs_tree_read_lock(b);
3019 btrfs_clear_path_blocking(p, b,
3020 BTRFS_READ_LOCK);
3022 b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
3023 if (!b) {
3024 ret = -ENOMEM;
3025 goto done;
3027 p->locks[level] = BTRFS_READ_LOCK;
3028 p->nodes[level] = b;
3029 } else {
3030 p->slots[level] = slot;
3031 unlock_up(p, level, lowest_unlock, 0, NULL);
3032 goto done;
3033 }
3034 }
3035 ret = 1;
3036 done:
3037 if (!p->leave_spinning)
3038 btrfs_set_path_blocking(p);
3039 if (ret < 0)
3040 btrfs_release_path(p);
3042 return ret;
3043 }
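/*
 * Editor's note: hedged sketch (not in the original file) of a read-only
 * time-travel lookup. A real caller obtains time_seq from the tree mod
 * log (see btrfs_get_tree_mod_seq()) before the blocks it cares about
 * are rewound.
 */
static int example_old_lookup(struct btrfs_root *root, struct btrfs_key *key,
			      u64 time_seq)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* same return convention as btrfs_search_slot, minus cow/insert */
	ret = btrfs_search_old_slot(root, key, path, time_seq);

	btrfs_free_path(path);
	return ret;
}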
3045 /*
3046 * helper to use instead of search slot if no exact match is needed but
3047 * instead the next or previous item should be returned.
3048 * When find_higher is true, the next higher item is returned, the next lower
3049 * otherwise.
3050 * When return_any and find_higher are both true, and no higher item is found,
3051 * return the next lower instead.
3052 * When return_any is true and find_higher is false, and no lower item is found,
3053 * return the next higher instead.
3054 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3055 * < 0 on error
3056 */
3057 int btrfs_search_slot_for_read(struct btrfs_root *root,
3058 struct btrfs_key *key, struct btrfs_path *p,
3059 int find_higher, int return_any)
3061 int ret;
3062 struct extent_buffer *leaf;
3064 again:
3065 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3066 if (ret <= 0)
3067 return ret;
3068 /*
3069 * a return value of 1 means the path is at the position where the
3070 * item should be inserted. Normally this is the next bigger item,
3071 * but in case the previous item is the last in a leaf, path points
3072 * to the first free slot in the previous leaf, i.e. at an invalid
3073 * item.
3074 */
3075 leaf = p->nodes[0];
3077 if (find_higher) {
3078 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3079 ret = btrfs_next_leaf(root, p);
3080 if (ret <= 0)
3081 return ret;
3082 if (!return_any)
3083 return 1;
3084 /*
3085 * no higher item found, return the next
3086 * lower instead
3087 */
3088 return_any = 0;
3089 find_higher = 0;
3090 btrfs_release_path(p);
3091 goto again;
3093 } else {
3094 if (p->slots[0] == 0) {
3095 ret = btrfs_prev_leaf(root, p);
3096 if (ret < 0)
3097 return ret;
3098 if (!ret) {
3099 p->slots[0] = btrfs_header_nritems(leaf) - 1;
3100 return 0;
3102 if (!return_any)
3103 return 1;
3104 /*
3105 * no lower item found, return the next
3106 * higher instead
3107 */
3108 return_any = 0;
3109 find_higher = 1;
3110 btrfs_release_path(p);
3111 goto again;
3112 } else {
3113 --p->slots[0];
3114 }
3115 }
3116 return 0;
3117 }
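/*
 * Editor's note: minimal sketch (not in the original file) of the
 * find_higher/return_any semantics documented above.
 */
static int example_find_nearest(struct btrfs_root *root,
				struct btrfs_key *search_key,
				struct btrfs_key *found_key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * look for the item at or after search_key; with return_any set,
	 * fall back to the next lower item when nothing higher exists
	 */
	ret = btrfs_search_slot_for_read(root, search_key, path, 1, 1);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], found_key,
				      path->slots[0]);

	btrfs_free_path(path);
	return ret;	/* 0 found, 1 tree empty, < 0 error */
}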
3119 /*
3120 * adjust the pointers going up the tree, starting at level
3121 * making sure the right key of each node points to 'key'.
3122 * This is used after shifting pointers to the left, so it stops
3123 * fixing up pointers when a given leaf/node is not in slot 0 of the
3124 * higher levels
3125 */
3127 static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
3128 struct btrfs_disk_key *key, int level)
3130 int i;
3131 struct extent_buffer *t;
3133 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3134 int tslot = path->slots[i];
3135 if (!path->nodes[i])
3136 break;
3137 t = path->nodes[i];
3138 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
3139 btrfs_set_node_key(t, key, tslot);
3140 btrfs_mark_buffer_dirty(path->nodes[i]);
3141 if (tslot != 0)
3142 break;
3143 }
3144 }
3146 /*
3147 * update item key.
3148 *
3149 * This function isn't completely safe. It's the caller's responsibility
3150 * that the new key won't break the order
3151 */
3152 void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
3153 struct btrfs_key *new_key)
3155 struct btrfs_disk_key disk_key;
3156 struct extent_buffer *eb;
3157 int slot;
3159 eb = path->nodes[0];
3160 slot = path->slots[0];
3161 if (slot > 0) {
3162 btrfs_item_key(eb, &disk_key, slot - 1);
3163 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3165 if (slot < btrfs_header_nritems(eb) - 1) {
3166 btrfs_item_key(eb, &disk_key, slot + 1);
3167 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3170 btrfs_cpu_key_to_disk(&disk_key, new_key);
3171 btrfs_set_item_key(eb, &disk_key, slot);
3172 btrfs_mark_buffer_dirty(eb);
3173 if (slot == 0)
3174 fixup_low_keys(root, path, &disk_key, 1);
3175 }
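/*
 * Editor's note: hedged sketch (not in the original file). A typical
 * caller rewrites the key of the item the path points at, relying on
 * the BUG_ONs above to catch an ordering violation.
 */
static void example_change_key_offset(struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 new_offset)
{
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	key.offset = new_offset;	/* must keep the leaf sorted */
	btrfs_set_item_key_safe(root, path, &key);
}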
3177 /*
3178 * try to push data from one node into the next node left in the
3179 * tree.
3180 *
3181 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3182 * error, and > 0 if there was no room in the left hand block.
3183 */
3184 static int push_node_left(struct btrfs_trans_handle *trans,
3185 struct btrfs_root *root, struct extent_buffer *dst,
3186 struct extent_buffer *src, int empty)
3188 int push_items = 0;
3189 int src_nritems;
3190 int dst_nritems;
3191 int ret = 0;
3193 src_nritems = btrfs_header_nritems(src);
3194 dst_nritems = btrfs_header_nritems(dst);
3195 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3196 WARN_ON(btrfs_header_generation(src) != trans->transid);
3197 WARN_ON(btrfs_header_generation(dst) != trans->transid);
3199 if (!empty && src_nritems <= 8)
3200 return 1;
3202 if (push_items <= 0)
3203 return 1;
3205 if (empty) {
3206 push_items = min(src_nritems, push_items);
3207 if (push_items < src_nritems) {
3208 /* leave at least 8 pointers in the node if
3209 * we aren't going to empty it
3210 */
3211 if (src_nritems - push_items < 8) {
3212 if (push_items <= 8)
3213 return 1;
3214 push_items -= 8;
3217 } else
3218 push_items = min(src_nritems - 8, push_items);
3220 ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3221 push_items);
3222 if (ret) {
3223 btrfs_abort_transaction(trans, root, ret);
3224 return ret;
3226 copy_extent_buffer(dst, src,
3227 btrfs_node_key_ptr_offset(dst_nritems),
3228 btrfs_node_key_ptr_offset(0),
3229 push_items * sizeof(struct btrfs_key_ptr));
3231 if (push_items < src_nritems) {
3232 /*
3233 * don't call tree_mod_log_eb_move here, key removal was already
3234 * fully logged by tree_mod_log_eb_copy above.
3235 */
3236 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3237 btrfs_node_key_ptr_offset(push_items),
3238 (src_nritems - push_items) *
3239 sizeof(struct btrfs_key_ptr));
3241 btrfs_set_header_nritems(src, src_nritems - push_items);
3242 btrfs_set_header_nritems(dst, dst_nritems + push_items);
3243 btrfs_mark_buffer_dirty(src);
3244 btrfs_mark_buffer_dirty(dst);
3246 return ret;
3247 }
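/*
 * Editor's note (sketch, not in the original file): callers handle the
 * three outcomes of push_node_left() separately, e.g.:
 *
 *	wret = push_node_left(trans, root, left, mid, 1);
 *	if (wret < 0)		hard error, propagate it
 *	else if (wret == 0)	some pointers moved, mid shrank
 *	else			no room in 'left', try the other side
 *
 * balance_level() and push_nodes_for_insert() above follow this pattern.
 */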
3249 /*
3250 * try to push data from one node into the next node right in the
3251 * tree.
3252 *
3253 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3254 * error, and > 0 if there was no room in the right hand block.
3255 *
3256 * this will only push up to 1/2 the contents of the left node over
3257 */
3258 static int balance_node_right(struct btrfs_trans_handle *trans,
3259 struct btrfs_root *root,
3260 struct extent_buffer *dst,
3261 struct extent_buffer *src)
3263 int push_items = 0;
3264 int max_push;
3265 int src_nritems;
3266 int dst_nritems;
3267 int ret = 0;
3269 WARN_ON(btrfs_header_generation(src) != trans->transid);
3270 WARN_ON(btrfs_header_generation(dst) != trans->transid);
3272 src_nritems = btrfs_header_nritems(src);
3273 dst_nritems = btrfs_header_nritems(dst);
3274 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3275 if (push_items <= 0)
3276 return 1;
3278 if (src_nritems < 4)
3279 return 1;
3281 max_push = src_nritems / 2 + 1;
3282 /* don't try to empty the node */
3283 if (max_push >= src_nritems)
3284 return 1;
3286 if (max_push < push_items)
3287 push_items = max_push;
3289 tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3290 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3291 btrfs_node_key_ptr_offset(0),
3292 (dst_nritems) *
3293 sizeof(struct btrfs_key_ptr));
3295 ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3296 src_nritems - push_items, push_items);
3297 if (ret) {
3298 btrfs_abort_transaction(trans, root, ret);
3299 return ret;
3301 copy_extent_buffer(dst, src,
3302 btrfs_node_key_ptr_offset(0),
3303 btrfs_node_key_ptr_offset(src_nritems - push_items),
3304 push_items * sizeof(struct btrfs_key_ptr));
3306 btrfs_set_header_nritems(src, src_nritems - push_items);
3307 btrfs_set_header_nritems(dst, dst_nritems + push_items);
3309 btrfs_mark_buffer_dirty(src);
3310 btrfs_mark_buffer_dirty(dst);
3312 return ret;
3313 }
3315 /*
3316 * helper function to insert a new root level in the tree.
3317 * A new node is allocated, and a single item is inserted to
3318 * point to the existing root
3319 *
3320 * returns zero on success or < 0 on failure.
3321 */
3322 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3323 struct btrfs_root *root,
3324 struct btrfs_path *path, int level)
3326 u64 lower_gen;
3327 struct extent_buffer *lower;
3328 struct extent_buffer *c;
3329 struct extent_buffer *old;
3330 struct btrfs_disk_key lower_key;
3332 BUG_ON(path->nodes[level]);
3333 BUG_ON(path->nodes[level-1] != root->node);
3335 lower = path->nodes[level-1];
3336 if (level == 1)
3337 btrfs_item_key(lower, &lower_key, 0);
3338 else
3339 btrfs_node_key(lower, &lower_key, 0);
3341 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3342 root->root_key.objectid, &lower_key,
3343 level, root->node->start, 0);
3344 if (IS_ERR(c))
3345 return PTR_ERR(c);
3347 root_add_used(root, root->nodesize);
3349 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3350 btrfs_set_header_nritems(c, 1);
3351 btrfs_set_header_level(c, level);
3352 btrfs_set_header_bytenr(c, c->start);
3353 btrfs_set_header_generation(c, trans->transid);
3354 btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3355 btrfs_set_header_owner(c, root->root_key.objectid);
3357 write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(c),
3358 BTRFS_FSID_SIZE);
3360 write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3361 btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
3363 btrfs_set_node_key(c, &lower_key, 0);
3364 btrfs_set_node_blockptr(c, 0, lower->start);
3365 lower_gen = btrfs_header_generation(lower);
3366 WARN_ON(lower_gen != trans->transid);
3368 btrfs_set_node_ptr_generation(c, 0, lower_gen);
3370 btrfs_mark_buffer_dirty(c);
3372 old = root->node;
3373 tree_mod_log_set_root_pointer(root, c, 0);
3374 rcu_assign_pointer(root->node, c);
3376 /* the super has an extra ref to root->node */
3377 free_extent_buffer(old);
3379 add_root_to_dirty_list(root);
3380 extent_buffer_get(c);
3381 path->nodes[level] = c;
3382 path->locks[level] = BTRFS_WRITE_LOCK;
3383 path->slots[level] = 0;
3384 return 0;
3385 }
3387 /*
3388 * worker function to insert a single pointer in a node.
3389 * the node should have enough room for the pointer already
3390 *
3391 * slot and level indicate where you want the key to go, and
3392 * blocknr is the block the key points to.
3393 */
3394 static void insert_ptr(struct btrfs_trans_handle *trans,
3395 struct btrfs_root *root, struct btrfs_path *path,
3396 struct btrfs_disk_key *key, u64 bytenr,
3397 int slot, int level)
3399 struct extent_buffer *lower;
3400 int nritems;
3401 int ret;
3403 BUG_ON(!path->nodes[level]);
3404 btrfs_assert_tree_locked(path->nodes[level]);
3405 lower = path->nodes[level];
3406 nritems = btrfs_header_nritems(lower);
3407 BUG_ON(slot > nritems);
3408 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3409 if (slot != nritems) {
3410 if (level)
3411 tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3412 slot, nritems - slot);
3413 memmove_extent_buffer(lower,
3414 btrfs_node_key_ptr_offset(slot + 1),
3415 btrfs_node_key_ptr_offset(slot),
3416 (nritems - slot) * sizeof(struct btrfs_key_ptr));
3418 if (level) {
3419 ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3420 MOD_LOG_KEY_ADD, GFP_NOFS);
3421 BUG_ON(ret < 0);
3423 btrfs_set_node_key(lower, key, slot);
3424 btrfs_set_node_blockptr(lower, slot, bytenr);
3425 WARN_ON(trans->transid == 0);
3426 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3427 btrfs_set_header_nritems(lower, nritems + 1);
3428 btrfs_mark_buffer_dirty(lower);
3429 }
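/*
 * Editor's note: a generic sketch (not in the original file) of the
 * shift-then-write pattern insert_ptr() applies to the key_ptr array;
 * plain memory is used instead of an extent_buffer for clarity.
 */
static void example_insert_at(u64 *arr, int nritems, int slot, u64 val)
{
	/* open a hole at 'slot' by moving the tail up by one element */
	memmove(&arr[slot + 1], &arr[slot],
		(nritems - slot) * sizeof(*arr));
	arr[slot] = val;
}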
3431 /*
3432 * split the node at the specified level in path in two.
3433 * The path is corrected to point to the appropriate node after the split
3434 *
3435 * Before splitting this tries to make some room in the node by pushing
3436 * left and right, if either one works, it returns right away.
3437 *
3438 * returns 0 on success and < 0 on failure
3439 */
3440 static noinline int split_node(struct btrfs_trans_handle *trans,
3441 struct btrfs_root *root,
3442 struct btrfs_path *path, int level)
3444 struct extent_buffer *c;
3445 struct extent_buffer *split;
3446 struct btrfs_disk_key disk_key;
3447 int mid;
3448 int ret;
3449 u32 c_nritems;
3451 c = path->nodes[level];
3452 WARN_ON(btrfs_header_generation(c) != trans->transid);
3453 if (c == root->node) {
3454 /*
3455 * trying to split the root, let's make a new one
3456 *
3457 * tree mod log: We don't log the removal of the old root in
3458 * insert_new_root, because that root buffer will be kept as a
3459 * normal node. We are going to log removal of half of the
3460 * elements below with tree_mod_log_eb_copy. We're holding a
3461 * tree lock on the buffer, which is why we cannot race with
3462 * other tree_mod_log users.
3463 */
3464 ret = insert_new_root(trans, root, path, level + 1);
3465 if (ret)
3466 return ret;
3467 } else {
3468 ret = push_nodes_for_insert(trans, root, path, level);
3469 c = path->nodes[level];
3470 if (!ret && btrfs_header_nritems(c) <
3471 BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3472 return 0;
3473 if (ret < 0)
3474 return ret;
3477 c_nritems = btrfs_header_nritems(c);
3478 mid = (c_nritems + 1) / 2;
3479 btrfs_node_key(c, &disk_key, mid);
3481 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3482 root->root_key.objectid,
3483 &disk_key, level, c->start, 0);
3484 if (IS_ERR(split))
3485 return PTR_ERR(split);
3487 root_add_used(root, root->nodesize);
3489 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3490 btrfs_set_header_level(split, btrfs_header_level(c));
3491 btrfs_set_header_bytenr(split, split->start);
3492 btrfs_set_header_generation(split, trans->transid);
3493 btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3494 btrfs_set_header_owner(split, root->root_key.objectid);
3495 write_extent_buffer(split, root->fs_info->fsid,
3496 btrfs_header_fsid(split), BTRFS_FSID_SIZE);
3497 write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3498 btrfs_header_chunk_tree_uuid(split),
3499 BTRFS_UUID_SIZE);
3501 ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
3502 mid, c_nritems - mid);
3503 if (ret) {
3504 btrfs_abort_transaction(trans, root, ret);
3505 return ret;
3507 copy_extent_buffer(split, c,
3508 btrfs_node_key_ptr_offset(0),
3509 btrfs_node_key_ptr_offset(mid),
3510 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3511 btrfs_set_header_nritems(split, c_nritems - mid);
3512 btrfs_set_header_nritems(c, mid);
3513 ret = 0;
3515 btrfs_mark_buffer_dirty(c);
3516 btrfs_mark_buffer_dirty(split);
3518 insert_ptr(trans, root, path, &disk_key, split->start,
3519 path->slots[level + 1] + 1, level + 1);
3521 if (path->slots[level] >= mid) {
3522 path->slots[level] -= mid;
3523 btrfs_tree_unlock(c);
3524 free_extent_buffer(c);
3525 path->nodes[level] = split;
3526 path->slots[level + 1] += 1;
3527 } else {
3528 btrfs_tree_unlock(split);
3529 free_extent_buffer(split);
3530 }
3531 return ret;
3532 }
3534 /*
3535 * how many bytes are required to store the items in a leaf. start
3536 * and nr indicate which items in the leaf to check. This totals up the
3537 * space used both by the item structs and the item data
3538 */
3539 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3541 struct btrfs_item *start_item;
3542 struct btrfs_item *end_item;
3543 struct btrfs_map_token token;
3544 int data_len;
3545 int nritems = btrfs_header_nritems(l);
3546 int end = min(nritems, start + nr) - 1;
3548 if (!nr)
3549 return 0;
3550 btrfs_init_map_token(&token);
3551 start_item = btrfs_item_nr(l, start);
3552 end_item = btrfs_item_nr(l, end);
3553 data_len = btrfs_token_item_offset(l, start_item, &token) +
3554 btrfs_token_item_size(l, start_item, &token);
3555 data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3556 data_len += sizeof(struct btrfs_item) * nr;
3557 WARN_ON(data_len < 0);
3558 return data_len;
3559 }
3561 /*
3562 * The space between the end of the leaf items and
3563 * the start of the leaf data. IOW, how much room
3564 * the leaf has left for both items and data
3565 */
3566 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3567 struct extent_buffer *leaf)
3569 int nritems = btrfs_header_nritems(leaf);
3570 int ret;
3571 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3572 if (ret < 0) {
3573 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
3574 "used %d nritems %d\n",
3575 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3576 leaf_space_used(leaf, 0, nritems), nritems);
3577 }
3578 return ret;
3579 }
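/*
 * Editor's note (worked sketch, not in the original file): item headers
 * grow forward from the start of the leaf while item data grows backward
 * from the end, so for a leaf holding N items:
 *
 *	used = N * sizeof(struct btrfs_item)
 *	     + (offset of first item's data + its size
 *	        - offset of last item's data)
 *	free = BTRFS_LEAF_DATA_SIZE(root) - used
 *
 * which is what leaf_space_used() and btrfs_leaf_free_space() compute.
 */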
3581 /*
3582 * min slot controls the lowest index we're willing to push to the
3583 * right. We'll push up to and including min_slot, but no lower
3584 */
3585 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3586 struct btrfs_root *root,
3587 struct btrfs_path *path,
3588 int data_size, int empty,
3589 struct extent_buffer *right,
3590 int free_space, u32 left_nritems,
3591 u32 min_slot)
3593 struct extent_buffer *left = path->nodes[0];
3594 struct extent_buffer *upper = path->nodes[1];
3595 struct btrfs_map_token token;
3596 struct btrfs_disk_key disk_key;
3597 int slot;
3598 u32 i;
3599 int push_space = 0;
3600 int push_items = 0;
3601 struct btrfs_item *item;
3602 u32 nr;
3603 u32 right_nritems;
3604 u32 data_end;
3605 u32 this_item_size;
3607 btrfs_init_map_token(&token);
3609 if (empty)
3610 nr = 0;
3611 else
3612 nr = max_t(u32, 1, min_slot);
3614 if (path->slots[0] >= left_nritems)
3615 push_space += data_size;
3617 slot = path->slots[1];
3618 i = left_nritems - 1;
3619 while (i >= nr) {
3620 item = btrfs_item_nr(left, i);
3622 if (!empty && push_items > 0) {
3623 if (path->slots[0] > i)
3624 break;
3625 if (path->slots[0] == i) {
3626 int space = btrfs_leaf_free_space(root, left);
3627 if (space + push_space * 2 > free_space)
3628 break;
3632 if (path->slots[0] == i)
3633 push_space += data_size;
3635 this_item_size = btrfs_item_size(left, item);
3636 if (this_item_size + sizeof(*item) + push_space > free_space)
3637 break;
3639 push_items++;
3640 push_space += this_item_size + sizeof(*item);
3641 if (i == 0)
3642 break;
3643 i--;
3646 if (push_items == 0)
3647 goto out_unlock;
3649 WARN_ON(!empty && push_items == left_nritems);
3651 /* push left to right */
3652 right_nritems = btrfs_header_nritems(right);
3654 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3655 push_space -= leaf_data_end(root, left);
3657 /* make room in the right data area */
3658 data_end = leaf_data_end(root, right);
3659 memmove_extent_buffer(right,
3660 btrfs_leaf_data(right) + data_end - push_space,
3661 btrfs_leaf_data(right) + data_end,
3662 BTRFS_LEAF_DATA_SIZE(root) - data_end);
3664 /* copy from the left data area */
3665 copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3666 BTRFS_LEAF_DATA_SIZE(root) - push_space,
3667 btrfs_leaf_data(left) + leaf_data_end(root, left),
3668 push_space);
3670 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3671 btrfs_item_nr_offset(0),
3672 right_nritems * sizeof(struct btrfs_item));
3674 /* copy the items from left to right */
3675 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3676 btrfs_item_nr_offset(left_nritems - push_items),
3677 push_items * sizeof(struct btrfs_item));
3679 /* update the item pointers */
3680 right_nritems += push_items;
3681 btrfs_set_header_nritems(right, right_nritems);
3682 push_space = BTRFS_LEAF_DATA_SIZE(root);
3683 for (i = 0; i < right_nritems; i++) {
3684 item = btrfs_item_nr(right, i);
3685 push_space -= btrfs_token_item_size(right, item, &token);
3686 btrfs_set_token_item_offset(right, item, push_space, &token);
3689 left_nritems -= push_items;
3690 btrfs_set_header_nritems(left, left_nritems);
3692 if (left_nritems)
3693 btrfs_mark_buffer_dirty(left);
3694 else
3695 clean_tree_block(trans, root, left);
3697 btrfs_mark_buffer_dirty(right);
3699 btrfs_item_key(right, &disk_key, 0);
3700 btrfs_set_node_key(upper, &disk_key, slot + 1);
3701 btrfs_mark_buffer_dirty(upper);
3703 /* then fixup the leaf pointer in the path */
3704 if (path->slots[0] >= left_nritems) {
3705 path->slots[0] -= left_nritems;
3706 if (btrfs_header_nritems(path->nodes[0]) == 0)
3707 clean_tree_block(trans, root, path->nodes[0]);
3708 btrfs_tree_unlock(path->nodes[0]);
3709 free_extent_buffer(path->nodes[0]);
3710 path->nodes[0] = right;
3711 path->slots[1] += 1;
3712 } else {
3713 btrfs_tree_unlock(right);
3714 free_extent_buffer(right);
3716 return 0;
3718 out_unlock:
3719 btrfs_tree_unlock(right);
3720 free_extent_buffer(right);
3721 return 1;
3722 }
3724 /*
3725 * push some data in the path leaf to the right, trying to free up at
3726 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3727 *
3728 * returns 1 if the push failed because the other node didn't have enough
3729 * room, 0 if everything worked out and < 0 if there were major errors.
3730 *
3731 * this will push starting from min_slot to the end of the leaf. It won't
3732 * push any slot lower than min_slot
3733 */
3734 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3735 *root, struct btrfs_path *path,
3736 int min_data_size, int data_size,
3737 int empty, u32 min_slot)
3739 struct extent_buffer *left = path->nodes[0];
3740 struct extent_buffer *right;
3741 struct extent_buffer *upper;
3742 int slot;
3743 int free_space;
3744 u32 left_nritems;
3745 int ret;
3747 if (!path->nodes[1])
3748 return 1;
3750 slot = path->slots[1];
3751 upper = path->nodes[1];
3752 if (slot >= btrfs_header_nritems(upper) - 1)
3753 return 1;
3755 btrfs_assert_tree_locked(path->nodes[1]);
3757 right = read_node_slot(root, upper, slot + 1);
3758 if (right == NULL)
3759 return 1;
3761 btrfs_tree_lock(right);
3762 btrfs_set_lock_blocking(right);
3764 free_space = btrfs_leaf_free_space(root, right);
3765 if (free_space < data_size)
3766 goto out_unlock;
3768 /* cow and double check */
3769 ret = btrfs_cow_block(trans, root, right, upper,
3770 slot + 1, &right);
3771 if (ret)
3772 goto out_unlock;
3774 free_space = btrfs_leaf_free_space(root, right);
3775 if (free_space < data_size)
3776 goto out_unlock;
3778 left_nritems = btrfs_header_nritems(left);
3779 if (left_nritems == 0)
3780 goto out_unlock;
3782 return __push_leaf_right(trans, root, path, min_data_size, empty,
3783 right, free_space, left_nritems, min_slot);
3784 out_unlock:
3785 btrfs_tree_unlock(right);
3786 free_extent_buffer(right);
3787 return 1;
3788 }
3790 /*
3791 * push some data in the path leaf to the left, trying to free up at
3792 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3793 *
3794 * max_slot can put a limit on how far into the leaf we'll push items. The
3795 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3796 * items
3797 */
3798 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3799 struct btrfs_root *root,
3800 struct btrfs_path *path, int data_size,
3801 int empty, struct extent_buffer *left,
3802 int free_space, u32 right_nritems,
3803 u32 max_slot)
3805 struct btrfs_disk_key disk_key;
3806 struct extent_buffer *right = path->nodes[0];
3807 int i;
3808 int push_space = 0;
3809 int push_items = 0;
3810 struct btrfs_item *item;
3811 u32 old_left_nritems;
3812 u32 nr;
3813 int ret = 0;
3814 u32 this_item_size;
3815 u32 old_left_item_size;
3816 struct btrfs_map_token token;
3818 btrfs_init_map_token(&token);
3820 if (empty)
3821 nr = min(right_nritems, max_slot);
3822 else
3823 nr = min(right_nritems - 1, max_slot);
3825 for (i = 0; i < nr; i++) {
3826 item = btrfs_item_nr(right, i);
3828 if (!empty && push_items > 0) {
3829 if (path->slots[0] < i)
3830 break;
3831 if (path->slots[0] == i) {
3832 int space = btrfs_leaf_free_space(root, right);
3833 if (space + push_space * 2 > free_space)
3834 break;
3838 if (path->slots[0] == i)
3839 push_space += data_size;
3841 this_item_size = btrfs_item_size(right, item);
3842 if (this_item_size + sizeof(*item) + push_space > free_space)
3843 break;
3845 push_items++;
3846 push_space += this_item_size + sizeof(*item);
3849 if (push_items == 0) {
3850 ret = 1;
3851 goto out;
3853 if (!empty && push_items == btrfs_header_nritems(right))
3854 WARN_ON(1);
3856 /* push data from right to left */
3857 copy_extent_buffer(left, right,
3858 btrfs_item_nr_offset(btrfs_header_nritems(left)),
3859 btrfs_item_nr_offset(0),
3860 push_items * sizeof(struct btrfs_item));
3862 push_space = BTRFS_LEAF_DATA_SIZE(root) -
3863 btrfs_item_offset_nr(right, push_items - 1);
3865 copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3866 leaf_data_end(root, left) - push_space,
3867 btrfs_leaf_data(right) +
3868 btrfs_item_offset_nr(right, push_items - 1),
3869 push_space);
3870 old_left_nritems = btrfs_header_nritems(left);
3871 BUG_ON(old_left_nritems <= 0);
3873 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3874 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3875 u32 ioff;
3877 item = btrfs_item_nr(left, i);
3879 ioff = btrfs_token_item_offset(left, item, &token);
3880 btrfs_set_token_item_offset(left, item,
3881 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3882 &token);
3884 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3886 /* fixup right node */
3887 if (push_items > right_nritems)
3888 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3889 right_nritems);
3891 if (push_items < right_nritems) {
3892 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3893 leaf_data_end(root, right);
3894 memmove_extent_buffer(right, btrfs_leaf_data(right) +
3895 BTRFS_LEAF_DATA_SIZE(root) - push_space,
3896 btrfs_leaf_data(right) +
3897 leaf_data_end(root, right), push_space);
3899 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3900 btrfs_item_nr_offset(push_items),
3901 (btrfs_header_nritems(right) - push_items) *
3902 sizeof(struct btrfs_item));
3904 right_nritems -= push_items;
3905 btrfs_set_header_nritems(right, right_nritems);
3906 push_space = BTRFS_LEAF_DATA_SIZE(root);
3907 for (i = 0; i < right_nritems; i++) {
3908 item = btrfs_item_nr(right, i);
3910 push_space = push_space - btrfs_token_item_size(right,
3911 item, &token);
3912 btrfs_set_token_item_offset(right, item, push_space, &token);
3915 btrfs_mark_buffer_dirty(left);
3916 if (right_nritems)
3917 btrfs_mark_buffer_dirty(right);
3918 else
3919 clean_tree_block(trans, root, right);
3921 btrfs_item_key(right, &disk_key, 0);
3922 fixup_low_keys(root, path, &disk_key, 1);
3924 /* then fixup the leaf pointer in the path */
3925 if (path->slots[0] < push_items) {
3926 path->slots[0] += old_left_nritems;
3927 btrfs_tree_unlock(path->nodes[0]);
3928 free_extent_buffer(path->nodes[0]);
3929 path->nodes[0] = left;
3930 path->slots[1] -= 1;
3931 } else {
3932 btrfs_tree_unlock(left);
3933 free_extent_buffer(left);
3934 path->slots[0] -= push_items;
3936 BUG_ON(path->slots[0] < 0);
3937 return ret;
3938 out:
3939 btrfs_tree_unlock(left);
3940 free_extent_buffer(left);
3941 return ret;
3942 }
3944 /*
3945 * push some data in the path leaf to the left, trying to free up at
3946 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3947 *
3948 * max_slot can put a limit on how far into the leaf we'll push items. The
3949 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3950 * items
3951 */
3952 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3953 *root, struct btrfs_path *path, int min_data_size,
3954 int data_size, int empty, u32 max_slot)
3956 struct extent_buffer *right = path->nodes[0];
3957 struct extent_buffer *left;
3958 int slot;
3959 int free_space;
3960 u32 right_nritems;
3961 int ret = 0;
3963 slot = path->slots[1];
3964 if (slot == 0)
3965 return 1;
3966 if (!path->nodes[1])
3967 return 1;
3969 right_nritems = btrfs_header_nritems(right);
3970 if (right_nritems == 0)
3971 return 1;
3973 btrfs_assert_tree_locked(path->nodes[1]);
3975 left = read_node_slot(root, path->nodes[1], slot - 1);
3976 if (left == NULL)
3977 return 1;
3979 btrfs_tree_lock(left);
3980 btrfs_set_lock_blocking(left);
3982 free_space = btrfs_leaf_free_space(root, left);
3983 if (free_space < data_size) {
3984 ret = 1;
3985 goto out;
3988 /* cow and double check */
3989 ret = btrfs_cow_block(trans, root, left,
3990 path->nodes[1], slot - 1, &left);
3991 if (ret) {
3992 /* we hit -ENOSPC, but it isn't fatal here */
3993 if (ret == -ENOSPC)
3994 ret = 1;
3995 goto out;
3998 free_space = btrfs_leaf_free_space(root, left);
3999 if (free_space < data_size) {
4000 ret = 1;
4001 goto out;
4004 return __push_leaf_left(trans, root, path, min_data_size,
4005 empty, left, free_space, right_nritems,
4006 max_slot);
4007 out:
4008 btrfs_tree_unlock(left);
4009 free_extent_buffer(left);
4010 return ret;
4011 }
4013 /*
4014 * split the path's leaf in two, making sure there is at least data_size
4015 * available for the resulting leaf level of the path.
4016 */
4017 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4018 struct btrfs_root *root,
4019 struct btrfs_path *path,
4020 struct extent_buffer *l,
4021 struct extent_buffer *right,
4022 int slot, int mid, int nritems)
4024 int data_copy_size;
4025 int rt_data_off;
4026 int i;
4027 struct btrfs_disk_key disk_key;
4028 struct btrfs_map_token token;
4030 btrfs_init_map_token(&token);
4032 nritems = nritems - mid;
4033 btrfs_set_header_nritems(right, nritems);
4034 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
4036 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4037 btrfs_item_nr_offset(mid),
4038 nritems * sizeof(struct btrfs_item));
4040 copy_extent_buffer(right, l,
4041 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
4042 data_copy_size, btrfs_leaf_data(l) +
4043 leaf_data_end(root, l), data_copy_size);
4045 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
4046 btrfs_item_end_nr(l, mid);
4048 for (i = 0; i < nritems; i++) {
4049 struct btrfs_item *item = btrfs_item_nr(right, i);
4050 u32 ioff;
4052 ioff = btrfs_token_item_offset(right, item, &token);
4053 btrfs_set_token_item_offset(right, item,
4054 ioff + rt_data_off, &token);
4057 btrfs_set_header_nritems(l, mid);
4058 btrfs_item_key(right, &disk_key, 0);
4059 insert_ptr(trans, root, path, &disk_key, right->start,
4060 path->slots[1] + 1, 1);
4062 btrfs_mark_buffer_dirty(right);
4063 btrfs_mark_buffer_dirty(l);
4064 BUG_ON(path->slots[0] != slot);
4066 if (mid <= slot) {
4067 btrfs_tree_unlock(path->nodes[0]);
4068 free_extent_buffer(path->nodes[0]);
4069 path->nodes[0] = right;
4070 path->slots[0] -= mid;
4071 path->slots[1] += 1;
4072 } else {
4073 btrfs_tree_unlock(right);
4074 free_extent_buffer(right);
4077 BUG_ON(path->slots[0] < 0);
4078 }
4080 /*
4081 * double splits happen when we need to insert a big item in the middle
4082 * of a leaf. A double split can leave us with 3 mostly empty leaves:
4083 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4084 * A B C
4085 *
4086 * We avoid this by trying to push the items on either side of our target
4087 * into the adjacent leaves. If all goes well we can avoid the double split
4088 * completely.
4089 */
4090 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4091 struct btrfs_root *root,
4092 struct btrfs_path *path,
4093 int data_size)
4095 int ret;
4096 int progress = 0;
4097 int slot;
4098 u32 nritems;
4100 slot = path->slots[0];
4102 /*
4103 * try to push all the items after our slot into the
4104 * right leaf
4105 */
4106 ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
4107 if (ret < 0)
4108 return ret;
4110 if (ret == 0)
4111 progress++;
4113 nritems = btrfs_header_nritems(path->nodes[0]);
4114 /*
4115 * our goal is to get our slot at the start or end of a leaf. If
4116 * we've done so we're done
4117 */
4118 if (path->slots[0] == 0 || path->slots[0] == nritems)
4119 return 0;
4121 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4122 return 0;
4124 /* try to push all the items before our slot into the next leaf */
4125 slot = path->slots[0];
4126 ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
4127 if (ret < 0)
4128 return ret;
4130 if (ret == 0)
4131 progress++;
4133 if (progress)
4134 return 0;
4135 return 1;
4139 * split the path's leaf in two, making sure there is at least data_size
4140 * available for the resulting leaf level of the path.
4142 * returns 0 if all went well and < 0 on failure.
4143 */
4144 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4145 struct btrfs_root *root,
4146 struct btrfs_key *ins_key,
4147 struct btrfs_path *path, int data_size,
4148 int extend)
4149 {
4150 struct btrfs_disk_key disk_key;
4151 struct extent_buffer *l;
4152 u32 nritems;
4153 int mid;
4154 int slot;
4155 struct extent_buffer *right;
4156 int ret = 0;
4157 int wret;
4158 int split;
4159 int num_doubles = 0;
4160 int tried_avoid_double = 0;
4162 l = path->nodes[0];
4163 slot = path->slots[0];
4164 if (extend && data_size + btrfs_item_size_nr(l, slot) +
4165 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
4166 return -EOVERFLOW;
4168 /* first try to make some room by pushing left and right */
4169 if (data_size && path->nodes[1]) {
4170 wret = push_leaf_right(trans, root, path, data_size,
4171 data_size, 0, 0);
4172 if (wret < 0)
4173 return wret;
4174 if (wret) {
4175 wret = push_leaf_left(trans, root, path, data_size,
4176 data_size, 0, (u32)-1);
4177 if (wret < 0)
4178 return wret;
4179 }
4180 l = path->nodes[0];
4182 /* did the pushes work? */
4183 if (btrfs_leaf_free_space(root, l) >= data_size)
4184 return 0;
4185 }
4187 if (!path->nodes[1]) {
4188 ret = insert_new_root(trans, root, path, 1);
4189 if (ret)
4190 return ret;
4191 }
4192 again:
4193 split = 1;
4194 l = path->nodes[0];
4195 slot = path->slots[0];
4196 nritems = btrfs_header_nritems(l);
4197 mid = (nritems + 1) / 2;
4199 if (mid <= slot) {
4200 if (nritems == 1 ||
4201 leaf_space_used(l, mid, nritems - mid) + data_size >
4202 BTRFS_LEAF_DATA_SIZE(root)) {
4203 if (slot >= nritems) {
4204 split = 0;
4205 } else {
4206 mid = slot;
4207 if (mid != nritems &&
4208 leaf_space_used(l, mid, nritems - mid) +
4209 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4210 if (data_size && !tried_avoid_double)
4211 goto push_for_double;
4212 split = 2;
4213 }
4214 }
4215 }
4216 } else {
4217 if (leaf_space_used(l, 0, mid) + data_size >
4218 BTRFS_LEAF_DATA_SIZE(root)) {
4219 if (!extend && data_size && slot == 0) {
4220 split = 0;
4221 } else if ((extend || !data_size) && slot == 0) {
4222 mid = 1;
4223 } else {
4224 mid = slot;
4225 if (mid != nritems &&
4226 leaf_space_used(l, mid, nritems - mid) +
4227 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4228 if (data_size && !tried_avoid_double)
4229 goto push_for_double;
4230 split = 2;
4231 }
4232 }
4233 }
4234 }
4236 if (split == 0)
4237 btrfs_cpu_key_to_disk(&disk_key, ins_key);
4238 else
4239 btrfs_item_key(l, &disk_key, mid);
4241 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
4242 root->root_key.objectid,
4243 &disk_key, 0, l->start, 0);
4244 if (IS_ERR(right))
4245 return PTR_ERR(right);
4247 root_add_used(root, root->leafsize);
4249 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4250 btrfs_set_header_bytenr(right, right->start);
4251 btrfs_set_header_generation(right, trans->transid);
4252 btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4253 btrfs_set_header_owner(right, root->root_key.objectid);
4254 btrfs_set_header_level(right, 0);
4255 write_extent_buffer(right, root->fs_info->fsid,
4256 btrfs_header_fsid(right), BTRFS_FSID_SIZE);
4258 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
4259 btrfs_header_chunk_tree_uuid(right),
4260 BTRFS_UUID_SIZE);
4262 if (split == 0) {
4263 if (mid <= slot) {
4264 btrfs_set_header_nritems(right, 0);
4265 insert_ptr(trans, root, path, &disk_key, right->start,
4266 path->slots[1] + 1, 1);
4267 btrfs_tree_unlock(path->nodes[0]);
4268 free_extent_buffer(path->nodes[0]);
4269 path->nodes[0] = right;
4270 path->slots[0] = 0;
4271 path->slots[1] += 1;
4272 } else {
4273 btrfs_set_header_nritems(right, 0);
4274 insert_ptr(trans, root, path, &disk_key, right->start,
4275 path->slots[1], 1);
4276 btrfs_tree_unlock(path->nodes[0]);
4277 free_extent_buffer(path->nodes[0]);
4278 path->nodes[0] = right;
4279 path->slots[0] = 0;
4280 if (path->slots[1] == 0)
4281 fixup_low_keys(root, path, &disk_key, 1);
4282 }
4283 btrfs_mark_buffer_dirty(right);
4284 return ret;
4285 }
4287 copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4289 if (split == 2) {
4290 BUG_ON(num_doubles != 0);
4291 num_doubles++;
4292 goto again;
4293 }
4295 return 0;
4297 push_for_double:
4298 push_for_double_split(trans, root, path, data_size);
4299 tried_avoid_double = 1;
4300 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4301 return 0;
4302 goto again;
4303 }
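/*
 * Example: a minimal sketch of how split_leaf is reached. Item producers
 * never call it directly; it runs inside btrfs_search_slot() whenever the
 * target leaf lacks ins_len bytes of free space. The helper name and
 * parameters below are hypothetical, not part of the btrfs API; it assumes
 * the caller already holds a joined transaction handle.
 */
static int __maybe_unused example_reserve_room(struct btrfs_trans_handle *trans,
					       struct btrfs_root *root,
					       struct btrfs_key *key, u32 size)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/*
	 * ins_len counts the item header too; cow=1 makes the whole path
	 * writable. split_leaf runs under the covers if the leaf is full.
	 */
	ret = btrfs_search_slot(trans, root, key, path,
				size + sizeof(struct btrfs_item), 1);
	if (ret == 0)
		ret = -EEXIST;	/* key already present */
	else if (ret > 0)
		ret = 0;	/* slot found, leaf now has room */
	btrfs_free_path(path);
	return ret;
}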
4305 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4306 struct btrfs_root *root,
4307 struct btrfs_path *path, int ins_len)
4308 {
4309 struct btrfs_key key;
4310 struct extent_buffer *leaf;
4311 struct btrfs_file_extent_item *fi;
4312 u64 extent_len = 0;
4313 u32 item_size;
4314 int ret;
4316 leaf = path->nodes[0];
4317 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4319 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4320 key.type != BTRFS_EXTENT_CSUM_KEY);
4322 if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4323 return 0;
4325 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4326 if (key.type == BTRFS_EXTENT_DATA_KEY) {
4327 fi = btrfs_item_ptr(leaf, path->slots[0],
4328 struct btrfs_file_extent_item);
4329 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4330 }
4331 btrfs_release_path(path);
4333 path->keep_locks = 1;
4334 path->search_for_split = 1;
4335 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4336 path->search_for_split = 0;
4337 if (ret < 0)
4338 goto err;
4340 ret = -EAGAIN;
4341 leaf = path->nodes[0];
4342 /* if our item isn't there or got smaller, return now */
4343 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4344 goto err;
4346 /* the leaf has changed, it now has room. return now */
4347 if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4348 goto err;
4350 if (key.type == BTRFS_EXTENT_DATA_KEY) {
4351 fi = btrfs_item_ptr(leaf, path->slots[0],
4352 struct btrfs_file_extent_item);
4353 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4354 goto err;
4355 }
4357 btrfs_set_path_blocking(path);
4358 ret = split_leaf(trans, root, &key, path, ins_len, 1);
4359 if (ret)
4360 goto err;
4362 path->keep_locks = 0;
4363 btrfs_unlock_up_safe(path, 1);
4364 return 0;
4365 err:
4366 path->keep_locks = 0;
4367 return ret;
4368 }
4370 static noinline int split_item(struct btrfs_trans_handle *trans,
4371 struct btrfs_root *root,
4372 struct btrfs_path *path,
4373 struct btrfs_key *new_key,
4374 unsigned long split_offset)
4375 {
4376 struct extent_buffer *leaf;
4377 struct btrfs_item *item;
4378 struct btrfs_item *new_item;
4379 int slot;
4380 char *buf;
4381 u32 nritems;
4382 u32 item_size;
4383 u32 orig_offset;
4384 struct btrfs_disk_key disk_key;
4386 leaf = path->nodes[0];
4387 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4389 btrfs_set_path_blocking(path);
4391 item = btrfs_item_nr(leaf, path->slots[0]);
4392 orig_offset = btrfs_item_offset(leaf, item);
4393 item_size = btrfs_item_size(leaf, item);
4395 buf = kmalloc(item_size, GFP_NOFS);
4396 if (!buf)
4397 return -ENOMEM;
4399 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4400 path->slots[0]), item_size);
4402 slot = path->slots[0] + 1;
4403 nritems = btrfs_header_nritems(leaf);
4404 if (slot != nritems) {
4405 /* shift the items */
4406 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4407 btrfs_item_nr_offset(slot),
4408 (nritems - slot) * sizeof(struct btrfs_item));
4409 }
4411 btrfs_cpu_key_to_disk(&disk_key, new_key);
4412 btrfs_set_item_key(leaf, &disk_key, slot);
4414 new_item = btrfs_item_nr(leaf, slot);
4416 btrfs_set_item_offset(leaf, new_item, orig_offset);
4417 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4419 btrfs_set_item_offset(leaf, item,
4420 orig_offset + item_size - split_offset);
4421 btrfs_set_item_size(leaf, item, split_offset);
4423 btrfs_set_header_nritems(leaf, nritems + 1);
4425 /* write the data for the start of the original item */
4426 write_extent_buffer(leaf, buf,
4427 btrfs_item_ptr_offset(leaf, path->slots[0]),
4428 split_offset);
4430 /* write the data for the new item */
4431 write_extent_buffer(leaf, buf + split_offset,
4432 btrfs_item_ptr_offset(leaf, slot),
4433 item_size - split_offset);
4434 btrfs_mark_buffer_dirty(leaf);
4436 BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4437 kfree(buf);
4438 return 0;
4439 }
4441 /*
4442 * This function splits a single item into two items,
4443 * giving 'new_key' to the new item and splitting the
4444 * old one at split_offset (from the start of the item).
4446 * The path may be released by this operation. After
4447 * the split, the path is pointing to the old item. The
4448 * new item is going to be in the same node as the old one.
4450 * Note, the item being split must be small enough to live alone on
4451 * a tree block with room for one extra struct btrfs_item
4453 * This allows us to split the item in place, keeping a lock on the
4454 * leaf the entire time.
4455 */
4456 int btrfs_split_item(struct btrfs_trans_handle *trans,
4457 struct btrfs_root *root,
4458 struct btrfs_path *path,
4459 struct btrfs_key *new_key,
4460 unsigned long split_offset)
4461 {
4462 int ret;
4463 ret = setup_leaf_for_split(trans, root, path,
4464 sizeof(struct btrfs_item));
4465 if (ret)
4466 return ret;
4468 ret = split_item(trans, root, path, new_key, split_offset);
4469 return ret;
4470 }
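/*
 * Example: typical use of btrfs_split_item. A hypothetical sketch, not
 * part of the original file: the path is assumed to already point at an
 * existing item, 'new_offset' names the key offset for the tail half and
 * 'cut_at' the byte at which to cut.
 */
static int __maybe_unused example_split_item(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     struct btrfs_path *path,
					     u64 new_offset, u32 cut_at)
{
	struct btrfs_key new_key;

	/* keep objectid/type, give the second half its own offset */
	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	new_key.offset = new_offset;

	/*
	 * On success the path still points at the first 'cut_at' bytes;
	 * the remainder lives in the next slot under 'new_key'.
	 */
	return btrfs_split_item(trans, root, path, &new_key, cut_at);
}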
4472 /*
4473 * This function duplicates an item, giving 'new_key' to the new item.
4474 * It guarantees both items live in the same tree leaf and the new item
4475 * is contiguous with the original item.
4477 * This allows us to split a file extent in place, keeping a lock on the
4478 * leaf the entire time.
4479 */
4480 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4481 struct btrfs_root *root,
4482 struct btrfs_path *path,
4483 struct btrfs_key *new_key)
4484 {
4485 struct extent_buffer *leaf;
4486 int ret;
4487 u32 item_size;
4489 leaf = path->nodes[0];
4490 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4491 ret = setup_leaf_for_split(trans, root, path,
4492 item_size + sizeof(struct btrfs_item));
4493 if (ret)
4494 return ret;
4496 path->slots[0]++;
4497 setup_items_for_insert(root, path, new_key, &item_size,
4498 item_size, item_size +
4499 sizeof(struct btrfs_item), 1);
4500 leaf = path->nodes[0];
4501 memcpy_extent_buffer(leaf,
4502 btrfs_item_ptr_offset(leaf, path->slots[0]),
4503 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4504 item_size);
4505 return 0;
4506 }
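/*
 * Example: cloning an item with btrfs_duplicate_item. A hypothetical
 * sketch, not part of the original file: the path is assumed to sit on a
 * file extent item; on return it points at the copy, which the caller
 * edits in place (how file extents get split without dropping the leaf
 * lock).
 */
static int __maybe_unused example_dup_item(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 split_pos)
{
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	key.offset = split_pos;	/* second half starts here in the file */

	/* both items end up in the same leaf, the path on the new one */
	return btrfs_duplicate_item(trans, root, path, &key);
}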
4508 /*
4509 * make the item pointed to by the path smaller. new_size indicates
4510 * how small to make it, and from_end tells us if we just chop bytes
4511 * off the end of the item or if we shift the item to chop bytes off
4512 * the front.
4513 */
4514 void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4515 u32 new_size, int from_end)
4516 {
4517 int slot;
4518 struct extent_buffer *leaf;
4519 struct btrfs_item *item;
4520 u32 nritems;
4521 unsigned int data_end;
4522 unsigned int old_data_start;
4523 unsigned int old_size;
4524 unsigned int size_diff;
4525 int i;
4526 struct btrfs_map_token token;
4528 btrfs_init_map_token(&token);
4530 leaf = path->nodes[0];
4531 slot = path->slots[0];
4533 old_size = btrfs_item_size_nr(leaf, slot);
4534 if (old_size == new_size)
4535 return;
4537 nritems = btrfs_header_nritems(leaf);
4538 data_end = leaf_data_end(root, leaf);
4540 old_data_start = btrfs_item_offset_nr(leaf, slot);
4542 size_diff = old_size - new_size;
4544 BUG_ON(slot < 0);
4545 BUG_ON(slot >= nritems);
4547 /*
4548 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4549 */
4550 /* first correct the data pointers */
4551 for (i = slot; i < nritems; i++) {
4552 u32 ioff;
4553 item = btrfs_item_nr(leaf, i);
4555 ioff = btrfs_token_item_offset(leaf, item, &token);
4556 btrfs_set_token_item_offset(leaf, item,
4557 ioff + size_diff, &token);
4558 }
4560 /* shift the data */
4561 if (from_end) {
4562 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4563 data_end + size_diff, btrfs_leaf_data(leaf) +
4564 data_end, old_data_start + new_size - data_end);
4565 } else {
4566 struct btrfs_disk_key disk_key;
4567 u64 offset;
4569 btrfs_item_key(leaf, &disk_key, slot);
4571 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4572 unsigned long ptr;
4573 struct btrfs_file_extent_item *fi;
4575 fi = btrfs_item_ptr(leaf, slot,
4576 struct btrfs_file_extent_item);
4577 fi = (struct btrfs_file_extent_item *)(
4578 (unsigned long)fi - size_diff);
4580 if (btrfs_file_extent_type(leaf, fi) ==
4581 BTRFS_FILE_EXTENT_INLINE) {
4582 ptr = btrfs_item_ptr_offset(leaf, slot);
4583 memmove_extent_buffer(leaf, ptr,
4584 (unsigned long)fi,
4585 offsetof(struct btrfs_file_extent_item,
4586 disk_bytenr));
4587 }
4588 }
4590 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4591 data_end + size_diff, btrfs_leaf_data(leaf) +
4592 data_end, old_data_start - data_end);
4594 offset = btrfs_disk_key_offset(&disk_key);
4595 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4596 btrfs_set_item_key(leaf, &disk_key, slot);
4597 if (slot == 0)
4598 fixup_low_keys(root, path, &disk_key, 1);
4599 }
4601 item = btrfs_item_nr(leaf, slot);
4602 btrfs_set_item_size(leaf, item, new_size);
4603 btrfs_mark_buffer_dirty(leaf);
4605 if (btrfs_leaf_free_space(root, leaf) < 0) {
4606 btrfs_print_leaf(root, leaf);
4607 BUG();
4608 }
4609 }
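/*
 * Example: shrinking an item via btrfs_truncate_item. A hypothetical
 * sketch, not part of the original file. With from_end == 1 the tail of
 * the item data is chopped off; with from_end == 0 the front is dropped
 * and the key offset is bumped by the difference (see the EXTENT_DATA
 * handling above).
 */
static void __maybe_unused example_shrink_item(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u32 drop_bytes)
{
	struct extent_buffer *leaf = path->nodes[0];
	u32 size = btrfs_item_size_nr(leaf, path->slots[0]);

	/* nothing to do if the item would vanish; callers handle that case */
	if (size > drop_bytes)
		btrfs_truncate_item(root, path, size - drop_bytes, 1);
}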
4611 /*
4612 * make the item pointed to by the path bigger, data_size is the added size.
4613 */
4614 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4615 u32 data_size)
4616 {
4617 int slot;
4618 struct extent_buffer *leaf;
4619 struct btrfs_item *item;
4620 u32 nritems;
4621 unsigned int data_end;
4622 unsigned int old_data;
4623 unsigned int old_size;
4624 int i;
4625 struct btrfs_map_token token;
4627 btrfs_init_map_token(&token);
4629 leaf = path->nodes[0];
4631 nritems = btrfs_header_nritems(leaf);
4632 data_end = leaf_data_end(root, leaf);
4634 if (btrfs_leaf_free_space(root, leaf) < data_size) {
4635 btrfs_print_leaf(root, leaf);
4636 BUG();
4637 }
4638 slot = path->slots[0];
4639 old_data = btrfs_item_end_nr(leaf, slot);
4641 BUG_ON(slot < 0);
4642 if (slot >= nritems) {
4643 btrfs_print_leaf(root, leaf);
4644 printk(KERN_CRIT "slot %d too large, nritems %d\n",
4645 slot, nritems);
4646 BUG_ON(1);
4647 }
4649 /*
4650 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4651 */
4652 /* first correct the data pointers */
4653 for (i = slot; i < nritems; i++) {
4654 u32 ioff;
4655 item = btrfs_item_nr(leaf, i);
4657 ioff = btrfs_token_item_offset(leaf, item, &token);
4658 btrfs_set_token_item_offset(leaf, item,
4659 ioff - data_size, &token);
4660 }
4662 /* shift the data */
4663 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4664 data_end - data_size, btrfs_leaf_data(leaf) +
4665 data_end, old_data - data_end);
4667 data_end = old_data;
4668 old_size = btrfs_item_size_nr(leaf, slot);
4669 item = btrfs_item_nr(leaf, slot);
4670 btrfs_set_item_size(leaf, item, old_size + data_size);
4671 btrfs_mark_buffer_dirty(leaf);
4673 if (btrfs_leaf_free_space(root, leaf) < 0) {
4674 btrfs_print_leaf(root, leaf);
4675 BUG();
4676 }
4677 }
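/*
 * Example: growing an item with btrfs_extend_item and filling the new
 * space. A hypothetical sketch, not part of the original file. The caller
 * must have guaranteed enough free space in the leaf, e.g. by searching
 * with ins_len == data_size, or the BUG() above fires.
 */
static void __maybe_unused example_grow_item(struct btrfs_root *root,
					     struct btrfs_path *path,
					     const void *data, u32 data_size)
{
	struct extent_buffer *leaf = path->nodes[0];
	u32 old_size = btrfs_item_size_nr(leaf, path->slots[0]);
	unsigned long ptr;

	btrfs_extend_item(root, path, data_size);

	/* the added bytes sit at the end of the (relocated) item data */
	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	write_extent_buffer(leaf, data, ptr + old_size, data_size);
	btrfs_mark_buffer_dirty(leaf);
}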
4679 /*
4680 * this is a helper for btrfs_insert_empty_items, the main goal here is
4681 * to save stack depth by doing the bulk of the work in a function
4682 * that doesn't call btrfs_search_slot
4683 */
4684 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4685 struct btrfs_key *cpu_key, u32 *data_size,
4686 u32 total_data, u32 total_size, int nr)
4687 {
4688 struct btrfs_item *item;
4689 int i;
4690 u32 nritems;
4691 unsigned int data_end;
4692 struct btrfs_disk_key disk_key;
4693 struct extent_buffer *leaf;
4694 int slot;
4695 struct btrfs_map_token token;
4697 btrfs_init_map_token(&token);
4699 leaf = path->nodes[0];
4700 slot = path->slots[0];
4702 nritems = btrfs_header_nritems(leaf);
4703 data_end = leaf_data_end(root, leaf);
4705 if (btrfs_leaf_free_space(root, leaf) < total_size) {
4706 btrfs_print_leaf(root, leaf);
4707 printk(KERN_CRIT "not enough freespace need %u have %d\n",
4708 total_size, btrfs_leaf_free_space(root, leaf));
4709 BUG();
4710 }
4712 if (slot != nritems) {
4713 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4715 if (old_data < data_end) {
4716 btrfs_print_leaf(root, leaf);
4717 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4718 slot, old_data, data_end);
4719 BUG_ON(1);
4720 }
4721 /*
4722 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4723 */
4724 /* first correct the data pointers */
4725 for (i = slot; i < nritems; i++) {
4726 u32 ioff;
4728 item = btrfs_item_nr(leaf, i);
4729 ioff = btrfs_token_item_offset(leaf, item, &token);
4730 btrfs_set_token_item_offset(leaf, item,
4731 ioff - total_data, &token);
4732 }
4733 /* shift the items */
4734 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4735 btrfs_item_nr_offset(slot),
4736 (nritems - slot) * sizeof(struct btrfs_item));
4738 /* shift the data */
4739 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4740 data_end - total_data, btrfs_leaf_data(leaf) +
4741 data_end, old_data - data_end);
4742 data_end = old_data;
4743 }
4745 /* setup the item for the new data */
4746 for (i = 0; i < nr; i++) {
4747 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4748 btrfs_set_item_key(leaf, &disk_key, slot + i);
4749 item = btrfs_item_nr(leaf, slot + i);
4750 btrfs_set_token_item_offset(leaf, item,
4751 data_end - data_size[i], &token);
4752 data_end -= data_size[i];
4753 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4754 }
4756 btrfs_set_header_nritems(leaf, nritems + nr);
4758 if (slot == 0) {
4759 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4760 fixup_low_keys(root, path, &disk_key, 1);
4761 }
4762 btrfs_unlock_up_safe(path, 1);
4763 btrfs_mark_buffer_dirty(leaf);
4765 if (btrfs_leaf_free_space(root, leaf) < 0) {
4766 btrfs_print_leaf(root, leaf);
4767 BUG();
4768 }
4769 }
4771 /*
4772 * Given a key and some data, insert items into the tree.
4773 * This does all the path init required, making room in the tree if needed.
4774 */
4775 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4776 struct btrfs_root *root,
4777 struct btrfs_path *path,
4778 struct btrfs_key *cpu_key, u32 *data_size,
4779 int nr)
4780 {
4781 int ret = 0;
4782 int slot;
4783 int i;
4784 u32 total_size = 0;
4785 u32 total_data = 0;
4787 for (i = 0; i < nr; i++)
4788 total_data += data_size[i];
4790 total_size = total_data + (nr * sizeof(struct btrfs_item));
4791 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4792 if (ret == 0)
4793 return -EEXIST;
4794 if (ret < 0)
4795 return ret;
4797 slot = path->slots[0];
4798 BUG_ON(slot < 0);
4800 setup_items_for_insert(root, path, cpu_key, data_size,
4801 total_data, total_size, nr);
4802 return 0;
4803 }
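/*
 * Example: batch insertion with btrfs_insert_empty_items. A hypothetical
 * sketch, not part of the original file: the two keys must be in key
 * order and both items must fit in one leaf; after the call the path
 * points at the first new item and the caller writes each body.
 */
static int __maybe_unused example_insert_two(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     struct btrfs_key *keys,
					     void **bodies, u32 *sizes)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	unsigned long ptr;
	int i, ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
	if (ret)
		goto out;
	leaf = path->nodes[0];
	for (i = 0; i < 2; i++) {
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0] + i);
		write_extent_buffer(leaf, bodies[i], ptr, sizes[i]);
	}
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}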
4805 /*
4806 * Given a key and some data, insert an item into the tree.
4807 * This does all the path init required, making room in the tree if needed.
4808 */
4809 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4810 *root, struct btrfs_key *cpu_key, void *data, u32
4811 data_size)
4812 {
4813 int ret = 0;
4814 struct btrfs_path *path;
4815 struct extent_buffer *leaf;
4816 unsigned long ptr;
4818 path = btrfs_alloc_path();
4819 if (!path)
4820 return -ENOMEM;
4821 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4822 if (!ret) {
4823 leaf = path->nodes[0];
4824 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4825 write_extent_buffer(leaf, data, ptr, data_size);
4826 btrfs_mark_buffer_dirty(leaf);
4827 }
4828 btrfs_free_path(path);
4829 return ret;
4830 }
4832 /*
4833 * delete the pointer from a given node.
4835 * the tree should have been previously balanced so the deletion does not
4836 * empty a node.
4837 */
4838 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4839 int level, int slot)
4840 {
4841 struct extent_buffer *parent = path->nodes[level];
4842 u32 nritems;
4843 int ret;
4845 nritems = btrfs_header_nritems(parent);
4846 if (slot != nritems - 1) {
4847 if (level)
4848 tree_mod_log_eb_move(root->fs_info, parent, slot,
4849 slot + 1, nritems - slot - 1);
4850 memmove_extent_buffer(parent,
4851 btrfs_node_key_ptr_offset(slot),
4852 btrfs_node_key_ptr_offset(slot + 1),
4853 sizeof(struct btrfs_key_ptr) *
4854 (nritems - slot - 1));
4855 } else if (level) {
4856 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4857 MOD_LOG_KEY_REMOVE, GFP_NOFS);
4858 BUG_ON(ret < 0);
4859 }
4861 nritems--;
4862 btrfs_set_header_nritems(parent, nritems);
4863 if (nritems == 0 && parent == root->node) {
4864 BUG_ON(btrfs_header_level(root->node) != 1);
4865 /* just turn the root into a leaf and break */
4866 btrfs_set_header_level(root->node, 0);
4867 } else if (slot == 0) {
4868 struct btrfs_disk_key disk_key;
4870 btrfs_node_key(parent, &disk_key, 0);
4871 fixup_low_keys(root, path, &disk_key, level + 1);
4872 }
4873 btrfs_mark_buffer_dirty(parent);
4874 }
4876 /*
4877 * a helper function to delete the leaf pointed to by path->slots[1] and
4878 * path->nodes[1].
4880 * This deletes the pointer in path->nodes[1] and frees the leaf
4881 * block extent (this helper returns void).
4883 * The path must have already been setup for deleting the leaf, including
4884 * all the proper balancing. path->nodes[1] must be locked.
4885 */
4886 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4887 struct btrfs_root *root,
4888 struct btrfs_path *path,
4889 struct extent_buffer *leaf)
4890 {
4891 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4892 del_ptr(root, path, 1, path->slots[1]);
4894 /*
4895 * btrfs_free_tree_block is expensive, we want to make sure we
4896 * aren't holding any locks when we call it
4897 */
4898 btrfs_unlock_up_safe(path, 0);
4900 root_sub_used(root, leaf->len);
4902 extent_buffer_get(leaf);
4903 btrfs_free_tree_block(trans, root, leaf, 0, 1);
4904 free_extent_buffer_stale(leaf);
4905 }
4906 /*
4907 * delete the item at the leaf level in path. If that empties
4908 * the leaf, remove it from the tree
4909 */
4910 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4911 struct btrfs_path *path, int slot, int nr)
4912 {
4913 struct extent_buffer *leaf;
4914 struct btrfs_item *item;
4915 int last_off;
4916 int dsize = 0;
4917 int ret = 0;
4918 int wret;
4919 int i;
4920 u32 nritems;
4921 struct btrfs_map_token token;
4923 btrfs_init_map_token(&token);
4925 leaf = path->nodes[0];
4926 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4928 for (i = 0; i < nr; i++)
4929 dsize += btrfs_item_size_nr(leaf, slot + i);
4931 nritems = btrfs_header_nritems(leaf);
4933 if (slot + nr != nritems) {
4934 int data_end = leaf_data_end(root, leaf);
4936 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4937 data_end + dsize,
4938 btrfs_leaf_data(leaf) + data_end,
4939 last_off - data_end);
4941 for (i = slot + nr; i < nritems; i++) {
4942 u32 ioff;
4944 item = btrfs_item_nr(leaf, i);
4945 ioff = btrfs_token_item_offset(leaf, item, &token);
4946 btrfs_set_token_item_offset(leaf, item,
4947 ioff + dsize, &token);
4948 }
4950 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4951 btrfs_item_nr_offset(slot + nr),
4952 sizeof(struct btrfs_item) *
4953 (nritems - slot - nr));
4954 }
4955 btrfs_set_header_nritems(leaf, nritems - nr);
4956 nritems -= nr;
4958 /* delete the leaf if we've emptied it */
4959 if (nritems == 0) {
4960 if (leaf == root->node) {
4961 btrfs_set_header_level(leaf, 0);
4962 } else {
4963 btrfs_set_path_blocking(path);
4964 clean_tree_block(trans, root, leaf);
4965 btrfs_del_leaf(trans, root, path, leaf);
4966 }
4967 } else {
4968 int used = leaf_space_used(leaf, 0, nritems);
4969 if (slot == 0) {
4970 struct btrfs_disk_key disk_key;
4972 btrfs_item_key(leaf, &disk_key, 0);
4973 fixup_low_keys(root, path, &disk_key, 1);
4974 }
4976 /* delete the leaf if it is mostly empty */
4977 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
4978 /* push_leaf_left fixes the path.
4979 * make sure the path still points to our leaf
4980 * for possible call to del_ptr below
4981 */
4982 slot = path->slots[1];
4983 extent_buffer_get(leaf);
4985 btrfs_set_path_blocking(path);
4986 wret = push_leaf_left(trans, root, path, 1, 1,
4987 1, (u32)-1);
4988 if (wret < 0 && wret != -ENOSPC)
4989 ret = wret;
4991 if (path->nodes[0] == leaf &&
4992 btrfs_header_nritems(leaf)) {
4993 wret = push_leaf_right(trans, root, path, 1,
4994 1, 1, 0);
4995 if (wret < 0 && wret != -ENOSPC)
4996 ret = wret;
4997 }
4999 if (btrfs_header_nritems(leaf) == 0) {
5000 path->slots[1] = slot;
5001 btrfs_del_leaf(trans, root, path, leaf);
5002 free_extent_buffer(leaf);
5003 ret = 0;
5004 } else {
5005 /* if we're still in the path, make sure
5006 * we're dirty. Otherwise, one of the
5007 * push_leaf functions must have already
5008 * dirtied this buffer
5009 */
5010 if (path->nodes[0] == leaf)
5011 btrfs_mark_buffer_dirty(leaf);
5012 free_extent_buffer(leaf);
5013 }
5014 } else {
5015 btrfs_mark_buffer_dirty(leaf);
5016 }
5017 }
5018 return ret;
5019 }
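/*
 * Example: deleting a contiguous run of items with btrfs_del_items. A
 * hypothetical sketch, not part of the original file: it assumes all 'nr'
 * items live in the leaf the search lands on; a negative ins_len tells
 * btrfs_search_slot this is a deletion.
 */
static int __maybe_unused example_del_run(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_key *first_key, int nr)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, root, first_key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;	/* first key not found */
		goto out;
	}
	ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
out:
	btrfs_free_path(path);
	return ret;
}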
5021 /*
5022 * search the tree again to find a leaf with lesser keys
5023 * returns 0 if it found something or 1 if there are no lesser leaves.
5024 * returns < 0 on io errors.
5026 * This may release the path, and so you may lose any locks held at the
5027 * time you call it.
5028 */
5029 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5030 {
5031 struct btrfs_key key;
5032 struct btrfs_disk_key found_key;
5033 int ret;
5035 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5037 if (key.offset > 0)
5038 key.offset--;
5039 else if (key.type > 0)
5040 key.type--;
5041 else if (key.objectid > 0)
5042 key.objectid--;
5043 else
5044 return 1;
5046 btrfs_release_path(path);
5047 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5048 if (ret < 0)
5049 return ret;
5050 btrfs_item_key(path->nodes[0], &found_key, 0);
5051 ret = comp_keys(&found_key, &key);
5052 if (ret < 0)
5053 return 0;
5054 return 1;
5055 }
5057 /*
5058 * A helper function to walk down the tree starting at min_key, and looking
5059 * for nodes or leaves that have a minimum transaction id.
5060 * This is used by the btree defrag code, and tree logging
5062 * This does not cow, but it does stuff the starting key it finds back
5063 * into min_key, so you can call btrfs_search_slot with cow=1 on the
5064 * key and get a writable path.
5066 * This does lock as it descends, and path->keep_locks should be set
5067 * to 1 by the caller.
5069 * This honors path->lowest_level to prevent descent past a given level
5070 * of the tree.
5072 * min_trans indicates the oldest transaction that you are interested
5073 * in walking through. Any nodes or leaves older than min_trans are
5074 * skipped over (without reading them).
5076 * returns zero if something useful was found, < 0 on error and 1 if there
5077 * was nothing in the tree that matched the search criteria.
5078 */
5079 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5080 struct btrfs_key *max_key,
5081 struct btrfs_path *path,
5082 u64 min_trans)
5083 {
5084 struct extent_buffer *cur;
5085 struct btrfs_key found_key;
5086 int slot;
5087 int sret;
5088 u32 nritems;
5089 int level;
5090 int ret = 1;
5092 WARN_ON(!path->keep_locks);
5093 again:
5094 cur = btrfs_read_lock_root_node(root);
5095 level = btrfs_header_level(cur);
5096 WARN_ON(path->nodes[level]);
5097 path->nodes[level] = cur;
5098 path->locks[level] = BTRFS_READ_LOCK;
5100 if (btrfs_header_generation(cur) < min_trans) {
5101 ret = 1;
5102 goto out;
5103 }
5104 while (1) {
5105 nritems = btrfs_header_nritems(cur);
5106 level = btrfs_header_level(cur);
5107 sret = bin_search(cur, min_key, level, &slot);
5109 /* at the lowest level, we're done, setup the path and exit */
5110 if (level == path->lowest_level) {
5111 if (slot >= nritems)
5112 goto find_next_key;
5113 ret = 0;
5114 path->slots[level] = slot;
5115 btrfs_item_key_to_cpu(cur, &found_key, slot);
5116 goto out;
5117 }
5118 if (sret && slot > 0)
5119 slot--;
5120 /*
5121 * check this node pointer against the min_trans parameter.
5122 * If it is too old, skip to the next one.
5123 */
5124 while (slot < nritems) {
5125 u64 blockptr;
5126 u64 gen;
5128 blockptr = btrfs_node_blockptr(cur, slot);
5129 gen = btrfs_node_ptr_generation(cur, slot);
5130 if (gen < min_trans) {
5131 slot++;
5132 continue;
5134 break;
5135 }
5136 find_next_key:
5137 /*
5138 * we didn't find a candidate key in this node, walk forward
5139 * and find another one
5140 */
5141 if (slot >= nritems) {
5142 path->slots[level] = slot;
5143 btrfs_set_path_blocking(path);
5144 sret = btrfs_find_next_key(root, path, min_key, level,
5145 min_trans);
5146 if (sret == 0) {
5147 btrfs_release_path(path);
5148 goto again;
5149 } else {
5150 goto out;
5151 }
5152 }
5153 /* save our key for returning back */
5154 btrfs_node_key_to_cpu(cur, &found_key, slot);
5155 path->slots[level] = slot;
5156 if (level == path->lowest_level) {
5157 ret = 0;
5158 unlock_up(path, level, 1, 0, NULL);
5159 goto out;
5160 }
5161 btrfs_set_path_blocking(path);
5162 cur = read_node_slot(root, cur, slot);
5163 BUG_ON(!cur); /* -ENOMEM */
5165 btrfs_tree_read_lock(cur);
5167 path->locks[level - 1] = BTRFS_READ_LOCK;
5168 path->nodes[level - 1] = cur;
5169 unlock_up(path, level, 1, 0, NULL);
5170 btrfs_clear_path_blocking(path, NULL, 0);
5171 }
5172 out:
5173 if (ret == 0)
5174 memcpy(min_key, &found_key, sizeof(found_key));
5175 btrfs_set_path_blocking(path);
5176 return ret;
5177 }
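/*
 * Example: the scan loop the defrag and tree-log code builds on top of
 * btrfs_search_forward. A hypothetical sketch, not part of the original
 * file: min_key is advanced past each hit so the walk makes progress; the
 * key-bumping mirrors btrfs_prev_leaf above, in the opposite direction.
 */
static int __maybe_unused example_scan_newer(struct btrfs_root *root,
					     u64 min_trans)
{
	struct btrfs_path *path;
	struct btrfs_key min_key;
	struct btrfs_key max_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->keep_locks = 1;

	min_key.objectid = 0;
	min_key.type = 0;
	min_key.offset = 0;
	max_key.objectid = (u64)-1;
	max_key.type = (u8)-1;
	max_key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_forward(root, &min_key, &max_key,
					   path, min_trans);
		if (ret)	/* 1: nothing left, < 0: error */
			break;

		/* ... process the item at path->nodes[0], slots[0] ... */

		btrfs_release_path(path);
		if (min_key.offset < (u64)-1)
			min_key.offset++;
		else if (min_key.type < (u8)-1)
			min_key.type++;
		else if (min_key.objectid < (u64)-1)
			min_key.objectid++;
		else
			break;
	}
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}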
5179 static void tree_move_down(struct btrfs_root *root,
5180 struct btrfs_path *path,
5181 int *level, int root_level)
5182 {
5183 BUG_ON(*level == 0);
5184 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5185 path->slots[*level]);
5186 path->slots[*level - 1] = 0;
5187 (*level)--;
5188 }
5190 static int tree_move_next_or_upnext(struct btrfs_root *root,
5191 struct btrfs_path *path,
5192 int *level, int root_level)
5193 {
5194 int ret = 0;
5195 int nritems;
5196 nritems = btrfs_header_nritems(path->nodes[*level]);
5198 path->slots[*level]++;
5200 while (path->slots[*level] >= nritems) {
5201 if (*level == root_level)
5202 return -1;
5204 /* move upnext */
5205 path->slots[*level] = 0;
5206 free_extent_buffer(path->nodes[*level]);
5207 path->nodes[*level] = NULL;
5208 (*level)++;
5209 path->slots[*level]++;
5211 nritems = btrfs_header_nritems(path->nodes[*level]);
5212 ret = 1;
5213 }
5214 return ret;
5215 }
5217 /*
5218 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5219 * or down.
5220 */
5221 static int tree_advance(struct btrfs_root *root,
5222 struct btrfs_path *path,
5223 int *level, int root_level,
5224 int allow_down,
5225 struct btrfs_key *key)
5226 {
5227 int ret;
5229 if (*level == 0 || !allow_down) {
5230 ret = tree_move_next_or_upnext(root, path, level, root_level);
5231 } else {
5232 tree_move_down(root, path, level, root_level);
5233 ret = 0;
5234 }
5235 if (ret >= 0) {
5236 if (*level == 0)
5237 btrfs_item_key_to_cpu(path->nodes[*level], key,
5238 path->slots[*level]);
5239 else
5240 btrfs_node_key_to_cpu(path->nodes[*level], key,
5241 path->slots[*level]);
5242 }
5243 return ret;
5244 }
5246 static int tree_compare_item(struct btrfs_root *left_root,
5247 struct btrfs_path *left_path,
5248 struct btrfs_path *right_path,
5249 char *tmp_buf)
5250 {
5251 int cmp;
5252 int len1, len2;
5253 unsigned long off1, off2;
5255 len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5256 len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5257 if (len1 != len2)
5258 return 1;
5260 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5261 off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5262 right_path->slots[0]);
5264 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5266 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5267 if (cmp)
5268 return 1;
5269 return 0;
5270 }
5272 #define ADVANCE 1
5273 #define ADVANCE_ONLY_NEXT -1
5275 /*
5276 * This function compares two trees and calls the provided callback for
5277 * every changed/new/deleted item it finds.
5278 * If shared tree blocks are encountered, whole subtrees are skipped, making
5279 * the compare pretty fast on snapshotted subvolumes.
5281 * This currently works on commit roots only. As commit roots are read only,
5282 * we don't do any locking. The commit roots are protected with transactions.
5283 * Transactions are ended and rejoined when a commit is tried in between.
5285 * This function checks for modifications done to the trees while comparing.
5286 * If it detects a change, it aborts immediately.
5287 */
5288 int btrfs_compare_trees(struct btrfs_root *left_root,
5289 struct btrfs_root *right_root,
5290 btrfs_changed_cb_t changed_cb, void *ctx)
5291 {
5292 int ret;
5293 int cmp;
5294 struct btrfs_trans_handle *trans = NULL;
5295 struct btrfs_path *left_path = NULL;
5296 struct btrfs_path *right_path = NULL;
5297 struct btrfs_key left_key;
5298 struct btrfs_key right_key;
5299 char *tmp_buf = NULL;
5300 int left_root_level;
5301 int right_root_level;
5302 int left_level;
5303 int right_level;
5304 int left_end_reached;
5305 int right_end_reached;
5306 int advance_left;
5307 int advance_right;
5308 u64 left_blockptr;
5309 u64 right_blockptr;
5310 u64 left_start_ctransid;
5311 u64 right_start_ctransid;
5312 u64 ctransid;
5314 left_path = btrfs_alloc_path();
5315 if (!left_path) {
5316 ret = -ENOMEM;
5317 goto out;
5318 }
5319 right_path = btrfs_alloc_path();
5320 if (!right_path) {
5321 ret = -ENOMEM;
5322 goto out;
5323 }
5325 tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
5326 if (!tmp_buf) {
5327 ret = -ENOMEM;
5328 goto out;
5329 }
5331 left_path->search_commit_root = 1;
5332 left_path->skip_locking = 1;
5333 right_path->search_commit_root = 1;
5334 right_path->skip_locking = 1;
5336 spin_lock(&left_root->root_item_lock);
5337 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5338 spin_unlock(&left_root->root_item_lock);
5340 spin_lock(&right_root->root_item_lock);
5341 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5342 spin_unlock(&right_root->root_item_lock);
5344 trans = btrfs_join_transaction(left_root);
5345 if (IS_ERR(trans)) {
5346 ret = PTR_ERR(trans);
5347 trans = NULL;
5348 goto out;
5349 }
5351 /*
5352 * Strategy: Go to the first items of both trees. Then do
5354 * If both trees are at level 0
5355 * Compare keys of current items
5356 * If left < right treat left item as new, advance left tree
5357 * and repeat
5358 * If left > right treat right item as deleted, advance right tree
5359 * and repeat
5360 * If left == right do deep compare of items, treat as changed if
5361 * needed, advance both trees and repeat
5362 * If both trees are at the same level but not at level 0
5363 * Compare keys of current nodes/leaves
5364 * If left < right advance left tree and repeat
5365 * If left > right advance right tree and repeat
5366 * If left == right compare blockptrs of the next nodes/leafs
5367 * If they match advance both trees but stay at the same level
5368 * and repeat
5369 * If they don't match advance both trees while allowing to go
5370 * deeper and repeat
5371 * If tree levels are different
5372 * Advance the tree that needs it and repeat
5374 * Advancing a tree means:
5375 * If we are at level 0, try to go to the next slot. If that's not
5376 * possible, go one level up and repeat. Stop when we found a level
5377 * where we could go to the next slot. We may at this point be on a
5378 * node or a leaf.
5380 * If we are not at level 0 and not on shared tree blocks, go one
5381 * level deeper.
5383 * If we are not at level 0 and on shared tree blocks, go one slot to
5384 * the right if possible or go up and right.
5385 */
5387 left_level = btrfs_header_level(left_root->commit_root);
5388 left_root_level = left_level;
5389 left_path->nodes[left_level] = left_root->commit_root;
5390 extent_buffer_get(left_path->nodes[left_level]);
5392 right_level = btrfs_header_level(right_root->commit_root);
5393 right_root_level = right_level;
5394 right_path->nodes[right_level] = right_root->commit_root;
5395 extent_buffer_get(right_path->nodes[right_level]);
5397 if (left_level == 0)
5398 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5399 &left_key, left_path->slots[left_level]);
5400 else
5401 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5402 &left_key, left_path->slots[left_level]);
5403 if (right_level == 0)
5404 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5405 &right_key, right_path->slots[right_level]);
5406 else
5407 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5408 &right_key, right_path->slots[right_level]);
5410 left_end_reached = right_end_reached = 0;
5411 advance_left = advance_right = 0;
5413 while (1) {
5414 /*
5415 * We need to make sure the transaction does not get committed
5416 * while we do anything on commit roots. This means, we need to
5417 * join and leave transactions for every item that we process.
5418 */
5419 if (trans && btrfs_should_end_transaction(trans, left_root)) {
5420 btrfs_release_path(left_path);
5421 btrfs_release_path(right_path);
5423 ret = btrfs_end_transaction(trans, left_root);
5424 trans = NULL;
5425 if (ret < 0)
5426 goto out;
5428 /* now rejoin the transaction */
5429 if (!trans) {
5430 trans = btrfs_join_transaction(left_root);
5431 if (IS_ERR(trans)) {
5432 ret = PTR_ERR(trans);
5433 trans = NULL;
5434 goto out;
5435 }
5437 spin_lock(&left_root->root_item_lock);
5438 ctransid = btrfs_root_ctransid(&left_root->root_item);
5439 spin_unlock(&left_root->root_item_lock);
5440 if (ctransid != left_start_ctransid)
5441 left_start_ctransid = 0;
5443 spin_lock(&right_root->root_item_lock);
5444 ctransid = btrfs_root_ctransid(&right_root->root_item);
5445 spin_unlock(&right_root->root_item_lock);
5446 if (ctransid != right_start_ctransid)
5447 right_start_ctransid = 0;
5449 if (!left_start_ctransid || !right_start_ctransid) {
5450 WARN(1, KERN_WARNING
5451 "btrfs: btrfs_compare_tree detected "
5452 "a change in one of the trees while "
5453 "iterating. This is probably a "
5454 "bug.\n");
5455 ret = -EIO;
5456 goto out;
5457 }
5459 /*
5460 * the commit root may have changed, so start again
5461 * where we stopped
5462 */
5463 left_path->lowest_level = left_level;
5464 right_path->lowest_level = right_level;
5465 ret = btrfs_search_slot(NULL, left_root,
5466 &left_key, left_path, 0, 0);
5467 if (ret < 0)
5468 goto out;
5469 ret = btrfs_search_slot(NULL, right_root,
5470 &right_key, right_path, 0, 0);
5471 if (ret < 0)
5472 goto out;
5473 }
5475 if (advance_left && !left_end_reached) {
5476 ret = tree_advance(left_root, left_path, &left_level,
5477 left_root_level,
5478 advance_left != ADVANCE_ONLY_NEXT,
5479 &left_key);
5480 if (ret < 0)
5481 left_end_reached = ADVANCE;
5482 advance_left = 0;
5483 }
5484 if (advance_right && !right_end_reached) {
5485 ret = tree_advance(right_root, right_path, &right_level,
5486 right_root_level,
5487 advance_right != ADVANCE_ONLY_NEXT,
5488 &right_key);
5489 if (ret < 0)
5490 right_end_reached = ADVANCE;
5491 advance_right = 0;
5492 }
5494 if (left_end_reached && right_end_reached) {
5495 ret = 0;
5496 goto out;
5497 } else if (left_end_reached) {
5498 if (right_level == 0) {
5499 ret = changed_cb(left_root, right_root,
5500 left_path, right_path,
5501 &right_key,
5502 BTRFS_COMPARE_TREE_DELETED,
5503 ctx);
5504 if (ret < 0)
5505 goto out;
5506 }
5507 advance_right = ADVANCE;
5508 continue;
5509 } else if (right_end_reached) {
5510 if (left_level == 0) {
5511 ret = changed_cb(left_root, right_root,
5512 left_path, right_path,
5513 &left_key,
5514 BTRFS_COMPARE_TREE_NEW,
5515 ctx);
5516 if (ret < 0)
5517 goto out;
5518 }
5519 advance_left = ADVANCE;
5520 continue;
5521 }
5523 if (left_level == 0 && right_level == 0) {
5524 cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5525 if (cmp < 0) {
5526 ret = changed_cb(left_root, right_root,
5527 left_path, right_path,
5528 &left_key,
5529 BTRFS_COMPARE_TREE_NEW,
5530 ctx);
5531 if (ret < 0)
5532 goto out;
5533 advance_left = ADVANCE;
5534 } else if (cmp > 0) {
5535 ret = changed_cb(left_root, right_root,
5536 left_path, right_path,
5537 &right_key,
5538 BTRFS_COMPARE_TREE_DELETED,
5539 ctx);
5540 if (ret < 0)
5541 goto out;
5542 advance_right = ADVANCE;
5543 } else {
5544 enum btrfs_compare_tree_result cmp;
5546 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5547 ret = tree_compare_item(left_root, left_path,
5548 right_path, tmp_buf);
5549 if (ret)
5550 cmp = BTRFS_COMPARE_TREE_CHANGED;
5551 else
5552 cmp = BTRFS_COMPARE_TREE_SAME;
5553 ret = changed_cb(left_root, right_root,
5554 left_path, right_path,
5555 &left_key, cmp, ctx);
5556 if (ret < 0)
5557 goto out;
5558 advance_left = ADVANCE;
5559 advance_right = ADVANCE;
5560 }
5561 } else if (left_level == right_level) {
5562 cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5563 if (cmp < 0) {
5564 advance_left = ADVANCE;
5565 } else if (cmp > 0) {
5566 advance_right = ADVANCE;
5567 } else {
5568 left_blockptr = btrfs_node_blockptr(
5569 left_path->nodes[left_level],
5570 left_path->slots[left_level]);
5571 right_blockptr = btrfs_node_blockptr(
5572 right_path->nodes[right_level],
5573 right_path->slots[right_level]);
5574 if (left_blockptr == right_blockptr) {
5575 /*
5576 * As we're on a shared block, don't
5577 * allow going deeper.
5578 */
5579 advance_left = ADVANCE_ONLY_NEXT;
5580 advance_right = ADVANCE_ONLY_NEXT;
5581 } else {
5582 advance_left = ADVANCE;
5583 advance_right = ADVANCE;
5584 }
5585 }
5586 } else if (left_level < right_level) {
5587 advance_right = ADVANCE;
5588 } else {
5589 advance_left = ADVANCE;
5590 }
5591 }
5593 out:
5594 btrfs_free_path(left_path);
5595 btrfs_free_path(right_path);
5596 kfree(tmp_buf);
5598 if (trans) {
5599 if (!ret)
5600 ret = btrfs_end_transaction(trans, left_root);
5601 else
5602 btrfs_end_transaction(trans, left_root);
5603 }
5605 return ret;
5606 }
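/*
 * Example: wiring a callback into btrfs_compare_trees, the way send diffs
 * two snapshot commit roots. A hypothetical sketch, not part of the
 * original file: the callback gets one BTRFS_COMPARE_TREE_* result per
 * item, and returning a negative value aborts the walk.
 */
static int __maybe_unused example_changed_cb(struct btrfs_root *left_root,
					     struct btrfs_root *right_root,
					     struct btrfs_path *left_path,
					     struct btrfs_path *right_path,
					     struct btrfs_key *key,
					     enum btrfs_compare_tree_result result,
					     void *ctx)
{
	u64 *counters = ctx;

	/* tally new/deleted/changed/same; a real callback emits send ops */
	counters[result]++;
	return 0;
}

static int __maybe_unused example_diff_roots(struct btrfs_root *left,
					     struct btrfs_root *right)
{
	u64 counters[4] = { 0, 0, 0, 0 };

	return btrfs_compare_trees(left, right, example_changed_cb, counters);
}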
5608 /*
5609 * this is similar to btrfs_next_leaf, but does not try to preserve
5610 * and fixup the path. It looks for and returns the next key in the
5611 * tree based on the current path and the min_trans parameter.
5613 * 0 is returned if another key is found, < 0 if there are any errors
5614 * and 1 is returned if there are no higher keys in the tree
5616 * path->keep_locks should be set to 1 on the search made before
5617 * calling this function.
5618 */
5619 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5620 struct btrfs_key *key, int level, u64 min_trans)
5621 {
5622 int slot;
5623 struct extent_buffer *c;
5625 WARN_ON(!path->keep_locks);
5626 while (level < BTRFS_MAX_LEVEL) {
5627 if (!path->nodes[level])
5628 return 1;
5630 slot = path->slots[level] + 1;
5631 c = path->nodes[level];
5632 next:
5633 if (slot >= btrfs_header_nritems(c)) {
5634 int ret;
5635 int orig_lowest;
5636 struct btrfs_key cur_key;
5637 if (level + 1 >= BTRFS_MAX_LEVEL ||
5638 !path->nodes[level + 1])
5639 return 1;
5641 if (path->locks[level + 1]) {
5642 level++;
5643 continue;
5644 }
5646 slot = btrfs_header_nritems(c) - 1;
5647 if (level == 0)
5648 btrfs_item_key_to_cpu(c, &cur_key, slot);
5649 else
5650 btrfs_node_key_to_cpu(c, &cur_key, slot);
5652 orig_lowest = path->lowest_level;
5653 btrfs_release_path(path);
5654 path->lowest_level = level;
5655 ret = btrfs_search_slot(NULL, root, &cur_key, path,
5656 0, 0);
5657 path->lowest_level = orig_lowest;
5658 if (ret < 0)
5659 return ret;
5661 c = path->nodes[level];
5662 slot = path->slots[level];
5663 if (ret == 0)
5664 slot++;
5665 goto next;
5666 }
5668 if (level == 0)
5669 btrfs_item_key_to_cpu(c, key, slot);
5670 else {
5671 u64 gen = btrfs_node_ptr_generation(c, slot);
5673 if (gen < min_trans) {
5674 slot++;
5675 goto next;
5676 }
5677 btrfs_node_key_to_cpu(c, key, slot);
5678 }
5679 return 0;
5680 }
5681 return 1;
5682 }
5684 /*
5685 * search the tree again to find a leaf with greater keys
5686 * returns 0 if it found something or 1 if there are no greater leaves.
5687 * returns < 0 on io errors.
5688 */
5689 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5690 {
5691 return btrfs_next_old_leaf(root, path, 0);
5692 }
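/*
 * Example: the canonical forward iteration pattern built on
 * btrfs_search_slot + btrfs_next_leaf. A hypothetical sketch, not part of
 * the original file: when the slot runs off the end of the current leaf,
 * btrfs_next_leaf repositions the path on the following leaf (returning 1
 * at the end of the tree).
 */
static int __maybe_unused example_walk_forward(struct btrfs_root *root,
					       struct btrfs_key *start)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key found;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(NULL, root, start, path, 0, 0);
	if (ret < 0)
		goto out;
	while (1) {
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret)	/* 1 == no more leaves */
				break;
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &found, path->slots[0]);

		/* ... consume the item keyed by 'found' ... */

		path->slots[0]++;
	}
out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}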
5694 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5695 u64 time_seq)
5696 {
5697 int slot;
5698 int level;
5699 struct extent_buffer *c;
5700 struct extent_buffer *next;
5701 struct btrfs_key key;
5702 u32 nritems;
5703 int ret;
5704 int old_spinning = path->leave_spinning;
5705 int next_rw_lock = 0;
5707 nritems = btrfs_header_nritems(path->nodes[0]);
5708 if (nritems == 0)
5709 return 1;
5711 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5712 again:
5713 level = 1;
5714 next = NULL;
5715 next_rw_lock = 0;
5716 btrfs_release_path(path);
5718 path->keep_locks = 1;
5719 path->leave_spinning = 1;
5721 if (time_seq)
5722 ret = btrfs_search_old_slot(root, &key, path, time_seq);
5723 else
5724 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5725 path->keep_locks = 0;
5727 if (ret < 0)
5728 return ret;
5730 nritems = btrfs_header_nritems(path->nodes[0]);
5731 /*
5732 * by releasing the path above we dropped all our locks. A balance
5733 * could have added more items next to the key that used to be
5734 * at the very end of the block. So, check again here and
5735 * advance the path if there are now more items available.
5736 */
5737 if (nritems > 0 && path->slots[0] < nritems - 1) {
5738 if (ret == 0)
5739 path->slots[0]++;
5740 ret = 0;
5741 goto done;
5742 }
5744 while (level < BTRFS_MAX_LEVEL) {
5745 if (!path->nodes[level]) {
5746 ret = 1;
5747 goto done;
5748 }
5750 slot = path->slots[level] + 1;
5751 c = path->nodes[level];
5752 if (slot >= btrfs_header_nritems(c)) {
5753 level++;
5754 if (level == BTRFS_MAX_LEVEL) {
5755 ret = 1;
5756 goto done;
5757 }
5758 continue;
5759 }
5761 if (next) {
5762 btrfs_tree_unlock_rw(next, next_rw_lock);
5763 free_extent_buffer(next);
5764 }
5766 next = c;
5767 next_rw_lock = path->locks[level];
5768 ret = read_block_for_search(NULL, root, path, &next, level,
5769 slot, &key, 0);
5770 if (ret == -EAGAIN)
5771 goto again;
5773 if (ret < 0) {
5774 btrfs_release_path(path);
5775 goto done;
5776 }
5778 if (!path->skip_locking) {
5779 ret = btrfs_try_tree_read_lock(next);
5780 if (!ret && time_seq) {
5781 /*
5782 * If we don't get the lock, we may be racing
5783 * with push_leaf_left, which holds that lock
5784 * while waiting for the leaf we currently have
5785 * locked. To resolve this, we give up our lock
5786 * and cycle.
5787 */
5788 free_extent_buffer(next);
5789 btrfs_release_path(path);
5790 cond_resched();
5791 goto again;
5792 }
5793 if (!ret) {
5794 btrfs_set_path_blocking(path);
5795 btrfs_tree_read_lock(next);
5796 btrfs_clear_path_blocking(path, next,
5797 BTRFS_READ_LOCK);
5798 }
5799 next_rw_lock = BTRFS_READ_LOCK;
5800 }
5801 break;
5802 }
5803 path->slots[level] = slot;
5804 while (1) {
5805 level--;
5806 c = path->nodes[level];
5807 if (path->locks[level])
5808 btrfs_tree_unlock_rw(c, path->locks[level]);
5810 free_extent_buffer(c);
5811 path->nodes[level] = next;
5812 path->slots[level] = 0;
5813 if (!path->skip_locking)
5814 path->locks[level] = next_rw_lock;
5815 if (!level)
5816 break;
5818 ret = read_block_for_search(NULL, root, path, &next, level,
5819 0, &key, 0);
5820 if (ret == -EAGAIN)
5821 goto again;
5823 if (ret < 0) {
5824 btrfs_release_path(path);
5825 goto done;
5826 }
5828 if (!path->skip_locking) {
5829 ret = btrfs_try_tree_read_lock(next);
5830 if (!ret) {
5831 btrfs_set_path_blocking(path);
5832 btrfs_tree_read_lock(next);
5833 btrfs_clear_path_blocking(path, next,
5834 BTRFS_READ_LOCK);
5835 }
5836 next_rw_lock = BTRFS_READ_LOCK;
5837 }
5838 }
5839 ret = 0;
5840 done:
5841 unlock_up(path, 0, 1, 0, NULL);
5842 path->leave_spinning = old_spinning;
5843 if (!old_spinning)
5844 btrfs_set_path_blocking(path);
5846 return ret;
5847 }
5849 /*
5850 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5851 * searching until it gets past min_objectid or finds an item of 'type'
5853 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5854 */
5855 int btrfs_previous_item(struct btrfs_root *root,
5856 struct btrfs_path *path, u64 min_objectid,
5857 int type)
5858 {
5859 struct btrfs_key found_key;
5860 struct extent_buffer *leaf;
5861 u32 nritems;
5862 int ret;
5864 while (1) {
5865 if (path->slots[0] == 0) {
5866 btrfs_set_path_blocking(path);
5867 ret = btrfs_prev_leaf(root, path);
5868 if (ret != 0)
5869 return ret;
5870 } else {
5871 path->slots[0]--;
5872 }
5873 leaf = path->nodes[0];
5874 nritems = btrfs_header_nritems(leaf);
5875 if (nritems == 0)
5876 return 1;
5877 if (path->slots[0] == nritems)
5878 path->slots[0]--;
5880 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5881 if (found_key.objectid < min_objectid)
5882 break;
5883 if (found_key.type == type)
5884 return 0;
5885 if (found_key.objectid == min_objectid &&
5886 found_key.type < type)
5887 break;
5888 }
5889 return 1;
5890 }
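/*
 * Example: stepping backwards with btrfs_previous_item. A hypothetical
 * sketch, not part of the original file: starting from an existing search
 * position, find the closest preceding inode item whose objectid stays at
 * or above 'min_ino' (the key type constant is just an example).
 */
static int __maybe_unused example_find_prev_inode(struct btrfs_root *root,
						  struct btrfs_path *path,
						  u64 min_ino)
{
	struct btrfs_key key;
	int ret;

	ret = btrfs_previous_item(root, path, min_ino, BTRFS_INODE_ITEM_KEY);
	if (ret)	/* 1: nothing before, < 0: error */
		return ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	/* here key.type == BTRFS_INODE_ITEM_KEY and key.objectid >= min_ino */
	return 0;
}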