1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
6 * Extent allocs and frees
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
31 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
32 #include <cluster/masklog.h>
38 #include "extent_map.h"
41 #include "localalloc.h"
48 #include "buffer_head_io.h"
50 static int ocfs2_extent_contig(struct inode
*inode
,
51 struct ocfs2_extent_rec
*ext
,
54 static int ocfs2_create_new_meta_bhs(struct ocfs2_super
*osb
,
55 struct ocfs2_journal_handle
*handle
,
58 struct ocfs2_alloc_context
*meta_ac
,
59 struct buffer_head
*bhs
[]);
61 static int ocfs2_add_branch(struct ocfs2_super
*osb
,
62 struct ocfs2_journal_handle
*handle
,
64 struct buffer_head
*fe_bh
,
65 struct buffer_head
*eb_bh
,
66 struct buffer_head
*last_eb_bh
,
67 struct ocfs2_alloc_context
*meta_ac
);
69 static int ocfs2_shift_tree_depth(struct ocfs2_super
*osb
,
70 struct ocfs2_journal_handle
*handle
,
72 struct buffer_head
*fe_bh
,
73 struct ocfs2_alloc_context
*meta_ac
,
74 struct buffer_head
**ret_new_eb_bh
);
76 static int ocfs2_do_insert_extent(struct ocfs2_super
*osb
,
77 struct ocfs2_journal_handle
*handle
,
79 struct buffer_head
*fe_bh
,
83 static int ocfs2_find_branch_target(struct ocfs2_super
*osb
,
85 struct buffer_head
*fe_bh
,
86 struct buffer_head
**target_bh
);
88 static int ocfs2_find_new_last_ext_blk(struct ocfs2_super
*osb
,
90 struct ocfs2_dinode
*fe
,
91 unsigned int new_i_clusters
,
92 struct buffer_head
*old_last_eb
,
93 struct buffer_head
**new_last_eb
);
95 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context
*tc
);
97 static int ocfs2_extent_contig(struct inode
*inode
,
98 struct ocfs2_extent_rec
*ext
,
101 return blkno
== (le64_to_cpu(ext
->e_blkno
) +
102 ocfs2_clusters_to_blocks(inode
->i_sb
,
103 le32_to_cpu(ext
->e_clusters
)));
107 * How many free extents have we got before we need more meta data?
109 int ocfs2_num_free_extents(struct ocfs2_super
*osb
,
111 struct ocfs2_dinode
*fe
)
114 struct ocfs2_extent_list
*el
;
115 struct ocfs2_extent_block
*eb
;
116 struct buffer_head
*eb_bh
= NULL
;
120 if (!OCFS2_IS_VALID_DINODE(fe
)) {
121 OCFS2_RO_ON_INVALID_DINODE(inode
->i_sb
, fe
);
126 if (fe
->i_last_eb_blk
) {
127 retval
= ocfs2_read_block(osb
, le64_to_cpu(fe
->i_last_eb_blk
),
128 &eb_bh
, OCFS2_BH_CACHED
, inode
);
133 eb
= (struct ocfs2_extent_block
*) eb_bh
->b_data
;
136 el
= &fe
->id2
.i_list
;
138 BUG_ON(el
->l_tree_depth
!= 0);
140 retval
= le16_to_cpu(el
->l_count
) - le16_to_cpu(el
->l_next_free_rec
);
149 /* expects array to already be allocated
151 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
154 static int ocfs2_create_new_meta_bhs(struct ocfs2_super
*osb
,
155 struct ocfs2_journal_handle
*handle
,
158 struct ocfs2_alloc_context
*meta_ac
,
159 struct buffer_head
*bhs
[])
161 int count
, status
, i
;
162 u16 suballoc_bit_start
;
165 struct ocfs2_extent_block
*eb
;
170 while (count
< wanted
) {
171 status
= ocfs2_claim_metadata(osb
,
183 for(i
= count
; i
< (num_got
+ count
); i
++) {
184 bhs
[i
] = sb_getblk(osb
->sb
, first_blkno
);
185 if (bhs
[i
] == NULL
) {
190 ocfs2_set_new_buffer_uptodate(inode
, bhs
[i
]);
192 status
= ocfs2_journal_access(handle
, inode
, bhs
[i
],
193 OCFS2_JOURNAL_ACCESS_CREATE
);
199 memset(bhs
[i
]->b_data
, 0, osb
->sb
->s_blocksize
);
200 eb
= (struct ocfs2_extent_block
*) bhs
[i
]->b_data
;
201 /* Ok, setup the minimal stuff here. */
202 strcpy(eb
->h_signature
, OCFS2_EXTENT_BLOCK_SIGNATURE
);
203 eb
->h_blkno
= cpu_to_le64(first_blkno
);
204 eb
->h_fs_generation
= cpu_to_le32(osb
->fs_generation
);
206 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
207 /* we always use slot zero's suballocator */
208 eb
->h_suballoc_slot
= 0;
210 eb
->h_suballoc_slot
= cpu_to_le16(osb
->slot_num
);
212 eb
->h_suballoc_bit
= cpu_to_le16(suballoc_bit_start
);
214 cpu_to_le16(ocfs2_extent_recs_per_eb(osb
->sb
));
216 suballoc_bit_start
++;
219 /* We'll also be dirtied by the caller, so
220 * this isn't absolutely necessary. */
221 status
= ocfs2_journal_dirty(handle
, bhs
[i
]);
234 for(i
= 0; i
< wanted
; i
++) {
245 * Add an entire tree branch to our inode. eb_bh is the extent block
246 * to start at, if we don't want to start the branch at the dinode
249 * last_eb_bh is required as we have to update its next_leaf pointer
250 * for the new last extent block.
252 * the new branch will be 'empty' in the sense that every block will
253 * contain a single record with e_clusters == 0.
255 static int ocfs2_add_branch(struct ocfs2_super
*osb
,
256 struct ocfs2_journal_handle
*handle
,
258 struct buffer_head
*fe_bh
,
259 struct buffer_head
*eb_bh
,
260 struct buffer_head
*last_eb_bh
,
261 struct ocfs2_alloc_context
*meta_ac
)
263 int status
, new_blocks
, i
;
264 u64 next_blkno
, new_last_eb_blk
;
265 struct buffer_head
*bh
;
266 struct buffer_head
**new_eb_bhs
= NULL
;
267 struct ocfs2_dinode
*fe
;
268 struct ocfs2_extent_block
*eb
;
269 struct ocfs2_extent_list
*eb_el
;
270 struct ocfs2_extent_list
*el
;
276 fe
= (struct ocfs2_dinode
*) fe_bh
->b_data
;
279 eb
= (struct ocfs2_extent_block
*) eb_bh
->b_data
;
282 el
= &fe
->id2
.i_list
;
284 /* we never add a branch to a leaf. */
285 BUG_ON(!el
->l_tree_depth
);
287 new_blocks
= le16_to_cpu(el
->l_tree_depth
);
289 /* allocate the number of new eb blocks we need */
290 new_eb_bhs
= kcalloc(new_blocks
, sizeof(struct buffer_head
*),
298 status
= ocfs2_create_new_meta_bhs(osb
, handle
, inode
, new_blocks
,
299 meta_ac
, new_eb_bhs
);
305 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306 * linked with the rest of the tree.
307 * conversely, new_eb_bhs[0] is the new bottommost leaf.
309 * when we leave the loop, new_last_eb_blk will point to the
310 * newest leaf, and next_blkno will point to the topmost extent
312 next_blkno
= new_last_eb_blk
= 0;
313 for(i
= 0; i
< new_blocks
; i
++) {
315 eb
= (struct ocfs2_extent_block
*) bh
->b_data
;
316 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb
)) {
317 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode
->i_sb
, eb
);
323 status
= ocfs2_journal_access(handle
, inode
, bh
,
324 OCFS2_JOURNAL_ACCESS_CREATE
);
330 eb
->h_next_leaf_blk
= 0;
331 eb_el
->l_tree_depth
= cpu_to_le16(i
);
332 eb_el
->l_next_free_rec
= cpu_to_le16(1);
333 eb_el
->l_recs
[0].e_cpos
= fe
->i_clusters
;
334 eb_el
->l_recs
[0].e_blkno
= cpu_to_le64(next_blkno
);
335 eb_el
->l_recs
[0].e_clusters
= cpu_to_le32(0);
336 if (!eb_el
->l_tree_depth
)
337 new_last_eb_blk
= le64_to_cpu(eb
->h_blkno
);
339 status
= ocfs2_journal_dirty(handle
, bh
);
345 next_blkno
= le64_to_cpu(eb
->h_blkno
);
348 /* This is a bit hairy. We want to update up to three blocks
349 * here without leaving any of them in an inconsistent state
350 * in case of error. We don't have to worry about
351 * journal_dirty erroring as it won't unless we've aborted the
352 * handle (in which case we would never be here) so reserving
353 * the write with journal_access is all we need to do. */
354 status
= ocfs2_journal_access(handle
, inode
, last_eb_bh
,
355 OCFS2_JOURNAL_ACCESS_WRITE
);
360 status
= ocfs2_journal_access(handle
, inode
, fe_bh
,
361 OCFS2_JOURNAL_ACCESS_WRITE
);
367 status
= ocfs2_journal_access(handle
, inode
, eb_bh
,
368 OCFS2_JOURNAL_ACCESS_WRITE
);
375 /* Link the new branch into the rest of the tree (el will
376 * either be on the fe, or the extent block passed in. */
377 i
= le16_to_cpu(el
->l_next_free_rec
);
378 el
->l_recs
[i
].e_blkno
= cpu_to_le64(next_blkno
);
379 el
->l_recs
[i
].e_cpos
= fe
->i_clusters
;
380 el
->l_recs
[i
].e_clusters
= 0;
381 le16_add_cpu(&el
->l_next_free_rec
, 1);
383 /* fe needs a new last extent block pointer, as does the
384 * next_leaf on the previously last-extent-block. */
385 fe
->i_last_eb_blk
= cpu_to_le64(new_last_eb_blk
);
387 eb
= (struct ocfs2_extent_block
*) last_eb_bh
->b_data
;
388 eb
->h_next_leaf_blk
= cpu_to_le64(new_last_eb_blk
);
390 status
= ocfs2_journal_dirty(handle
, last_eb_bh
);
393 status
= ocfs2_journal_dirty(handle
, fe_bh
);
397 status
= ocfs2_journal_dirty(handle
, eb_bh
);
405 for (i
= 0; i
< new_blocks
; i
++)
407 brelse(new_eb_bhs
[i
]);
416 * adds another level to the allocation tree.
417 * returns back the new extent block so you can add a branch to it
420 static int ocfs2_shift_tree_depth(struct ocfs2_super
*osb
,
421 struct ocfs2_journal_handle
*handle
,
423 struct buffer_head
*fe_bh
,
424 struct ocfs2_alloc_context
*meta_ac
,
425 struct buffer_head
**ret_new_eb_bh
)
428 struct buffer_head
*new_eb_bh
= NULL
;
429 struct ocfs2_dinode
*fe
;
430 struct ocfs2_extent_block
*eb
;
431 struct ocfs2_extent_list
*fe_el
;
432 struct ocfs2_extent_list
*eb_el
;
436 status
= ocfs2_create_new_meta_bhs(osb
, handle
, inode
, 1, meta_ac
,
443 eb
= (struct ocfs2_extent_block
*) new_eb_bh
->b_data
;
444 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb
)) {
445 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode
->i_sb
, eb
);
451 fe
= (struct ocfs2_dinode
*) fe_bh
->b_data
;
452 fe_el
= &fe
->id2
.i_list
;
454 status
= ocfs2_journal_access(handle
, inode
, new_eb_bh
,
455 OCFS2_JOURNAL_ACCESS_CREATE
);
461 /* copy the fe data into the new extent block */
462 eb_el
->l_tree_depth
= fe_el
->l_tree_depth
;
463 eb_el
->l_next_free_rec
= fe_el
->l_next_free_rec
;
464 for(i
= 0; i
< le16_to_cpu(fe_el
->l_next_free_rec
); i
++) {
465 eb_el
->l_recs
[i
].e_cpos
= fe_el
->l_recs
[i
].e_cpos
;
466 eb_el
->l_recs
[i
].e_clusters
= fe_el
->l_recs
[i
].e_clusters
;
467 eb_el
->l_recs
[i
].e_blkno
= fe_el
->l_recs
[i
].e_blkno
;
470 status
= ocfs2_journal_dirty(handle
, new_eb_bh
);
476 status
= ocfs2_journal_access(handle
, inode
, fe_bh
,
477 OCFS2_JOURNAL_ACCESS_WRITE
);
484 le16_add_cpu(&fe_el
->l_tree_depth
, 1);
485 fe_el
->l_recs
[0].e_cpos
= 0;
486 fe_el
->l_recs
[0].e_blkno
= eb
->h_blkno
;
487 fe_el
->l_recs
[0].e_clusters
= fe
->i_clusters
;
488 for(i
= 1; i
< le16_to_cpu(fe_el
->l_next_free_rec
); i
++) {
489 fe_el
->l_recs
[i
].e_cpos
= 0;
490 fe_el
->l_recs
[i
].e_clusters
= 0;
491 fe_el
->l_recs
[i
].e_blkno
= 0;
493 fe_el
->l_next_free_rec
= cpu_to_le16(1);
495 /* If this is our 1st tree depth shift, then last_eb_blk
496 * becomes the allocated extent block */
497 if (fe_el
->l_tree_depth
== cpu_to_le16(1))
498 fe
->i_last_eb_blk
= eb
->h_blkno
;
500 status
= ocfs2_journal_dirty(handle
, fe_bh
);
506 *ret_new_eb_bh
= new_eb_bh
;
518 * Expects the tree to already have room in the rightmost leaf for the
519 * extent. Updates all the extent blocks (and the dinode) on the way
522 static int ocfs2_do_insert_extent(struct ocfs2_super
*osb
,
523 struct ocfs2_journal_handle
*handle
,
525 struct buffer_head
*fe_bh
,
529 int status
, i
, num_bhs
= 0;
532 struct buffer_head
**eb_bhs
= NULL
;
533 struct ocfs2_dinode
*fe
;
534 struct ocfs2_extent_block
*eb
;
535 struct ocfs2_extent_list
*el
;
539 status
= ocfs2_journal_access(handle
, inode
, fe_bh
,
540 OCFS2_JOURNAL_ACCESS_WRITE
);
546 fe
= (struct ocfs2_dinode
*) fe_bh
->b_data
;
547 el
= &fe
->id2
.i_list
;
548 if (el
->l_tree_depth
) {
549 /* This is another operation where we want to be
550 * careful about our tree updates. An error here means
551 * none of the previous changes we made should roll
552 * forward. As a result, we have to record the buffers
553 * for this part of the tree in an array and reserve a
554 * journal write to them before making any changes. */
555 num_bhs
= le16_to_cpu(fe
->id2
.i_list
.l_tree_depth
);
556 eb_bhs
= kcalloc(num_bhs
, sizeof(struct buffer_head
*),
565 while(el
->l_tree_depth
) {
566 next_free
= le16_to_cpu(el
->l_next_free_rec
);
567 if (next_free
== 0) {
568 ocfs2_error(inode
->i_sb
,
569 "Dinode %"MLFu64
" has a bad "
571 OCFS2_I(inode
)->ip_blkno
);
575 next_blkno
= le64_to_cpu(el
->l_recs
[next_free
- 1].e_blkno
);
577 BUG_ON(i
>= num_bhs
);
578 status
= ocfs2_read_block(osb
, next_blkno
, &eb_bhs
[i
],
579 OCFS2_BH_CACHED
, inode
);
584 eb
= (struct ocfs2_extent_block
*) eb_bhs
[i
]->b_data
;
585 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb
)) {
586 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode
->i_sb
,
592 status
= ocfs2_journal_access(handle
, inode
, eb_bhs
[i
],
593 OCFS2_JOURNAL_ACCESS_WRITE
);
601 /* When we leave this loop, eb_bhs[num_bhs - 1] will
602 * hold the bottom-most leaf extent block. */
604 BUG_ON(el
->l_tree_depth
);
606 el
= &fe
->id2
.i_list
;
607 /* If we have tree depth, then the fe update is
608 * trivial, and we want to switch el out for the
609 * bottom-most leaf in order to update it with the
610 * actual extent data below. */
611 next_free
= le16_to_cpu(el
->l_next_free_rec
);
612 if (next_free
== 0) {
613 ocfs2_error(inode
->i_sb
,
614 "Dinode %"MLFu64
" has a bad "
616 OCFS2_I(inode
)->ip_blkno
);
620 le32_add_cpu(&el
->l_recs
[next_free
- 1].e_clusters
,
622 /* (num_bhs - 1) to avoid the leaf */
623 for(i
= 0; i
< (num_bhs
- 1); i
++) {
624 eb
= (struct ocfs2_extent_block
*) eb_bhs
[i
]->b_data
;
627 /* finally, make our actual change to the
628 * intermediate extent blocks. */
629 next_free
= le16_to_cpu(el
->l_next_free_rec
);
630 le32_add_cpu(&el
->l_recs
[next_free
- 1].e_clusters
,
633 status
= ocfs2_journal_dirty(handle
, eb_bhs
[i
]);
637 BUG_ON(i
!= (num_bhs
- 1));
638 /* note that the leaf block wasn't touched in
640 eb
= (struct ocfs2_extent_block
*) eb_bhs
[num_bhs
- 1]->b_data
;
642 BUG_ON(el
->l_tree_depth
);
645 /* yay, we can finally add the actual extent now! */
646 i
= le16_to_cpu(el
->l_next_free_rec
) - 1;
647 if (le16_to_cpu(el
->l_next_free_rec
) &&
648 ocfs2_extent_contig(inode
, &el
->l_recs
[i
], start_blk
)) {
649 le32_add_cpu(&el
->l_recs
[i
].e_clusters
, new_clusters
);
650 } else if (le16_to_cpu(el
->l_next_free_rec
) &&
651 (le32_to_cpu(el
->l_recs
[i
].e_clusters
) == 0)) {
652 /* having an empty extent at eof is legal. */
653 if (el
->l_recs
[i
].e_cpos
!= fe
->i_clusters
) {
654 ocfs2_error(inode
->i_sb
,
655 "Dinode %"MLFu64
" trailing extent is bad: "
656 "cpos (%u) != number of clusters (%u)",
657 le32_to_cpu(el
->l_recs
[i
].e_cpos
),
658 le32_to_cpu(fe
->i_clusters
));
662 el
->l_recs
[i
].e_blkno
= cpu_to_le64(start_blk
);
663 el
->l_recs
[i
].e_clusters
= cpu_to_le32(new_clusters
);
665 /* No contiguous record, or no empty record at eof, so
666 * we add a new one. */
668 BUG_ON(le16_to_cpu(el
->l_next_free_rec
) >=
669 le16_to_cpu(el
->l_count
));
670 i
= le16_to_cpu(el
->l_next_free_rec
);
672 el
->l_recs
[i
].e_blkno
= cpu_to_le64(start_blk
);
673 el
->l_recs
[i
].e_clusters
= cpu_to_le32(new_clusters
);
674 el
->l_recs
[i
].e_cpos
= fe
->i_clusters
;
675 le16_add_cpu(&el
->l_next_free_rec
, 1);
679 * extent_map errors are not fatal, so they are ignored outside
680 * of flushing the thing.
682 status
= ocfs2_extent_map_append(inode
, &el
->l_recs
[i
],
686 ocfs2_extent_map_drop(inode
, le32_to_cpu(fe
->i_clusters
));
689 status
= ocfs2_journal_dirty(handle
, fe_bh
);
692 if (fe
->id2
.i_list
.l_tree_depth
) {
693 status
= ocfs2_journal_dirty(handle
, eb_bhs
[num_bhs
- 1]);
701 for (i
= 0; i
< num_bhs
; i
++)
712 * Should only be called when there is no space left in any of the
713 * leaf nodes. What we want to do is find the lowest tree depth
714 * non-leaf extent block with room for new records. There are three
715 * valid results of this search:
717 * 1) a lowest extent block is found, then we pass it back in
718 * *lowest_eb_bh and return '0'
720 * 2) the search fails to find anything, but the dinode has room. We
721 * pass NULL back in *lowest_eb_bh, but still return '0'
723 * 3) the search fails to find anything AND the dinode is full, in
724 * which case we return > 0
726 * return status < 0 indicates an error.
728 static int ocfs2_find_branch_target(struct ocfs2_super
*osb
,
730 struct buffer_head
*fe_bh
,
731 struct buffer_head
**target_bh
)
735 struct ocfs2_dinode
*fe
;
736 struct ocfs2_extent_block
*eb
;
737 struct ocfs2_extent_list
*el
;
738 struct buffer_head
*bh
= NULL
;
739 struct buffer_head
*lowest_bh
= NULL
;
745 fe
= (struct ocfs2_dinode
*) fe_bh
->b_data
;
746 el
= &fe
->id2
.i_list
;
748 while(le16_to_cpu(el
->l_tree_depth
) > 1) {
749 if (le16_to_cpu(el
->l_next_free_rec
) == 0) {
750 ocfs2_error(inode
->i_sb
, "Dinode %"MLFu64
" has empty "
751 "extent list (next_free_rec == 0)",
752 OCFS2_I(inode
)->ip_blkno
);
756 i
= le16_to_cpu(el
->l_next_free_rec
) - 1;
757 blkno
= le64_to_cpu(el
->l_recs
[i
].e_blkno
);
759 ocfs2_error(inode
->i_sb
, "Dinode %"MLFu64
" has extent "
760 "list where extent # %d has no physical "
762 OCFS2_I(inode
)->ip_blkno
, i
);
772 status
= ocfs2_read_block(osb
, blkno
, &bh
, OCFS2_BH_CACHED
,
779 eb
= (struct ocfs2_extent_block
*) bh
->b_data
;
780 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb
)) {
781 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode
->i_sb
, eb
);
787 if (le16_to_cpu(el
->l_next_free_rec
) <
788 le16_to_cpu(el
->l_count
)) {
796 /* If we didn't find one and the fe doesn't have any room,
799 && (fe
->id2
.i_list
.l_next_free_rec
== fe
->id2
.i_list
.l_count
))
802 *target_bh
= lowest_bh
;
811 /* the caller needs to update fe->i_clusters */
812 int ocfs2_insert_extent(struct ocfs2_super
*osb
,
813 struct ocfs2_journal_handle
*handle
,
815 struct buffer_head
*fe_bh
,
818 struct ocfs2_alloc_context
*meta_ac
)
820 int status
, i
, shift
;
821 struct buffer_head
*last_eb_bh
= NULL
;
822 struct buffer_head
*bh
= NULL
;
823 struct ocfs2_dinode
*fe
;
824 struct ocfs2_extent_block
*eb
;
825 struct ocfs2_extent_list
*el
;
829 mlog(0, "add %u clusters starting at block %"MLFu64
" to "
831 new_clusters
, start_blk
, OCFS2_I(inode
)->ip_blkno
);
833 fe
= (struct ocfs2_dinode
*) fe_bh
->b_data
;
834 el
= &fe
->id2
.i_list
;
836 if (el
->l_tree_depth
) {
837 /* jump to end of tree */
838 status
= ocfs2_read_block(osb
, le64_to_cpu(fe
->i_last_eb_blk
),
839 &last_eb_bh
, OCFS2_BH_CACHED
, inode
);
844 eb
= (struct ocfs2_extent_block
*) last_eb_bh
->b_data
;
848 /* Can we allocate without adding/shifting tree bits? */
849 i
= le16_to_cpu(el
->l_next_free_rec
) - 1;
850 if (le16_to_cpu(el
->l_next_free_rec
) == 0
851 || (le16_to_cpu(el
->l_next_free_rec
) < le16_to_cpu(el
->l_count
))
852 || le32_to_cpu(el
->l_recs
[i
].e_clusters
) == 0
853 || ocfs2_extent_contig(inode
, &el
->l_recs
[i
], start_blk
))
856 mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
859 shift
= ocfs2_find_branch_target(osb
, inode
, fe_bh
, &bh
);
866 /* We traveled all the way to the bottom of the allocation tree
867 * and didn't find room for any more extents - we need to add
868 * another tree level */
870 /* if we hit a leaf, it had better have no free records left :) */
871 BUG_ON(le16_to_cpu(el
->l_next_free_rec
) !=
872 le16_to_cpu(el
->l_count
));
874 mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
876 le16_to_cpu(fe
->id2
.i_list
.l_tree_depth
));
878 /* ocfs2_shift_tree_depth will return us a buffer with
879 * the new extent block (so we can pass that to
880 * ocfs2_add_branch). */
881 status
= ocfs2_shift_tree_depth(osb
, handle
, inode
, fe_bh
,
887 /* Special case: we have room now if we shifted from
889 if (fe
->id2
.i_list
.l_tree_depth
== cpu_to_le16(1))
893 /* call ocfs2_add_branch to add the final part of the tree with
895 mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh
);
896 status
= ocfs2_add_branch(osb
, handle
, inode
, fe_bh
, bh
, last_eb_bh
,
904 /* Finally, we can add clusters. */
905 status
= ocfs2_do_insert_extent(osb
, handle
, inode
, fe_bh
,
906 start_blk
, new_clusters
);
921 static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super
*osb
)
923 struct buffer_head
*tl_bh
= osb
->osb_tl_bh
;
924 struct ocfs2_dinode
*di
;
925 struct ocfs2_truncate_log
*tl
;
927 di
= (struct ocfs2_dinode
*) tl_bh
->b_data
;
928 tl
= &di
->id2
.i_dealloc
;
930 mlog_bug_on_msg(le16_to_cpu(tl
->tl_used
) > le16_to_cpu(tl
->tl_count
),
931 "slot %d, invalid truncate log parameters: used = "
932 "%u, count = %u\n", osb
->slot_num
,
933 le16_to_cpu(tl
->tl_used
), le16_to_cpu(tl
->tl_count
));
934 return le16_to_cpu(tl
->tl_used
) == le16_to_cpu(tl
->tl_count
);
937 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log
*tl
,
938 unsigned int new_start
)
940 unsigned int tail_index
;
941 unsigned int current_tail
;
943 /* No records, nothing to coalesce */
944 if (!le16_to_cpu(tl
->tl_used
))
947 tail_index
= le16_to_cpu(tl
->tl_used
) - 1;
948 current_tail
= le32_to_cpu(tl
->tl_recs
[tail_index
].t_start
);
949 current_tail
+= le32_to_cpu(tl
->tl_recs
[tail_index
].t_clusters
);
951 return current_tail
== new_start
;
954 static int ocfs2_truncate_log_append(struct ocfs2_super
*osb
,
955 struct ocfs2_journal_handle
*handle
,
957 unsigned int num_clusters
)
960 unsigned int start_cluster
, tl_count
;
961 struct inode
*tl_inode
= osb
->osb_tl_inode
;
962 struct buffer_head
*tl_bh
= osb
->osb_tl_bh
;
963 struct ocfs2_dinode
*di
;
964 struct ocfs2_truncate_log
*tl
;
966 mlog_entry("start_blk = %"MLFu64
", num_clusters = %u\n", start_blk
,
969 BUG_ON(mutex_trylock(&tl_inode
->i_mutex
));
971 start_cluster
= ocfs2_blocks_to_clusters(osb
->sb
, start_blk
);
973 di
= (struct ocfs2_dinode
*) tl_bh
->b_data
;
974 tl
= &di
->id2
.i_dealloc
;
975 if (!OCFS2_IS_VALID_DINODE(di
)) {
976 OCFS2_RO_ON_INVALID_DINODE(osb
->sb
, di
);
981 tl_count
= le16_to_cpu(tl
->tl_count
);
982 mlog_bug_on_msg(tl_count
> ocfs2_truncate_recs_per_inode(osb
->sb
) ||
984 "Truncate record count on #%"MLFu64
" invalid ("
985 "wanted %u, actual %u\n", OCFS2_I(tl_inode
)->ip_blkno
,
986 ocfs2_truncate_recs_per_inode(osb
->sb
),
987 le16_to_cpu(tl
->tl_count
));
989 /* Caller should have known to flush before calling us. */
990 index
= le16_to_cpu(tl
->tl_used
);
991 if (index
>= tl_count
) {
997 status
= ocfs2_journal_access(handle
, tl_inode
, tl_bh
,
998 OCFS2_JOURNAL_ACCESS_WRITE
);
1004 mlog(0, "Log truncate of %u clusters starting at cluster %u to "
1005 "%"MLFu64
" (index = %d)\n", num_clusters
, start_cluster
,
1006 OCFS2_I(tl_inode
)->ip_blkno
, index
);
1008 if (ocfs2_truncate_log_can_coalesce(tl
, start_cluster
)) {
1010 * Move index back to the record we are coalescing with.
1011 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
1015 num_clusters
+= le32_to_cpu(tl
->tl_recs
[index
].t_clusters
);
1016 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
1017 index
, le32_to_cpu(tl
->tl_recs
[index
].t_start
),
1020 tl
->tl_recs
[index
].t_start
= cpu_to_le32(start_cluster
);
1021 tl
->tl_used
= cpu_to_le16(index
+ 1);
1023 tl
->tl_recs
[index
].t_clusters
= cpu_to_le32(num_clusters
);
1025 status
= ocfs2_journal_dirty(handle
, tl_bh
);
1036 static int ocfs2_replay_truncate_records(struct ocfs2_super
*osb
,
1037 struct ocfs2_journal_handle
*handle
,
1038 struct inode
*data_alloc_inode
,
1039 struct buffer_head
*data_alloc_bh
)
1043 unsigned int num_clusters
;
1045 struct ocfs2_truncate_rec rec
;
1046 struct ocfs2_dinode
*di
;
1047 struct ocfs2_truncate_log
*tl
;
1048 struct inode
*tl_inode
= osb
->osb_tl_inode
;
1049 struct buffer_head
*tl_bh
= osb
->osb_tl_bh
;
1053 di
= (struct ocfs2_dinode
*) tl_bh
->b_data
;
1054 tl
= &di
->id2
.i_dealloc
;
1055 i
= le16_to_cpu(tl
->tl_used
) - 1;
1057 /* Caller has given us at least enough credits to
1058 * update the truncate log dinode */
1059 status
= ocfs2_journal_access(handle
, tl_inode
, tl_bh
,
1060 OCFS2_JOURNAL_ACCESS_WRITE
);
1066 tl
->tl_used
= cpu_to_le16(i
);
1068 status
= ocfs2_journal_dirty(handle
, tl_bh
);
1074 /* TODO: Perhaps we can calculate the bulk of the
1075 * credits up front rather than extending like
1077 status
= ocfs2_extend_trans(handle
,
1078 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC
);
1084 rec
= tl
->tl_recs
[i
];
1085 start_blk
= ocfs2_clusters_to_blocks(data_alloc_inode
->i_sb
,
1086 le32_to_cpu(rec
.t_start
));
1087 num_clusters
= le32_to_cpu(rec
.t_clusters
);
1089 /* if start_blk is not set, we ignore the record as
1092 mlog(0, "free record %d, start = %u, clusters = %u\n",
1093 i
, le32_to_cpu(rec
.t_start
), num_clusters
);
1095 status
= ocfs2_free_clusters(handle
, data_alloc_inode
,
1096 data_alloc_bh
, start_blk
,
1111 /* Expects you to already be holding tl_inode->i_mutex */
1112 static int __ocfs2_flush_truncate_log(struct ocfs2_super
*osb
)
1115 unsigned int num_to_flush
;
1116 struct ocfs2_journal_handle
*handle
= NULL
;
1117 struct inode
*tl_inode
= osb
->osb_tl_inode
;
1118 struct inode
*data_alloc_inode
= NULL
;
1119 struct buffer_head
*tl_bh
= osb
->osb_tl_bh
;
1120 struct buffer_head
*data_alloc_bh
= NULL
;
1121 struct ocfs2_dinode
*di
;
1122 struct ocfs2_truncate_log
*tl
;
1126 BUG_ON(mutex_trylock(&tl_inode
->i_mutex
));
1128 di
= (struct ocfs2_dinode
*) tl_bh
->b_data
;
1129 tl
= &di
->id2
.i_dealloc
;
1130 if (!OCFS2_IS_VALID_DINODE(di
)) {
1131 OCFS2_RO_ON_INVALID_DINODE(osb
->sb
, di
);
1136 num_to_flush
= le16_to_cpu(tl
->tl_used
);
1137 mlog(0, "Flush %u records from truncate log #%"MLFu64
"\n",
1138 num_to_flush
, OCFS2_I(tl_inode
)->ip_blkno
);
1139 if (!num_to_flush
) {
1144 handle
= ocfs2_alloc_handle(osb
);
1151 data_alloc_inode
= ocfs2_get_system_file_inode(osb
,
1152 GLOBAL_BITMAP_SYSTEM_INODE
,
1153 OCFS2_INVALID_SLOT
);
1154 if (!data_alloc_inode
) {
1156 mlog(ML_ERROR
, "Could not get bitmap inode!\n");
1160 ocfs2_handle_add_inode(handle
, data_alloc_inode
);
1161 status
= ocfs2_meta_lock(data_alloc_inode
, handle
, &data_alloc_bh
, 1);
1167 handle
= ocfs2_start_trans(osb
, handle
, OCFS2_TRUNCATE_LOG_UPDATE
);
1168 if (IS_ERR(handle
)) {
1169 status
= PTR_ERR(handle
);
1175 status
= ocfs2_replay_truncate_records(osb
, handle
, data_alloc_inode
,
1184 ocfs2_commit_trans(handle
);
1186 if (data_alloc_inode
)
1187 iput(data_alloc_inode
);
1190 brelse(data_alloc_bh
);
1196 int ocfs2_flush_truncate_log(struct ocfs2_super
*osb
)
1199 struct inode
*tl_inode
= osb
->osb_tl_inode
;
1201 mutex_lock(&tl_inode
->i_mutex
);
1202 status
= __ocfs2_flush_truncate_log(osb
);
1203 mutex_unlock(&tl_inode
->i_mutex
);
1208 static void ocfs2_truncate_log_worker(void *data
)
1211 struct ocfs2_super
*osb
= data
;
1215 status
= ocfs2_flush_truncate_log(osb
);
1222 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
1223 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super
*osb
,
1226 if (osb
->osb_tl_inode
) {
1227 /* We want to push off log flushes while truncates are
1230 cancel_delayed_work(&osb
->osb_truncate_log_wq
);
1232 queue_delayed_work(ocfs2_wq
, &osb
->osb_truncate_log_wq
,
1233 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL
);
1237 static int ocfs2_get_truncate_log_info(struct ocfs2_super
*osb
,
1239 struct inode
**tl_inode
,
1240 struct buffer_head
**tl_bh
)
1243 struct inode
*inode
= NULL
;
1244 struct buffer_head
*bh
= NULL
;
1246 inode
= ocfs2_get_system_file_inode(osb
,
1247 TRUNCATE_LOG_SYSTEM_INODE
,
1251 mlog(ML_ERROR
, "Could not get load truncate log inode!\n");
1255 status
= ocfs2_read_block(osb
, OCFS2_I(inode
)->ip_blkno
, &bh
,
1256 OCFS2_BH_CACHED
, inode
);
1270 /* called during the 1st stage of node recovery. we stamp a clean
1271 * truncate log and pass back a copy for processing later. if the
1272 * truncate log does not require processing, *tl_copy is set to
1274 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super
*osb
,
1276 struct ocfs2_dinode
**tl_copy
)
1279 struct inode
*tl_inode
= NULL
;
1280 struct buffer_head
*tl_bh
= NULL
;
1281 struct ocfs2_dinode
*di
;
1282 struct ocfs2_truncate_log
*tl
;
1286 mlog(0, "recover truncate log from slot %d\n", slot_num
);
1288 status
= ocfs2_get_truncate_log_info(osb
, slot_num
, &tl_inode
, &tl_bh
);
1294 di
= (struct ocfs2_dinode
*) tl_bh
->b_data
;
1295 tl
= &di
->id2
.i_dealloc
;
1296 if (!OCFS2_IS_VALID_DINODE(di
)) {
1297 OCFS2_RO_ON_INVALID_DINODE(tl_inode
->i_sb
, di
);
1302 if (le16_to_cpu(tl
->tl_used
)) {
1303 mlog(0, "We'll have %u logs to recover\n",
1304 le16_to_cpu(tl
->tl_used
));
1306 *tl_copy
= kmalloc(tl_bh
->b_size
, GFP_KERNEL
);
1313 /* Assuming the write-out below goes well, this copy
1314 * will be passed back to recovery for processing. */
1315 memcpy(*tl_copy
, tl_bh
->b_data
, tl_bh
->b_size
);
1317 /* All we need to do to clear the truncate log is set
1321 status
= ocfs2_write_block(osb
, tl_bh
, tl_inode
);
1334 if (status
< 0 && (*tl_copy
)) {
1343 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super
*osb
,
1344 struct ocfs2_dinode
*tl_copy
)
1348 unsigned int clusters
, num_recs
, start_cluster
;
1350 struct ocfs2_journal_handle
*handle
;
1351 struct inode
*tl_inode
= osb
->osb_tl_inode
;
1352 struct ocfs2_truncate_log
*tl
;
1356 if (OCFS2_I(tl_inode
)->ip_blkno
== le64_to_cpu(tl_copy
->i_blkno
)) {
1357 mlog(ML_ERROR
, "Asked to recover my own truncate log!\n");
1361 tl
= &tl_copy
->id2
.i_dealloc
;
1362 num_recs
= le16_to_cpu(tl
->tl_used
);
1363 mlog(0, "cleanup %u records from %"MLFu64
"\n", num_recs
,
1366 mutex_lock(&tl_inode
->i_mutex
);
1367 for(i
= 0; i
< num_recs
; i
++) {
1368 if (ocfs2_truncate_log_needs_flush(osb
)) {
1369 status
= __ocfs2_flush_truncate_log(osb
);
1376 handle
= ocfs2_start_trans(osb
, NULL
,
1377 OCFS2_TRUNCATE_LOG_UPDATE
);
1378 if (IS_ERR(handle
)) {
1379 status
= PTR_ERR(handle
);
1384 clusters
= le32_to_cpu(tl
->tl_recs
[i
].t_clusters
);
1385 start_cluster
= le32_to_cpu(tl
->tl_recs
[i
].t_start
);
1386 start_blk
= ocfs2_clusters_to_blocks(osb
->sb
, start_cluster
);
1388 status
= ocfs2_truncate_log_append(osb
, handle
,
1389 start_blk
, clusters
);
1390 ocfs2_commit_trans(handle
);
1398 mutex_unlock(&tl_inode
->i_mutex
);
1404 void ocfs2_truncate_log_shutdown(struct ocfs2_super
*osb
)
1407 struct inode
*tl_inode
= osb
->osb_tl_inode
;
1412 cancel_delayed_work(&osb
->osb_truncate_log_wq
);
1413 flush_workqueue(ocfs2_wq
);
1415 status
= ocfs2_flush_truncate_log(osb
);
1419 brelse(osb
->osb_tl_bh
);
1420 iput(osb
->osb_tl_inode
);
1426 int ocfs2_truncate_log_init(struct ocfs2_super
*osb
)
1429 struct inode
*tl_inode
= NULL
;
1430 struct buffer_head
*tl_bh
= NULL
;
1434 status
= ocfs2_get_truncate_log_info(osb
,
1441 /* ocfs2_truncate_log_shutdown keys on the existence of
1442 * osb->osb_tl_inode so we don't set any of the osb variables
1443 * until we're sure all is well. */
1444 INIT_WORK(&osb
->osb_truncate_log_wq
, ocfs2_truncate_log_worker
, osb
);
1445 osb
->osb_tl_bh
= tl_bh
;
1446 osb
->osb_tl_inode
= tl_inode
;
1452 /* This function will figure out whether the currently last extent
1453 * block will be deleted, and if it will, what the new last extent
1454 * block will be so we can update his h_next_leaf_blk field, as well
1455 * as the dinode's i_last_eb_blk */
1456 static int ocfs2_find_new_last_ext_blk(struct ocfs2_super
*osb
,
1457 struct inode
*inode
,
1458 struct ocfs2_dinode
*fe
,
1460 struct buffer_head
*old_last_eb
,
1461 struct buffer_head
**new_last_eb
)
1465 struct ocfs2_extent_block
*eb
;
1466 struct ocfs2_extent_list
*el
;
1467 struct buffer_head
*bh
= NULL
;
1469 *new_last_eb
= NULL
;
1471 if (!OCFS2_IS_VALID_DINODE(fe
)) {
1472 OCFS2_RO_ON_INVALID_DINODE(inode
->i_sb
, fe
);
1477 /* we have no tree, so of course, no last_eb. */
1478 if (!fe
->id2
.i_list
.l_tree_depth
)
1481 /* trunc to zero special case - this makes tree_depth = 0
1482 * regardless of what it is. */
1483 if (!new_i_clusters
)
1486 eb
= (struct ocfs2_extent_block
*) old_last_eb
->b_data
;
1488 BUG_ON(!el
->l_next_free_rec
);
1490 /* Make sure that this guy will actually be empty after we
1491 * clear away the data. */
1492 if (le32_to_cpu(el
->l_recs
[0].e_cpos
) < new_i_clusters
)
1495 /* Ok, at this point, we know that last_eb will definitely
1496 * change, so lets traverse the tree and find the second to
1497 * last extent block. */
1498 el
= &(fe
->id2
.i_list
);
1499 /* go down the tree, */
1501 for(i
= (le16_to_cpu(el
->l_next_free_rec
) - 1); i
>= 0; i
--) {
1502 if (le32_to_cpu(el
->l_recs
[i
].e_cpos
) <
1504 block
= le64_to_cpu(el
->l_recs
[i
].e_blkno
);
1515 status
= ocfs2_read_block(osb
, block
, &bh
, OCFS2_BH_CACHED
,
1521 eb
= (struct ocfs2_extent_block
*) bh
->b_data
;
1523 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb
)) {
1524 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode
->i_sb
, eb
);
1528 } while (el
->l_tree_depth
);
1531 get_bh(*new_last_eb
);
1532 mlog(0, "returning block %"MLFu64
"\n", le64_to_cpu(eb
->h_blkno
));
/*
 * Remove 'clusters_to_del' clusters from the right-hand edge of the
 * inode's extent tree, inside the transaction 'handle'.
 *
 * The dinode in fe_bh is opened for journal write access and updated
 * first: i_clusters, mtime, the rightmost record of its extent list
 * and i_last_eb_blk.  OCFS2_I(inode)->ip_clusters is updated under
 * ip_lock.  The extent block reached through next_eb is then read,
 * validated, trimmed by the same amount and dirtied; an extent block
 * whose l_next_free_rec drops to zero is removed from the cache and
 * passed to ocfs2_free_extent_block() using the allocator inode and
 * buffer head stored in 'tc'.  The freed range starting at delete_blk
 * is handed to ocfs2_truncate_log_append(), and the inode's extent map
 * is updated through ocfs2_extent_map_trunc() / ocfs2_extent_map_drop().
 */
1540 static int ocfs2_do_truncate(struct ocfs2_super
*osb
,
1541 unsigned int clusters_to_del
,
1542 struct inode
*inode
,
1543 struct buffer_head
*fe_bh
,
1544 struct buffer_head
*old_last_eb_bh
,
1545 struct ocfs2_journal_handle
*handle
,
1546 struct ocfs2_truncate_context
*tc
)
1548 int status
, i
, depth
;
1549 struct ocfs2_dinode
*fe
;
1550 struct ocfs2_extent_block
*eb
;
1551 struct ocfs2_extent_block
*last_eb
= NULL
;
1552 struct ocfs2_extent_list
*el
;
1553 struct buffer_head
*eb_bh
= NULL
;
1554 struct buffer_head
*last_eb_bh
= NULL
;
1558 fe
= (struct ocfs2_dinode
*) fe_bh
->b_data
;
1560 status
= ocfs2_find_new_last_ext_blk(osb
,
1563 le32_to_cpu(fe
->i_clusters
) -
1572 last_eb
= (struct ocfs2_extent_block
*) last_eb_bh
->b_data
;
1574 status
= ocfs2_journal_access(handle
, inode
, fe_bh
,
1575 OCFS2_JOURNAL_ACCESS_WRITE
);
1580 el
= &(fe
->id2
.i_list
);
/* Update the in-memory cluster count while holding ip_lock. */
1582 spin_lock(&OCFS2_I(inode
)->ip_lock
);
1583 OCFS2_I(inode
)->ip_clusters
= le32_to_cpu(fe
->i_clusters
) -
1585 spin_unlock(&OCFS2_I(inode
)->ip_lock
);
1586 le32_add_cpu(&fe
->i_clusters
, -clusters_to_del
);
1587 fe
->i_mtime
= cpu_to_le64(CURRENT_TIME
.tv_sec
);
1588 fe
->i_mtime_nsec
= cpu_to_le32(CURRENT_TIME
.tv_nsec
);
/* Trim the rightmost used record of the dinode's extent list. */
1590 i
= le16_to_cpu(el
->l_next_free_rec
) - 1;
1592 BUG_ON(le32_to_cpu(el
->l_recs
[i
].e_clusters
) < clusters_to_del
);
1593 le32_add_cpu(&el
->l_recs
[i
].e_clusters
, -clusters_to_del
);
1594 /* tree depth zero, we can just delete the clusters, otherwise
1595 * we need to record the offset of the next level extent block
1596 * as we may overwrite it. */
1597 if (!el
->l_tree_depth
)
1598 delete_blk
= le64_to_cpu(el
->l_recs
[i
].e_blkno
)
1599 + ocfs2_clusters_to_blocks(osb
->sb
,
1600 le32_to_cpu(el
->l_recs
[i
].e_clusters
));
1602 next_eb
= le64_to_cpu(el
->l_recs
[i
].e_blkno
);
1604 if (!el
->l_recs
[i
].e_clusters
) {
1605 /* if we deleted the whole extent record, then clear
1606 * out the other fields and update the extent
1607 * list. For depth > 0 trees, we've already recorded
1608 * the extent block in 'next_eb' */
1609 el
->l_recs
[i
].e_cpos
= 0;
1610 el
->l_recs
[i
].e_blkno
= 0;
1611 BUG_ON(!el
->l_next_free_rec
);
1612 le16_add_cpu(&el
->l_next_free_rec
, -1);
1615 depth
= le16_to_cpu(el
->l_tree_depth
);
1616 if (!fe
->i_clusters
) {
1617 /* trunc to zero is a special case. */
1618 el
->l_tree_depth
= 0;
1619 fe
->i_last_eb_blk
= 0;
1621 fe
->i_last_eb_blk
= last_eb
->h_blkno
;
1623 status
= ocfs2_journal_dirty(handle
, fe_bh
);
1630 /* If there will be a new last extent block, then by
1631 * definition, there cannot be any leaves to the right of
1633 status
= ocfs2_journal_access(handle
, inode
, last_eb_bh
,
1634 OCFS2_JOURNAL_ACCESS_WRITE
);
1639 last_eb
->h_next_leaf_blk
= 0;
1640 status
= ocfs2_journal_dirty(handle
, last_eb_bh
);
1647 /* if our tree depth > 0, update all the tree blocks below us. */
1649 mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64
")\n",
1651 status
= ocfs2_read_block(osb
, next_eb
, &eb_bh
,
1652 OCFS2_BH_CACHED
, inode
);
1657 eb
= (struct ocfs2_extent_block
*)eb_bh
->b_data
;
1658 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb
)) {
1659 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode
->i_sb
, eb
);
1665 status
= ocfs2_journal_access(handle
, inode
, eb_bh
,
1666 OCFS2_JOURNAL_ACCESS_WRITE
);
/* Sanity checks: the list must be non-empty and exactly one level
 * below the list we came from. */
1672 BUG_ON(le16_to_cpu(el
->l_next_free_rec
) == 0);
1673 BUG_ON(depth
!= (le16_to_cpu(el
->l_tree_depth
) + 1));
1675 i
= le16_to_cpu(el
->l_next_free_rec
) - 1;
1677 mlog(0, "extent block %"MLFu64
", before: record %d: "
1678 "(%u, %u, %"MLFu64
"), next = %u\n",
1679 le64_to_cpu(eb
->h_blkno
), i
,
1680 le32_to_cpu(el
->l_recs
[i
].e_cpos
),
1681 le32_to_cpu(el
->l_recs
[i
].e_clusters
),
1682 le64_to_cpu(el
->l_recs
[i
].e_blkno
),
1683 le16_to_cpu(el
->l_next_free_rec
));
1685 BUG_ON(le32_to_cpu(el
->l_recs
[i
].e_clusters
) < clusters_to_del
);
1686 le32_add_cpu(&el
->l_recs
[i
].e_clusters
, -clusters_to_del
);
1688 next_eb
= le64_to_cpu(el
->l_recs
[i
].e_blkno
);
1689 /* bottom-most block requires us to delete data.*/
1690 if (!el
->l_tree_depth
)
1691 delete_blk
= le64_to_cpu(el
->l_recs
[i
].e_blkno
)
1692 + ocfs2_clusters_to_blocks(osb
->sb
,
1693 le32_to_cpu(el
->l_recs
[i
].e_clusters
));
/* A record trimmed to zero clusters is cleared and removed from the list. */
1694 if (!el
->l_recs
[i
].e_clusters
) {
1695 el
->l_recs
[i
].e_cpos
= 0;
1696 el
->l_recs
[i
].e_blkno
= 0;
1697 BUG_ON(!el
->l_next_free_rec
);
1698 le16_add_cpu(&el
->l_next_free_rec
, -1);
1700 mlog(0, "extent block %"MLFu64
", after: record %d: "
1701 "(%u, %u, %"MLFu64
"), next = %u\n",
1702 le64_to_cpu(eb
->h_blkno
), i
,
1703 le32_to_cpu(el
->l_recs
[i
].e_cpos
),
1704 le32_to_cpu(el
->l_recs
[i
].e_clusters
),
1705 le64_to_cpu(el
->l_recs
[i
].e_blkno
),
1706 le16_to_cpu(el
->l_next_free_rec
));
1708 status
= ocfs2_journal_dirty(handle
, eb_bh
);
1714 if (!el
->l_next_free_rec
) {
1715 mlog(0, "deleting this extent block.\n");
1717 ocfs2_remove_from_cache(inode
, eb_bh
);
/* The block being freed must hold no live record in slot 0. */
1719 BUG_ON(eb
->h_suballoc_slot
);
1720 BUG_ON(el
->l_recs
[0].e_clusters
);
1721 BUG_ON(el
->l_recs
[0].e_cpos
);
1722 BUG_ON(el
->l_recs
[0].e_blkno
);
1723 status
= ocfs2_free_extent_block(handle
,
1724 tc
->tc_ext_alloc_inode
,
1725 tc
->tc_ext_alloc_bh
,
1737 BUG_ON(!delete_blk
);
1738 status
= ocfs2_truncate_log_append(osb
, handle
, delete_blk
,
1747 ocfs2_extent_map_trunc(inode
, le32_to_cpu(fe
->i_clusters
));
1749 ocfs2_extent_map_drop(inode
, 0);
/*
1755 * It is expected that by the time you call this function,
1756 * inode->i_size and fe->i_size have been adjusted.
1758 * WARNING: This will kfree the truncate context.
 */
/*
 * Shrink the inode's cluster allocation to match i_size_read(inode).
 *
 * Holds OCFS2_I(inode)->ip_alloc_sem for writing and computes
 * target_i_clusters from the inode size.  The last extent block buffer
 * head is taken from tc->tc_last_eb_bh (the context's pointer is then
 * cleared) and re-read if fe->i_last_eb_blk no longer matches last_eb.
 * clusters_to_del for the pass is worked out from the rightmost record
 * of the bottom-most extent list.  With the truncate log inode's
 * i_mutex held, the truncate log is flushed when
 * ocfs2_truncate_log_needs_flush() reports it, a transaction sized by
 * ocfs2_calc_tree_trunc_credits() is started, ctime/mtime are stamped
 * and the inode marked dirty, and ocfs2_do_truncate() performs the
 * removal.  The truncate context 'tc' is released through
 * ocfs2_free_truncate_context().
 */
1760 int ocfs2_commit_truncate(struct ocfs2_super
*osb
,
1761 struct inode
*inode
,
1762 struct buffer_head
*fe_bh
,
1763 struct ocfs2_truncate_context
*tc
)
1765 int status
, i
, credits
, tl_sem
= 0;
1766 u32 clusters_to_del
, target_i_clusters
;
1768 struct ocfs2_dinode
*fe
;
1769 struct ocfs2_extent_block
*eb
;
1770 struct ocfs2_extent_list
*el
;
1771 struct buffer_head
*last_eb_bh
;
1772 struct ocfs2_journal_handle
*handle
= NULL
;
1773 struct inode
*tl_inode
= osb
->osb_tl_inode
;
1777 down_write(&OCFS2_I(inode
)->ip_alloc_sem
);
1779 target_i_clusters
= ocfs2_clusters_for_bytes(osb
->sb
,
1780 i_size_read(inode
));
/* Take over the buffer head cached in the context and clear the
 * context's pointer to it. */
1782 last_eb_bh
= tc
->tc_last_eb_bh
;
1783 tc
->tc_last_eb_bh
= NULL
;
1785 fe
= (struct ocfs2_dinode
*) fe_bh
->b_data
;
1787 if (fe
->id2
.i_list
.l_tree_depth
) {
1788 eb
= (struct ocfs2_extent_block
*) last_eb_bh
->b_data
;
1791 el
= &fe
->id2
.i_list
;
1792 last_eb
= le64_to_cpu(fe
->i_last_eb_blk
);
1794 mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
1795 "last_eb = %"MLFu64
", fe->i_last_eb_blk = %"MLFu64
", "
1796 "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
1797 le32_to_cpu(fe
->i_clusters
), last_eb
,
1798 le64_to_cpu(fe
->i_last_eb_blk
),
1799 le16_to_cpu(fe
->id2
.i_list
.l_tree_depth
), last_eb_bh
);
1801 if (last_eb
!= le64_to_cpu(fe
->i_last_eb_blk
)) {
1802 mlog(0, "last_eb changed!\n");
1803 BUG_ON(!fe
->id2
.i_list
.l_tree_depth
);
1804 last_eb
= le64_to_cpu(fe
->i_last_eb_blk
);
1805 /* i_last_eb_blk may have changed, read it if
1806 * necessary. We don't have to worry about the
1807 * truncate to zero case here (where there becomes no
1808 * last_eb) because we never loop back after our work
1815 status
= ocfs2_read_block(osb
, last_eb
,
1816 &last_eb_bh
, OCFS2_BH_CACHED
,
1822 eb
= (struct ocfs2_extent_block
*) last_eb_bh
->b_data
;
1823 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb
)) {
1824 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode
->i_sb
, eb
);
1831 /* by now, el will point to the extent list on the bottom most
1832 * portion of this tree. */
1833 i
= le16_to_cpu(el
->l_next_free_rec
) - 1;
/* If the rightmost record starts at or past the target, it is removed
 * in full. */
1834 if (le32_to_cpu(el
->l_recs
[i
].e_cpos
) >= target_i_clusters
)
1835 clusters_to_del
= le32_to_cpu(el
->l_recs
[i
].e_clusters
);
1837 clusters_to_del
= (le32_to_cpu(el
->l_recs
[i
].e_clusters
) +
1838 le32_to_cpu(el
->l_recs
[i
].e_cpos
)) -
1841 mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del
);
1843 mutex_lock(&tl_inode
->i_mutex
);
1845 /* ocfs2_truncate_log_needs_flush guarantees us at least one
1846 * record is free for use. If there isn't any, we flush to get
1847 * an empty truncate log. */
1848 if (ocfs2_truncate_log_needs_flush(osb
)) {
1849 status
= __ocfs2_flush_truncate_log(osb
);
1856 credits
= ocfs2_calc_tree_trunc_credits(osb
->sb
, clusters_to_del
,
1858 handle
= ocfs2_start_trans(osb
, NULL
, credits
);
1859 if (IS_ERR(handle
)) {
1860 status
= PTR_ERR(handle
);
1866 inode
->i_ctime
= inode
->i_mtime
= CURRENT_TIME
;
1867 status
= ocfs2_mark_inode_dirty(handle
, inode
, fe_bh
);
1871 status
= ocfs2_do_truncate(osb
, clusters_to_del
, inode
, fe_bh
,
1872 last_eb_bh
, handle
, tc
);
1878 mutex_unlock(&tl_inode
->i_mutex
);
1881 ocfs2_commit_trans(handle
);
/* The dinode must never end up with fewer clusters than the target. */
1884 BUG_ON(le32_to_cpu(fe
->i_clusters
) < target_i_clusters
);
1885 if (le32_to_cpu(fe
->i_clusters
) > target_i_clusters
)
1888 up_write(&OCFS2_I(inode
)->ip_alloc_sem
);
1890 ocfs2_schedule_truncate_log_flush(osb
, 1);
1893 mutex_unlock(&tl_inode
->i_mutex
);
1896 ocfs2_commit_trans(handle
);
1901 /* This will drop the ext_alloc cluster lock for us */
1902 ocfs2_free_truncate_context(tc
);
/*
1910 * Expects the inode to already be locked. This will figure out which
1911 * inodes need to be locked and will put them on the returned truncate
 * context.
 */
/*
 * Build the truncate context returned through '*tc' for a pending
 * truncate of 'inode'.
 *
 * new_i_clusters is computed from i_size_read(inode).  If the dinode's
 * i_clusters is not greater than new_i_clusters, ocfs2_error() is
 * reported and the meta lock LVB is logged.  *tc is allocated zeroed
 * with kcalloc().  When the dinode has an extent tree (l_tree_depth is
 * non-zero) the block named by fe->i_last_eb_blk is read and validated,
 * and metadata_delete is set when that list's first record starts at
 * or beyond new_i_clusters.  The buffer head is stored in
 * (*tc)->tc_last_eb_bh.  If metadata may be deleted, the
 * EXTENT_ALLOC_SYSTEM_INODE system inode is looked up, its i_mutex
 * taken and ocfs2_meta_lock() called; the inode, its buffer head and a
 * "locked" flag are recorded in *tc.  ocfs2_free_truncate_context() is
 * called on *tc further down, presumably on the error path -- TODO
 * confirm.
 */
1914 int ocfs2_prepare_truncate(struct ocfs2_super
*osb
,
1915 struct inode
*inode
,
1916 struct buffer_head
*fe_bh
,
1917 struct ocfs2_truncate_context
**tc
)
1919 int status
, metadata_delete
;
1920 unsigned int new_i_clusters
;
1921 struct ocfs2_dinode
*fe
;
1922 struct ocfs2_extent_block
*eb
;
1923 struct ocfs2_extent_list
*el
;
1924 struct buffer_head
*last_eb_bh
= NULL
;
1925 struct inode
*ext_alloc_inode
= NULL
;
1926 struct buffer_head
*ext_alloc_bh
= NULL
;
1932 new_i_clusters
= ocfs2_clusters_for_bytes(osb
->sb
,
1933 i_size_read(inode
));
1934 fe
= (struct ocfs2_dinode
*) fe_bh
->b_data
;
/* NOTE(review): fe->i_clusters and fe->i_size are passed to mlog()
 * here without le32_to_cpu()/le64_to_cpu(), unlike the ocfs2_error()
 * call below -- confirm whether the raw on-disk values are intended. */
1936 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
1937 "%"MLFu64
"\n", fe
->i_clusters
, new_i_clusters
, fe
->i_size
);
1939 if (le32_to_cpu(fe
->i_clusters
) <= new_i_clusters
) {
1940 ocfs2_error(inode
->i_sb
, "Dinode %"MLFu64
" has cluster count "
1941 "%u and size %"MLFu64
" whereas struct inode has "
1942 "cluster count %u and size %llu which caused an "
1943 "invalid truncate to %u clusters.",
1944 le64_to_cpu(fe
->i_blkno
),
1945 le32_to_cpu(fe
->i_clusters
),
1946 le64_to_cpu(fe
->i_size
),
1947 OCFS2_I(inode
)->ip_clusters
, i_size_read(inode
),
1949 mlog_meta_lvb(ML_ERROR
, &OCFS2_I(inode
)->ip_meta_lockres
);
/* Zero-filled allocation, so every tc_* field starts out as 0/NULL. */
1954 *tc
= kcalloc(1, sizeof(struct ocfs2_truncate_context
), GFP_KERNEL
);
1961 metadata_delete
= 0;
1962 if (fe
->id2
.i_list
.l_tree_depth
) {
1963 /* If we have a tree, then the truncate may result in
1964 * metadata deletes. Figure this out from the
1965 * rightmost leaf block.*/
1966 status
= ocfs2_read_block(osb
, le64_to_cpu(fe
->i_last_eb_blk
),
1967 &last_eb_bh
, OCFS2_BH_CACHED
, inode
);
1972 eb
= (struct ocfs2_extent_block
*) last_eb_bh
->b_data
;
1973 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb
)) {
1974 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode
->i_sb
, eb
);
1981 if (le32_to_cpu(el
->l_recs
[0].e_cpos
) >= new_i_clusters
)
1982 metadata_delete
= 1;
1985 (*tc
)->tc_last_eb_bh
= last_eb_bh
;
1987 if (metadata_delete
) {
1988 mlog(0, "Will have to delete metadata for this trunc. "
1989 "locking allocator.\n");
1990 ext_alloc_inode
= ocfs2_get_system_file_inode(osb
, EXTENT_ALLOC_SYSTEM_INODE
, 0);
1991 if (!ext_alloc_inode
) {
1997 mutex_lock(&ext_alloc_inode
->i_mutex
);
1998 (*tc
)->tc_ext_alloc_inode
= ext_alloc_inode
;
2000 status
= ocfs2_meta_lock(ext_alloc_inode
,
2008 (*tc
)->tc_ext_alloc_bh
= ext_alloc_bh
;
2009 (*tc
)->tc_ext_alloc_locked
= 1;
2016 ocfs2_free_truncate_context(*tc
);
/*
 * Release the resources recorded in a truncate context.
 *
 * If an extent allocator inode was recorded, its meta lock is dropped
 * when tc_ext_alloc_locked is set, its i_mutex is unlocked and the
 * inode reference is put.  The allocator buffer head and the last
 * extent block buffer head are released when present.
 */
2023 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context
*tc
)
2025 if (tc
->tc_ext_alloc_inode
) {
2026 if (tc
->tc_ext_alloc_locked
)
2027 ocfs2_meta_unlock(tc
->tc_ext_alloc_inode
, 1);
2029 mutex_unlock(&tc
->tc_ext_alloc_inode
->i_mutex
);
2030 iput(tc
->tc_ext_alloc_inode
);
2033 if (tc
->tc_ext_alloc_bh
)
2034 brelse(tc
->tc_ext_alloc_bh
);
2036 if (tc
->tc_last_eb_bh
)
2037 brelse(tc
->tc_last_eb_bh
);