2 * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
3 * Written by Takashi Sato <t-sato@yk.jp.nec.com>
4 * Akira Fujita <a-fujita@rs.jp.nec.com>
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of version 2.1 of the GNU Lesser General Public License
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
17 #include <linux/quotaops.h>
18 #include <linux/slab.h>
19 #include "ext4_jbd2.h"
21 #include "ext4_extents.h"
/**
 * get_ext_path - Find an extent path for the designated logical block number.
 *
 * @inode:	the inode which is searched
 * @lblock:	logical block number to find an extent path for
 * @ppath:	pointer to an extent path pointer (for output)
 *
 * ext4_find_extent wrapper. Return 0 on success, or a negative error value
 * on failure.
 */
/*
 * get_ext_path: looks up the extent path of @inode that covers @lblock by
 * calling ext4_find_extent() with EXT4_EX_NOCACHE, handing @ppath through.
 * When the leaf entry at ext_depth(inode) carries no extent (p_ext is NULL)
 * the references held by the path are dropped.
 *
 * NOTE(review): the embedded original line numbers have gaps (39 -> 42, and
 * the block stops at 43), so whatever follows the lookup and the NULL-extent
 * branch (result checks, stores through @ppath, return statements) is not
 * visible in this extract -- confirm against the pristine file.
 */
34 get_ext_path(struct inode
*inode
, ext4_lblk_t lblock
,
35 struct ext4_ext_path
**ppath
)
37 struct ext4_ext_path
*path
;
39 path
= ext4_find_extent(inode
, lblock
, ppath
, EXT4_EX_NOCACHE
);
/* Leaf level holds no extent for this lookup: release the path references. */
42 if (path
[ext_depth(inode
)].p_ext
== NULL
) {
43 ext4_ext_drop_refs(path
);
/**
 * ext4_double_down_write_data_sem - Acquire two inodes' write lock
 *                                   of i_data_sem
 *
 * Acquire the write lock of i_data_sem of the two inodes.
 */
/*
 * ext4_double_down_write_data_sem: write-locks i_data_sem of both inodes.
 * Two acquisition sequences are visible: @first with down_write() followed by
 * @second with down_write_nested(..., I_DATA_SEM_OTHER), and the mirrored
 * sequence that starts with @second. In both, the second lock is taken with
 * the I_DATA_SEM_OTHER nesting class.
 *
 * NOTE(review): the condition that selects between the two sequences is not
 * visible in this extract (embedded line numbers skip 60-61 and 64).
 * Presumably the inodes are ordered by address so that all callers lock in
 * the same order -- TODO confirm.
 */
59 ext4_double_down_write_data_sem(struct inode
*first
, struct inode
*second
)
/* Sequence A: first, then second (nested class). */
62 down_write(&EXT4_I(first
)->i_data_sem
);
63 down_write_nested(&EXT4_I(second
)->i_data_sem
, I_DATA_SEM_OTHER
);
/* Sequence B: second, then first (nested class). */
65 down_write(&EXT4_I(second
)->i_data_sem
);
66 down_write_nested(&EXT4_I(first
)->i_data_sem
, I_DATA_SEM_OTHER
);
/**
 * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
 *
 * @orig_inode:		original inode structure whose lock is released first
 * @donor_inode:	donor inode structure whose lock is released second
 *
 * Release the write lock of i_data_sem of the two inodes (orig and donor).
 */
/*
 * ext4_double_up_write_data_sem: releases the i_data_sem write lock of
 * @orig_inode first and of @donor_inode second, each with up_write().
 */
79 ext4_double_up_write_data_sem(struct inode
*orig_inode
,
80 struct inode
*donor_inode
)
82 up_write(&EXT4_I(orig_inode
)->i_data_sem
);
83 up_write(&EXT4_I(donor_inode
)->i_data_sem
);
/**
 * mext_check_coverage - Check that all extents in range have the same type
 *
 * @inode:		inode in question
 * @from:		block offset of inode
 * @count:		block count to be checked
 * @unwritten:		extents expected to be unwritten
 * @err:		pointer to save error value
 *
 * Return 1 if all extents in range have the expected type, and zero otherwise.
 */
/*
 * mext_check_coverage: walks the logical block range [@from, @from + @count)
 * of @inode. Each iteration resolves the extent path for the current block
 * with get_ext_path() (its result is stored through @err), reads the leaf
 * extent at ext_depth(inode), compares its unwritten state with @unwritten,
 * advances @from by the actual length of that extent and drops the path
 * references. The path references are dropped once more after the loop.
 *
 * NOTE(review): the reaction to a get_ext_path() failure, the statement taken
 * when the unwritten state differs, and the return statements are not visible
 * in this extract (embedded line numbers skip 107-108, 111, 114-116 and
 * 118-120) -- confirm against the pristine file.
 */
98 mext_check_coverage(struct inode
*inode
, ext4_lblk_t from
, ext4_lblk_t count
,
99 int unwritten
, int *err
)
101 struct ext4_ext_path
*path
= NULL
;
102 struct ext4_extent
*ext
;
/* Exclusive end of the range to inspect. */
104 ext4_lblk_t last
= from
+ count
;
105 while (from
< last
) {
106 *err
= get_ext_path(inode
, from
, &path
);
/* Leaf-level extent of the path just looked up. */
109 ext
= path
[ext_depth(inode
)].p_ext
;
110 if (unwritten
!= ext4_ext_is_unwritten(ext
))
/* Continue with the first block after this extent. */
112 from
+= ext4_ext_get_actual_len(ext
);
113 ext4_ext_drop_refs(path
);
117 ext4_ext_drop_refs(path
);
/**
 * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
 *
 * @inode1:	the inode structure
 * @inode2:	the inode structure
 * @index1:	page index
 * @index2:	page index
 * @page:	result page vector
 *
 * Grab two locked pages, one for each inode, taking them in inode order.
 */
/*
 * mext_page_double_lock: obtains one page-cache page from each inode with
 * grab_cache_page_write_begin() using AOP_FLAG_NOFS and stores them in @page.
 * When the inode1 pointer compares lower than inode2, mapping[0] belongs to
 * inode1; in the other branch the mappings are assigned the other way round
 * and index1 is saved in a temporary (presumably to exchange index1 and
 * index2 -- TODO confirm, the exchange itself is not visible). Both pages are
 * then waited on with wait_on_page_writeback().
 *
 * NOTE(review): the handling of a failed grab (only an unlock_page() of
 * page[0] is visible), the condition guarding the final
 * swap(page[0], page[1]) and the return statements are not visible in this
 * extract.
 */
134 mext_page_double_lock(struct inode
*inode1
, struct inode
*inode2
,
135 pgoff_t index1
, pgoff_t index2
, struct page
*page
[2])
137 struct address_space
*mapping
[2];
138 unsigned fl
= AOP_FLAG_NOFS
;
140 BUG_ON(!inode1
|| !inode2
);
/* Order the two mappings by inode address. */
141 if (inode1
< inode2
) {
142 mapping
[0] = inode1
->i_mapping
;
143 mapping
[1] = inode2
->i_mapping
;
145 pgoff_t tmp
= index1
;
148 mapping
[0] = inode2
->i_mapping
;
149 mapping
[1] = inode1
->i_mapping
;
152 page
[0] = grab_cache_page_write_begin(mapping
[0], index1
, fl
);
156 page
[1] = grab_cache_page_write_begin(mapping
[1], index2
, fl
);
158 unlock_page(page
[0]);
163 * grab_cache_page_write_begin() may not wait on page's writeback if
164 * BDI not demand that. But it is reasonable to be very conservative
165 * here and explicitly wait on page's writeback
167 wait_on_page_writeback(page
[0]);
168 wait_on_page_writeback(page
[1]);
170 swap(page
[0], page
[1]);
/* Force page buffers uptodate without dropping the page lock */
/*
 * mext_page_mkuptodate: brings the buffers of a locked page that is not under
 * writeback up to date for the byte range [@from, @to).
 *
 * Visible steps:
 *  - assert PageLocked and !PageWriteback, then test PageUptodate;
 *  - attach empty buffers of i_blocksize(inode) bytes when the page has none;
 *  - compute the first file block of the page from page->index and walk the
 *    circular b_this_page list once, tracking the byte window
 *    [block_start, block_end) of every buffer;
 *  - buffers outside [from, to) and buffers that are already uptodate get
 *    separate checks; unmapped buffers in range are looked up with
 *    ext4_get_block(..., 0), and a buffer that is still unmapped afterwards
 *    is zero-filled with zero_user() and marked uptodate;
 *  - a second loop over nr entries calls bh_submit_read() on every buffer
 *    that bh_uptodate_or_lock() reports as not uptodate;
 *  - SetPageUptodate() is called at the end (any guard on it is not visible).
 *
 * NOTE(review): the declaration of `block`, the statements that fill arr[],
 * nr and partial, the error exits and the return statements are not visible
 * in this extract -- confirm against the pristine file.
 */
177 mext_page_mkuptodate(struct page
*page
, unsigned from
, unsigned to
)
179 struct inode
*inode
= page
->mapping
->host
;
181 struct buffer_head
*bh
, *head
, *arr
[MAX_BUF_PER_PAGE
];
182 unsigned int blocksize
, block_start
, block_end
;
183 int i
, err
, nr
= 0, partial
= 0;
/* The caller must pass a locked page that is not under writeback. */
184 BUG_ON(!PageLocked(page
));
185 BUG_ON(PageWriteback(page
));
187 if (PageUptodate(page
))
190 blocksize
= i_blocksize(inode
);
/* Make sure the page carries buffer heads before walking them. */
191 if (!page_has_buffers(page
))
192 create_empty_buffers(page
, blocksize
, 0);
194 head
= page_buffers(page
);
/* First file block backed by this page. */
195 block
= (sector_t
)page
->index
<< (PAGE_SHIFT
- inode
->i_blkbits
);
/* One pass over the circular list: block_start is 0 only for the first buffer. */
196 for (bh
= head
, block_start
= 0; bh
!= head
|| !block_start
;
197 block
++, block_start
= block_end
, bh
= bh
->b_this_page
) {
198 block_end
= block_start
+ blocksize
;
/* Buffer lies completely outside [from, to). */
199 if (block_end
<= from
|| block_start
>= to
) {
200 if (!buffer_uptodate(bh
))
204 if (buffer_uptodate(bh
))
206 if (!buffer_mapped(bh
)) {
207 err
= ext4_get_block(inode
, block
, bh
, 0);
/* Still unmapped after the lookup: zero the range and mark it uptodate. */
212 if (!buffer_mapped(bh
)) {
213 zero_user(page
, block_start
, blocksize
);
214 set_buffer_uptodate(bh
);
218 BUG_ON(nr
>= MAX_BUF_PER_PAGE
);
/* Second pass: read in the nr pending buffers that are not uptodate yet. */
225 for (i
= 0; i
< nr
; i
++) {
227 if (!bh_uptodate_or_lock(bh
)) {
228 err
= bh_submit_read(bh
);
235 SetPageUptodate(page
);
/**
 * move_extent_per_page - Move extent data per page
 *
 * @o_filp:			file structure of original file
 * @donor_inode:		donor inode
 * @orig_page_offset:		page index on original file
 * @donor_page_offset:		page index on donor file
 * @data_offset_in_page:	block index where data swapping starts
 * @block_len_in_page:		the number of blocks to be swapped
 * @unwritten:			orig extent is unwritten or not
 * @err:			pointer to save return value
 *
 * Save the data in the original inode blocks and replace the original inode
 * extents with the donor inode extents by calling ext4_swap_extents().
 * Finally, write out the saved data in the new original inode blocks. Return
 * the replaced block count.
 */
/*
 * move_extent_per_page: exchanges up to @block_len_in_page blocks between the
 * original file (@o_filp) and @donor_inode inside a single page, returns
 * replaced_count and reports failures through @err.
 *
 * Visible flow:
 *  1. start a journal handle of type EXT4_HT_MOVE_EXTENTS sized at twice
 *     ext4_writepage_trans_blocks(orig_inode);
 *  2. derive the block offsets and data_size (when the range ends on the last
 *     block of the file the size is taken from i_size) and lock both pages
 *     with mext_page_double_lock();
 *  3. under both i_data_sem locks, re-check with mext_check_coverage() that
 *     the orig and donor ranges are unwritten; on that path page private data
 *     is released and ext4_swap_extents() is called, and the in-code comment
 *     states that no data copy is needed there;
 *  4. otherwise make pagep[0] uptodate with mext_page_mkuptodate(), release
 *     page private data, swap the extents, remap the buffers of pagep[0] with
 *     ext4_get_block(), commit them with block_commit_write() and attach the
 *     inode range to the transaction with ext4_jbd2_inode_add_write();
 *  5. unlock both pages and stop the handle; -ENOSPC is paired with
 *     ext4_should_retry_alloc() and -EBUSY with
 *     jbd2_journal_force_commit_nested() (at most a few retries);
 *  6. the repair path swaps the extents back (donor and orig exchanged in the
 *     call) and raises EXT4_ERROR_INODE_BLOCK() when the restored count
 *     differs from block_len_in_page.
 *
 * NOTE(review): labels, goto statements, several conditions, the declaration
 * of `handle` and the right-hand tails of a few expressions are not visible
 * in this extract (the embedded line numbers have gaps), so the branch
 * structure described above is partly inferred -- TODO confirm against the
 * pristine file.
 */
257 move_extent_per_page(struct file
*o_filp
, struct inode
*donor_inode
,
258 pgoff_t orig_page_offset
, pgoff_t donor_page_offset
,
259 int data_offset_in_page
,
260 int block_len_in_page
, int unwritten
, int *err
)
262 struct inode
*orig_inode
= file_inode(o_filp
);
263 struct page
*pagep
[2] = {NULL
, NULL
};
265 ext4_lblk_t orig_blk_offset
, donor_blk_offset
;
266 unsigned long blocksize
= orig_inode
->i_sb
->s_blocksize
;
267 unsigned int tmp_data_size
, data_size
, replaced_size
;
268 int i
, err2
, jblocks
, retries
= 0;
269 int replaced_count
= 0;
/* Byte offset inside the page at which the exchanged data starts. */
270 int from
= data_offset_in_page
<< orig_inode
->i_blkbits
;
271 int blocks_per_page
= PAGE_SIZE
>> orig_inode
->i_blkbits
;
272 struct super_block
*sb
= orig_inode
->i_sb
;
273 struct buffer_head
*bh
= NULL
;
276 * It needs twice the amount of ordinary journal buffers because
277 * inode and donor_inode may change each different metadata blocks.
281 jblocks
= ext4_writepage_trans_blocks(orig_inode
) * 2;
282 handle
= ext4_journal_start(orig_inode
, EXT4_HT_MOVE_EXTENTS
, jblocks
);
283 if (IS_ERR(handle
)) {
284 *err
= PTR_ERR(handle
);
/* Page index to block offset; the second operand of each sum is cut off here. */
288 orig_blk_offset
= orig_page_offset
* blocks_per_page
+
291 donor_blk_offset
= donor_page_offset
* blocks_per_page
+
294 /* Calculate data_size */
295 if ((orig_blk_offset
+ block_len_in_page
- 1) ==
296 ((orig_inode
->i_size
- 1) >> orig_inode
->i_blkbits
)) {
297 /* Replace the last block */
298 tmp_data_size
= orig_inode
->i_size
& (blocksize
- 1);
300 * If data_size equal zero, it shows data_size is multiples of
301 * blocksize. So we set appropriate value.
303 if (tmp_data_size
== 0)
304 tmp_data_size
= blocksize
;
306 data_size
= tmp_data_size
+
307 ((block_len_in_page
- 1) << orig_inode
->i_blkbits
);
309 data_size
= block_len_in_page
<< orig_inode
->i_blkbits
;
311 replaced_size
= data_size
;
313 *err
= mext_page_double_lock(orig_inode
, donor_inode
, orig_page_offset
,
314 donor_page_offset
, pagep
);
315 if (unlikely(*err
< 0))
318 * If orig extent was unwritten it can become initialized
319 * at any time after i_data_sem was dropped, in order to
320 * serialize with delalloc we have recheck extent while we
321 * hold page's lock, if it is still the case data copy is not
322 * necessary, just swap data blocks between orig and donor.
325 ext4_double_down_write_data_sem(orig_inode
, donor_inode
);
326 /* If any of extents in range became initialized we have to
327 * fallback to data copying */
328 unwritten
= mext_check_coverage(orig_inode
, orig_blk_offset
,
329 block_len_in_page
, 1, err
);
333 unwritten
&= mext_check_coverage(donor_inode
, donor_blk_offset
,
334 block_len_in_page
, 1, err
);
339 ext4_double_up_write_data_sem(orig_inode
, donor_inode
);
342 if ((page_has_private(pagep
[0]) &&
343 !try_to_release_page(pagep
[0], 0)) ||
344 (page_has_private(pagep
[1]) &&
345 !try_to_release_page(pagep
[1], 0))) {
349 replaced_count
= ext4_swap_extents(handle
, orig_inode
,
350 donor_inode
, orig_blk_offset
,
352 block_len_in_page
, 1, err
);
354 ext4_double_up_write_data_sem(orig_inode
, donor_inode
);
358 *err
= mext_page_mkuptodate(pagep
[0], from
, from
+ replaced_size
);
362 /* At this point all buffers in range are uptodate, old mapping layout
363 * is no longer required, try to drop it now. */
364 if ((page_has_private(pagep
[0]) && !try_to_release_page(pagep
[0], 0)) ||
365 (page_has_private(pagep
[1]) && !try_to_release_page(pagep
[1], 0))) {
369 ext4_double_down_write_data_sem(orig_inode
, donor_inode
);
370 replaced_count
= ext4_swap_extents(handle
, orig_inode
, donor_inode
,
371 orig_blk_offset
, donor_blk_offset
,
372 block_len_in_page
, 1, err
);
373 ext4_double_up_write_data_sem(orig_inode
, donor_inode
);
375 if (replaced_count
) {
376 block_len_in_page
= replaced_count
;
378 block_len_in_page
<< orig_inode
->i_blkbits
;
382 /* Perform all necessary steps similar write_begin()/write_end()
383 * but keeping in mind that i_size will not change */
384 if (!page_has_buffers(pagep
[0]))
385 create_empty_buffers(pagep
[0], 1 << orig_inode
->i_blkbits
, 0);
386 bh
= page_buffers(pagep
[0]);
387 for (i
= 0; i
< data_offset_in_page
; i
++)
388 bh
= bh
->b_this_page
;
389 for (i
= 0; i
< block_len_in_page
; i
++) {
390 *err
= ext4_get_block(orig_inode
, orig_blk_offset
+ i
, bh
, 0);
393 bh
= bh
->b_this_page
;
396 *err
= block_commit_write(pagep
[0], from
, from
+ replaced_size
);
398 if (unlikely(*err
< 0))
399 goto repair_branches
;
401 /* Even in case of data=writeback it is reasonable to pin
402 * inode to transaction, to prevent unexpected data loss */
403 *err
= ext4_jbd2_inode_add_write(handle
, orig_inode
,
404 (loff_t
)orig_page_offset
<< PAGE_SHIFT
, replaced_size
);
407 unlock_page(pagep
[0]);
409 unlock_page(pagep
[1]);
412 ext4_journal_stop(handle
);
413 if (*err
== -ENOSPC
&&
414 ext4_should_retry_alloc(sb
, &retries
))
416 /* Buffer was busy because probably is pinned to journal transaction,
417 * force transaction commit may help to free it. */
418 if (*err
== -EBUSY
&& retries
++ < 4 && EXT4_SB(sb
)->s_journal
&&
419 jbd2_journal_force_commit_nested(EXT4_SB(sb
)->s_journal
))
421 return replaced_count
;
425 * This should never ever happen!
426 * Extents are swapped already, but we are not able to copy data.
427 * Try to swap extents to it's original places
429 ext4_double_down_write_data_sem(orig_inode
, donor_inode
);
430 replaced_count
= ext4_swap_extents(handle
, donor_inode
, orig_inode
,
431 orig_blk_offset
, donor_blk_offset
,
432 block_len_in_page
, 0, &err2
);
433 ext4_double_up_write_data_sem(orig_inode
, donor_inode
);
434 if (replaced_count
!= block_len_in_page
) {
435 EXT4_ERROR_INODE_BLOCK(orig_inode
, (sector_t
)(orig_blk_offset
),
436 "Unable to copy data block,"
437 " data will be lost.");
/**
 * mext_check_arguments - Check whether move extent can be done
 *
 * @orig_inode:		original inode
 * @donor_inode:	donor inode
 * @orig_start:		logical start offset in block for orig
 * @donor_start:	logical start offset in block for donor
 * @len:		the number of blocks to be moved
 *
 * Check the arguments of ext4_move_extents() to determine whether the files
 * can be exchanged with each other.
 * Return 0 on success, or a negative error value on failure.
 */
/*
 * mext_check_arguments: validates that the range described by @orig_start,
 * @donor_start and the value behind @len can be exchanged between
 * @orig_inode and @donor_inode.
 *
 * Visible checks, in order: the donor has S_ISUID or S_ISGID set; the donor
 * is immutable or append-only; either file is a swapfile; quota files; both
 * files must carry EXT4_INODE_EXTENTS; neither i_size may be zero; the two
 * start offsets must share the same block offset inside a page; offsets and
 * length must stay below EXT_MAX_BLOCKS. The length is then clamped against
 * orig_eof and donor_eof, which are i_size rounded up to whole blocks.
 *
 * NOTE(review): the quota-file test joins its two operands with &&, unlike
 * the swapfile test directly above it which uses ||. As written it only
 * triggers when both inodes are quota files, while its message says the
 * argument files should not be quota files -- confirm whether || is intended.
 *
 * NOTE(review): the return statements and the condition in front of the
 * final "len should not be 0" message are not visible in this extract.
 */
458 mext_check_arguments(struct inode
*orig_inode
,
459 struct inode
*donor_inode
, __u64 orig_start
,
460 __u64 donor_start
, __u64
*len
)
462 __u64 orig_eof
, donor_eof
;
463 unsigned int blkbits
= orig_inode
->i_blkbits
;
464 unsigned int blocksize
= 1 << blkbits
;
/* File sizes expressed in blocks, rounded up. */
466 orig_eof
= (i_size_read(orig_inode
) + blocksize
- 1) >> blkbits
;
467 donor_eof
= (i_size_read(donor_inode
) + blocksize
- 1) >> blkbits
;
470 if (donor_inode
->i_mode
& (S_ISUID
|S_ISGID
)) {
471 ext4_debug("ext4 move extent: suid or sgid is set"
472 " to donor file [ino:orig %lu, donor %lu]\n",
473 orig_inode
->i_ino
, donor_inode
->i_ino
);
477 if (IS_IMMUTABLE(donor_inode
) || IS_APPEND(donor_inode
))
480 /* Ext4 move extent does not support swapfile */
481 if (IS_SWAPFILE(orig_inode
) || IS_SWAPFILE(donor_inode
)) {
482 ext4_debug("ext4 move extent: The argument files should "
483 "not be swapfile [ino:orig %lu, donor %lu]\n",
484 orig_inode
->i_ino
, donor_inode
->i_ino
);
488 if (ext4_is_quota_file(orig_inode
) && ext4_is_quota_file(donor_inode
)) {
489 ext4_debug("ext4 move extent: The argument files should "
490 "not be quota files [ino:orig %lu, donor %lu]\n",
491 orig_inode
->i_ino
, donor_inode
->i_ino
);
495 /* Ext4 move extent supports only extent based file */
496 if (!(ext4_test_inode_flag(orig_inode
, EXT4_INODE_EXTENTS
))) {
497 ext4_debug("ext4 move extent: orig file is not extents "
498 "based file [ino:orig %lu]\n", orig_inode
->i_ino
);
500 } else if (!(ext4_test_inode_flag(donor_inode
, EXT4_INODE_EXTENTS
))) {
501 ext4_debug("ext4 move extent: donor file is not extents "
502 "based file [ino:donor %lu]\n", donor_inode
->i_ino
);
506 if ((!orig_inode
->i_size
) || (!donor_inode
->i_size
)) {
507 ext4_debug("ext4 move extent: File size is 0 byte\n");
511 /* Start offset should be same */
512 if ((orig_start
& ~(PAGE_MASK
>> orig_inode
->i_blkbits
)) !=
513 (donor_start
& ~(PAGE_MASK
>> orig_inode
->i_blkbits
))) {
514 ext4_debug("ext4 move extent: orig and donor's start "
515 "offsets are not aligned [ino:orig %lu, donor %lu]\n",
516 orig_inode
->i_ino
, donor_inode
->i_ino
);
520 if ((orig_start
>= EXT_MAX_BLOCKS
) ||
521 (donor_start
>= EXT_MAX_BLOCKS
) ||
522 (*len
> EXT_MAX_BLOCKS
) ||
523 (donor_start
+ *len
>= EXT_MAX_BLOCKS
) ||
524 (orig_start
+ *len
>= EXT_MAX_BLOCKS
)) {
525 ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
526 "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS
,
527 orig_inode
->i_ino
, donor_inode
->i_ino
);
530 if (orig_eof
<= orig_start
)
532 else if (orig_eof
< orig_start
+ *len
- 1)
533 *len
= orig_eof
- orig_start
;
534 if (donor_eof
<= donor_start
)
536 else if (donor_eof
< donor_start
+ *len
- 1)
537 *len
= donor_eof
- donor_start
;
539 ext4_debug("ext4 move extent: len should not be 0 "
540 "[ino:orig %lu, donor %lu]\n", orig_inode
->i_ino
,
/**
 * ext4_move_extents - Exchange the specified range of a file
 *
 * @o_filp:		file structure of the original file
 * @d_filp:		file structure of the donor file
 * @orig_blk:		start offset in block for orig
 * @donor_blk:		start offset in block for donor
 * @len:		the number of blocks to be moved
 * @moved_len:		moved block length
 *
 * On success this function returns 0 and stores the moved block length in
 * moved_len; otherwise it returns an error value.
 */
/*
 * ext4_move_extents: exchanges @len blocks starting at @orig_blk of the file
 * behind @o_filp with blocks starting at @donor_blk of the file behind
 * @d_filp, and stores the number of blocks processed through @moved_len.
 *
 * Visible flow:
 *  - checks for inodes on different superblocks, identical inodes,
 *    non-regular files, data-journalled inodes and encrypted inodes, each
 *    with a diagnostic message;
 *  - lock both inodes with lock_two_nondirectories(), block unlocked DIO and
 *    wait for in-flight DIO, then take both i_data_sem write locks;
 *  - validate the request with mext_check_arguments();
 *  - loop while o_start < o_end: look up the extent at o_start, handle holes
 *    before and after the start position (d_start is advanced by the same
 *    distance), clamp cur_len to the request and to the end of the current
 *    page, and drop both i_data_sem locks around move_extent_per_page(),
 *    retaking them afterwards;
 *  - store o_start - orig_blk through @moved_len, discard preallocations of
 *    both inodes (any guard is not visible), drop the path references and
 *    release the locks in the reverse order of acquisition.
 *
 * NOTE(review): the declarations of `ret` and `offset_in_page`, the error
 * exits after each check, the tails of the mext_check_arguments() and
 * move_extent_per_page() calls, the loop advance and the return statement
 * are not visible in this extract (the embedded line numbers have gaps) --
 * confirm against the pristine file.
 */
563 ext4_move_extents(struct file
*o_filp
, struct file
*d_filp
, __u64 orig_blk
,
564 __u64 donor_blk
, __u64 len
, __u64
*moved_len
)
566 struct inode
*orig_inode
= file_inode(o_filp
);
567 struct inode
*donor_inode
= file_inode(d_filp
);
568 struct ext4_ext_path
*path
= NULL
;
569 int blocks_per_page
= PAGE_SIZE
>> orig_inode
->i_blkbits
;
570 ext4_lblk_t o_end
, o_start
= orig_blk
;
571 ext4_lblk_t d_start
= donor_blk
;
/* Both files must live on the same superblock. */
574 if (orig_inode
->i_sb
!= donor_inode
->i_sb
) {
575 ext4_debug("ext4 move extent: The argument files "
576 "should be in same FS [ino:orig %lu, donor %lu]\n",
577 orig_inode
->i_ino
, donor_inode
->i_ino
);
581 /* orig and donor should be different inodes */
582 if (orig_inode
== donor_inode
) {
583 ext4_debug("ext4 move extent: The argument files should not "
584 "be same inode [ino:orig %lu, donor %lu]\n",
585 orig_inode
->i_ino
, donor_inode
->i_ino
);
589 /* Regular file check */
590 if (!S_ISREG(orig_inode
->i_mode
) || !S_ISREG(donor_inode
->i_mode
)) {
591 ext4_debug("ext4 move extent: The argument files should be "
592 "regular file [ino:orig %lu, donor %lu]\n",
593 orig_inode
->i_ino
, donor_inode
->i_ino
);
597 /* TODO: it's not obvious how to swap blocks for inodes with full
598 journaling enabled */
599 if (ext4_should_journal_data(orig_inode
) ||
600 ext4_should_journal_data(donor_inode
)) {
601 ext4_msg(orig_inode
->i_sb
, KERN_ERR
,
602 "Online defrag not supported with data journaling");
606 if (ext4_encrypted_inode(orig_inode
) ||
607 ext4_encrypted_inode(donor_inode
)) {
608 ext4_msg(orig_inode
->i_sb
, KERN_ERR
,
609 "Online defrag not supported for encrypted files");
613 /* Protect orig and donor inodes against a truncate */
614 lock_two_nondirectories(orig_inode
, donor_inode
);
616 /* Wait for all existing dio workers */
617 ext4_inode_block_unlocked_dio(orig_inode
);
618 ext4_inode_block_unlocked_dio(donor_inode
);
619 inode_dio_wait(orig_inode
);
620 inode_dio_wait(donor_inode
);
622 /* Protect extent tree against block allocations via delalloc */
623 ext4_double_down_write_data_sem(orig_inode
, donor_inode
);
624 /* Check the filesystem environment whether move_extent can be done */
625 ret
= mext_check_arguments(orig_inode
, donor_inode
, orig_blk
,
629 o_end
= o_start
+ len
;
631 while (o_start
< o_end
) {
632 struct ext4_extent
*ex
;
633 ext4_lblk_t cur_blk
, next_blk
;
634 pgoff_t orig_page_index
, donor_page_index
;
636 int unwritten
, cur_len
;
638 ret
= get_ext_path(orig_inode
, o_start
, &path
);
641 ex
= path
[path
->p_depth
].p_ext
;
642 next_blk
= ext4_ext_next_allocated_block(path
);
643 cur_blk
= le32_to_cpu(ex
->ee_block
);
644 cur_len
= ext4_ext_get_actual_len(ex
);
645 /* Check hole before the start pos */
646 if (cur_blk
+ cur_len
- 1 < o_start
) {
647 if (next_blk
== EXT_MAX_BLOCKS
) {
652 d_start
+= next_blk
- o_start
;
655 /* Check hole after the start pos */
656 } else if (cur_blk
> o_start
) {
658 d_start
+= cur_blk
- o_start
;
660 /* Extent inside requested range ?*/
661 if (cur_blk
>= o_end
)
663 } else { /* in_range(o_start, o_blk, o_len) */
664 cur_len
+= cur_blk
- o_start
;
666 unwritten
= ext4_ext_is_unwritten(ex
);
667 if (o_end
- o_start
< cur_len
)
668 cur_len
= o_end
- o_start
;
670 orig_page_index
= o_start
>> (PAGE_SHIFT
-
671 orig_inode
->i_blkbits
);
672 donor_page_index
= d_start
>> (PAGE_SHIFT
-
673 donor_inode
->i_blkbits
);
674 offset_in_page
= o_start
% blocks_per_page
;
675 if (cur_len
> blocks_per_page
- offset_in_page
)
676 cur_len
= blocks_per_page
- offset_in_page
;
678 * Up semaphore to avoid following problems:
679 * a. transaction deadlock among ext4_journal_start,
680 * ->write_begin via pagefault, and jbd2_journal_commit
681 * b. racing with ->readpage, ->write_begin, and ext4_get_block
682 * in move_extent_per_page
684 ext4_double_up_write_data_sem(orig_inode
, donor_inode
);
685 /* Swap original branches with new branches */
686 move_extent_per_page(o_filp
, donor_inode
,
687 orig_page_index
, donor_page_index
,
688 offset_in_page
, cur_len
,
690 ext4_double_down_write_data_sem(orig_inode
, donor_inode
);
696 *moved_len
= o_start
- orig_blk
;
697 if (*moved_len
> len
)
702 ext4_discard_preallocations(orig_inode
);
703 ext4_discard_preallocations(donor_inode
);
706 ext4_ext_drop_refs(path
);
708 ext4_double_up_write_data_sem(orig_inode
, donor_inode
);
709 ext4_inode_resume_unlocked_dio(orig_inode
);
710 ext4_inode_resume_unlocked_dio(donor_inode
);
711 unlock_two_nondirectories(orig_inode
, donor_inode
);