// SPDX-License-Identifier: LGPL-2.1
/*
 * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
 * Written by Takashi Sato <t-sato@yk.jp.nec.com>
 *            Akira Fujita <a-fujita@rs.jp.nec.com>
 */

#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "ext4_extents.h"
/**
 * get_ext_path() - Find an extent path for designated logical block number.
 * @inode:	inode to be searched
 * @lblock:	logical block number to find an extent path
 * @path:	pointer to an extent path
 *
 * ext4_find_extent wrapper. Return an extent path pointer on success,
 * or an error pointer on failure.
 */
static inline struct ext4_ext_path *
get_ext_path(struct inode *inode, ext4_lblk_t lblock,
	     struct ext4_ext_path *path)
{
	path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE);
	if (IS_ERR(path))
		return path;
	if (path[ext_depth(inode)].p_ext == NULL) {
		ext4_free_ext_path(path);
		return ERR_PTR(-ENODATA);
	}
	return path;
}
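
/*
 * Note: the caller owns the returned path and must eventually release it
 * with ext4_free_ext_path(); -ENODATA means @lblock falls into a hole, i.e.
 * no extent covers that logical block.
 */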
/**
 * ext4_double_down_write_data_sem() - write lock two inodes' i_data_sem
 * @first: inode to be locked
 * @second: inode to be locked
 *
 * Acquire write lock of i_data_sem of the two inodes
 */
void
ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
	if (first < second) {
		down_write(&EXT4_I(first)->i_data_sem);
		down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER);
	} else {
		down_write(&EXT4_I(second)->i_data_sem);
		down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER);
	}
}
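
/*
 * The two i_data_sem locks above are always taken in inode-pointer order so
 * that concurrent callers working on the same pair of inodes cannot deadlock
 * (ABBA); the second acquisition uses the I_DATA_SEM_OTHER lockdep subclass
 * so the nested write lock is not reported as recursive locking.
 */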
/**
 * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
 *
 * @orig_inode:		original inode structure whose lock is released first
 * @donor_inode:	donor inode structure whose lock is released second
 *
 * Release write lock of i_data_sem of two inodes (orig and donor).
 */
void
ext4_double_up_write_data_sem(struct inode *orig_inode,
			      struct inode *donor_inode)
{
	up_write(&EXT4_I(orig_inode)->i_data_sem);
	up_write(&EXT4_I(donor_inode)->i_data_sem);
}
/**
 * mext_check_coverage - Check that all extents in range have the same type
 *
 * @inode:		inode in question
 * @from:		block offset of inode
 * @count:		block count to be checked
 * @unwritten:		extents expected to be unwritten
 * @err:		pointer to save error value
 *
 * Return 1 if all extents in range have the expected type, and zero otherwise.
 */
static int
mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
		    int unwritten, int *err)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent *ext;
	int ret = 0;
	ext4_lblk_t last = from + count;

	while (from < last) {
		path = get_ext_path(inode, from, path);
		if (IS_ERR(path)) {
			*err = PTR_ERR(path);
			return ret;
		}
		ext = path[ext_depth(inode)].p_ext;
		if (unwritten != ext4_ext_is_unwritten(ext))
			goto out;
		from += ext4_ext_get_actual_len(ext);
	}
	ret = 1;
out:
	ext4_free_ext_path(path);
	return ret;
}
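
/*
 * Note: move_extent_per_page() calls this helper with both inodes'
 * i_data_sem held to re-verify that a range it previously saw as unwritten
 * is still entirely unwritten before deciding that the data copy can be
 * skipped and the extents swapped directly.
 */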
/**
 * mext_folio_double_lock - Grab and lock folios on both @inode1 and @inode2
 *
 * @inode1:	the inode structure
 * @inode2:	the inode structure
 * @index1:	folio index
 * @index2:	folio index
 * @folio:	result folio vector
 *
 * Grab two locked folios for the inodes, in inode order
 */
static int
mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
		       pgoff_t index1, pgoff_t index2, struct folio *folio[2])
{
	struct address_space *mapping[2];
	unsigned int flags;

	BUG_ON(!inode1 || !inode2);
	if (inode1 < inode2) {
		mapping[0] = inode1->i_mapping;
		mapping[1] = inode2->i_mapping;
	} else {
		swap(index1, index2);
		mapping[0] = inode2->i_mapping;
		mapping[1] = inode1->i_mapping;
	}

	flags = memalloc_nofs_save();
	folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN,
			mapping_gfp_mask(mapping[0]));
	if (IS_ERR(folio[0])) {
		memalloc_nofs_restore(flags);
		return PTR_ERR(folio[0]);
	}

	folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN,
			mapping_gfp_mask(mapping[1]));
	memalloc_nofs_restore(flags);
	if (IS_ERR(folio[1])) {
		folio_unlock(folio[0]);
		folio_put(folio[0]);
		return PTR_ERR(folio[1]);
	}
	/*
	 * __filemap_get_folio() may not wait on the folio's writeback if the
	 * BDI does not demand it. But it is reasonable to be very
	 * conservative here and explicitly wait on the folio's writeback.
	 */
	folio_wait_writeback(folio[0]);
	folio_wait_writeback(folio[1]);
	if (inode1 > inode2)
		swap(folio[0], folio[1]);

	return 0;
}
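
/*
 * Note: the folios are looked up in mapping (inode-pointer) order so the
 * folio lock order stays stable, then swapped back at the end so that, from
 * the caller's point of view, folio[0] always belongs to @inode1 and
 * folio[1] to @inode2.
 */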
/* Force folio buffers uptodate w/o dropping folio's lock */
static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to)
{
	struct inode *inode = folio->mapping->host;
	sector_t block;
	struct buffer_head *bh, *head;
	unsigned int blocksize, block_start, block_end;
	int nr = 0;
	bool partial = false;

	BUG_ON(!folio_test_locked(folio));
	BUG_ON(folio_test_writeback(folio));

	if (folio_test_uptodate(folio))
		return 0;

	blocksize = i_blocksize(inode);
	head = folio_buffers(folio);
	if (!head)
		head = create_empty_buffers(folio, blocksize, 0);

	block = folio_pos(folio) >> inode->i_blkbits;
	block_end = 0;
	bh = head;
	do {
		block_start = block_end;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = true;
			continue;
		}
		if (buffer_uptodate(bh))
			continue;
		if (!buffer_mapped(bh)) {
			int err = ext4_get_block(inode, block, bh, 0);
			if (err)
				return err;
			if (!buffer_mapped(bh)) {
				folio_zero_range(folio, block_start, blocksize);
				set_buffer_uptodate(bh);
				continue;
			}
		}
		lock_buffer(bh);
		if (buffer_uptodate(bh)) {
			unlock_buffer(bh);
			continue;
		}
		ext4_read_bh_nowait(bh, 0, NULL, false);
		nr++;
	} while (block++, (bh = bh->b_this_page) != head);

	/* No io required */
	if (!nr)
		goto out;

	bh = head;
	do {
		if (bh_offset(bh) + blocksize <= from)
			continue;
		if (bh_offset(bh) > to)
			break;
		wait_on_buffer(bh);
		if (buffer_uptodate(bh))
			continue;
		return -EIO;
	} while ((bh = bh->b_this_page) != head);
out:
	if (!partial)
		folio_mark_uptodate(folio);
	return 0;
}
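
/*
 * Note: the function above works in two passes. The first walk over the
 * buffer ring maps every buffer in [from, to) (zeroing buffers that turn out
 * to be holes) and submits reads for the rest with ext4_read_bh_nowait();
 * the second walk waits for those reads and fails with -EIO if any buffer in
 * range is still not uptodate.
 */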
/**
 * move_extent_per_page - Move extent data per page
 *
 * @o_filp:			file structure of original file
 * @donor_inode:		donor inode
 * @orig_page_offset:		page index on original file
 * @donor_page_offset:		page index on donor file
 * @data_offset_in_page:	block index where data swapping starts
 * @block_len_in_page:		the number of blocks to be swapped
 * @unwritten:			orig extent is unwritten or not
 * @err:			pointer to save return value
 *
 * Save the data in original inode blocks and replace original inode extents
 * with donor inode extents by calling ext4_swap_extents().
 * Finally, write out the saved data in new original inode blocks. Return
 * replaced block count.
 */
static int
move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
		     pgoff_t orig_page_offset, pgoff_t donor_page_offset,
		     int data_offset_in_page,
		     int block_len_in_page, int unwritten, int *err)
{
	struct inode *orig_inode = file_inode(o_filp);
	struct folio *folio[2] = {NULL, NULL};
	handle_t *handle;
	ext4_lblk_t orig_blk_offset, donor_blk_offset;
	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
	unsigned int tmp_data_size, data_size, replaced_size;
	int i, err2, jblocks, retries = 0;
	int replaced_count = 0;
	int from = data_offset_in_page << orig_inode->i_blkbits;
	int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
	struct super_block *sb = orig_inode->i_sb;
	struct buffer_head *bh = NULL;

	/*
	 * It needs twice the amount of ordinary journal buffers because
	 * inode and donor_inode may each change different metadata blocks.
	 */
again:
	*err = 0;
	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
	handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
	if (IS_ERR(handle)) {
		*err = PTR_ERR(handle);
		return 0;
	}

	orig_blk_offset = orig_page_offset * blocks_per_page +
		data_offset_in_page;

	donor_blk_offset = donor_page_offset * blocks_per_page +
		data_offset_in_page;

	/* Calculate data_size */
	if ((orig_blk_offset + block_len_in_page - 1) ==
	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
		/* Replace the last block */
		tmp_data_size = orig_inode->i_size & (blocksize - 1);
		/*
		 * If data_size equals zero, the size is a multiple of the
		 * blocksize, so set the appropriate value.
		 */
		if (tmp_data_size == 0)
			tmp_data_size = blocksize;

		data_size = tmp_data_size +
			((block_len_in_page - 1) << orig_inode->i_blkbits);
	} else
		data_size = block_len_in_page << orig_inode->i_blkbits;

	replaced_size = data_size;

	*err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset,
				      donor_page_offset, folio);
	if (unlikely(*err < 0))
		goto stop_journal;
	/*
	 * If the orig extent was unwritten it can become initialized
	 * at any time after i_data_sem was dropped; in order to
	 * serialize with delalloc we have to recheck the extent while we
	 * hold the page's lock. If it is still the case, the data copy is not
	 * necessary, just swap data blocks between orig and donor.
	 */

	VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]);
	VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]);
	VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]);

	if (unwritten) {
		ext4_double_down_write_data_sem(orig_inode, donor_inode);
		/* If any of the extents in range became initialized we have
		 * to fall back to data copying */
		unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
						block_len_in_page, 1, err);
		if (*err)
			goto drop_data_sem;

		unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
						 block_len_in_page, 1, err);
		if (*err)
			goto drop_data_sem;

		if (!unwritten) {
			ext4_double_up_write_data_sem(orig_inode, donor_inode);
			goto data_copy;
		}
		if (!filemap_release_folio(folio[0], 0) ||
		    !filemap_release_folio(folio[1], 0)) {
			*err = -EBUSY;
			goto drop_data_sem;
		}
		replaced_count = ext4_swap_extents(handle, orig_inode,
						   donor_inode, orig_blk_offset,
						   donor_blk_offset,
						   block_len_in_page, 1, err);
	drop_data_sem:
		ext4_double_up_write_data_sem(orig_inode, donor_inode);
		goto unlock_folios;
	}
data_copy:
	*err = mext_page_mkuptodate(folio[0], from, from + replaced_size);
	if (*err)
		goto unlock_folios;

	/* At this point all buffers in range are uptodate, the old mapping
	 * layout is no longer required; try to drop it now. */
	if (!filemap_release_folio(folio[0], 0) ||
	    !filemap_release_folio(folio[1], 0)) {
		*err = -EBUSY;
		goto unlock_folios;
	}
	ext4_double_down_write_data_sem(orig_inode, donor_inode);
	replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
					   orig_blk_offset, donor_blk_offset,
					   block_len_in_page, 1, err);
	ext4_double_up_write_data_sem(orig_inode, donor_inode);
	if (*err) {
		if (replaced_count) {
			block_len_in_page = replaced_count;
			replaced_size =
				block_len_in_page << orig_inode->i_blkbits;
		} else
			goto unlock_folios;
	}
	/* Perform all necessary steps similar to write_begin()/write_end(),
	 * but keeping in mind that i_size will not change */
	bh = folio_buffers(folio[0]);
	if (!bh)
		bh = create_empty_buffers(folio[0],
				1 << orig_inode->i_blkbits, 0);
	for (i = 0; i < data_offset_in_page; i++)
		bh = bh->b_this_page;
	for (i = 0; i < block_len_in_page; i++) {
		*err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
		if (*err < 0)
			goto repair_branches;
		bh = bh->b_this_page;
	}

	block_commit_write(&folio[0]->page, from, from + replaced_size);

	/* Even in the case of data=writeback it is reasonable to pin the
	 * inode to the transaction, to prevent unexpected data loss */
	*err = ext4_jbd2_inode_add_write(handle, orig_inode,
			(loff_t)orig_page_offset << PAGE_SHIFT, replaced_size);

unlock_folios:
	folio_unlock(folio[0]);
	folio_put(folio[0]);
	folio_unlock(folio[1]);
	folio_put(folio[1]);
stop_journal:
	ext4_journal_stop(handle);
	if (*err == -ENOSPC &&
	    ext4_should_retry_alloc(sb, &retries))
		goto again;
	/* The buffer was probably busy because it is pinned to a journal
	 * transaction; forcing a transaction commit may help to free it. */
	if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
	    jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
		goto again;
	return replaced_count;

repair_branches:
	/*
	 * This should never ever happen!
	 * Extents are swapped already, but we are not able to copy data.
	 * Try to swap extents back to their original places.
	 */
	ext4_double_down_write_data_sem(orig_inode, donor_inode);
	replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
					   orig_blk_offset, donor_blk_offset,
					   block_len_in_page, 0, &err2);
	ext4_double_up_write_data_sem(orig_inode, donor_inode);
	if (replaced_count != block_len_in_page) {
		ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset),
				       EIO, "Unable to copy data block,"
				       " data will be lost.");
		*err = -EIO;
	}
	replaced_count = 0;
	goto unlock_folios;
}
/**
 * mext_check_arguments - Check whether move extent can be done
 *
 * @orig_inode:		original inode
 * @donor_inode:	donor inode
 * @orig_start:		logical start offset in block for orig
 * @donor_start:	logical start offset in block for donor
 * @len:		the number of blocks to be moved
 *
 * Check the arguments of ext4_move_extents() to determine whether the files
 * can be exchanged with each other.
 * Return 0 on success, or a negative error value on failure.
 */
static int
mext_check_arguments(struct inode *orig_inode,
		     struct inode *donor_inode, __u64 orig_start,
		     __u64 donor_start, __u64 *len)
{
	__u64 orig_eof, donor_eof;
	unsigned int blkbits = orig_inode->i_blkbits;
	unsigned int blocksize = 1 << blkbits;

	orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
	donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;

	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
		ext4_debug("ext4 move extent: suid or sgid is set"
			   " to donor file [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
		return -EPERM;

	/* Ext4 move extent does not support swap files */
	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
		ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EBUSY;
	}

	if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) {
		ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EOPNOTSUPP;
	}

	/* Ext4 move extent supports only extent based files */
	if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
		ext4_debug("ext4 move extent: orig file is not extents "
			   "based file [ino:orig %lu]\n", orig_inode->i_ino);
		return -EOPNOTSUPP;
	} else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
		ext4_debug("ext4 move extent: donor file is not extents "
			   "based file [ino:donor %lu]\n", donor_inode->i_ino);
		return -EOPNOTSUPP;
	}

	if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
		ext4_debug("ext4 move extent: File size is 0 byte\n");
		return -EINVAL;
	}

	/* Start offsets should be the same */
	if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
	    (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
		ext4_debug("ext4 move extent: orig and donor's start "
			   "offsets are not aligned [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	if ((orig_start >= EXT_MAX_BLOCKS) ||
	    (donor_start >= EXT_MAX_BLOCKS) ||
	    (*len > EXT_MAX_BLOCKS) ||
	    (donor_start + *len >= EXT_MAX_BLOCKS) ||
	    (orig_start + *len >= EXT_MAX_BLOCKS)) {
		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
			   "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}
	if (orig_eof <= orig_start)
		*len = 0;
	else if (orig_eof < orig_start + *len - 1)
		*len = orig_eof - orig_start;
	if (donor_eof <= donor_start)
		*len = 0;
	else if (donor_eof < donor_start + *len - 1)
		*len = donor_eof - donor_start;
	if (!*len) {
		ext4_debug("ext4 move extent: len should not be 0 "
			   "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
			   donor_inode->i_ino);
		return -EINVAL;
	}

	return 0;
}
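
/*
 * Note: besides rejecting invalid inputs, this helper may shrink *len so the
 * request does not extend past the EOF block of either file; a request that
 * collapses to zero length is rejected with -EINVAL.
 */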
/**
 * ext4_move_extents - Exchange the specified range of a file
 *
 * @o_filp:		file structure of the original file
 * @d_filp:		file structure of the donor file
 * @orig_blk:		start offset in block for orig
 * @donor_blk:		start offset in block for donor
 * @len:		the number of blocks to be moved
 * @moved_len:		moved block length
 *
 * This function returns 0 and sets the moved block length in @moved_len on
 * success; otherwise it returns an error value.
 *
 */
int
ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
		  __u64 donor_blk, __u64 len, __u64 *moved_len)
{
	struct inode *orig_inode = file_inode(o_filp);
	struct inode *donor_inode = file_inode(d_filp);
	struct ext4_ext_path *path = NULL;
	int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
	ext4_lblk_t o_end, o_start = orig_blk;
	ext4_lblk_t d_start = donor_blk;
	int ret;

	if (orig_inode->i_sb != donor_inode->i_sb) {
		ext4_debug("ext4 move extent: The argument files "
			"should be in same FS [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	/* orig and donor should be different inodes */
	if (orig_inode == donor_inode) {
		ext4_debug("ext4 move extent: The argument files should not "
			"be same inode [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	/* Regular file check */
	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
		ext4_debug("ext4 move extent: The argument files should be "
			"regular files [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	/* TODO: it's not obvious how to swap blocks for inodes with full
	   journaling enabled */
	if (ext4_should_journal_data(orig_inode) ||
	    ext4_should_journal_data(donor_inode)) {
		ext4_msg(orig_inode->i_sb, KERN_ERR,
			 "Online defrag not supported with data journaling");
		return -EOPNOTSUPP;
	}

	if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
		ext4_msg(orig_inode->i_sb, KERN_ERR,
			 "Online defrag not supported for encrypted files");
		return -EOPNOTSUPP;
	}

	/* Protect orig and donor inodes against a truncate */
	lock_two_nondirectories(orig_inode, donor_inode);

	/* Wait for all existing dio workers */
	inode_dio_wait(orig_inode);
	inode_dio_wait(donor_inode);

	/* Protect extent tree against block allocations via delalloc */
	ext4_double_down_write_data_sem(orig_inode, donor_inode);
	/* Check the filesystem environment whether move_extent can be done */
	ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
				   donor_blk, &len);
	if (ret)
		goto out;
	o_end = o_start + len;

	*moved_len = 0;
	while (o_start < o_end) {
		struct ext4_extent *ex;
		ext4_lblk_t cur_blk, next_blk;
		pgoff_t orig_page_index, donor_page_index;
		int offset_in_page;
		int unwritten, cur_len;

		path = get_ext_path(orig_inode, o_start, path);
		if (IS_ERR(path)) {
			ret = PTR_ERR(path);
			goto out;
		}
		ex = path[path->p_depth].p_ext;
		cur_blk = le32_to_cpu(ex->ee_block);
		cur_len = ext4_ext_get_actual_len(ex);
		/* Check hole before the start pos */
		if (cur_blk + cur_len - 1 < o_start) {
			next_blk = ext4_ext_next_allocated_block(path);
			if (next_blk == EXT_MAX_BLOCKS) {
				ret = -ENODATA;
				goto out;
			}
			d_start += next_blk - o_start;
			o_start = next_blk;
			continue;
		/* Check hole after the start pos */
		} else if (cur_blk > o_start) {
			/* Skip hole */
			d_start += cur_blk - o_start;
			o_start = cur_blk;
			/* Extent inside requested range? */
			if (cur_blk >= o_end)
				goto out;
		} else { /* in_range(o_start, o_blk, o_len) */
			cur_len += cur_blk - o_start;
		}
		unwritten = ext4_ext_is_unwritten(ex);
		if (o_end - o_start < cur_len)
			cur_len = o_end - o_start;

		orig_page_index = o_start >> (PAGE_SHIFT -
					       orig_inode->i_blkbits);
		donor_page_index = d_start >> (PAGE_SHIFT -
					       donor_inode->i_blkbits);
		offset_in_page = o_start % blocks_per_page;
		if (cur_len > blocks_per_page - offset_in_page)
			cur_len = blocks_per_page - offset_in_page;
		/*
		 * Up semaphore to avoid following problems:
		 * a. transaction deadlock among ext4_journal_start,
		 *    ->write_begin via pagefault, and jbd2_journal_commit
		 * b. racing with ->read_folio, ->write_begin, and
		 *    ext4_get_block in move_extent_per_page
		 */
		ext4_double_up_write_data_sem(orig_inode, donor_inode);
		/* Swap original branches with new branches */
		*moved_len += move_extent_per_page(o_filp, donor_inode,
					orig_page_index, donor_page_index,
					offset_in_page, cur_len,
					unwritten, &ret);
		ext4_double_down_write_data_sem(orig_inode, donor_inode);
		if (ret < 0)
			break;
		o_start += cur_len;
		d_start += cur_len;
	}

out:
	if (*moved_len) {
		ext4_discard_preallocations(orig_inode);
		ext4_discard_preallocations(donor_inode);
	}

	ext4_free_ext_path(path);
	ext4_double_up_write_data_sem(orig_inode, donor_inode);
	unlock_two_nondirectories(orig_inode, donor_inode);

	return ret;
}