1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/slab.h>
3 #include <linux/stat.h>
4 #include <linux/sched/xacct.h>
5 #include <linux/fcntl.h>
6 #include <linux/file.h>
8 #include <linux/fsnotify.h>
9 #include <linux/security.h>
10 #include <linux/export.h>
11 #include <linux/syscalls.h>
12 #include <linux/pagemap.h>
13 #include <linux/splice.h>
14 #include <linux/compat.h>
15 #include <linux/mount.h>
17 #include <linux/dax.h>
18 #include <linux/overflow.h>
21 #include <linux/uaccess.h>
22 #include <asm/unistd.h>
25 * Performs necessary checks before doing a clone.
27 * Can adjust amount of bytes to clone via @req_count argument.
28 * Returns appropriate error code that caller should return or
29 * zero in case the clone should be allowed.
31 static int generic_remap_checks(struct file
*file_in
, loff_t pos_in
,
32 struct file
*file_out
, loff_t pos_out
,
33 loff_t
*req_count
, unsigned int remap_flags
)
35 struct inode
*inode_in
= file_in
->f_mapping
->host
;
36 struct inode
*inode_out
= file_out
->f_mapping
->host
;
37 uint64_t count
= *req_count
;
39 loff_t size_in
, size_out
;
40 loff_t bs
= inode_out
->i_sb
->s_blocksize
;
43 /* The start of both ranges must be aligned to an fs block. */
44 if (!IS_ALIGNED(pos_in
, bs
) || !IS_ALIGNED(pos_out
, bs
))
47 /* Ensure offsets don't wrap. */
48 if (pos_in
+ count
< pos_in
|| pos_out
+ count
< pos_out
)
51 size_in
= i_size_read(inode_in
);
52 size_out
= i_size_read(inode_out
);
54 /* Dedupe requires both ranges to be within EOF. */
55 if ((remap_flags
& REMAP_FILE_DEDUP
) &&
56 (pos_in
>= size_in
|| pos_in
+ count
> size_in
||
57 pos_out
>= size_out
|| pos_out
+ count
> size_out
))
60 /* Ensure the infile range is within the infile. */
61 if (pos_in
>= size_in
)
63 count
= min(count
, size_in
- (uint64_t)pos_in
);
65 ret
= generic_write_check_limits(file_out
, pos_out
, &count
);
70 * If the user wanted us to link to the infile's EOF, round up to the
71 * next block boundary for this check.
73 * Otherwise, make sure the count is also block-aligned, having
74 * already confirmed the starting offsets' block alignment.
76 if (pos_in
+ count
== size_in
&&
77 (!(remap_flags
& REMAP_FILE_DEDUP
) || pos_out
+ count
== size_out
)) {
78 bcount
= ALIGN(size_in
, bs
) - pos_in
;
80 if (!IS_ALIGNED(count
, bs
))
81 count
= ALIGN_DOWN(count
, bs
);
85 /* Don't allow overlapped cloning within the same file. */
86 if (inode_in
== inode_out
&&
87 pos_out
+ bcount
> pos_in
&&
88 pos_out
< pos_in
+ bcount
)
92 * We shortened the request but the caller can't deal with that, so
93 * bounce the request back to userspace.
95 if (*req_count
!= count
&& !(remap_flags
& REMAP_FILE_CAN_SHORTEN
))
102 int remap_verify_area(struct file
*file
, loff_t pos
, loff_t len
, bool write
)
104 int mask
= write
? MAY_WRITE
: MAY_READ
;
108 if (unlikely(pos
< 0 || len
< 0))
111 if (unlikely(check_add_overflow(pos
, len
, &tmp
)))
114 ret
= security_file_permission(file
, mask
);
118 return fsnotify_file_area_perm(file
, mask
, &pos
, len
);
120 EXPORT_SYMBOL_GPL(remap_verify_area
);
123 * Ensure that we don't remap a partial EOF block in the middle of something
124 * else. Assume that the offsets have already been checked for block
127 * For clone we only link a partial EOF block above or at the destination file's
128 * EOF. For deduplication we accept a partial EOF block only if it ends at the
129 * destination file's EOF (can not link it into the middle of a file).
131 * Shorten the request if possible.
133 static int generic_remap_check_len(struct inode
*inode_in
,
134 struct inode
*inode_out
,
137 unsigned int remap_flags
)
139 u64 blkmask
= i_blocksize(inode_in
) - 1;
140 loff_t new_len
= *len
;
142 if ((*len
& blkmask
) == 0)
145 if (pos_out
+ *len
< i_size_read(inode_out
))
151 if (remap_flags
& REMAP_FILE_CAN_SHORTEN
) {
156 return (remap_flags
& REMAP_FILE_DEDUP
) ? -EBADE
: -EINVAL
;
159 /* Read a page's worth of file data into the page cache. */
160 static struct folio
*vfs_dedupe_get_folio(struct file
*file
, loff_t pos
)
162 return read_mapping_folio(file
->f_mapping
, pos
>> PAGE_SHIFT
, file
);
166 * Lock two folios, ensuring that we lock in offset order if the folios
167 * are from the same file.
169 static void vfs_lock_two_folios(struct folio
*folio1
, struct folio
*folio2
)
171 /* Always lock in order of increasing index. */
172 if (folio1
->index
> folio2
->index
)
173 swap(folio1
, folio2
);
176 if (folio1
!= folio2
)
/*
 * Unlock two folios, being careful not to unlock the same folio twice.
 *
 * @folio1 is always unlocked; @folio2 is unlocked only when it is a
 * different folio.
 */
static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2)
{
	folio_unlock(folio1);
	if (folio1 != folio2)
		folio_unlock(folio2);
}
189 * Compare extents of two files to see if they are the same.
190 * Caller must have locked both inodes to prevent write races.
192 static int vfs_dedupe_file_range_compare(struct file
*src
, loff_t srcoff
,
193 struct file
*dest
, loff_t dstoff
,
194 loff_t len
, bool *is_same
)
200 struct folio
*src_folio
, *dst_folio
;
201 void *src_addr
, *dst_addr
;
202 loff_t cmp_len
= min(PAGE_SIZE
- offset_in_page(srcoff
),
203 PAGE_SIZE
- offset_in_page(dstoff
));
205 cmp_len
= min(cmp_len
, len
);
209 src_folio
= vfs_dedupe_get_folio(src
, srcoff
);
210 if (IS_ERR(src_folio
)) {
211 error
= PTR_ERR(src_folio
);
214 dst_folio
= vfs_dedupe_get_folio(dest
, dstoff
);
215 if (IS_ERR(dst_folio
)) {
216 error
= PTR_ERR(dst_folio
);
217 folio_put(src_folio
);
221 vfs_lock_two_folios(src_folio
, dst_folio
);
224 * Now that we've locked both folios, make sure they're still
225 * mapped to the file data we're interested in. If not,
226 * someone is invalidating pages on us and we lose.
228 if (!folio_test_uptodate(src_folio
) || !folio_test_uptodate(dst_folio
) ||
229 src_folio
->mapping
!= src
->f_mapping
||
230 dst_folio
->mapping
!= dest
->f_mapping
) {
235 src_addr
= kmap_local_folio(src_folio
,
236 offset_in_folio(src_folio
, srcoff
));
237 dst_addr
= kmap_local_folio(dst_folio
,
238 offset_in_folio(dst_folio
, dstoff
));
240 flush_dcache_folio(src_folio
);
241 flush_dcache_folio(dst_folio
);
243 if (memcmp(src_addr
, dst_addr
, cmp_len
))
246 kunmap_local(dst_addr
);
247 kunmap_local(src_addr
);
249 vfs_unlock_two_folios(src_folio
, dst_folio
);
250 folio_put(dst_folio
);
251 folio_put(src_folio
);
269 * Check that the two inodes are eligible for cloning, the ranges make
270 * sense, and then flush all dirty data. Caller must ensure that the
271 * inodes have been locked against any other modifications.
273 * If there's an error, then the usual negative error code is returned.
274 * Otherwise returns 0 with *len set to the request length.
277 __generic_remap_file_range_prep(struct file
*file_in
, loff_t pos_in
,
278 struct file
*file_out
, loff_t pos_out
,
279 loff_t
*len
, unsigned int remap_flags
,
280 const struct iomap_ops
*dax_read_ops
)
282 struct inode
*inode_in
= file_inode(file_in
);
283 struct inode
*inode_out
= file_inode(file_out
);
284 bool same_inode
= (inode_in
== inode_out
);
287 /* Don't touch certain kinds of inodes */
288 if (IS_IMMUTABLE(inode_out
))
291 if (IS_SWAPFILE(inode_in
) || IS_SWAPFILE(inode_out
))
294 /* Don't reflink dirs, pipes, sockets... */
295 if (S_ISDIR(inode_in
->i_mode
) || S_ISDIR(inode_out
->i_mode
))
297 if (!S_ISREG(inode_in
->i_mode
) || !S_ISREG(inode_out
->i_mode
))
300 /* Zero length dedupe exits immediately; reflink goes to EOF. */
302 loff_t isize
= i_size_read(inode_in
);
304 if ((remap_flags
& REMAP_FILE_DEDUP
) || pos_in
== isize
)
308 *len
= isize
- pos_in
;
313 /* Check that we don't violate system file offset limits. */
314 ret
= generic_remap_checks(file_in
, pos_in
, file_out
, pos_out
, len
,
316 if (ret
|| *len
== 0)
319 /* Wait for the completion of any pending IOs on both files */
320 inode_dio_wait(inode_in
);
322 inode_dio_wait(inode_out
);
324 ret
= filemap_write_and_wait_range(inode_in
->i_mapping
,
325 pos_in
, pos_in
+ *len
- 1);
329 ret
= filemap_write_and_wait_range(inode_out
->i_mapping
,
330 pos_out
, pos_out
+ *len
- 1);
335 * Check that the extents are the same.
337 if (remap_flags
& REMAP_FILE_DEDUP
) {
338 bool is_same
= false;
340 if (!IS_DAX(inode_in
))
341 ret
= vfs_dedupe_file_range_compare(file_in
, pos_in
,
342 file_out
, pos_out
, *len
, &is_same
);
343 else if (dax_read_ops
)
344 ret
= dax_dedupe_file_range_compare(inode_in
, pos_in
,
345 inode_out
, pos_out
, *len
, &is_same
,
355 ret
= generic_remap_check_len(inode_in
, inode_out
, pos_out
, len
,
357 if (ret
|| *len
== 0)
360 /* If can't alter the file contents, we're done. */
361 if (!(remap_flags
& REMAP_FILE_DEDUP
))
362 ret
= file_modified(file_out
);
367 int generic_remap_file_range_prep(struct file
*file_in
, loff_t pos_in
,
368 struct file
*file_out
, loff_t pos_out
,
369 loff_t
*len
, unsigned int remap_flags
)
371 return __generic_remap_file_range_prep(file_in
, pos_in
, file_out
,
372 pos_out
, len
, remap_flags
, NULL
);
374 EXPORT_SYMBOL(generic_remap_file_range_prep
);
376 loff_t
vfs_clone_file_range(struct file
*file_in
, loff_t pos_in
,
377 struct file
*file_out
, loff_t pos_out
,
378 loff_t len
, unsigned int remap_flags
)
382 WARN_ON_ONCE(remap_flags
& REMAP_FILE_DEDUP
);
384 if (file_inode(file_in
)->i_sb
!= file_inode(file_out
)->i_sb
)
387 ret
= generic_file_rw_checks(file_in
, file_out
);
391 if (!file_in
->f_op
->remap_file_range
)
394 ret
= remap_verify_area(file_in
, pos_in
, len
, false);
398 ret
= remap_verify_area(file_out
, pos_out
, len
, true);
402 file_start_write(file_out
);
403 ret
= file_in
->f_op
->remap_file_range(file_in
, pos_in
,
404 file_out
, pos_out
, len
, remap_flags
);
405 file_end_write(file_out
);
409 fsnotify_access(file_in
);
410 fsnotify_modify(file_out
);
413 EXPORT_SYMBOL(vfs_clone_file_range
);
415 /* Check whether we are allowed to dedupe the destination file */
416 static bool may_dedupe_file(struct file
*file
)
418 struct mnt_idmap
*idmap
= file_mnt_idmap(file
);
419 struct inode
*inode
= file_inode(file
);
421 if (capable(CAP_SYS_ADMIN
))
423 if (file
->f_mode
& FMODE_WRITE
)
425 if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap
, inode
), current_fsuid()))
427 if (!inode_permission(idmap
, inode
, MAY_WRITE
))
432 loff_t
vfs_dedupe_file_range_one(struct file
*src_file
, loff_t src_pos
,
433 struct file
*dst_file
, loff_t dst_pos
,
434 loff_t len
, unsigned int remap_flags
)
438 WARN_ON_ONCE(remap_flags
& ~(REMAP_FILE_DEDUP
|
439 REMAP_FILE_CAN_SHORTEN
));
442 * This is redundant if called from vfs_dedupe_file_range(), but other
443 * callers need it and it's not performance sesitive...
445 ret
= remap_verify_area(src_file
, src_pos
, len
, false);
449 ret
= remap_verify_area(dst_file
, dst_pos
, len
, true);
454 * This needs to be called after remap_verify_area() because of
455 * sb_start_write() and before may_dedupe_file() because the mount's
456 * MAY_WRITE need to be checked with mnt_get_write_access_file() held.
458 ret
= mnt_want_write_file(dst_file
);
463 if (!may_dedupe_file(dst_file
))
467 if (file_inode(src_file
)->i_sb
!= file_inode(dst_file
)->i_sb
)
471 if (S_ISDIR(file_inode(dst_file
)->i_mode
))
475 if (!dst_file
->f_op
->remap_file_range
)
483 ret
= dst_file
->f_op
->remap_file_range(src_file
, src_pos
, dst_file
,
484 dst_pos
, len
, remap_flags
| REMAP_FILE_DEDUP
);
486 mnt_drop_write_file(dst_file
);
490 EXPORT_SYMBOL(vfs_dedupe_file_range_one
);
492 int vfs_dedupe_file_range(struct file
*file
, struct file_dedupe_range
*same
)
494 struct file_dedupe_range_info
*info
;
495 struct inode
*src
= file_inode(file
);
500 u16 count
= same
->dest_count
;
503 if (!(file
->f_mode
& FMODE_READ
))
506 if (same
->reserved1
|| same
->reserved2
)
509 off
= same
->src_offset
;
510 len
= same
->src_length
;
512 if (S_ISDIR(src
->i_mode
))
515 if (!S_ISREG(src
->i_mode
))
518 if (!file
->f_op
->remap_file_range
)
521 ret
= remap_verify_area(file
, off
, len
, false);
526 if (off
+ len
> i_size_read(src
))
529 /* Arbitrary 1G limit on a single dedupe request, can be raised. */
530 len
= min_t(u64
, len
, 1 << 30);
532 /* pre-format output fields to sane values */
533 for (i
= 0; i
< count
; i
++) {
534 same
->info
[i
].bytes_deduped
= 0ULL;
535 same
->info
[i
].status
= FILE_DEDUPE_RANGE_SAME
;
538 for (i
= 0, info
= same
->info
; i
< count
; i
++, info
++) {
539 CLASS(fd
, dst_fd
)(info
->dest_fd
);
541 if (fd_empty(dst_fd
)) {
542 info
->status
= -EBADF
;
546 if (info
->reserved
) {
547 info
->status
= -EINVAL
;
551 deduped
= vfs_dedupe_file_range_one(file
, off
, fd_file(dst_fd
),
552 info
->dest_offset
, len
,
553 REMAP_FILE_CAN_SHORTEN
);
554 if (deduped
== -EBADE
)
555 info
->status
= FILE_DEDUPE_RANGE_DIFFERS
;
556 else if (deduped
< 0)
557 info
->status
= deduped
;
559 info
->bytes_deduped
= len
;
562 if (fatal_signal_pending(current
))
567 EXPORT_SYMBOL(vfs_dedupe_file_range
);