// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem high-level buffered write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/pagevec.h>
#include "internal.h"
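
/*
 * Dirty folios are tracked through folio->private, which holds either a
 * pointer to the netfs_group the modification belongs to, the special
 * NETFS_FOLIO_COPY_TO_CACHE marker or, for a streaming write, a netfs_folio
 * record describing the dirty region, tagged with NETFS_FOLIO_INFO in the
 * bottom bit (see netfs_folio_info() and netfs_folio_group()).  The helpers
 * below install and replace that state.
 */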

static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
        if (netfs_group)
                folio_attach_private(folio, netfs_get_group(netfs_group));
}

static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
        void *priv = folio_get_private(folio);

        if (unlikely(priv != netfs_group)) {
                if (netfs_group &&
                    (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
                        folio_attach_private(folio, netfs_get_group(netfs_group));
                else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
                        folio_detach_private(folio);
        }
}

/*
 * Grab a folio for writing and lock it.  Attempt to allocate as large a folio
 * as possible to hold as much of the remaining length as possible in one go.
 */
static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
                                                loff_t pos, size_t part)
{
        pgoff_t index = pos / PAGE_SIZE;
        fgf_t fgp_flags = FGP_WRITEBEGIN;

        if (mapping_large_folio_support(mapping))
                fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);

        return __filemap_get_folio(mapping, index, fgp_flags,
                                   mapping_gfp_mask(mapping));
}
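
/*
 * Note that the folio order passed in through fgf_set_order() above is only a
 * hint: __filemap_get_folio() may return a smaller (even order-0) folio, and,
 * because FGP_WRITEBEGIN includes FGP_CREAT, failure comes back as an ERR_PTR
 * rather than NULL.  The write loop below therefore recomputes its offset and
 * length against the folio it actually receives.
 */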

/*
 * Update i_size and estimate the update to i_blocks to reflect the additional
 * data written into the pagecache until we can find out from the server what
 * the values actually are.
 */
static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
                                loff_t i_size, loff_t pos, size_t copied)
{
        blkcnt_t add;
        size_t gap;

        if (ctx->ops->update_i_size) {
                ctx->ops->update_i_size(inode, pos);
                return;
        }

        i_size_write(inode, pos);
#if IS_ENABLED(CONFIG_FSCACHE)
        fscache_update_cookie(ctx->cache, NULL, &pos);
#endif

        gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
        if (copied > gap) {
                add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);

                inode->i_blocks = min_t(blkcnt_t,
                                        DIV_ROUND_UP(pos, SECTOR_SIZE),
                                        inode->i_blocks + add);
        }
}

/**
 * netfs_perform_write - Copy data into the pagecache.
 * @iocb: The operation parameters
 * @iter: The source buffer
 * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
 *
 * Copy data into pagecache folios attached to the inode specified by @iocb.
 * The caller must hold appropriate inode locks.
 *
 * Dirty folios are tagged with a netfs_folio struct if they're not up to date
 * to indicate the range modified.  Dirty folios may also be tagged with a
 * netfs-specific grouping such that data from an old group gets flushed before
 * a new one is started.
 */
ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
                            struct netfs_group *netfs_group)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct address_space *mapping = inode->i_mapping;
        struct netfs_inode *ctx = netfs_inode(inode);
        struct writeback_control wbc = {
                .sync_mode      = WB_SYNC_NONE,
                .nr_to_write    = LONG_MAX,
                .range_start    = iocb->ki_pos,
                .range_end      = iocb->ki_pos + iter->count,
        };
        struct netfs_io_request *wreq = NULL;
        struct folio *folio = NULL, *writethrough = NULL;
        unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
        ssize_t written = 0, ret, ret2;
        loff_t i_size, pos = iocb->ki_pos;
        size_t max_chunk = mapping_max_folio_size(mapping);
        bool maybe_trouble = false;

        if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
                     iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
            ) {
                wbc_attach_fdatawrite_inode(&wbc, mapping->host);

                ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
                if (ret < 0) {
                        wbc_detach_inode(&wbc);
                        goto out;
                }

                wreq = netfs_begin_writethrough(iocb, iter->count);
                if (IS_ERR(wreq)) {
                        wbc_detach_inode(&wbc);
                        ret = PTR_ERR(wreq);
                        wreq = NULL;
                        goto out;
                }
                if (!is_sync_kiocb(iocb))
                        wreq->iocb = iocb;
                netfs_stat(&netfs_n_wh_writethrough);
        } else {
                netfs_stat(&netfs_n_wh_buffered_write);
        }
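
        /* Copy the data into the pagecache one folio at a time.  For each
         * folio we pick one of several strategies: copy straight into an
         * uptodate folio; zero-fill around the copy if the folio lies beyond
         * the zero point; overwrite the whole folio in one go; read or
         * prefetch the folio first and then modify it; or fall back to a
         * streaming write that only records the region actually modified.
         */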

        do {
                struct netfs_folio *finfo;
                struct netfs_group *group;
                unsigned long long fpos;
                size_t flen;
                size_t offset;  /* Offset into pagecache folio */
                size_t part;    /* Bytes to write to folio */
                size_t copied;  /* Bytes copied from user */

                offset = pos & (max_chunk - 1);
                part = min(max_chunk - offset, iov_iter_count(iter));

                /* Bring in the user pages that we will copy from _first_ lest
                 * we hit a nasty deadlock on copying from the same page as
                 * we're writing to, without it being marked uptodate.
                 *
                 * Not only is this an optimisation, but it is also required to
                 * check that the address is actually valid, when atomic
                 * usercopies are used below.
                 *
                 * We rely on the page being held onto long enough by the LRU
                 * that we can grab it below if this causes it to be read.
                 */
                ret = -EFAULT;
                if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
                        break;

                folio = netfs_grab_folio_for_write(mapping, pos, part);
                if (IS_ERR(folio)) {
                        ret = PTR_ERR(folio);
                        break;
                }

                flen = folio_size(folio);
                fpos = folio_pos(folio);
                offset = pos - fpos;
                part = min_t(size_t, flen - offset, part);

                /* Wait for writeback to complete.  The writeback engine owns
                 * the info in folio->private and may change it until it
                 * removes the WB mark.
                 */
                if (folio_get_private(folio) &&
                    folio_wait_writeback_killable(folio)) {
                        ret = written ? -EINTR : -ERESTARTSYS;
                        goto error_folio_unlock;
                }

                if (signal_pending(current)) {
                        ret = written ? -EINTR : -ERESTARTSYS;
                        goto error_folio_unlock;
                }

                /* Decide how we should modify a folio.  We might be attempting
                 * to do write-streaming, in which case we don't want to do a
                 * local RMW cycle if we can avoid it.  If we're doing local
                 * caching or content crypto, we award that priority over
                 * avoiding RMW.  If the file is open readably, then we also
                 * assume that we may want to read what we wrote.
                 */
                finfo = netfs_folio_info(folio);
                group = netfs_folio_group(folio);

                if (unlikely(group != netfs_group) &&
                    group != NETFS_FOLIO_COPY_TO_CACHE)
                        goto flush_content;

                if (folio_test_uptodate(folio)) {
                        if (mapping_writably_mapped(mapping))
                                flush_dcache_folio(folio);
                        copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
                        if (unlikely(copied == 0))
                                goto copy_failed;
                        netfs_set_group(folio, netfs_group);
                        trace_netfs_folio(folio, netfs_folio_is_uptodate);
                        goto copied;
                }

                /* If the page is above the zero-point then we assume that the
                 * server would just return a block of zeros or a short read if
                 * we try to read it.
                 */
                if (fpos >= ctx->zero_point) {
                        folio_zero_segment(folio, 0, offset);
                        copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
                        if (unlikely(copied == 0))
                                goto copy_failed;
                        folio_zero_segment(folio, offset + copied, flen);
                        __netfs_set_group(folio, netfs_group);
                        folio_mark_uptodate(folio);
                        trace_netfs_folio(folio, netfs_modify_and_clear);
                        goto copied;
                }

                /* See if we can write a whole folio in one go. */
                if (!maybe_trouble && offset == 0 && part >= flen) {
                        copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
                        if (unlikely(copied == 0))
                                goto copy_failed;
                        if (unlikely(copied < part)) {
                                maybe_trouble = true;
                                iov_iter_revert(iter, copied);
                                copied = 0;
                                folio_unlock(folio);
                                goto retry;
                        }
                        __netfs_set_group(folio, netfs_group);
                        folio_mark_uptodate(folio);
                        trace_netfs_folio(folio, netfs_whole_folio_modify);
                        goto copied;
                }

                /* We don't want to do a streaming write on a file that might
                 * temporarily lose its caching service (because the backing
                 * store got culled), and we don't really want a streaming
                 * write on a file that's open for reading either, as
                 * ->read_folio() would then have to be able to flush it.
                 */
                if ((file->f_mode & FMODE_READ) ||
                    netfs_is_cache_enabled(ctx)) {
                        if (finfo) {
                                netfs_stat(&netfs_n_wh_wstream_conflict);
                                goto flush_content;
                        }
                        ret = netfs_prefetch_for_write(file, folio, offset, part);
                        if (ret < 0) {
                                _debug("prefetch = %zd", ret);
                                goto error_folio_unlock;
                        }
                        /* Note that copy-to-cache may have been set. */

                        copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
                        if (unlikely(copied == 0))
                                goto copy_failed;
                        netfs_set_group(folio, netfs_group);
                        trace_netfs_folio(folio, netfs_just_prefetch);
                        goto copied;
                }
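
                /* Otherwise do a streaming write: copy into the non-uptodate
                 * folio without reading it first and record the dirtied
                 * region in a netfs_folio so that writeback need only send
                 * that region to the server.  If the copy happens to fill
                 * the whole folio, it can simply be marked uptodate instead.
                 */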
                if (!finfo) {
                        if (WARN_ON(folio_get_private(folio)))
                                goto error_folio_unlock;
                        copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
                        if (unlikely(copied == 0))
                                goto copy_failed;
                        if (offset == 0 && copied == flen) {
                                __netfs_set_group(folio, netfs_group);
                                folio_mark_uptodate(folio);
                                trace_netfs_folio(folio, netfs_streaming_filled_page);
                                goto copied;
                        }

                        finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
                        if (!finfo) {
                                iov_iter_revert(iter, copied);
                                ret = -ENOMEM;
                                goto error_folio_unlock;
                        }
                        finfo->netfs_group = netfs_get_group(netfs_group);
                        finfo->dirty_offset = offset;
                        finfo->dirty_len = copied;
                        folio_attach_private(folio, (void *)((unsigned long)finfo |
                                                             NETFS_FOLIO_INFO));
                        trace_netfs_folio(folio, netfs_streaming_write);
                        goto copied;
                }

                /* We can continue a streaming write only if it continues on
                 * from the previous.  If it overlaps, we must flush lest we
                 * suffer a partial copy and disjoint dirty regions.
                 */
                if (offset == finfo->dirty_offset + finfo->dirty_len) {
                        copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
                        if (unlikely(copied == 0))
                                goto copy_failed;
                        finfo->dirty_len += copied;
                        if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
                                if (finfo->netfs_group)
                                        folio_change_private(folio, finfo->netfs_group);
                                else
                                        folio_detach_private(folio);
                                folio_mark_uptodate(folio);
                                kfree(finfo);
                                trace_netfs_folio(folio, netfs_streaming_cont_filled_page);
                        } else {
                                trace_netfs_folio(folio, netfs_streaming_write_cont);
                        }
                        goto copied;
                }

                /* Incompatible write; flush the folio and try again. */
        flush_content:
                trace_netfs_folio(folio, netfs_flush_content);
                folio_unlock(folio);
                folio_put(folio);

                ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
                if (ret < 0)
                        goto out;
                continue;

        copied:
                flush_dcache_folio(folio);

                /* Update the inode size if we moved the EOF marker */
                pos += copied;
                i_size = i_size_read(inode);
                if (pos > i_size)
                        netfs_update_i_size(ctx, inode, i_size, pos, copied);
                written += copied;

                if (likely(!wreq)) {
                        folio_mark_dirty(folio);
                        folio_unlock(folio);
                } else {
                        netfs_advance_writethrough(wreq, &wbc, folio, copied,
                                                   offset + copied == flen,
                                                   &writethrough);
                        /* The folio is unlocked by the writethrough code. */
                }

        retry:
                folio_put(folio);
                folio = NULL;

                ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
                if (unlikely(ret < 0))
                        break;

                cond_resched();
        } while (iov_iter_count(iter));
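
        /* We get here when the iterator is exhausted or when the loop breaks
         * out on an error, and we also jump here from the error paths below.
         * Update the modification attributes and wind up any writethrough
         * request before returning.
         */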
out:
        if (likely(written)) {
                /* Set indication that ctime and mtime got updated in case
                 * close is deferred.
                 */
                set_bit(NETFS_ICTX_MODIFIED_ATTR, &ctx->flags);
                if (unlikely(ctx->ops->post_modify))
                        ctx->ops->post_modify(inode);
        }

        if (unlikely(wreq)) {
                ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
                wbc_detach_inode(&wbc);
                if (ret2 == -EIOCBQUEUED)
                        return ret2;
                if (ret == 0)
                        ret = ret2;
        }

        iocb->ki_pos += written;
        _leave(" = %zd [%zd]", written, ret);
        return written ? written : ret;

copy_failed:
        ret = -EFAULT;
error_folio_unlock:
        folio_unlock(folio);
        folio_put(folio);
        goto out;
}
EXPORT_SYMBOL(netfs_perform_write);
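
/*
 * Note on writethrough: if the inode is marked NETFS_ICTX_WRITETHROUGH or the
 * write is O_SYNC/O_DSYNC, netfs_perform_write() above pushes the data to the
 * server as it is copied, via netfs_begin_writethrough(),
 * netfs_advance_writethrough() and netfs_end_writethrough(), and an
 * asynchronous iocb may then complete with -EIOCBQUEUED.
 */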

/**
 * netfs_buffered_write_iter_locked - write data to a file
 * @iocb: IO state structure (file, offset, etc.)
 * @from: iov_iter with data to write
 * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
 *
 * This function does all the work needed for actually writing data to a
 * file.  It does all basic checks, removes SUID from the file, updates
 * modification times and then copies the data into the pagecache.
 *
 * The caller must hold appropriate locks around this function and have called
 * generic_write_checks() already.  The caller is also responsible for doing
 * any necessary syncing afterwards.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it.  This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
                                         struct netfs_group *netfs_group)
{
        struct file *file = iocb->ki_filp;
        ssize_t ret;

        trace_netfs_write_iter(iocb, from);

        ret = file_remove_privs(file);
        if (ret)
                return ret;

        ret = file_update_time(file);
        if (ret)
                return ret;

        return netfs_perform_write(iocb, from, netfs_group);
}
EXPORT_SYMBOL(netfs_buffered_write_iter_locked);

/**
 * netfs_file_write_iter - write data to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Perform a write to a file, writing into the pagecache if possible and doing
 * an unbuffered write instead if not.
 *
 * Return:
 * * Negative error code if no data has been written at all or if
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        struct netfs_inode *ictx = netfs_inode(inode);
        ssize_t ret;

        _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));

        if (!iov_iter_count(from))
                return 0;

        if ((iocb->ki_flags & IOCB_DIRECT) ||
            test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
                return netfs_unbuffered_write_iter(iocb, from);

        ret = netfs_start_io_write(inode);
        if (ret < 0)
                return ret;

        ret = generic_write_checks(iocb, from);
        if (ret > 0)
                ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
        netfs_end_io_write(inode);
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
        return ret;
}
EXPORT_SYMBOL(netfs_file_write_iter);
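
/*
 * A network filesystem typically points its ->write_iter at the helper above
 * or wraps it.  A minimal, purely illustrative sketch, assuming a hypothetical
 * "examplefs" with its own mmap handler:
 *
 *	const struct file_operations examplefs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= netfs_file_read_iter,
 *		.write_iter	= netfs_file_write_iter,
 *		.mmap		= examplefs_file_mmap,
 *	};
 *
 * A filesystem that needs per-write grouping (the netfs_group argument, e.g.
 * for ceph snaps) would instead call netfs_buffered_write_iter_locked() or
 * netfs_perform_write() directly and pass its group in.
 */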

/*
 * Notification that a previously read-only page is about to become writable.
 * The caller indicates the precise page that needs to be written to, but
 * we only track the group on a per-folio basis, so we block more often than
 * we might otherwise.
 */
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
        struct netfs_group *group;
        struct folio *folio = page_folio(vmf->page);
        struct file *file = vmf->vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = file_inode(file);
        struct netfs_inode *ictx = netfs_inode(inode);
        vm_fault_t ret = VM_FAULT_NOPAGE;
        int err;

        _enter("%lx", folio->index);

        sb_start_pagefault(inode->i_sb);

        if (folio_lock_killable(folio) < 0)
                goto out;
        if (folio->mapping != mapping)
                goto unlock;
        if (folio_wait_writeback_killable(folio) < 0)
                goto unlock;

        /* Can we see a streaming write here? */
        if (WARN_ON(!folio_test_uptodate(folio))) {
                ret = VM_FAULT_SIGBUS;
                goto unlock;
        }
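
        /* If the folio is already dirty on behalf of a different write
         * grouping (e.g. a different ceph snap context), we can't just add
         * this modification to it.  Start writeback to flush the old group
         * and ask the caller to retry the fault once that has cleaned the
         * folio.
         */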
        group = netfs_folio_group(folio);
        if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
                folio_unlock(folio);
                err = filemap_fdatawrite_range(mapping,
                                               folio_pos(folio),
                                               folio_pos(folio) + folio_size(folio));
                switch (err) {
                case 0:
                        ret = VM_FAULT_RETRY;
                        goto out;
                case -ENOMEM:
                        ret = VM_FAULT_OOM;
                        goto out;
                default:
                        ret = VM_FAULT_SIGBUS;
                        goto out;
                }
        }

        if (folio_test_dirty(folio))
                trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
        else
                trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
        netfs_set_group(folio, netfs_group);
        file_update_time(file);
        set_bit(NETFS_ICTX_MODIFIED_ATTR, &ictx->flags);
        if (ictx->ops->post_modify)
                ictx->ops->post_modify(inode);
        ret = VM_FAULT_LOCKED;
out:
        sb_end_pagefault(inode->i_sb);
        return ret;

unlock:
        folio_unlock(folio);
        goto out;
}
EXPORT_SYMBOL(netfs_page_mkwrite);