// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem high-level (buffered) writeback.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 *
 * To support network filesystems with local caching, we manage a situation
 * that can be envisioned like the following:
 *
 *               +---+---+-----+-----+---+----------+
 *    Folios:    |   |   |     |     |   |          |
 *               +---+---+-----+-----+---+----------+
 *
 *                 +------+------+     +----+----+
 *    Upload:      |      |      |.....|    |    |
 *  (Stream 0)     +------+------+     +----+----+
 *
 *               +------+------+------+------+------+
 *    Cache:     |      |      |      |      |      |
 *  (Stream 1)   +------+------+------+------+------+
 *
 * Where we have a sequence of folios of varying sizes that we need to overlay
 * with multiple parallel streams of I/O requests, where the I/O requests in a
 * stream may also be of various sizes (in cifs, for example, the sizes are
 * negotiated with the server; in something like ceph, they may represent the
 * sizes of storage objects).
 *
 * The sequence in each stream may contain gaps and noncontiguous subrequests
 * may be glued together into single vectored write RPCs.
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include "internal.h"

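/*
 * In the code below, the upload row of the diagram above corresponds to
 * wreq->io_streams[0] (writing data to the server) and the cache row to
 * wreq->io_streams[1] (writing data to the local cache); both are set up in
 * netfs_create_write_req().
 */
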
/*
 * Kill all dirty folios in the event of an unrecoverable error, starting with
 * a locked folio we've already obtained from writeback_iter().
 */
static void netfs_kill_dirty_pages(struct address_space *mapping,
				   struct writeback_control *wbc,
				   struct folio *folio)
{
	int error = 0;

	do {
		enum netfs_folio_trace why = netfs_folio_trace_kill;
		struct netfs_group *group = NULL;
		struct netfs_folio *finfo = NULL;
		void *priv;

		priv = folio_detach_private(folio);
		if (priv) {
			finfo = __netfs_folio_info(priv);
			if (finfo) {
				/* Kill folio from streaming write. */
				group = finfo->netfs_group;
				why = netfs_folio_trace_kill_s;
			} else {
				group = priv;
				if (group == NETFS_FOLIO_COPY_TO_CACHE) {
					/* Kill copy-to-cache folio */
					why = netfs_folio_trace_kill_cc;
					group = NULL;
				} else {
					/* Kill folio with group */
					why = netfs_folio_trace_kill_g;
				}
			}
		}

		trace_netfs_folio(folio, why);

		folio_start_writeback(folio);
		folio_unlock(folio);
		folio_end_writeback(folio);

		netfs_put_group(group);
		kfree(finfo);

	} while ((folio = writeback_iter(mapping, wbc, folio, &error)));
}

/*
 * Create a write request and set it up appropriately for the origin type.
 */
struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
						struct file *file,
						loff_t start,
						enum netfs_io_origin origin)
{
	struct netfs_io_request *wreq;
	struct netfs_inode *ictx;
	bool is_buffered = (origin == NETFS_WRITEBACK ||
			    origin == NETFS_WRITETHROUGH ||
			    origin == NETFS_PGPRIV2_COPY_TO_CACHE);

	wreq = netfs_alloc_request(mapping, file, start, 0, origin);
	if (IS_ERR(wreq))
		return wreq;

	_enter("R=%x", wreq->debug_id);

	ictx = netfs_inode(wreq->inode);
	if (is_buffered && netfs_is_cache_enabled(ictx))
		fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));

	wreq->cleaned_to = wreq->start;

	wreq->io_streams[0].stream_nr		= 0;
	wreq->io_streams[0].source		= NETFS_UPLOAD_TO_SERVER;
	wreq->io_streams[0].prepare_write	= ictx->ops->prepare_write;
	wreq->io_streams[0].issue_write		= ictx->ops->issue_write;
	wreq->io_streams[0].collected_to	= start;
	wreq->io_streams[0].transferred		= LONG_MAX;

	wreq->io_streams[1].stream_nr		= 1;
	wreq->io_streams[1].source		= NETFS_WRITE_TO_CACHE;
	wreq->io_streams[1].collected_to	= start;
	wreq->io_streams[1].transferred		= LONG_MAX;
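	/* The cache stream is only made available if we managed to begin a
	 * write operation on the cache above; otherwise it stays inactive and
	 * is skipped when data is advanced into the streams.
	 */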
	if (fscache_resources_valid(&wreq->cache_resources)) {
		wreq->io_streams[1].avail	= true;
		wreq->io_streams[1].active	= true;
		wreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq;
		wreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write;
	}

	return wreq;
}

/**
 * netfs_prepare_write_failed - Note write preparation failed
 * @subreq: The subrequest to mark
 *
 * Mark a subrequest to note that preparation for write failed.
 */
void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq)
{
	__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
	trace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed);
}
EXPORT_SYMBOL(netfs_prepare_write_failed);

/*
 * Prepare a write subrequest.  We need to allocate a new subrequest
 * if we don't have one.
 */
static void netfs_prepare_write(struct netfs_io_request *wreq,
				struct netfs_io_stream *stream,
				loff_t start)
{
	struct netfs_io_subrequest *subreq;
	struct iov_iter *wreq_iter = &wreq->io_iter;

	/* Make sure we don't point the iterator at a used-up folio_queue
	 * struct being used as a placeholder to prevent the queue from
	 * collapsing.  In such a case, extend the queue.
	 */
	if (iov_iter_is_folioq(wreq_iter) &&
	    wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq)) {
		netfs_buffer_make_space(wreq);
	}

	subreq = netfs_alloc_subrequest(wreq);
	subreq->source		= stream->source;
	subreq->start		= start;
	subreq->stream_nr	= stream->stream_nr;
	subreq->io_iter		= *wreq_iter;

	_enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);

	trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);

	stream->sreq_max_len	= UINT_MAX;
	stream->sreq_max_segs	= INT_MAX;
	switch (stream->source) {
	case NETFS_UPLOAD_TO_SERVER:
		netfs_stat(&netfs_n_wh_upload);
		stream->sreq_max_len = wreq->wsize;
		break;
	case NETFS_WRITE_TO_CACHE:
		netfs_stat(&netfs_n_wh_write);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	if (stream->prepare_write)
		stream->prepare_write(subreq);

	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);

	/* We add to the end of the list whilst the collector may be walking
	 * the list.  The collector only goes forwards and uses the lock to
	 * remove entries off of the front.
	 */
	spin_lock_bh(&wreq->lock);
	list_add_tail(&subreq->rreq_link, &stream->subrequests);
	if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
		stream->front = subreq;
		if (!stream->active) {
			stream->collected_to = stream->front->start;
			/* Write list pointers before active flag */
			smp_store_release(&stream->active, true);
		}
	}

	spin_unlock_bh(&wreq->lock);

	stream->construct = subreq;
}

/*
 * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O
 * operation.  The operation may be asynchronous and should call
 * netfs_write_subrequest_terminated() when complete.
 */
static void netfs_do_issue_write(struct netfs_io_stream *stream,
				 struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *wreq = subreq->rreq;

	_enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);

	if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
		return netfs_write_subrequest_terminated(subreq, subreq->error, false);

	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
	stream->issue_write(subreq);
}

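/*
 * Reissue a subrequest, pointing its iterator at the as-yet untransferred
 * portion of the data to be drawn from *source.
 */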
void netfs_reissue_write(struct netfs_io_stream *stream,
			 struct netfs_io_subrequest *subreq,
			 struct iov_iter *source)
{
	size_t size = subreq->len - subreq->transferred;

	// TODO: Use encrypted buffer
	subreq->io_iter = *source;
	iov_iter_advance(source, size);
	iov_iter_truncate(&subreq->io_iter, size);

	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
	netfs_do_issue_write(stream, subreq);
}

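/*
 * Issue the subrequest currently under construction on a stream, if there is
 * one, and clear the construction point.
 */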
void netfs_issue_write(struct netfs_io_request *wreq,
		       struct netfs_io_stream *stream)
{
	struct netfs_io_subrequest *subreq = stream->construct;

	if (!subreq)
		return;
	stream->construct = NULL;
	subreq->io_iter.count = subreq->len;
	netfs_do_issue_write(stream, subreq);
}

/*
 * Add data to the write subrequest, dispatching each as we fill it up or if it
 * is discontiguous with the previous.  We only fill one part at a time so that
 * we can avoid overrunning the credits obtained (cifs) and try to parallelise
 * content-crypto preparation with network writes.
 */
int netfs_advance_write(struct netfs_io_request *wreq,
			struct netfs_io_stream *stream,
			loff_t start, size_t len, bool to_eof)
{
	struct netfs_io_subrequest *subreq = stream->construct;
	size_t part;

	if (!stream->avail) {
		_leave("no write");
		return len;
	}

	_enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);

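	/* If the new data isn't contiguous with the subrequest we're currently
	 * building, flush that subrequest and start a new one.
	 */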
	if (subreq && start != subreq->start + subreq->len) {
		netfs_issue_write(wreq, stream);
		subreq = NULL;
	}

	if (!stream->construct)
		netfs_prepare_write(wreq, stream, start);
	subreq = stream->construct;

	part = umin(stream->sreq_max_len - subreq->len, len);
	_debug("part %zx/%zx %zx/%zx", subreq->len, stream->sreq_max_len, part, len);
	subreq->len += part;
	subreq->nr_segs++;
	stream->submit_extendable_to -= part;

	if (subreq->len >= stream->sreq_max_len ||
	    subreq->nr_segs >= stream->sreq_max_segs ||
	    to_eof) {
		netfs_issue_write(wreq, stream);
		subreq = NULL;
	}

	return part;
}

/*
 * Write some of a pending folio's data back to the server.
 */
static int netfs_write_folio(struct netfs_io_request *wreq,
			     struct writeback_control *wbc,
			     struct folio *folio)
{
	struct netfs_io_stream *upload = &wreq->io_streams[0];
	struct netfs_io_stream *cache  = &wreq->io_streams[1];
	struct netfs_io_stream *stream;
	struct netfs_group *fgroup; /* TODO: Use this with ceph */
	struct netfs_folio *finfo;
	size_t iter_off = 0;
	size_t fsize = folio_size(folio), flen = fsize, foff = 0;
	loff_t fpos = folio_pos(folio), i_size;
	bool to_eof = false, streamw = false;
	bool debug = false;

	_enter("");

	/* netfs_perform_write() may shift i_size around the page or from out
	 * of the page to beyond it, but cannot move i_size into or through the
	 * page since we have it locked.
	 */
	i_size = i_size_read(wreq->inode);

	if (fpos >= i_size) {
		/* mmap beyond eof. */
		_debug("beyond eof");
		folio_start_writeback(folio);
		folio_unlock(folio);
		wreq->nr_group_rel += netfs_folio_written_back(folio);
		netfs_put_group_many(wreq->group, wreq->nr_group_rel);
		wreq->nr_group_rel = 0;
		folio_end_writeback(folio);
		return 0;
	}

	if (fpos + fsize > wreq->i_size)
		wreq->i_size = i_size;

	fgroup = netfs_folio_group(folio);
	finfo = netfs_folio_info(folio);
	if (finfo) {
		foff = finfo->dirty_offset;
		flen = foff + finfo->dirty_len;
		streamw = true;
	}

	if (wreq->origin == NETFS_WRITETHROUGH) {
		to_eof = false;
		if (flen > i_size - fpos)
			flen = i_size - fpos;
	} else if (flen > i_size - fpos) {
		flen = i_size - fpos;
		if (!streamw)
			folio_zero_segment(folio, flen, fsize);
		to_eof = true;
	} else if (flen == i_size - fpos) {
		to_eof = true;
	}
	flen -= foff;

	_debug("folio %zx %zx %zx", foff, flen, fsize);

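	/* At this point, foff is the offset of the dirty data within the
	 * folio, flen is the length of that data and fsize is the size of the
	 * whole folio; to_eof indicates that the dirty region runs up to the
	 * current file size.
	 */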
	/* Deal with discontinuities in the stream of dirty pages.  These can
	 * arise from a number of sources:
	 *
	 * (1) Intervening non-dirty pages from random-access writes, multiple
	 *     flushers writing back different parts simultaneously and manual
	 *     syncing.
	 *
	 * (2) Partially-written pages from write-streaming.
	 *
	 * (3) Pages that belong to a different write-back group (eg. Ceph
	 *     snaps).
	 *
	 * (4) Actually-clean pages that were marked for write to the cache
	 *     when they were read.  Note that these appear as a special
	 *     write-back group.
	 */
	if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
		netfs_issue_write(wreq, upload);
	} else if (fgroup != wreq->group) {
		/* We can't write this page to the server yet. */
		kdebug("wrong group");
		folio_redirty_for_writepage(wbc, folio);
		folio_unlock(folio);
		netfs_issue_write(wreq, upload);
		netfs_issue_write(wreq, cache);
		return 0;
	}

	if (foff > 0)
		netfs_issue_write(wreq, upload);
	if (streamw)
		netfs_issue_write(wreq, cache);

	/* Flip the page to the writeback state and unlock.  If we're called
	 * from write-through, then the page has already been put into the wb
	 * state.
	 */
	if (wreq->origin == NETFS_WRITEBACK)
		folio_start_writeback(folio);
	folio_unlock(folio);

	if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
		if (!cache->avail) {
			trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
			netfs_issue_write(wreq, upload);
			netfs_folio_written_back(folio);
			return 0;
		}
		trace_netfs_folio(folio, netfs_folio_trace_store_copy);
	} else if (!upload->avail && !cache->avail) {
		trace_netfs_folio(folio, netfs_folio_trace_cancel_store);
		netfs_folio_written_back(folio);
		return 0;
	} else if (!upload->construct) {
		trace_netfs_folio(folio, netfs_folio_trace_store);
	} else {
		trace_netfs_folio(folio, netfs_folio_trace_store_plus);
	}

	/* Attach the folio to the rolling buffer. */
	netfs_buffer_append_folio(wreq, folio, false);

	/* Move the submission point forward to allow for write-streaming data
	 * not starting at the front of the page.  We don't do write-streaming
	 * with the cache as the cache requires DIO alignment.
	 *
	 * Also skip uploading for data that's been read and just needs copying
	 * to the cache.
	 */
	for (int s = 0; s < NR_IO_STREAMS; s++) {
		stream = &wreq->io_streams[s];
		stream->submit_off = foff;
		stream->submit_len = flen;
		if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
		    (stream->source == NETFS_UPLOAD_TO_SERVER &&
		     fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
			stream->submit_off = UINT_MAX;
			stream->submit_len = 0;
		}
	}

	/* Attach the folio to one or more subrequests.  For a big folio, we
	 * could end up with thousands of subrequests if the wsize is small -
	 * but we might need to wait during the creation of subrequests for
	 * network resources (eg. SMB credits).
	 */
	for (;;) {
		ssize_t part;
		size_t lowest_off = ULONG_MAX;
		int choose_s = -1;

		/* Always add to the lowest-submitted stream first. */
		for (int s = 0; s < NR_IO_STREAMS; s++) {
			stream = &wreq->io_streams[s];
			if (stream->submit_len > 0 &&
			    stream->submit_off < lowest_off) {
				lowest_off = stream->submit_off;
				choose_s = s;
			}
		}

		if (choose_s < 0)
			break;
		stream = &wreq->io_streams[choose_s];

		/* Advance the iterator(s). */
		if (stream->submit_off > iter_off) {
			iov_iter_advance(&wreq->io_iter, stream->submit_off - iter_off);
			iter_off = stream->submit_off;
		}

		atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
		stream->submit_extendable_to = fsize - stream->submit_off;
		part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
					   stream->submit_len, to_eof);
		stream->submit_off += part;
		if (part > stream->submit_len)
			stream->submit_len = 0;
		else
			stream->submit_len -= part;
		if (part > 0)
			debug = true;
	}

	if (fsize > iter_off)
		iov_iter_advance(&wreq->io_iter, fsize - iter_off);
	atomic64_set(&wreq->issued_to, fpos + fsize);

	if (!debug)
		kdebug("R=%x: No submit", wreq->debug_id);

	if (foff + flen < fsize)
		for (int s = 0; s < NR_IO_STREAMS; s++)
			netfs_issue_write(wreq, &wreq->io_streams[s]);

	_leave(" = 0");
	return 0;
}

/*
 * End the issuing of writes, letting the collector know we're done.
 */
static void netfs_end_issue_write(struct netfs_io_request *wreq)
{
	bool needs_poke = true;

	smp_wmb(); /* Write subreq lists before ALL_QUEUED. */
	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);

	for (int s = 0; s < NR_IO_STREAMS; s++) {
		struct netfs_io_stream *stream = &wreq->io_streams[s];

		if (!stream->active)
			continue;
		if (!list_empty(&stream->subrequests))
			needs_poke = false;
		netfs_issue_write(wreq, stream);
	}

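	/* If none of the streams had any subrequests outstanding, nothing will
	 * come back through the collector of its own accord, so poke it now.
	 */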
	if (needs_poke)
		netfs_wake_write_collector(wreq, false);
}

/*
 * Write some of the pending data back to the server.
 */
int netfs_writepages(struct address_space *mapping,
		     struct writeback_control *wbc)
{
	struct netfs_inode *ictx = netfs_inode(mapping->host);
	struct netfs_io_request *wreq = NULL;
	struct folio *folio;
	int error = 0;

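	/* Writeback on this inode is serialised by wb_lock.  If we can't get
	 * it, only wait for it if this is data-integrity writeback; otherwise
	 * just skip this pass.
	 */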
	if (!mutex_trylock(&ictx->wb_lock)) {
		if (wbc->sync_mode == WB_SYNC_NONE) {
			netfs_stat(&netfs_n_wb_lock_skip);
			return 0;
		}
		netfs_stat(&netfs_n_wb_lock_wait);
		mutex_lock(&ictx->wb_lock);
	}

	/* Need the first folio to be able to set up the op. */
	folio = writeback_iter(mapping, wbc, NULL, &error);
	if (!folio)
		goto out;

	wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
	if (IS_ERR(wreq)) {
		error = PTR_ERR(wreq);
		goto couldnt_start;
	}

	trace_netfs_write(wreq, netfs_write_trace_writeback);
	netfs_stat(&netfs_n_wh_writepages);

	do {
		_debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to));

		/* It appears we don't have to handle cyclic writeback wrapping. */
		WARN_ON_ONCE(wreq && folio_pos(folio) < atomic64_read(&wreq->issued_to));

		if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&
		    unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {
			set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
			wreq->netfs_ops->begin_writeback(wreq);
		}

		error = netfs_write_folio(wreq, wbc, folio);
		if (error < 0)
			break;
	} while ((folio = writeback_iter(mapping, wbc, folio, &error)));

	netfs_end_issue_write(wreq);

	mutex_unlock(&ictx->wb_lock);

	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
	_leave(" = %d", error);
	return error;

couldnt_start:
	netfs_kill_dirty_pages(mapping, wbc, folio);
out:
	mutex_unlock(&ictx->wb_lock);
	_leave(" = %d", error);
	return error;
}
EXPORT_SYMBOL(netfs_writepages);

/*
 * Begin a write operation for writing through the pagecache.
 */
struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
{
	struct netfs_io_request *wreq = NULL;
	struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp));

	mutex_lock(&ictx->wb_lock);

	wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp,
				      iocb->ki_pos, NETFS_WRITETHROUGH);
	if (IS_ERR(wreq)) {
		mutex_unlock(&ictx->wb_lock);
		return wreq;
	}

	wreq->io_streams[0].avail = true;
	trace_netfs_write(wreq, netfs_write_trace_writethrough);
	return wreq;
}

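/*
 * Note that, on success, netfs_begin_writethrough() leaves ictx->wb_lock held;
 * it is dropped again by netfs_end_writethrough().
 */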
/*
 * Advance the state of the write operation used when writing through the
 * pagecache.  Data has been copied into the pagecache that we need to append
 * to the request.  If we've added more than wsize then we need to create a new
 * subrequest.
 */
int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
			       struct folio *folio, size_t copied, bool to_page_end,
			       struct folio **writethrough_cache)
{
	_enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
	       wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end);

	if (!*writethrough_cache) {
		if (folio_test_dirty(folio))
			/* Sigh.  mmap. */
			folio_clear_dirty_for_io(folio);

		/* We can make multiple writes to the folio... */
		folio_start_writeback(folio);
		if (wreq->len == 0)
			trace_netfs_folio(folio, netfs_folio_trace_wthru);
		else
			trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
		*writethrough_cache = folio;
	}

	wreq->len += copied;
	if (!to_page_end)
		return 0;

	*writethrough_cache = NULL;
	return netfs_write_folio(wreq, wbc, folio);
}

/*
 * End a write operation used when writing through the pagecache.
 */
int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
			   struct folio *writethrough_cache)
{
	struct netfs_inode *ictx = netfs_inode(wreq->inode);
	int ret;

	_enter("R=%x", wreq->debug_id);

	if (writethrough_cache)
		netfs_write_folio(wreq, wbc, writethrough_cache);

	netfs_end_issue_write(wreq);

	mutex_unlock(&ictx->wb_lock);

	if (wreq->iocb) {
		ret = -EIOCBQUEUED;
	} else {
		wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
		ret = wreq->error;
	}
	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
	return ret;
}

/*
 * Write data to the server without going through the pagecache and without
 * writing it to the local cache.
 */
int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len)
{
	struct netfs_io_stream *upload = &wreq->io_streams[0];
	ssize_t part;
	loff_t start = wreq->start;
	int error = 0;

	_enter("%zx", len);

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_begin(wreq->inode);

	while (len) {
		// TODO: Prepare content encryption

		_debug("unbuffered %zx", len);
		part = netfs_advance_write(wreq, upload, start, len, false);
		start += part;
		len -= part;
		iov_iter_advance(&wreq->io_iter, part);
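		/* If issuing has been paused (NETFS_RREQ_PAUSE is set), wait
		 * for the pause to clear before issuing any more.
		 */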
		if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
			trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
			wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);
		}
		if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
			break;
	}

	netfs_end_issue_write(wreq);
	_leave(" = %d", error);
	return error;
}