// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem high-level (buffered) writeback.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 *
 * To support network filesystems with local caching, we manage a situation
 * that can be envisioned like the following:
 *
 *               +---+---+-----+-----+---+----------+
 *    Folios:    |   |   |     |     |   |          |
 *               +---+---+-----+-----+---+----------+
 *
 *                 +------+------+     +----+----+
 *    Upload:      |      |      |.....|    |    |
 *  (Stream 0)     +------+------+     +----+----+
 *
 *               +------+------+------+------+------+
 *    Cache:     |      |      |      |      |      |
 *  (Stream 1)   +------+------+------+------+------+
 *
 * Where we have a sequence of folios of varying sizes that we need to overlay
 * with multiple parallel streams of I/O requests, where the I/O requests in a
 * stream may also be of various sizes (in cifs, for example, the sizes are
 * negotiated with the server; in something like ceph, they may represent the
 * sizes of storage objects).
 *
 * The sequence in each stream may contain gaps and noncontiguous subrequests
 * may be glued together into single vectored write RPCs.
 */
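
/* Illustrative example (added commentary, not part of the original file):
 * suppose a 256KiB run of dirty folios is being written back, the negotiated
 * upload wsize is 64KiB and the cache imposes no limit of its own.  Stream 0
 * (upload) would then be chopped into four 64KiB subrequests while stream 1
 * (cache) could cover the same range with a single subrequest; each stream is
 * advanced over the folios independently, e.g.:
 *
 *	netfs_advance_write(wreq, &wreq->io_streams[0], pos, len, to_eof);
 *	netfs_advance_write(wreq, &wreq->io_streams[1], pos, len, to_eof);
 */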

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include "internal.h"

/*
 * Kill all dirty folios in the event of an unrecoverable error, starting with
 * a locked folio we've already obtained from writeback_iter().
 */
static void netfs_kill_dirty_pages(struct address_space *mapping,
				   struct writeback_control *wbc,
				   struct folio *folio)
{
	int error = 0;

	do {
		enum netfs_folio_trace why = netfs_folio_trace_kill;
		struct netfs_group *group = NULL;
		struct netfs_folio *finfo = NULL;
		void *priv;

		priv = folio_detach_private(folio);
		if (priv) {
			finfo = __netfs_folio_info(priv);
			if (finfo) {
				/* Kill folio from streaming write. */
				group = finfo->netfs_group;
				why = netfs_folio_trace_kill_s;
			} else {
				group = priv;
				if (group == NETFS_FOLIO_COPY_TO_CACHE) {
					/* Kill copy-to-cache folio */
					why = netfs_folio_trace_kill_cc;
					group = NULL;
				} else {
					/* Kill folio with group */
					why = netfs_folio_trace_kill_g;
				}
			}
		}

		trace_netfs_folio(folio, why);

		folio_start_writeback(folio);
		folio_unlock(folio);
		folio_end_writeback(folio);

		netfs_put_group(group);
		kfree(finfo);

	} while ((folio = writeback_iter(mapping, wbc, folio, &error)));
}
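
/* Added commentary (not from the original): writeback_iter() supplies the
 * loop above with locked dirty folios - it is seeded with a NULL folio to
 * obtain the first one, is handed the previous folio back on each further
 * call, and returns NULL once the range is exhausted, with the final status
 * left in *error.  netfs_writepages() below uses the same pattern, roughly:
 *
 *	folio = writeback_iter(mapping, wbc, NULL, &error);
 *	while (folio) {
 *		error = netfs_write_folio(wreq, wbc, folio);
 *		if (error < 0)
 *			break;
 *		folio = writeback_iter(mapping, wbc, folio, &error);
 *	}
 */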

/*
 * Create a write request and set it up appropriately for the origin type.
 */
struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
						struct file *file,
						loff_t start,
						enum netfs_io_origin origin)
{
	struct netfs_io_request *wreq;
	struct netfs_inode *ictx;
	bool is_buffered = (origin == NETFS_WRITEBACK ||
			    origin == NETFS_WRITETHROUGH ||
			    origin == NETFS_PGPRIV2_COPY_TO_CACHE);

	wreq = netfs_alloc_request(mapping, file, start, 0, origin);
	if (IS_ERR(wreq))
		return wreq;

	_enter("R=%x", wreq->debug_id);

	ictx = netfs_inode(wreq->inode);
	if (is_buffered && netfs_is_cache_enabled(ictx))
		fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));

	wreq->cleaned_to = wreq->start;

	wreq->io_streams[0].stream_nr		= 0;
	wreq->io_streams[0].source		= NETFS_UPLOAD_TO_SERVER;
	wreq->io_streams[0].prepare_write	= ictx->ops->prepare_write;
	wreq->io_streams[0].issue_write		= ictx->ops->issue_write;
	wreq->io_streams[0].collected_to	= start;
	wreq->io_streams[0].transferred		= LONG_MAX;

	wreq->io_streams[1].stream_nr		= 1;
	wreq->io_streams[1].source		= NETFS_WRITE_TO_CACHE;
	wreq->io_streams[1].collected_to	= start;
	wreq->io_streams[1].transferred		= LONG_MAX;
	if (fscache_resources_valid(&wreq->cache_resources)) {
		wreq->io_streams[1].avail	= true;
		wreq->io_streams[1].active	= true;
		wreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq;
		wreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write;
	}

	return wreq;
}
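
/* Illustrative sketch (not part of the original file): the upload stream is
 * driven through the filesystem's netfs_request_ops, so a network filesystem
 * using these helpers supplies something along these lines ("myfs" and its
 * helpers are hypothetical):
 *
 *	static void myfs_prepare_write(struct netfs_io_subrequest *subreq)
 *	{
 *		struct netfs_io_stream *stream =
 *			&subreq->rreq->io_streams[subreq->stream_nr];
 *
 *		// Clamp subrequests to what the server will accept.
 *		stream->sreq_max_len = myfs_negotiated_wsize(subreq->rreq);
 *	}
 *
 *	static void myfs_issue_write(struct netfs_io_subrequest *subreq)
 *	{
 *		// Transmit subreq->io_iter and arrange for
 *		// netfs_write_subrequest_terminated() to be called on
 *		// completion.
 *	}
 *
 *	static const struct netfs_request_ops myfs_req_ops = {
 *		.prepare_write	= myfs_prepare_write,
 *		.issue_write	= myfs_issue_write,
 *	};
 */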

/**
 * netfs_prepare_write_failed - Note write preparation failed
 * @subreq: The subrequest to mark
 *
 * Mark a subrequest to note that preparation for write failed.
 */
void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq)
{
	__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
	trace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed);
}
EXPORT_SYMBOL(netfs_prepare_write_failed);
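
/* Usage sketch (illustrative, not from the original): ->prepare_write() has
 * no return value, so a failure to obtain the resources needed for the write
 * (credits, a channel, etc.) is reported by marking the subrequest instead
 * ("myfs_get_credits" is a hypothetical helper):
 *
 *	static void myfs_prepare_write(struct netfs_io_subrequest *subreq)
 *	{
 *		int ret = myfs_get_credits(subreq);
 *
 *		if (ret < 0) {
 *			subreq->error = ret;
 *			netfs_prepare_write_failed(subreq);
 *		}
 *	}
 */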

/*
 * Prepare a write subrequest.  We need to allocate a new subrequest
 * if we don't have one.
 */
static void netfs_prepare_write(struct netfs_io_request *wreq,
				struct netfs_io_stream *stream,
				loff_t start)
{
	struct netfs_io_subrequest *subreq;
	struct iov_iter *wreq_iter = &wreq->io_iter;

	/* Make sure we don't point the iterator at a used-up folio_queue
	 * struct being used as a placeholder to prevent the queue from
	 * collapsing.  In such a case, extend the queue.
	 */
	if (iov_iter_is_folioq(wreq_iter) &&
	    wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq)) {
		netfs_buffer_make_space(wreq);
	}

	subreq = netfs_alloc_subrequest(wreq);
	subreq->source		= stream->source;
	subreq->start		= start;
	subreq->stream_nr	= stream->stream_nr;
	subreq->io_iter		= *wreq_iter;

	_enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);

	trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);

	stream->sreq_max_len	= UINT_MAX;
	stream->sreq_max_segs	= INT_MAX;
	switch (stream->source) {
	case NETFS_UPLOAD_TO_SERVER:
		netfs_stat(&netfs_n_wh_upload);
		stream->sreq_max_len = wreq->wsize;
		break;
	case NETFS_WRITE_TO_CACHE:
		netfs_stat(&netfs_n_wh_write);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	if (stream->prepare_write)
		stream->prepare_write(subreq);

	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);

	/* We add to the end of the list whilst the collector may be walking
	 * the list.  The collector only goes nextwards and uses the lock to
	 * remove entries off of the front.
	 */
	spin_lock_bh(&wreq->lock);
	list_add_tail(&subreq->rreq_link, &stream->subrequests);
	if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
		stream->front = subreq;
		if (!stream->active) {
			stream->collected_to = stream->front->start;
			/* Write list pointers before active flag */
			smp_store_release(&stream->active, true);
		}
	}

	spin_unlock_bh(&wreq->lock);

	stream->construct = subreq;
}
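
/* Added commentary (not from the original): the smp_store_release() above
 * publishes the list linkage and stream->front before stream->active becomes
 * visible, so a reader that polls the flag without taking wreq->lock should
 * pair it with an acquire, e.g.:
 *
 *	if (smp_load_acquire(&stream->active))
 *		front = READ_ONCE(stream->front);
 */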

/*
 * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O
 * operation.  The operation may be asynchronous and should call
 * netfs_write_subrequest_terminated() when complete.
 */
static void netfs_do_issue_write(struct netfs_io_stream *stream,
				 struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *wreq = subreq->rreq;

	_enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);

	if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
		return netfs_write_subrequest_terminated(subreq, subreq->error, false);

	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
	stream->issue_write(subreq);
}
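
/* Illustrative sketch (not from the original): ->issue_write() is allowed to
 * complete asynchronously; whenever the I/O finishes, the outcome is handed
 * back to netfs, e.g. from an RPC completion callback ("myfs" names are
 * hypothetical):
 *
 *	static void myfs_write_done(struct myfs_call *call)
 *	{
 *		struct netfs_io_subrequest *subreq = call->subreq;
 *
 *		netfs_write_subrequest_terminated(subreq,
 *						  call->error ?: subreq->len,
 *						  false);
 *	}
 */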

void netfs_reissue_write(struct netfs_io_stream *stream,
			 struct netfs_io_subrequest *subreq,
			 struct iov_iter *source)
{
	size_t size = subreq->len - subreq->transferred;

	// TODO: Use encrypted buffer
	subreq->io_iter = *source;
	iov_iter_advance(source, size);
	iov_iter_truncate(&subreq->io_iter, size);

	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
	netfs_do_issue_write(stream, subreq);
}

void netfs_issue_write(struct netfs_io_request *wreq,
		       struct netfs_io_stream *stream)
{
	struct netfs_io_subrequest *subreq = stream->construct;

	if (!subreq)
		return;
	stream->construct = NULL;
	subreq->io_iter.count = subreq->len;
	netfs_do_issue_write(stream, subreq);
}

/*
 * Add data to the write subrequest, dispatching each as we fill it up or if it
 * is discontiguous with the previous.  We only fill one part at a time so that
 * we can avoid overrunning the credits obtained (cifs) and try to parallelise
 * content-crypto preparation with network writes.
 */
int netfs_advance_write(struct netfs_io_request *wreq,
			struct netfs_io_stream *stream,
			loff_t start, size_t len, bool to_eof)
{
	struct netfs_io_subrequest *subreq = stream->construct;
	size_t part;

	if (!stream->avail) {
		_leave("no write");
		return len;
	}

	_enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);

	if (subreq && start != subreq->start + subreq->len) {
		netfs_issue_write(wreq, stream);
		subreq = NULL;
	}

	if (!stream->construct)
		netfs_prepare_write(wreq, stream, start);
	subreq = stream->construct;

	part = umin(stream->sreq_max_len - subreq->len, len);
	_debug("part %zx/%zx %zx/%zx", subreq->len, stream->sreq_max_len, part, len);
	subreq->len += part;
	subreq->nr_segs++;
	stream->submit_extendable_to -= part;

	if (subreq->len >= stream->sreq_max_len ||
	    subreq->nr_segs >= stream->sreq_max_segs ||
	    to_eof) {
		netfs_issue_write(wreq, stream);
		subreq = NULL;
	}

	return part;
}
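
/* Worked example (illustrative, not from the original): with an upload
 * stream whose sreq_max_len works out at 64KiB, submitting 160KiB of
 * contiguous dirty data proceeds roughly as:
 *
 *	netfs_advance_write(... start=0,      len=160KiB) -> part=64KiB (issued)
 *	netfs_advance_write(... start=64KiB,  len=96KiB)  -> part=64KiB (issued)
 *	netfs_advance_write(... start=128KiB, len=32KiB)  -> part=32KiB (held)
 *
 * The final 32KiB subrequest stays as stream->construct and is only flushed
 * by a later discontiguity, by to_eof or by an explicit netfs_issue_write().
 */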

/*
 * Write some of a pending folio data back to the server.
 */
static int netfs_write_folio(struct netfs_io_request *wreq,
			     struct writeback_control *wbc,
			     struct folio *folio)
{
	struct netfs_io_stream *upload = &wreq->io_streams[0];
	struct netfs_io_stream *cache  = &wreq->io_streams[1];
	struct netfs_io_stream *stream;
	struct netfs_group *fgroup; /* TODO: Use this with ceph */
	struct netfs_folio *finfo;
	size_t fsize = folio_size(folio), flen = fsize, foff = 0;
	loff_t fpos = folio_pos(folio), i_size;
	bool to_eof = false, streamw = false;
	bool debug = false;

	_enter("");

	/* netfs_perform_write() may shift i_size around the page or from out
	 * of the page to beyond it, but cannot move i_size into or through the
	 * page since we have it locked.
	 */
	i_size = i_size_read(wreq->inode);

	if (fpos >= i_size) {
		/* mmap beyond eof. */
		_debug("beyond eof");
		folio_start_writeback(folio);
		folio_unlock(folio);
		wreq->nr_group_rel += netfs_folio_written_back(folio);
		netfs_put_group_many(wreq->group, wreq->nr_group_rel);
		wreq->nr_group_rel = 0;
		folio_end_writeback(folio);
		return 0;
	}

	if (fpos + fsize > wreq->i_size)
		wreq->i_size = i_size;

	fgroup = netfs_folio_group(folio);
	finfo = netfs_folio_info(folio);
	if (finfo) {
		foff = finfo->dirty_offset;
		flen = foff + finfo->dirty_len;
		streamw = true;
	}

	if (wreq->origin == NETFS_WRITETHROUGH) {
		to_eof = false;
		if (flen > i_size - fpos)
			flen = i_size - fpos;
	} else if (flen > i_size - fpos) {
		flen = i_size - fpos;
		if (!streamw)
			folio_zero_segment(folio, flen, fsize);
		to_eof = true;
	} else if (flen == i_size - fpos) {
		to_eof = true;
	}
	flen -= foff;

	_debug("folio %zx %zx %zx", foff, flen, fsize);

	/* Deal with discontinuities in the stream of dirty pages.  These can
	 * arise from a number of sources:
	 *
	 * (1) Intervening non-dirty pages from random-access writes, multiple
	 *     flushers writing back different parts simultaneously and manual
	 *     syncs.
	 *
	 * (2) Partially-written pages from write-streaming.
	 *
	 * (3) Pages that belong to a different write-back group (eg. Ceph
	 *     snaps).
	 *
	 * (4) Actually-clean pages that were marked for write to the cache
	 *     when they were read.  Note that these appear as a special
	 *     write-back group.
	 */
	if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
		netfs_issue_write(wreq, upload);
	} else if (fgroup != wreq->group) {
		/* We can't write this page to the server yet. */
		kdebug("wrong group");
		folio_redirty_for_writepage(wbc, folio);
		folio_unlock(folio);
		netfs_issue_write(wreq, upload);
		netfs_issue_write(wreq, cache);
		return 0;
	}

	if (foff > 0)
		netfs_issue_write(wreq, upload);
	if (streamw)
		netfs_issue_write(wreq, cache);

	/* Flip the page to the writeback state and unlock.  If we're called
	 * from write-through, then the page has already been put into the wb
	 * state.
	 */
	if (wreq->origin == NETFS_WRITEBACK)
		folio_start_writeback(folio);
	folio_unlock(folio);

	if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
		if (!cache->avail) {
			trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
			netfs_issue_write(wreq, upload);
			netfs_folio_written_back(folio);
			return 0;
		}
		trace_netfs_folio(folio, netfs_folio_trace_store_copy);
	} else if (!upload->avail && !cache->avail) {
		trace_netfs_folio(folio, netfs_folio_trace_cancel_store);
		netfs_folio_written_back(folio);
		return 0;
	} else if (!upload->construct) {
		trace_netfs_folio(folio, netfs_folio_trace_store);
	} else {
		trace_netfs_folio(folio, netfs_folio_trace_store_plus);
	}

	/* Attach the folio to the rolling buffer. */
	netfs_buffer_append_folio(wreq, folio, false);

	/* Move the submission point forward to allow for write-streaming data
	 * not starting at the front of the page.  We don't do write-streaming
	 * with the cache as the cache requires DIO alignment.
	 *
	 * Also skip uploading for data that's been read and just needs copying
	 * to the cache.
	 */
	for (int s = 0; s < NR_IO_STREAMS; s++) {
		stream = &wreq->io_streams[s];
		stream->submit_off = foff;
		stream->submit_len = flen;
		if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
		    (stream->source == NETFS_UPLOAD_TO_SERVER &&
		     fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
			stream->submit_off = UINT_MAX;
			stream->submit_len = 0;
		}
	}

	/* Attach the folio to one or more subrequests.  For a big folio, we
	 * could end up with thousands of subrequests if the wsize is small -
	 * but we might need to wait during the creation of subrequests for
	 * network resources (eg. SMB credits).
	 */
	for (;;) {
		ssize_t part;
		size_t lowest_off = ULONG_MAX;
		int choose_s = -1;

		/* Always add to the lowest-submitted stream first. */
		for (int s = 0; s < NR_IO_STREAMS; s++) {
			stream = &wreq->io_streams[s];
			if (stream->submit_len > 0 &&
			    stream->submit_off < lowest_off) {
				lowest_off = stream->submit_off;
				choose_s = s;
			}
		}

		if (choose_s < 0)
			break;
		stream = &wreq->io_streams[choose_s];
		wreq->io_iter.iov_offset = stream->submit_off;

		atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
		stream->submit_extendable_to = fsize - stream->submit_off;
		part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
					   stream->submit_len, to_eof);
		stream->submit_off += part;
		if (part > stream->submit_len)
			stream->submit_len = 0;
		else
			stream->submit_len -= part;
		if (part > 0)
			debug = true;
	}

	wreq->io_iter.iov_offset = 0;
	iov_iter_advance(&wreq->io_iter, fsize);
	atomic64_set(&wreq->issued_to, fpos + fsize);

	if (!debug)
		kdebug("R=%x: No submit", wreq->debug_id);

	if (foff + flen < fsize)
		for (int s = 0; s < NR_IO_STREAMS; s++)
			netfs_issue_write(wreq, &wreq->io_streams[s]);

	_leave(" = 0");
	return 0;
}
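
/* Worked example (illustrative, not from the original): for a 16KiB folio
 * whose netfs_folio records a streamed dirty region of [4KiB, 12KiB), the
 * stream setup above yields:
 *
 *	upload stream: submit_off = 4KiB,     submit_len = 8KiB
 *	cache stream:  submit_off = UINT_MAX, submit_len = 0 (skipped; the
 *						cache needs DIO alignment)
 *
 * and, because foff + flen < fsize, any subrequest under construction in
 * each stream is flushed at the end so the partial folio cannot be glued to
 * the next one.
 */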

/*
 * Write some of the pending data back to the server
 */
int netfs_writepages(struct address_space *mapping,
		     struct writeback_control *wbc)
{
	struct netfs_inode *ictx = netfs_inode(mapping->host);
	struct netfs_io_request *wreq = NULL;
	struct folio *folio;
	int error = 0;

	if (!mutex_trylock(&ictx->wb_lock)) {
		if (wbc->sync_mode == WB_SYNC_NONE) {
			netfs_stat(&netfs_n_wb_lock_skip);
			return 0;
		}
		netfs_stat(&netfs_n_wb_lock_wait);
		mutex_lock(&ictx->wb_lock);
	}

	/* Need the first folio to be able to set up the op. */
	folio = writeback_iter(mapping, wbc, NULL, &error);
	if (!folio)
		goto out;

	wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
	if (IS_ERR(wreq)) {
		error = PTR_ERR(wreq);
		goto couldnt_start;
	}

	trace_netfs_write(wreq, netfs_write_trace_writeback);
	netfs_stat(&netfs_n_wh_writepages);

	do {
		_debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to));

		/* It appears we don't have to handle cyclic writeback wrapping. */
		WARN_ON_ONCE(wreq && folio_pos(folio) < atomic64_read(&wreq->issued_to));

		if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&
		    unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {
			set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
			wreq->netfs_ops->begin_writeback(wreq);
		}

		error = netfs_write_folio(wreq, wbc, folio);
		if (error < 0)
			break;
	} while ((folio = writeback_iter(mapping, wbc, folio, &error)));

	for (int s = 0; s < NR_IO_STREAMS; s++)
		netfs_issue_write(wreq, &wreq->io_streams[s]);
	smp_wmb(); /* Write lists before ALL_QUEUED. */
	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);

	mutex_unlock(&ictx->wb_lock);

	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
	_leave(" = %d", error);
	return error;

couldnt_start:
	netfs_kill_dirty_pages(mapping, wbc, folio);
out:
	mutex_unlock(&ictx->wb_lock);
	_leave(" = %d", error);
	return error;
}
EXPORT_SYMBOL(netfs_writepages);
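
/* Usage sketch (illustrative, not from the original): a filesystem converted
 * to these write helpers typically points its address_space_operations at
 * the netfs entry points, roughly:
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.writepages	= netfs_writepages,
 *		.dirty_folio	= netfs_dirty_folio,
 *		.release_folio	= netfs_release_folio,
 *		.invalidate_folio = netfs_invalidate_folio,
 *	};
 */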

/*
 * Begin a write operation for writing through the pagecache.
 */
struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
{
	struct netfs_io_request *wreq = NULL;
	struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp));

	mutex_lock(&ictx->wb_lock);

	wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp,
				      iocb->ki_pos, NETFS_WRITETHROUGH);
	if (IS_ERR(wreq)) {
		mutex_unlock(&ictx->wb_lock);
		return wreq;
	}

	wreq->io_streams[0].avail = true;
	trace_netfs_write(wreq, netfs_write_trace_writethrough);
	return wreq;
}

/*
 * Advance the state of the write operation used when writing through the
 * pagecache.  Data has been copied into the pagecache that we need to append
 * to the request.  If we've added more than wsize then we need to create a new
 * subrequest.
 */
int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
			       struct folio *folio, size_t copied, bool to_page_end,
			       struct folio **writethrough_cache)
{
	_enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
	       wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end);

	if (!*writethrough_cache) {
		if (folio_test_dirty(folio))
			/* Sigh.  mmap. */
			folio_clear_dirty_for_io(folio);

		/* We can make multiple writes to the folio... */
		folio_start_writeback(folio);
		if (wreq->len == 0)
			trace_netfs_folio(folio, netfs_folio_trace_wthru);
		else
			trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
		*writethrough_cache = folio;
	}

	wreq->len += copied;
	if (!to_page_end)
		return 0;

	*writethrough_cache = NULL;
	return netfs_write_folio(wreq, wbc, folio);
}

/*
 * End a write operation used when writing through the pagecache.
 */
int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
			   struct folio *writethrough_cache)
{
	struct netfs_inode *ictx = netfs_inode(wreq->inode);
	int ret;

	_enter("R=%x", wreq->debug_id);

	if (writethrough_cache)
		netfs_write_folio(wreq, wbc, writethrough_cache);

	netfs_issue_write(wreq, &wreq->io_streams[0]);
	netfs_issue_write(wreq, &wreq->io_streams[1]);
	smp_wmb(); /* Write lists before ALL_QUEUED. */
	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);

	mutex_unlock(&ictx->wb_lock);

	if (wreq->iocb) {
		ret = -EIOCBQUEUED;
	} else {
		wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
		ret = wreq->error;
	}
	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
	return ret;
}
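
/* Illustrative call sequence (not from the original): the three writethrough
 * helpers above are driven from the buffered write path with ictx->wb_lock
 * held from begin to end, roughly:
 *
 *	wreq = netfs_begin_writethrough(iocb, count);
 *	// for each folio filled from the user buffer:
 *	netfs_advance_writethrough(wreq, &wbc, folio, copied, to_page_end,
 *				   &writethrough);
 *	// finally:
 *	netfs_end_writethrough(wreq, &wbc, writethrough);
 */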

/*
 * Write data to the server without going through the pagecache and without
 * writing it to the local cache.
 */
int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len)
{
	struct netfs_io_stream *upload = &wreq->io_streams[0];
	ssize_t part;
	loff_t start = wreq->start;
	int error = 0;

	_enter("%zx", len);

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_begin(wreq->inode);

	while (len) {
		// TODO: Prepare content encryption

		_debug("unbuffered %zx", len);
		part = netfs_advance_write(wreq, upload, start, len, false);
		start += part;
		len -= part;
		iov_iter_advance(&wreq->io_iter, part);
		if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
			trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
			wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);
		}
		if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
			break;
	}

	netfs_issue_write(wreq, upload);

	smp_wmb(); /* Write lists before ALL_QUEUED. */
	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
	if (list_empty(&upload->subrequests))
		netfs_wake_write_collector(wreq, false);

	_leave(" = %d", error);
	return error;
}