// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem read subrequest result collection, assessment and
 * retrying.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
/*
 * Clear the unread part of an I/O request.
 */
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
	netfs_reset_iter(subreq);
	WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter));
	iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
	if (subreq->start + subreq->transferred >= subreq->rreq->i_size)
		__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
}
/*
 * Flush, mark and unlock a folio that's now completely read.  If we want to
 * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it
 * dirty and let writeback handle it.
 */
static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
				    struct netfs_io_request *rreq,
				    struct folio_queue *folioq,
				    int slot)
{
	struct netfs_folio *finfo;
	struct folio *folio = folioq_folio(folioq, slot);

	flush_dcache_folio(folio);
	folio_mark_uptodate(folio);

	if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
		finfo = netfs_folio_info(folio);
		if (finfo) {
			trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
			if (finfo->netfs_group)
				folio_change_private(folio, finfo->netfs_group);
			else
				folio_detach_private(folio);
			kfree(finfo);
		}

		if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
			if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
				trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
				folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
				folio_mark_dirty(folio);
			}
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_read_done);
		}
	} else {
		// TODO: Use of PG_private_2 is deprecated.
		if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
			netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot);
	}

	if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
		if (folio->index == rreq->no_unlock_folio &&
		    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
			_debug("no unlock");
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
			folio_unlock(folio);
		}
	}

	folioq_clear(folioq, slot);
}
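
/*
 * Note on donations: a subrequest may begin or end part way through a folio,
 * so the slack either side is "donated" between neighbouring subrequests via
 * ->prev_donated/->next_donated, and whichever subrequest ends up covering
 * the whole folio is the one that unlocks it.  See the comments in
 * netfs_consume_read_data() below.
 */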
/*
 * Unlock any folios that are now completely read.  Returns true if the
 * subrequest is removed from the list.
 */
static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async)
{
	struct netfs_io_subrequest *prev, *next;
	struct netfs_io_request *rreq = subreq->rreq;
	struct folio_queue *folioq = subreq->curr_folioq;
	size_t avail, prev_donated, next_donated, fsize, part, excess;
	loff_t fpos, start;
	loff_t fend;
	int slot = subreq->curr_folioq_slot;

	if (WARN(subreq->transferred > subreq->len,
		 "Subreq overread: R%x[%x] %zu > %zu",
		 rreq->debug_id, subreq->debug_index,
		 subreq->transferred, subreq->len))
		subreq->transferred = subreq->len;

next_folio:
	fsize = PAGE_SIZE << subreq->curr_folio_order;
	fpos = round_down(subreq->start + subreq->consumed, fsize);
	fend = fpos + fsize;

	if (WARN_ON_ONCE(!folioq) ||
	    WARN_ON_ONCE(!folioq_folio(folioq, slot)) ||
	    WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) {
		pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n",
		       rreq->debug_id, subreq->debug_index,
		       subreq->start, subreq->start + subreq->transferred - 1,
		       subreq->consumed, subreq->transferred, subreq->len,
		       slot);
		if (folioq) {
			struct folio *folio = folioq_folio(folioq, slot);

			pr_err("folioq: orders=%02x%02x%02x%02x\n",
			       folioq->orders[0], folioq->orders[1],
			       folioq->orders[2], folioq->orders[3]);
			if (folio)
				pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n",
				       fpos, fend - 1, folio_pos(folio), folio_order(folio),
				       folioq_folio_order(folioq, slot));
		}
	}

donation_changed:
	/* Try to consume the current folio if we've hit or passed the end of
	 * it.  There's a possibility that this subreq doesn't start at the
	 * beginning of the folio, in which case we need to donate to/from the
	 * preceding subreq.
	 *
	 * We also need to include any potential donation back from the
	 * following subreq.
	 */
	prev_donated = READ_ONCE(subreq->prev_donated);
	next_donated = READ_ONCE(subreq->next_donated);
	if (prev_donated || next_donated) {
		spin_lock_bh(&rreq->lock);
		prev_donated = subreq->prev_donated;
		next_donated = subreq->next_donated;
		subreq->start -= prev_donated;
		subreq->len += prev_donated;
		subreq->transferred += prev_donated;
		prev_donated = subreq->prev_donated = 0;
		if (subreq->transferred == subreq->len) {
			subreq->len += next_donated;
			subreq->transferred += next_donated;
			next_donated = subreq->next_donated = 0;
		}
		trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations);
		spin_unlock_bh(&rreq->lock);
	}
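
	/* Note that the next subreq's donation is only folded in above once
	 * this subreq has transferred all of its own data; until then there
	 * would be an unfilled gap between the two ranges.
	 */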
	avail = subreq->transferred;
	if (avail == subreq->len)
		avail += next_donated;
	start = subreq->start;
	if (subreq->consumed == 0) {
		start -= prev_donated;
		avail += prev_donated;
	} else {
		start += subreq->consumed;
		avail -= subreq->consumed;
	}
	part = umin(avail, fsize);

	trace_netfs_progress(subreq, start, avail, part);
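
	/* At this point, [start, start + avail) is the contiguous span of
	 * transferred-but-not-yet-consumed data (including any donations) and
	 * part is however much of that lies within the current folio.
	 */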
	if (start + avail >= fend) {
		if (fpos == start) {
			/* Flush, unlock and mark for caching any folio we've just read. */
			subreq->consumed = fend - subreq->start;
			netfs_unlock_read_folio(subreq, rreq, folioq, slot);
			folioq_mark2(folioq, slot);
			if (subreq->consumed >= subreq->len)
				goto remove_subreq;
		} else if (fpos < start) {
			excess = fend - subreq->start;

			spin_lock_bh(&rreq->lock);
			/* If we complete first on a folio split with the
			 * preceding subreq, donate to that subreq - otherwise
			 * we get the responsibility.
			 */
			if (subreq->prev_donated != prev_donated) {
				spin_unlock_bh(&rreq->lock);
				goto donation_changed;
			}

			if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
				spin_unlock_bh(&rreq->lock);
				pr_err("Can't donate prior to front\n");
				goto bad;
			}

			prev = list_prev_entry(subreq, rreq_link);
			WRITE_ONCE(prev->next_donated, prev->next_donated + excess);
			subreq->start += excess;
			subreq->len -= excess;
			subreq->transferred -= excess;
			trace_netfs_donate(rreq, subreq, prev, excess,
					   netfs_trace_donate_tail_to_prev);
			trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);

			if (subreq->consumed >= subreq->len)
				goto remove_subreq_locked;
			spin_unlock_bh(&rreq->lock);
		} else {
			pr_err("fpos > start\n");
			goto bad;
		}

		/* Advance the rolling buffer to the next folio. */
		slot++;
		if (slot >= folioq_nr_slots(folioq)) {
			slot = 0;
			folioq = folioq->next;
			subreq->curr_folioq = folioq;
		}
		subreq->curr_folioq_slot = slot;
		if (folioq && folioq_folio(folioq, slot))
			subreq->curr_folio_order = folioq->orders[slot];
		if (!was_async)
			cond_resched();
		goto next_folio;
	}
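
	/* We only get here if the data runs out short of the end of the
	 * current folio.
	 */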
	/* Deal with partial progress. */
	if (subreq->transferred < subreq->len)
		return false;

	/* Donate the remaining downloaded data to one of the neighbouring
	 * subrequests.  Note that we may race with them doing the same thing.
	 */
	spin_lock_bh(&rreq->lock);

	if (subreq->prev_donated != prev_donated ||
	    subreq->next_donated != next_donated) {
		spin_unlock_bh(&rreq->lock);
		cond_resched();
		goto donation_changed;
	}

	/* Deal with the trickiest case: that this subreq is in the middle of a
	 * folio, not touching either edge, but finishes first.  In such a
	 * case, we donate to the previous subreq, if there is one, so that the
	 * donation is only handled when that completes - and remove this
	 * subreq from the list.
	 *
	 * If the previous subreq finished first, we will have acquired their
	 * donation and should be able to unlock folios and/or donate nextwards.
	 */
	if (!subreq->consumed &&
	    !prev_donated &&
	    !list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
		prev = list_prev_entry(subreq, rreq_link);
		WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len);
		subreq->start += subreq->len;
		subreq->len = 0;
		subreq->transferred = 0;
		trace_netfs_donate(rreq, subreq, prev, subreq->len,
				   netfs_trace_donate_to_prev);
		trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
		goto remove_subreq_locked;
	}

	/* If we can't donate down the chain, donate up the chain instead. */
	excess = subreq->len - subreq->consumed + next_donated;

	if (!subreq->consumed)
		excess += prev_donated;

	if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
		rreq->prev_donated = excess;
		trace_netfs_donate(rreq, subreq, NULL, excess,
				   netfs_trace_donate_to_deferred_next);
	} else {
		next = list_next_entry(subreq, rreq_link);
		WRITE_ONCE(next->prev_donated, excess);
		trace_netfs_donate(rreq, subreq, next, excess,
				   netfs_trace_donate_to_next);
	}
	trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next);
	subreq->len = subreq->consumed;
	subreq->transferred = subreq->consumed;
	goto remove_subreq_locked;

remove_subreq:
	spin_lock_bh(&rreq->lock);
remove_subreq_locked:
	subreq->consumed = subreq->len;
	list_del(&subreq->rreq_link);
	spin_unlock_bh(&rreq->lock);
	netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed);
	return true;

bad:
	/* Errr... prev and next both donated to us, but insufficient to finish
	 * the folio.
	 */
	printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n",
	       rreq->debug_id, subreq->debug_index,
	       subreq->start, subreq->start + subreq->transferred - 1,
	       subreq->consumed, subreq->transferred, subreq->len);
	printk("folio: %llx-%llx\n", fpos, fend - 1);
	printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated);
	printk("s=%llx av=%zx part=%zx\n", start, avail, part);
	BUG();
}
/*
 * Do page flushing and suchlike after DIO.
 */
static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	unsigned int i;

	/* Collect unbuffered reads and direct reads, adding up the transfer
	 * sizes until we find the first short or failed subrequest.
	 */
	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
		rreq->transferred += subreq->transferred;

		if (subreq->transferred < subreq->len ||
		    test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
			rreq->error = subreq->error;
			break;
		}
	}

	if (rreq->origin == NETFS_DIO_READ) {
		for (i = 0; i < rreq->direct_bv_count; i++) {
			flush_dcache_page(rreq->direct_bv[i].bv_page);
			// TODO: cifs marks pages in the destination buffer
			// dirty under some circumstances after a read. Do we
			// need to do that too?
			set_page_dirty(rreq->direct_bv[i].bv_page);
		}
	}

	if (rreq->iocb) {
		rreq->iocb->ki_pos += rreq->transferred;
		if (rreq->iocb->ki_complete)
			rreq->iocb->ki_complete(
				rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
	}
	if (rreq->netfs_ops->done)
		rreq->netfs_ops->done(rreq);
	if (rreq->origin == NETFS_DIO_READ)
		inode_dio_end(rreq->inode);
}
/*
 * Assess the state of a read request and decide what to do next.
 *
 * Note that we're in normal kernel thread context at this point, possibly
 * running on a workqueue.
 */
static void netfs_rreq_assess(struct netfs_io_request *rreq)
{
	trace_netfs_rreq(rreq, netfs_rreq_trace_assess);

	//netfs_rreq_is_still_valid(rreq);

	if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) {
		netfs_retry_reads(rreq);
		return;
	}

	if (rreq->origin == NETFS_DIO_READ ||
	    rreq->origin == NETFS_READ_GAPS)
		netfs_rreq_assess_dio(rreq);
	task_io_account_read(rreq->transferred);

	trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
	wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);

	trace_netfs_rreq(rreq, netfs_rreq_trace_done);
	netfs_clear_subrequests(rreq, false);
	netfs_unlock_abandoned_read_pages(rreq);
	if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)))
		netfs_pgpriv2_write_to_the_cache(rreq);
}
void netfs_read_termination_worker(struct work_struct *work)
{
	struct netfs_io_request *rreq =
		container_of(work, struct netfs_io_request, work);

	netfs_see_request(rreq, netfs_rreq_trace_see_work);
	netfs_rreq_assess(rreq);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete);
}

/*
 * Handle the completion of all outstanding I/O operations on a read request.
 * We inherit a ref from the caller.
 */
void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async)
{
	if (!was_async)
		return netfs_rreq_assess(rreq);
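
	/* The caller may be in softirq or interrupt context, so we cannot
	 * sleep here; punt the assessment to the unbound workqueue instead.
	 */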
	if (!work_pending(&rreq->work)) {
		netfs_get_request(rreq, netfs_rreq_trace_get_work);
		if (!queue_work(system_unbound_wq, &rreq->work))
			netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq);
	}
}

/**
 * netfs_read_subreq_progress - Note progress of a read operation.
 * @subreq: The read request that has terminated.
 * @was_async: True if we're in an asynchronous context.
 *
 * This tells the read side of netfs lib that a contributory I/O operation has
 * made some progress and that it may be possible to unlock some folios.
 *
 * Before calling, the filesystem should update subreq->transferred to track
 * the amount of data copied into the output buffer.
 *
 * If @was_async is true, the caller might be running in softirq or interrupt
 * context and we can't sleep.
 */
void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
				bool was_async)
{
	struct netfs_io_request *rreq = subreq->rreq;

	trace_netfs_sreq(subreq, netfs_sreq_trace_progress);

	if (subreq->transferred > subreq->consumed &&
	    (rreq->origin == NETFS_READAHEAD ||
	     rreq->origin == NETFS_READPAGE ||
	     rreq->origin == NETFS_READ_FOR_WRITE)) {
		netfs_consume_read_data(subreq, was_async);
		__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
	}
}
EXPORT_SYMBOL(netfs_read_subreq_progress);

/**
 * netfs_read_subreq_terminated - Note the termination of an I/O operation.
 * @subreq: The I/O request that has terminated.
 * @error: Error code indicating type of completion.
 * @was_async: The termination was asynchronous
 *
 * This tells the read helper that a contributory I/O operation has terminated,
 * one way or another, and that it should integrate the results.
 *
 * The caller indicates the outcome of the operation through @error, supplying
 * 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY
 * is set) or a negative error code.  The helper will look after reissuing I/O
 * operations as appropriate and writing downloaded data to the cache.
 *
 * Before calling, the filesystem should update subreq->transferred to track
 * the amount of data copied into the output buffer.
 *
 * If @was_async is true, the caller might be running in softirq or interrupt
 * context and we can't sleep.
 */
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
				  int error, bool was_async)
{
	struct netfs_io_request *rreq = subreq->rreq;

	switch (subreq->source) {
	case NETFS_READ_FROM_CACHE:
		netfs_stat(&netfs_n_rh_read_done);
		break;
	case NETFS_DOWNLOAD_FROM_SERVER:
		netfs_stat(&netfs_n_rh_download_done);
		break;
	default:
		break;
	}

	if (rreq->origin != NETFS_DIO_READ) {
		/* Collect buffered reads.
		 *
		 * If the read completed validly short, then we can clear the
		 * tail before going on to unlock the folios.
		 */
		if (error == 0 && subreq->transferred < subreq->len &&
		    (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) ||
		     test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) {
			netfs_clear_unread(subreq);
			subreq->transferred = subreq->len;
			trace_netfs_sreq(subreq, netfs_sreq_trace_clear);
		}
		if (subreq->transferred > subreq->consumed &&
		    (rreq->origin == NETFS_READAHEAD ||
		     rreq->origin == NETFS_READPAGE ||
		     rreq->origin == NETFS_READ_FOR_WRITE)) {
			netfs_consume_read_data(subreq, was_async);
			__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
		}
		rreq->transferred += subreq->transferred;
	}

	/* Deal with retry requests, short reads and errors.  If we retry
	 * but don't make progress, we abandon the attempt.
	 */
	if (!error && subreq->transferred < subreq->len) {
		if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) {
			trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof);
		} else {
			trace_netfs_sreq(subreq, netfs_sreq_trace_short);
			if (subreq->transferred > subreq->consumed) {
				__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
				__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
				set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
			} else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
				__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
				set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
			} else {
				__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
				error = -ENODATA;
			}
		}
	}

	subreq->error = error;
	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);

	if (unlikely(error < 0)) {
		trace_netfs_failure(rreq, subreq, error, netfs_fail_read);
		if (subreq->source == NETFS_READ_FROM_CACHE) {
			netfs_stat(&netfs_n_rh_read_failed);
		} else {
			netfs_stat(&netfs_n_rh_download_failed);
			set_bit(NETFS_RREQ_FAILED, &rreq->flags);
			rreq->error = subreq->error;
		}
	}

	if (atomic_dec_and_test(&rreq->nr_outstanding))
		netfs_rreq_terminated(rreq, was_async);

	netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_read_subreq_terminated);
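
/*
 * Illustrative sketch (not part of the upstream file): a network filesystem
 * would typically drive the two exported helpers above from its I/O
 * completion paths along these lines, updating ->transferred before notifying
 * the core.  The myfs_*() handlers and their parameters are hypothetical;
 * only the netfs_read_subreq_*() calls are defined in this file.
 *
 *	// Called as each chunk of data lands in the subrequest's buffer.
 *	static void myfs_read_data_received(struct netfs_io_subrequest *subreq,
 *					    size_t len, bool was_async)
 *	{
 *		subreq->transferred += len;
 *		netfs_read_subreq_progress(subreq, was_async);
 *	}
 *
 *	// Called once the whole operation has finished, one way or another.
 *	static void myfs_read_complete(struct netfs_io_subrequest *subreq,
 *				       int error, bool was_async)
 *	{
 *		netfs_read_subreq_terminated(subreq, error, was_async);
 *	}
 */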