// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 HUAWEI, Inc.
 *             https://www.huawei.com/
 * Copyright (C) 2022 Alibaba Cloud
 */
#include <linux/cpuhotplug.h>
#include <trace/events/erofs.h>
#define Z_EROFS_PCLUSTER_MAX_PAGES	(Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
#define Z_EROFS_INLINE_BVECS		2
/*
 * let's leave a type here in case of introducing
 * another tagged pointer later.
 */
typedef void *z_erofs_next_pcluster_t;
#define __Z_EROFS_BVSET(name, total) \
struct name { \
	/* point to the next page which contains the following bvecs */ \
	struct page *nextpage; \
	struct z_erofs_bvec bvec[total]; \
}
__Z_EROFS_BVSET(z_erofs_bvset,);
__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
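
/*
 * For reference, the second instantiation above expands to roughly the
 * following structure (an illustrative sketch, not an extra definition):
 *
 *	struct z_erofs_bvset_inline {
 *		struct page *nextpage;
 *		struct z_erofs_bvec bvec[Z_EROFS_INLINE_BVECS];
 *	};
 *
 * i.e. each pcluster can keep Z_EROFS_INLINE_BVECS (2) bvecs inline before
 * further bvecs spill into chained pages through ->nextpage.
 */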
/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only
 *    for everyone else;
 *
 * L: Field should be protected by the pcluster lock;
 *
 * A: Field should be accessed / updated in atomics for parallelized code.
 */
struct z_erofs_pcluster {
	struct lockref lockref;

	/* A: point to next chained pcluster or TAILs */
	z_erofs_next_pcluster_t next;

	/* I: start block address of this pcluster */

	/* L: the maximum decompression size of this round */
	unsigned int length;

	/* L: total number of bvecs */
	unsigned int vcnt;

	/* I: pcluster size (compressed size) in bytes */
	unsigned int pclustersize;

	/* I: page offset of start position of decompression */
	unsigned short pageofs_out;

	/* I: page offset of inline compressed data */
	unsigned short pageofs_in;

	union {
		/* L: inline a certain number of bvec for bootstrap */
		struct z_erofs_bvset_inline bvset;

		/* I: can be used to free the pcluster by RCU. */
		struct rcu_head rcu;
	};

	/* I: compression algorithm format */
	unsigned char algorithmformat;

	/* L: whether partial decompression or not */
	bool partial;

	/* L: indicate several pageofs_outs or not */
	bool multibases;

	/* L: whether extra buffer allocations are best-effort */
	bool besteffort;

	/* A: compressed bvecs (can be cached or inplaced pages) */
	struct z_erofs_bvec compressed_bvecs[];
};
/* the end of a chain of pclusters */
#define Z_EROFS_PCLUSTER_TAIL		((void *) 0x700 + POISON_POINTER_DELTA)
#define Z_EROFS_PCLUSTER_NIL		(NULL)
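
/*
 * Illustrative note (a sketch, not part of the original logic): pclusters
 * queued for decompression are singly chained through ->next, with
 * Z_EROFS_PCLUSTER_TAIL terminating a chain and Z_EROFS_PCLUSTER_NIL marking
 * a pcluster that is not linked into any chain yet.  Assuming the caller
 * already holds references on every chained pcluster, a walk looks like:
 *
 *	z_erofs_next_pcluster_t next = head;
 *
 *	while (next != Z_EROFS_PCLUSTER_TAIL) {
 *		struct z_erofs_pcluster *pcl =
 *			container_of(next, struct z_erofs_pcluster, next);
 *		next = READ_ONCE(pcl->next);
 *		... process pcl ...
 *	}
 *
 * which mirrors the traversals in z_erofs_decompress_queue() and
 * z_erofs_submit_queue() later in this file.
 */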
struct z_erofs_decompressqueue {
	struct super_block *sb;
	atomic_t pending_bios;
	z_erofs_next_pcluster_t head;

	union {
		struct completion done;
		struct work_struct work;
		struct kthread_work kthread_work;
	} u;
	bool eio, sync;
};
static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
{
	return !pcl->index;
}

static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
{
	return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
}

static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
{
	return fo->mapping == MNGD_MAPPING(sbi);
}
#define Z_EROFS_ONSTACK_PAGES		32
/*
 * since pclustersize is variable for big pcluster feature, introduce slab
 * pools implementation for different pcluster sizes.
 */
struct z_erofs_pcluster_slab {
	struct kmem_cache *slab;
	unsigned int maxpages;
	char name[48];
};

#define _PCLP(n) { .maxpages = n }

static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
	_PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
	_PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
};
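
/*
 * Worked example (illustrative only): with 4KiB pages, a 48KiB pcluster
 * needs 12 pages, which does not fit the 1- or 4-page classes, so
 * z_erofs_alloc_pcluster() below takes it from the 16-page slab; requests
 * larger than Z_EROFS_PCLUSTER_MAX_PAGES match no class and fail with
 * -EINVAL.
 */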
struct z_erofs_bvec_iter {
	struct page *bvpage;
	struct z_erofs_bvset *bvset;
	unsigned int nr, cur;
};
static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
{
	if (iter->bvpage)
		kunmap_local(iter->bvset);
	return iter->bvpage;
}
static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
{
	unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
	/* have to access nextpage in advance, otherwise it will be unmapped */
	struct page *nextpage = iter->bvset->nextpage;
	struct page *oldpage;

	DBG_BUGON(!nextpage);
	oldpage = z_erofs_bvec_iter_end(iter);
	iter->bvpage = nextpage;
	iter->bvset = kmap_local_page(nextpage);
	iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
	iter->cur = 0;
	return oldpage;
}
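
/*
 * Sizing note (a sketch under the assumption of 4KiB pages and a 16-byte
 * struct z_erofs_bvec): the offsetof() trick above skips the leading
 * ->nextpage pointer, so each chained bvset page carries
 * (PAGE_SIZE - offsetof(struct z_erofs_bvset, bvec)) / sizeof(struct z_erofs_bvec)
 * entries, e.g. (4096 - 8) / 16 = 255 bvecs per page.
 */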
static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
				    struct z_erofs_bvset_inline *bvset,
				    unsigned int bootstrap_nr, unsigned int cur)
{
	*iter = (struct z_erofs_bvec_iter) {
		.nr = bootstrap_nr,
		.bvset = (struct z_erofs_bvset *)bvset,
	};

	while (cur > iter->nr) {
		cur -= iter->nr;
		z_erofs_bvset_flip(iter);
	}
	iter->cur = cur;
}
static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
				struct z_erofs_bvec *bvec,
				struct page **candidate_bvpage,
				struct page **pagepool)
{
	if (iter->cur >= iter->nr) {
		struct page *nextpage = *candidate_bvpage;

		if (!nextpage) {
			nextpage = __erofs_allocpage(pagepool, GFP_KERNEL,
						     true);
			if (!nextpage)
				return -ENOMEM;
			set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
		}
		DBG_BUGON(iter->bvset->nextpage);
		iter->bvset->nextpage = nextpage;
		z_erofs_bvset_flip(iter);

		iter->bvset->nextpage = NULL;
		*candidate_bvpage = NULL;
	}
	iter->bvset->bvec[iter->cur++] = *bvec;
	return 0;
}
static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
				 struct z_erofs_bvec *bvec,
				 struct page **old_bvpage)
{
	if (iter->cur == iter->nr)
		*old_bvpage = z_erofs_bvset_flip(iter);
	else
		*old_bvpage = NULL;

	*bvec = iter->bvset->bvec[iter->cur++];
}
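
/*
 * Usage sketch (illustrative): a caller typically collects decompressed
 * bvecs with z_erofs_bvec_enqueue() while scanning folios, then replays
 * them in the same order on the decompression side:
 *
 *	struct z_erofs_bvec_iter biter;
 *	struct z_erofs_bvec bvec;
 *	struct page *old_bvpage;
 *
 *	z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
 *	z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
 *
 * which mirrors what z_erofs_parse_out_bvecs() does later in this file.
 */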
static void z_erofs_destroy_pcluster_pool(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		if (!pcluster_pool[i].slab)
			continue;
		kmem_cache_destroy(pcluster_pool[i].slab);
		pcluster_pool[i].slab = NULL;
	}
}
static int z_erofs_create_pcluster_pool(void)
{
	struct z_erofs_pcluster_slab *pcs;
	struct z_erofs_pcluster *a;
	unsigned int size;

	for (pcs = pcluster_pool;
	     pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
		size = struct_size(a, compressed_bvecs, pcs->maxpages);

		sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
		pcs->slab = kmem_cache_create(pcs->name, size, 0,
					      SLAB_RECLAIM_ACCOUNT, NULL);
		if (pcs->slab)
			continue;

		z_erofs_destroy_pcluster_pool();
		return -ENOMEM;
	}
	return 0;
}
static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
{
	unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT;
	struct z_erofs_pcluster_slab *pcs = pcluster_pool;

	for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
		struct z_erofs_pcluster *pcl;

		if (nrpages > pcs->maxpages)
			continue;

		pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
		if (!pcl)
			return ERR_PTR(-ENOMEM);
		pcl->pclustersize = size;
		return pcl;
	}
	return ERR_PTR(-EINVAL);
}
static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
{
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;

		if (pclusterpages > pcs->maxpages)
			continue;

		kmem_cache_free(pcs->slab, pcl);
		return;
	}
	DBG_BUGON(1);
}
static struct workqueue_struct *z_erofs_workqueue __read_mostly;

#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
static struct kthread_worker __rcu **z_erofs_pcpu_workers;
static void erofs_destroy_percpu_workers(void)
{
	struct kthread_worker *worker;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		worker = rcu_dereference_protected(
					z_erofs_pcpu_workers[cpu], 1);
		rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
		if (worker)
			kthread_destroy_worker(worker);
	}
	kfree(z_erofs_pcpu_workers);
}
static struct kthread_worker *erofs_init_percpu_worker(int cpu)
{
	struct kthread_worker *worker =
		kthread_run_worker_on_cpu(cpu, 0, "erofs_worker/%u");

	if (IS_ERR(worker))
		return worker;
	if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI))
		sched_set_fifo_low(worker->task);
	return worker;
}
static int erofs_init_percpu_workers(void)
{
	struct kthread_worker *worker;
	unsigned int cpu;

	z_erofs_pcpu_workers = kcalloc(num_possible_cpus(),
			sizeof(struct kthread_worker *), GFP_ATOMIC);
	if (!z_erofs_pcpu_workers)
		return -ENOMEM;

	for_each_online_cpu(cpu) {	/* could miss cpu{off,on}line? */
		worker = erofs_init_percpu_worker(cpu);
		if (!IS_ERR(worker))
			rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
	}
	return 0;
}
#else
static inline void erofs_destroy_percpu_workers(void) {}
static inline int erofs_init_percpu_workers(void) { return 0; }
#endif
#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
static enum cpuhp_state erofs_cpuhp_state;
static int erofs_cpu_online(unsigned int cpu)
{
	struct kthread_worker *worker, *old;

	worker = erofs_init_percpu_worker(cpu);
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	spin_lock(&z_erofs_pcpu_worker_lock);
	old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
			lockdep_is_held(&z_erofs_pcpu_worker_lock));
	if (!old)
		rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
	spin_unlock(&z_erofs_pcpu_worker_lock);
	if (old)
		kthread_destroy_worker(worker);
	return 0;
}
static int erofs_cpu_offline(unsigned int cpu)
{
	struct kthread_worker *worker;

	spin_lock(&z_erofs_pcpu_worker_lock);
	worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
			lockdep_is_held(&z_erofs_pcpu_worker_lock));
	rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
	spin_unlock(&z_erofs_pcpu_worker_lock);

	if (worker)
		kthread_destroy_worker(worker);
	return 0;
}
static int erofs_cpu_hotplug_init(void)
{
	int state;

	state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
			"fs/erofs:online", erofs_cpu_online, erofs_cpu_offline);
	if (state < 0)
		return state;

	erofs_cpuhp_state = state;
	return 0;
}

static void erofs_cpu_hotplug_destroy(void)
{
	if (erofs_cpuhp_state)
		cpuhp_remove_state_nocalls(erofs_cpuhp_state);
}
#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
static inline int erofs_cpu_hotplug_init(void) { return 0; }
static inline void erofs_cpu_hotplug_destroy(void) {}
#endif
void z_erofs_exit_subsystem(void)
{
	erofs_cpu_hotplug_destroy();
	erofs_destroy_percpu_workers();
	destroy_workqueue(z_erofs_workqueue);
	z_erofs_destroy_pcluster_pool();
	z_erofs_exit_decompressor();
}
int __init z_erofs_init_subsystem(void)
{
	int err = z_erofs_init_decompressor();

	if (err)
		goto err_decompressor;
	err = z_erofs_create_pcluster_pool();
	if (err)
		goto err_pcluster_pool;
	z_erofs_workqueue = alloc_workqueue("erofs_worker",
			WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
	if (!z_erofs_workqueue) {
		err = -ENOMEM;
		goto err_workqueue_init;
	}
	err = erofs_init_percpu_workers();
	if (err)
		goto err_pcpu_worker;
	err = erofs_cpu_hotplug_init();
	if (err < 0)
		goto err_cpuhp_init;
	return err;

err_cpuhp_init:
	erofs_destroy_percpu_workers();
err_pcpu_worker:
	destroy_workqueue(z_erofs_workqueue);
err_workqueue_init:
	z_erofs_destroy_pcluster_pool();
err_pcluster_pool:
	z_erofs_exit_decompressor();
err_decompressor:
	return err;
}
enum z_erofs_pclustermode {
	Z_EROFS_PCLUSTER_INFLIGHT,
	/*
	 * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
	 * could be dispatched into bypass queue later due to uptodated managed
	 * pages. All related online pages cannot be reused for inplace I/O (or
	 * bvpage) since it can be directly decoded without I/O submission.
	 */
	Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
	/*
	 * The pcluster was just linked to a decompression chain by us. It can
	 * also be linked with the remaining pclusters, which means if the
	 * processing page is the tail page of a pcluster, this pcluster can
	 * safely use the whole page (since the previous pcluster is within the
	 * same chain) for in-place I/O, as illustrated below:
	 *  ___________________________________________________
	 * |  tail (partial) page  |    head (partial) page    |
	 * |  (of the current pcl) |   (of the previous pcl)   |
	 * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____|
	 *
	 * [  (*) the page above can be used as inplace I/O.   ]
	 */
	Z_EROFS_PCLUSTER_FOLLOWED,
};
struct z_erofs_decompress_frontend {
	struct inode *const inode;
	struct erofs_map_blocks map;
	struct z_erofs_bvec_iter biter;

	struct page *pagepool;
	struct page *candidate_bvpage;
	struct z_erofs_pcluster *pcl;
	z_erofs_next_pcluster_t owned_head;
	enum z_erofs_pclustermode mode;

	erofs_off_t headoffset;

	/* a pointer used to pick up inplace I/O pages */
	unsigned int icur;
};

#define DECOMPRESS_FRONTEND_INIT(__i) { \
	.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
	.mode = Z_EROFS_PCLUSTER_FOLLOWED }
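
/*
 * Lifecycle sketch (illustrative only, mirroring z_erofs_read_folio()
 * later in this file):
 *
 *	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
 *
 *	f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
 *	err = z_erofs_scan_folio(&f, folio, false);
 *	z_erofs_pcluster_end(&f);
 *	err = z_erofs_runqueue(&f, 0) ?: err;
 *	erofs_put_metabuf(&f.map.buf);
 *	erofs_release_pages(&f.pagepool);
 */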
static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
{
	unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;

	if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
		return false;

	if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED))
		return false;

	if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
	    fe->map.m_la < fe->headoffset)
		return true;

	return false;
}
static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
{
	struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
	struct z_erofs_pcluster *pcl = fe->pcl;
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	bool shouldalloc = z_erofs_should_alloc_cache(fe);
	bool standalone = true;
	/*
	 * optimistic allocation without direct reclaim since inplace I/O
	 * can be used if low memory otherwise.
	 */
	gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
			__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
	unsigned int i;

	if (i_blocksize(fe->inode) != PAGE_SIZE ||
	    fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
		return;

	for (i = 0; i < pclusterpages; ++i) {
		struct page *page, *newpage;

		/* Inaccurate check w/o locking to avoid unneeded lookups */
		if (READ_ONCE(pcl->compressed_bvecs[i].page))
			continue;

		page = find_get_page(mc, pcl->index + i);
		if (!page) {
			/* I/O is needed, not possible to decompress directly */
			standalone = false;
			if (!shouldalloc)
				continue;

			/*
			 * Try cached I/O if allocation succeeds or fallback to
			 * in-place I/O instead to avoid any direct reclaim.
			 */
			newpage = erofs_allocpage(&fe->pagepool, gfp);
			if (!newpage)
				continue;
			set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
		}
		spin_lock(&pcl->lockref.lock);
		if (!pcl->compressed_bvecs[i].page) {
			pcl->compressed_bvecs[i].page = page ? page : newpage;
			spin_unlock(&pcl->lockref.lock);
			continue;
		}
		spin_unlock(&pcl->lockref.lock);

		if (page)
			put_page(page);
		else
			erofs_pagepool_add(&fe->pagepool, newpage);
	}

	/*
	 * don't do inplace I/O if all compressed pages are available in
	 * managed cache since it can be moved to the bypass queue instead.
	 */
	if (standalone)
		fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
/* (erofs_shrinker) disconnect cached encoded data with pclusters */
static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
					       struct z_erofs_pcluster *pcl)
{
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	struct folio *folio;
	int i;

	DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
	/* Each cached folio contains one page unless bs > ps is supported */
	for (i = 0; i < pclusterpages; ++i) {
		if (pcl->compressed_bvecs[i].page) {
			folio = page_folio(pcl->compressed_bvecs[i].page);
			/* Avoid reclaiming or migrating this folio */
			if (!folio_trylock(folio))
				return -EBUSY;

			if (!erofs_folio_is_managed(sbi, folio))
				continue;
			pcl->compressed_bvecs[i].page = NULL;
			folio_detach_private(folio);
			folio_unlock(folio);
		}
	}
	return 0;
}
static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
{
	struct z_erofs_pcluster *pcl = folio_get_private(folio);
	struct z_erofs_bvec *bvec = pcl->compressed_bvecs;
	struct z_erofs_bvec *end = bvec + z_erofs_pclusterpages(pcl);
	bool ret;

	if (!folio_test_private(folio))
		return true;

	ret = false;
	spin_lock(&pcl->lockref.lock);
	if (pcl->lockref.count <= 0) {
		DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
		for (; bvec < end; ++bvec) {
			if (bvec->page && page_folio(bvec->page) == folio) {
				bvec->page = NULL;
				folio_detach_private(folio);
				ret = true;
				break;
			}
		}
	}
	spin_unlock(&pcl->lockref.lock);
	return ret;
}
/*
 * It will be called only on inode eviction. In case that there are still some
 * decompression requests in progress, wait with rescheduling for a bit here.
 * An extra lock could be introduced instead but it seems unnecessary.
 */
static void z_erofs_cache_invalidate_folio(struct folio *folio,
					   size_t offset, size_t length)
{
	const size_t stop = length + offset;

	/* Check for potential overflow in debug mode */
	DBG_BUGON(stop > folio_size(folio) || stop < length);

	if (offset == 0 && stop == folio_size(folio))
		while (!z_erofs_cache_release_folio(folio, 0))
			cond_resched();
}

static const struct address_space_operations z_erofs_cache_aops = {
	.release_folio = z_erofs_cache_release_folio,
	.invalidate_folio = z_erofs_cache_invalidate_folio,
};
int erofs_init_managed_cache(struct super_block *sb)
{
	struct inode *const inode = new_inode(sb);

	if (!inode)
		return -ENOMEM;

	inode->i_size = OFFSET_MAX;
	inode->i_mapping->a_ops = &z_erofs_cache_aops;
	mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
	EROFS_SB(sb)->managed_cache = inode;
	return 0;
}
/* callers must be with pcluster lock held */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
			       struct z_erofs_bvec *bvec, bool exclusive)
{
	struct z_erofs_pcluster *pcl = fe->pcl;
	int ret;

	if (exclusive) {
		/* give priority for inplaceio to use file pages first */
		spin_lock(&pcl->lockref.lock);
		while (fe->icur > 0) {
			if (pcl->compressed_bvecs[--fe->icur].page)
				continue;
			pcl->compressed_bvecs[fe->icur] = *bvec;
			spin_unlock(&pcl->lockref.lock);
			return 0;
		}
		spin_unlock(&pcl->lockref.lock);

		/* otherwise, check if it can be used as a bvpage */
		if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
		    !fe->candidate_bvpage)
			fe->candidate_bvpage = bvec->page;
	}
	ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage,
				   &fe->pagepool);
	fe->pcl->vcnt += (ret >= 0);
	return ret;
}
static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
{
	if (lockref_get_not_zero(&pcl->lockref))
		return true;

	spin_lock(&pcl->lockref.lock);
	if (__lockref_is_dead(&pcl->lockref)) {
		spin_unlock(&pcl->lockref.lock);
		return false;
	}

	if (!pcl->lockref.count++)
		atomic_long_dec(&erofs_global_shrink_cnt);
	spin_unlock(&pcl->lockref.lock);
	return true;
}
730 static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend
*fe
)
732 struct erofs_map_blocks
*map
= &fe
->map
;
733 struct super_block
*sb
= fe
->inode
->i_sb
;
734 struct erofs_sb_info
*sbi
= EROFS_SB(sb
);
735 bool ztailpacking
= map
->m_flags
& EROFS_MAP_META
;
736 struct z_erofs_pcluster
*pcl
, *pre
;
739 if (!(map
->m_flags
& EROFS_MAP_ENCODED
) ||
740 (!ztailpacking
&& !erofs_blknr(sb
, map
->m_pa
))) {
742 return -EFSCORRUPTED
;
745 /* no available pcluster, let's allocate one */
746 pcl
= z_erofs_alloc_pcluster(map
->m_plen
);
750 lockref_init(&pcl
->lockref
, 1); /* one ref for this request */
751 pcl
->algorithmformat
= map
->m_algorithmformat
;
755 /* new pclusters should be claimed as type 1, primary and followed */
756 pcl
->next
= fe
->owned_head
;
757 pcl
->pageofs_out
= map
->m_la
& ~PAGE_MASK
;
758 fe
->mode
= Z_EROFS_PCLUSTER_FOLLOWED
;
761 * lock all primary followed works before visible to others
762 * and mutex_trylock *never* fails for a new pcluster.
764 mutex_init(&pcl
->lock
);
765 DBG_BUGON(!mutex_trylock(&pcl
->lock
));
768 pcl
->index
= 0; /* which indicates ztailpacking */
770 pcl
->index
= erofs_blknr(sb
, map
->m_pa
);
772 xa_lock(&sbi
->managed_pslots
);
773 pre
= __xa_cmpxchg(&sbi
->managed_pslots
, pcl
->index
,
774 NULL
, pcl
, GFP_KERNEL
);
775 if (!pre
|| xa_is_err(pre
) || z_erofs_get_pcluster(pre
)) {
776 xa_unlock(&sbi
->managed_pslots
);
779 /* try to legitimize the current in-tree one */
780 xa_unlock(&sbi
->managed_pslots
);
783 if (xa_is_err(pre
)) {
792 fe
->owned_head
= &pcl
->next
;
797 mutex_unlock(&pcl
->lock
);
798 z_erofs_free_pcluster(pcl
);
802 static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend
*fe
)
804 struct erofs_map_blocks
*map
= &fe
->map
;
805 struct super_block
*sb
= fe
->inode
->i_sb
;
806 erofs_blk_t blknr
= erofs_blknr(sb
, map
->m_pa
);
807 struct z_erofs_pcluster
*pcl
= NULL
;
811 /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
812 DBG_BUGON(fe
->owned_head
== Z_EROFS_PCLUSTER_NIL
);
814 if (!(map
->m_flags
& EROFS_MAP_META
)) {
817 pcl
= xa_load(&EROFS_SB(sb
)->managed_pslots
, blknr
);
818 if (!pcl
|| z_erofs_get_pcluster(pcl
)) {
819 DBG_BUGON(pcl
&& blknr
!= pcl
->index
);
825 } else if ((map
->m_pa
& ~PAGE_MASK
) + map
->m_plen
> PAGE_SIZE
) {
827 return -EFSCORRUPTED
;
834 ret
= z_erofs_register_pcluster(fe
);
837 if (ret
== -EEXIST
) {
838 mutex_lock(&fe
->pcl
->lock
);
839 /* check if this pcluster hasn't been linked into any chain. */
840 if (cmpxchg(&fe
->pcl
->next
, Z_EROFS_PCLUSTER_NIL
,
841 fe
->owned_head
) == Z_EROFS_PCLUSTER_NIL
) {
842 /* .. so it can be attached to our submission chain */
843 fe
->owned_head
= &fe
->pcl
->next
;
844 fe
->mode
= Z_EROFS_PCLUSTER_FOLLOWED
;
845 } else { /* otherwise, it belongs to an inflight chain */
846 fe
->mode
= Z_EROFS_PCLUSTER_INFLIGHT
;
852 z_erofs_bvec_iter_begin(&fe
->biter
, &fe
->pcl
->bvset
,
853 Z_EROFS_INLINE_BVECS
, fe
->pcl
->vcnt
);
854 if (!z_erofs_is_inline_pcluster(fe
->pcl
)) {
855 /* bind cache first when cached decompression is preferred */
856 z_erofs_bind_cache(fe
);
860 mptr
= erofs_read_metabuf(&map
->buf
, sb
, map
->m_pa
, EROFS_NO_KMAP
);
863 erofs_err(sb
, "failed to get inline data %d", ret
);
866 get_page(map
->buf
.page
);
867 WRITE_ONCE(fe
->pcl
->compressed_bvecs
[0].page
, map
->buf
.page
);
868 fe
->pcl
->pageofs_in
= map
->m_pa
& ~PAGE_MASK
;
869 fe
->mode
= Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE
;
871 /* file-backed inplace I/O pages are traversed in reverse order */
872 fe
->icur
= z_erofs_pclusterpages(fe
->pcl
);
877 * keep in mind that no referenced pclusters will be freed
878 * only after a RCU grace period.
880 static void z_erofs_rcu_callback(struct rcu_head
*head
)
882 z_erofs_free_pcluster(container_of(head
,
883 struct z_erofs_pcluster
, rcu
));
886 static bool __erofs_try_to_release_pcluster(struct erofs_sb_info
*sbi
,
887 struct z_erofs_pcluster
*pcl
)
889 if (pcl
->lockref
.count
)
893 * Note that all cached folios should be detached before deleted from
894 * the XArray. Otherwise some folios could be still attached to the
895 * orphan old pcluster when the new one is available in the tree.
897 if (erofs_try_to_free_all_cached_folios(sbi
, pcl
))
901 * It's impossible to fail after the pcluster is freezed, but in order
902 * to avoid some race conditions, add a DBG_BUGON to observe this.
904 DBG_BUGON(__xa_erase(&sbi
->managed_pslots
, pcl
->index
) != pcl
);
906 lockref_mark_dead(&pcl
->lockref
);
910 static bool erofs_try_to_release_pcluster(struct erofs_sb_info
*sbi
,
911 struct z_erofs_pcluster
*pcl
)
915 spin_lock(&pcl
->lockref
.lock
);
916 free
= __erofs_try_to_release_pcluster(sbi
, pcl
);
917 spin_unlock(&pcl
->lockref
.lock
);
919 atomic_long_dec(&erofs_global_shrink_cnt
);
920 call_rcu(&pcl
->rcu
, z_erofs_rcu_callback
);
925 unsigned long z_erofs_shrink_scan(struct erofs_sb_info
*sbi
,
926 unsigned long nr_shrink
)
928 struct z_erofs_pcluster
*pcl
;
929 unsigned int freed
= 0;
932 xa_lock(&sbi
->managed_pslots
);
933 xa_for_each(&sbi
->managed_pslots
, index
, pcl
) {
934 /* try to shrink each valid pcluster */
935 if (!erofs_try_to_release_pcluster(sbi
, pcl
))
937 xa_unlock(&sbi
->managed_pslots
);
942 xa_lock(&sbi
->managed_pslots
);
944 xa_unlock(&sbi
->managed_pslots
);
948 static void z_erofs_put_pcluster(struct erofs_sb_info
*sbi
,
949 struct z_erofs_pcluster
*pcl
, bool try_free
)
953 if (lockref_put_or_lock(&pcl
->lockref
))
956 DBG_BUGON(__lockref_is_dead(&pcl
->lockref
));
957 if (!--pcl
->lockref
.count
) {
958 if (try_free
&& xa_trylock(&sbi
->managed_pslots
)) {
959 free
= __erofs_try_to_release_pcluster(sbi
, pcl
);
960 xa_unlock(&sbi
->managed_pslots
);
962 atomic_long_add(!free
, &erofs_global_shrink_cnt
);
964 spin_unlock(&pcl
->lockref
.lock
);
966 call_rcu(&pcl
->rcu
, z_erofs_rcu_callback
);
969 static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend
*fe
)
971 struct z_erofs_pcluster
*pcl
= fe
->pcl
;
976 z_erofs_bvec_iter_end(&fe
->biter
);
977 mutex_unlock(&pcl
->lock
);
979 if (fe
->candidate_bvpage
)
980 fe
->candidate_bvpage
= NULL
;
983 * if all pending pages are added, don't hold its reference
984 * any longer if the pcluster isn't hosted by ourselves.
986 if (fe
->mode
< Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE
)
987 z_erofs_put_pcluster(EROFS_I_SB(fe
->inode
), pcl
, false);
992 static int z_erofs_read_fragment(struct super_block
*sb
, struct folio
*folio
,
993 unsigned int cur
, unsigned int end
, erofs_off_t pos
)
995 struct inode
*packed_inode
= EROFS_SB(sb
)->packed_inode
;
996 struct erofs_buf buf
= __EROFS_BUF_INITIALIZER
;
1001 return -EFSCORRUPTED
;
1003 buf
.mapping
= packed_inode
->i_mapping
;
1004 for (; cur
< end
; cur
+= cnt
, pos
+= cnt
) {
1005 cnt
= min(end
- cur
, sb
->s_blocksize
- erofs_blkoff(sb
, pos
));
1006 src
= erofs_bread(&buf
, pos
, EROFS_KMAP
);
1008 erofs_put_metabuf(&buf
);
1009 return PTR_ERR(src
);
1011 memcpy_to_folio(folio
, cur
, src
, cnt
);
1013 erofs_put_metabuf(&buf
);
1017 static int z_erofs_scan_folio(struct z_erofs_decompress_frontend
*f
,
1018 struct folio
*folio
, bool ra
)
1020 struct inode
*const inode
= f
->inode
;
1021 struct erofs_map_blocks
*const map
= &f
->map
;
1022 const loff_t offset
= folio_pos(folio
);
1023 const unsigned int bs
= i_blocksize(inode
);
1024 unsigned int end
= folio_size(folio
), split
= 0, cur
, pgs
;
1028 tight
= (bs
== PAGE_SIZE
);
1029 erofs_onlinefolio_init(folio
);
1031 if (offset
+ end
- 1 < map
->m_la
||
1032 offset
+ end
- 1 >= map
->m_la
+ map
->m_llen
) {
1033 z_erofs_pcluster_end(f
);
1034 map
->m_la
= offset
+ end
- 1;
1036 err
= z_erofs_map_blocks_iter(inode
, map
, 0);
1041 cur
= offset
> map
->m_la
? 0 : map
->m_la
- offset
;
1042 pgs
= round_down(cur
, PAGE_SIZE
);
1043 /* bump split parts first to avoid several separate cases */
1046 if (!(map
->m_flags
& EROFS_MAP_MAPPED
)) {
1047 folio_zero_segment(folio
, cur
, end
);
1049 } else if (map
->m_flags
& EROFS_MAP_FRAGMENT
) {
1050 erofs_off_t fpos
= offset
+ cur
- map
->m_la
;
1052 err
= z_erofs_read_fragment(inode
->i_sb
, folio
, cur
,
1053 cur
+ min(map
->m_llen
- fpos
, end
- cur
),
1054 EROFS_I(inode
)->z_fragmentoff
+ fpos
);
1060 err
= z_erofs_pcluster_begin(f
);
1063 f
->pcl
->besteffort
|= !ra
;
1066 pgs
= round_down(end
- 1, PAGE_SIZE
);
1068 * Ensure this partial page belongs to this submit chain
1069 * rather than other concurrent submit chains or
1070 * noio(bypass) chains since those chains are handled
1071 * asynchronously thus it cannot be used for inplace I/O
1072 * or bvpage (should be processed in the strict order.)
1074 tight
&= (f
->mode
>= Z_EROFS_PCLUSTER_FOLLOWED
);
1077 excl
= (split
<= 1) || tight
;
1081 err
= z_erofs_attach_page(f
, &((struct z_erofs_bvec
) {
1082 .page
= folio_page(folio
, pgs
>> PAGE_SHIFT
),
1083 .offset
= offset
+ pgs
- map
->m_la
,
1084 .end
= end
- pgs
, }), excl
);
1088 erofs_onlinefolio_split(folio
);
1089 if (f
->pcl
->pageofs_out
!= (map
->m_la
& ~PAGE_MASK
))
1090 f
->pcl
->multibases
= true;
1091 if (f
->pcl
->length
< offset
+ end
- map
->m_la
) {
1092 f
->pcl
->length
= offset
+ end
- map
->m_la
;
1093 f
->pcl
->pageofs_out
= map
->m_la
& ~PAGE_MASK
;
1095 if ((map
->m_flags
& EROFS_MAP_FULL_MAPPED
) &&
1096 !(map
->m_flags
& EROFS_MAP_PARTIAL_REF
) &&
1097 f
->pcl
->length
== map
->m_llen
)
1098 f
->pcl
->partial
= false;
1100 /* shorten the remaining extent to update progress */
1101 map
->m_llen
= offset
+ cur
- map
->m_la
;
1102 map
->m_flags
&= ~EROFS_MAP_FULL_MAPPED
;
1105 tight
= (bs
== PAGE_SIZE
);
1107 } while ((end
= cur
) > 0);
1108 erofs_onlinefolio_end(folio
, err
);
1112 static bool z_erofs_is_sync_decompress(struct erofs_sb_info
*sbi
,
1113 unsigned int readahead_pages
)
1115 /* auto: enable for read_folio, disable for readahead */
1116 if ((sbi
->opt
.sync_decompress
== EROFS_SYNC_DECOMPRESS_AUTO
) &&
1120 if ((sbi
->opt
.sync_decompress
== EROFS_SYNC_DECOMPRESS_FORCE_ON
) &&
1121 (readahead_pages
<= sbi
->opt
.max_sync_decompress_pages
))
1127 static bool z_erofs_page_is_invalidated(struct page
*page
)
1129 return !page_folio(page
)->mapping
&& !z_erofs_is_shortlived_page(page
);
1132 struct z_erofs_decompress_backend
{
1133 struct page
*onstack_pages
[Z_EROFS_ONSTACK_PAGES
];
1134 struct super_block
*sb
;
1135 struct z_erofs_pcluster
*pcl
;
1137 /* pages with the longest decompressed length for deduplication */
1138 struct page
**decompressed_pages
;
1139 /* pages to keep the compressed data */
1140 struct page
**compressed_pages
;
1142 struct list_head decompressed_secondary_bvecs
;
1143 struct page
**pagepool
;
1144 unsigned int onstack_used
, nr_pages
;
1147 struct z_erofs_bvec_item
{
1148 struct z_erofs_bvec bvec
;
1149 struct list_head list
;
1152 static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend
*be
,
1153 struct z_erofs_bvec
*bvec
)
1155 struct z_erofs_bvec_item
*item
;
1158 if (!((bvec
->offset
+ be
->pcl
->pageofs_out
) & ~PAGE_MASK
) &&
1159 (bvec
->end
== PAGE_SIZE
||
1160 bvec
->offset
+ bvec
->end
== be
->pcl
->length
)) {
1161 pgnr
= (bvec
->offset
+ be
->pcl
->pageofs_out
) >> PAGE_SHIFT
;
1162 DBG_BUGON(pgnr
>= be
->nr_pages
);
1163 if (!be
->decompressed_pages
[pgnr
]) {
1164 be
->decompressed_pages
[pgnr
] = bvec
->page
;
1169 /* (cold path) one pcluster is requested multiple times */
1170 item
= kmalloc(sizeof(*item
), GFP_KERNEL
| __GFP_NOFAIL
);
1172 list_add(&item
->list
, &be
->decompressed_secondary_bvecs
);
1175 static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend
*be
,
1178 unsigned int off0
= be
->pcl
->pageofs_out
;
1179 struct list_head
*p
, *n
;
1181 list_for_each_safe(p
, n
, &be
->decompressed_secondary_bvecs
) {
1182 struct z_erofs_bvec_item
*bvi
;
1183 unsigned int end
, cur
;
1186 bvi
= container_of(p
, struct z_erofs_bvec_item
, list
);
1187 cur
= bvi
->bvec
.offset
< 0 ? -bvi
->bvec
.offset
: 0;
1188 end
= min_t(unsigned int, be
->pcl
->length
- bvi
->bvec
.offset
,
1190 dst
= kmap_local_page(bvi
->bvec
.page
);
1192 unsigned int pgnr
, scur
, len
;
1194 pgnr
= (bvi
->bvec
.offset
+ cur
+ off0
) >> PAGE_SHIFT
;
1195 DBG_BUGON(pgnr
>= be
->nr_pages
);
1197 scur
= bvi
->bvec
.offset
+ cur
-
1198 ((pgnr
<< PAGE_SHIFT
) - off0
);
1199 len
= min_t(unsigned int, end
- cur
, PAGE_SIZE
- scur
);
1200 if (!be
->decompressed_pages
[pgnr
]) {
1201 err
= -EFSCORRUPTED
;
1205 src
= kmap_local_page(be
->decompressed_pages
[pgnr
]);
1206 memcpy(dst
+ cur
, src
+ scur
, len
);
1211 erofs_onlinefolio_end(page_folio(bvi
->bvec
.page
), err
);
1217 static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend
*be
)
1219 struct z_erofs_pcluster
*pcl
= be
->pcl
;
1220 struct z_erofs_bvec_iter biter
;
1221 struct page
*old_bvpage
;
1224 z_erofs_bvec_iter_begin(&biter
, &pcl
->bvset
, Z_EROFS_INLINE_BVECS
, 0);
1225 for (i
= 0; i
< pcl
->vcnt
; ++i
) {
1226 struct z_erofs_bvec bvec
;
1228 z_erofs_bvec_dequeue(&biter
, &bvec
, &old_bvpage
);
1231 z_erofs_put_shortlivedpage(be
->pagepool
, old_bvpage
);
1233 DBG_BUGON(z_erofs_page_is_invalidated(bvec
.page
));
1234 z_erofs_do_decompressed_bvec(be
, &bvec
);
1237 old_bvpage
= z_erofs_bvec_iter_end(&biter
);
1239 z_erofs_put_shortlivedpage(be
->pagepool
, old_bvpage
);
1242 static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend
*be
,
1245 struct z_erofs_pcluster
*pcl
= be
->pcl
;
1246 unsigned int pclusterpages
= z_erofs_pclusterpages(pcl
);
1249 *overlapped
= false;
1250 for (i
= 0; i
< pclusterpages
; ++i
) {
1251 struct z_erofs_bvec
*bvec
= &pcl
->compressed_bvecs
[i
];
1252 struct page
*page
= bvec
->page
;
1254 /* compressed data ought to be valid when decompressing */
1255 if (IS_ERR(page
) || !page
) {
1256 bvec
->page
= NULL
; /* clear the failure reason */
1257 err
= page
? PTR_ERR(page
) : -EIO
;
1260 be
->compressed_pages
[i
] = page
;
1262 if (z_erofs_is_inline_pcluster(pcl
) ||
1263 erofs_folio_is_managed(EROFS_SB(be
->sb
), page_folio(page
))) {
1264 if (!PageUptodate(page
))
1269 DBG_BUGON(z_erofs_page_is_invalidated(page
));
1270 if (z_erofs_is_shortlived_page(page
))
1272 z_erofs_do_decompressed_bvec(be
, bvec
);
1278 static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend
*be
,
1281 struct erofs_sb_info
*const sbi
= EROFS_SB(be
->sb
);
1282 struct z_erofs_pcluster
*pcl
= be
->pcl
;
1283 unsigned int pclusterpages
= z_erofs_pclusterpages(pcl
);
1284 const struct z_erofs_decompressor
*decomp
=
1285 z_erofs_decomp
[pcl
->algorithmformat
];
1286 int i
, j
, jtop
, err2
;
1289 bool try_free
= true;
1291 mutex_lock(&pcl
->lock
);
1292 be
->nr_pages
= PAGE_ALIGN(pcl
->length
+ pcl
->pageofs_out
) >> PAGE_SHIFT
;
1294 /* allocate (de)compressed page arrays if cannot be kept on stack */
1295 be
->decompressed_pages
= NULL
;
1296 be
->compressed_pages
= NULL
;
1297 be
->onstack_used
= 0;
1298 if (be
->nr_pages
<= Z_EROFS_ONSTACK_PAGES
) {
1299 be
->decompressed_pages
= be
->onstack_pages
;
1300 be
->onstack_used
= be
->nr_pages
;
1301 memset(be
->decompressed_pages
, 0,
1302 sizeof(struct page
*) * be
->nr_pages
);
1305 if (pclusterpages
+ be
->onstack_used
<= Z_EROFS_ONSTACK_PAGES
)
1306 be
->compressed_pages
= be
->onstack_pages
+ be
->onstack_used
;
1308 if (!be
->decompressed_pages
)
1309 be
->decompressed_pages
=
1310 kvcalloc(be
->nr_pages
, sizeof(struct page
*),
1311 GFP_KERNEL
| __GFP_NOFAIL
);
1312 if (!be
->compressed_pages
)
1313 be
->compressed_pages
=
1314 kvcalloc(pclusterpages
, sizeof(struct page
*),
1315 GFP_KERNEL
| __GFP_NOFAIL
);
1317 z_erofs_parse_out_bvecs(be
);
1318 err2
= z_erofs_parse_in_bvecs(be
, &overlapped
);
1322 err
= decomp
->decompress(&(struct z_erofs_decompress_req
) {
1324 .in
= be
->compressed_pages
,
1325 .out
= be
->decompressed_pages
,
1326 .pageofs_in
= pcl
->pageofs_in
,
1327 .pageofs_out
= pcl
->pageofs_out
,
1328 .inputsize
= pcl
->pclustersize
,
1329 .outputsize
= pcl
->length
,
1330 .alg
= pcl
->algorithmformat
,
1331 .inplace_io
= overlapped
,
1332 .partial_decoding
= pcl
->partial
,
1333 .fillgaps
= pcl
->multibases
,
1334 .gfp
= pcl
->besteffort
? GFP_KERNEL
:
1335 GFP_NOWAIT
| __GFP_NORETRY
1338 /* must handle all compressed pages before actual file pages */
1339 if (z_erofs_is_inline_pcluster(pcl
)) {
1340 page
= pcl
->compressed_bvecs
[0].page
;
1341 WRITE_ONCE(pcl
->compressed_bvecs
[0].page
, NULL
);
1344 /* managed folios are still left in compressed_bvecs[] */
1345 for (i
= 0; i
< pclusterpages
; ++i
) {
1346 page
= be
->compressed_pages
[i
];
1349 if (erofs_folio_is_managed(sbi
, page_folio(page
))) {
1353 (void)z_erofs_put_shortlivedpage(be
->pagepool
, page
);
1354 WRITE_ONCE(pcl
->compressed_bvecs
[i
].page
, NULL
);
1357 if (be
->compressed_pages
< be
->onstack_pages
||
1358 be
->compressed_pages
>= be
->onstack_pages
+ Z_EROFS_ONSTACK_PAGES
)
1359 kvfree(be
->compressed_pages
);
1362 z_erofs_fill_other_copies(be
, err
);
1363 for (i
= 0; i
< be
->nr_pages
; ++i
) {
1364 page
= be
->decompressed_pages
[i
];
1368 DBG_BUGON(z_erofs_page_is_invalidated(page
));
1369 if (!z_erofs_is_shortlived_page(page
)) {
1370 erofs_onlinefolio_end(page_folio(page
), err
);
1373 if (pcl
->algorithmformat
!= Z_EROFS_COMPRESSION_LZ4
) {
1374 erofs_pagepool_add(be
->pagepool
, page
);
1377 for (j
= 0; j
< jtop
&& be
->decompressed_pages
[j
] != page
; ++j
)
1379 if (j
>= jtop
) /* this bounce page is newly detected */
1380 be
->decompressed_pages
[jtop
++] = page
;
1383 erofs_pagepool_add(be
->pagepool
,
1384 be
->decompressed_pages
[--jtop
]);
1385 if (be
->decompressed_pages
!= be
->onstack_pages
)
1386 kvfree(be
->decompressed_pages
);
1389 pcl
->partial
= true;
1390 pcl
->multibases
= false;
1391 pcl
->besteffort
= false;
1392 pcl
->bvset
.nextpage
= NULL
;
1395 /* pcluster lock MUST be taken before the following line */
1396 WRITE_ONCE(pcl
->next
, Z_EROFS_PCLUSTER_NIL
);
1397 mutex_unlock(&pcl
->lock
);
1399 if (z_erofs_is_inline_pcluster(pcl
))
1400 z_erofs_free_pcluster(pcl
);
1402 z_erofs_put_pcluster(sbi
, pcl
, try_free
);
1406 static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue
*io
,
1407 struct page
**pagepool
)
1409 struct z_erofs_decompress_backend be
= {
1411 .pagepool
= pagepool
,
1412 .decompressed_secondary_bvecs
=
1413 LIST_HEAD_INIT(be
.decompressed_secondary_bvecs
),
1415 z_erofs_next_pcluster_t owned
= io
->head
;
1416 int err
= io
->eio
? -EIO
: 0;
1418 while (owned
!= Z_EROFS_PCLUSTER_TAIL
) {
1419 DBG_BUGON(owned
== Z_EROFS_PCLUSTER_NIL
);
1421 be
.pcl
= container_of(owned
, struct z_erofs_pcluster
, next
);
1422 owned
= READ_ONCE(be
.pcl
->next
);
1424 err
= z_erofs_decompress_pcluster(&be
, err
) ?: err
;
1429 static void z_erofs_decompressqueue_work(struct work_struct
*work
)
1431 struct z_erofs_decompressqueue
*bgq
=
1432 container_of(work
, struct z_erofs_decompressqueue
, u
.work
);
1433 struct page
*pagepool
= NULL
;
1435 DBG_BUGON(bgq
->head
== Z_EROFS_PCLUSTER_TAIL
);
1436 z_erofs_decompress_queue(bgq
, &pagepool
);
1437 erofs_release_pages(&pagepool
);
1441 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1442 static void z_erofs_decompressqueue_kthread_work(struct kthread_work
*work
)
1444 z_erofs_decompressqueue_work((struct work_struct
*)work
);
1448 static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue
*io
,
1451 struct erofs_sb_info
*const sbi
= EROFS_SB(io
->sb
);
1453 /* wake up the caller thread for sync decompression */
1455 if (!atomic_add_return(bios
, &io
->pending_bios
))
1456 complete(&io
->u
.done
);
1460 if (atomic_add_return(bios
, &io
->pending_bios
))
1462 /* Use (kthread_)work and sync decompression for atomic contexts only */
1463 if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
1464 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1465 struct kthread_worker
*worker
;
1468 worker
= rcu_dereference(
1469 z_erofs_pcpu_workers
[raw_smp_processor_id()]);
1471 INIT_WORK(&io
->u
.work
, z_erofs_decompressqueue_work
);
1472 queue_work(z_erofs_workqueue
, &io
->u
.work
);
1474 kthread_queue_work(worker
, &io
->u
.kthread_work
);
1478 queue_work(z_erofs_workqueue
, &io
->u
.work
);
1480 /* enable sync decompression for readahead */
1481 if (sbi
->opt
.sync_decompress
== EROFS_SYNC_DECOMPRESS_AUTO
)
1482 sbi
->opt
.sync_decompress
= EROFS_SYNC_DECOMPRESS_FORCE_ON
;
1485 z_erofs_decompressqueue_work(&io
->u
.work
);
1488 static void z_erofs_fill_bio_vec(struct bio_vec
*bvec
,
1489 struct z_erofs_decompress_frontend
*f
,
1490 struct z_erofs_pcluster
*pcl
,
1492 struct address_space
*mc
)
1494 gfp_t gfp
= mapping_gfp_mask(mc
);
1495 bool tocache
= false;
1496 struct z_erofs_bvec zbv
;
1497 struct address_space
*mapping
;
1498 struct folio
*folio
;
1500 int bs
= i_blocksize(f
->inode
);
1502 /* Except for inplace folios, the entire folio can be used for I/Os */
1503 bvec
->bv_offset
= 0;
1504 bvec
->bv_len
= PAGE_SIZE
;
1506 spin_lock(&pcl
->lockref
.lock
);
1507 zbv
= pcl
->compressed_bvecs
[nr
];
1508 spin_unlock(&pcl
->lockref
.lock
);
1510 goto out_allocfolio
;
1512 bvec
->bv_page
= zbv
.page
;
1513 DBG_BUGON(z_erofs_is_shortlived_page(bvec
->bv_page
));
1515 folio
= page_folio(zbv
.page
);
1517 * Handle preallocated cached folios. We tried to allocate such folios
1518 * without triggering direct reclaim. If allocation failed, inplace
1519 * file-backed folios will be used instead.
1521 if (folio
->private == (void *)Z_EROFS_PREALLOCATED_PAGE
) {
1526 mapping
= READ_ONCE(folio
->mapping
);
1528 * File-backed folios for inplace I/Os are all locked steady,
1529 * therefore it is impossible for `mapping` to be NULL.
1531 if (mapping
&& mapping
!= mc
) {
1533 bvec
->bv_offset
= round_up(-zbv
.offset
, bs
);
1534 bvec
->bv_len
= round_up(zbv
.end
, bs
) - bvec
->bv_offset
;
1539 if (likely(folio
->mapping
== mc
)) {
1541 * The cached folio is still in managed cache but without
1542 * a valid `->private` pcluster hint. Let's reconnect them.
1544 if (!folio_test_private(folio
)) {
1545 folio_attach_private(folio
, pcl
);
1546 /* compressed_bvecs[] already takes a ref before */
1549 if (likely(folio
->private == pcl
)) {
1550 /* don't submit cache I/Os again if already uptodate */
1551 if (folio_test_uptodate(folio
)) {
1552 folio_unlock(folio
);
1553 bvec
->bv_page
= NULL
;
1558 * Already linked with another pcluster, which only appears in
1559 * crafted images by fuzzers for now. But handle this anyway.
1561 tocache
= false; /* use temporary short-lived pages */
1563 DBG_BUGON(1); /* referenced managed folios can't be truncated */
1566 folio_unlock(folio
);
1569 page
= __erofs_allocpage(&f
->pagepool
, gfp
, true);
1570 spin_lock(&pcl
->lockref
.lock
);
1571 if (unlikely(pcl
->compressed_bvecs
[nr
].page
!= zbv
.page
)) {
1573 erofs_pagepool_add(&f
->pagepool
, page
);
1574 spin_unlock(&pcl
->lockref
.lock
);
1578 pcl
->compressed_bvecs
[nr
].page
= page
? page
: ERR_PTR(-ENOMEM
);
1579 spin_unlock(&pcl
->lockref
.lock
);
1580 bvec
->bv_page
= page
;
1583 folio
= page_folio(page
);
1585 if (!tocache
|| bs
!= PAGE_SIZE
||
1586 filemap_add_folio(mc
, folio
, pcl
->index
+ nr
, gfp
)) {
1587 /* turn into a temporary shortlived folio (1 ref) */
1588 folio
->private = (void *)Z_EROFS_SHORTLIVED_PAGE
;
1591 folio_attach_private(folio
, pcl
);
1592 /* drop a refcount added by allocpage (then 2 refs in total here) */
1596 static struct z_erofs_decompressqueue
*jobqueue_init(struct super_block
*sb
,
1597 struct z_erofs_decompressqueue
*fgq
, bool *fg
)
1599 struct z_erofs_decompressqueue
*q
;
1602 q
= kvzalloc(sizeof(*q
), GFP_KERNEL
| __GFP_NOWARN
);
1607 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1608 kthread_init_work(&q
->u
.kthread_work
,
1609 z_erofs_decompressqueue_kthread_work
);
1611 INIT_WORK(&q
->u
.work
, z_erofs_decompressqueue_work
);
1616 init_completion(&fgq
->u
.done
);
1617 atomic_set(&fgq
->pending_bios
, 0);
1622 q
->head
= Z_EROFS_PCLUSTER_TAIL
;
1626 /* define decompression jobqueue types */
1633 static void move_to_bypass_jobqueue(struct z_erofs_pcluster
*pcl
,
1634 z_erofs_next_pcluster_t qtail
[],
1635 z_erofs_next_pcluster_t owned_head
)
1637 z_erofs_next_pcluster_t
*const submit_qtail
= qtail
[JQ_SUBMIT
];
1638 z_erofs_next_pcluster_t
*const bypass_qtail
= qtail
[JQ_BYPASS
];
1640 WRITE_ONCE(pcl
->next
, Z_EROFS_PCLUSTER_TAIL
);
1642 WRITE_ONCE(*submit_qtail
, owned_head
);
1643 WRITE_ONCE(*bypass_qtail
, &pcl
->next
);
1645 qtail
[JQ_BYPASS
] = &pcl
->next
;
1648 static void z_erofs_endio(struct bio
*bio
)
1650 struct z_erofs_decompressqueue
*q
= bio
->bi_private
;
1651 blk_status_t err
= bio
->bi_status
;
1652 struct folio_iter fi
;
1654 bio_for_each_folio_all(fi
, bio
) {
1655 struct folio
*folio
= fi
.folio
;
1657 DBG_BUGON(folio_test_uptodate(folio
));
1658 DBG_BUGON(z_erofs_page_is_invalidated(&folio
->page
));
1659 if (!erofs_folio_is_managed(EROFS_SB(q
->sb
), folio
))
1663 folio_mark_uptodate(folio
);
1664 folio_unlock(folio
);
1668 z_erofs_decompress_kickoff(q
, -1);
1673 static void z_erofs_submit_queue(struct z_erofs_decompress_frontend
*f
,
1674 struct z_erofs_decompressqueue
*fgq
,
1675 bool *force_fg
, bool readahead
)
1677 struct super_block
*sb
= f
->inode
->i_sb
;
1678 struct address_space
*mc
= MNGD_MAPPING(EROFS_SB(sb
));
1679 z_erofs_next_pcluster_t qtail
[NR_JOBQUEUES
];
1680 struct z_erofs_decompressqueue
*q
[NR_JOBQUEUES
];
1681 z_erofs_next_pcluster_t owned_head
= f
->owned_head
;
1682 /* bio is NULL initially, so no need to initialize last_{index,bdev} */
1683 erofs_off_t last_pa
;
1684 unsigned int nr_bios
= 0;
1685 struct bio
*bio
= NULL
;
1686 unsigned long pflags
;
1689 /* No need to read from device for pclusters in the bypass queue. */
1690 q
[JQ_BYPASS
] = jobqueue_init(sb
, fgq
+ JQ_BYPASS
, NULL
);
1691 q
[JQ_SUBMIT
] = jobqueue_init(sb
, fgq
+ JQ_SUBMIT
, force_fg
);
1693 qtail
[JQ_BYPASS
] = &q
[JQ_BYPASS
]->head
;
1694 qtail
[JQ_SUBMIT
] = &q
[JQ_SUBMIT
]->head
;
1696 /* by default, all need io submission */
1697 q
[JQ_SUBMIT
]->head
= owned_head
;
1700 struct erofs_map_dev mdev
;
1701 struct z_erofs_pcluster
*pcl
;
1702 erofs_off_t cur
, end
;
1703 struct bio_vec bvec
;
1707 DBG_BUGON(owned_head
== Z_EROFS_PCLUSTER_NIL
);
1708 pcl
= container_of(owned_head
, struct z_erofs_pcluster
, next
);
1709 owned_head
= READ_ONCE(pcl
->next
);
1711 if (z_erofs_is_inline_pcluster(pcl
)) {
1712 move_to_bypass_jobqueue(pcl
, qtail
, owned_head
);
1716 /* no device id here, thus it will always succeed */
1717 mdev
= (struct erofs_map_dev
) {
1718 .m_pa
= erofs_pos(sb
, pcl
->index
),
1720 (void)erofs_map_dev(sb
, &mdev
);
1723 end
= cur
+ pcl
->pclustersize
;
1725 bvec
.bv_page
= NULL
;
1726 if (bio
&& (cur
!= last_pa
||
1727 bio
->bi_bdev
!= mdev
.m_bdev
)) {
1729 if (erofs_is_fileio_mode(EROFS_SB(sb
)))
1730 erofs_fileio_submit_bio(bio
);
1731 else if (erofs_is_fscache_mode(sb
))
1732 erofs_fscache_submit_bio(bio
);
1737 psi_memstall_leave(&pflags
);
1743 if (!bvec
.bv_page
) {
1744 z_erofs_fill_bio_vec(&bvec
, f
, pcl
, i
++, mc
);
1747 if (cur
+ bvec
.bv_len
> end
)
1748 bvec
.bv_len
= end
- cur
;
1749 DBG_BUGON(bvec
.bv_len
< sb
->s_blocksize
);
1752 if (unlikely(PageWorkingset(bvec
.bv_page
)) &&
1754 psi_memstall_enter(&pflags
);
1759 if (erofs_is_fileio_mode(EROFS_SB(sb
)))
1760 bio
= erofs_fileio_bio_alloc(&mdev
);
1761 else if (erofs_is_fscache_mode(sb
))
1762 bio
= erofs_fscache_bio_alloc(&mdev
);
1764 bio
= bio_alloc(mdev
.m_bdev
, BIO_MAX_VECS
,
1765 REQ_OP_READ
, GFP_NOIO
);
1766 bio
->bi_end_io
= z_erofs_endio
;
1767 bio
->bi_iter
.bi_sector
= cur
>> 9;
1768 bio
->bi_private
= q
[JQ_SUBMIT
];
1770 bio
->bi_opf
|= REQ_RAHEAD
;
1774 if (!bio_add_page(bio
, bvec
.bv_page
, bvec
.bv_len
,
1777 last_pa
= cur
+ bvec
.bv_len
;
1779 } while ((cur
+= bvec
.bv_len
) < end
);
1782 qtail
[JQ_SUBMIT
] = &pcl
->next
;
1784 move_to_bypass_jobqueue(pcl
, qtail
, owned_head
);
1785 } while (owned_head
!= Z_EROFS_PCLUSTER_TAIL
);
1788 if (erofs_is_fileio_mode(EROFS_SB(sb
)))
1789 erofs_fileio_submit_bio(bio
);
1790 else if (erofs_is_fscache_mode(sb
))
1791 erofs_fscache_submit_bio(bio
);
1796 psi_memstall_leave(&pflags
);
1799 * although background is preferred, no one is pending for submission.
1800 * don't issue decompression but drop it directly instead.
1802 if (!*force_fg
&& !nr_bios
) {
1803 kvfree(q
[JQ_SUBMIT
]);
1806 z_erofs_decompress_kickoff(q
[JQ_SUBMIT
], nr_bios
);
1809 static int z_erofs_runqueue(struct z_erofs_decompress_frontend
*f
,
1810 unsigned int ra_folios
)
1812 struct z_erofs_decompressqueue io
[NR_JOBQUEUES
];
1813 struct erofs_sb_info
*sbi
= EROFS_I_SB(f
->inode
);
1814 bool force_fg
= z_erofs_is_sync_decompress(sbi
, ra_folios
);
1817 if (f
->owned_head
== Z_EROFS_PCLUSTER_TAIL
)
1819 z_erofs_submit_queue(f
, io
, &force_fg
, !!ra_folios
);
1821 /* handle bypass queue (no i/o pclusters) immediately */
1822 err
= z_erofs_decompress_queue(&io
[JQ_BYPASS
], &f
->pagepool
);
1826 /* wait until all bios are completed */
1827 wait_for_completion_io(&io
[JQ_SUBMIT
].u
.done
);
1829 /* handle synchronous decompress queue in the caller context */
1830 return z_erofs_decompress_queue(&io
[JQ_SUBMIT
], &f
->pagepool
) ?: err
;
1834 * Since partial uptodate is still unimplemented for now, we have to use
1835 * approximate readmore strategies as a start.
1837 static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend
*f
,
1838 struct readahead_control
*rac
, bool backmost
)
1840 struct inode
*inode
= f
->inode
;
1841 struct erofs_map_blocks
*map
= &f
->map
;
1842 erofs_off_t cur
, end
, headoffset
= f
->headoffset
;
1847 end
= headoffset
+ readahead_length(rac
) - 1;
1849 end
= headoffset
+ PAGE_SIZE
- 1;
1851 err
= z_erofs_map_blocks_iter(inode
, map
,
1852 EROFS_GET_BLOCKS_READMORE
);
1856 /* expand ra for the trailing edge if readahead */
1858 cur
= round_up(map
->m_la
+ map
->m_llen
, PAGE_SIZE
);
1859 readahead_expand(rac
, headoffset
, cur
- headoffset
);
1862 end
= round_up(end
, PAGE_SIZE
);
1864 end
= round_up(map
->m_la
, PAGE_SIZE
);
1869 cur
= map
->m_la
+ map
->m_llen
- 1;
1870 while ((cur
>= end
) && (cur
< i_size_read(inode
))) {
1871 pgoff_t index
= cur
>> PAGE_SHIFT
;
1872 struct folio
*folio
;
1874 folio
= erofs_grab_folio_nowait(inode
->i_mapping
, index
);
1875 if (!IS_ERR_OR_NULL(folio
)) {
1876 if (folio_test_uptodate(folio
))
1877 folio_unlock(folio
);
1879 z_erofs_scan_folio(f
, folio
, !!rac
);
1883 if (cur
< PAGE_SIZE
)
1885 cur
= (index
<< PAGE_SHIFT
) - 1;
1889 static int z_erofs_read_folio(struct file
*file
, struct folio
*folio
)
1891 struct inode
*const inode
= folio
->mapping
->host
;
1892 struct z_erofs_decompress_frontend f
= DECOMPRESS_FRONTEND_INIT(inode
);
1895 trace_erofs_read_folio(folio
, false);
1896 f
.headoffset
= (erofs_off_t
)folio
->index
<< PAGE_SHIFT
;
1898 z_erofs_pcluster_readmore(&f
, NULL
, true);
1899 err
= z_erofs_scan_folio(&f
, folio
, false);
1900 z_erofs_pcluster_readmore(&f
, NULL
, false);
1901 z_erofs_pcluster_end(&f
);
1903 /* if some pclusters are ready, need submit them anyway */
1904 err
= z_erofs_runqueue(&f
, 0) ?: err
;
1905 if (err
&& err
!= -EINTR
)
1906 erofs_err(inode
->i_sb
, "read error %d @ %lu of nid %llu",
1907 err
, folio
->index
, EROFS_I(inode
)->nid
);
1909 erofs_put_metabuf(&f
.map
.buf
);
1910 erofs_release_pages(&f
.pagepool
);
1914 static void z_erofs_readahead(struct readahead_control
*rac
)
1916 struct inode
*const inode
= rac
->mapping
->host
;
1917 struct z_erofs_decompress_frontend f
= DECOMPRESS_FRONTEND_INIT(inode
);
1918 struct folio
*head
= NULL
, *folio
;
1919 unsigned int nr_folios
;
1922 f
.headoffset
= readahead_pos(rac
);
1924 z_erofs_pcluster_readmore(&f
, rac
, true);
1925 nr_folios
= readahead_count(rac
);
1926 trace_erofs_readpages(inode
, readahead_index(rac
), nr_folios
, false);
1928 while ((folio
= readahead_folio(rac
))) {
1929 folio
->private = head
;
1933 /* traverse in reverse order for best metadata I/O performance */
1936 head
= folio_get_private(folio
);
1938 err
= z_erofs_scan_folio(&f
, folio
, true);
1939 if (err
&& err
!= -EINTR
)
1940 erofs_err(inode
->i_sb
, "readahead error at folio %lu @ nid %llu",
1941 folio
->index
, EROFS_I(inode
)->nid
);
1943 z_erofs_pcluster_readmore(&f
, rac
, false);
1944 z_erofs_pcluster_end(&f
);
1946 (void)z_erofs_runqueue(&f
, nr_folios
);
1947 erofs_put_metabuf(&f
.map
.buf
);
1948 erofs_release_pages(&f
.pagepool
);
1951 const struct address_space_operations z_erofs_aops
= {
1952 .read_folio
= z_erofs_read_folio
,
1953 .readahead
= z_erofs_readahead
,