// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 HUAWEI, Inc.
 *             https://www.huawei.com/
 * Created by Gao Xiang <gaoxiang25@huawei.com>
 */
#include "zdata.h"
#include "compress.h"
#include <linux/prefetch.h>

#include <trace/events/erofs.h>
/*
 * a compressed_pages[] placeholder in order to avoid
 * being filled with file pages for in-place decompression.
 */
#define PAGE_UNALLOCATED	((void *)0x5F0E4B1D)

/* how to allocate cached pages for a pcluster */
enum z_erofs_cache_alloctype {
	DONTALLOC,	/* don't allocate any cached pages */
	DELAYEDALLOC,	/* delayed allocation (at the time of submitting io) */
	/*
	 * try to use cached I/O if page allocation succeeds or fallback
	 * to in-place I/O instead to avoid any direct reclaim.
	 */
	TRYALLOC,
};
/*
 * tagged pointer with 1-bit tag for all compressed pages
 * tag 0 - the page is just found with an extra page reference
 */
typedef tagptr1_t compressed_page_t;

#define tag_compressed_page_justfound(page) \
	tagptr_fold(compressed_page_t, page, 1)

static struct workqueue_struct *z_erofs_workqueue __read_mostly;
static struct kmem_cache *pcluster_cachep __read_mostly;
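
/*
 * Note that tagptr1_t folds a 1-bit tag into the low bit of an (at least
 * 2-byte aligned) pointer.  tag_compressed_page_justfound() marks a page
 * which was just found in the managed cache (with an extra page reference
 * held), so that pickup_page_for_submission() can tell it apart from other
 * compressed_pages[] entries such as the PAGE_UNALLOCATED placeholder.
 */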
void z_erofs_exit_zip_subsystem(void)
{
	destroy_workqueue(z_erofs_workqueue);
	kmem_cache_destroy(pcluster_cachep);
}
static inline int z_erofs_init_workqueue(void)
{
	const unsigned int onlinecpus = num_possible_cpus();

	/*
	 * no need to spawn too many threads, limiting threads could minimize
	 * scheduling overhead, perhaps per-CPU threads should be better?
	 */
	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
					    WQ_UNBOUND | WQ_HIGHPRI,
					    onlinecpus + onlinecpus / 4);
	return z_erofs_workqueue ? 0 : -ENOMEM;
}
static void z_erofs_pcluster_init_once(void *ptr)
{
	struct z_erofs_pcluster *pcl = ptr;
	struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
	unsigned int i;

	mutex_init(&cl->lock);
	cl->nr_pages = 0;
	cl->vcnt = 0;
	for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
		pcl->compressed_pages[i] = NULL;
}
int __init z_erofs_init_zip_subsystem(void)
{
	pcluster_cachep = kmem_cache_create("erofs_compress",
					    Z_EROFS_WORKGROUP_SIZE, 0,
					    SLAB_RECLAIM_ACCOUNT,
					    z_erofs_pcluster_init_once);
	if (pcluster_cachep) {
		if (!z_erofs_init_workqueue())
			return 0;

		kmem_cache_destroy(pcluster_cachep);
	}
	return -ENOMEM;
}
enum z_erofs_collectmode {
	COLLECT_SECONDARY,
	COLLECT_PRIMARY,
	/*
	 * The current collection was the tail of an existing chain, and the
	 * previously processed chained collections are all decided to
	 * be hooked up to it.
	 * A new chain will be created for the remaining collections which are
	 * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED,
	 * the next collection cannot reuse the whole page safely in
	 * the following scenario:
	 *  ________________________________________________________________
	 * |      tail (partial) page     |       head (partial) page       |
	 * |   (belongs to the next cl)   |   (belongs to the current cl)   |
	 * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
	 */
	COLLECT_PRIMARY_HOOKED,
	/*
	 * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it
	 * could be dispatched into the bypass queue later due to uptodated
	 * managed pages.  All related online pages cannot be reused for
	 * inplace I/O (or pagevec) since they can be directly decoded
	 * without I/O submission.
	 */
	COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
	/*
	 * The current collection has been linked with the owned chain, and
	 * could also be linked with the remaining collections, which means
	 * if the processing page is the tail page of the collection, thus
	 * the current collection can safely use the whole page (since
	 * the previous collection is under control) for in-place I/O, as
	 * illustrated below:
	 *  ________________________________________________________________
	 * |  tail (partial) page |          head (partial) page           |
	 * | (of the current cl)  |      (of the previous collection)      |
	 * |  PRIMARY_FOLLOWED or |                                        |
	 * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
	 *
	 * [  (*) the above page can be used as inplace I/O.               ]
	 */
	COLLECT_PRIMARY_FOLLOWED,
};
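
/*
 * The modes above are ordered by increasing ownership of the collection.
 * Comparisons such as "clt->mode >= COLLECT_PRIMARY" and
 * "clt->mode < COLLECT_PRIMARY_FOLLOWED" below depend on this ordering,
 * so any new mode has to keep SECONDARY < PRIMARY < PRIMARY_HOOKED <
 * PRIMARY_FOLLOWED_NOINPLACE < PRIMARY_FOLLOWED.
 */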
struct z_erofs_collector {
	struct z_erofs_pagevec_ctor vector;

	struct z_erofs_pcluster *pcl, *tailpcl;
	struct z_erofs_collection *cl;
	struct page **compressedpages;
	z_erofs_next_pcluster_t owned_head;

	enum z_erofs_collectmode mode;
};
struct z_erofs_decompress_frontend {
	struct inode *const inode;

	struct z_erofs_collector clt;
	struct erofs_map_blocks map;

	bool readahead;
	/* used for applying cache strategy on the fly */
	bool backmost;
	erofs_off_t headoffset;
};

#define COLLECTOR_INIT() { \
	.owned_head = Z_EROFS_PCLUSTER_TAIL, \
	.mode = COLLECT_PRIMARY_FOLLOWED }

#define DECOMPRESS_FRONTEND_INIT(__i) { \
	.inode = __i, .clt = COLLECTOR_INIT(), \
	.backmost = true, }
static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
static DEFINE_MUTEX(z_pagemap_global_lock);
static void preload_compressed_pages(struct z_erofs_collector *clt,
				     struct address_space *mc,
				     enum z_erofs_cache_alloctype type,
				     struct list_head *pagepool)
{
	const struct z_erofs_pcluster *pcl = clt->pcl;
	const unsigned int clusterpages = BIT(pcl->clusterbits);
	struct page **pages = clt->compressedpages;
	pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
	bool standalone = true;
	gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
			__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;

	if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
		return;

	for (; pages < pcl->compressed_pages + clusterpages; ++pages) {
		struct page *page;
		compressed_page_t t;
		struct page *newpage = NULL;

		/* the compressed page was loaded before */
		if (READ_ONCE(*pages))
			continue;

		page = find_get_page(mc, index);

		if (page) {
			t = tag_compressed_page_justfound(page);
		} else if (type == DELAYEDALLOC) {
			t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED);
		} else if (type == TRYALLOC) {
			newpage = erofs_allocpage(pagepool, gfp);
			if (!newpage)
				goto dontalloc;

			set_page_private(newpage,
					 Z_EROFS_PREALLOCATED_PAGE);
			t = tag_compressed_page_justfound(newpage);
		} else {	/* DONTALLOC */
dontalloc:
			if (standalone)
				clt->compressedpages = pages;
			standalone = false;
			continue;
		}

		if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
			continue;

		if (page) {
			put_page(page);
		} else if (newpage) {
			set_page_private(newpage, 0);
			list_add(&newpage->lru, pagepool);
		}
	}

	if (standalone)		/* downgrade to PRIMARY_FOLLOWED_NOINPLACE */
		clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
}
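
/*
 * After the loop above, each compressed_pages[] slot holds either a cached
 * page (found in or preallocated for the managed cache), the
 * PAGE_UNALLOCATED placeholder, or NULL if nothing could be cached.
 * `standalone' stays true only if no slot had to fall back to the DONTALLOC
 * path, in which case in-place I/O is unnecessary and the collector is
 * downgraded to COLLECT_PRIMARY_FOLLOWED_NOINPLACE.
 */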
/* called by erofs_shrinker to get rid of all compressed_pages */
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
				       struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);
	struct address_space *const mapping = MNGD_MAPPING(sbi);
	const unsigned int clusterpages = BIT(pcl->clusterbits);
	int i;

	/*
	 * the workgroup refcount is now frozen at 1,
	 * therefore no need to worry about available decompression users.
	 */
	for (i = 0; i < clusterpages; ++i) {
		struct page *page = pcl->compressed_pages[i];

		if (!page)
			continue;

		/* block other users from reclaiming or migrating the page */
		if (!trylock_page(page))
			continue;

		if (page->mapping != mapping)
			continue;

		/* barrier is implied in the following 'unlock_page' */
		WRITE_ONCE(pcl->compressed_pages[i], NULL);
		detach_page_private(page);
		unlock_page(page);
	}
	return 0;
}
int erofs_try_to_free_cached_page(struct address_space *mapping,
				  struct page *page)
{
	struct z_erofs_pcluster *const pcl = (void *)page_private(page);
	const unsigned int clusterpages = BIT(pcl->clusterbits);
	int ret = 0;	/* 0 - busy */

	if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
		unsigned int i;

		for (i = 0; i < clusterpages; ++i) {
			if (pcl->compressed_pages[i] == page) {
				WRITE_ONCE(pcl->compressed_pages[i], NULL);
				ret = 1;
				break;
			}
		}
		erofs_workgroup_unfreeze(&pcl->obj, 1);

		if (ret)
			detach_page_private(page);
	}
	return ret;
}
/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
					  struct page *page)
{
	struct z_erofs_pcluster *const pcl = clt->pcl;
	const unsigned int clusterpages = BIT(pcl->clusterbits);

	while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
		if (!cmpxchg(clt->compressedpages++, NULL, page))
			return true;
	}
	return false;
}
/* callers must be with collection lock held */
static int z_erofs_attach_page(struct z_erofs_collector *clt,
			       struct page *page,
			       enum z_erofs_page_type type)
{
	int ret;
	bool occupied;

	/* give priority for inplaceio */
	if (clt->mode >= COLLECT_PRIMARY &&
	    type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
	    z_erofs_try_inplace_io(clt, page))
		return 0;

	ret = z_erofs_pagevec_enqueue(&clt->vector,
				      page, type, &occupied);
	clt->cl->vcnt += (unsigned int)ret;

	return ret ? 0 : -EAGAIN;
}
static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt)
{
	struct z_erofs_pcluster *pcl = clt->pcl;
	z_erofs_next_pcluster_t *owned_head = &clt->owned_head;

	/* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
	if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
		    *owned_head) == Z_EROFS_PCLUSTER_NIL) {
		*owned_head = &pcl->next;
		/* so we can attach this pcluster to our submission chain. */
		clt->mode = COLLECT_PRIMARY_FOLLOWED;
		return;
	}

	/*
	 * type 2, link to the end of an existing open chain, be careful
	 * that its submission is controlled by the original attached chain.
	 */
	if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
		    *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
		*owned_head = Z_EROFS_PCLUSTER_TAIL;
		clt->mode = COLLECT_PRIMARY_HOOKED;
		clt->tailpcl = NULL;
		return;
	}
	/* type 3, it belongs to a chain, but it isn't the end of the chain */
	clt->mode = COLLECT_PRIMARY;
}
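
/*
 * Note that the two cmpxchg()es above claim a pcluster without taking any
 * lock: either the pcluster is free (next == NIL) and becomes the head of
 * our submission chain, or it is the open tail of another chain and our
 * chain is hooked behind it.  If both races are lost, the pcluster is
 * already owned somewhere in the middle of a chain and only plain
 * COLLECT_PRIMARY access remains.
 */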
static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
				     struct inode *inode,
				     struct erofs_map_blocks *map)
{
	struct z_erofs_pcluster *pcl = clt->pcl;
	struct z_erofs_collection *cl;
	unsigned int length;

	/* to avoid unexpected loop formed by corrupted images */
	if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	cl = z_erofs_primarycollection(pcl);
	if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	length = READ_ONCE(pcl->length);
	if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
		if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
			DBG_BUGON(1);
			return -EFSCORRUPTED;
		}
	} else {
		unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;

		if (map->m_flags & EROFS_MAP_FULL_MAPPED)
			llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;

		while (llen > length &&
		       length != cmpxchg_relaxed(&pcl->length, length, llen)) {
			cpu_relax();
			length = READ_ONCE(pcl->length);
		}
	}
	mutex_lock(&cl->lock);
	/* used to check tail merging loop due to corrupted images */
	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
		clt->tailpcl = pcl;

	z_erofs_try_to_claim_pcluster(clt);
	clt->cl = cl;
	return 0;
}
static int z_erofs_register_collection(struct z_erofs_collector *clt,
				       struct inode *inode,
				       struct erofs_map_blocks *map)
{
	struct z_erofs_pcluster *pcl;
	struct z_erofs_collection *cl;
	struct erofs_workgroup *grp;
	int err;

	/* no available workgroup, let's allocate one */
	pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
	if (!pcl)
		return -ENOMEM;

	atomic_set(&pcl->obj.refcount, 1);
	pcl->obj.index = map->m_pa >> PAGE_SHIFT;

	pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
		(map->m_flags & EROFS_MAP_FULL_MAPPED ?
			Z_EROFS_PCLUSTER_FULL_LENGTH : 0);

	if (map->m_flags & EROFS_MAP_ZIPPED)
		pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
	else
		pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;

	pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0];
	pcl->clusterbits -= PAGE_SHIFT;

	/* new pclusters should be claimed as type 1, primary and followed */
	pcl->next = clt->owned_head;
	clt->mode = COLLECT_PRIMARY_FOLLOWED;

	cl = z_erofs_primarycollection(pcl);

	/* must be cleaned before freeing to slab */
	DBG_BUGON(cl->nr_pages);
	DBG_BUGON(cl->vcnt);

	cl->pageofs = map->m_la & ~PAGE_MASK;

	/*
	 * lock all primary followed works before visible to others
	 * and mutex_trylock *never* fails for a new pcluster.
	 */
	DBG_BUGON(!mutex_trylock(&cl->lock));

	grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
	if (IS_ERR(grp)) {
		err = PTR_ERR(grp);
		goto err_out;
	}

	if (grp != &pcl->obj) {
		clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
		err = -EEXIST;
		goto err_out;
	}
	/* used to check tail merging loop due to corrupted images */
	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
		clt->tailpcl = pcl;
	clt->owned_head = &pcl->next;
	clt->pcl = pcl;
	clt->cl = cl;
	return 0;

err_out:
	mutex_unlock(&cl->lock);
	kmem_cache_free(pcluster_cachep, pcl);
	return err;
}
static int z_erofs_collector_begin(struct z_erofs_collector *clt,
				   struct inode *inode,
				   struct erofs_map_blocks *map)
{
	struct erofs_workgroup *grp;
	int ret;

	DBG_BUGON(clt->cl);

	/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
	DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
	DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);

	if (!PAGE_ALIGNED(map->m_pa)) {
		DBG_BUGON(1);
		return -EINVAL;
	}

	grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
	if (grp) {
		clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
	} else {
		ret = z_erofs_register_collection(clt, inode, map);

		if (!ret)
			goto out;
		if (ret != -EEXIST)
			return ret;
	}

	ret = z_erofs_lookup_collection(clt, inode, map);
	if (ret) {
		erofs_workgroup_put(&clt->pcl->obj);
		return ret;
	}

out:
	z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
				  clt->cl->pagevec, clt->cl->vcnt);

	clt->compressedpages = clt->pcl->compressed_pages;
	if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
		clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES;
	return 0;
}
/*
 * keep in mind that unreferenced pclusters are freed
 * only after an RCU grace period.
 */
static void z_erofs_rcu_callback(struct rcu_head *head)
{
	struct z_erofs_collection *const cl =
		container_of(head, struct z_erofs_collection, rcu);

	kmem_cache_free(pcluster_cachep,
			container_of(cl, struct z_erofs_pcluster,
				     primary_collection));
}
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);
	struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl);

	call_rcu(&cl->rcu, z_erofs_rcu_callback);
}
static void z_erofs_collection_put(struct z_erofs_collection *cl)
{
	struct z_erofs_pcluster *const pcl =
		container_of(cl, struct z_erofs_pcluster, primary_collection);

	erofs_workgroup_put(&pcl->obj);
}
static bool z_erofs_collector_end(struct z_erofs_collector *clt)
{
	struct z_erofs_collection *cl = clt->cl;

	if (!cl)
		return false;

	z_erofs_pagevec_ctor_exit(&clt->vector, false);
	mutex_unlock(&cl->lock);

	/*
	 * if all pending pages are added, don't hold its reference
	 * any longer if the pcluster isn't hosted by ourselves.
	 */
	if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
		z_erofs_collection_put(cl);

	clt->cl = NULL;
	return true;
}
static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
				       unsigned int cachestrategy,
				       erofs_off_t la)
{
	if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
		return false;

	if (fe->backmost)
		return true;

	return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
		la < fe->headoffset;
}
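
/*
 * In other words, cached (managed) pages are only worth allocating for the
 * backmost extent of a request or, with the readaround strategy, for
 * extents located before the request start.  Everything else uses
 * DONTALLOC and leaves the slots to be filled with in-place file pages or
 * short-lived pages at submission time.
 */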
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
				struct page *page, struct list_head *pagepool)
{
	struct inode *const inode = fe->inode;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct erofs_map_blocks *const map = &fe->map;
	struct z_erofs_collector *const clt = &fe->clt;
	const loff_t offset = page_offset(page);
	bool tight = true;

	enum z_erofs_cache_alloctype cache_strategy;
	enum z_erofs_page_type page_type;
	unsigned int cur, end, spiltted, index;
	int err = 0;

	/* register locked file pages as online pages in pack */
	z_erofs_onlinepage_init(page);

	spiltted = 0;
	end = PAGE_SIZE;
repeat:
	cur = end - 1;

	/* lucky, within the range of the current map_blocks */
	if (offset + cur >= map->m_la &&
	    offset + cur < map->m_la + map->m_llen) {
		/* didn't get a valid collection previously (very rare) */
		if (!clt->cl)
			goto restart_now;
		goto hitted;
	}

	/* go ahead the next map_blocks */
	erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);

	if (z_erofs_collector_end(clt))
		fe->backmost = false;

	map->m_la = offset + cur;
	map->m_llen = 0;
	err = z_erofs_map_blocks_iter(inode, map, 0);
	if (err)
		goto err_out;

restart_now:
	if (!(map->m_flags & EROFS_MAP_MAPPED))
		goto hitted;

	err = z_erofs_collector_begin(clt, inode, map);
	if (err)
		goto err_out;

	/* preload all compressed pages (maybe downgrade role if necessary) */
	if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la))
		cache_strategy = TRYALLOC;
	else
		cache_strategy = DONTALLOC;

	preload_compressed_pages(clt, MNGD_MAPPING(sbi),
				 cache_strategy, pagepool);

hitted:
	/*
	 * Ensure the current partial page belongs to this submit chain rather
	 * than other concurrent submit chains or the noio(bypass) chain since
	 * those chains are handled asynchronously thus the page cannot be used
	 * for inplace I/O or pagevec (should be processed in strict order.)
	 */
	tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
		  clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);

	cur = end - min_t(unsigned int, offset + end - map->m_la, end);
	if (!(map->m_flags & EROFS_MAP_MAPPED)) {
		zero_user_segment(page, cur, end);
		goto next_part;
	}

	/* let's derive page type */
	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
			(tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));

	if (cur)
		tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);

retry:
	err = z_erofs_attach_page(clt, page, page_type);
	/* should allocate an additional short-lived page for pagevec */
	if (err == -EAGAIN) {
		struct page *const newpage =
				alloc_page(GFP_NOFS | __GFP_NOFAIL);

		set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE);
		err = z_erofs_attach_page(clt, newpage,
					  Z_EROFS_PAGE_TYPE_EXCLUSIVE);
		if (!err)
			goto retry;
	}

	if (err)
		goto err_out;

	index = page->index - (map->m_la >> PAGE_SHIFT);

	z_erofs_onlinepage_fixup(page, index, true);

	/* bump up the number of spiltted parts of a page */
	++spiltted;
	/* also update nr_pages */
	clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
next_part:
	/* can be used for verification */
	map->m_llen = offset + cur - map->m_la;

	end = cur;
	if (end > 0)
		goto repeat;

out:
	z_erofs_onlinepage_endio(page);

	erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
		  __func__, page, spiltted, map->m_llen);
	return err;

	/* if some error occurred while processing this page */
err_out:
	SetPageError(page);
	goto out;
}
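
/*
 * Note that a single file page can be split into several parts which
 * belong to different pclusters, hence the backward walk from the page end
 * above: each iteration maps one extent and attaches the matching part to
 * its collection, and z_erofs_onlinepage_endio() is only reached once all
 * parts have been queued.
 */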
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
				       bool sync, int bios)
{
	/* wake up the caller thread for sync decompression */
	if (sync) {
		unsigned long flags;

		spin_lock_irqsave(&io->u.wait.lock, flags);
		if (!atomic_add_return(bios, &io->pending_bios))
			wake_up_locked(&io->u.wait);
		spin_unlock_irqrestore(&io->u.wait.lock, flags);
		return;
	}

	if (!atomic_add_return(bios, &io->pending_bios))
		queue_work(z_erofs_workqueue, &io->u.work);
}
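
/*
 * pending_bios is biased by the number of submitted bios: each completed
 * bio calls in with bios == -1, so hitting zero means the whole queue has
 * finished and either the synchronous waiter is woken up or the background
 * workqueue is kicked.
 */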
static bool z_erofs_page_is_invalidated(struct page *page)
{
	return !page->mapping && !z_erofs_is_shortlived_page(page);
}
static void z_erofs_decompressqueue_endio(struct bio *bio)
{
	tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
	struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
	blk_status_t err = bio->bi_status;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;

		DBG_BUGON(PageUptodate(page));
		DBG_BUGON(z_erofs_page_is_invalidated(page));

		if (err)
			SetPageError(page);

		if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
			if (!err)
				SetPageUptodate(page);
			unlock_page(page);
		}
	}
	z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
	bio_put(bio);
}
static int z_erofs_decompress_pcluster(struct super_block *sb,
				       struct z_erofs_pcluster *pcl,
				       struct list_head *pagepool)
{
	struct erofs_sb_info *const sbi = EROFS_SB(sb);
	const unsigned int clusterpages = BIT(pcl->clusterbits);
	struct z_erofs_pagevec_ctor ctor;
	unsigned int i, outputsize, llen, nr_pages;
	struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
	struct page **pages, **compressed_pages, *page;

	enum z_erofs_page_type page_type;
	bool overlapped, partial;
	struct z_erofs_collection *cl;
	int err;

	might_sleep();
	cl = z_erofs_primarycollection(pcl);
	DBG_BUGON(!READ_ONCE(cl->nr_pages));

	mutex_lock(&cl->lock);
	nr_pages = cl->nr_pages;

	if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) {
		pages = pages_onstack;
	} else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
		   mutex_trylock(&z_pagemap_global_lock)) {
		pages = z_pagemap_global;
	} else {
		gfp_t gfp_flags = GFP_KERNEL;

		if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
			gfp_flags |= __GFP_NOFAIL;

		pages = kvmalloc_array(nr_pages, sizeof(struct page *),
				       gfp_flags);

		/* fallback to global pagemap for the lowmem scenario */
		if (!pages) {
			mutex_lock(&z_pagemap_global_lock);
			pages = z_pagemap_global;
		}
	}

	for (i = 0; i < nr_pages; ++i)
		pages[i] = NULL;

	err = 0;
	z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
				  cl->pagevec, 0);

	for (i = 0; i < cl->vcnt; ++i) {
		unsigned int pagenr;

		page = z_erofs_pagevec_dequeue(&ctor, &page_type);

		/* all pages in pagevec ought to be valid */
		DBG_BUGON(!page);
		DBG_BUGON(z_erofs_page_is_invalidated(page));

		if (z_erofs_put_shortlivedpage(pagepool, page))
			continue;

		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
			pagenr = 0;
		else
			pagenr = z_erofs_onlinepage_index(page);

		DBG_BUGON(pagenr >= nr_pages);

		/*
		 * currently EROFS doesn't support multiref(dedup),
		 * so here erroring out one multiref page.
		 */
		if (pages[pagenr]) {
			DBG_BUGON(1);
			SetPageError(pages[pagenr]);
			z_erofs_onlinepage_endio(pages[pagenr]);
			err = -EFSCORRUPTED;
		}
		pages[pagenr] = page;
	}
	z_erofs_pagevec_ctor_exit(&ctor, true);

	overlapped = false;
	compressed_pages = pcl->compressed_pages;

	for (i = 0; i < clusterpages; ++i) {
		unsigned int pagenr;

		page = compressed_pages[i];

		/* all compressed pages ought to be valid */
		DBG_BUGON(!page);
		DBG_BUGON(z_erofs_page_is_invalidated(page));

		if (!z_erofs_is_shortlived_page(page)) {
			if (erofs_page_is_managed(sbi, page)) {
				if (!PageUptodate(page))
					err = -EIO;
				continue;
			}

			/*
			 * only if non-head page can be selected
			 * for inplace decompression
			 */
			pagenr = z_erofs_onlinepage_index(page);

			DBG_BUGON(pagenr >= nr_pages);
			if (pages[pagenr]) {
				DBG_BUGON(1);
				SetPageError(pages[pagenr]);
				z_erofs_onlinepage_endio(pages[pagenr]);
				err = -EFSCORRUPTED;
			}
			pages[pagenr] = page;

			overlapped = true;
		}

		/* PG_error needs checking for all non-managed pages */
		if (PageError(page)) {
			DBG_BUGON(PageUptodate(page));
			err = -EIO;
		}
	}

	if (err)
		goto out;

	llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
	if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) {
		outputsize = llen;
		partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
	} else {
		outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs;
		partial = true;
	}

	err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
					.sb = sb,
					.in = compressed_pages,
					.out = pages,
					.pageofs_out = cl->pageofs,
					.inputsize = PAGE_SIZE,
					.outputsize = outputsize,
					.alg = pcl->algorithmformat,
					.inplace_io = overlapped,
					.partial_decoding = partial
				 }, pagepool);

out:
	/* must handle all compressed pages before ending pages */
	for (i = 0; i < clusterpages; ++i) {
		page = compressed_pages[i];

		if (erofs_page_is_managed(sbi, page))
			continue;

		/* recycle all individual short-lived pages */
		(void)z_erofs_put_shortlivedpage(pagepool, page);

		WRITE_ONCE(compressed_pages[i], NULL);
	}

	for (i = 0; i < nr_pages; ++i) {
		page = pages[i];
		if (!page)
			continue;

		DBG_BUGON(z_erofs_page_is_invalidated(page));

		/* recycle all individual short-lived pages */
		if (z_erofs_put_shortlivedpage(pagepool, page))
			continue;

		if (err < 0)
			SetPageError(page);

		z_erofs_onlinepage_endio(page);
	}

	if (pages == z_pagemap_global)
		mutex_unlock(&z_pagemap_global_lock);
	else if (pages != pages_onstack)
		kvfree(pages);

	cl->nr_pages = 0;
	cl->vcnt = 0;

	/* all cl locks MUST be taken before the following line */
	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);

	/* all cl locks SHOULD be released right now */
	mutex_unlock(&cl->lock);

	z_erofs_collection_put(cl);
	return err;
}
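
/*
 * The decompressed page array above is chosen by size: a small on-stack
 * array first, then the mutex-protected z_pagemap_global[] buffer, and
 * finally kvmalloc_array() (which itself falls back to the global buffer
 * under memory pressure).  Whichever was picked is released on the same
 * path at the end of the function.
 */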
static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
				     struct list_head *pagepool)
{
	z_erofs_next_pcluster_t owned = io->head;

	while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
		struct z_erofs_pcluster *pcl;

		/* impossible that 'owned' equals Z_EROFS_PCLUSTER_TAIL */
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);

		/* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);

		pcl = container_of(owned, struct z_erofs_pcluster, next);
		owned = READ_ONCE(pcl->next);

		z_erofs_decompress_pcluster(io->sb, pcl, pagepool);
	}
}
static void z_erofs_decompressqueue_work(struct work_struct *work)
{
	struct z_erofs_decompressqueue *bgq =
		container_of(work, struct z_erofs_decompressqueue, u.work);
	LIST_HEAD(pagepool);

	DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	z_erofs_decompress_queue(bgq, &pagepool);

	put_pages_list(&pagepool);
	kvfree(bgq);
}
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
					       unsigned int nr,
					       struct list_head *pagepool,
					       struct address_space *mc,
					       gfp_t gfp)
{
	const pgoff_t index = pcl->obj.index;
	bool tocache = false;

	struct address_space *mapping;
	struct page *oldpage, *page;

	compressed_page_t t;
	int justfound;

repeat:
	page = READ_ONCE(pcl->compressed_pages[nr]);
	oldpage = page;

	if (!page)
		goto out_allocpage;

	/*
	 * the cached page has not been allocated and
	 * a placeholder is out there, prepare it now.
	 */
	if (page == PAGE_UNALLOCATED) {
		tocache = true;
		goto out_allocpage;
	}

	/* process the target tagged pointer */
	t = tagptr_init(compressed_page_t, page);
	justfound = tagptr_unfold_tags(t);
	page = tagptr_unfold_ptr(t);

	/*
	 * preallocated cached pages, which is used to avoid direct reclaim
	 * otherwise, it will go inplace I/O path instead.
	 */
	if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
		WRITE_ONCE(pcl->compressed_pages[nr], page);
		set_page_private(page, 0);
		tocache = true;
		goto out_tocache;
	}
	mapping = READ_ONCE(page->mapping);

	/*
	 * file-backed online pages in pcluster are all locked steady,
	 * therefore it is impossible for `mapping' to be NULL.
	 */
	if (mapping && mapping != mc)
		/* ought to be unmanaged pages */
		goto out;

	/* directly return for shortlived page as well */
	if (z_erofs_is_shortlived_page(page))
		goto out;

	lock_page(page);

	/* only true if page reclaim goes wrong, should never happen */
	DBG_BUGON(justfound && PagePrivate(page));

	/* the page is still in manage cache */
	if (page->mapping == mc) {
		WRITE_ONCE(pcl->compressed_pages[nr], page);

		ClearPageError(page);
		if (!PagePrivate(page)) {
			/*
			 * impossible to be !PagePrivate(page) for
			 * the current restriction as well if
			 * the page is already in compressed_pages[].
			 */
			DBG_BUGON(!justfound);

			justfound = 0;
			set_page_private(page, (unsigned long)pcl);
			SetPagePrivate(page);
		}

		/* no need to submit io if it is already up-to-date */
		if (PageUptodate(page)) {
			unlock_page(page);
			page = NULL;
		}
		goto out;
	}

	/*
	 * the managed page has been truncated, it's unsafe to
	 * reuse this one, let's allocate a new cache-managed page.
	 */
	DBG_BUGON(page->mapping);
	DBG_BUGON(!justfound);

	tocache = true;
	unlock_page(page);
	put_page(page);
out_allocpage:
	page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
	if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
		list_add(&page->lru, pagepool);
		cond_resched();
		goto repeat;
	}
out_tocache:
	if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
		/* turn into a temporary page if fails (1 ref) */
		set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
		goto out;
	}
	attach_page_private(page, pcl);
	/* drop a refcount added by allocpage (then we have 2 refs here) */
	put_page(page);

out:	/* the only exit (for tracing and debugging) */
	return page;
}
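
/*
 * To sum up, the helper above returns the page to be added to the read bio
 * for compressed slot @nr, or NULL if no I/O is needed because the managed
 * cache page is already uptodate.  Placeholder and truncated-cache cases
 * allocate a fresh page, which is either inserted into the managed cache
 * or, if that fails, downgraded to a short-lived temporary page.
 */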
static struct z_erofs_decompressqueue *
jobqueue_init(struct super_block *sb,
	      struct z_erofs_decompressqueue *fgq, bool *fg)
{
	struct z_erofs_decompressqueue *q;

	if (fg && !*fg) {
		q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
		if (!q) {
			*fg = true;
			goto fg_out;
		}
		INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
	} else {
fg_out:
		q = fgq;
		init_waitqueue_head(&fgq->u.wait);
		atomic_set(&fgq->pending_bios, 0);
	}
	q->sb = sb;
	q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
	return q;
}
/* define decompression jobqueue types */
enum {
	JQ_BYPASS,
	JQ_SUBMIT,
	NR_JOBQUEUES,
};
static void *jobqueueset_init(struct super_block *sb,
			      struct z_erofs_decompressqueue *q[],
			      struct z_erofs_decompressqueue *fgq, bool *fg)
{
	/*
	 * if managed cache is enabled, bypass jobqueue is needed,
	 * no need to read from device for all pclusters in this queue.
	 */
	q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
	q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg);

	return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg));
}
static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
				    z_erofs_next_pcluster_t qtail[],
				    z_erofs_next_pcluster_t owned_head)
{
	z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
	z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];

	DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	if (owned_head == Z_EROFS_PCLUSTER_TAIL)
		owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;

	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);

	WRITE_ONCE(*submit_qtail, owned_head);
	WRITE_ONCE(*bypass_qtail, &pcl->next);

	qtail[JQ_BYPASS] = &pcl->next;
}
static void z_erofs_submit_queue(struct super_block *sb,
				 struct z_erofs_decompress_frontend *f,
				 struct list_head *pagepool,
				 struct z_erofs_decompressqueue *fgq,
				 bool *force_fg)
{
	struct erofs_sb_info *const sbi = EROFS_SB(sb);
	z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
	struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
	void *bi_private;
	z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
	/* since bio will be NULL, no need to initialize last_index */
	pgoff_t last_index;
	unsigned int nr_bios = 0;
	struct bio *bio = NULL;

	bi_private = jobqueueset_init(sb, q, fgq, force_fg);
	qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
	qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;

	/* by default, all need io submission */
	q[JQ_SUBMIT]->head = owned_head;

	do {
		struct z_erofs_pcluster *pcl;
		pgoff_t cur, end;
		unsigned int i = 0;
		bool bypass = true;

		/* no possible 'owned_head' equals the following */
		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);

		pcl = container_of(owned_head, struct z_erofs_pcluster, next);

		cur = pcl->obj.index;
		end = cur + BIT(pcl->clusterbits);

		/* close the main owned chain at first */
		owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
				     Z_EROFS_PCLUSTER_TAIL_CLOSED);

		do {
			struct page *page;

			page = pickup_page_for_submission(pcl, i++, pagepool,
							  MNGD_MAPPING(sbi),
							  GFP_NOFS);
			if (!page)
				continue;

			if (bio && cur != last_index + 1) {
submit_bio_retry:
				submit_bio(bio);
				bio = NULL;
			}

			if (!bio) {
				bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);

				bio->bi_end_io = z_erofs_decompressqueue_endio;
				bio_set_dev(bio, sb->s_bdev);
				bio->bi_iter.bi_sector = (sector_t)cur <<
					LOG_SECTORS_PER_BLOCK;
				bio->bi_private = bi_private;
				bio->bi_opf = REQ_OP_READ;
				if (f->readahead)
					bio->bi_opf |= REQ_RAHEAD;
				++nr_bios;
			}

			if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
				goto submit_bio_retry;

			last_index = cur;
			bypass = false;
		} while (++cur < end);

		if (!bypass)
			qtail[JQ_SUBMIT] = &pcl->next;
		else
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
	} while (owned_head != Z_EROFS_PCLUSTER_TAIL);

	if (bio)
		submit_bio(bio);

	/*
	 * although background is preferred, no one is pending for submission.
	 * don't issue workqueue for decompression but drop it directly instead.
	 */
	if (!*force_fg && !nr_bios) {
		kvfree(q[JQ_SUBMIT]);
		return;
	}
	z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
}
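
/*
 * Note that pclusters whose compressed data is fully served by uptodate
 * cached pages never generate a bio and are moved to the bypass jobqueue,
 * which z_erofs_runqueue() decompresses immediately; everything else stays
 * on the submit jobqueue and is decompressed once its bios complete.
 */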
static void z_erofs_runqueue(struct super_block *sb,
			     struct z_erofs_decompress_frontend *f,
			     struct list_head *pagepool, bool force_fg)
{
	struct z_erofs_decompressqueue io[NR_JOBQUEUES];

	if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL)
		return;
	z_erofs_submit_queue(sb, f, pagepool, io, &force_fg);

	/* handle bypass queue (no i/o pclusters) immediately */
	z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);

	if (!force_fg)
		return;

	/* wait until all bios are completed */
	io_wait_event(io[JQ_SUBMIT].u.wait,
		      !atomic_read(&io[JQ_SUBMIT].pending_bios));

	/* handle synchronous decompress queue in the caller context */
	z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
}
static int z_erofs_readpage(struct file *file, struct page *page)
{
	struct inode *const inode = page->mapping->host;
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	int err;
	LIST_HEAD(pagepool);

	trace_erofs_readpage(page, false);

	f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;

	err = z_erofs_do_read_page(&f, page, &pagepool);
	(void)z_erofs_collector_end(&f.clt);

	/* if some compressed cluster ready, need submit them anyway */
	z_erofs_runqueue(inode->i_sb, &f, &pagepool, true);

	if (err)
		erofs_err(inode->i_sb, "failed to read, err [%d]", err);

	if (f.map.mpage)
		put_page(f.map.mpage);

	/* clean up the remaining free pages */
	put_pages_list(&pagepool);
	return err;
}
static void z_erofs_readahead(struct readahead_control *rac)
{
	struct inode *const inode = rac->mapping->host;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);

	unsigned int nr_pages = readahead_count(rac);
	bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages);
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	struct page *page, *head = NULL;
	LIST_HEAD(pagepool);

	trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);

	f.readahead = true;
	f.headoffset = readahead_pos(rac);

	while ((page = readahead_page(rac))) {
		prefetchw(&page->flags);

		/*
		 * A pure asynchronous readahead is indicated if
		 * a PG_readahead marked page is hit first.
		 * Let's also do asynchronous decompression for this case.
		 */
		sync &= !(PageReadahead(page) && !head);

		set_page_private(page, (unsigned long)head);
		head = page;
	}

	while (head) {
		struct page *page = head;
		int err;

		/* traversal in reverse order */
		head = (void *)page_private(page);

		err = z_erofs_do_read_page(&f, page, &pagepool);
		if (err)
			erofs_err(inode->i_sb,
				  "readahead error at page %lu @ nid %llu",
				  page->index, EROFS_I(inode)->nid);
		put_page(page);
	}

	(void)z_erofs_collector_end(&f.clt);

	z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);

	if (f.map.mpage)
		put_page(f.map.mpage);

	/* clean up the remaining free pages */
	put_pages_list(&pagepool);
}
const struct address_space_operations z_erofs_aops = {
	.readpage = z_erofs_readpage,
	.readahead = z_erofs_readahead,
};