// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 HUAWEI, Inc.
 *             http://www.huawei.com/
 * Created by Gao Xiang <gaoxiang25@huawei.com>
 */
#include "zdata.h"
#include "compress.h"
#include <linux/prefetch.h>

#include <trace/events/erofs.h>

/*
 * a compressed_pages[] placeholder in order to avoid
 * being filled with file pages for in-place decompression.
 */
#define PAGE_UNALLOCATED     ((void *)0x5F0E4B1D)

/* how to allocate cached pages for a pcluster */
enum z_erofs_cache_alloctype {
        DONTALLOC,      /* don't allocate any cached pages */
        DELAYEDALLOC,   /* delayed allocation (at the time of submitting io) */
};

/*
 * tagged pointer with 1-bit tag for all compressed pages
 * tag 0 - the page is just found with an extra page reference
 */
typedef tagptr1_t compressed_page_t;

#define tag_compressed_page_justfound(page) \
        tagptr_fold(compressed_page_t, page, 1)

static struct workqueue_struct *z_erofs_workqueue __read_mostly;
static struct kmem_cache *pcluster_cachep __read_mostly;

void z_erofs_exit_zip_subsystem(void)
{
        destroy_workqueue(z_erofs_workqueue);
        kmem_cache_destroy(pcluster_cachep);
}

static inline int z_erofs_init_workqueue(void)
{
        const unsigned int onlinecpus = num_possible_cpus();
        const unsigned int flags = WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE;

        /*
         * no need to spawn too many threads; limiting the number of threads
         * helps minimize scheduling overhead. Perhaps per-CPU threads would
         * be better?
         */
        z_erofs_workqueue = alloc_workqueue("erofs_unzipd", flags,
                                            onlinecpus + onlinecpus / 4);
        return z_erofs_workqueue ? 0 : -ENOMEM;
}

static void z_erofs_pcluster_init_once(void *ptr)
{
        struct z_erofs_pcluster *pcl = ptr;
        struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
        unsigned int i;

        mutex_init(&cl->lock);
        cl->nr_pages = 0;
        cl->vcnt = 0;
        for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
                pcl->compressed_pages[i] = NULL;
}

int __init z_erofs_init_zip_subsystem(void)
{
        pcluster_cachep = kmem_cache_create("erofs_compress",
                                            Z_EROFS_WORKGROUP_SIZE, 0,
                                            SLAB_RECLAIM_ACCOUNT,
                                            z_erofs_pcluster_init_once);
        if (pcluster_cachep) {
                if (!z_erofs_init_workqueue())
                        return 0;

                kmem_cache_destroy(pcluster_cachep);
        }
        return -ENOMEM;
}

enum z_erofs_collectmode {
        COLLECT_SECONDARY,
        COLLECT_PRIMARY,
        /*
         * The current collection was the tail of an existing chain, and the
         * previously processed chained collections have all been decided to
         * be hooked up to it.
         * A new chain will be created for the remaining collections which are
         * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED,
         * the next collection cannot reuse the whole page safely in
         * the following scenario:
         *  ________________________________________________________________
         * |      tail (partial) page     |       head (partial) page       |
         * |   (belongs to the next cl)   |   (belongs to the current cl)   |
         * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
         */
        COLLECT_PRIMARY_HOOKED,
        COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
        /*
         * The current collection has been linked with the owned chain, and
         * could also be linked with the remaining collections, which means
         * if the processing page is the tail page of the collection, the
         * current collection can safely use the whole page (since
         * the previous collection is under control) for in-place I/O, as
         * illustrated below:
         *  ________________________________________________________________
         * |  tail (partial) page |          head (partial) page           |
         * |  (of the current cl) |      (of the previous collection)      |
         * |  PRIMARY_FOLLOWED or |                                        |
         * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
         *
         * [  (*) the above page can be used for in-place I/O.             ]
         */
        COLLECT_PRIMARY_FOLLOWED,
};

struct z_erofs_collector {
        struct z_erofs_pagevec_ctor vector;

        struct z_erofs_pcluster *pcl, *tailpcl;
        struct z_erofs_collection *cl;
        struct page **compressedpages;
        z_erofs_next_pcluster_t owned_head;

        enum z_erofs_collectmode mode;
};

struct z_erofs_decompress_frontend {
        struct inode *const inode;

        struct z_erofs_collector clt;
        struct erofs_map_blocks map;

        /* used for applying cache strategy on the fly */
        bool backmost;
        erofs_off_t headoffset;
};

#define COLLECTOR_INIT() { \
        .owned_head = Z_EROFS_PCLUSTER_TAIL, \
        .mode = COLLECT_PRIMARY_FOLLOWED }

#define DECOMPRESS_FRONTEND_INIT(__i) { \
        .inode = __i, .clt = COLLECTOR_INIT(), \
        .backmost = true, }

static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
static DEFINE_MUTEX(z_pagemap_global_lock);
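
/*
 * Fill the compressed_pages[] slots of the current pcluster ahead of I/O
 * submission: reuse pages already in the managed cache, or (DELAYEDALLOC)
 * leave PAGE_UNALLOCATED placeholders to be allocated at submission time.
 * If every remaining slot is covered this way, no in-place I/O is needed
 * and the collector is downgraded to COLLECT_PRIMARY_FOLLOWED_NOINPLACE.
 */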
static void preload_compressed_pages(struct z_erofs_collector *clt,
                                     struct address_space *mc,
                                     enum z_erofs_cache_alloctype type,
                                     struct list_head *pagepool)
{
        const struct z_erofs_pcluster *pcl = clt->pcl;
        const unsigned int clusterpages = BIT(pcl->clusterbits);
        struct page **pages = clt->compressedpages;
        pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
        bool standalone = true;

        if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
                return;

        for (; pages < pcl->compressed_pages + clusterpages; ++pages) {
                struct page *page;
                compressed_page_t t;

                /* the compressed page was loaded before */
                if (READ_ONCE(*pages))
                        continue;

                page = find_get_page(mc, index);

                if (page) {
                        t = tag_compressed_page_justfound(page);
                } else if (type == DELAYEDALLOC) {
                        t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED);
                } else {        /* DONTALLOC */
                        if (standalone)
                                clt->compressedpages = pages;
                        standalone = false;
                        continue;
                }

                if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
                        continue;

                if (page)
                        put_page(page);
        }

        if (standalone)         /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */
                clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
}

/* called by erofs_shrinker to get rid of all compressed_pages */
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
                                       struct erofs_workgroup *grp)
{
        struct z_erofs_pcluster *const pcl =
                container_of(grp, struct z_erofs_pcluster, obj);
        struct address_space *const mapping = MNGD_MAPPING(sbi);
        const unsigned int clusterpages = BIT(pcl->clusterbits);
        int i;

        /*
         * the refcount of the workgroup is now frozen to 1,
         * therefore no need to worry about available decompression users.
         */
        for (i = 0; i < clusterpages; ++i) {
                struct page *page = pcl->compressed_pages[i];

                if (!page)
                        continue;

                /* block other users from reclaiming or migrating the page */
                if (!trylock_page(page))
                        continue;

                if (page->mapping != mapping)
                        continue;

                /* barrier is implied in the following 'unlock_page' */
                WRITE_ONCE(pcl->compressed_pages[i], NULL);
                set_page_private(page, 0);
                ClearPagePrivate(page);

                unlock_page(page);
                put_page(page);
        }
        return 0;
}

int erofs_try_to_free_cached_page(struct address_space *mapping,
                                  struct page *page)
{
        struct z_erofs_pcluster *const pcl = (void *)page_private(page);
        const unsigned int clusterpages = BIT(pcl->clusterbits);
        int ret = 0;    /* 0 - busy */

        if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
                unsigned int i;

                for (i = 0; i < clusterpages; ++i) {
                        if (pcl->compressed_pages[i] == page) {
                                WRITE_ONCE(pcl->compressed_pages[i], NULL);
                                ret = 1;
                                break;
                        }
                }
                erofs_workgroup_unfreeze(&pcl->obj, 1);

                if (ret) {
                        ClearPagePrivate(page);
                        put_page(page);
                }
        }
        return ret;
}
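
/*
 * In-place I/O: reserve a free compressed_pages[] slot so that the file
 * page itself can receive compressed data and be decompressed in place,
 * which avoids allocating a separate temporary page.
 */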
/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
                                          struct page *page)
{
        struct z_erofs_pcluster *const pcl = clt->pcl;
        const unsigned int clusterpages = BIT(pcl->clusterbits);

        while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
                if (!cmpxchg(clt->compressedpages++, NULL, page))
                        return true;
        }
        return false;
}

/* callers must be with collection lock held */
static int z_erofs_attach_page(struct z_erofs_collector *clt,
                               struct page *page,
                               enum z_erofs_page_type type)
{
        int ret;
        bool occupied;

        /* give priority to in-place I/O */
        if (clt->mode >= COLLECT_PRIMARY &&
            type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
            z_erofs_try_inplace_io(clt, page))
                return 0;

        ret = z_erofs_pagevec_enqueue(&clt->vector,
                                      page, type, &occupied);
        clt->cl->vcnt += (unsigned int)ret;

        return ret ? 0 : -EAGAIN;
}

static enum z_erofs_collectmode
try_to_claim_pcluster(struct z_erofs_pcluster *pcl,
                      z_erofs_next_pcluster_t *owned_head)
{
        /* let's claim these following types of pclusters */
retry:
        if (pcl->next == Z_EROFS_PCLUSTER_NIL) {
                /* type 1, nil pcluster */
                if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
                            *owned_head) != Z_EROFS_PCLUSTER_NIL)
                        goto retry;

                *owned_head = &pcl->next;
                /* lucky, I am the followee :) */
                return COLLECT_PRIMARY_FOLLOWED;
        } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) {
                /*
                 * type 2, link to the end of an existing open chain,
                 * be careful that its submission itself is governed
                 * by the original owned chain.
                 */
                if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
                            *owned_head) != Z_EROFS_PCLUSTER_TAIL)
                        goto retry;
                *owned_head = Z_EROFS_PCLUSTER_TAIL;
                return COLLECT_PRIMARY_HOOKED;
        }
        return COLLECT_PRIMARY; /* :( better luck next time */
}
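
/*
 * Set up the collector on top of an existing pcluster: sanity-check it
 * against the current mapping, widen its recorded logical length if
 * needed, then lock its primary collection and claim it into the chain.
 */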
static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
                                     struct inode *inode,
                                     struct erofs_map_blocks *map)
{
        struct z_erofs_pcluster *pcl = clt->pcl;
        struct z_erofs_collection *cl;
        unsigned int length;

        /* to avoid unexpected loop formed by corrupted images */
        if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
                DBG_BUGON(1);
                return -EFSCORRUPTED;
        }

        cl = z_erofs_primarycollection(pcl);
        if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
                DBG_BUGON(1);
                return -EFSCORRUPTED;
        }

        length = READ_ONCE(pcl->length);
        if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
                if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
                        DBG_BUGON(1);
                        return -EFSCORRUPTED;
                }
        } else {
                unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;

                if (map->m_flags & EROFS_MAP_FULL_MAPPED)
                        llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;

                while (llen > length &&
                       length != cmpxchg_relaxed(&pcl->length, length, llen)) {
                        cpu_relax();
                        length = READ_ONCE(pcl->length);
                }
        }
        mutex_lock(&cl->lock);
        /* used to check tail merging loop due to corrupted images */
        if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
                clt->tailpcl = pcl;
        clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head);
        /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */
        if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
                clt->tailpcl = NULL;
        clt->cl = cl;
        return 0;
}
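
/*
 * Allocate and initialize a new pcluster for the current extent, insert
 * it into the per-superblock workgroup tree and claim it as the head of
 * the owned chain (type 1, primary and followed).
 */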
static int z_erofs_register_collection(struct z_erofs_collector *clt,
                                       struct inode *inode,
                                       struct erofs_map_blocks *map)
{
        struct z_erofs_pcluster *pcl;
        struct z_erofs_collection *cl;
        struct erofs_workgroup *grp;
        int err;

        /* no available workgroup, let's allocate one */
        pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
        if (!pcl)
                return -ENOMEM;

        atomic_set(&pcl->obj.refcount, 1);
        pcl->obj.index = map->m_pa >> PAGE_SHIFT;

        pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
                (map->m_flags & EROFS_MAP_FULL_MAPPED ?
                        Z_EROFS_PCLUSTER_FULL_LENGTH : 0);

        if (map->m_flags & EROFS_MAP_ZIPPED)
                pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
        else
                pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;

        pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0];
        pcl->clusterbits -= PAGE_SHIFT;

        /* new pclusters should be claimed as type 1, primary and followed */
        pcl->next = clt->owned_head;
        clt->mode = COLLECT_PRIMARY_FOLLOWED;

        cl = z_erofs_primarycollection(pcl);

        /* must be cleaned before freeing to slab */
        DBG_BUGON(cl->nr_pages);
        DBG_BUGON(cl->vcnt);

        cl->pageofs = map->m_la & ~PAGE_MASK;

        /*
         * lock all primary followed works before visible to others
         * and mutex_trylock *never* fails for a new pcluster.
         */
        DBG_BUGON(!mutex_trylock(&cl->lock));

        grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
        if (IS_ERR(grp)) {
                err = PTR_ERR(grp);
                goto err_out;
        }

        if (grp != &pcl->obj) {
                clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
                err = -EEXIST;
                goto err_out;
        }
        /* used to check tail merging loop due to corrupted images */
        if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
                clt->tailpcl = pcl;
        clt->owned_head = &pcl->next;
        clt->pcl = pcl;
        clt->cl = cl;
        return 0;

err_out:
        mutex_unlock(&cl->lock);
        kmem_cache_free(pcluster_cachep, pcl);
        return err;
}
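
/*
 * Find or create the pcluster covering map->m_pa and prepare the collector
 * for attaching file pages: set up the inline pagevec ctor and decide which
 * compressed_pages[] slots may be used for in-place I/O.
 */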
static int z_erofs_collector_begin(struct z_erofs_collector *clt,
                                   struct inode *inode,
                                   struct erofs_map_blocks *map)
{
        struct erofs_workgroup *grp;
        int ret;

        /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
        DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
        DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);

        if (!PAGE_ALIGNED(map->m_pa)) {
                DBG_BUGON(1);
                return -EINVAL;
        }

        grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
        if (grp) {
                clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
        } else {
                ret = z_erofs_register_collection(clt, inode, map);

                if (!ret)
                        goto out;
                if (ret != -EEXIST)
                        return ret;
        }

        ret = z_erofs_lookup_collection(clt, inode, map);
        if (ret) {
                erofs_workgroup_put(&clt->pcl->obj);
                return ret;
        }

out:
        z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
                                  clt->cl->pagevec, clt->cl->vcnt);

        clt->compressedpages = clt->pcl->compressed_pages;
        if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
                clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES;
        return 0;
}

/*
 * keep in mind that no referenced pclusters will be freed;
 * unreferenced ones are only freed after an RCU grace period.
 */
static void z_erofs_rcu_callback(struct rcu_head *head)
{
        struct z_erofs_collection *const cl =
                container_of(head, struct z_erofs_collection, rcu);

        kmem_cache_free(pcluster_cachep,
                        container_of(cl, struct z_erofs_pcluster,
                                     primary_collection));
}

void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
{
        struct z_erofs_pcluster *const pcl =
                container_of(grp, struct z_erofs_pcluster, obj);
        struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl);

        call_rcu(&cl->rcu, z_erofs_rcu_callback);
}

static void z_erofs_collection_put(struct z_erofs_collection *cl)
{
        struct z_erofs_pcluster *const pcl =
                container_of(cl, struct z_erofs_pcluster, primary_collection);

        erofs_workgroup_put(&pcl->obj);
}

static bool z_erofs_collector_end(struct z_erofs_collector *clt)
{
        struct z_erofs_collection *cl = clt->cl;

        if (!cl)
                return false;

        z_erofs_pagevec_ctor_exit(&clt->vector, false);
        mutex_unlock(&cl->lock);

        /*
         * if all pending pages are added, don't hold its reference
         * any longer if the pcluster isn't hosted by ourselves.
         */
        if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
                z_erofs_collection_put(cl);

        clt->cl = NULL;
        return true;
}

static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
                                       unsigned int cachestrategy,
                                       erofs_off_t la)
{
        if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
                return false;

        if (fe->backmost)
                return true;

        return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
                la < fe->headoffset;
}
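
/*
 * Attach one locked file page to the frontend: walk every logical extent
 * covering it, look up or register the matching collection, and record the
 * page (or its sub-parts) for the later decompression pass.
 */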
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
                                struct page *page,
                                struct list_head *pagepool)
{
        struct inode *const inode = fe->inode;
        struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
        struct erofs_map_blocks *const map = &fe->map;
        struct z_erofs_collector *const clt = &fe->clt;
        const loff_t offset = page_offset(page);
        bool tight = true;

        enum z_erofs_cache_alloctype cache_strategy;
        enum z_erofs_page_type page_type;
        unsigned int cur, end, spiltted, index;
        int err = 0;

        /* register locked file pages as online pages in pack */
        z_erofs_onlinepage_init(page);

        spiltted = 0;
        end = PAGE_SIZE;
repeat:
        cur = end - 1;

        /* lucky, within the range of the current map_blocks */
        if (offset + cur >= map->m_la &&
            offset + cur < map->m_la + map->m_llen) {
                /* didn't get a valid collection previously (very rare) */
                if (!clt->cl)
                        goto restart_now;
                goto hitted;
        }

        /* go ahead to the next map_blocks extent */
        erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);

        if (z_erofs_collector_end(clt))
                fe->backmost = false;

        map->m_la = offset + cur;
        map->m_llen = 0;
        err = z_erofs_map_blocks_iter(inode, map, 0);
        if (err)
                goto err_out;

restart_now:
        if (!(map->m_flags & EROFS_MAP_MAPPED))
                goto hitted;

        err = z_erofs_collector_begin(clt, inode, map);
        if (err)
                goto err_out;

        /* preload all compressed pages (maybe downgrade role if necessary) */
        if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la))
                cache_strategy = DELAYEDALLOC;
        else
                cache_strategy = DONTALLOC;

        preload_compressed_pages(clt, MNGD_MAPPING(sbi),
                                 cache_strategy, pagepool);

hitted:
        /*
         * Ensure the current partial page belongs to this submit chain rather
         * than other concurrent submit chains or the noio(bypass) chain since
         * those chains are handled asynchronously thus the page cannot be used
         * for inplace I/O or pagevec (should be processed in strict order.)
         */
        tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
                  clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);

        cur = end - min_t(unsigned int, offset + end - map->m_la, end);
        if (!(map->m_flags & EROFS_MAP_MAPPED)) {
                zero_user_segment(page, cur, end);
                goto next_part;
        }

        /* let's derive page type */
        page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
                (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
                        (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
                                Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));

        if (cur)
                tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);

retry:
        err = z_erofs_attach_page(clt, page, page_type);
        /* should allocate an additional staging page for pagevec */
        if (err == -EAGAIN) {
                struct page *const newpage =
                        erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL);

                newpage->mapping = Z_EROFS_MAPPING_STAGING;
                err = z_erofs_attach_page(clt, newpage,
                                          Z_EROFS_PAGE_TYPE_EXCLUSIVE);
                if (!err)
                        goto retry;
        }

        if (err)
                goto err_out;

        index = page->index - (map->m_la >> PAGE_SHIFT);

        z_erofs_onlinepage_fixup(page, index, true);

        /* bump up the number of spiltted parts of a page */
        ++spiltted;
        /* also update nr_pages */
        clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
next_part:
        /* can be used for verification */
        map->m_llen = offset + cur - map->m_la;

        end = cur;
        if (end > 0)
                goto repeat;

out:
        z_erofs_onlinepage_endio(page);

        erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
                  __func__, page, spiltted, map->m_llen);
        return err;

        /* if some error occurred while processing this page */
err_out:
        SetPageError(page);
        goto out;
}
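
/*
 * Account @bios against pending_bios; whoever drops the counter to zero
 * either wakes up the synchronous waiter or queues the background
 * decompression work.
 */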
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
                                       bool sync, int bios)
{
        /* wake up the caller thread for sync decompression */
        if (sync) {
                unsigned long flags;

                spin_lock_irqsave(&io->u.wait.lock, flags);
                if (!atomic_add_return(bios, &io->pending_bios))
                        wake_up_locked(&io->u.wait);
                spin_unlock_irqrestore(&io->u.wait.lock, flags);
                return;
        }

        if (!atomic_add_return(bios, &io->pending_bios))
                queue_work(z_erofs_workqueue, &io->u.work);
}

static void z_erofs_decompressqueue_endio(struct bio *bio)
{
        tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
        struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
        blk_status_t err = bio->bi_status;
        struct bio_vec *bvec;
        struct bvec_iter_all iter_all;

        bio_for_each_segment_all(bvec, bio, iter_all) {
                struct page *page = bvec->bv_page;

                DBG_BUGON(PageUptodate(page));
                DBG_BUGON(!page->mapping);

                if (err)
                        SetPageError(page);

                if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
                        if (!err)
                                SetPageUptodate(page);
                        unlock_page(page);
                }
        }
        z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
        bio_put(bio);
}
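
/*
 * Decompress a single pcluster: gather the output file pages recorded in
 * the pagevec, collect the compressed (cached or in-place) pages, run the
 * decompressor, then recycle or release every page involved.
 */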
static int z_erofs_decompress_pcluster(struct super_block *sb,
                                       struct z_erofs_pcluster *pcl,
                                       struct list_head *pagepool)
{
        struct erofs_sb_info *const sbi = EROFS_SB(sb);
        const unsigned int clusterpages = BIT(pcl->clusterbits);
        struct z_erofs_pagevec_ctor ctor;
        unsigned int i, outputsize, llen, nr_pages;
        struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
        struct page **pages, **compressed_pages, *page;

        enum z_erofs_page_type page_type;
        bool overlapped, partial;
        struct z_erofs_collection *cl;
        int err;

        might_sleep();
        cl = z_erofs_primarycollection(pcl);
        DBG_BUGON(!READ_ONCE(cl->nr_pages));

        mutex_lock(&cl->lock);
        nr_pages = cl->nr_pages;

        if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) {
                pages = pages_onstack;
        } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
                   mutex_trylock(&z_pagemap_global_lock)) {
                pages = z_pagemap_global;
        } else {
                gfp_t gfp_flags = GFP_KERNEL;

                if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
                        gfp_flags |= __GFP_NOFAIL;

                pages = kvmalloc_array(nr_pages, sizeof(struct page *),
                                       gfp_flags);

                /* fallback to global pagemap for the lowmem scenario */
                if (!pages) {
                        mutex_lock(&z_pagemap_global_lock);
                        pages = z_pagemap_global;
                }
        }

        for (i = 0; i < nr_pages; ++i)
                pages[i] = NULL;

        err = 0;
        z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
                                  cl->pagevec, 0);

        for (i = 0; i < cl->vcnt; ++i) {
                unsigned int pagenr;

                page = z_erofs_pagevec_dequeue(&ctor, &page_type);

                /* all pages in pagevec ought to be valid */
                DBG_BUGON(!page);
                DBG_BUGON(!page->mapping);

                if (z_erofs_put_stagingpage(pagepool, page))
                        continue;

                if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
                        pagenr = 0;
                else
                        pagenr = z_erofs_onlinepage_index(page);

                DBG_BUGON(pagenr >= nr_pages);

                /*
                 * currently EROFS doesn't support multiref(dedup),
                 * so we error out any multiref page here.
                 */
                if (pages[pagenr]) {
                        DBG_BUGON(1);
                        SetPageError(pages[pagenr]);
                        z_erofs_onlinepage_endio(pages[pagenr]);
                        err = -EFSCORRUPTED;
                }
                pages[pagenr] = page;
        }
        z_erofs_pagevec_ctor_exit(&ctor, true);

        overlapped = false;
        compressed_pages = pcl->compressed_pages;

        for (i = 0; i < clusterpages; ++i) {
                unsigned int pagenr;

                page = compressed_pages[i];

                /* all compressed pages ought to be valid */
                DBG_BUGON(!page);
                DBG_BUGON(!page->mapping);

                if (!z_erofs_page_is_staging(page)) {
                        if (erofs_page_is_managed(sbi, page)) {
                                if (!PageUptodate(page))
                                        err = -EIO;
                                continue;
                        }

                        /*
                         * only non-head pages can be selected
                         * for inplace decompression
                         */
                        pagenr = z_erofs_onlinepage_index(page);

                        DBG_BUGON(pagenr >= nr_pages);
                        if (pages[pagenr]) {
                                DBG_BUGON(1);
                                SetPageError(pages[pagenr]);
                                z_erofs_onlinepage_endio(pages[pagenr]);
                                err = -EFSCORRUPTED;
                        }
                        pages[pagenr] = page;

                        overlapped = true;
                }

                /* PG_error needs checking for inplaced and staging pages */
                if (PageError(page)) {
                        DBG_BUGON(PageUptodate(page));
                        err = -EIO;
                }
        }

        if (err)
                goto out;

        llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
        if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) {
                outputsize = llen;
                partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
        } else {
                outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs;
                partial = true;
        }

        err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
                                        .sb = sb,
                                        .in = compressed_pages,
                                        .out = pages,
                                        .pageofs_out = cl->pageofs,
                                        .inputsize = PAGE_SIZE,
                                        .outputsize = outputsize,
                                        .alg = pcl->algorithmformat,
                                        .inplace_io = overlapped,
                                        .partial_decoding = partial
                                 }, pagepool);

out:
        /* must handle all compressed pages before ending file pages */
        for (i = 0; i < clusterpages; ++i) {
                page = compressed_pages[i];

                if (erofs_page_is_managed(sbi, page))
                        continue;

                /* recycle all individual staging pages */
                (void)z_erofs_put_stagingpage(pagepool, page);

                WRITE_ONCE(compressed_pages[i], NULL);
        }

        for (i = 0; i < nr_pages; ++i) {
                page = pages[i];
                if (!page)
                        continue;

                DBG_BUGON(!page->mapping);

                /* recycle all individual staging pages */
                if (z_erofs_put_stagingpage(pagepool, page))
                        continue;

                if (err < 0)
                        SetPageError(page);

                z_erofs_onlinepage_endio(page);
        }

        if (pages == z_pagemap_global)
                mutex_unlock(&z_pagemap_global_lock);
        else if (pages != pages_onstack)
                kvfree(pages);

        cl->nr_pages = 0;
        cl->vcnt = 0;

        /* all cl locks MUST be taken before the following line */
        WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);

        /* all cl locks SHOULD be released right now */
        mutex_unlock(&cl->lock);

        z_erofs_collection_put(cl);
        return err;
}
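
/* walk a closed pcluster chain and decompress every pcluster in order */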
static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
                                     struct list_head *pagepool)
{
        z_erofs_next_pcluster_t owned = io->head;

        while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
                struct z_erofs_pcluster *pcl;

                /* impossible that 'owned' equals Z_EROFS_PCLUSTER_TAIL */
                DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);

                /* impossible that 'owned' equals NULL */
                DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);

                pcl = container_of(owned, struct z_erofs_pcluster, next);
                owned = READ_ONCE(pcl->next);

                z_erofs_decompress_pcluster(io->sb, pcl, pagepool);
        }
}

static void z_erofs_decompressqueue_work(struct work_struct *work)
{
        struct z_erofs_decompressqueue *bgq =
                container_of(work, struct z_erofs_decompressqueue, u.work);
        LIST_HEAD(pagepool);

        DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
        z_erofs_decompress_queue(bgq, &pagepool);

        put_pages_list(&pagepool);
        kvfree(bgq);
}
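
/*
 * Resolve the compressed page for one slot at submission time: reuse a
 * still-valid cached page, materialize a DELAYEDALLOC placeholder, or
 * fall back to a freshly allocated page (added to the managed cache when
 * possible, otherwise used as a temporary staging page).
 */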
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
                                               unsigned int nr,
                                               struct list_head *pagepool,
                                               struct address_space *mc,
                                               gfp_t gfp)
{
        const pgoff_t index = pcl->obj.index;
        bool tocache = false;

        struct address_space *mapping;
        struct page *oldpage, *page;

        compressed_page_t t;
        int justfound;

repeat:
        page = READ_ONCE(pcl->compressed_pages[nr]);
        oldpage = page;

        if (!page)
                goto out_allocpage;

        /*
         * the cached page has not been allocated and
         * a placeholder is out there, prepare it now.
         */
        if (page == PAGE_UNALLOCATED) {
                tocache = true;
                goto out_allocpage;
        }

        /* process the target tagged pointer */
        t = tagptr_init(compressed_page_t, page);
        justfound = tagptr_unfold_tags(t);
        page = tagptr_unfold_ptr(t);

        mapping = READ_ONCE(page->mapping);

        /*
         * unmanaged (file) pages are all locked solidly,
         * therefore it is impossible for `mapping' to be NULL.
         */
        if (mapping && mapping != mc)
                /* ought to be unmanaged pages */
                goto out;

        lock_page(page);

        /* only true if page reclaim goes wrong, should never happen */
        DBG_BUGON(justfound && PagePrivate(page));

        /* the page is still in the managed cache */
        if (page->mapping == mc) {
                WRITE_ONCE(pcl->compressed_pages[nr], page);

                ClearPageError(page);
                if (!PagePrivate(page)) {
                        /*
                         * impossible to be !PagePrivate(page) under the
                         * current restriction if the page is already in
                         * compressed_pages[].
                         */
                        DBG_BUGON(!justfound);

                        justfound = 0;
                        set_page_private(page, (unsigned long)pcl);
                        SetPagePrivate(page);
                }

                /* no need to submit io if it is already up-to-date */
                if (PageUptodate(page)) {
                        unlock_page(page);
                        page = NULL;
                }
                goto out;
        }

        /*
         * the managed page has been truncated, it's unsafe to
         * reuse this one, let's allocate a new cache-managed page.
         */
        DBG_BUGON(page->mapping);
        DBG_BUGON(!justfound);

        tocache = true;
        unlock_page(page);
        put_page(page);
out_allocpage:
        page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
        if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
                /* non-LRU / non-movable temporary page is needed */
                page->mapping = Z_EROFS_MAPPING_STAGING;
                tocache = false;
        }

        if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
                if (tocache) {
                        /* since it was added to managed cache successfully */
                        unlock_page(page);
                        put_page(page);
                } else {
                        list_add(&page->lru, pagepool);
                }
                cond_resched();
                goto repeat;
        }
        set_page_private(page, (unsigned long)pcl);
        SetPagePrivate(page);
out:    /* the only exit (for tracing and debugging) */
        return page;
}

static struct z_erofs_decompressqueue *
jobqueue_init(struct super_block *sb,
              struct z_erofs_decompressqueue *fgq, bool *fg)
{
        struct z_erofs_decompressqueue *q;

        if (fg && !*fg) {
                q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
                if (!q) {
                        *fg = true;
                        goto fg_out;
                }
                INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
        } else {
fg_out:
                q = fgq;
                init_waitqueue_head(&fgq->u.wait);
                atomic_set(&fgq->pending_bios, 0);
        }
        q->sb = sb;
        q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
        return q;
}

/* define decompression jobqueue types */
enum {
        JQ_BYPASS,
        JQ_SUBMIT,
        NR_JOBQUEUES,
};

static void *jobqueueset_init(struct super_block *sb,
                              struct z_erofs_decompressqueue *q[],
                              struct z_erofs_decompressqueue *fgq, bool *fg)
{
        /*
         * if managed cache is enabled, a bypass jobqueue is needed:
         * no need to read from the device for pclusters in this queue.
         */
        q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
        q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg);

        return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg));
}

static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
                                    z_erofs_next_pcluster_t qtail[],
                                    z_erofs_next_pcluster_t owned_head)
{
        z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
        z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];

        DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
        if (owned_head == Z_EROFS_PCLUSTER_TAIL)
                owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;

        WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);

        WRITE_ONCE(*submit_qtail, owned_head);
        WRITE_ONCE(*bypass_qtail, &pcl->next);

        qtail[JQ_BYPASS] = &pcl->next;
}
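
/*
 * Walk the owned chain, read missing compressed pages with as few bios as
 * possible (physically contiguous pages are merged), and move pclusters
 * that need no I/O at all over to the bypass jobqueue.
 */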
static void z_erofs_submit_queue(struct super_block *sb,
                                 z_erofs_next_pcluster_t owned_head,
                                 struct list_head *pagepool,
                                 struct z_erofs_decompressqueue *fgq,
                                 bool *force_fg)
{
        struct erofs_sb_info *const sbi = EROFS_SB(sb);
        z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
        struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
        void *bi_private;
        /* since bio will be NULL, no need to initialize last_index */
        pgoff_t uninitialized_var(last_index);
        unsigned int nr_bios = 0;
        struct bio *bio = NULL;

        bi_private = jobqueueset_init(sb, q, fgq, force_fg);
        qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
        qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;

        /* by default, all need io submission */
        q[JQ_SUBMIT]->head = owned_head;

        do {
                struct z_erofs_pcluster *pcl;
                pgoff_t cur, end;
                unsigned int i = 0;
                bool bypass = true;

                /* no possible 'owned_head' equals the following */
                DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
                DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);

                pcl = container_of(owned_head, struct z_erofs_pcluster, next);

                cur = pcl->obj.index;
                end = cur + BIT(pcl->clusterbits);

                /* close the main owned chain at first */
                owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
                                     Z_EROFS_PCLUSTER_TAIL_CLOSED);

                do {
                        struct page *page;
                        int err;

                        page = pickup_page_for_submission(pcl, i++, pagepool,
                                                          MNGD_MAPPING(sbi),
                                                          GFP_NOFS);
                        if (!page)
                                continue;

                        if (bio && cur != last_index + 1) {
submit_bio_retry:
                                submit_bio(bio);
                                bio = NULL;
                        }

                        if (!bio) {
                                bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);

                                bio->bi_end_io = z_erofs_decompressqueue_endio;
                                bio_set_dev(bio, sb->s_bdev);
                                bio->bi_iter.bi_sector = (sector_t)cur <<
                                        LOG_SECTORS_PER_BLOCK;
                                bio->bi_private = bi_private;
                                bio->bi_opf = REQ_OP_READ;
                                ++nr_bios;
                        }

                        err = bio_add_page(bio, page, PAGE_SIZE, 0);
                        if (err < PAGE_SIZE)
                                goto submit_bio_retry;

                        last_index = cur;
                        bypass = false;
                } while (++cur < end);

                if (!bypass)
                        qtail[JQ_SUBMIT] = &pcl->next;
                else
                        move_to_bypass_jobqueue(pcl, qtail, owned_head);
        } while (owned_head != Z_EROFS_PCLUSTER_TAIL);

        if (bio)
                submit_bio(bio);

        /*
         * although background is preferred, no one is pending for submission.
         * don't issue a workqueue for decompression but drop it directly instead.
         */
        if (!*force_fg && !nr_bios) {
                kvfree(q[JQ_SUBMIT]);
                return;
        }
        z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
}
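
/*
 * Kick off I/O for the whole owned chain, handle the no-I/O (bypass) queue
 * right away, and, for synchronous requests, wait for and decompress the
 * submit queue in the caller context.
 */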
static void z_erofs_runqueue(struct super_block *sb,
                             struct z_erofs_collector *clt,
                             struct list_head *pagepool, bool force_fg)
{
        struct z_erofs_decompressqueue io[NR_JOBQUEUES];

        if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
                return;
        z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg);

        /* handle bypass queue (no i/o pclusters) immediately */
        z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);

        if (!force_fg)
                return;

        /* wait until all bios are completed */
        io_wait_event(io[JQ_SUBMIT].u.wait,
                      !atomic_read(&io[JQ_SUBMIT].pending_bios));

        /* handle synchronous decompress queue in the caller context */
        z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
}
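
/* .readpage entry: read and decompress a single file page synchronously */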
static int z_erofs_readpage(struct file *file, struct page *page)
{
        struct inode *const inode = page->mapping->host;
        struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
        int err;
        LIST_HEAD(pagepool);

        trace_erofs_readpage(page, false);

        f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;

        err = z_erofs_do_read_page(&f, page, &pagepool);
        (void)z_erofs_collector_end(&f.clt);

        /* if some compressed clusters are ready, submit them anyway */
        z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true);

        if (err)
                erofs_err(inode->i_sb, "failed to read, err [%d]", err);

        if (f.map.mpage)
                put_page(f.map.mpage);

        /* clean up the remaining free pages */
        put_pages_list(&pagepool);
        return err;
}

static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
                                            unsigned int nr)
{
        return nr <= sbi->max_sync_decompress_pages;
}

static int z_erofs_readpages(struct file *filp, struct address_space *mapping,
                             struct list_head *pages, unsigned int nr_pages)
{
        struct inode *const inode = mapping->host;
        struct erofs_sb_info *const sbi = EROFS_I_SB(inode);

        bool sync = should_decompress_synchronously(sbi, nr_pages);
        struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
        gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
        struct page *head = NULL;
        LIST_HEAD(pagepool);

        trace_erofs_readpages(mapping->host, lru_to_page(pages),
                              nr_pages, false);

        f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT;

        for (; nr_pages; --nr_pages) {
                struct page *page = lru_to_page(pages);

                prefetchw(&page->flags);
                list_del(&page->lru);

                /*
                 * A pure asynchronous readahead is indicated if
                 * a PG_readahead-marked page is hit first.
                 * Let's also do asynchronous decompression for this case.
                 */
                sync &= !(PageReadahead(page) && !head);

                if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
                        list_add(&page->lru, &pagepool);
                        continue;
                }

                set_page_private(page, (unsigned long)head);
                head = page;
        }

        while (head) {
                struct page *page = head;
                int err;

                /* traversal in reverse order */
                head = (void *)page_private(page);

                err = z_erofs_do_read_page(&f, page, &pagepool);
                if (err)
                        erofs_err(inode->i_sb,
                                  "readahead error at page %lu @ nid %llu",
                                  page->index, EROFS_I(inode)->nid);
                put_page(page);
        }

        (void)z_erofs_collector_end(&f.clt);

        z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync);

        if (f.map.mpage)
                put_page(f.map.mpage);

        /* clean up the remaining free pages */
        put_pages_list(&pagepool);
        return 0;
}

const struct address_space_operations z_erofs_aops = {
        .readpage = z_erofs_readpage,
        .readpages = z_erofs_readpages,
};