// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 HUAWEI, Inc.
 *             http://www.huawei.com/
 * Created by Gao Xiang <gaoxiang25@huawei.com>
 */
#include <linux/prefetch.h>

#include <trace/events/erofs.h>

/*
 * a compressed_pages[] placeholder in order to avoid
 * being filled with file pages for in-place decompression.
 */
#define PAGE_UNALLOCATED	((void *)0x5F0E4B1D)
/* how to allocate cached pages for a pcluster */
enum z_erofs_cache_alloctype {
	DONTALLOC,	/* don't allocate any cached pages */
	DELAYEDALLOC,	/* delayed allocation (at the time of submitting io) */
};
/*
 * tagged pointer with 1-bit tag for all compressed pages
 * tag 0 - the page is just found with an extra page reference
 */
typedef tagptr1_t compressed_page_t;

#define tag_compressed_page_justfound(page) \
	tagptr_fold(compressed_page_t, page, 1)
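
/*
 * A page found in the managed cache is recorded as
 * tag_compressed_page_justfound(page); readers later recover the raw
 * pointer with tagptr_unfold_ptr() and the "justfound" bit with
 * tagptr_unfold_tags(), as pickup_page_for_submission() does below.
 */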
static struct workqueue_struct *z_erofs_workqueue __read_mostly;
static struct kmem_cache *pcluster_cachep __read_mostly;

void z_erofs_exit_zip_subsystem(void)
{
	destroy_workqueue(z_erofs_workqueue);
	kmem_cache_destroy(pcluster_cachep);
}
static inline int z_erofs_init_workqueue(void)
{
	const unsigned int onlinecpus = num_possible_cpus();
	const unsigned int flags = WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE;

	/*
	 * no need to spawn too many threads, limiting threads can minimize
	 * the scheduling overhead; perhaps per-CPU threads would be better?
	 */
	z_erofs_workqueue = alloc_workqueue("erofs_unzipd", flags,
					    onlinecpus + onlinecpus / 4);
	return z_erofs_workqueue ? 0 : -ENOMEM;
}
static void z_erofs_pcluster_init_once(void *ptr)
{
	struct z_erofs_pcluster *pcl = ptr;
	struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
	unsigned int i;

	mutex_init(&cl->lock);
	for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
		pcl->compressed_pages[i] = NULL;
}
static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl)
{
	struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);

	atomic_set(&pcl->obj.refcount, 1);

	DBG_BUGON(cl->nr_pages);
}
int __init z_erofs_init_zip_subsystem(void)
{
	pcluster_cachep = kmem_cache_create("erofs_compress",
					    Z_EROFS_WORKGROUP_SIZE, 0,
					    SLAB_RECLAIM_ACCOUNT,
					    z_erofs_pcluster_init_once);
	if (pcluster_cachep) {
		if (!z_erofs_init_workqueue())
			return 0;

		kmem_cache_destroy(pcluster_cachep);
	}
	return -ENOMEM;
}
enum z_erofs_collectmode {
	COLLECT_SECONDARY,
	COLLECT_PRIMARY,
	/*
	 * The current collection was the tail of an existing chain, and the
	 * previously processed chained collections have all been decided to
	 * be hooked up to it.
	 * A new chain will be created for the remaining collections which are
	 * not processed yet; therefore, different from COLLECT_PRIMARY_FOLLOWED,
	 * the next collection cannot reuse the whole page safely in
	 * the following scenario:
	 *  ________________________________________________________________
	 * |      tail (partial) page     |       head (partial) page       |
	 * |   (belongs to the next cl)   |   (belongs to the current cl)   |
	 * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
	 */
	COLLECT_PRIMARY_HOOKED,
	COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
	/*
	 * The current collection has been linked into the owned chain, and
	 * could also be linked with the remaining collections, which means
	 * that if the page being processed is the tail page of the collection,
	 * the current collection can safely use the whole page (since
	 * the previous collection is under control) for in-place I/O, as
	 * illustrated below:
	 *  ________________________________________________________________
	 * |  tail (partial) page |          head (partial) page           |
	 * |  (of the current cl) |      (of the previous collection)      |
	 * |  PRIMARY_FOLLOWED or |                                        |
	 * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
	 *
	 * [  (*) the above page can be used as inplace I/O.               ]
	 */
	COLLECT_PRIMARY_FOLLOWED,
};
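
/*
 * The relative order of these modes matters: the code below compares them
 * numerically, e.g. "clt->mode >= COLLECT_PRIMARY" to allow in-place I/O
 * and "clt->mode < COLLECT_PRIMARY_FOLLOWED" to skip cache preloading.
 */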
struct z_erofs_collector {
	struct z_erofs_pagevec_ctor vector;

	struct z_erofs_pcluster *pcl, *tailpcl;
	struct z_erofs_collection *cl;
	struct page **compressedpages;
	z_erofs_next_pcluster_t owned_head;

	enum z_erofs_collectmode mode;
};
struct z_erofs_decompress_frontend {
	struct inode *const inode;

	struct z_erofs_collector clt;
	struct erofs_map_blocks map;

	/* used for applying cache strategy on the fly */
	bool backmost;
	erofs_off_t headoffset;
};

#define COLLECTOR_INIT() { \
	.owned_head = Z_EROFS_PCLUSTER_TAIL, \
	.mode = COLLECT_PRIMARY_FOLLOWED }

#define DECOMPRESS_FRONTEND_INIT(__i) { \
	.inode = __i, .clt = COLLECTOR_INIT(), \
	.backmost = true, }
static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
static DEFINE_MUTEX(z_pagemap_global_lock);
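
/*
 * z_pagemap_global is a shared fallback array for pclusters whose output
 * pages do not fit on the stack; it is serialized by z_pagemap_global_lock
 * (see z_erofs_decompress_pcluster() below).
 */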
static void preload_compressed_pages(struct z_erofs_collector *clt,
				     struct address_space *mc,
				     enum z_erofs_cache_alloctype type,
				     struct list_head *pagepool)
{
	const struct z_erofs_pcluster *pcl = clt->pcl;
	const unsigned int clusterpages = BIT(pcl->clusterbits);
	struct page **pages = clt->compressedpages;
	pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
	bool standalone = true;

	if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
		return;

	for (; pages < pcl->compressed_pages + clusterpages; ++pages) {
		struct page *page;
		compressed_page_t t;

		/* the compressed page was loaded before */
		if (READ_ONCE(*pages))
			continue;

		page = find_get_page(mc, index);

		if (page) {
			t = tag_compressed_page_justfound(page);
		} else if (type == DELAYEDALLOC) {
			t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED);
		} else {	/* DONTALLOC */
			if (standalone)
				clt->compressedpages = pages;
			standalone = false;
			continue;
		}

		if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
			continue;

		if (page)
			put_page(page);
	}

	if (standalone)		/* downgrade to PRIMARY_FOLLOWED_NOINPLACE */
		clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
}
/* called by erofs_shrinker to get rid of all compressed_pages */
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
				       struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);
	struct address_space *const mapping = MNGD_MAPPING(sbi);
	const unsigned int clusterpages = BIT(pcl->clusterbits);
	int i;

	/*
	 * refcount of the workgroup is now frozen as 1,
	 * therefore no need to worry about available decompression users.
	 */
	for (i = 0; i < clusterpages; ++i) {
		struct page *page = pcl->compressed_pages[i];

		if (!page)
			continue;

		/* block other users from reclaiming or migrating the page */
		if (!trylock_page(page))
			continue;

		if (page->mapping != mapping)
			continue;

		/* barrier is implied in the following 'unlock_page' */
		WRITE_ONCE(pcl->compressed_pages[i], NULL);
		set_page_private(page, 0);
		ClearPagePrivate(page);

		unlock_page(page);
		put_page(page);
	}
	return 0;
}
int erofs_try_to_free_cached_page(struct address_space *mapping,
				  struct page *page)
{
	struct z_erofs_pcluster *const pcl = (void *)page_private(page);
	const unsigned int clusterpages = BIT(pcl->clusterbits);
	int ret = 0;	/* 0 - busy */

	if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
		unsigned int i;

		for (i = 0; i < clusterpages; ++i) {
			if (pcl->compressed_pages[i] == page) {
				WRITE_ONCE(pcl->compressed_pages[i], NULL);
				ret = 1;
				break;
			}
		}
		erofs_workgroup_unfreeze(&pcl->obj, 1);

		if (ret)
			ClearPagePrivate(page);
	}
	return ret;
}
/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
					  struct page *page)
{
	struct z_erofs_pcluster *const pcl = clt->pcl;
	const unsigned int clusterpages = BIT(pcl->clusterbits);

	while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
		if (!cmpxchg(clt->compressedpages++, NULL, page))
			return true;
	}
	return false;
}
/* callers must hold the collection lock */
static int z_erofs_attach_page(struct z_erofs_collector *clt,
			       struct page *page,
			       enum z_erofs_page_type type)
{
	int ret;
	bool occupied;

	/* give priority to in-place I/O */
	if (clt->mode >= COLLECT_PRIMARY &&
	    type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
	    z_erofs_try_inplace_io(clt, page))
		return 0;

	ret = z_erofs_pagevec_enqueue(&clt->vector,
				      page, type, &occupied);
	clt->cl->vcnt += (unsigned int)ret;

	return ret ? 0 : -EAGAIN;
}
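
/*
 * A zero return above means the page has been queued, either for in-place
 * I/O or into the inline pagevec; -EAGAIN asks the caller to retry with a
 * freshly allocated staging page (see z_erofs_do_read_page()).
 */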
static enum z_erofs_collectmode
try_to_claim_pcluster(struct z_erofs_pcluster *pcl,
		      z_erofs_next_pcluster_t *owned_head)
{
	/* let's claim the following types of pclusters */
retry:
	if (pcl->next == Z_EROFS_PCLUSTER_NIL) {
		/* type 1, nil pcluster */
		if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
			    *owned_head) != Z_EROFS_PCLUSTER_NIL)
			goto retry;

		*owned_head = &pcl->next;
		/* lucky, I am the followee :) */
		return COLLECT_PRIMARY_FOLLOWED;
	} else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) {
		/*
		 * type 2, link to the end of an existing open chain,
		 * be careful that its submission itself is governed
		 * by the original owned chain.
		 */
		if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
			    *owned_head) != Z_EROFS_PCLUSTER_TAIL)
			goto retry;
		*owned_head = Z_EROFS_PCLUSTER_TAIL;
		return COLLECT_PRIMARY_HOOKED;
	}
	return COLLECT_PRIMARY;	/* :( better luck next time */
}
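
/*
 * In short: NIL pclusters start a brand-new chain (PRIMARY_FOLLOWED), TAIL
 * pclusters are hooked onto the end of the caller's chain (PRIMARY_HOOKED),
 * and anything else stays plain PRIMARY so the caller must not reuse its
 * pages for in-place I/O.
 */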
static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
				     struct inode *inode,
				     struct erofs_map_blocks *map)
{
	struct erofs_workgroup *grp;
	struct z_erofs_pcluster *pcl;
	struct z_erofs_collection *cl;
	unsigned int length;

	grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
	if (!grp)
		return -ENOENT;

	pcl = container_of(grp, struct z_erofs_pcluster, obj);
	if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
		DBG_BUGON(1);
		erofs_workgroup_put(grp);
		return -EFSCORRUPTED;
	}

	cl = z_erofs_primarycollection(pcl);
	if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
		DBG_BUGON(1);
		erofs_workgroup_put(grp);
		return -EFSCORRUPTED;
	}

	length = READ_ONCE(pcl->length);
	if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
		if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
			DBG_BUGON(1);
			erofs_workgroup_put(grp);
			return -EFSCORRUPTED;
		}
	} else {
		unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;

		if (map->m_flags & EROFS_MAP_FULL_MAPPED)
			llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;

		while (llen > length &&
		       length != cmpxchg_relaxed(&pcl->length, length, llen)) {
			cpu_relax();
			length = READ_ONCE(pcl->length);
		}
	}
	mutex_lock(&cl->lock);
	/* used to check tail merging loop due to corrupted images */
	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
		clt->tailpcl = pcl;
	clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head);
	/* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */
	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
		clt->tailpcl = NULL;
	clt->pcl = pcl;
	clt->cl = cl;
	return 0;
}
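
/*
 * tailpcl remembers the pcluster claimed at the tail of the chain, so a
 * corrupted image cannot make the chain loop back onto itself (see the
 * -EFSCORRUPTED check above).
 */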
static int z_erofs_register_collection(struct z_erofs_collector *clt,
				       struct inode *inode,
				       struct erofs_map_blocks *map)
{
	struct z_erofs_pcluster *pcl;
	struct z_erofs_collection *cl;
	int err;

	/* no available workgroup, let's allocate one */
	pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
	if (!pcl)
		return -ENOMEM;

	z_erofs_pcluster_init_always(pcl);
	pcl->obj.index = map->m_pa >> PAGE_SHIFT;

	pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
		(map->m_flags & EROFS_MAP_FULL_MAPPED ?
			Z_EROFS_PCLUSTER_FULL_LENGTH : 0);

	if (map->m_flags & EROFS_MAP_ZIPPED)
		pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
	else
		pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;

	pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0];
	pcl->clusterbits -= PAGE_SHIFT;

	/* new pclusters should be claimed as type 1, primary and followed */
	pcl->next = clt->owned_head;
	clt->mode = COLLECT_PRIMARY_FOLLOWED;

	cl = z_erofs_primarycollection(pcl);
	cl->pageofs = map->m_la & ~PAGE_MASK;

	/*
	 * lock all primary followed works before they are visible to others
	 * and mutex_trylock *never* fails for a new pcluster.
	 */
	mutex_trylock(&cl->lock);

	err = erofs_register_workgroup(inode->i_sb, &pcl->obj);
	if (err) {
		mutex_unlock(&cl->lock);
		kmem_cache_free(pcluster_cachep, pcl);
		return -EAGAIN;
	}
	/* used to check tail merging loop due to corrupted images */
	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
		clt->tailpcl = pcl;
	clt->owned_head = &pcl->next;
	clt->pcl = pcl;
	clt->cl = cl;
	return 0;
}
static int z_erofs_collector_begin(struct z_erofs_collector *clt,
				   struct inode *inode,
				   struct erofs_map_blocks *map)
{
	int ret;

	/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
	DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
	DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);

	if (!PAGE_ALIGNED(map->m_pa)) {
		DBG_BUGON(1);
		return -EINVAL;
	}

repeat:
	ret = z_erofs_lookup_collection(clt, inode, map);
	if (ret == -ENOENT) {
		ret = z_erofs_register_collection(clt, inode, map);

		/* someone registered at the same time, give another try */
		if (ret == -EAGAIN) {
			cond_resched();
			goto repeat;
		}
	}

	if (ret)
		return ret;

	z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
				  clt->cl->pagevec, clt->cl->vcnt);

	clt->compressedpages = clt->pcl->compressed_pages;
	if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
		clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES;
	return 0;
}
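
/*
 * Pointing clt->compressedpages past the end of compressed_pages[] turns
 * z_erofs_try_inplace_io() into a no-op, which is how in-place I/O is
 * disabled for non-primary collections.
 */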
/*
 * keep in mind that no referenced pclusters will be freed;
 * they are only released after an RCU grace period.
 */
static void z_erofs_rcu_callback(struct rcu_head *head)
{
	struct z_erofs_collection *const cl =
		container_of(head, struct z_erofs_collection, rcu);

	kmem_cache_free(pcluster_cachep,
			container_of(cl, struct z_erofs_pcluster,
				     primary_collection));
}
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);
	struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl);

	call_rcu(&cl->rcu, z_erofs_rcu_callback);
}
static void z_erofs_collection_put(struct z_erofs_collection *cl)
{
	struct z_erofs_pcluster *const pcl =
		container_of(cl, struct z_erofs_pcluster, primary_collection);

	erofs_workgroup_put(&pcl->obj);
}
static bool z_erofs_collector_end(struct z_erofs_collector *clt)
{
	struct z_erofs_collection *cl = clt->cl;

	if (!cl)
		return false;

	z_erofs_pagevec_ctor_exit(&clt->vector, false);
	mutex_unlock(&cl->lock);

	/*
	 * if all pending pages have been added, don't hold the pcluster
	 * reference any longer unless it is hosted by ourselves.
	 */
	if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
		z_erofs_collection_put(cl);

	clt->cl = NULL;
	return true;
}
static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
				       unsigned int cachestrategy,
				       erofs_off_t la)
{
	if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
		return false;

	if (fe->backmost)
		return true;

	return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
		la < fe->headoffset;
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
				struct page *page,
				struct list_head *pagepool)
{
	struct inode *const inode = fe->inode;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct erofs_map_blocks *const map = &fe->map;
	struct z_erofs_collector *const clt = &fe->clt;
	const loff_t offset = page_offset(page);
	bool tight = true;

	enum z_erofs_cache_alloctype cache_strategy;
	enum z_erofs_page_type page_type;
	unsigned int cur, end, spiltted, index;
	int err = 0;

	/* register locked file pages as online pages in pack */
	z_erofs_onlinepage_init(page);

	spiltted = 0;
	end = PAGE_SIZE;
repeat:
	cur = end - 1;

	/* lucky, within the range of the current map_blocks */
	if (offset + cur >= map->m_la &&
	    offset + cur < map->m_la + map->m_llen) {
		/* didn't get a valid collection previously (very rare) */
		if (!clt->cl)
			goto restart_now;
		goto hitted;
	}

	/* go ahead to the next map_blocks */
	erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);

	if (z_erofs_collector_end(clt))
		fe->backmost = false;

	map->m_la = offset + cur;
	map->m_llen = 0;
	err = z_erofs_map_blocks_iter(inode, map, 0);
	if (err)
		goto err_out;

restart_now:
	if (!(map->m_flags & EROFS_MAP_MAPPED))
		goto hitted;

	err = z_erofs_collector_begin(clt, inode, map);
	if (err)
		goto err_out;

	/* preload all compressed pages (maybe downgrade role if necessary) */
	if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la))
		cache_strategy = DELAYEDALLOC;
	else
		cache_strategy = DONTALLOC;

	preload_compressed_pages(clt, MNGD_MAPPING(sbi),
				 cache_strategy, pagepool);

hitted:
	/*
	 * Ensure the current partial page belongs to this submit chain rather
	 * than other concurrent submit chains or the noio (bypass) chain,
	 * since those chains are handled asynchronously and thus the page
	 * cannot be used for in-place I/O or the pagevec (they must be
	 * processed in strict order).
	 */
	tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
		  clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);

	cur = end - min_t(unsigned int, offset + end - map->m_la, end);
	if (!(map->m_flags & EROFS_MAP_MAPPED)) {
		zero_user_segment(page, cur, end);
		goto next_part;
	}

	/* let's derive the page type */
	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
			(tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));

	if (cur)
		tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);

retry:
	err = z_erofs_attach_page(clt, page, page_type);
	/* should allocate an additional staging page for pagevec */
	if (err == -EAGAIN) {
		struct page *const newpage =
			erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL);

		newpage->mapping = Z_EROFS_MAPPING_STAGING;
		err = z_erofs_attach_page(clt, newpage,
					  Z_EROFS_PAGE_TYPE_EXCLUSIVE);
		if (!err)
			goto retry;
	}

	if (err)
		goto err_out;

	index = page->index - (map->m_la >> PAGE_SHIFT);

	z_erofs_onlinepage_fixup(page, index, true);

	/* bump up the number of spiltted parts of a page */
	++spiltted;
	/* also update nr_pages */
	clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
next_part:
	/* can be used for verification */
	map->m_llen = offset + cur - map->m_la;

	end = cur;
	if (end > 0)
		goto repeat;

out:
	z_erofs_onlinepage_endio(page);

	erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
		  __func__, page, spiltted, map->m_llen);
	return err;

	/* if some error occurred while processing this page */
err_out:
	SetPageError(page);
	goto out;
}
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
				       bool sync, int bios)
{
	/* wake up the caller thread for sync decompression */
	if (sync) {
		unsigned long flags;

		spin_lock_irqsave(&io->u.wait.lock, flags);
		if (!atomic_add_return(bios, &io->pending_bios))
			wake_up_locked(&io->u.wait);
		spin_unlock_irqrestore(&io->u.wait.lock, flags);
		return;
	}

	if (!atomic_add_return(bios, &io->pending_bios))
		queue_work(z_erofs_workqueue, &io->u.work);
}
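
/*
 * Once pending_bios drops to zero, either the waiting reader is woken up
 * (sync case) or the decompression work item is queued on
 * z_erofs_workqueue (async case).
 */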
static void z_erofs_decompressqueue_endio(struct bio *bio)
{
	tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
	struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
	blk_status_t err = bio->bi_status;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;

		DBG_BUGON(PageUptodate(page));
		DBG_BUGON(!page->mapping);

		if (err)
			SetPageError(page);

		if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
			if (!err)
				SetPageUptodate(page);
			unlock_page(page);
		}
	}
	z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
	bio_put(bio);
}
static int z_erofs_decompress_pcluster(struct super_block *sb,
				       struct z_erofs_pcluster *pcl,
				       struct list_head *pagepool)
{
	struct erofs_sb_info *const sbi = EROFS_SB(sb);
	const unsigned int clusterpages = BIT(pcl->clusterbits);
	struct z_erofs_pagevec_ctor ctor;
	unsigned int i, outputsize, llen, nr_pages;
	struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
	struct page **pages, **compressed_pages, *page;

	enum z_erofs_page_type page_type;
	bool overlapped, partial;
	struct z_erofs_collection *cl;
	int err;

	might_sleep();
	cl = z_erofs_primarycollection(pcl);
	DBG_BUGON(!READ_ONCE(cl->nr_pages));

	mutex_lock(&cl->lock);
	nr_pages = cl->nr_pages;

	if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) {
		pages = pages_onstack;
	} else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
		   mutex_trylock(&z_pagemap_global_lock)) {
		pages = z_pagemap_global;
	} else {
		gfp_t gfp_flags = GFP_KERNEL;

		if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
			gfp_flags |= __GFP_NOFAIL;

		pages = kvmalloc_array(nr_pages, sizeof(struct page *),
				       gfp_flags);

		/* fallback to global pagemap for the lowmem scenario */
		if (!pages) {
			mutex_lock(&z_pagemap_global_lock);
			pages = z_pagemap_global;
		}
	}

	for (i = 0; i < nr_pages; ++i)
		pages[i] = NULL;

	err = 0;
	z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
				  cl->pagevec, 0);

	for (i = 0; i < cl->vcnt; ++i) {
		unsigned int pagenr;

		page = z_erofs_pagevec_dequeue(&ctor, &page_type);

		/* all pages in pagevec ought to be valid */
		DBG_BUGON(!page);
		DBG_BUGON(!page->mapping);

		if (z_erofs_put_stagingpage(pagepool, page))
			continue;

		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
			pagenr = 0;
		else
			pagenr = z_erofs_onlinepage_index(page);

		DBG_BUGON(pagenr >= nr_pages);

		/*
		 * currently EROFS doesn't support multiref (dedup),
		 * so here we error out on a multiref page.
		 */
		if (pages[pagenr]) {
			DBG_BUGON(1);
			SetPageError(pages[pagenr]);
			z_erofs_onlinepage_endio(pages[pagenr]);
			err = -EFSCORRUPTED;
		}
		pages[pagenr] = page;
	}
	z_erofs_pagevec_ctor_exit(&ctor, true);

	overlapped = false;
	compressed_pages = pcl->compressed_pages;

	for (i = 0; i < clusterpages; ++i) {
		unsigned int pagenr;

		page = compressed_pages[i];

		/* all compressed pages ought to be valid */
		DBG_BUGON(!page);
		DBG_BUGON(!page->mapping);

		if (!z_erofs_page_is_staging(page)) {
			if (erofs_page_is_managed(sbi, page)) {
				if (!PageUptodate(page))
					err = -EIO;
				continue;
			}

			/*
			 * only non-head pages can be selected
			 * for inplace decompression
			 */
			pagenr = z_erofs_onlinepage_index(page);

			DBG_BUGON(pagenr >= nr_pages);
			if (pages[pagenr]) {
				DBG_BUGON(1);
				SetPageError(pages[pagenr]);
				z_erofs_onlinepage_endio(pages[pagenr]);
				err = -EFSCORRUPTED;
			}
			pages[pagenr] = page;

			overlapped = true;
		}

		/* PG_error needs checking for inplaced and staging pages */
		if (PageError(page)) {
			DBG_BUGON(PageUptodate(page));
			err = -EIO;
		}
	}

	if (err)
		goto out;

	llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
	if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) {
		outputsize = llen;
		partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
	} else {
		outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs;
		partial = true;
	}

	err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
					.sb = sb,
					.in = compressed_pages,
					.out = pages,
					.pageofs_out = cl->pageofs,
					.inputsize = PAGE_SIZE,
					.outputsize = outputsize,
					.alg = pcl->algorithmformat,
					.inplace_io = overlapped,
					.partial_decoding = partial
				 }, pagepool);

out:
	/* must handle all compressed pages before ending pages */
	for (i = 0; i < clusterpages; ++i) {
		page = compressed_pages[i];

		if (erofs_page_is_managed(sbi, page))
			continue;

		/* recycle all individual staging pages */
		(void)z_erofs_put_stagingpage(pagepool, page);

		WRITE_ONCE(compressed_pages[i], NULL);
	}

	for (i = 0; i < nr_pages; ++i) {
		page = pages[i];
		if (!page)
			continue;

		DBG_BUGON(!page->mapping);

		/* recycle all individual staging pages */
		if (z_erofs_put_stagingpage(pagepool, page))
			continue;

		if (err < 0)
			SetPageError(page);

		z_erofs_onlinepage_endio(page);
	}

	if (pages == z_pagemap_global)
		mutex_unlock(&z_pagemap_global_lock);
	else if (pages != pages_onstack)
		kvfree(pages);

	cl->nr_pages = 0;
	cl->vcnt = 0;

	/* all cl locks MUST be taken before the following line */
	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);

	/* all cl locks SHOULD be released right now */
	mutex_unlock(&cl->lock);

	z_erofs_collection_put(cl);
	return err;
}
static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
				     struct list_head *pagepool)
{
	z_erofs_next_pcluster_t owned = io->head;

	while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
		struct z_erofs_pcluster *pcl;

		/* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);

		/* impossible that 'owned' equals NULL */
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);

		pcl = container_of(owned, struct z_erofs_pcluster, next);
		owned = READ_ONCE(pcl->next);

		z_erofs_decompress_pcluster(io->sb, pcl, pagepool);
	}
}
static void z_erofs_decompressqueue_work(struct work_struct *work)
{
	struct z_erofs_decompressqueue *bgq =
		container_of(work, struct z_erofs_decompressqueue, u.work);
	LIST_HEAD(pagepool);

	DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	z_erofs_decompress_queue(bgq, &pagepool);

	put_pages_list(&pagepool);
	kvfree(bgq);
}
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
					       unsigned int nr,
					       struct list_head *pagepool,
					       struct address_space *mc,
					       gfp_t gfp)
{
	const pgoff_t index = pcl->obj.index;
	bool tocache = false;

	struct address_space *mapping;
	struct page *oldpage, *page;

	compressed_page_t t;
	int justfound;

repeat:
	page = READ_ONCE(pcl->compressed_pages[nr]);
	oldpage = page;

	if (!page)
		goto out_allocpage;

	/*
	 * the cached page has not been allocated and
	 * a placeholder is out there, prepare it now.
	 */
	if (page == PAGE_UNALLOCATED) {
		tocache = true;
		goto out_allocpage;
	}

	/* process the target tagged pointer */
	t = tagptr_init(compressed_page_t, page);
	justfound = tagptr_unfold_tags(t);
	page = tagptr_unfold_ptr(t);

	mapping = READ_ONCE(page->mapping);

	/*
	 * unmanaged (file) pages are all locked solidly,
	 * therefore it is impossible for `mapping' to be NULL.
	 */
	if (mapping && mapping != mc)
		/* ought to be unmanaged pages */
		goto out;

	lock_page(page);

	/* only true if page reclaim goes wrong, should never happen */
	DBG_BUGON(justfound && PagePrivate(page));

	/* the page is still in the managed cache */
	if (page->mapping == mc) {
		WRITE_ONCE(pcl->compressed_pages[nr], page);

		ClearPageError(page);
		if (!PagePrivate(page)) {
			/*
			 * impossible to be !PagePrivate(page) under
			 * the current restriction as well if
			 * the page is already in compressed_pages[].
			 */
			DBG_BUGON(!justfound);

			justfound = 0;
			set_page_private(page, (unsigned long)pcl);
			SetPagePrivate(page);
		}

		/* no need to submit io if it is already up-to-date */
		if (PageUptodate(page)) {
			unlock_page(page);
			page = NULL;
		}
		goto out;
	}

	/*
	 * the managed page has been truncated, it's unsafe to
	 * reuse this one, let's allocate a new cache-managed page.
	 */
	DBG_BUGON(page->mapping);
	DBG_BUGON(!justfound);

	tocache = true;
	unlock_page(page);
	put_page(page);
out_allocpage:
	page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
	if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
		/* non-LRU / non-movable temporary page is needed */
		page->mapping = Z_EROFS_MAPPING_STAGING;
		tocache = false;
	}

	if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
		if (tocache) {
			/* since it was added to the managed cache successfully */
			unlock_page(page);
			put_page(page);
		} else {
			list_add(&page->lru, pagepool);
		}
		cond_resched();
		goto repeat;
	}
	set_page_private(page, (unsigned long)pcl);
	SetPagePrivate(page);
out:	/* the only exit (for tracing and debugging) */
	return page;
}
static struct z_erofs_decompressqueue *
jobqueue_init(struct super_block *sb,
	      struct z_erofs_decompressqueue *fgq, bool *fg)
{
	struct z_erofs_decompressqueue *q;

	if (fg && !*fg) {
		q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
		if (!q) {
			*fg = true;
			goto fg_out;
		}
		INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
	} else {
fg_out:
		q = fgq;
		init_waitqueue_head(&fgq->u.wait);
		atomic_set(&fgq->pending_bios, 0);
	}
	q->sb = sb;
	q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
	return q;
}
/* define decompression jobqueue types */
enum {
	JQ_BYPASS,
	JQ_SUBMIT,
	NR_JOBQUEUES,
};

static void *jobqueueset_init(struct super_block *sb,
			      struct z_erofs_decompressqueue *q[],
			      struct z_erofs_decompressqueue *fgq, bool *fg)
{
	/*
	 * if the managed cache is enabled, the bypass jobqueue is needed;
	 * no need to read from the device for any pcluster in this queue.
	 */
	q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
	q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg);

	return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg));
}
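
/*
 * The foreground flag is folded into the low bit of the returned pointer
 * (used as bi_private), so the bio completion handler can tell synchronous
 * readers from background work without an extra allocation.
 */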
static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
				    z_erofs_next_pcluster_t qtail[],
				    z_erofs_next_pcluster_t owned_head)
{
	z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
	z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];

	DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	if (owned_head == Z_EROFS_PCLUSTER_TAIL)
		owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;

	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);

	WRITE_ONCE(*submit_qtail, owned_head);
	WRITE_ONCE(*bypass_qtail, &pcl->next);

	qtail[JQ_BYPASS] = &pcl->next;
}
static void z_erofs_submit_queue(struct super_block *sb,
				 z_erofs_next_pcluster_t owned_head,
				 struct list_head *pagepool,
				 struct z_erofs_decompressqueue *fgq,
				 bool *force_fg)
{
	struct erofs_sb_info *const sbi = EROFS_SB(sb);
	z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
	struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
	void *bi_private;
	/* since bio will be NULL, no need to initialize last_index */
	pgoff_t uninitialized_var(last_index);
	unsigned int nr_bios = 0;
	struct bio *bio = NULL;

	bi_private = jobqueueset_init(sb, q, fgq, force_fg);
	qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
	qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;

	/* by default, all need io submission */
	q[JQ_SUBMIT]->head = owned_head;

	do {
		struct z_erofs_pcluster *pcl;
		pgoff_t cur, end;
		unsigned int i = 0;
		bool bypass = true;

		/* no possible 'owned_head' equals the following */
		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);

		pcl = container_of(owned_head, struct z_erofs_pcluster, next);

		cur = pcl->obj.index;
		end = cur + BIT(pcl->clusterbits);

		/* close the main owned chain at first */
		owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
				     Z_EROFS_PCLUSTER_TAIL_CLOSED);

		do {
			struct page *page;
			int err;

			page = pickup_page_for_submission(pcl, i++, pagepool,
							  MNGD_MAPPING(sbi),
							  GFP_NOFS);
			if (!page)
				continue;

			if (bio && cur != last_index + 1) {
submit_bio_retry:
				submit_bio(bio);
				bio = NULL;
			}

			if (!bio) {
				bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);

				bio->bi_end_io = z_erofs_decompressqueue_endio;
				bio_set_dev(bio, sb->s_bdev);
				bio->bi_iter.bi_sector = (sector_t)cur <<
					LOG_SECTORS_PER_BLOCK;
				bio->bi_private = bi_private;
				bio->bi_opf = REQ_OP_READ;
				++nr_bios;
			}

			err = bio_add_page(bio, page, PAGE_SIZE, 0);
			if (err < PAGE_SIZE)
				goto submit_bio_retry;

			last_index = cur;
			bypass = false;
		} while (++cur < end);

		if (!bypass)
			qtail[JQ_SUBMIT] = &pcl->next;
		else
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
	} while (owned_head != Z_EROFS_PCLUSTER_TAIL);

	if (bio)
		submit_bio(bio);

	/*
	 * although background is preferred, no one is pending for submission.
	 * don't issue workqueue for decompression but drop it directly instead.
	 */
	if (!*force_fg && !nr_bios) {
		kvfree(q[JQ_SUBMIT]);
		return;
	}
	z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
}
static void z_erofs_runqueue(struct super_block *sb,
			     struct z_erofs_collector *clt,
			     struct list_head *pagepool, bool force_fg)
{
	struct z_erofs_decompressqueue io[NR_JOBQUEUES];

	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
		return;
	z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg);

	/* handle the bypass queue (no i/o pclusters) immediately */
	z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);

	if (!force_fg)
		return;

	/* wait until all bios are completed */
	io_wait_event(io[JQ_SUBMIT].u.wait,
		      !atomic_read(&io[JQ_SUBMIT].pending_bios));

	/* handle the synchronous decompress queue in the caller context */
	z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
}
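
/*
 * The bypass queue holds pclusters whose compressed pages are already
 * available, so they are decompressed right here; the submit queue is
 * decompressed in the caller context only in the synchronous (force_fg)
 * case, otherwise by the workqueue.
 */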
static int z_erofs_readpage(struct file *file, struct page *page)
{
	struct inode *const inode = page->mapping->host;
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	int err;
	LIST_HEAD(pagepool);

	trace_erofs_readpage(page, false);

	f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;

	err = z_erofs_do_read_page(&f, page, &pagepool);
	(void)z_erofs_collector_end(&f.clt);

	/* if some compressed clusters are ready, submit them anyway */
	z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true);

	if (err)
		erofs_err(inode->i_sb, "failed to read, err [%d]", err);

	if (f.map.mpage)
		put_page(f.map.mpage);

	/* clean up the remaining free pages */
	put_pages_list(&pagepool);
	return err;
}
static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
					    unsigned int nr)
{
	return nr <= sbi->max_sync_decompress_pages;
}
static int z_erofs_readpages(struct file *filp, struct address_space *mapping,
			     struct list_head *pages, unsigned int nr_pages)
{
	struct inode *const inode = mapping->host;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);

	bool sync = should_decompress_synchronously(sbi, nr_pages);
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
	struct page *head = NULL;
	LIST_HEAD(pagepool);

	trace_erofs_readpages(mapping->host, lru_to_page(pages),
			      nr_pages, false);

	f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT;

	for (; nr_pages; --nr_pages) {
		struct page *page = lru_to_page(pages);

		prefetchw(&page->flags);
		list_del(&page->lru);

		/*
		 * A pure asynchronous readahead is indicated if
		 * a PG_readahead marked page is hit first.
		 * Let's also do asynchronous decompression for this case.
		 */
		sync &= !(PageReadahead(page) && !head);

		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
			list_add(&page->lru, &pagepool);
			continue;
		}

		set_page_private(page, (unsigned long)head);
		head = page;
	}

	while (head) {
		struct page *page = head;
		int err;

		/* traversal in reverse order */
		head = (void *)page_private(page);

		err = z_erofs_do_read_page(&f, page, &pagepool);
		if (err)
			erofs_err(inode->i_sb,
				  "readahead error at page %lu @ nid %llu",
				  page->index, EROFS_I(inode)->nid);
		put_page(page);
	}

	(void)z_erofs_collector_end(&f.clt);

	z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync);

	if (f.map.mpage)
		put_page(f.map.mpage);

	/* clean up the remaining free pages */
	put_pages_list(&pagepool);
	return 0;
}
const struct address_space_operations z_erofs_aops = {
	.readpage = z_erofs_readpage,
	.readpages = z_erofs_readpages,
};