// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 HUAWEI, Inc.
 *             https://www.huawei.com/
 * Copyright (C) 2022 Alibaba Cloud
 */
#include <linux/cpuhotplug.h>
#include <trace/events/erofs.h>
#define Z_EROFS_PCLUSTER_MAX_PAGES	(Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
#define Z_EROFS_INLINE_BVECS		2
/*
 * let's leave a type here in case of introducing
 * another tagged pointer later.
 */
typedef void *z_erofs_next_pcluster_t;
#define __Z_EROFS_BVSET(name, total) \
struct name { \
	/* point to the next page which contains the following bvecs */ \
	struct page *nextpage; \
	struct z_erofs_bvec bvec[total]; \
}
__Z_EROFS_BVSET(z_erofs_bvset,);
__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
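
/*
 * For reference, the second instantiation above expands to roughly the
 * following structure (an illustrative sketch, not an extra definition):
 *
 *	struct z_erofs_bvset_inline {
 *		struct page *nextpage;
 *		struct z_erofs_bvec bvec[Z_EROFS_INLINE_BVECS];
 *	};
 *
 * i.e. each pcluster can keep Z_EROFS_INLINE_BVECS (2) bvecs inline before
 * further bvecs spill into chained pages through ->nextpage.
 */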
/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only
 *    for everyone else;
 *
 * L: Field should be protected by the pcluster lock;
 *
 * A: Field should be accessed / updated in atomics for parallelized code.
 */
struct z_erofs_pcluster {
	struct lockref lockref;

	/* A: point to next chained pcluster or TAILs */
	z_erofs_next_pcluster_t next;

	/* I: start block address of this pcluster */

	/* L: the maximum decompression size of this round */
	unsigned int length;

	/* L: total number of bvecs */
	unsigned int vcnt;

	/* I: pcluster size (compressed size) in bytes */
	unsigned int pclustersize;

	/* I: page offset of start position of decompression */
	unsigned short pageofs_out;

	/* I: page offset of inline compressed data */
	unsigned short pageofs_in;

	union {
		/* L: inline a certain number of bvec for bootstrap */
		struct z_erofs_bvset_inline bvset;

		/* I: can be used to free the pcluster by RCU. */
		struct rcu_head rcu;
	};

	/* I: compression algorithm format */
	unsigned char algorithmformat;

	/* L: whether partial decompression or not */
	bool partial;

	/* L: indicate several pageofs_outs or not */
	bool multibases;

	/* L: whether extra buffer allocations are best-effort */
	bool besteffort;

	/* A: compressed bvecs (can be cached or inplaced pages) */
	struct z_erofs_bvec compressed_bvecs[];
};
/* the end of a chain of pclusters */
#define Z_EROFS_PCLUSTER_TAIL		((void *) 0x700 + POISON_POINTER_DELTA)
#define Z_EROFS_PCLUSTER_NIL		(NULL)
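
/*
 * Illustrative note (a sketch, not part of the original logic): pclusters
 * queued for decompression are singly chained through ->next, with
 * Z_EROFS_PCLUSTER_TAIL terminating a chain and Z_EROFS_PCLUSTER_NIL marking
 * a pcluster that is not linked into any chain yet.  Assuming the caller
 * already holds references on every chained pcluster, a walk looks like:
 *
 *	z_erofs_next_pcluster_t next = head;
 *
 *	while (next != Z_EROFS_PCLUSTER_TAIL) {
 *		struct z_erofs_pcluster *pcl =
 *			container_of(next, struct z_erofs_pcluster, next);
 *		next = READ_ONCE(pcl->next);
 *		... process pcl ...
 *	}
 *
 * which mirrors the traversals in z_erofs_decompress_queue() and
 * z_erofs_submit_queue() later in this file.
 */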
struct z_erofs_decompressqueue {
	struct super_block *sb;
	atomic_t pending_bios;
	z_erofs_next_pcluster_t head;

	union {
		struct completion done;
		struct work_struct work;
		struct kthread_work kthread_work;
	} u;
	bool eio, sync;
};
static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
{
	return !pcl->index;
}

static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
{
	return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
}

static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
{
	return fo->mapping == MNGD_MAPPING(sbi);
}
#define Z_EROFS_ONSTACK_PAGES		32
/*
 * since pclustersize is variable for big pcluster feature, introduce slab
 * pools implementation for different pcluster sizes.
 */
struct z_erofs_pcluster_slab {
	struct kmem_cache *slab;
	unsigned int maxpages;
	char name[48];
};

#define _PCLP(n) { .maxpages = n }

static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
	_PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
	_PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
};
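
/*
 * Worked example (illustrative only): with 4KiB pages, a 48KiB pcluster
 * needs 12 pages, which does not fit the 1- or 4-page classes, so
 * z_erofs_alloc_pcluster() below takes it from the 16-page slab; requests
 * larger than Z_EROFS_PCLUSTER_MAX_PAGES match no class and fail with
 * -EINVAL.
 */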
struct z_erofs_bvec_iter {
	struct page *bvpage;
	struct z_erofs_bvset *bvset;
	unsigned int nr, cur;
};
static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
{
	if (iter->bvpage)
		kunmap_local(iter->bvset);
	return iter->bvpage;
}
static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
{
	unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
	/* have to access nextpage in advance, otherwise it will be unmapped */
	struct page *nextpage = iter->bvset->nextpage;
	struct page *oldpage;

	DBG_BUGON(!nextpage);
	oldpage = z_erofs_bvec_iter_end(iter);
	iter->bvpage = nextpage;
	iter->bvset = kmap_local_page(nextpage);
	iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
	iter->cur = 0;
	return oldpage;
}
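
/*
 * Sizing note (a sketch under the assumption of 4KiB pages and a 16-byte
 * struct z_erofs_bvec): the offsetof() trick above skips the leading
 * ->nextpage pointer, so each chained bvset page carries
 * (PAGE_SIZE - offsetof(struct z_erofs_bvset, bvec)) / sizeof(struct z_erofs_bvec)
 * entries, e.g. (4096 - 8) / 16 = 255 bvecs per page.
 */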
static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
				    struct z_erofs_bvset_inline *bvset,
				    unsigned int bootstrap_nr, unsigned int cur)
{
	*iter = (struct z_erofs_bvec_iter) {
		.nr = bootstrap_nr,
		.bvset = (struct z_erofs_bvset *)bvset,
	};

	while (cur > iter->nr) {
		cur -= iter->nr;
		z_erofs_bvset_flip(iter);
	}
	iter->cur = cur;
}
static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
				struct z_erofs_bvec *bvec,
				struct page **candidate_bvpage,
				struct page **pagepool)
{
	if (iter->cur >= iter->nr) {
		struct page *nextpage = *candidate_bvpage;

		if (!nextpage) {
			nextpage = __erofs_allocpage(pagepool, GFP_KERNEL,
						     true);
			if (!nextpage)
				return -ENOMEM;
			set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
		}
		DBG_BUGON(iter->bvset->nextpage);
		iter->bvset->nextpage = nextpage;
		z_erofs_bvset_flip(iter);

		iter->bvset->nextpage = NULL;
		*candidate_bvpage = NULL;
	}
	iter->bvset->bvec[iter->cur++] = *bvec;
	return 0;
}
static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
				 struct z_erofs_bvec *bvec,
				 struct page **old_bvpage)
{
	if (iter->cur == iter->nr)
		*old_bvpage = z_erofs_bvset_flip(iter);
	else
		*old_bvpage = NULL;

	*bvec = iter->bvset->bvec[iter->cur++];
}
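
/*
 * Usage sketch (illustrative): a caller typically collects decompressed
 * bvecs with z_erofs_bvec_enqueue() while scanning folios, then replays
 * them in the same order on the decompression side:
 *
 *	struct z_erofs_bvec_iter biter;
 *	struct z_erofs_bvec bvec;
 *	struct page *old_bvpage;
 *
 *	z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
 *	z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
 *
 * which mirrors what z_erofs_parse_out_bvecs() does later in this file.
 */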
static void z_erofs_destroy_pcluster_pool(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		if (!pcluster_pool[i].slab)
			continue;
		kmem_cache_destroy(pcluster_pool[i].slab);
		pcluster_pool[i].slab = NULL;
	}
}
static int z_erofs_create_pcluster_pool(void)
{
	struct z_erofs_pcluster_slab *pcs;
	struct z_erofs_pcluster *a;
	unsigned int size;

	for (pcs = pcluster_pool;
	     pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
		size = struct_size(a, compressed_bvecs, pcs->maxpages);

		sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
		pcs->slab = kmem_cache_create(pcs->name, size, 0,
					      SLAB_RECLAIM_ACCOUNT, NULL);
		if (pcs->slab)
			continue;

		z_erofs_destroy_pcluster_pool();
		return -ENOMEM;
	}
	return 0;
}
static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
{
	unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT;
	struct z_erofs_pcluster_slab *pcs = pcluster_pool;

	for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
		struct z_erofs_pcluster *pcl;

		if (nrpages > pcs->maxpages)
			continue;

		pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
		if (!pcl)
			return ERR_PTR(-ENOMEM);
		pcl->pclustersize = size;
		return pcl;
	}
	return ERR_PTR(-EINVAL);
}
static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
{
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;

		if (pclusterpages > pcs->maxpages)
			continue;

		kmem_cache_free(pcs->slab, pcl);
		return;
	}
	DBG_BUGON(1);
}
static struct workqueue_struct *z_erofs_workqueue __read_mostly;

#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
static struct kthread_worker __rcu **z_erofs_pcpu_workers;
static void erofs_destroy_percpu_workers(void)
{
	struct kthread_worker *worker;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		worker = rcu_dereference_protected(
					z_erofs_pcpu_workers[cpu], 1);
		rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
		if (worker)
			kthread_destroy_worker(worker);
	}
	kfree(z_erofs_pcpu_workers);
}
static struct kthread_worker *erofs_init_percpu_worker(int cpu)
{
	struct kthread_worker *worker =
		kthread_run_worker_on_cpu(cpu, 0, "erofs_worker/%u");

	if (IS_ERR(worker))
		return worker;
	if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI))
		sched_set_fifo_low(worker->task);
	return worker;
}
static int erofs_init_percpu_workers(void)
{
	struct kthread_worker *worker;
	unsigned int cpu;

	z_erofs_pcpu_workers = kcalloc(num_possible_cpus(),
			sizeof(struct kthread_worker *), GFP_ATOMIC);
	if (!z_erofs_pcpu_workers)
		return -ENOMEM;

	for_each_online_cpu(cpu) {	/* could miss cpu{off,on}line? */
		worker = erofs_init_percpu_worker(cpu);
		if (!IS_ERR(worker))
			rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
	}
	return 0;
}
#else
static inline void erofs_destroy_percpu_workers(void) {}
static inline int erofs_init_percpu_workers(void) { return 0; }
#endif
#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
static enum cpuhp_state erofs_cpuhp_state;
static int erofs_cpu_online(unsigned int cpu)
{
	struct kthread_worker *worker, *old;

	worker = erofs_init_percpu_worker(cpu);
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	spin_lock(&z_erofs_pcpu_worker_lock);
	old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
			lockdep_is_held(&z_erofs_pcpu_worker_lock));
	if (!old)
		rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
	spin_unlock(&z_erofs_pcpu_worker_lock);
	if (old)
		kthread_destroy_worker(worker);
	return 0;
}
static int erofs_cpu_offline(unsigned int cpu)
{
	struct kthread_worker *worker;

	spin_lock(&z_erofs_pcpu_worker_lock);
	worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
			lockdep_is_held(&z_erofs_pcpu_worker_lock));
	rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
	spin_unlock(&z_erofs_pcpu_worker_lock);

	if (worker)
		kthread_destroy_worker(worker);
	return 0;
}
static int erofs_cpu_hotplug_init(void)
{
	int state;

	state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
			"fs/erofs:online", erofs_cpu_online, erofs_cpu_offline);
	if (state < 0)
		return state;

	erofs_cpuhp_state = state;
	return 0;
}

static void erofs_cpu_hotplug_destroy(void)
{
	if (erofs_cpuhp_state)
		cpuhp_remove_state_nocalls(erofs_cpuhp_state);
}
#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
static inline int erofs_cpu_hotplug_init(void) { return 0; }
static inline void erofs_cpu_hotplug_destroy(void) {}
#endif
void z_erofs_exit_subsystem(void)
{
	erofs_cpu_hotplug_destroy();
	erofs_destroy_percpu_workers();
	destroy_workqueue(z_erofs_workqueue);
	z_erofs_destroy_pcluster_pool();
	z_erofs_exit_decompressor();
}
int __init z_erofs_init_subsystem(void)
{
	int err = z_erofs_init_decompressor();

	if (err)
		goto err_decompressor;
	err = z_erofs_create_pcluster_pool();
	if (err)
		goto err_pcluster_pool;
	z_erofs_workqueue = alloc_workqueue("erofs_worker",
			WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
	if (!z_erofs_workqueue) {
		err = -ENOMEM;
		goto err_workqueue_init;
	}
	err = erofs_init_percpu_workers();
	if (err)
		goto err_pcpu_worker;
	err = erofs_cpu_hotplug_init();
	if (err < 0)
		goto err_cpuhp_init;
	return err;

err_cpuhp_init:
	erofs_destroy_percpu_workers();
err_pcpu_worker:
	destroy_workqueue(z_erofs_workqueue);
err_workqueue_init:
	z_erofs_destroy_pcluster_pool();
err_pcluster_pool:
	z_erofs_exit_decompressor();
err_decompressor:
	return err;
}
enum z_erofs_pclustermode {
	Z_EROFS_PCLUSTER_INFLIGHT,
	/*
	 * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
	 * could be dispatched into bypass queue later due to uptodated managed
	 * pages. All related online pages cannot be reused for inplace I/O (or
	 * bvpage) since it can be directly decoded without I/O submission.
	 */
	Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
	/*
	 * The pcluster was just linked to a decompression chain by us. It can
	 * also be linked with the remaining pclusters, which means if the
	 * processing page is the tail page of a pcluster, this pcluster can
	 * safely use the whole page (since the previous pcluster is within the
	 * same chain) for in-place I/O, as illustrated below:
	 *  ___________________________________________________
	 * |  tail (partial) page  |    head (partial) page    |
	 * |  (of the current pcl) |   (of the previous pcl)   |
	 * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____|
	 *
	 * [  (*) the page above can be used as inplace I/O.   ]
	 */
	Z_EROFS_PCLUSTER_FOLLOWED,
};
struct z_erofs_decompress_frontend {
	struct inode *const inode;
	struct erofs_map_blocks map;
	struct z_erofs_bvec_iter biter;

	struct page *pagepool;
	struct page *candidate_bvpage;
	struct z_erofs_pcluster *pcl;
	z_erofs_next_pcluster_t owned_head;
	enum z_erofs_pclustermode mode;

	erofs_off_t headoffset;

	/* a pointer used to pick up inplace I/O pages */
	unsigned int icur;
};

#define DECOMPRESS_FRONTEND_INIT(__i) { \
	.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
	.mode = Z_EROFS_PCLUSTER_FOLLOWED }
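
/*
 * Lifecycle sketch (illustrative only, mirroring z_erofs_read_folio()
 * later in this file):
 *
 *	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
 *
 *	f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
 *	err = z_erofs_scan_folio(&f, folio, false);
 *	z_erofs_pcluster_end(&f);
 *	err = z_erofs_runqueue(&f, 0) ?: err;
 *	erofs_put_metabuf(&f.map.buf);
 *	erofs_release_pages(&f.pagepool);
 */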
static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
{
	unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;

	if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
		return false;

	if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED))
		return false;

	if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
	    fe->map.m_la < fe->headoffset)
		return true;

	return false;
}
static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
{
	struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
	struct z_erofs_pcluster *pcl = fe->pcl;
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	bool shouldalloc = z_erofs_should_alloc_cache(fe);
	bool standalone = true;
	/*
	 * optimistic allocation without direct reclaim since inplace I/O
	 * can be used if low memory otherwise.
	 */
	gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
			__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
	unsigned int i;

	if (i_blocksize(fe->inode) != PAGE_SIZE ||
	    fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
		return;

	for (i = 0; i < pclusterpages; ++i) {
		struct page *page, *newpage;

		/* Inaccurate check w/o locking to avoid unneeded lookups */
		if (READ_ONCE(pcl->compressed_bvecs[i].page))
			continue;

		page = find_get_page(mc, pcl->index + i);
		if (!page) {
			/* I/O is needed, not possible to decompress directly */
			standalone = false;
			if (!shouldalloc)
				continue;

			/*
			 * Try cached I/O if allocation succeeds or fallback to
			 * in-place I/O instead to avoid any direct reclaim.
			 */
			newpage = erofs_allocpage(&fe->pagepool, gfp);
			if (!newpage)
				continue;
			set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
		}
		spin_lock(&pcl->lockref.lock);
		if (!pcl->compressed_bvecs[i].page) {
			pcl->compressed_bvecs[i].page = page ? page : newpage;
			spin_unlock(&pcl->lockref.lock);
			continue;
		}
		spin_unlock(&pcl->lockref.lock);

		if (page)
			put_page(page);
		else
			erofs_pagepool_add(&fe->pagepool, newpage);
	}

	/*
	 * don't do inplace I/O if all compressed pages are available in
	 * managed cache since it can be moved to the bypass queue instead.
	 */
	if (standalone)
		fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
/* (erofs_shrinker) disconnect cached encoded data with pclusters */
static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
					       struct z_erofs_pcluster *pcl)
{
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	struct folio *folio;
	int i;

	DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
	/* Each cached folio contains one page unless bs > ps is supported */
	for (i = 0; i < pclusterpages; ++i) {
		if (pcl->compressed_bvecs[i].page) {
			folio = page_folio(pcl->compressed_bvecs[i].page);
			/* Avoid reclaiming or migrating this folio */
			if (!folio_trylock(folio))
				return -EBUSY;

			if (!erofs_folio_is_managed(sbi, folio))
				continue;
			pcl->compressed_bvecs[i].page = NULL;
			folio_detach_private(folio);
			folio_unlock(folio);
		}
	}
	return 0;
}
static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
{
	struct z_erofs_pcluster *pcl = folio_get_private(folio);
	struct z_erofs_bvec *bvec = pcl->compressed_bvecs;
	struct z_erofs_bvec *end = bvec + z_erofs_pclusterpages(pcl);
	bool ret;

	if (!folio_test_private(folio))
		return true;

	ret = false;
	spin_lock(&pcl->lockref.lock);
	if (pcl->lockref.count <= 0) {
		DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
		for (; bvec < end; ++bvec) {
			if (bvec->page && page_folio(bvec->page) == folio) {
				bvec->page = NULL;
				folio_detach_private(folio);
				ret = true;
				break;
			}
		}
	}
	spin_unlock(&pcl->lockref.lock);
	return ret;
}
/*
 * It will be called only on inode eviction. In case that there are still some
 * decompression requests in progress, wait with rescheduling for a bit here.
 * An extra lock could be introduced instead but it seems unnecessary.
 */
static void z_erofs_cache_invalidate_folio(struct folio *folio,
					   size_t offset, size_t length)
{
	const size_t stop = length + offset;

	/* Check for potential overflow in debug mode */
	DBG_BUGON(stop > folio_size(folio) || stop < length);

	if (offset == 0 && stop == folio_size(folio))
		while (!z_erofs_cache_release_folio(folio, 0))
			cond_resched();
}

static const struct address_space_operations z_erofs_cache_aops = {
	.release_folio = z_erofs_cache_release_folio,
	.invalidate_folio = z_erofs_cache_invalidate_folio,
};
int erofs_init_managed_cache(struct super_block *sb)
{
	struct inode *const inode = new_inode(sb);

	if (!inode)
		return -ENOMEM;

	inode->i_size = OFFSET_MAX;
	inode->i_mapping->a_ops = &z_erofs_cache_aops;
	mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
	EROFS_SB(sb)->managed_cache = inode;
	return 0;
}
/* callers must be with pcluster lock held */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
			       struct z_erofs_bvec *bvec, bool exclusive)
{
	struct z_erofs_pcluster *pcl = fe->pcl;
	int ret;

	if (exclusive) {
		/* give priority for inplaceio to use file pages first */
		spin_lock(&pcl->lockref.lock);
		while (fe->icur > 0) {
			if (pcl->compressed_bvecs[--fe->icur].page)
				continue;
			pcl->compressed_bvecs[fe->icur] = *bvec;
			spin_unlock(&pcl->lockref.lock);
			return 0;
		}
		spin_unlock(&pcl->lockref.lock);

		/* otherwise, check if it can be used as a bvpage */
		if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
		    !fe->candidate_bvpage)
			fe->candidate_bvpage = bvec->page;
	}
	ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage,
				   &fe->pagepool);
	fe->pcl->vcnt += (ret >= 0);
	return ret;
}
static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
{
	if (lockref_get_not_zero(&pcl->lockref))
		return true;

	spin_lock(&pcl->lockref.lock);
	if (__lockref_is_dead(&pcl->lockref)) {
		spin_unlock(&pcl->lockref.lock);
		return false;
	}

	if (!pcl->lockref.count++)
		atomic_long_dec(&erofs_global_shrink_cnt);
	spin_unlock(&pcl->lockref.lock);
	return true;
}
730 static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend
*fe
)
732 struct erofs_map_blocks
*map
= &fe
->map
;
733 struct super_block
*sb
= fe
->inode
->i_sb
;
734 struct erofs_sb_info
*sbi
= EROFS_SB(sb
);
735 bool ztailpacking
= map
->m_flags
& EROFS_MAP_META
;
736 struct z_erofs_pcluster
*pcl
, *pre
;
739 if (!(map
->m_flags
& EROFS_MAP_ENCODED
) ||
740 (!ztailpacking
&& !erofs_blknr(sb
, map
->m_pa
))) {
742 return -EFSCORRUPTED
;
745 /* no available pcluster, let's allocate one */
746 pcl
= z_erofs_alloc_pcluster(map
->m_plen
);
750 lockref_init(&pcl
->lockref
, 1); /* one ref for this request */
751 pcl
->algorithmformat
= map
->m_algorithmformat
;
755 /* new pclusters should be claimed as type 1, primary and followed */
756 pcl
->next
= fe
->owned_head
;
757 pcl
->pageofs_out
= map
->m_la
& ~PAGE_MASK
;
758 fe
->mode
= Z_EROFS_PCLUSTER_FOLLOWED
;
761 * lock all primary followed works before visible to others
762 * and mutex_trylock *never* fails for a new pcluster.
764 mutex_init(&pcl
->lock
);
765 DBG_BUGON(!mutex_trylock(&pcl
->lock
));
768 pcl
->index
= 0; /* which indicates ztailpacking */
770 pcl
->index
= erofs_blknr(sb
, map
->m_pa
);
772 xa_lock(&sbi
->managed_pslots
);
773 pre
= __xa_cmpxchg(&sbi
->managed_pslots
, pcl
->index
,
774 NULL
, pcl
, GFP_KERNEL
);
775 if (!pre
|| xa_is_err(pre
) || z_erofs_get_pcluster(pre
)) {
776 xa_unlock(&sbi
->managed_pslots
);
779 /* try to legitimize the current in-tree one */
780 xa_unlock(&sbi
->managed_pslots
);
783 if (xa_is_err(pre
)) {
792 fe
->owned_head
= &pcl
->next
;
797 mutex_unlock(&pcl
->lock
);
798 z_erofs_free_pcluster(pcl
);
802 static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend
*fe
)
804 struct erofs_map_blocks
*map
= &fe
->map
;
805 struct super_block
*sb
= fe
->inode
->i_sb
;
806 erofs_blk_t blknr
= erofs_blknr(sb
, map
->m_pa
);
807 struct z_erofs_pcluster
*pcl
= NULL
;
811 /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
812 DBG_BUGON(fe
->owned_head
== Z_EROFS_PCLUSTER_NIL
);
814 if (!(map
->m_flags
& EROFS_MAP_META
)) {
817 pcl
= xa_load(&EROFS_SB(sb
)->managed_pslots
, blknr
);
818 if (!pcl
|| z_erofs_get_pcluster(pcl
)) {
819 DBG_BUGON(pcl
&& blknr
!= pcl
->index
);
825 } else if ((map
->m_pa
& ~PAGE_MASK
) + map
->m_plen
> PAGE_SIZE
) {
827 return -EFSCORRUPTED
;
834 ret
= z_erofs_register_pcluster(fe
);
837 if (ret
== -EEXIST
) {
838 mutex_lock(&fe
->pcl
->lock
);
839 /* check if this pcluster hasn't been linked into any chain. */
840 if (cmpxchg(&fe
->pcl
->next
, Z_EROFS_PCLUSTER_NIL
,
841 fe
->owned_head
) == Z_EROFS_PCLUSTER_NIL
) {
842 /* .. so it can be attached to our submission chain */
843 fe
->owned_head
= &fe
->pcl
->next
;
844 fe
->mode
= Z_EROFS_PCLUSTER_FOLLOWED
;
845 } else { /* otherwise, it belongs to an inflight chain */
846 fe
->mode
= Z_EROFS_PCLUSTER_INFLIGHT
;
852 z_erofs_bvec_iter_begin(&fe
->biter
, &fe
->pcl
->bvset
,
853 Z_EROFS_INLINE_BVECS
, fe
->pcl
->vcnt
);
854 if (!z_erofs_is_inline_pcluster(fe
->pcl
)) {
855 /* bind cache first when cached decompression is preferred */
856 z_erofs_bind_cache(fe
);
860 mptr
= erofs_read_metabuf(&map
->buf
, sb
, map
->m_pa
, EROFS_NO_KMAP
);
863 erofs_err(sb
, "failed to get inline data %d", ret
);
866 get_page(map
->buf
.page
);
867 WRITE_ONCE(fe
->pcl
->compressed_bvecs
[0].page
, map
->buf
.page
);
868 fe
->pcl
->pageofs_in
= map
->m_pa
& ~PAGE_MASK
;
869 fe
->mode
= Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE
;
871 /* file-backed inplace I/O pages are traversed in reverse order */
872 fe
->icur
= z_erofs_pclusterpages(fe
->pcl
);
877 * keep in mind that no referenced pclusters will be freed
878 * only after a RCU grace period.
880 static void z_erofs_rcu_callback(struct rcu_head
*head
)
882 z_erofs_free_pcluster(container_of(head
,
883 struct z_erofs_pcluster
, rcu
));
886 static bool __erofs_try_to_release_pcluster(struct erofs_sb_info
*sbi
,
887 struct z_erofs_pcluster
*pcl
)
889 if (pcl
->lockref
.count
)
893 * Note that all cached folios should be detached before deleted from
894 * the XArray. Otherwise some folios could be still attached to the
895 * orphan old pcluster when the new one is available in the tree.
897 if (erofs_try_to_free_all_cached_folios(sbi
, pcl
))
901 * It's impossible to fail after the pcluster is freezed, but in order
902 * to avoid some race conditions, add a DBG_BUGON to observe this.
904 DBG_BUGON(__xa_erase(&sbi
->managed_pslots
, pcl
->index
) != pcl
);
906 lockref_mark_dead(&pcl
->lockref
);
910 static bool erofs_try_to_release_pcluster(struct erofs_sb_info
*sbi
,
911 struct z_erofs_pcluster
*pcl
)
915 spin_lock(&pcl
->lockref
.lock
);
916 free
= __erofs_try_to_release_pcluster(sbi
, pcl
);
917 spin_unlock(&pcl
->lockref
.lock
);
919 atomic_long_dec(&erofs_global_shrink_cnt
);
920 call_rcu(&pcl
->rcu
, z_erofs_rcu_callback
);
925 unsigned long z_erofs_shrink_scan(struct erofs_sb_info
*sbi
,
926 unsigned long nr_shrink
)
928 struct z_erofs_pcluster
*pcl
;
929 unsigned int freed
= 0;
932 xa_lock(&sbi
->managed_pslots
);
933 xa_for_each(&sbi
->managed_pslots
, index
, pcl
) {
934 /* try to shrink each valid pcluster */
935 if (!erofs_try_to_release_pcluster(sbi
, pcl
))
937 xa_unlock(&sbi
->managed_pslots
);
942 xa_lock(&sbi
->managed_pslots
);
944 xa_unlock(&sbi
->managed_pslots
);
948 static void z_erofs_put_pcluster(struct erofs_sb_info
*sbi
,
949 struct z_erofs_pcluster
*pcl
, bool try_free
)
953 if (lockref_put_or_lock(&pcl
->lockref
))
956 DBG_BUGON(__lockref_is_dead(&pcl
->lockref
));
957 if (!--pcl
->lockref
.count
) {
958 if (try_free
&& xa_trylock(&sbi
->managed_pslots
)) {
959 free
= __erofs_try_to_release_pcluster(sbi
, pcl
);
960 xa_unlock(&sbi
->managed_pslots
);
962 atomic_long_add(!free
, &erofs_global_shrink_cnt
);
964 spin_unlock(&pcl
->lockref
.lock
);
966 call_rcu(&pcl
->rcu
, z_erofs_rcu_callback
);
969 static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend
*fe
)
971 struct z_erofs_pcluster
*pcl
= fe
->pcl
;
976 z_erofs_bvec_iter_end(&fe
->biter
);
977 mutex_unlock(&pcl
->lock
);
979 if (fe
->candidate_bvpage
)
980 fe
->candidate_bvpage
= NULL
;
983 * if all pending pages are added, don't hold its reference
984 * any longer if the pcluster isn't hosted by ourselves.
986 if (fe
->mode
< Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE
)
987 z_erofs_put_pcluster(EROFS_I_SB(fe
->inode
), pcl
, false);
992 static int z_erofs_read_fragment(struct super_block
*sb
, struct folio
*folio
,
993 unsigned int cur
, unsigned int end
, erofs_off_t pos
)
995 struct inode
*packed_inode
= EROFS_SB(sb
)->packed_inode
;
996 struct erofs_buf buf
= __EROFS_BUF_INITIALIZER
;
1001 return -EFSCORRUPTED
;
1003 buf
.mapping
= packed_inode
->i_mapping
;
1004 for (; cur
< end
; cur
+= cnt
, pos
+= cnt
) {
1005 cnt
= min(end
- cur
, sb
->s_blocksize
- erofs_blkoff(sb
, pos
));
1006 src
= erofs_bread(&buf
, pos
, EROFS_KMAP
);
1008 erofs_put_metabuf(&buf
);
1009 return PTR_ERR(src
);
1011 memcpy_to_folio(folio
, cur
, src
, cnt
);
1013 erofs_put_metabuf(&buf
);
1017 static int z_erofs_scan_folio(struct z_erofs_decompress_frontend
*f
,
1018 struct folio
*folio
, bool ra
)
1020 struct inode
*const inode
= f
->inode
;
1021 struct erofs_map_blocks
*const map
= &f
->map
;
1022 const loff_t offset
= folio_pos(folio
);
1023 const unsigned int bs
= i_blocksize(inode
);
1024 unsigned int end
= folio_size(folio
), split
= 0, cur
, pgs
;
1028 tight
= (bs
== PAGE_SIZE
);
1029 erofs_onlinefolio_init(folio
);
1031 if (offset
+ end
- 1 < map
->m_la
||
1032 offset
+ end
- 1 >= map
->m_la
+ map
->m_llen
) {
1033 z_erofs_pcluster_end(f
);
1034 map
->m_la
= offset
+ end
- 1;
1036 err
= z_erofs_map_blocks_iter(inode
, map
, 0);
1041 cur
= offset
> map
->m_la
? 0 : map
->m_la
- offset
;
1042 pgs
= round_down(cur
, PAGE_SIZE
);
1043 /* bump split parts first to avoid several separate cases */
1046 if (!(map
->m_flags
& EROFS_MAP_MAPPED
)) {
1047 folio_zero_segment(folio
, cur
, end
);
1049 } else if (map
->m_flags
& EROFS_MAP_FRAGMENT
) {
1050 erofs_off_t fpos
= offset
+ cur
- map
->m_la
;
1052 err
= z_erofs_read_fragment(inode
->i_sb
, folio
, cur
,
1053 cur
+ min(map
->m_llen
- fpos
, end
- cur
),
1054 EROFS_I(inode
)->z_fragmentoff
+ fpos
);
1060 err
= z_erofs_pcluster_begin(f
);
1063 f
->pcl
->besteffort
|= !ra
;
1066 pgs
= round_down(end
- 1, PAGE_SIZE
);
1068 * Ensure this partial page belongs to this submit chain
1069 * rather than other concurrent submit chains or
1070 * noio(bypass) chains since those chains are handled
1071 * asynchronously thus it cannot be used for inplace I/O
1072 * or bvpage (should be processed in the strict order.)
1074 tight
&= (f
->mode
>= Z_EROFS_PCLUSTER_FOLLOWED
);
1077 excl
= (split
<= 1) || tight
;
1081 err
= z_erofs_attach_page(f
, &((struct z_erofs_bvec
) {
1082 .page
= folio_page(folio
, pgs
>> PAGE_SHIFT
),
1083 .offset
= offset
+ pgs
- map
->m_la
,
1084 .end
= end
- pgs
, }), excl
);
1088 erofs_onlinefolio_split(folio
);
1089 if (f
->pcl
->pageofs_out
!= (map
->m_la
& ~PAGE_MASK
))
1090 f
->pcl
->multibases
= true;
1091 if (f
->pcl
->length
< offset
+ end
- map
->m_la
) {
1092 f
->pcl
->length
= offset
+ end
- map
->m_la
;
1093 f
->pcl
->pageofs_out
= map
->m_la
& ~PAGE_MASK
;
1095 if ((map
->m_flags
& EROFS_MAP_FULL_MAPPED
) &&
1096 !(map
->m_flags
& EROFS_MAP_PARTIAL_REF
) &&
1097 f
->pcl
->length
== map
->m_llen
)
1098 f
->pcl
->partial
= false;
1100 /* shorten the remaining extent to update progress */
1101 map
->m_llen
= offset
+ cur
- map
->m_la
;
1102 map
->m_flags
&= ~EROFS_MAP_FULL_MAPPED
;
1105 tight
= (bs
== PAGE_SIZE
);
1107 } while ((end
= cur
) > 0);
1108 erofs_onlinefolio_end(folio
, err
);
1112 static bool z_erofs_is_sync_decompress(struct erofs_sb_info
*sbi
,
1113 unsigned int readahead_pages
)
1115 /* auto: enable for read_folio, disable for readahead */
1116 if ((sbi
->opt
.sync_decompress
== EROFS_SYNC_DECOMPRESS_AUTO
) &&
1120 if ((sbi
->opt
.sync_decompress
== EROFS_SYNC_DECOMPRESS_FORCE_ON
) &&
1121 (readahead_pages
<= sbi
->opt
.max_sync_decompress_pages
))
1127 static bool z_erofs_page_is_invalidated(struct page
*page
)
1129 return !page_folio(page
)->mapping
&& !z_erofs_is_shortlived_page(page
);
1132 struct z_erofs_decompress_backend
{
1133 struct page
*onstack_pages
[Z_EROFS_ONSTACK_PAGES
];
1134 struct super_block
*sb
;
1135 struct z_erofs_pcluster
*pcl
;
1137 /* pages with the longest decompressed length for deduplication */
1138 struct page
**decompressed_pages
;
1139 /* pages to keep the compressed data */
1140 struct page
**compressed_pages
;
1142 struct list_head decompressed_secondary_bvecs
;
1143 struct page
**pagepool
;
1144 unsigned int onstack_used
, nr_pages
;
1147 struct z_erofs_bvec_item
{
1148 struct z_erofs_bvec bvec
;
1149 struct list_head list
;
1152 static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend
*be
,
1153 struct z_erofs_bvec
*bvec
)
1155 struct z_erofs_bvec_item
*item
;
1158 if (!((bvec
->offset
+ be
->pcl
->pageofs_out
) & ~PAGE_MASK
) &&
1159 (bvec
->end
== PAGE_SIZE
||
1160 bvec
->offset
+ bvec
->end
== be
->pcl
->length
)) {
1161 pgnr
= (bvec
->offset
+ be
->pcl
->pageofs_out
) >> PAGE_SHIFT
;
1162 DBG_BUGON(pgnr
>= be
->nr_pages
);
1163 if (!be
->decompressed_pages
[pgnr
]) {
1164 be
->decompressed_pages
[pgnr
] = bvec
->page
;
1169 /* (cold path) one pcluster is requested multiple times */
1170 item
= kmalloc(sizeof(*item
), GFP_KERNEL
| __GFP_NOFAIL
);
1172 list_add(&item
->list
, &be
->decompressed_secondary_bvecs
);
1175 static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend
*be
,
1178 unsigned int off0
= be
->pcl
->pageofs_out
;
1179 struct list_head
*p
, *n
;
1181 list_for_each_safe(p
, n
, &be
->decompressed_secondary_bvecs
) {
1182 struct z_erofs_bvec_item
*bvi
;
1183 unsigned int end
, cur
;
1186 bvi
= container_of(p
, struct z_erofs_bvec_item
, list
);
1187 cur
= bvi
->bvec
.offset
< 0 ? -bvi
->bvec
.offset
: 0;
1188 end
= min_t(unsigned int, be
->pcl
->length
- bvi
->bvec
.offset
,
1190 dst
= kmap_local_page(bvi
->bvec
.page
);
1192 unsigned int pgnr
, scur
, len
;
1194 pgnr
= (bvi
->bvec
.offset
+ cur
+ off0
) >> PAGE_SHIFT
;
1195 DBG_BUGON(pgnr
>= be
->nr_pages
);
1197 scur
= bvi
->bvec
.offset
+ cur
-
1198 ((pgnr
<< PAGE_SHIFT
) - off0
);
1199 len
= min_t(unsigned int, end
- cur
, PAGE_SIZE
- scur
);
1200 if (!be
->decompressed_pages
[pgnr
]) {
1201 err
= -EFSCORRUPTED
;
1205 src
= kmap_local_page(be
->decompressed_pages
[pgnr
]);
1206 memcpy(dst
+ cur
, src
+ scur
, len
);
1211 erofs_onlinefolio_end(page_folio(bvi
->bvec
.page
), err
);
1217 static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend
*be
)
1219 struct z_erofs_pcluster
*pcl
= be
->pcl
;
1220 struct z_erofs_bvec_iter biter
;
1221 struct page
*old_bvpage
;
1224 z_erofs_bvec_iter_begin(&biter
, &pcl
->bvset
, Z_EROFS_INLINE_BVECS
, 0);
1225 for (i
= 0; i
< pcl
->vcnt
; ++i
) {
1226 struct z_erofs_bvec bvec
;
1228 z_erofs_bvec_dequeue(&biter
, &bvec
, &old_bvpage
);
1231 z_erofs_put_shortlivedpage(be
->pagepool
, old_bvpage
);
1233 DBG_BUGON(z_erofs_page_is_invalidated(bvec
.page
));
1234 z_erofs_do_decompressed_bvec(be
, &bvec
);
1237 old_bvpage
= z_erofs_bvec_iter_end(&biter
);
1239 z_erofs_put_shortlivedpage(be
->pagepool
, old_bvpage
);
1242 static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend
*be
,
1245 struct z_erofs_pcluster
*pcl
= be
->pcl
;
1246 unsigned int pclusterpages
= z_erofs_pclusterpages(pcl
);
1249 *overlapped
= false;
1250 for (i
= 0; i
< pclusterpages
; ++i
) {
1251 struct z_erofs_bvec
*bvec
= &pcl
->compressed_bvecs
[i
];
1252 struct page
*page
= bvec
->page
;
1254 /* compressed data ought to be valid when decompressing */
1255 if (IS_ERR(page
) || !page
) {
1256 bvec
->page
= NULL
; /* clear the failure reason */
1257 err
= page
? PTR_ERR(page
) : -EIO
;
1260 be
->compressed_pages
[i
] = page
;
1262 if (z_erofs_is_inline_pcluster(pcl
) ||
1263 erofs_folio_is_managed(EROFS_SB(be
->sb
), page_folio(page
))) {
1264 if (!PageUptodate(page
))
1269 DBG_BUGON(z_erofs_page_is_invalidated(page
));
1270 if (z_erofs_is_shortlived_page(page
))
1272 z_erofs_do_decompressed_bvec(be
, bvec
);
1278 static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend
*be
,
1281 struct erofs_sb_info
*const sbi
= EROFS_SB(be
->sb
);
1282 struct z_erofs_pcluster
*pcl
= be
->pcl
;
1283 unsigned int pclusterpages
= z_erofs_pclusterpages(pcl
);
1284 const struct z_erofs_decompressor
*decomp
=
1285 z_erofs_decomp
[pcl
->algorithmformat
];
1286 int i
, j
, jtop
, err2
;
1289 bool try_free
= true;
1291 mutex_lock(&pcl
->lock
);
1292 be
->nr_pages
= PAGE_ALIGN(pcl
->length
+ pcl
->pageofs_out
) >> PAGE_SHIFT
;
1294 /* allocate (de)compressed page arrays if cannot be kept on stack */
1295 be
->decompressed_pages
= NULL
;
1296 be
->compressed_pages
= NULL
;
1297 be
->onstack_used
= 0;
1298 if (be
->nr_pages
<= Z_EROFS_ONSTACK_PAGES
) {
1299 be
->decompressed_pages
= be
->onstack_pages
;
1300 be
->onstack_used
= be
->nr_pages
;
1301 memset(be
->decompressed_pages
, 0,
1302 sizeof(struct page
*) * be
->nr_pages
);
1305 if (pclusterpages
+ be
->onstack_used
<= Z_EROFS_ONSTACK_PAGES
)
1306 be
->compressed_pages
= be
->onstack_pages
+ be
->onstack_used
;
1308 if (!be
->decompressed_pages
)
1309 be
->decompressed_pages
=
1310 kvcalloc(be
->nr_pages
, sizeof(struct page
*),
1311 GFP_KERNEL
| __GFP_NOFAIL
);
1312 if (!be
->compressed_pages
)
1313 be
->compressed_pages
=
1314 kvcalloc(pclusterpages
, sizeof(struct page
*),
1315 GFP_KERNEL
| __GFP_NOFAIL
);
1317 z_erofs_parse_out_bvecs(be
);
1318 err2
= z_erofs_parse_in_bvecs(be
, &overlapped
);
1322 err
= decomp
->decompress(&(struct z_erofs_decompress_req
) {
1324 .in
= be
->compressed_pages
,
1325 .out
= be
->decompressed_pages
,
1326 .pageofs_in
= pcl
->pageofs_in
,
1327 .pageofs_out
= pcl
->pageofs_out
,
1328 .inputsize
= pcl
->pclustersize
,
1329 .outputsize
= pcl
->length
,
1330 .alg
= pcl
->algorithmformat
,
1331 .inplace_io
= overlapped
,
1332 .partial_decoding
= pcl
->partial
,
1333 .fillgaps
= pcl
->multibases
,
1334 .gfp
= pcl
->besteffort
? GFP_KERNEL
:
1335 GFP_NOWAIT
| __GFP_NORETRY
1338 /* must handle all compressed pages before actual file pages */
1339 if (z_erofs_is_inline_pcluster(pcl
)) {
1340 page
= pcl
->compressed_bvecs
[0].page
;
1341 WRITE_ONCE(pcl
->compressed_bvecs
[0].page
, NULL
);
1344 /* managed folios are still left in compressed_bvecs[] */
1345 for (i
= 0; i
< pclusterpages
; ++i
) {
1346 page
= be
->compressed_pages
[i
];
1349 if (erofs_folio_is_managed(sbi
, page_folio(page
))) {
1353 (void)z_erofs_put_shortlivedpage(be
->pagepool
, page
);
1354 WRITE_ONCE(pcl
->compressed_bvecs
[i
].page
, NULL
);
1357 if (be
->compressed_pages
< be
->onstack_pages
||
1358 be
->compressed_pages
>= be
->onstack_pages
+ Z_EROFS_ONSTACK_PAGES
)
1359 kvfree(be
->compressed_pages
);
1362 z_erofs_fill_other_copies(be
, err
);
1363 for (i
= 0; i
< be
->nr_pages
; ++i
) {
1364 page
= be
->decompressed_pages
[i
];
1368 DBG_BUGON(z_erofs_page_is_invalidated(page
));
1369 if (!z_erofs_is_shortlived_page(page
)) {
1370 erofs_onlinefolio_end(page_folio(page
), err
);
1373 if (pcl
->algorithmformat
!= Z_EROFS_COMPRESSION_LZ4
) {
1374 erofs_pagepool_add(be
->pagepool
, page
);
1377 for (j
= 0; j
< jtop
&& be
->decompressed_pages
[j
] != page
; ++j
)
1379 if (j
>= jtop
) /* this bounce page is newly detected */
1380 be
->decompressed_pages
[jtop
++] = page
;
1383 erofs_pagepool_add(be
->pagepool
,
1384 be
->decompressed_pages
[--jtop
]);
1385 if (be
->decompressed_pages
!= be
->onstack_pages
)
1386 kvfree(be
->decompressed_pages
);
1389 pcl
->partial
= true;
1390 pcl
->multibases
= false;
1391 pcl
->besteffort
= false;
1392 pcl
->bvset
.nextpage
= NULL
;
1395 /* pcluster lock MUST be taken before the following line */
1396 WRITE_ONCE(pcl
->next
, Z_EROFS_PCLUSTER_NIL
);
1397 mutex_unlock(&pcl
->lock
);
1399 if (z_erofs_is_inline_pcluster(pcl
))
1400 z_erofs_free_pcluster(pcl
);
1402 z_erofs_put_pcluster(sbi
, pcl
, try_free
);
1406 static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue
*io
,
1407 struct page
**pagepool
)
1409 struct z_erofs_decompress_backend be
= {
1411 .pagepool
= pagepool
,
1412 .decompressed_secondary_bvecs
=
1413 LIST_HEAD_INIT(be
.decompressed_secondary_bvecs
),
1415 z_erofs_next_pcluster_t owned
= io
->head
;
1416 int err
= io
->eio
? -EIO
: 0;
1418 while (owned
!= Z_EROFS_PCLUSTER_TAIL
) {
1419 DBG_BUGON(owned
== Z_EROFS_PCLUSTER_NIL
);
1421 be
.pcl
= container_of(owned
, struct z_erofs_pcluster
, next
);
1422 owned
= READ_ONCE(be
.pcl
->next
);
1424 err
= z_erofs_decompress_pcluster(&be
, err
) ?: err
;
1429 static void z_erofs_decompressqueue_work(struct work_struct
*work
)
1431 struct z_erofs_decompressqueue
*bgq
=
1432 container_of(work
, struct z_erofs_decompressqueue
, u
.work
);
1433 struct page
*pagepool
= NULL
;
1435 DBG_BUGON(bgq
->head
== Z_EROFS_PCLUSTER_TAIL
);
1436 z_erofs_decompress_queue(bgq
, &pagepool
);
1437 erofs_release_pages(&pagepool
);
1441 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1442 static void z_erofs_decompressqueue_kthread_work(struct kthread_work
*work
)
1444 z_erofs_decompressqueue_work((struct work_struct
*)work
);
1448 static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue
*io
,
1451 struct erofs_sb_info
*const sbi
= EROFS_SB(io
->sb
);
1453 /* wake up the caller thread for sync decompression */
1455 if (!atomic_add_return(bios
, &io
->pending_bios
))
1456 complete(&io
->u
.done
);
1460 if (atomic_add_return(bios
, &io
->pending_bios
))
1462 /* Use (kthread_)work and sync decompression for atomic contexts only */
1463 if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
1464 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1465 struct kthread_worker
*worker
;
1468 worker
= rcu_dereference(
1469 z_erofs_pcpu_workers
[raw_smp_processor_id()]);
1471 INIT_WORK(&io
->u
.work
, z_erofs_decompressqueue_work
);
1472 queue_work(z_erofs_workqueue
, &io
->u
.work
);
1474 kthread_queue_work(worker
, &io
->u
.kthread_work
);
1478 queue_work(z_erofs_workqueue
, &io
->u
.work
);
1480 /* enable sync decompression for readahead */
1481 if (sbi
->opt
.sync_decompress
== EROFS_SYNC_DECOMPRESS_AUTO
)
1482 sbi
->opt
.sync_decompress
= EROFS_SYNC_DECOMPRESS_FORCE_ON
;
1485 z_erofs_decompressqueue_work(&io
->u
.work
);
1488 static void z_erofs_fill_bio_vec(struct bio_vec
*bvec
,
1489 struct z_erofs_decompress_frontend
*f
,
1490 struct z_erofs_pcluster
*pcl
,
1492 struct address_space
*mc
)
1494 gfp_t gfp
= mapping_gfp_mask(mc
);
1495 bool tocache
= false;
1496 struct z_erofs_bvec zbv
;
1497 struct address_space
*mapping
;
1498 struct folio
*folio
;
1500 int bs
= i_blocksize(f
->inode
);
1502 /* Except for inplace folios, the entire folio can be used for I/Os */
1503 bvec
->bv_offset
= 0;
1504 bvec
->bv_len
= PAGE_SIZE
;
1506 spin_lock(&pcl
->lockref
.lock
);
1507 zbv
= pcl
->compressed_bvecs
[nr
];
1508 spin_unlock(&pcl
->lockref
.lock
);
1510 goto out_allocfolio
;
1512 bvec
->bv_page
= zbv
.page
;
1513 DBG_BUGON(z_erofs_is_shortlived_page(bvec
->bv_page
));
1515 folio
= page_folio(zbv
.page
);
1517 * Handle preallocated cached folios. We tried to allocate such folios
1518 * without triggering direct reclaim. If allocation failed, inplace
1519 * file-backed folios will be used instead.
1521 if (folio
->private == (void *)Z_EROFS_PREALLOCATED_PAGE
) {
1526 mapping
= READ_ONCE(folio
->mapping
);
1528 * File-backed folios for inplace I/Os are all locked steady,
1529 * therefore it is impossible for `mapping` to be NULL.
1531 if (mapping
&& mapping
!= mc
) {
1533 bvec
->bv_offset
= round_up(-zbv
.offset
, bs
);
1534 bvec
->bv_len
= round_up(zbv
.end
, bs
) - bvec
->bv_offset
;
1539 if (likely(folio
->mapping
== mc
)) {
1541 * The cached folio is still in managed cache but without
1542 * a valid `->private` pcluster hint. Let's reconnect them.
1544 if (!folio_test_private(folio
)) {
1545 folio_attach_private(folio
, pcl
);
1546 /* compressed_bvecs[] already takes a ref before */
1549 if (likely(folio
->private == pcl
)) {
1550 /* don't submit cache I/Os again if already uptodate */
1551 if (folio_test_uptodate(folio
)) {
1552 folio_unlock(folio
);
1553 bvec
->bv_page
= NULL
;
1558 * Already linked with another pcluster, which only appears in
1559 * crafted images by fuzzers for now. But handle this anyway.
1561 tocache
= false; /* use temporary short-lived pages */
1563 DBG_BUGON(1); /* referenced managed folios can't be truncated */
1566 folio_unlock(folio
);
1569 page
= __erofs_allocpage(&f
->pagepool
, gfp
, true);
1570 spin_lock(&pcl
->lockref
.lock
);
1571 if (unlikely(pcl
->compressed_bvecs
[nr
].page
!= zbv
.page
)) {
1573 erofs_pagepool_add(&f
->pagepool
, page
);
1574 spin_unlock(&pcl
->lockref
.lock
);
1578 pcl
->compressed_bvecs
[nr
].page
= page
? page
: ERR_PTR(-ENOMEM
);
1579 spin_unlock(&pcl
->lockref
.lock
);
1580 bvec
->bv_page
= page
;
1583 folio
= page_folio(page
);
1585 if (!tocache
|| bs
!= PAGE_SIZE
||
1586 filemap_add_folio(mc
, folio
, pcl
->index
+ nr
, gfp
)) {
1587 /* turn into a temporary shortlived folio (1 ref) */
1588 folio
->private = (void *)Z_EROFS_SHORTLIVED_PAGE
;
1591 folio_attach_private(folio
, pcl
);
1592 /* drop a refcount added by allocpage (then 2 refs in total here) */
1596 static struct z_erofs_decompressqueue
*jobqueue_init(struct super_block
*sb
,
1597 struct z_erofs_decompressqueue
*fgq
, bool *fg
)
1599 struct z_erofs_decompressqueue
*q
;
1602 q
= kvzalloc(sizeof(*q
), GFP_KERNEL
| __GFP_NOWARN
);
1607 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1608 kthread_init_work(&q
->u
.kthread_work
,
1609 z_erofs_decompressqueue_kthread_work
);
1611 INIT_WORK(&q
->u
.work
, z_erofs_decompressqueue_work
);
1616 init_completion(&fgq
->u
.done
);
1617 atomic_set(&fgq
->pending_bios
, 0);
1622 q
->head
= Z_EROFS_PCLUSTER_TAIL
;
1626 /* define decompression jobqueue types */
1633 static void move_to_bypass_jobqueue(struct z_erofs_pcluster
*pcl
,
1634 z_erofs_next_pcluster_t qtail
[],
1635 z_erofs_next_pcluster_t owned_head
)
1637 z_erofs_next_pcluster_t
*const submit_qtail
= qtail
[JQ_SUBMIT
];
1638 z_erofs_next_pcluster_t
*const bypass_qtail
= qtail
[JQ_BYPASS
];
1640 WRITE_ONCE(pcl
->next
, Z_EROFS_PCLUSTER_TAIL
);
1642 WRITE_ONCE(*submit_qtail
, owned_head
);
1643 WRITE_ONCE(*bypass_qtail
, &pcl
->next
);
1645 qtail
[JQ_BYPASS
] = &pcl
->next
;
1648 static void z_erofs_endio(struct bio
*bio
)
1650 struct z_erofs_decompressqueue
*q
= bio
->bi_private
;
1651 blk_status_t err
= bio
->bi_status
;
1652 struct folio_iter fi
;
1654 bio_for_each_folio_all(fi
, bio
) {
1655 struct folio
*folio
= fi
.folio
;
1657 DBG_BUGON(folio_test_uptodate(folio
));
1658 DBG_BUGON(z_erofs_page_is_invalidated(&folio
->page
));
1659 if (!erofs_folio_is_managed(EROFS_SB(q
->sb
), folio
))
1663 folio_mark_uptodate(folio
);
1664 folio_unlock(folio
);
1668 z_erofs_decompress_kickoff(q
, -1);
1673 static void z_erofs_submit_queue(struct z_erofs_decompress_frontend
*f
,
1674 struct z_erofs_decompressqueue
*fgq
,
1675 bool *force_fg
, bool readahead
)
1677 struct super_block
*sb
= f
->inode
->i_sb
;
1678 struct address_space
*mc
= MNGD_MAPPING(EROFS_SB(sb
));
1679 z_erofs_next_pcluster_t qtail
[NR_JOBQUEUES
];
1680 struct z_erofs_decompressqueue
*q
[NR_JOBQUEUES
];
1681 z_erofs_next_pcluster_t owned_head
= f
->owned_head
;
1682 /* bio is NULL initially, so no need to initialize last_{index,bdev} */
1683 erofs_off_t last_pa
;
1684 unsigned int nr_bios
= 0;
1685 struct bio
*bio
= NULL
;
1686 unsigned long pflags
;
1689 /* No need to read from device for pclusters in the bypass queue. */
1690 q
[JQ_BYPASS
] = jobqueue_init(sb
, fgq
+ JQ_BYPASS
, NULL
);
1691 q
[JQ_SUBMIT
] = jobqueue_init(sb
, fgq
+ JQ_SUBMIT
, force_fg
);
1693 qtail
[JQ_BYPASS
] = &q
[JQ_BYPASS
]->head
;
1694 qtail
[JQ_SUBMIT
] = &q
[JQ_SUBMIT
]->head
;
1696 /* by default, all need io submission */
1697 q
[JQ_SUBMIT
]->head
= owned_head
;
1700 struct erofs_map_dev mdev
;
1701 struct z_erofs_pcluster
*pcl
;
1702 erofs_off_t cur
, end
;
1703 struct bio_vec bvec
;
1707 DBG_BUGON(owned_head
== Z_EROFS_PCLUSTER_NIL
);
1708 pcl
= container_of(owned_head
, struct z_erofs_pcluster
, next
);
1709 owned_head
= READ_ONCE(pcl
->next
);
1711 if (z_erofs_is_inline_pcluster(pcl
)) {
1712 move_to_bypass_jobqueue(pcl
, qtail
, owned_head
);
1716 /* no device id here, thus it will always succeed */
1717 mdev
= (struct erofs_map_dev
) {
1718 .m_pa
= erofs_pos(sb
, pcl
->index
),
1720 (void)erofs_map_dev(sb
, &mdev
);
1723 end
= cur
+ pcl
->pclustersize
;
1725 bvec
.bv_page
= NULL
;
1726 if (bio
&& (cur
!= last_pa
||
1727 bio
->bi_bdev
!= mdev
.m_bdev
)) {
1729 if (erofs_is_fileio_mode(EROFS_SB(sb
)))
1730 erofs_fileio_submit_bio(bio
);
1731 else if (erofs_is_fscache_mode(sb
))
1732 erofs_fscache_submit_bio(bio
);
1737 psi_memstall_leave(&pflags
);
1743 if (!bvec
.bv_page
) {
1744 z_erofs_fill_bio_vec(&bvec
, f
, pcl
, i
++, mc
);
1747 if (cur
+ bvec
.bv_len
> end
)
1748 bvec
.bv_len
= end
- cur
;
1749 DBG_BUGON(bvec
.bv_len
< sb
->s_blocksize
);
1752 if (unlikely(PageWorkingset(bvec
.bv_page
)) &&
1754 psi_memstall_enter(&pflags
);
1759 if (erofs_is_fileio_mode(EROFS_SB(sb
)))
1760 bio
= erofs_fileio_bio_alloc(&mdev
);
1761 else if (erofs_is_fscache_mode(sb
))
1762 bio
= erofs_fscache_bio_alloc(&mdev
);
1764 bio
= bio_alloc(mdev
.m_bdev
, BIO_MAX_VECS
,
1765 REQ_OP_READ
, GFP_NOIO
);
1766 bio
->bi_end_io
= z_erofs_endio
;
1767 bio
->bi_iter
.bi_sector
= cur
>> 9;
1768 bio
->bi_private
= q
[JQ_SUBMIT
];
1770 bio
->bi_opf
|= REQ_RAHEAD
;
1774 if (!bio_add_page(bio
, bvec
.bv_page
, bvec
.bv_len
,
1777 last_pa
= cur
+ bvec
.bv_len
;
1779 } while ((cur
+= bvec
.bv_len
) < end
);
1782 qtail
[JQ_SUBMIT
] = &pcl
->next
;
1784 move_to_bypass_jobqueue(pcl
, qtail
, owned_head
);
1785 } while (owned_head
!= Z_EROFS_PCLUSTER_TAIL
);
1788 if (erofs_is_fileio_mode(EROFS_SB(sb
)))
1789 erofs_fileio_submit_bio(bio
);
1790 else if (erofs_is_fscache_mode(sb
))
1791 erofs_fscache_submit_bio(bio
);
1796 psi_memstall_leave(&pflags
);
1799 * although background is preferred, no one is pending for submission.
1800 * don't issue decompression but drop it directly instead.
1802 if (!*force_fg
&& !nr_bios
) {
1803 kvfree(q
[JQ_SUBMIT
]);
1806 z_erofs_decompress_kickoff(q
[JQ_SUBMIT
], nr_bios
);
1809 static int z_erofs_runqueue(struct z_erofs_decompress_frontend
*f
,
1810 unsigned int ra_folios
)
1812 struct z_erofs_decompressqueue io
[NR_JOBQUEUES
];
1813 struct erofs_sb_info
*sbi
= EROFS_I_SB(f
->inode
);
1814 bool force_fg
= z_erofs_is_sync_decompress(sbi
, ra_folios
);
1817 if (f
->owned_head
== Z_EROFS_PCLUSTER_TAIL
)
1819 z_erofs_submit_queue(f
, io
, &force_fg
, !!ra_folios
);
1821 /* handle bypass queue (no i/o pclusters) immediately */
1822 err
= z_erofs_decompress_queue(&io
[JQ_BYPASS
], &f
->pagepool
);
1826 /* wait until all bios are completed */
1827 wait_for_completion_io(&io
[JQ_SUBMIT
].u
.done
);
1829 /* handle synchronous decompress queue in the caller context */
1830 return z_erofs_decompress_queue(&io
[JQ_SUBMIT
], &f
->pagepool
) ?: err
;
1834 * Since partial uptodate is still unimplemented for now, we have to use
1835 * approximate readmore strategies as a start.
1837 static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend
*f
,
1838 struct readahead_control
*rac
, bool backmost
)
1840 struct inode
*inode
= f
->inode
;
1841 struct erofs_map_blocks
*map
= &f
->map
;
1842 erofs_off_t cur
, end
, headoffset
= f
->headoffset
;
1847 end
= headoffset
+ readahead_length(rac
) - 1;
1849 end
= headoffset
+ PAGE_SIZE
- 1;
1851 err
= z_erofs_map_blocks_iter(inode
, map
,
1852 EROFS_GET_BLOCKS_READMORE
);
1856 /* expand ra for the trailing edge if readahead */
1858 cur
= round_up(map
->m_la
+ map
->m_llen
, PAGE_SIZE
);
1859 readahead_expand(rac
, headoffset
, cur
- headoffset
);
1862 end
= round_up(end
, PAGE_SIZE
);
1864 end
= round_up(map
->m_la
, PAGE_SIZE
);
1869 cur
= map
->m_la
+ map
->m_llen
- 1;
1870 while ((cur
>= end
) && (cur
< i_size_read(inode
))) {
1871 pgoff_t index
= cur
>> PAGE_SHIFT
;
1872 struct folio
*folio
;
1874 folio
= erofs_grab_folio_nowait(inode
->i_mapping
, index
);
1875 if (!IS_ERR_OR_NULL(folio
)) {
1876 if (folio_test_uptodate(folio
))
1877 folio_unlock(folio
);
1879 z_erofs_scan_folio(f
, folio
, !!rac
);
1883 if (cur
< PAGE_SIZE
)
1885 cur
= (index
<< PAGE_SHIFT
) - 1;
1889 static int z_erofs_read_folio(struct file
*file
, struct folio
*folio
)
1891 struct inode
*const inode
= folio
->mapping
->host
;
1892 struct z_erofs_decompress_frontend f
= DECOMPRESS_FRONTEND_INIT(inode
);
1895 trace_erofs_read_folio(folio
, false);
1896 f
.headoffset
= (erofs_off_t
)folio
->index
<< PAGE_SHIFT
;
1898 z_erofs_pcluster_readmore(&f
, NULL
, true);
1899 err
= z_erofs_scan_folio(&f
, folio
, false);
1900 z_erofs_pcluster_readmore(&f
, NULL
, false);
1901 z_erofs_pcluster_end(&f
);
1903 /* if some pclusters are ready, need submit them anyway */
1904 err
= z_erofs_runqueue(&f
, 0) ?: err
;
1905 if (err
&& err
!= -EINTR
)
1906 erofs_err(inode
->i_sb
, "read error %d @ %lu of nid %llu",
1907 err
, folio
->index
, EROFS_I(inode
)->nid
);
1909 erofs_put_metabuf(&f
.map
.buf
);
1910 erofs_release_pages(&f
.pagepool
);
1914 static void z_erofs_readahead(struct readahead_control
*rac
)
1916 struct inode
*const inode
= rac
->mapping
->host
;
1917 struct z_erofs_decompress_frontend f
= DECOMPRESS_FRONTEND_INIT(inode
);
1918 struct folio
*head
= NULL
, *folio
;
1919 unsigned int nr_folios
;
1922 f
.headoffset
= readahead_pos(rac
);
1924 z_erofs_pcluster_readmore(&f
, rac
, true);
1925 nr_folios
= readahead_count(rac
);
1926 trace_erofs_readpages(inode
, readahead_index(rac
), nr_folios
, false);
1928 while ((folio
= readahead_folio(rac
))) {
1929 folio
->private = head
;
1933 /* traverse in reverse order for best metadata I/O performance */
1936 head
= folio_get_private(folio
);
1938 err
= z_erofs_scan_folio(&f
, folio
, true);
1939 if (err
&& err
!= -EINTR
)
1940 erofs_err(inode
->i_sb
, "readahead error at folio %lu @ nid %llu",
1941 folio
->index
, EROFS_I(inode
)->nid
);
1943 z_erofs_pcluster_readmore(&f
, rac
, false);
1944 z_erofs_pcluster_end(&f
);
1946 (void)z_erofs_runqueue(&f
, nr_folios
);
1947 erofs_put_metabuf(&f
.map
.buf
);
1948 erofs_release_pages(&f
.pagepool
);
1951 const struct address_space_operations z_erofs_aops
= {
1952 .read_folio
= z_erofs_read_folio
,
1953 .readahead
= z_erofs_readahead
,