mm/filemap.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  *      linux/mm/filemap.c
   4  *
   5  * Copyright (C) 1994-1999  Linus Torvalds
   6  */
   7
   8 /*
   9  * This file handles the generic file mmap semantics used by
  10  * most "normal" filesystems (but you don't /have/ to use this:
  11  * the NFS filesystem used to do this differently, for example)
  12  */
  13 #include <linux/export.h>
  14 #include <linux/compiler.h>
  15 #include <linux/dax.h>
  16 #include <linux/fs.h>
  17 #include <linux/sched/signal.h>
  18 #include <linux/uaccess.h>
  19 #include <linux/capability.h>
  20 #include <linux/kernel_stat.h>
  21 #include <linux/gfp.h>
  22 #include <linux/mm.h>
  23 #include <linux/swap.h>
  24 #include <linux/swapops.h>
  25 #include <linux/syscalls.h>
  26 #include <linux/mman.h>
  27 #include <linux/pagemap.h>
  28 #include <linux/file.h>
  29 #include <linux/uio.h>
  30 #include <linux/error-injection.h>
  31 #include <linux/hash.h>
  32 #include <linux/writeback.h>
  33 #include <linux/backing-dev.h>
  34 #include <linux/pagevec.h>
  35 #include <linux/security.h>
  36 #include <linux/cpuset.h>
  37 #include <linux/hugetlb.h>
  38 #include <linux/memcontrol.h>
  39 #include <linux/shmem_fs.h>
  40 #include <linux/rmap.h>
  41 #include <linux/delayacct.h>
  42 #include <linux/psi.h>
  43 #include <linux/ramfs.h>
  44 #include <linux/page_idle.h>
  45 #include <linux/migrate.h>
  46 #include <linux/pipe_fs_i.h>
  47 #include <linux/splice.h>
  48 #include <linux/rcupdate_wait.h>
  49 #include <linux/sched/mm.h>
  50 #include <linux/fsnotify.h>
  51 #include <asm/pgalloc.h>
  52 #include <asm/tlbflush.h>
  53 #include "internal.h"
  54
  55 #define CREATE_TRACE_POINTS
  56 #include <trace/events/filemap.h>
  57
  58 /*
  59  * FIXME: remove all knowledge of the buffer layer from the core VM
  60  */
  61 #include <linux/buffer_head.h> /* for try_to_free_buffers */
  62
  63 #include <asm/mman.h>
  64
  65 #include "swap.h"
  66
  67 /*
  68  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  69  * though.
  70  *
  71  * Shared mappings now work. 15.8.1995  Bruno.
  72  *
  73  * finished 'unifying' the page and buffer cache and SMP-threaded the
  74  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  75  *
  76  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  77  */
  78
  79 /*
  80  * Lock ordering:
  81  *
  82  *  ->i_mmap_rwsem              (truncate_pagecache)
  83  *    ->private_lock            (__free_pte->block_dirty_folio)
  84  *      ->swap_lock             (exclusive_swap_page, others)
  85  *        ->i_pages lock
  86  *
  87  *  ->i_rwsem
  88  *    ->invalidate_lock         (acquired by fs in truncate path)
  89  *      ->i_mmap_rwsem          (truncate->unmap_mapping_range)
  90  *
  91  *  ->mmap_lock
  92  *    ->i_mmap_rwsem
  93  *      ->page_table_lock or pte_lock   (various, mainly in memory.c)
  94  *        ->i_pages lock        (arch-dependent flush_dcache_mmap_lock)
  95  *
  96  *  ->mmap_lock
  97  *    ->invalidate_lock         (filemap_fault)
  98  *      ->lock_page             (filemap_fault, access_process_vm)
  99  *
 100  *  ->i_rwsem                   (generic_perform_write)
 101  *    ->mmap_lock               (fault_in_readable->do_page_fault)
 102  *
 103  *  bdi->wb.list_lock
 104  *    sb_lock                   (fs/fs-writeback.c)
 105  *    ->i_pages lock            (__sync_single_inode)
 106  *
 107  *  ->i_mmap_rwsem
 108  *    ->anon_vma.lock           (vma_merge)
 109  *
 110  *  ->anon_vma.lock
 111  *    ->page_table_lock or pte_lock     (anon_vma_prepare and various)
 112  *
 113  *  ->page_table_lock or pte_lock
 114  *    ->swap_lock               (try_to_unmap_one)
 115  *    ->private_lock            (try_to_unmap_one)
 116  *    ->i_pages lock            (try_to_unmap_one)
 117  *    ->lruvec->lru_lock        (follow_page_mask->mark_page_accessed)
 118  *    ->lruvec->lru_lock        (check_pte_range->folio_isolate_lru)
 119  *    ->private_lock            (folio_remove_rmap_pte->set_page_dirty)
 120  *    ->i_pages lock            (folio_remove_rmap_pte->set_page_dirty)
 121  *    bdi.wb->list_lock         (folio_remove_rmap_pte->set_page_dirty)
 122  *    ->inode->i_lock           (folio_remove_rmap_pte->set_page_dirty)
 123  *    bdi.wb->list_lock         (zap_pte_range->set_page_dirty)
 124  *    ->inode->i_lock           (zap_pte_range->set_page_dirty)
 125  *    ->private_lock            (zap_pte_range->block_dirty_folio)
 126  */
 127
 128 static void page_cache_delete(struct address_space *mapping,
 129                                    struct folio *folio, void *shadow)
 130 {
 131         XA_STATE(xas, &mapping->i_pages, folio->index);
 132         long nr = 1;
 133
 134         mapping_set_update(&xas, mapping);
 135
 136         xas_set_order(&xas, folio->index, folio_order(folio));
 137         nr = folio_nr_pages(folio);
 138
 139         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 140
 141         xas_store(&xas, shadow);
 142         xas_init_marks(&xas);
 143
 144         folio->mapping = NULL;
 145         /* Leave page->index set: truncation lookup relies upon it */
 146         mapping->nrpages -= nr;
 147 }
 148
 149 static void filemap_unaccount_folio(struct address_space *mapping,
 150                 struct folio *folio)
 151 {
 152         long nr;
 153
 154         VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
 155         if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
 156                 pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
 157                          current->comm, folio_pfn(folio));
 158                 dump_page(&folio->page, "still mapped when deleted");
 159                 dump_stack();
 160                 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 161
 162                 if (mapping_exiting(mapping) && !folio_test_large(folio)) {
 163                         int mapcount = folio_mapcount(folio);
 164
 165                         if (folio_ref_count(folio) >= mapcount + 2) {
 166                                 /*
 167                                  * All vmas have already been torn down, so it's
 168                                  * a good bet that actually the page is unmapped
 169                                  * and we'd rather not leak it: if we're wrong,
 170                                  * another bad page check should catch it later.
 171                                  */
 172                                 atomic_set(&folio->_mapcount, -1);
 173                                 folio_ref_sub(folio, mapcount);
 174                         }
 175                 }
 176         }
 177
 178         /* hugetlb folios do not participate in page cache accounting. */
 179         if (folio_test_hugetlb(folio))
 180                 return;
 181
 182         nr = folio_nr_pages(folio);
 183
 184         __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
 185         if (folio_test_swapbacked(folio)) {
 186                 __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
 187                 if (folio_test_pmd_mappable(folio))
 188                         __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
 189         } else if (folio_test_pmd_mappable(folio)) {
 190                 __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
 191                 filemap_nr_thps_dec(mapping);
 192         }
 193
 194         /*
 195          * At this point folio must be either written or cleaned by
 196          * truncate.  Dirty folio here signals a bug and loss of
 197          * unwritten data - on ordinary filesystems.
 198          *
 199          * But it's harmless on in-memory filesystems like tmpfs; and can
 200          * occur when a driver which did get_user_pages() sets page dirty
 201          * before putting it, while the inode is being finally evicted.
 202          *
 203          * Below fixes dirty accounting after removing the folio entirely
 204          * but leaves the dirty flag set: it has no effect for truncated
 205          * folio and anyway will be cleared before returning folio to
 206          * buddy allocator.
 207          */
 208         if (WARN_ON_ONCE(folio_test_dirty(folio) &&
 209                          mapping_can_writeback(mapping)))
 210                 folio_account_cleaned(folio, inode_to_wb(mapping->host));
 211 }
 212
 213 /*
 214  * Delete a page from the page cache and free it. Caller has to make
 215  * sure the page is locked and that nobody else uses it - or that usage
 216  * is safe.  The caller must hold the i_pages lock.
 217  */
 218 void __filemap_remove_folio(struct folio *folio, void *shadow)
 219 {
 220         struct address_space *mapping = folio->mapping;
 221
 222         trace_mm_filemap_delete_from_page_cache(folio);
 223         filemap_unaccount_folio(mapping, folio);
 224         page_cache_delete(mapping, folio, shadow);
 225 }
 226
 227 void filemap_free_folio(struct address_space *mapping, struct folio *folio)
 228 {
 229         void (*free_folio)(struct folio *);
 230         int refs = 1;
 231
 232         free_folio = mapping->a_ops->free_folio;
 233         if (free_folio)
 234                 free_folio(folio);
 235
 236         if (folio_test_large(folio))
 237                 refs = folio_nr_pages(folio);
 238         folio_put_refs(folio, refs);
 239 }
 240
 241 /**
 242  * filemap_remove_folio - Remove folio from page cache.
 243  * @folio: The folio.
 244  *
 245  * This must be called only on folios that are locked and have been
 246  * verified to be in the page cache.  It will never put the folio into
 247  * the free list because the caller has a reference on the page.
 248  */
 249 void filemap_remove_folio(struct folio *folio)
 250 {
 251         struct address_space *mapping = folio->mapping;
 252
 253         BUG_ON(!folio_test_locked(folio));
 254         spin_lock(&mapping->host->i_lock);
 255         xa_lock_irq(&mapping->i_pages);
 256         __filemap_remove_folio(folio, NULL);
 257         xa_unlock_irq(&mapping->i_pages);
 258         if (mapping_shrinkable(mapping))
 259                 inode_add_lru(mapping->host);
 260         spin_unlock(&mapping->host->i_lock);
 261
 262         filemap_free_folio(mapping, folio);
 263 }
 264
 265 /*
 266  * page_cache_delete_batch - delete several folios from page cache
 267  * @mapping: the mapping to which folios belong
 268  * @fbatch: batch of folios to delete
 269  *
 270  * The function walks over mapping->i_pages and removes folios passed in
 271  * @fbatch from the mapping. The function expects @fbatch to be sorted
 272  * by page index and is optimised for it to be dense.
 273  * It tolerates holes in @fbatch (mapping entries at those indices are not
 274  * modified).
 275  *
 276  * The function expects the i_pages lock to be held.
 277  */
 278 static void page_cache_delete_batch(struct address_space *mapping,
 279                              struct folio_batch *fbatch)
 280 {
 281         XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
 282         long total_pages = 0;
 283         int i = 0;
 284         struct folio *folio;
 285
 286         mapping_set_update(&xas, mapping);
 287         xas_for_each(&xas, folio, ULONG_MAX) {
 288                 if (i >= folio_batch_count(fbatch))
 289                         break;
 290
 291                 /* A swap/dax/shadow entry got inserted? Skip it. */
 292                 if (xa_is_value(folio))
 293                         continue;
 294                 /*
 295                  * A page got inserted in our range? Skip it. We have our
 296                  * pages locked so they are protected from being removed.
 297                  * If we see a page whose index is higher than ours, it
 298                  * means our page has been removed, which shouldn't be
 299                  * possible because we're holding the PageLock.
 300                  */
 301                 if (folio != fbatch->folios[i]) {
 302                         VM_BUG_ON_FOLIO(folio->index >
 303                                         fbatch->folios[i]->index, folio);
 304                         continue;
 305                 }
 306
 307                 WARN_ON_ONCE(!folio_test_locked(folio));
 308
 309                 folio->mapping = NULL;
 310                 /* Leave folio->index set: truncation lookup relies on it */
 311
 312                 i++;
 313                 xas_store(&xas, NULL);
 314                 total_pages += folio_nr_pages(folio);
 315         }
 316         mapping->nrpages -= total_pages;
 317 }
 318
 319 void delete_from_page_cache_batch(struct address_space *mapping,
 320                                   struct folio_batch *fbatch)
 321 {
 322         int i;
 323
 324         if (!folio_batch_count(fbatch))
 325                 return;
 326
 327         spin_lock(&mapping->host->i_lock);
 328         xa_lock_irq(&mapping->i_pages);
 329         for (i = 0; i < folio_batch_count(fbatch); i++) {
 330                 struct folio *folio = fbatch->folios[i];
 331
 332                 trace_mm_filemap_delete_from_page_cache(folio);
 333                 filemap_unaccount_folio(mapping, folio);
 334         }
 335         page_cache_delete_batch(mapping, fbatch);
 336         xa_unlock_irq(&mapping->i_pages);
 337         if (mapping_shrinkable(mapping))
 338                 inode_add_lru(mapping->host);
 339         spin_unlock(&mapping->host->i_lock);
 340
 341         for (i = 0; i < folio_batch_count(fbatch); i++)
 342                 filemap_free_folio(mapping, fbatch->folios[i]);
 343 }
 344
 345 int filemap_check_errors(struct address_space *mapping)
 346 {
 347         int ret = 0;
 348         /* Check for outstanding write errors */
 349         if (test_bit(AS_ENOSPC, &mapping->flags) &&
 350             test_and_clear_bit(AS_ENOSPC, &mapping->flags))
 351                 ret = -ENOSPC;
 352         if (test_bit(AS_EIO, &mapping->flags) &&
 353             test_and_clear_bit(AS_EIO, &mapping->flags))
 354                 ret = -EIO;
 355         return ret;
 356 }
 357 EXPORT_SYMBOL(filemap_check_errors);
 358
 359 static int filemap_check_and_keep_errors(struct address_space *mapping)
 360 {
 361         /* Check for outstanding write errors */
 362         if (test_bit(AS_EIO, &mapping->flags))
 363                 return -EIO;
 364         if (test_bit(AS_ENOSPC, &mapping->flags))
 365                 return -ENOSPC;
 366         return 0;
 367 }
 368
 369 /**
 370  * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
 371  * @mapping:    address space structure to write
 372  * @wbc:        the writeback_control controlling the writeout
 373  *
 374  * Call writepages on the mapping using the provided wbc to control the
 375  * writeout.
 376  *
 377  * Return: %0 on success, negative error code otherwise.
 378  */
 379 int filemap_fdatawrite_wbc(struct address_space *mapping,
 380                            struct writeback_control *wbc)
 381 {
 382         int ret;
 383
 384         if (!mapping_can_writeback(mapping) ||
 385             !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 386                 return 0;
 387
 388         wbc_attach_fdatawrite_inode(wbc, mapping->host);
 389         ret = do_writepages(mapping, wbc);
 390         wbc_detach_inode(wbc);
 391         return ret;
 392 }
 393 EXPORT_SYMBOL(filemap_fdatawrite_wbc);
 394
 395 /**
 396  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 397  * @mapping:    address space structure to write
 398  * @start:      offset in bytes where the range starts
 399  * @end:        offset in bytes where the range ends (inclusive)
 400  * @sync_mode:  enable synchronous operation
 401  *
 402  * Start writeback against all of a mapping's dirty pages that lie
 403  * within the byte offsets <start, end> inclusive.
 404  *
 405  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 406  * opposed to a regular memory cleansing writeback.  The difference between
 407  * these two operations is that if a dirty page/buffer is encountered, it must
 408  * be waited upon, and not just skipped over.
 409  *
 410  * Return: %0 on success, negative error code otherwise.
 411  */
 412 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 413                                 loff_t end, int sync_mode)
 414 {
 415         struct writeback_control wbc = {
 416                 .sync_mode = sync_mode,
 417                 .nr_to_write = LONG_MAX,
 418                 .range_start = start,
 419                 .range_end = end,
 420         };
 421
 422         return filemap_fdatawrite_wbc(mapping, &wbc);
 423 }
 424
 425 static inline int __filemap_fdatawrite(struct address_space *mapping,
 426         int sync_mode)
 427 {
 428         return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
 429 }
 430
 431 int filemap_fdatawrite(struct address_space *mapping)
 432 {
 433         return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
 434 }
 435 EXPORT_SYMBOL(filemap_fdatawrite);
 436
 437 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 438                                 loff_t end)
 439 {
 440         return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 441 }
 442 EXPORT_SYMBOL(filemap_fdatawrite_range);
 443
 444 /**
 445  * filemap_fdatawrite_range_kick - start writeback on a range
 446  * @mapping:    target address_space
 447  * @start:      index to start writeback on
 448  * @end:        last (non-inclusive) index for writeback
 449  *
 450  * This is a non-integrity writeback helper, to start writing back folios
 451  * for the indicated range.
 452  *
 453  * Return: %0 on success, negative error code otherwise.
 454  */
 455 int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
 456                                   loff_t end)
 457 {
 458         return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
 459 }
 460 EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
 461
 462 /**
 463  * filemap_flush - mostly a non-blocking flush
 464  * @mapping:    target address_space
 465  *
 466  * This is a mostly non-blocking flush.  Not suitable for data-integrity
 467  * purposes - I/O may not be started against all dirty pages.
 468  *
 469  * Return: %0 on success, negative error code otherwise.
 470  */
 471 int filemap_flush(struct address_space *mapping)
 472 {
 473         return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
 474 }
 475 EXPORT_SYMBOL(filemap_flush);
 476
 477 /**
 478  * filemap_range_has_page - check if a page exists in range.
 479  * @mapping:           address space within which to check
 480  * @start_byte:        offset in bytes where the range starts
 481  * @end_byte:          offset in bytes where the range ends (inclusive)
 482  *
 483  * Find at least one page in the range supplied, usually used to check if
 484  * direct writing in this range will trigger a writeback.
 485  *
 486  * Return: %true if at least one page exists in the specified range,
 487  * %false otherwise.
 488  */
 489 bool filemap_range_has_page(struct address_space *mapping,
 490                            loff_t start_byte, loff_t end_byte)
 491 {
 492         struct folio *folio;
 493         XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
 494         pgoff_t max = end_byte >> PAGE_SHIFT;
 495
 496         if (end_byte < start_byte)
 497                 return false;
 498
 499         rcu_read_lock();
 500         for (;;) {
 501                 folio = xas_find(&xas, max);
 502                 if (xas_retry(&xas, folio))
 503                         continue;
 504                 /* Shadow entries don't count */
 505                 if (xa_is_value(folio))
 506                         continue;
 507                 /*
 508                  * We don't need to try to pin this page; we're about to
 509                  * release the RCU lock anyway.  It is enough to know that
 510                  * there was a page here recently.
 511                  */
 512                 break;
 513         }
 514         rcu_read_unlock();
 515
 516         return folio != NULL;
 517 }
 518 EXPORT_SYMBOL(filemap_range_has_page);
 519
 520 static void __filemap_fdatawait_range(struct address_space *mapping,
 521                                      loff_t start_byte, loff_t end_byte)
 522 {
 523         pgoff_t index = start_byte >> PAGE_SHIFT;
 524         pgoff_t end = end_byte >> PAGE_SHIFT;
 525         struct folio_batch fbatch;
 526         unsigned nr_folios;
 527
 528         folio_batch_init(&fbatch);
 529
 530         while (index <= end) {
 531                 unsigned i;
 532
 533                 nr_folios = filemap_get_folios_tag(mapping, &index, end,
 534                                 PAGECACHE_TAG_WRITEBACK, &fbatch);
 535
 536                 if (!nr_folios)
 537                         break;
 538
 539                 for (i = 0; i < nr_folios; i++) {
 540                         struct folio *folio = fbatch.folios[i];
 541
 542                         folio_wait_writeback(folio);
 543                 }
 544                 folio_batch_release(&fbatch);
 545                 cond_resched();
 546         }
 547 }
 548
 549 /**
 550  * filemap_fdatawait_range - wait for writeback to complete
 551  * @mapping:            address space structure to wait for
 552  * @start_byte:         offset in bytes where the range starts
 553  * @end_byte:           offset in bytes where the range ends (inclusive)
 554  *
 555  * Walk the list of under-writeback pages of the given address space
 556  * in the given range and wait for all of them.  Check error status of
 557  * the address space and return it.
 558  *
 559  * Since the error status of the address space is cleared by this function,
 560  * callers are responsible for checking the return value and handling and/or
 561  * reporting the error.
 562  *
 563  * Return: error status of the address space.
 564  */
 565 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
 566                             loff_t end_byte)
 567 {
 568         __filemap_fdatawait_range(mapping, start_byte, end_byte);
 569         return filemap_check_errors(mapping);
 570 }
 571 EXPORT_SYMBOL(filemap_fdatawait_range);
 572
 573 /**
 574  * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 575  * @mapping:            address space structure to wait for
 576  * @start_byte:         offset in bytes where the range starts
 577  * @end_byte:           offset in bytes where the range ends (inclusive)
 578  *
 579  * Walk the list of under-writeback pages of the given address space in the
 580  * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
 581  * this function does not clear error status of the address space.
 582  *
 583  * Use this function if callers don't handle errors themselves.  Expected
 584  * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 585  * fsfreeze(8)
 586  */
 587 int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
 588                 loff_t start_byte, loff_t end_byte)
 589 {
 590         __filemap_fdatawait_range(mapping, start_byte, end_byte);
 591         return filemap_check_and_keep_errors(mapping);
 592 }
 593 EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
 594
 595 /**
 596  * file_fdatawait_range - wait for writeback to complete
 597  * @file:               file pointing to address space structure to wait for
 598  * @start_byte:         offset in bytes where the range starts
 599  * @end_byte:           offset in bytes where the range ends (inclusive)
 600  *
 601  * Walk the list of under-writeback pages of the address space that file
 602  * refers to, in the given range and wait for all of them.  Check error
 603  * status of the address space vs. the file->f_wb_err cursor and return it.
 604  *
 605  * Since the error status of the file is advanced by this function,
 606  * callers are responsible for checking the return value and handling and/or
 607  * reporting the error.
 608  *
 609  * Return: error status of the address space vs. the file->f_wb_err cursor.
 610  */
 611 int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
 612 {
 613         struct address_space *mapping = file->f_mapping;
 614
 615         __filemap_fdatawait_range(mapping, start_byte, end_byte);
 616         return file_check_and_advance_wb_err(file);
 617 }
 618 EXPORT_SYMBOL(file_fdatawait_range);
 619
 620 /**
 621  * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 622  * @mapping: address space structure to wait for
 623  *
 624  * Walk the list of under-writeback pages of the given address space
 625  * and wait for all of them.  Unlike filemap_fdatawait(), this function
 626  * does not clear error status of the address space.
 627  *
 628  * Use this function if callers don't handle errors themselves.  Expected
 629  * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 630  * fsfreeze(8)
 631  *
 632  * Return: error status of the address space.
 633  */
 634 int filemap_fdatawait_keep_errors(struct address_space *mapping)
 635 {
 636         __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
 637         return filemap_check_and_keep_errors(mapping);
 638 }
 639 EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
 640
 641 /* Returns true if writeback might be needed or already in progress. */
 642 static bool mapping_needs_writeback(struct address_space *mapping)
 643 {
 644         return mapping->nrpages;
 645 }
 646
 647 bool filemap_range_has_writeback(struct address_space *mapping,
 648                                  loff_t start_byte, loff_t end_byte)
 649 {
 650         XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
 651         pgoff_t max = end_byte >> PAGE_SHIFT;
 652         struct folio *folio;
 653
 654         if (end_byte < start_byte)
 655                 return false;
 656
 657         rcu_read_lock();
 658         xas_for_each(&xas, folio, max) {
 659                 if (xas_retry(&xas, folio))
 660                         continue;
 661                 if (xa_is_value(folio))
 662                         continue;
 663                 if (folio_test_dirty(folio) || folio_test_locked(folio) ||
 664                                 folio_test_writeback(folio))
 665                         break;
 666         }
 667         rcu_read_unlock();
 668         return folio != NULL;
 669 }
 670 EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
 671
 672 /**
 673  * filemap_write_and_wait_range - write out & wait on a file range
 674  * @mapping:    the address_space for the pages
 675  * @lstart:     offset in bytes where the range starts
 676  * @lend:       offset in bytes where the range ends (inclusive)
 677  *
 678  * Write out and wait upon file offsets lstart->lend, inclusive.
 679  *
 680  * Note that @lend is inclusive (describes the last byte to be written) so
 681  * that this function can be used to write to the very end-of-file (end = -1).
 682  *
 683  * Return: error status of the address space.
 684  */
 685 int filemap_write_and_wait_range(struct address_space *mapping,
 686                                  loff_t lstart, loff_t lend)
 687 {
 688         int err = 0, err2;
 689
 690         if (lend < lstart)
 691                 return 0;
 692
 693         if (mapping_needs_writeback(mapping)) {
 694                 err = __filemap_fdatawrite_range(mapping, lstart, lend,
 695                                                  WB_SYNC_ALL);
 696                 /*
 697                  * Even if the above returned error, the pages may be
 698                  * written partially (e.g. -ENOSPC), so we wait for it.
 699                  * But the -EIO is special case, it may indicate the worst
 700                  * thing (e.g. bug) happened, so we avoid waiting for it.
 701                  */
 702                 if (err != -EIO)
 703                         __filemap_fdatawait_range(mapping, lstart, lend);
 704         }
 705         err2 = filemap_check_errors(mapping);
 706         if (!err)
 707                 err = err2;
 708         return err;
 709 }
 710 EXPORT_SYMBOL(filemap_write_and_wait_range);
 711
 712 void __filemap_set_wb_err(struct address_space *mapping, int err)
 713 {
 714         errseq_t eseq = errseq_set(&mapping->wb_err, err);
 715
 716         trace_filemap_set_wb_err(mapping, eseq);
 717 }
 718 EXPORT_SYMBOL(__filemap_set_wb_err);
 719
 720 /**
 721  * file_check_and_advance_wb_err - report wb error (if any) that was previously
 722  *                                 and advance wb_err to current one
 723  * @file: struct file on which the error is being reported
 724  *
 725  * When userland calls fsync (or something like nfsd does the equivalent), we
 726  * want to report any writeback errors that occurred since the last fsync (or
 727  * since the file was opened if there haven't been any).
 728  *
 729  * Grab the wb_err from the mapping. If it matches what we have in the file,
 730  * then just quickly return 0. The file is all caught up.
 731  *
 732  * If it doesn't match, then take the mapping value, set the "seen" flag in
 733  * it and try to swap it into place. If it works, or another task beat us
 734  * to it with the new value, then update the f_wb_err and return the error
 735  * portion. The error at this point must be reported via proper channels
 736  * (a'la fsync, or NFS COMMIT operation, etc.).
 737  *
 738  * While we handle mapping->wb_err with atomic operations, the f_wb_err
 739  * value is protected by the f_lock since we must ensure that it reflects
 740  * the latest value swapped in for this file descriptor.
 741  *
 742  * Return: %0 on success, negative error code otherwise.
 743  */
 744 int file_check_and_advance_wb_err(struct file *file)
 745 {
 746         int err = 0;
 747         errseq_t old = READ_ONCE(file->f_wb_err);
 748         struct address_space *mapping = file->f_mapping;
 749
 750         /* Locklessly handle the common case where nothing has changed */
 751         if (errseq_check(&mapping->wb_err, old)) {
 752                 /* Something changed, must use slow path */
 753                 spin_lock(&file->f_lock);
 754                 old = file->f_wb_err;
 755                 err = errseq_check_and_advance(&mapping->wb_err,
 756                                                 &file->f_wb_err);
 757                 trace_file_check_and_advance_wb_err(file, old);
 758                 spin_unlock(&file->f_lock);
 759         }
 760
 761         /*
 762          * We're mostly using this function as a drop in replacement for
 763          * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
 764          * that the legacy code would have had on these flags.
 765          */
 766         clear_bit(AS_EIO, &mapping->flags);
 767         clear_bit(AS_ENOSPC, &mapping->flags);
 768         return err;
 769 }
 770 EXPORT_SYMBOL(file_check_and_advance_wb_err);
 771
 772 /**
 773  * file_write_and_wait_range - write out & wait on a file range
 774  * @file:       file pointing to address_space with pages
 775  * @lstart:     offset in bytes where the range starts
 776  * @lend:       offset in bytes where the range ends (inclusive)
 777  *
 778  * Write out and wait upon file offsets lstart->lend, inclusive.
 779  *
 780  * Note that @lend is inclusive (describes the last byte to be written) so
 781  * that this function can be used to write to the very end-of-file (end = -1).
 782  *
 783  * After writing out and waiting on the data, we check and advance the
 784  * f_wb_err cursor to the latest value, and return any errors detected there.
 785  *
 786  * Return: %0 on success, negative error code otherwise.
 787  */
 788 int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
 789 {
 790         int err = 0, err2;
 791         struct address_space *mapping = file->f_mapping;
 792
 793         if (lend < lstart)
 794                 return 0;
 795
 796         if (mapping_needs_writeback(mapping)) {
 797                 err = __filemap_fdatawrite_range(mapping, lstart, lend,
 798                                                  WB_SYNC_ALL);
 799                 /* See comment of filemap_write_and_wait() */
 800                 if (err != -EIO)
 801                         __filemap_fdatawait_range(mapping, lstart, lend);
 802         }
 803         err2 = file_check_and_advance_wb_err(file);
 804         if (!err)
 805                 err = err2;
 806         return err;
 807 }
 808 EXPORT_SYMBOL(file_write_and_wait_range);
 809
 810 /**
 811  * replace_page_cache_folio - replace a pagecache folio with a new one
 812  * @old:        folio to be replaced
 813  * @new:        folio to replace with
 814  *
 815  * This function replaces a folio in the pagecache with a new one.  On
 816  * success it acquires the pagecache reference for the new folio and
 817  * drops it for the old folio.  Both the old and new folios must be
 818  * locked.  This function does not add the new folio to the LRU, the
 819  * caller must do that.
 820  *
 821  * The remove + add is atomic.  This function cannot fail.
 822  */
 823 void replace_page_cache_folio(struct folio *old, struct folio *new)
 824 {
 825         struct address_space *mapping = old->mapping;
 826         void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
 827         pgoff_t offset = old->index;
 828         XA_STATE(xas, &mapping->i_pages, offset);
 829
 830         VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
 831         VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
 832         VM_BUG_ON_FOLIO(new->mapping, new);
 833
 834         folio_get(new);
 835         new->mapping = mapping;
 836         new->index = offset;
 837
 838         mem_cgroup_replace_folio(old, new);
 839
 840         xas_lock_irq(&xas);
 841         xas_store(&xas, new);
 842
 843         old->mapping = NULL;
 844         /* hugetlb pages do not participate in page cache accounting. */
 845         if (!folio_test_hugetlb(old))
 846                 __lruvec_stat_sub_folio(old, NR_FILE_PAGES);
 847         if (!folio_test_hugetlb(new))
 848                 __lruvec_stat_add_folio(new, NR_FILE_PAGES);
 849         if (folio_test_swapbacked(old))
 850                 __lruvec_stat_sub_folio(old, NR_SHMEM);
 851         if (folio_test_swapbacked(new))
 852                 __lruvec_stat_add_folio(new, NR_SHMEM);
 853         xas_unlock_irq(&xas);
 854         if (free_folio)
 855                 free_folio(old);
 856         folio_put(old);
 857 }
 858 EXPORT_SYMBOL_GPL(replace_page_cache_folio);
 859
 860 noinline int __filemap_add_folio(struct address_space *mapping,
 861                 struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
 862 {
 863         XA_STATE(xas, &mapping->i_pages, index);
 864         void *alloced_shadow = NULL;
 865         int alloced_order = 0;
 866         bool huge;
 867         long nr;
 868
 869         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 870         VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
 871         VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
 872                         folio);
 873         mapping_set_update(&xas, mapping);
 874
 875         VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
 876         xas_set_order(&xas, index, folio_order(folio));
 877         huge = folio_test_hugetlb(folio);
 878         nr = folio_nr_pages(folio);
 879
 880         gfp &= GFP_RECLAIM_MASK;
 881         folio_ref_add(folio, nr);
 882         folio->mapping = mapping;
 883         folio->index = xas.xa_index;
 884
 885         for (;;) {
 886                 int order = -1, split_order = 0;
 887                 void *entry, *old = NULL;
 888
 889                 xas_lock_irq(&xas);
 890                 xas_for_each_conflict(&xas, entry) {
 891                         old = entry;
 892                         if (!xa_is_value(entry)) {
 893                                 xas_set_err(&xas, -EEXIST);
 894                                 goto unlock;
 895                         }
 896                         /*
 897                          * If a larger entry exists,
 898                          * it will be the first and only entry iterated.
 899                          */
 900                         if (order == -1)
 901                                 order = xas_get_order(&xas);
 902                 }
 903
 904                 /* entry may have changed before we re-acquire the lock */
 905                 if (alloced_order && (old != alloced_shadow || order != alloced_order)) {
 906                         xas_destroy(&xas);
 907                         alloced_order = 0;
 908                 }
 909
 910                 if (old) {
 911                         if (order > 0 && order > folio_order(folio)) {
 912                                 /* How to handle large swap entries? */
 913                                 BUG_ON(shmem_mapping(mapping));
 914                                 if (!alloced_order) {
 915                                         split_order = order;
 916                                         goto unlock;
 917                                 }
 918                                 xas_split(&xas, old, order);
 919                                 xas_reset(&xas);
 920                         }
 921                         if (shadowp)
 922                                 *shadowp = old;
 923                 }
 924
 925                 xas_store(&xas, folio);
 926                 if (xas_error(&xas))
 927                         goto unlock;
 928
 929                 mapping->nrpages += nr;
 930
 931                 /* hugetlb pages do not participate in page cache accounting */
 932                 if (!huge) {
 933                         __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
 934                         if (folio_test_pmd_mappable(folio))
 935                                 __lruvec_stat_mod_folio(folio,
 936                                                 NR_FILE_THPS, nr);
 937                 }
 938
 939 unlock:
 940                 xas_unlock_irq(&xas);
 941
 942                 /* split needed, alloc here and retry. */
 943                 if (split_order) {
 944                         xas_split_alloc(&xas, old, split_order, gfp);
 945                         if (xas_error(&xas))
 946                                 goto error;
 947                         alloced_shadow = old;
 948                         alloced_order = split_order;
 949                         xas_reset(&xas);
 950                         continue;
 951                 }
 952
 953                 if (!xas_nomem(&xas, gfp))
 954                         break;
 955         }
 956
 957         if (xas_error(&xas))
 958                 goto error;
 959
 960         trace_mm_filemap_add_to_page_cache(folio);
 961         return 0;
 962 error:
 963         folio->mapping = NULL;
 964         /* Leave page->index set: truncation relies upon it */
 965         folio_put_refs(folio, nr);
 966         return xas_error(&xas);
 967 }
 968 ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
 969
 970 int filemap_add_folio(struct address_space *mapping, struct folio *folio,
 971                                 pgoff_t index, gfp_t gfp)
 972 {
 973         void *shadow = NULL;
 974         int ret;
 975
 976         ret = mem_cgroup_charge(folio, NULL, gfp);
 977         if (ret)
 978                 return ret;
 979
 980         __folio_set_locked(folio);
 981         ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
 982         if (unlikely(ret)) {
 983                 mem_cgroup_uncharge(folio);
 984                 __folio_clear_locked(folio);
 985         } else {
 986                 /*
 987                  * The folio might have been evicted from cache only
 988                  * recently, in which case it should be activated like
 989                  * any other repeatedly accessed folio.
 990                  * The exception is folios getting rewritten; evicting other
 991                  * data from the working set, only to cache data that will
 992                  * get overwritten with something else, is a waste of memory.
 993                  */
 994                 WARN_ON_ONCE(folio_test_active(folio));
 995                 if (!(gfp & __GFP_WRITE) && shadow)
 996                         workingset_refault(folio, shadow);
 997                 folio_add_lru(folio);
 998         }
 999         return ret;
1000 }
1001 EXPORT_SYMBOL_GPL(filemap_add_folio);
1002
1003 #ifdef CONFIG_NUMA
1004 struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
1005 {
1006         int n;
1007         struct folio *folio;
1008
1009         if (cpuset_do_page_mem_spread()) {
1010                 unsigned int cpuset_mems_cookie;
1011                 do {
1012                         cpuset_mems_cookie = read_mems_allowed_begin();
1013                         n = cpuset_mem_spread_node();
1014                         folio = __folio_alloc_node_noprof(gfp, order, n);
1015                 } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
1016
1017                 return folio;
1018         }
1019         return folio_alloc_noprof(gfp, order);
1020 }
1021 EXPORT_SYMBOL(filemap_alloc_folio_noprof);
1022 #endif
1023
1024 /*
1025  * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
1026  *
1027  * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
1028  *
1029  * @mapping1: the first mapping to lock
1030  * @mapping2: the second mapping to lock
1031  */
1032 void filemap_invalidate_lock_two(struct address_space *mapping1,
1033                                  struct address_space *mapping2)
1034 {
1035         if (mapping1 > mapping2)
1036                 swap(mapping1, mapping2);
1037         if (mapping1)
1038                 down_write(&mapping1->invalidate_lock);
1039         if (mapping2 && mapping1 != mapping2)
1040                 down_write_nested(&mapping2->invalidate_lock, 1);
1041 }
1042 EXPORT_SYMBOL(filemap_invalidate_lock_two);
1043
1044 /*
1045  * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
1046  *
1047  * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
1048  *
1049  * @mapping1: the first mapping to unlock
1050  * @mapping2: the second mapping to unlock
1051  */
1052 void filemap_invalidate_unlock_two(struct address_space *mapping1,
1053                                    struct address_space *mapping2)
1054 {
1055         if (mapping1)
1056                 up_write(&mapping1->invalidate_lock);
1057         if (mapping2 && mapping1 != mapping2)
1058                 up_write(&mapping2->invalidate_lock);
1059 }
1060 EXPORT_SYMBOL(filemap_invalidate_unlock_two);
1061
1062 /*
1063  * In order to wait for pages to become available there must be
1064  * waitqueues associated with pages. By using a hash table of
1065  * waitqueues where the bucket discipline is to maintain all
1066  * waiters on the same queue and wake all when any of the pages
1067  * become available, and for the woken contexts to check to be
1068  * sure the appropriate page became available, this saves space
1069  * at a cost of "thundering herd" phenomena during rare hash
1070  * collisions.
1071  */
1072 #define PAGE_WAIT_TABLE_BITS 8
1073 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
1074 static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
1075
1076 static wait_queue_head_t *folio_waitqueue(struct folio *folio)
1077 {
1078         return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
1079 }
1080
1081 void __init pagecache_init(void)
1082 {
1083         int i;
1084
1085         for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
1086                 init_waitqueue_head(&folio_wait_table[i]);
1087
1088         page_writeback_init();
1089 }
1090
1091 /*
1092  * The page wait code treats the "wait->flags" somewhat unusually, because
1093  * we have multiple different kinds of waits, not just the usual "exclusive"
1094  * one.
1095  *
1096  * We have:
1097  *
1098  *  (a) no special bits set:
1099  *
1100  *      We're just waiting for the bit to be released, and when a waker
1101  *      calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
1102  *      and remove it from the wait queue.
1103  *
1104  *      Simple and straightforward.
1105  *
1106  *  (b) WQ_FLAG_EXCLUSIVE:
1107  *
1108  *      The waiter is waiting to get the lock, and only one waiter should
1109  *      be woken up to avoid any thundering herd behavior. We'll set the
1110  *      WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
1111  *
1112  *      This is the traditional exclusive wait.
1113  *
1114  *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
1115  *
1116  *      The waiter is waiting to get the bit, and additionally wants the
1117  *      lock to be transferred to it for fair lock behavior. If the lock
1118  *      cannot be taken, we stop walking the wait queue without waking
1119  *      the waiter.
1120  *
1121  *      This is the "fair lock handoff" case, and in addition to setting
1122  *      WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
1123  *      that it now has the lock.
1124  */
1125 static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
1126 {
1127         unsigned int flags;
1128         struct wait_page_key *key = arg;
1129         struct wait_page_queue *wait_page
1130                 = container_of(wait, struct wait_page_queue, wait);
1131
1132         if (!wake_page_match(wait_page, key))
1133                 return 0;
1134
1135         /*
1136          * If it's a lock handoff wait, we get the bit for it, and
1137          * stop walking (and do not wake it up) if we can't.
1138          */
1139         flags = wait->flags;
1140         if (flags & WQ_FLAG_EXCLUSIVE) {
1141                 if (test_bit(key->bit_nr, &key->folio->flags))
1142                         return -1;
1143                 if (flags & WQ_FLAG_CUSTOM) {
1144                         if (test_and_set_bit(key->bit_nr, &key->folio->flags))
1145                                 return -1;
1146                         flags |= WQ_FLAG_DONE;
1147                 }
1148         }
1149
1150         /*
1151          * We are holding the wait-queue lock, but the waiter that
1152          * is waiting for this will be checking the flags without
1153          * any locking.
1154          *
1155          * So update the flags atomically, and wake up the waiter
1156          * afterwards to avoid any races. This store-release pairs
1157          * with the load-acquire in folio_wait_bit_common().
1158          */
1159         smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
1160         wake_up_state(wait->private, mode);
1161
1162         /*
1163          * Ok, we have successfully done what we're waiting for,
1164          * and we can unconditionally remove the wait entry.
1165          *
1166          * Note that this pairs with the "finish_wait()" in the
1167          * waiter, and has to be the absolute last thing we do.
1168          * After this list_del_init(&wait->entry) the wait entry
1169          * might be de-allocated and the process might even have
1170          * exited.
1171          */
1172         list_del_init_careful(&wait->entry);
1173         return (flags & WQ_FLAG_EXCLUSIVE) != 0;
1174 }
1175
1176 static void folio_wake_bit(struct folio *folio, int bit_nr)
1177 {
1178         wait_queue_head_t *q = folio_waitqueue(folio);
1179         struct wait_page_key key;
1180         unsigned long flags;
1181
1182         key.folio = folio;
1183         key.bit_nr = bit_nr;
1184         key.page_match = 0;
1185
1186         spin_lock_irqsave(&q->lock, flags);
1187         __wake_up_locked_key(q, TASK_NORMAL, &key);
1188
1189         /*
1190          * It's possible to miss clearing waiters here, when we woke our page
1191          * waiters, but the hashed waitqueue has waiters for other pages on it.
1192          * That's okay, it's a rare case. The next waker will clear it.
1193          *
1194          * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
1195          * other), the flag may be cleared in the course of freeing the page;
1196          * but that is not required for correctness.
1197          */
1198         if (!waitqueue_active(q) || !key.page_match)
1199                 folio_clear_waiters(folio);
1200
1201         spin_unlock_irqrestore(&q->lock, flags);
1202 }
1203
1204 /*
1205  * A choice of three behaviors for folio_wait_bit_common():
1206  */
1207 enum behavior {
1208         EXCLUSIVE,      /* Hold ref to page and take the bit when woken, like
1209                          * __folio_lock() waiting on then setting PG_locked.
1210                          */
1211         SHARED,         /* Hold ref to page and check the bit when woken, like
1212                          * folio_wait_writeback() waiting on PG_writeback.
1213                          */
1214         DROP,           /* Drop ref to page before wait, no check when woken,
1215                          * like folio_put_wait_locked() on PG_locked.
1216                          */
1217 };
1218
1219 /*
1220  * Attempt to check (or get) the folio flag, and mark us done
1221  * if successful.
1222  */
1223 static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
1224                                         struct wait_queue_entry *wait)
1225 {
1226         if (wait->flags & WQ_FLAG_EXCLUSIVE) {
1227                 if (test_and_set_bit(bit_nr, &folio->flags))
1228                         return false;
1229         } else if (test_bit(bit_nr, &folio->flags))
1230                 return false;
1231
1232         wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
1233         return true;
1234 }
1235
1236 /* How many times do we accept lock stealing from under a waiter? */
1237 int sysctl_page_lock_unfairness = 5;
1238
1239 static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
1240                 int state, enum behavior behavior)
1241 {
1242         wait_queue_head_t *q = folio_waitqueue(folio);
1243         int unfairness = sysctl_page_lock_unfairness;
1244         struct wait_page_queue wait_page;
1245         wait_queue_entry_t *wait = &wait_page.wait;
1246         bool thrashing = false;
1247         unsigned long pflags;
1248         bool in_thrashing;
1249
1250         if (bit_nr == PG_locked &&
1251             !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
1252                 delayacct_thrashing_start(&in_thrashing);
1253                 psi_memstall_enter(&pflags);
1254                 thrashing = true;
1255         }
1256
1257         init_wait(wait);
1258         wait->func = wake_page_function;
1259         wait_page.folio = folio;
1260         wait_page.bit_nr = bit_nr;
1261
1262 repeat:
1263         wait->flags = 0;
1264         if (behavior == EXCLUSIVE) {
1265                 wait->flags = WQ_FLAG_EXCLUSIVE;
1266                 if (--unfairness < 0)
1267                         wait->flags |= WQ_FLAG_CUSTOM;
1268         }
1269
1270         /*
1271          * Do one last check whether we can get the
1272          * page bit synchronously.
1273          *
1274          * Do the folio_set_waiters() marking before that
1275          * to let any waker we _just_ missed know they
1276          * need to wake us up (otherwise they'll never
1277          * even go to the slow case that looks at the
1278          * page queue), and add ourselves to the wait
1279          * queue if we need to sleep.
1280          *
1281          * This part needs to be done under the queue
1282          * lock to avoid races.
1283          */
1284         spin_lock_irq(&q->lock);
1285         folio_set_waiters(folio);
1286         if (!folio_trylock_flag(folio, bit_nr, wait))
1287                 __add_wait_queue_entry_tail(q, wait);
1288         spin_unlock_irq(&q->lock);
1289
1290         /*
1291          * From now on, all the logic will be based on
1292          * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
1293          * see whether the page bit testing has already
1294          * been done by the wake function.
1295          *
1296          * We can drop our reference to the folio.
1297          */
1298         if (behavior == DROP)
1299                 folio_put(folio);
1300
1301         /*
1302          * Note that until the "finish_wait()", or until
1303          * we see the WQ_FLAG_WOKEN flag, we need to
1304          * be very careful with the 'wait->flags', because
1305          * we may race with a waker that sets them.
1306          */
1307         for (;;) {
1308                 unsigned int flags;
1309
1310                 set_current_state(state);
1311
1312                 /* Loop until we've been woken or interrupted */
1313                 flags = smp_load_acquire(&wait->flags);
1314                 if (!(flags & WQ_FLAG_WOKEN)) {
1315                         if (signal_pending_state(state, current))
1316                                 break;
1317
1318                         io_schedule();
1319                         continue;
1320                 }
1321
1322                 /* If we were non-exclusive, we're done */
1323                 if (behavior != EXCLUSIVE)
1324                         break;
1325
1326                 /* If the waker got the lock for us, we're done */
1327                 if (flags & WQ_FLAG_DONE)
1328                         break;
1329
1330                 /*
1331                  * Otherwise, if we're getting the lock, we need to
1332                  * try to get it ourselves.
1333                  *
1334                  * And if that fails, we'll have to retry this all.
1335                  */
1336                 if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
1337                         goto repeat;
1338
1339                 wait->flags |= WQ_FLAG_DONE;
1340                 break;
1341         }
1342
1343         /*
1344          * If a signal happened, this 'finish_wait()' may remove the last
1345          * waiter from the wait-queues, but the folio waiters bit will remain
1346          * set. That's ok. The next wakeup will take care of it, and trying
1347          * to do it here would be difficult and prone to races.
1348          */
1349         finish_wait(q, wait);
1350
1351         if (thrashing) {
1352                 delayacct_thrashing_end(&in_thrashing);
1353                 psi_memstall_leave(&pflags);
1354         }
1355
1356         /*
1357          * NOTE! The wait->flags weren't stable until we've done the
1358          * 'finish_wait()', and we could have exited the loop above due
1359          * to a signal, and had a wakeup event happen after the signal
1360          * test but before the 'finish_wait()'.
1361          *
1362          * So only after the finish_wait() can we reliably determine
1363          * if we got woken up or not, so we can now figure out the final
1364          * return value based on that state without races.
1365          *
1366          * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
1367          * waiter, but an exclusive one requires WQ_FLAG_DONE.
1368          */
1369         if (behavior == EXCLUSIVE)
1370                 return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
1371
1372         return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
1373 }
1374
1375 #ifdef CONFIG_MIGRATION
1376 /**
1377  * migration_entry_wait_on_locked - Wait for a migration entry to be removed
1378  * @entry: migration swap entry.
1379  * @ptl: already locked ptl. This function will drop the lock.
1380  *
1381  * Wait for a migration entry referencing the given page to be removed. This is
1382  * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
1383  * this can be called without taking a reference on the page. Instead this
1384  * should be called while holding the ptl for the migration entry referencing
1385  * the page.
1386  *
1387  * Returns after unlocking the ptl.
1388  *
1389  * This follows the same logic as folio_wait_bit_common() so see the comments
1390  * there.
1391  */
1392 void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
1393         __releases(ptl)
1394 {
1395         struct wait_page_queue wait_page;
1396         wait_queue_entry_t *wait = &wait_page.wait;
1397         bool thrashing = false;
1398         unsigned long pflags;
1399         bool in_thrashing;
1400         wait_queue_head_t *q;
1401         struct folio *folio = pfn_swap_entry_folio(entry);
1402
1403         q = folio_waitqueue(folio);
1404         if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
1405                 delayacct_thrashing_start(&in_thrashing);
1406                 psi_memstall_enter(&pflags);
1407                 thrashing = true;
1408         }
1409
1410         init_wait(wait);
1411         wait->func = wake_page_function;
1412         wait_page.folio = folio;
1413         wait_page.bit_nr = PG_locked;
1414         wait->flags = 0;
1415
1416         spin_lock_irq(&q->lock);
1417         folio_set_waiters(folio);
1418         if (!folio_trylock_flag(folio, PG_locked, wait))
1419                 __add_wait_queue_entry_tail(q, wait);
1420         spin_unlock_irq(&q->lock);
1421
1422         /*
1423          * If a migration entry exists for the page the migration path must hold
1424          * a valid reference to the page, and it must take the ptl to remove the
1425          * migration entry. So the page is valid until the ptl is dropped.
1426          */
1427         spin_unlock(ptl);
1428
1429         for (;;) {
1430                 unsigned int flags;
1431
1432                 set_current_state(TASK_UNINTERRUPTIBLE);
1433
1434                 /* Loop until we've been woken or interrupted */
1435                 flags = smp_load_acquire(&wait->flags);
1436                 if (!(flags & WQ_FLAG_WOKEN)) {
1437                         if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
1438                                 break;
1439
1440                         io_schedule();
1441                         continue;
1442                 }
1443                 break;
1444         }
1445
1446         finish_wait(q, wait);
1447
1448         if (thrashing) {
1449                 delayacct_thrashing_end(&in_thrashing);
1450                 psi_memstall_leave(&pflags);
1451         }
1452 }
1453 #endif
1454
1455 void folio_wait_bit(struct folio *folio, int bit_nr)
1456 {
1457         folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
1458 }
1459 EXPORT_SYMBOL(folio_wait_bit);
1460
1461 int folio_wait_bit_killable(struct folio *folio, int bit_nr)
1462 {
1463         return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
1464 }
1465 EXPORT_SYMBOL(folio_wait_bit_killable);
1466
1467 /**
1468  * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
1469  * @folio: The folio to wait for.
1470  * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
1471  *
1472  * The caller should hold a reference on @folio.  They expect the page to
1473  * become unlocked relatively soon, but do not wish to hold up migration
1474  * (for example) by holding the reference while waiting for the folio to
1475  * come unlocked.  After this function returns, the caller should not
1476  * dereference @folio.
1477  *
1478  * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
1479  */
1480 static int folio_put_wait_locked(struct folio *folio, int state)
1481 {
1482         return folio_wait_bit_common(folio, PG_locked, state, DROP);
1483 }
1484
1485 /**
1486  * folio_unlock - Unlock a locked folio.
1487  * @folio: The folio.
1488  *
1489  * Unlocks the folio and wakes up any thread sleeping on the page lock.
1490  *
1491  * Context: May be called from interrupt or process context.  May not be
1492  * called from NMI context.
1493  */
1494 void folio_unlock(struct folio *folio)
1495 {
1496         /* Bit 7 allows x86 to check the byte's sign bit */
1497         BUILD_BUG_ON(PG_waiters != 7);
1498         BUILD_BUG_ON(PG_locked > 7);
1499         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1500         if (folio_xor_flags_has_waiters(folio, 1 << PG_locked))
1501                 folio_wake_bit(folio, PG_locked);
1502 }
1503 EXPORT_SYMBOL(folio_unlock);
1504
1505 /**
1506  * folio_end_read - End read on a folio.
1507  * @folio: The folio.
1508  * @success: True if all reads completed successfully.
1509  *
1510  * When all reads against a folio have completed, filesystems should
1511  * call this function to let the pagecache know that no more reads
1512  * are outstanding.  This will unlock the folio and wake up any thread
1513  * sleeping on the lock.  The folio will also be marked uptodate if all
1514  * reads succeeded.
1515  *
1516  * Context: May be called from interrupt or process context.  May not be
1517  * called from NMI context.
1518  */
1519 void folio_end_read(struct folio *folio, bool success)
1520 {
1521         unsigned long mask = 1 << PG_locked;
1522
1523         /* Must be in bottom byte for x86 to work */
1524         BUILD_BUG_ON(PG_uptodate > 7);
1525         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1526         VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio);
1527
1528         if (likely(success))
1529                 mask |= 1 << PG_uptodate;
1530         if (folio_xor_flags_has_waiters(folio, mask))
1531                 folio_wake_bit(folio, PG_locked);
1532 }
1533 EXPORT_SYMBOL(folio_end_read);
1534
1535 /**
1536  * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
1537  * @folio: The folio.
1538  *
1539  * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
1540  * it.  The folio reference held for PG_private_2 being set is released.
1541  *
1542  * This is, for example, used when a netfs folio is being written to a local
1543  * disk cache, thereby allowing writes to the cache for the same folio to be
1544  * serialised.
1545  */
1546 void folio_end_private_2(struct folio *folio)
1547 {
1548         VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
1549         clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
1550         folio_wake_bit(folio, PG_private_2);
1551         folio_put(folio);
1552 }
1553 EXPORT_SYMBOL(folio_end_private_2);
1554
1555 /**
1556  * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
1557  * @folio: The folio to wait on.
1558  *
1559  * Wait for PG_private_2 to be cleared on a folio.
1560  */
1561 void folio_wait_private_2(struct folio *folio)
1562 {
1563         while (folio_test_private_2(folio))
1564                 folio_wait_bit(folio, PG_private_2);
1565 }
1566 EXPORT_SYMBOL(folio_wait_private_2);
1567
1568 /**
1569  * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
1570  * @folio: The folio to wait on.
1571  *
1572  * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is
1573  * received by the calling task.
1574  *
1575  * Return:
1576  * - 0 if successful.
1577  * - -EINTR if a fatal signal was encountered.
1578  */
1579 int folio_wait_private_2_killable(struct folio *folio)
1580 {
1581         int ret = 0;
1582
1583         while (folio_test_private_2(folio)) {
1584                 ret = folio_wait_bit_killable(folio, PG_private_2);
1585                 if (ret < 0)
1586                         break;
1587         }
1588
1589         return ret;
1590 }
1591 EXPORT_SYMBOL(folio_wait_private_2_killable);
1592
1593 /*
1594  * If folio was marked as dropbehind, then pages should be dropped when writeback
1595  * completes. Do that now. If we fail, it's likely because of a big folio -
1596  * just reset dropbehind for that case and latter completions should invalidate.
1597  */
1598 static void folio_end_dropbehind_write(struct folio *folio)
1599 {
1600         /*
1601          * Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
1602          * but can happen if normal writeback just happens to find dirty folios
1603          * that were created as part of uncached writeback, and that writeback
1604          * would otherwise not need non-IRQ handling. Just skip the
1605          * invalidation in that case.
1606          */
1607         if (in_task() && folio_trylock(folio)) {
1608                 if (folio->mapping)
1609                         folio_unmap_invalidate(folio->mapping, folio, 0);
1610                 folio_unlock(folio);
1611         }
1612 }
1613
1614 /**
1615  * folio_end_writeback - End writeback against a folio.
1616  * @folio: The folio.
1617  *
1618  * The folio must actually be under writeback.
1619  *
1620  * Context: May be called from process or interrupt context.
1621  */
1622 void folio_end_writeback(struct folio *folio)
1623 {
1624         bool folio_dropbehind = false;
1625
1626         VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
1627
1628         /*
1629          * folio_test_clear_reclaim() could be used here but it is an
1630          * atomic operation and overkill in this particular case. Failing
1631          * to shuffle a folio marked for immediate reclaim is too mild
1632          * a gain to justify taking an atomic operation penalty at the
1633          * end of every folio writeback.
1634          */
1635         if (folio_test_reclaim(folio)) {
1636                 folio_clear_reclaim(folio);
1637                 folio_rotate_reclaimable(folio);
1638         }
1639
1640         /*
1641          * Writeback does not hold a folio reference of its own, relying
1642          * on truncation to wait for the clearing of PG_writeback.
1643          * But here we must make sure that the folio is not freed and
1644          * reused before the folio_wake_bit().
1645          */
1646         folio_get(folio);
1647         if (!folio_test_dirty(folio))
1648                 folio_dropbehind = folio_test_clear_dropbehind(folio);
1649         if (__folio_end_writeback(folio))
1650                 folio_wake_bit(folio, PG_writeback);
1651         acct_reclaim_writeback(folio);
1652
1653         if (folio_dropbehind)
1654                 folio_end_dropbehind_write(folio);
1655         folio_put(folio);
1656 }
1657 EXPORT_SYMBOL(folio_end_writeback);
1658
1659 /**
1660  * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
1661  * @folio: The folio to lock
1662  */
1663 void __folio_lock(struct folio *folio)
1664 {
1665         folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
1666                                 EXCLUSIVE);
1667 }
1668 EXPORT_SYMBOL(__folio_lock);
1669
1670 int __folio_lock_killable(struct folio *folio)
1671 {
1672         return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
1673                                         EXCLUSIVE);
1674 }
1675 EXPORT_SYMBOL_GPL(__folio_lock_killable);
1676
1677 static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
1678 {
1679         struct wait_queue_head *q = folio_waitqueue(folio);
1680         int ret;
1681
1682         wait->folio = folio;
1683         wait->bit_nr = PG_locked;
1684
1685         spin_lock_irq(&q->lock);
1686         __add_wait_queue_entry_tail(q, &wait->wait);
1687         folio_set_waiters(folio);
1688         ret = !folio_trylock(folio);
1689         /*
1690          * If we were successful now, we know we're still on the
1691          * waitqueue as we're still under the lock. This means it's
1692          * safe to remove and return success, we know the callback
1693          * isn't going to trigger.
1694          */
1695         if (!ret)
1696                 __remove_wait_queue(q, &wait->wait);
1697         else
1698                 ret = -EIOCBQUEUED;
1699         spin_unlock_irq(&q->lock);
1700         return ret;
1701 }
1702
1703 /*
1704  * Return values:
1705  * 0 - folio is locked.
1706  * non-zero - folio is not locked.
1707  *     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
1708  *     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
1709  *     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
1710  *
1711  * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
1712  * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
1713  */
1714 vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
1715 {
1716         unsigned int flags = vmf->flags;
1717
1718         if (fault_flag_allow_retry_first(flags)) {
1719                 /*
1720                  * CAUTION! In this case, mmap_lock/per-VMA lock is not
1721                  * released even though returning VM_FAULT_RETRY.
1722                  */
1723                 if (flags & FAULT_FLAG_RETRY_NOWAIT)
1724                         return VM_FAULT_RETRY;
1725
1726                 release_fault_lock(vmf);
1727                 if (flags & FAULT_FLAG_KILLABLE)
1728                         folio_wait_locked_killable(folio);
1729                 else
1730                         folio_wait_locked(folio);
1731                 return VM_FAULT_RETRY;
1732         }
1733         if (flags & FAULT_FLAG_KILLABLE) {
1734                 bool ret;
1735
1736                 ret = __folio_lock_killable(folio);
1737                 if (ret) {
1738                         release_fault_lock(vmf);
1739                         return VM_FAULT_RETRY;
1740                 }
1741         } else {
1742                 __folio_lock(folio);
1743         }
1744
1745         return 0;
1746 }
1747
1748 /**
1749  * page_cache_next_miss() - Find the next gap in the page cache.
1750  * @mapping: Mapping.
1751  * @index: Index.
1752  * @max_scan: Maximum range to search.
1753  *
1754  * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
1755  * gap with the lowest index.
1756  *
1757  * This function may be called under the rcu_read_lock.  However, this will
1758  * not atomically search a snapshot of the cache at a single point in time.
1759  * For example, if a gap is created at index 5, then subsequently a gap is
1760  * created at index 10, page_cache_next_miss covering both indices may
1761  * return 10 if called under the rcu_read_lock.
1762  *
1763  * Return: The index of the gap if found, otherwise an index outside the
1764  * range specified (in which case 'return - index >= max_scan' will be true).
1765  * In the rare case of index wrap-around, 0 will be returned.
1766  */
1767 pgoff_t page_cache_next_miss(struct address_space *mapping,
1768                              pgoff_t index, unsigned long max_scan)
1769 {
1770         XA_STATE(xas, &mapping->i_pages, index);
1771
1772         while (max_scan--) {
1773                 void *entry = xas_next(&xas);
1774                 if (!entry || xa_is_value(entry))
1775                         return xas.xa_index;
1776                 if (xas.xa_index == 0)
1777                         return 0;
1778         }
1779
1780         return index + max_scan;
1781 }
1782 EXPORT_SYMBOL(page_cache_next_miss);
1783
1784 /**
1785  * page_cache_prev_miss() - Find the previous gap in the page cache.
1786  * @mapping: Mapping.
1787  * @index: Index.
1788  * @max_scan: Maximum range to search.
1789  *
1790  * Search the range [max(index - max_scan + 1, 0), index] for the
1791  * gap with the highest index.
1792  *
1793  * This function may be called under the rcu_read_lock.  However, this will
1794  * not atomically search a snapshot of the cache at a single point in time.
1795  * For example, if a gap is created at index 10, then subsequently a gap is
1796  * created at index 5, page_cache_prev_miss() covering both indices may
1797  * return 5 if called under the rcu_read_lock.
1798  *
1799  * Return: The index of the gap if found, otherwise an index outside the
1800  * range specified (in which case 'index - return >= max_scan' will be true).
1801  * In the rare case of wrap-around, ULONG_MAX will be returned.
1802  */
1803 pgoff_t page_cache_prev_miss(struct address_space *mapping,
1804                              pgoff_t index, unsigned long max_scan)
1805 {
1806         XA_STATE(xas, &mapping->i_pages, index);
1807
1808         while (max_scan--) {
1809                 void *entry = xas_prev(&xas);
1810                 if (!entry || xa_is_value(entry))
1811                         break;
1812                 if (xas.xa_index == ULONG_MAX)
1813                         break;
1814         }
1815
1816         return xas.xa_index;
1817 }
1818 EXPORT_SYMBOL(page_cache_prev_miss);
1819
1820 /*
1821  * Lockless page cache protocol:
1822  * On the lookup side:
1823  * 1. Load the folio from i_pages
1824  * 2. Increment the refcount if it's not zero
1825  * 3. If the folio is not found by xas_reload(), put the refcount and retry
1826  *
1827  * On the removal side:
1828  * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
1829  * B. Remove the page from i_pages
1830  * C. Return the page to the page allocator
1831  *
1832  * This means that any page may have its reference count temporarily
1833  * increased by a speculative page cache (or GUP-fast) lookup as it can
1834  * be allocated by another user before the RCU grace period expires.
1835  * Because the refcount temporarily acquired here may end up being the
1836  * last refcount on the page, any page allocation must be freeable by
1837  * folio_put().
1838  */
1839
1840 /*
1841  * filemap_get_entry - Get a page cache entry.
1842  * @mapping: the address_space to search
1843  * @index: The page cache index.
1844  *
1845  * Looks up the page cache entry at @mapping & @index.  If it is a folio,
1846  * it is returned with an increased refcount.  If it is a shadow entry
1847  * of a previously evicted folio, or a swap entry from shmem/tmpfs,
1848  * it is returned without further action.
1849  *
1850  * Return: The folio, swap or shadow entry, %NULL if nothing is found.
1851  */
1852 void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
1853 {
1854         XA_STATE(xas, &mapping->i_pages, index);
1855         struct folio *folio;
1856
1857         rcu_read_lock();
1858 repeat:
1859         xas_reset(&xas);
1860         folio = xas_load(&xas);
1861         if (xas_retry(&xas, folio))
1862                 goto repeat;
1863         /*
1864          * A shadow entry of a recently evicted page, or a swap entry from
1865          * shmem/tmpfs.  Return it without attempting to raise page count.
1866          */
1867         if (!folio || xa_is_value(folio))
1868                 goto out;
1869
1870         if (!folio_try_get(folio))
1871                 goto repeat;
1872
1873         if (unlikely(folio != xas_reload(&xas))) {
1874                 folio_put(folio);
1875                 goto repeat;
1876         }
1877 out:
1878         rcu_read_unlock();
1879
1880         return folio;
1881 }
1882
1883 /**
1884  * __filemap_get_folio - Find and get a reference to a folio.
1885  * @mapping: The address_space to search.
1886  * @index: The page index.
1887  * @fgp_flags: %FGP flags modify how the folio is returned.
1888  * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
1889  *
1890  * Looks up the page cache entry at @mapping & @index.
1891  *
1892  * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
1893  * if the %GFP flags specified for %FGP_CREAT are atomic.
1894  *
1895  * If this function returns a folio, it is returned with an increased refcount.
1896  *
1897  * Return: The found folio or an ERR_PTR() otherwise.
1898  */
1899 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
1900                 fgf_t fgp_flags, gfp_t gfp)
1901 {
1902         struct folio *folio;
1903
1904 repeat:
1905         folio = filemap_get_entry(mapping, index);
1906         if (xa_is_value(folio))
1907                 folio = NULL;
1908         if (!folio)
1909                 goto no_page;
1910
1911         if (fgp_flags & FGP_LOCK) {
1912                 if (fgp_flags & FGP_NOWAIT) {
1913                         if (!folio_trylock(folio)) {
1914                                 folio_put(folio);
1915                                 return ERR_PTR(-EAGAIN);
1916                         }
1917                 } else {
1918                         folio_lock(folio);
1919                 }
1920
1921                 /* Has the page been truncated? */
1922                 if (unlikely(folio->mapping != mapping)) {
1923                         folio_unlock(folio);
1924                         folio_put(folio);
1925                         goto repeat;
1926                 }
1927                 VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
1928         }
1929
1930         if (fgp_flags & FGP_ACCESSED)
1931                 folio_mark_accessed(folio);
1932         else if (fgp_flags & FGP_WRITE) {
1933                 /* Clear idle flag for buffer write */
1934                 if (folio_test_idle(folio))
1935                         folio_clear_idle(folio);
1936         }
1937
1938         if (fgp_flags & FGP_STABLE)
1939                 folio_wait_stable(folio);
1940 no_page:
1941         if (!folio && (fgp_flags & FGP_CREAT)) {
1942                 unsigned int min_order = mapping_min_folio_order(mapping);
1943                 unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));
1944                 int err;
1945                 index = mapping_align_index(mapping, index);
1946
1947                 if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
1948                         gfp |= __GFP_WRITE;
1949                 if (fgp_flags & FGP_NOFS)
1950                         gfp &= ~__GFP_FS;
1951                 if (fgp_flags & FGP_NOWAIT) {
1952                         gfp &= ~GFP_KERNEL;
1953                         gfp |= GFP_NOWAIT | __GFP_NOWARN;
1954                 }
1955                 if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
1956                         fgp_flags |= FGP_LOCK;
1957
1958                 if (order > mapping_max_folio_order(mapping))
1959                         order = mapping_max_folio_order(mapping);
1960                 /* If we're not aligned, allocate a smaller folio */
1961                 if (index & ((1UL << order) - 1))
1962                         order = __ffs(index);
1963
1964                 do {
1965                         gfp_t alloc_gfp = gfp;
1966
1967                         err = -ENOMEM;
1968                         if (order > min_order)
1969                                 alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
1970                         folio = filemap_alloc_folio(alloc_gfp, order);
1971                         if (!folio)
1972                                 continue;
1973
1974                         /* Init accessed so avoid atomic mark_page_accessed later */
1975                         if (fgp_flags & FGP_ACCESSED)
1976                                 __folio_set_referenced(folio);
1977                         if (fgp_flags & FGP_DONTCACHE)
1978                                 __folio_set_dropbehind(folio);
1979
1980                         err = filemap_add_folio(mapping, folio, index, gfp);
1981                         if (!err)
1982                                 break;
1983                         folio_put(folio);
1984                         folio = NULL;
1985                 } while (order-- > min_order);
1986
1987                 if (err == -EEXIST)
1988                         goto repeat;
1989                 if (err)
1990                         return ERR_PTR(err);
1991                 /*
1992                  * filemap_add_folio locks the page, and for mmap
1993                  * we expect an unlocked page.
1994                  */
1995                 if (folio && (fgp_flags & FGP_FOR_MMAP))
1996                         folio_unlock(folio);
1997         }
1998
1999         if (!folio)
2000                 return ERR_PTR(-ENOENT);
2001         /* not an uncached lookup, clear uncached if set */
2002         if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
2003                 folio_clear_dropbehind(folio);
2004         return folio;
2005 }
2006 EXPORT_SYMBOL(__filemap_get_folio);
2007
2008 static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
2009                 xa_mark_t mark)
2010 {
2011         struct folio *folio;
2012
2013 retry:
2014         if (mark == XA_PRESENT)
2015                 folio = xas_find(xas, max);
2016         else
2017                 folio = xas_find_marked(xas, max, mark);
2018
2019         if (xas_retry(xas, folio))
2020                 goto retry;
2021         /*
2022          * A shadow entry of a recently evicted page, a swap
2023          * entry from shmem/tmpfs or a DAX entry.  Return it
2024          * without attempting to raise page count.
2025          */
2026         if (!folio || xa_is_value(folio))
2027                 return folio;
2028
2029         if (!folio_try_get(folio))
2030                 goto reset;
2031
2032         if (unlikely(folio != xas_reload(xas))) {
2033                 folio_put(folio);
2034                 goto reset;
2035         }
2036
2037         return folio;
2038 reset:
2039         xas_reset(xas);
2040         goto retry;
2041 }
2042
2043 /**
2044  * find_get_entries - gang pagecache lookup
2045  * @mapping:    The address_space to search
2046  * @start:      The starting page cache index
2047  * @end:        The final page index (inclusive).
2048  * @fbatch:     Where the resulting entries are placed.
2049  * @indices:    The cache indices corresponding to the entries in @entries
2050  *
2051  * find_get_entries() will search for and return a batch of entries in
2052  * the mapping.  The entries are placed in @fbatch.  find_get_entries()
2053  * takes a reference on any actual folios it returns.
2054  *
2055  * The entries have ascending indexes.  The indices may not be consecutive
2056  * due to not-present entries or large folios.
2057  *
2058  * Any shadow entries of evicted folios, or swap entries from
2059  * shmem/tmpfs, are included in the returned array.
2060  *
2061  * Return: The number of entries which were found.
2062  */
2063 unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
2064                 pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
2065 {
2066         XA_STATE(xas, &mapping->i_pages, *start);
2067         struct folio *folio;
2068
2069         rcu_read_lock();
2070         while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
2071                 indices[fbatch->nr] = xas.xa_index;
2072                 if (!folio_batch_add(fbatch, folio))
2073                         break;
2074         }
2075
2076         if (folio_batch_count(fbatch)) {
2077                 unsigned long nr;
2078                 int idx = folio_batch_count(fbatch) - 1;
2079
2080                 folio = fbatch->folios[idx];
2081                 if (!xa_is_value(folio))
2082                         nr = folio_nr_pages(folio);
2083                 else
2084                         nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]);
2085                 *start = round_down(indices[idx] + nr, nr);
2086         }
2087         rcu_read_unlock();
2088
2089         return folio_batch_count(fbatch);
2090 }
2091
2092 /**
2093  * find_lock_entries - Find a batch of pagecache entries.
2094  * @mapping:    The address_space to search.
2095  * @start:      The starting page cache index.
2096  * @end:        The final page index (inclusive).
2097  * @fbatch:     Where the resulting entries are placed.
2098  * @indices:    The cache indices of the entries in @fbatch.
2099  *
2100  * find_lock_entries() will return a batch of entries from @mapping.
2101  * Swap, shadow and DAX entries are included.  Folios are returned
2102  * locked and with an incremented refcount.  Folios which are locked
2103  * by somebody else or under writeback are skipped.  Folios which are
2104  * partially outside the range are not returned.
2105  *
2106  * The entries have ascending indexes.  The indices may not be consecutive
2107  * due to not-present entries, large folios, folios which could not be
2108  * locked or folios under writeback.
2109  *
2110  * Return: The number of entries which were found.
2111  */
2112 unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
2113                 pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
2114 {
2115         XA_STATE(xas, &mapping->i_pages, *start);
2116         struct folio *folio;
2117
2118         rcu_read_lock();
2119         while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
2120                 unsigned long base;
2121                 unsigned long nr;
2122
2123                 if (!xa_is_value(folio)) {
2124                         nr = folio_nr_pages(folio);
2125                         base = folio->index;
2126                         /* Omit large folio which begins before the start */
2127                         if (base < *start)
2128                                 goto put;
2129                         /* Omit large folio which extends beyond the end */
2130                         if (base + nr - 1 > end)
2131                                 goto put;
2132                         if (!folio_trylock(folio))
2133                                 goto put;
2134                         if (folio->mapping != mapping ||
2135                             folio_test_writeback(folio))
2136                                 goto unlock;
2137                         VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
2138                                         folio);
2139                 } else {
2140                         nr = 1 << xas_get_order(&xas);
2141                         base = xas.xa_index & ~(nr - 1);
2142                         /* Omit order>0 value which begins before the start */
2143                         if (base < *start)
2144                                 continue;
2145                         /* Omit order>0 value which extends beyond the end */
2146                         if (base + nr - 1 > end)
2147                                 break;
2148                 }
2149
2150                 /* Update start now so that last update is correct on return */
2151                 *start = base + nr;
2152                 indices[fbatch->nr] = xas.xa_index;
2153                 if (!folio_batch_add(fbatch, folio))
2154                         break;
2155                 continue;
2156 unlock:
2157                 folio_unlock(folio);
2158 put:
2159                 folio_put(folio);
2160         }
2161         rcu_read_unlock();
2162
2163         return folio_batch_count(fbatch);
2164 }
2165
2166 /**
2167  * filemap_get_folios - Get a batch of folios
2168  * @mapping:    The address_space to search
2169  * @start:      The starting page index
2170  * @end:        The final page index (inclusive)
2171  * @fbatch:     The batch to fill.
2172  *
2173  * Search for and return a batch of folios in the mapping starting at
2174  * index @start and up to index @end (inclusive).  The folios are returned
2175  * in @fbatch with an elevated reference count.
2176  *
2177  * Return: The number of folios which were found.
2178  * We also update @start to index the next folio for the traversal.
2179  */
2180 unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
2181                 pgoff_t end, struct folio_batch *fbatch)
2182 {
2183         return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch);
2184 }
2185 EXPORT_SYMBOL(filemap_get_folios);
2186
2187 /**
2188  * filemap_get_folios_contig - Get a batch of contiguous folios
2189  * @mapping:    The address_space to search
2190  * @start:      The starting page index
2191  * @end:        The final page index (inclusive)
2192  * @fbatch:     The batch to fill
2193  *
2194  * filemap_get_folios_contig() works exactly like filemap_get_folios(),
2195  * except the returned folios are guaranteed to be contiguous. This may
2196  * not return all contiguous folios if the batch gets filled up.
2197  *
2198  * Return: The number of folios found.
2199  * Also update @start to be positioned for traversal of the next folio.
2200  */
2201
2202 unsigned filemap_get_folios_contig(struct address_space *mapping,
2203                 pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
2204 {
2205         XA_STATE(xas, &mapping->i_pages, *start);
2206         unsigned long nr;
2207         struct folio *folio;
2208
2209         rcu_read_lock();
2210
2211         for (folio = xas_load(&xas); folio && xas.xa_index <= end;
2212                         folio = xas_next(&xas)) {
2213                 if (xas_retry(&xas, folio))
2214                         continue;
2215                 /*
2216                  * If the entry has been swapped out, we can stop looking.
2217                  * No current caller is looking for DAX entries.
2218                  */
2219                 if (xa_is_value(folio))
2220                         goto update_start;
2221
2222                 /* If we landed in the middle of a THP, continue at its end. */
2223                 if (xa_is_sibling(folio))
2224                         goto update_start;
2225
2226                 if (!folio_try_get(folio))
2227                         goto retry;
2228
2229                 if (unlikely(folio != xas_reload(&xas)))
2230                         goto put_folio;
2231
2232                 if (!folio_batch_add(fbatch, folio)) {
2233                         nr = folio_nr_pages(folio);
2234                         *start = folio->index + nr;
2235                         goto out;
2236                 }
2237                 continue;
2238 put_folio:
2239                 folio_put(folio);
2240
2241 retry:
2242                 xas_reset(&xas);
2243         }
2244
2245 update_start:
2246         nr = folio_batch_count(fbatch);
2247
2248         if (nr) {
2249                 folio = fbatch->folios[nr - 1];
2250                 *start = folio_next_index(folio);
2251         }
2252 out:
2253         rcu_read_unlock();
2254         return folio_batch_count(fbatch);
2255 }
2256 EXPORT_SYMBOL(filemap_get_folios_contig);
2257
2258 /**
2259  * filemap_get_folios_tag - Get a batch of folios matching @tag
2260  * @mapping:    The address_space to search
2261  * @start:      The starting page index
2262  * @end:        The final page index (inclusive)
2263  * @tag:        The tag index
2264  * @fbatch:     The batch to fill
2265  *
2266  * The first folio may start before @start; if it does, it will contain
2267  * @start.  The final folio may extend beyond @end; if it does, it will
2268  * contain @end.  The folios have ascending indices.  There may be gaps
2269  * between the folios if there are indices which have no folio in the
2270  * page cache.  If folios are added to or removed from the page cache
2271  * while this is running, they may or may not be found by this call.
2272  * Only returns folios that are tagged with @tag.
2273  *
2274  * Return: The number of folios found.
2275  * Also update @start to index the next folio for traversal.
2276  */
2277 unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
2278                         pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
2279 {
2280         XA_STATE(xas, &mapping->i_pages, *start);
2281         struct folio *folio;
2282
2283         rcu_read_lock();
2284         while ((folio = find_get_entry(&xas, end, tag)) != NULL) {
2285                 /*
2286                  * Shadow entries should never be tagged, but this iteration
2287                  * is lockless so there is a window for page reclaim to evict
2288                  * a page we saw tagged. Skip over it.
2289                  */
2290                 if (xa_is_value(folio))
2291                         continue;
2292                 if (!folio_batch_add(fbatch, folio)) {
2293                         unsigned long nr = folio_nr_pages(folio);
2294                         *start = folio->index + nr;
2295                         goto out;
2296                 }
2297         }
2298         /*
2299          * We come here when there is no page beyond @end. We take care to not
2300          * overflow the index @start as it confuses some of the callers. This
2301          * breaks the iteration when there is a page at index -1 but that is
2302          * already broke anyway.
2303          */
2304         if (end == (pgoff_t)-1)
2305                 *start = (pgoff_t)-1;
2306         else
2307                 *start = end + 1;
2308 out:
2309         rcu_read_unlock();
2310
2311         return folio_batch_count(fbatch);
2312 }
2313 EXPORT_SYMBOL(filemap_get_folios_tag);
2314
2315 /*
2316  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
2317  * a _large_ part of the i/o request. Imagine the worst scenario:
2318  *
2319  *      ---R__________________________________________B__________
2320  *         ^ reading here                             ^ bad block(assume 4k)
2321  *
2322  * read(R) => miss => readahead(R...B) => media error => frustrating retries
2323  * => failing the whole request => read(R) => read(R+1) =>
2324  * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
2325  * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
2326  * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
2327  *
2328  * It is going insane. Fix it by quickly scaling down the readahead size.
2329  */
2330 static void shrink_readahead_size_eio(struct file_ra_state *ra)
2331 {
2332         ra->ra_pages /= 4;
2333 }
2334
2335 /*
2336  * filemap_get_read_batch - Get a batch of folios for read
2337  *
2338  * Get a batch of folios which represent a contiguous range of bytes in
2339  * the file.  No exceptional entries will be returned.  If @index is in
2340  * the middle of a folio, the entire folio will be returned.  The last
2341  * folio in the batch may have the readahead flag set or the uptodate flag
2342  * clear so that the caller can take the appropriate action.
2343  */
2344 static void filemap_get_read_batch(struct address_space *mapping,
2345                 pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
2346 {
2347         XA_STATE(xas, &mapping->i_pages, index);
2348         struct folio *folio;
2349
2350         rcu_read_lock();
2351         for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
2352                 if (xas_retry(&xas, folio))
2353                         continue;
2354                 if (xas.xa_index > max || xa_is_value(folio))
2355                         break;
2356                 if (xa_is_sibling(folio))
2357                         break;
2358                 if (!folio_try_get(folio))
2359                         goto retry;
2360
2361                 if (unlikely(folio != xas_reload(&xas)))
2362                         goto put_folio;
2363
2364                 if (!folio_batch_add(fbatch, folio))
2365                         break;
2366                 if (!folio_test_uptodate(folio))
2367                         break;
2368                 if (folio_test_readahead(folio))
2369                         break;
2370                 xas_advance(&xas, folio_next_index(folio) - 1);
2371                 continue;
2372 put_folio:
2373                 folio_put(folio);
2374 retry:
2375                 xas_reset(&xas);
2376         }
2377         rcu_read_unlock();
2378 }
2379
2380 static int filemap_read_folio(struct file *file, filler_t filler,
2381                 struct folio *folio)
2382 {
2383         bool workingset = folio_test_workingset(folio);
2384         unsigned long pflags;
2385         int error;
2386
2387         /* Start the actual read. The read will unlock the page. */
2388         if (unlikely(workingset))
2389                 psi_memstall_enter(&pflags);
2390         error = filler(file, folio);
2391         if (unlikely(workingset))
2392                 psi_memstall_leave(&pflags);
2393         if (error)
2394                 return error;
2395
2396         error = folio_wait_locked_killable(folio);
2397         if (error)
2398                 return error;
2399         if (folio_test_uptodate(folio))
2400                 return 0;
2401         if (file)
2402                 shrink_readahead_size_eio(&file->f_ra);
2403         return -EIO;
2404 }
2405
2406 static bool filemap_range_uptodate(struct address_space *mapping,
2407                 loff_t pos, size_t count, struct folio *folio,
2408                 bool need_uptodate)
2409 {
2410         if (folio_test_uptodate(folio))
2411                 return true;
2412         /* pipes can't handle partially uptodate pages */
2413         if (need_uptodate)
2414                 return false;
2415         if (!mapping->a_ops->is_partially_uptodate)
2416                 return false;
2417         if (mapping->host->i_blkbits >= folio_shift(folio))
2418                 return false;
2419
2420         if (folio_pos(folio) > pos) {
2421                 count -= folio_pos(folio) - pos;
2422                 pos = 0;
2423         } else {
2424                 pos -= folio_pos(folio);
2425         }
2426
2427         return mapping->a_ops->is_partially_uptodate(folio, pos, count);
2428 }
2429
2430 static int filemap_update_page(struct kiocb *iocb,
2431                 struct address_space *mapping, size_t count,
2432                 struct folio *folio, bool need_uptodate)
2433 {
2434         int error;
2435
2436         if (iocb->ki_flags & IOCB_NOWAIT) {
2437                 if (!filemap_invalidate_trylock_shared(mapping))
2438                         return -EAGAIN;
2439         } else {
2440                 filemap_invalidate_lock_shared(mapping);
2441         }
2442
2443         if (!folio_trylock(folio)) {
2444                 error = -EAGAIN;
2445                 if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
2446                         goto unlock_mapping;
2447                 if (!(iocb->ki_flags & IOCB_WAITQ)) {
2448                         filemap_invalidate_unlock_shared(mapping);
2449                         /*
2450                          * This is where we usually end up waiting for a
2451                          * previously submitted readahead to finish.
2452                          */
2453                         folio_put_wait_locked(folio, TASK_KILLABLE);
2454                         return AOP_TRUNCATED_PAGE;
2455                 }
2456                 error = __folio_lock_async(folio, iocb->ki_waitq);
2457                 if (error)
2458                         goto unlock_mapping;
2459         }
2460
2461         error = AOP_TRUNCATED_PAGE;
2462         if (!folio->mapping)
2463                 goto unlock;
2464
2465         error = 0;
2466         if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
2467                                    need_uptodate))
2468                 goto unlock;
2469
2470         error = -EAGAIN;
2471         if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
2472                 goto unlock;
2473
2474         error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
2475                         folio);
2476         goto unlock_mapping;
2477 unlock:
2478         folio_unlock(folio);
2479 unlock_mapping:
2480         filemap_invalidate_unlock_shared(mapping);
2481         if (error == AOP_TRUNCATED_PAGE)
2482                 folio_put(folio);
2483         return error;
2484 }
2485
2486 static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)
2487 {
2488         struct address_space *mapping = iocb->ki_filp->f_mapping;
2489         struct folio *folio;
2490         int error;
2491         unsigned int min_order = mapping_min_folio_order(mapping);
2492         pgoff_t index;
2493
2494         if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
2495                 return -EAGAIN;
2496
2497         folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
2498         if (!folio)
2499                 return -ENOMEM;
2500         if (iocb->ki_flags & IOCB_DONTCACHE)
2501                 __folio_set_dropbehind(folio);
2502
2503         /*
2504          * Protect against truncate / hole punch. Grabbing invalidate_lock
2505          * here assures we cannot instantiate and bring uptodate new
2506          * pagecache folios after evicting page cache during truncate
2507          * and before actually freeing blocks.  Note that we could
2508          * release invalidate_lock after inserting the folio into
2509          * the page cache as the locked folio would then be enough to
2510          * synchronize with hole punching. But there are code paths
2511          * such as filemap_update_page() filling in partially uptodate
2512          * pages or ->readahead() that need to hold invalidate_lock
2513          * while mapping blocks for IO so let's hold the lock here as
2514          * well to keep locking rules simple.
2515          */
2516         filemap_invalidate_lock_shared(mapping);
2517         index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order;
2518         error = filemap_add_folio(mapping, folio, index,
2519                         mapping_gfp_constraint(mapping, GFP_KERNEL));
2520         if (error == -EEXIST)
2521                 error = AOP_TRUNCATED_PAGE;
2522         if (error)
2523                 goto error;
2524
2525         error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
2526                                         folio);
2527         if (error)
2528                 goto error;
2529
2530         filemap_invalidate_unlock_shared(mapping);
2531         folio_batch_add(fbatch, folio);
2532         return 0;
2533 error:
2534         filemap_invalidate_unlock_shared(mapping);
2535         folio_put(folio);
2536         return error;
2537 }
2538
2539 static int filemap_readahead(struct kiocb *iocb, struct file *file,
2540                 struct address_space *mapping, struct folio *folio,
2541                 pgoff_t last_index)
2542 {
2543         DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);
2544
2545         if (iocb->ki_flags & IOCB_NOIO)
2546                 return -EAGAIN;
2547         if (iocb->ki_flags & IOCB_DONTCACHE)
2548                 ractl.dropbehind = 1;
2549         page_cache_async_ra(&ractl, folio, last_index - folio->index);
2550         return 0;
2551 }
2552
2553 static int filemap_get_pages(struct kiocb *iocb, size_t count,
2554                 struct folio_batch *fbatch, bool need_uptodate)
2555 {
2556         struct file *filp = iocb->ki_filp;
2557         struct address_space *mapping = filp->f_mapping;
2558         pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
2559         pgoff_t last_index;
2560         struct folio *folio;
2561         unsigned int flags;
2562         int err = 0;
2563
2564         /* "last_index" is the index of the page beyond the end of the read */
2565         last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
2566 retry:
2567         if (fatal_signal_pending(current))
2568                 return -EINTR;
2569
2570         filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
2571         if (!folio_batch_count(fbatch)) {
2572                 DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index);
2573
2574                 if (iocb->ki_flags & IOCB_NOIO)
2575                         return -EAGAIN;
2576                 if (iocb->ki_flags & IOCB_NOWAIT)
2577                         flags = memalloc_noio_save();
2578                 if (iocb->ki_flags & IOCB_DONTCACHE)
2579                         ractl.dropbehind = 1;
2580                 page_cache_sync_ra(&ractl, last_index - index);
2581                 if (iocb->ki_flags & IOCB_NOWAIT)
2582                         memalloc_noio_restore(flags);
2583                 filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
2584         }
2585         if (!folio_batch_count(fbatch)) {
2586                 err = filemap_create_folio(iocb, fbatch);
2587                 if (err == AOP_TRUNCATED_PAGE)
2588                         goto retry;
2589                 return err;
2590         }
2591
2592         folio = fbatch->folios[folio_batch_count(fbatch) - 1];
2593         if (folio_test_readahead(folio)) {
2594                 err = filemap_readahead(iocb, filp, mapping, folio, last_index);
2595                 if (err)
2596                         goto err;
2597         }
2598         if (!folio_test_uptodate(folio)) {
2599                 if ((iocb->ki_flags & IOCB_WAITQ) &&
2600                     folio_batch_count(fbatch) > 1)
2601                         iocb->ki_flags |= IOCB_NOWAIT;
2602                 err = filemap_update_page(iocb, mapping, count, folio,
2603                                           need_uptodate);
2604                 if (err)
2605                         goto err;
2606         }
2607
2608         trace_mm_filemap_get_pages(mapping, index, last_index - 1);
2609         return 0;
2610 err:
2611         if (err < 0)
2612                 folio_put(folio);
2613         if (likely(--fbatch->nr))
2614                 return 0;
2615         if (err == AOP_TRUNCATED_PAGE)
2616                 goto retry;
2617         return err;
2618 }
2619
2620 static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
2621 {
2622         unsigned int shift = folio_shift(folio);
2623
2624         return (pos1 >> shift == pos2 >> shift);
2625 }
2626
2627 static void filemap_end_dropbehind_read(struct address_space *mapping,
2628                                         struct folio *folio)
2629 {
2630         if (!folio_test_dropbehind(folio))
2631                 return;
2632         if (folio_test_writeback(folio) || folio_test_dirty(folio))
2633                 return;
2634         if (folio_trylock(folio)) {
2635                 if (folio_test_clear_dropbehind(folio))
2636                         folio_unmap_invalidate(mapping, folio, 0);
2637                 folio_unlock(folio);
2638         }
2639 }
2640
2641 /**
2642  * filemap_read - Read data from the page cache.
2643  * @iocb: The iocb to read.
2644  * @iter: Destination for the data.
2645  * @already_read: Number of bytes already read by the caller.
2646  *
2647  * Copies data from the page cache.  If the data is not currently present,
2648  * uses the readahead and read_folio address_space operations to fetch it.
2649  *
2650  * Return: Total number of bytes copied, including those already read by
2651  * the caller.  If an error happens before any bytes are copied, returns
2652  * a negative error number.
2653  */
2654 ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
2655                 ssize_t already_read)
2656 {
2657         struct file *filp = iocb->ki_filp;
2658         struct file_ra_state *ra = &filp->f_ra;
2659         struct address_space *mapping = filp->f_mapping;
2660         struct inode *inode = mapping->host;
2661         struct folio_batch fbatch;
2662         int i, error = 0;
2663         bool writably_mapped;
2664         loff_t isize, end_offset;
2665         loff_t last_pos = ra->prev_pos;
2666
2667         if (unlikely(iocb->ki_pos < 0))
2668                 return -EINVAL;
2669         if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
2670                 return 0;
2671         if (unlikely(!iov_iter_count(iter)))
2672                 return 0;
2673
2674         iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
2675         folio_batch_init(&fbatch);
2676
2677         do {
2678                 cond_resched();
2679
2680                 /*
2681                  * If we've already successfully copied some data, then we
2682                  * can no longer safely return -EIOCBQUEUED. Hence mark
2683                  * an async read NOWAIT at that point.
2684                  */
2685                 if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
2686                         iocb->ki_flags |= IOCB_NOWAIT;
2687
2688                 if (unlikely(iocb->ki_pos >= i_size_read(inode)))
2689                         break;
2690
2691                 error = filemap_get_pages(iocb, iter->count, &fbatch, false);
2692                 if (error < 0)
2693                         break;
2694
2695                 /*
2696                  * i_size must be checked after we know the pages are Uptodate.
2697                  *
2698                  * Checking i_size after the check allows us to calculate
2699                  * the correct value for "nr", which means the zero-filled
2700                  * part of the page is not copied back to userspace (unless
2701                  * another truncate extends the file - this is desired though).
2702                  */
2703                 isize = i_size_read(inode);
2704                 if (unlikely(iocb->ki_pos >= isize))
2705                         goto put_folios;
2706                 end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
2707
2708                 /*
2709                  * Once we start copying data, we don't want to be touching any
2710                  * cachelines that might be contended:
2711                  */
2712                 writably_mapped = mapping_writably_mapped(mapping);
2713
2714                 /*
2715                  * When a read accesses the same folio several times, only
2716                  * mark it as accessed the first time.
2717                  */
2718                 if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
2719                                     fbatch.folios[0]))
2720                         folio_mark_accessed(fbatch.folios[0]);
2721
2722                 for (i = 0; i < folio_batch_count(&fbatch); i++) {
2723                         struct folio *folio = fbatch.folios[i];
2724                         size_t fsize = folio_size(folio);
2725                         size_t offset = iocb->ki_pos & (fsize - 1);
2726                         size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
2727                                              fsize - offset);
2728                         size_t copied;
2729
2730                         if (end_offset < folio_pos(folio))
2731                                 break;
2732                         if (i > 0)
2733                                 folio_mark_accessed(folio);
2734                         /*
2735                          * If users can be writing to this folio using arbitrary
2736                          * virtual addresses, take care of potential aliasing
2737                          * before reading the folio on the kernel side.
2738                          */
2739                         if (writably_mapped)
2740                                 flush_dcache_folio(folio);
2741
2742                         copied = copy_folio_to_iter(folio, offset, bytes, iter);
2743
2744                         already_read += copied;
2745                         iocb->ki_pos += copied;
2746                         last_pos = iocb->ki_pos;
2747
2748                         if (copied < bytes) {
2749                                 error = -EFAULT;
2750                                 break;
2751                         }
2752                 }
2753 put_folios:
2754                 for (i = 0; i < folio_batch_count(&fbatch); i++) {
2755                         struct folio *folio = fbatch.folios[i];
2756
2757                         filemap_end_dropbehind_read(mapping, folio);
2758                         folio_put(folio);
2759                 }
2760                 folio_batch_init(&fbatch);
2761         } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
2762
2763         file_accessed(filp);
2764         ra->prev_pos = last_pos;
2765         return already_read ? already_read : error;
2766 }
2767 EXPORT_SYMBOL_GPL(filemap_read);
2768
2769 int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
2770 {
2771         struct address_space *mapping = iocb->ki_filp->f_mapping;
2772         loff_t pos = iocb->ki_pos;
2773         loff_t end = pos + count - 1;
2774
2775         if (iocb->ki_flags & IOCB_NOWAIT) {
2776                 if (filemap_range_needs_writeback(mapping, pos, end))
2777                         return -EAGAIN;
2778                 return 0;
2779         }
2780
2781         return filemap_write_and_wait_range(mapping, pos, end);
2782 }
2783 EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
2784
2785 int filemap_invalidate_pages(struct address_space *mapping,
2786                              loff_t pos, loff_t end, bool nowait)
2787 {
2788         int ret;
2789
2790         if (nowait) {
2791                 /* we could block if there are any pages in the range */
2792                 if (filemap_range_has_page(mapping, pos, end))
2793                         return -EAGAIN;
2794         } else {
2795                 ret = filemap_write_and_wait_range(mapping, pos, end);
2796                 if (ret)
2797                         return ret;
2798         }
2799
2800         /*
2801          * After a write we want buffered reads to be sure to go to disk to get
2802          * the new data.  We invalidate clean cached page from the region we're
2803          * about to write.  We do this *before* the write so that we can return
2804          * without clobbering -EIOCBQUEUED from ->direct_IO().
2805          */
2806         return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
2807                                              end >> PAGE_SHIFT);
2808 }
2809
2810 int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
2811 {
2812         struct address_space *mapping = iocb->ki_filp->f_mapping;
2813
2814         return filemap_invalidate_pages(mapping, iocb->ki_pos,
2815                                         iocb->ki_pos + count - 1,
2816                                         iocb->ki_flags & IOCB_NOWAIT);
2817 }
2818 EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
2819
2820 /**
2821  * generic_file_read_iter - generic filesystem read routine
2822  * @iocb:       kernel I/O control block
2823  * @iter:       destination for the data read
2824  *
2825  * This is the "read_iter()" routine for all filesystems
2826  * that can use the page cache directly.
2827  *
2828  * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
2829  * be returned when no data can be read without waiting for I/O requests
2830  * to complete; it doesn't prevent readahead.
2831  *
2832  * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
2833  * requests shall be made for the read or for readahead.  When no data
2834  * can be read, -EAGAIN shall be returned.  When readahead would be
2835  * triggered, a partial, possibly empty read shall be returned.
2836  *
2837  * Return:
2838  * * number of bytes copied, even for partial reads
2839  * * negative error code (or 0 if IOCB_NOIO) if nothing was read
2840  */
2841 ssize_t
2842 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
2843 {
2844         size_t count = iov_iter_count(iter);
2845         ssize_t retval = 0;
2846
2847         if (!count)
2848                 return 0; /* skip atime */
2849
2850         if (iocb->ki_flags & IOCB_DIRECT) {
2851                 struct file *file = iocb->ki_filp;
2852                 struct address_space *mapping = file->f_mapping;
2853                 struct inode *inode = mapping->host;
2854
2855                 retval = kiocb_write_and_wait(iocb, count);
2856                 if (retval < 0)
2857                         return retval;
2858                 file_accessed(file);
2859
2860                 retval = mapping->a_ops->direct_IO(iocb, iter);
2861                 if (retval >= 0) {
2862                         iocb->ki_pos += retval;
2863                         count -= retval;
2864                 }
2865                 if (retval != -EIOCBQUEUED)
2866                         iov_iter_revert(iter, count - iov_iter_count(iter));
2867
2868                 /*
2869                  * Btrfs can have a short DIO read if we encounter
2870                  * compressed extents, so if there was an error, or if
2871                  * we've already read everything we wanted to, or if
2872                  * there was a short read because we hit EOF, go ahead
2873                  * and return.  Otherwise fallthrough to buffered io for
2874                  * the rest of the read.  Buffered reads will not work for
2875                  * DAX files, so don't bother trying.
2876                  */
2877                 if (retval < 0 || !count || IS_DAX(inode))
2878                         return retval;
2879                 if (iocb->ki_pos >= i_size_read(inode))
2880                         return retval;
2881         }
2882
2883         return filemap_read(iocb, iter, retval);
2884 }
2885 EXPORT_SYMBOL(generic_file_read_iter);
2886
2887 /*
2888  * Splice subpages from a folio into a pipe.
2889  */
2890 size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
2891                               struct folio *folio, loff_t fpos, size_t size)
2892 {
2893         struct page *page;
2894         size_t spliced = 0, offset = offset_in_folio(folio, fpos);
2895
2896         page = folio_page(folio, offset / PAGE_SIZE);
2897         size = min(size, folio_size(folio) - offset);
2898         offset %= PAGE_SIZE;
2899
2900         while (spliced < size &&
2901                !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
2902                 struct pipe_buffer *buf = pipe_head_buf(pipe);
2903                 size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
2904
2905                 *buf = (struct pipe_buffer) {
2906                         .ops    = &page_cache_pipe_buf_ops,
2907                         .page   = page,
2908                         .offset = offset,
2909                         .len    = part,
2910                 };
2911                 folio_get(folio);
2912                 pipe->head++;
2913                 page++;
2914                 spliced += part;
2915                 offset = 0;
2916         }
2917
2918         return spliced;
2919 }
2920
2921 /**
2922  * filemap_splice_read -  Splice data from a file's pagecache into a pipe
2923  * @in: The file to read from
2924  * @ppos: Pointer to the file position to read from
2925  * @pipe: The pipe to splice into
2926  * @len: The amount to splice
2927  * @flags: The SPLICE_F_* flags
2928  *
2929  * This function gets folios from a file's pagecache and splices them into the
2930  * pipe.  Readahead will be called as necessary to fill more folios.  This may
2931  * be used for blockdevs also.
2932  *
2933  * Return: On success, the number of bytes read will be returned and *@ppos
2934  * will be updated if appropriate; 0 will be returned if there is no more data
2935  * to be read; -EAGAIN will be returned if the pipe had no space, and some
2936  * other negative error code will be returned on error.  A short read may occur
2937  * if the pipe has insufficient space, we reach the end of the data or we hit a
2938  * hole.
2939  */
2940 ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
2941                             struct pipe_inode_info *pipe,
2942                             size_t len, unsigned int flags)
2943 {
2944         struct folio_batch fbatch;
2945         struct kiocb iocb;
2946         size_t total_spliced = 0, used, npages;
2947         loff_t isize, end_offset;
2948         bool writably_mapped;
2949         int i, error = 0;
2950
2951         if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
2952                 return 0;
2953
2954         init_sync_kiocb(&iocb, in);
2955         iocb.ki_pos = *ppos;
2956
2957         /* Work out how much data we can actually add into the pipe */
2958         used = pipe_occupancy(pipe->head, pipe->tail);
2959         npages = max_t(ssize_t, pipe->max_usage - used, 0);
2960         len = min_t(size_t, len, npages * PAGE_SIZE);
2961
2962         folio_batch_init(&fbatch);
2963
2964         do {
2965                 cond_resched();
2966
2967                 if (*ppos >= i_size_read(in->f_mapping->host))
2968                         break;
2969
2970                 iocb.ki_pos = *ppos;
2971                 error = filemap_get_pages(&iocb, len, &fbatch, true);
2972                 if (error < 0)
2973                         break;
2974
2975                 /*
2976                  * i_size must be checked after we know the pages are Uptodate.
2977                  *
2978                  * Checking i_size after the check allows us to calculate
2979                  * the correct value for "nr", which means the zero-filled
2980                  * part of the page is not copied back to userspace (unless
2981                  * another truncate extends the file - this is desired though).
2982                  */
2983                 isize = i_size_read(in->f_mapping->host);
2984                 if (unlikely(*ppos >= isize))
2985                         break;
2986                 end_offset = min_t(loff_t, isize, *ppos + len);
2987
2988                 /*
2989                  * Once we start copying data, we don't want to be touching any
2990                  * cachelines that might be contended:
2991                  */
2992                 writably_mapped = mapping_writably_mapped(in->f_mapping);
2993
2994                 for (i = 0; i < folio_batch_count(&fbatch); i++) {
2995                         struct folio *folio = fbatch.folios[i];
2996                         size_t n;
2997
2998                         if (folio_pos(folio) >= end_offset)
2999                                 goto out;
3000                         folio_mark_accessed(folio);
3001
3002                         /*
3003                          * If users can be writing to this folio using arbitrary
3004                          * virtual addresses, take care of potential aliasing
3005                          * before reading the folio on the kernel side.
3006                          */
3007                         if (writably_mapped)
3008                                 flush_dcache_folio(folio);
3009
3010                         n = min_t(loff_t, len, isize - *ppos);
3011                         n = splice_folio_into_pipe(pipe, folio, *ppos, n);
3012                         if (!n)
3013                                 goto out;
3014                         len -= n;
3015                         total_spliced += n;
3016                         *ppos += n;
3017                         in->f_ra.prev_pos = *ppos;
3018                         if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
3019                                 goto out;
3020                 }
3021
3022                 folio_batch_release(&fbatch);
3023         } while (len);
3024
3025 out:
3026         folio_batch_release(&fbatch);
3027         file_accessed(in);
3028
3029         return total_spliced ? total_spliced : error;
3030 }
3031 EXPORT_SYMBOL(filemap_splice_read);
3032
3033 static inline loff_t folio_seek_hole_data(struct xa_state *xas,
3034                 struct address_space *mapping, struct folio *folio,
3035                 loff_t start, loff_t end, bool seek_data)
3036 {
3037         const struct address_space_operations *ops = mapping->a_ops;
3038         size_t offset, bsz = i_blocksize(mapping->host);
3039
3040         if (xa_is_value(folio) || folio_test_uptodate(folio))
3041                 return seek_data ? start : end;
3042         if (!ops->is_partially_uptodate)
3043                 return seek_data ? end : start;
3044
3045         xas_pause(xas);
3046         rcu_read_unlock();
3047         folio_lock(folio);
3048         if (unlikely(folio->mapping != mapping))
3049                 goto unlock;
3050
3051         offset = offset_in_folio(folio, start) & ~(bsz - 1);
3052
3053         do {
3054                 if (ops->is_partially_uptodate(folio, offset, bsz) ==
3055                                                         seek_data)
3056                         break;
3057                 start = (start + bsz) & ~((u64)bsz - 1);
3058                 offset += bsz;
3059         } while (offset < folio_size(folio));
3060 unlock:
3061         folio_unlock(folio);
3062         rcu_read_lock();
3063         return start;
3064 }
3065
3066 static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
3067 {
3068         if (xa_is_value(folio))
3069                 return PAGE_SIZE << xas_get_order(xas);
3070         return folio_size(folio);
3071 }
3072
3073 /**
3074  * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
3075  * @mapping: Address space to search.
3076  * @start: First byte to consider.
3077  * @end: Limit of search (exclusive).
3078  * @whence: Either SEEK_HOLE or SEEK_DATA.
3079  *
3080  * If the page cache knows which blocks contain holes and which blocks
3081  * contain data, your filesystem can use this function to implement
3082  * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
3083  * entirely memory-based such as tmpfs, and filesystems which support
3084  * unwritten extents.
3085  *
3086  * Return: The requested offset on success, or -ENXIO if @whence specifies
3087  * SEEK_DATA and there is no data after @start.  There is an implicit hole
3088  * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
3089  * and @end contain data.
3090  */
3091 loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
3092                 loff_t end, int whence)
3093 {
3094         XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
3095         pgoff_t max = (end - 1) >> PAGE_SHIFT;
3096         bool seek_data = (whence == SEEK_DATA);
3097         struct folio *folio;
3098
3099         if (end <= start)
3100                 return -ENXIO;
3101
3102         rcu_read_lock();
3103         while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
3104                 loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
3105                 size_t seek_size;
3106
3107                 if (start < pos) {
3108                         if (!seek_data)
3109                                 goto unlock;
3110                         start = pos;
3111                 }
3112
3113                 seek_size = seek_folio_size(&xas, folio);
3114                 pos = round_up((u64)pos + 1, seek_size);
3115                 start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
3116                                 seek_data);
3117                 if (start < pos)
3118                         goto unlock;
3119                 if (start >= end)
3120                         break;
3121                 if (seek_size > PAGE_SIZE)
3122                         xas_set(&xas, pos >> PAGE_SHIFT);
3123                 if (!xa_is_value(folio))
3124                         folio_put(folio);
3125         }
3126         if (seek_data)
3127                 start = -ENXIO;
3128 unlock:
3129         rcu_read_unlock();
3130         if (folio && !xa_is_value(folio))
3131                 folio_put(folio);
3132         if (start > end)
3133                 return end;
3134         return start;
3135 }
3136
3137 #ifdef CONFIG_MMU
3138 #define MMAP_LOTSAMISS  (100)
3139 /*
3140  * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
3141  * @vmf - the vm_fault for this fault.
3142  * @folio - the folio to lock.
3143  * @fpin - the pointer to the file we may pin (or is already pinned).
3144  *
3145  * This works similar to lock_folio_or_retry in that it can drop the
3146  * mmap_lock.  It differs in that it actually returns the folio locked
3147  * if it returns 1 and 0 if it couldn't lock the folio.  If we did have
3148  * to drop the mmap_lock then fpin will point to the pinned file and
3149  * needs to be fput()'ed at a later point.
3150  */
3151 static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
3152                                      struct file **fpin)
3153 {
3154         if (folio_trylock(folio))
3155                 return 1;
3156
3157         /*
3158          * NOTE! This will make us return with VM_FAULT_RETRY, but with
3159          * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
3160          * is supposed to work. We have way too many special cases..
3161          */
3162         if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
3163                 return 0;
3164
3165         *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
3166         if (vmf->flags & FAULT_FLAG_KILLABLE) {
3167                 if (__folio_lock_killable(folio)) {
3168                         /*
3169                          * We didn't have the right flags to drop the
3170                          * fault lock, but all fault_handlers only check
3171                          * for fatal signals if we return VM_FAULT_RETRY,
3172                          * so we need to drop the fault lock here and
3173                          * return 0 if we don't have a fpin.
3174                          */
3175                         if (*fpin == NULL)
3176                                 release_fault_lock(vmf);
3177                         return 0;
3178                 }
3179         } else
3180                 __folio_lock(folio);
3181
3182         return 1;
3183 }
3184
3185 /*
3186  * Synchronous readahead happens when we don't even find a page in the page
3187  * cache at all.  We don't want to perform IO under the mmap sem, so if we have
3188  * to drop the mmap sem we return the file that was pinned in order for us to do
3189  * that.  If we didn't pin a file then we return NULL.  The file that is
3190  * returned needs to be fput()'ed when we're done with it.
3191  */
3192 static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
3193 {
3194         struct file *file = vmf->vma->vm_file;
3195         struct file_ra_state *ra = &file->f_ra;
3196         struct address_space *mapping = file->f_mapping;
3197         DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
3198         struct file *fpin = NULL;
3199         unsigned long vm_flags = vmf->vma->vm_flags;
3200         unsigned int mmap_miss;
3201
3202         /*
3203          * If we have pre-content watches we need to disable readahead to make
3204          * sure that we don't populate our mapping with 0 filled pages that we
3205          * never emitted an event for.
3206          */
3207         if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
3208                 return fpin;
3209
3210 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3211         /* Use the readahead code, even if readahead is disabled */
3212         if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
3213                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3214                 ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
3215                 ra->size = HPAGE_PMD_NR;
3216                 /*
3217                  * Fetch two PMD folios, so we get the chance to actually
3218                  * readahead, unless we've been told not to.
3219                  */
3220                 if (!(vm_flags & VM_RAND_READ))
3221                         ra->size *= 2;
3222                 ra->async_size = HPAGE_PMD_NR;
3223                 page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
3224                 return fpin;
3225         }
3226 #endif
3227
3228         /* If we don't want any read-ahead, don't bother */
3229         if (vm_flags & VM_RAND_READ)
3230                 return fpin;
3231         if (!ra->ra_pages)
3232                 return fpin;
3233
3234         if (vm_flags & VM_SEQ_READ) {
3235                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3236                 page_cache_sync_ra(&ractl, ra->ra_pages);
3237                 return fpin;
3238         }
3239
3240         /* Avoid banging the cache line if not needed */
3241         mmap_miss = READ_ONCE(ra->mmap_miss);
3242         if (mmap_miss < MMAP_LOTSAMISS * 10)
3243                 WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
3244
3245         /*
3246          * Do we miss much more than hit in this file? If so,
3247          * stop bothering with read-ahead. It will only hurt.
3248          */
3249         if (mmap_miss > MMAP_LOTSAMISS)
3250                 return fpin;
3251
3252         /*
3253          * mmap read-around
3254          */
3255         fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3256         ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
3257         ra->size = ra->ra_pages;
3258         ra->async_size = ra->ra_pages / 4;
3259         ractl._index = ra->start;
3260         page_cache_ra_order(&ractl, ra, 0);
3261         return fpin;
3262 }
3263
3264 /*
3265  * Asynchronous readahead happens when we find the page and PG_readahead,
3266  * so we want to possibly extend the readahead further.  We return the file that
3267  * was pinned if we have to drop the mmap_lock in order to do IO.
3268  */
3269 static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
3270                                             struct folio *folio)
3271 {
3272         struct file *file = vmf->vma->vm_file;
3273         struct file_ra_state *ra = &file->f_ra;
3274         DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
3275         struct file *fpin = NULL;
3276         unsigned int mmap_miss;
3277
3278         /* See comment in do_sync_mmap_readahead. */
3279         if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
3280                 return fpin;
3281
3282         /* If we don't want any read-ahead, don't bother */
3283         if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
3284                 return fpin;
3285
3286         mmap_miss = READ_ONCE(ra->mmap_miss);
3287         if (mmap_miss)
3288                 WRITE_ONCE(ra->mmap_miss, --mmap_miss);
3289
3290         if (folio_test_readahead(folio)) {
3291                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3292                 page_cache_async_ra(&ractl, folio, ra->ra_pages);
3293         }
3294         return fpin;
3295 }
3296
3297 static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
3298 {
3299         struct vm_area_struct *vma = vmf->vma;
3300         vm_fault_t ret = 0;
3301         pte_t *ptep;
3302
3303         /*
3304          * We might have COW'ed a pagecache folio and might now have an mlocked
3305          * anon folio mapped. The original pagecache folio is not mlocked and
3306          * might have been evicted. During a read+clear/modify/write update of
3307          * the PTE, such as done in do_numa_page()/change_pte_range(), we
3308          * temporarily clear the PTE under PT lock and might detect it here as
3309          * "none" when not holding the PT lock.
3310          *
3311          * Not rechecking the PTE under PT lock could result in an unexpected
3312          * major fault in an mlock'ed region. Recheck only for this special
3313          * scenario while holding the PT lock, to not degrade non-mlocked
3314          * scenarios. Recheck the PTE without PT lock firstly, thereby reducing
3315          * the number of times we hold PT lock.
3316          */
3317         if (!(vma->vm_flags & VM_LOCKED))
3318                 return 0;
3319
3320         if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
3321                 return 0;
3322
3323         ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address,
3324                                         &vmf->ptl);
3325         if (unlikely(!ptep))
3326                 return VM_FAULT_NOPAGE;
3327
3328         if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
3329                 ret = VM_FAULT_NOPAGE;
3330         } else {
3331                 spin_lock(vmf->ptl);
3332                 if (unlikely(!pte_none(ptep_get(ptep))))
3333                         ret = VM_FAULT_NOPAGE;
3334                 spin_unlock(vmf->ptl);
3335         }
3336         pte_unmap(ptep);
3337         return ret;
3338 }
3339
3340 /**
3341  * filemap_fsnotify_fault - maybe emit a pre-content event.
3342  * @vmf:        struct vm_fault containing details of the fault.
3343  *
3344  * If we have a pre-content watch on this file we will emit an event for this
3345  * range.  If we return anything the fault caller should return immediately, we
3346  * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the
3347  * fault again and then the fault handler will run the second time through.
3348  *
3349  * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
3350  */
3351 vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
3352 {
3353         struct file *fpin = NULL;
3354         int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
3355         loff_t pos = vmf->pgoff >> PAGE_SHIFT;
3356         size_t count = PAGE_SIZE;
3357         int err;
3358
3359         /*
3360          * We already did this and now we're retrying with everything locked,
3361          * don't emit the event and continue.
3362          */
3363         if (vmf->flags & FAULT_FLAG_TRIED)
3364                 return 0;
3365
3366         /* No watches, we're done. */
3367         if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode)))
3368                 return 0;
3369
3370         fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3371         if (!fpin)
3372                 return VM_FAULT_SIGBUS;
3373
3374         err = fsnotify_file_area_perm(fpin, mask, &pos, count);
3375         fput(fpin);
3376         if (err)
3377                 return VM_FAULT_SIGBUS;
3378         return VM_FAULT_RETRY;
3379 }
3380 EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
3381
3382 /**
3383  * filemap_fault - read in file data for page fault handling
3384  * @vmf:        struct vm_fault containing details of the fault
3385  *
3386  * filemap_fault() is invoked via the vma operations vector for a
3387  * mapped memory region to read in file data during a page fault.
3388  *
3389  * The goto's are kind of ugly, but this streamlines the normal case of having
3390  * it in the page cache, and handles the special cases reasonably without
3391  * having a lot of duplicated code.
3392  *
3393  * vma->vm_mm->mmap_lock must be held on entry.
3394  *
3395  * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
3396  * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
3397  *
3398  * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
3399  * has not been released.
3400  *
3401  * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
3402  *
3403  * Return: bitwise-OR of %VM_FAULT_ codes.
3404  */
3405 vm_fault_t filemap_fault(struct vm_fault *vmf)
3406 {
3407         int error;
3408         struct file *file = vmf->vma->vm_file;
3409         struct file *fpin = NULL;
3410         struct address_space *mapping = file->f_mapping;
3411         struct inode *inode = mapping->host;
3412         pgoff_t max_idx, index = vmf->pgoff;
3413         struct folio *folio;
3414         vm_fault_t ret = 0;
3415         bool mapping_locked = false;
3416
3417         max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
3418         if (unlikely(index >= max_idx))
3419                 return VM_FAULT_SIGBUS;
3420
3421         trace_mm_filemap_fault(mapping, index);
3422
3423         /*
3424          * Do we have something in the page cache already?
3425          */
3426         folio = filemap_get_folio(mapping, index);
3427         if (likely(!IS_ERR(folio))) {
3428                 /*
3429                  * We found the page, so try async readahead before waiting for
3430                  * the lock.
3431                  */
3432                 if (!(vmf->flags & FAULT_FLAG_TRIED))
3433                         fpin = do_async_mmap_readahead(vmf, folio);
3434                 if (unlikely(!folio_test_uptodate(folio))) {
3435                         filemap_invalidate_lock_shared(mapping);
3436                         mapping_locked = true;
3437                 }
3438         } else {
3439                 ret = filemap_fault_recheck_pte_none(vmf);
3440                 if (unlikely(ret))
3441                         return ret;
3442
3443                 /* No page in the page cache at all */
3444                 count_vm_event(PGMAJFAULT);
3445                 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
3446                 ret = VM_FAULT_MAJOR;
3447                 fpin = do_sync_mmap_readahead(vmf);
3448 retry_find:
3449                 /*
3450                  * See comment in filemap_create_folio() why we need
3451                  * invalidate_lock
3452                  */
3453                 if (!mapping_locked) {
3454                         filemap_invalidate_lock_shared(mapping);
3455                         mapping_locked = true;
3456                 }
3457                 folio = __filemap_get_folio(mapping, index,
3458                                           FGP_CREAT|FGP_FOR_MMAP,
3459                                           vmf->gfp_mask);
3460                 if (IS_ERR(folio)) {
3461                         if (fpin)
3462                                 goto out_retry;
3463                         filemap_invalidate_unlock_shared(mapping);
3464                         return VM_FAULT_OOM;
3465                 }
3466         }
3467
3468         if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
3469                 goto out_retry;
3470
3471         /* Did it get truncated? */
3472         if (unlikely(folio->mapping != mapping)) {
3473                 folio_unlock(folio);
3474                 folio_put(folio);
3475                 goto retry_find;
3476         }
3477         VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
3478
3479         /*
3480          * We have a locked folio in the page cache, now we need to check
3481          * that it's up-to-date. If not, it is going to be due to an error,
3482          * or because readahead was otherwise unable to retrieve it.
3483          */
3484         if (unlikely(!folio_test_uptodate(folio))) {
3485                 /*
3486                  * If this is a precontent file we have can now emit an event to
3487                  * try and populate the folio.
3488                  */
3489                 if (!(vmf->flags & FAULT_FLAG_TRIED) &&
3490                     unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
3491                         loff_t pos = folio_pos(folio);
3492                         size_t count = folio_size(folio);
3493
3494                         /* We're NOWAIT, we have to retry. */
3495                         if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
3496                                 folio_unlock(folio);
3497                                 goto out_retry;
3498                         }
3499
3500                         if (mapping_locked)
3501                                 filemap_invalidate_unlock_shared(mapping);
3502                         mapping_locked = false;
3503
3504                         folio_unlock(folio);
3505                         fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3506                         if (!fpin)
3507                                 goto out_retry;
3508
3509                         error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos,
3510                                                         count);
3511                         if (error)
3512                                 ret = VM_FAULT_SIGBUS;
3513                         goto out_retry;
3514                 }
3515
3516                 /*
3517                  * If the invalidate lock is not held, the folio was in cache
3518                  * and uptodate and now it is not. Strange but possible since we
3519                  * didn't hold the page lock all the time. Let's drop
3520                  * everything, get the invalidate lock and try again.
3521                  */
3522                 if (!mapping_locked) {
3523                         folio_unlock(folio);
3524                         folio_put(folio);
3525                         goto retry_find;
3526                 }
3527
3528                 /*
3529                  * OK, the folio is really not uptodate. This can be because the
3530                  * VMA has the VM_RAND_READ flag set, or because an error
3531                  * arose. Let's read it in directly.
3532                  */
3533                 goto page_not_uptodate;
3534         }
3535
3536         /*
3537          * We've made it this far and we had to drop our mmap_lock, now is the
3538          * time to return to the upper layer and have it re-find the vma and
3539          * redo the fault.
3540          */
3541         if (fpin) {
3542                 folio_unlock(folio);
3543                 goto out_retry;
3544         }
3545         if (mapping_locked)
3546                 filemap_invalidate_unlock_shared(mapping);
3547
3548         /*
3549          * Found the page and have a reference on it.
3550          * We must recheck i_size under page lock.
3551          */
3552         max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
3553         if (unlikely(index >= max_idx)) {
3554                 folio_unlock(folio);
3555                 folio_put(folio);
3556                 return VM_FAULT_SIGBUS;
3557         }
3558
3559         vmf->page = folio_file_page(folio, index);
3560         return ret | VM_FAULT_LOCKED;
3561
3562 page_not_uptodate:
3563         /*
3564          * Umm, take care of errors if the page isn't up-to-date.
3565          * Try to re-read it _once_. We do this synchronously,
3566          * because there really aren't any performance issues here
3567          * and we need to check for errors.
3568          */
3569         fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3570         error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
3571         if (fpin)
3572                 goto out_retry;
3573         folio_put(folio);
3574
3575         if (!error || error == AOP_TRUNCATED_PAGE)
3576                 goto retry_find;
3577         filemap_invalidate_unlock_shared(mapping);
3578
3579         return VM_FAULT_SIGBUS;
3580
3581 out_retry:
3582         /*
3583          * We dropped the mmap_lock, we need to return to the fault handler to
3584          * re-find the vma and come back and find our hopefully still populated
3585          * page.
3586          */
3587         if (!IS_ERR(folio))
3588                 folio_put(folio);
3589         if (mapping_locked)
3590                 filemap_invalidate_unlock_shared(mapping);
3591         if (fpin)
3592                 fput(fpin);
3593         return ret | VM_FAULT_RETRY;
3594 }
3595 EXPORT_SYMBOL(filemap_fault);
3596
3597 static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
3598                 pgoff_t start)
3599 {
3600         struct mm_struct *mm = vmf->vma->vm_mm;
3601
3602         /* Huge page is mapped? No need to proceed. */
3603         if (pmd_trans_huge(*vmf->pmd)) {
3604                 folio_unlock(folio);
3605                 folio_put(folio);
3606                 return true;
3607         }
3608
3609         if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
3610                 struct page *page = folio_file_page(folio, start);
3611                 vm_fault_t ret = do_set_pmd(vmf, page);
3612                 if (!ret) {
3613                         /* The page is mapped successfully, reference consumed. */
3614                         folio_unlock(folio);
3615                         return true;
3616                 }
3617         }
3618
3619         if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)
3620                 pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
3621
3622         return false;
3623 }
3624
3625 static struct folio *next_uptodate_folio(struct xa_state *xas,
3626                 struct address_space *mapping, pgoff_t end_pgoff)
3627 {
3628         struct folio *folio = xas_next_entry(xas, end_pgoff);
3629         unsigned long max_idx;
3630
3631         do {
3632                 if (!folio)
3633                         return NULL;
3634                 if (xas_retry(xas, folio))
3635                         continue;
3636                 if (xa_is_value(folio))
3637                         continue;
3638                 if (!folio_try_get(folio))
3639                         continue;
3640                 if (folio_test_locked(folio))
3641                         goto skip;
3642                 /* Has the page moved or been split? */
3643                 if (unlikely(folio != xas_reload(xas)))
3644                         goto skip;
3645                 if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
3646                         goto skip;
3647                 if (!folio_trylock(folio))
3648                         goto skip;
3649                 if (folio->mapping != mapping)
3650                         goto unlock;
3651                 if (!folio_test_uptodate(folio))
3652                         goto unlock;
3653                 max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
3654                 if (xas->xa_index >= max_idx)
3655                         goto unlock;
3656                 return folio;
3657 unlock:
3658                 folio_unlock(folio);
3659 skip:
3660                 folio_put(folio);
3661         } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);
3662
3663         return NULL;
3664 }
3665
3666 /*
3667  * Map page range [start_page, start_page + nr_pages) of folio.
3668  * start_page is gotten from start by folio_page(folio, start)
3669  */
3670 static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
3671                         struct folio *folio, unsigned long start,
3672                         unsigned long addr, unsigned int nr_pages,
3673                         unsigned long *rss, unsigned int *mmap_miss)
3674 {
3675         vm_fault_t ret = 0;
3676         struct page *page = folio_page(folio, start);
3677         unsigned int count = 0;
3678         pte_t *old_ptep = vmf->pte;
3679
3680         do {
3681                 if (PageHWPoison(page + count))
3682                         goto skip;
3683
3684                 /*
3685                  * If there are too many folios that are recently evicted
3686                  * in a file, they will probably continue to be evicted.
3687                  * In such situation, read-ahead is only a waste of IO.
3688                  * Don't decrease mmap_miss in this scenario to make sure
3689                  * we can stop read-ahead.
3690                  */
3691                 if (!folio_test_workingset(folio))
3692                         (*mmap_miss)++;
3693
3694                 /*
3695                  * NOTE: If there're PTE markers, we'll leave them to be
3696                  * handled in the specific fault path, and it'll prohibit the
3697                  * fault-around logic.
3698                  */
3699                 if (!pte_none(ptep_get(&vmf->pte[count])))
3700                         goto skip;
3701
3702                 count++;
3703                 continue;
3704 skip:
3705                 if (count) {
3706                         set_pte_range(vmf, folio, page, count, addr);
3707                         *rss += count;
3708                         folio_ref_add(folio, count);
3709                         if (in_range(vmf->address, addr, count * PAGE_SIZE))
3710                                 ret = VM_FAULT_NOPAGE;
3711                 }
3712
3713                 count++;
3714                 page += count;
3715                 vmf->pte += count;
3716                 addr += count * PAGE_SIZE;
3717                 count = 0;
3718         } while (--nr_pages > 0);
3719
3720         if (count) {
3721                 set_pte_range(vmf, folio, page, count, addr);
3722                 *rss += count;
3723                 folio_ref_add(folio, count);
3724                 if (in_range(vmf->address, addr, count * PAGE_SIZE))
3725                         ret = VM_FAULT_NOPAGE;
3726         }
3727
3728         vmf->pte = old_ptep;
3729
3730         return ret;
3731 }
3732
3733 static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
3734                 struct folio *folio, unsigned long addr,
3735                 unsigned long *rss, unsigned int *mmap_miss)
3736 {
3737         vm_fault_t ret = 0;
3738         struct page *page = &folio->page;
3739
3740         if (PageHWPoison(page))
3741                 return ret;
3742
3743         /* See comment of filemap_map_folio_range() */
3744         if (!folio_test_workingset(folio))
3745                 (*mmap_miss)++;
3746
3747         /*
3748          * NOTE: If there're PTE markers, we'll leave them to be
3749          * handled in the specific fault path, and it'll prohibit
3750          * the fault-around logic.
3751          */
3752         if (!pte_none(ptep_get(vmf->pte)))
3753                 return ret;
3754
3755         if (vmf->address == addr)
3756                 ret = VM_FAULT_NOPAGE;
3757
3758         set_pte_range(vmf, folio, page, 1, addr);
3759         (*rss)++;
3760         folio_ref_inc(folio);
3761
3762         return ret;
3763 }
3764
3765 vm_fault_t filemap_map_pages(struct vm_fault *vmf,
3766                              pgoff_t start_pgoff, pgoff_t end_pgoff)
3767 {
3768         struct vm_area_struct *vma = vmf->vma;
3769         struct file *file = vma->vm_file;
3770         struct address_space *mapping = file->f_mapping;
3771         pgoff_t file_end, last_pgoff = start_pgoff;
3772         unsigned long addr;
3773         XA_STATE(xas, &mapping->i_pages, start_pgoff);
3774         struct folio *folio;
3775         vm_fault_t ret = 0;
3776         unsigned long rss = 0;
3777         unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;
3778
3779         rcu_read_lock();
3780         folio = next_uptodate_folio(&xas, mapping, end_pgoff);
3781         if (!folio)
3782                 goto out;
3783
3784         if (filemap_map_pmd(vmf, folio, start_pgoff)) {
3785                 ret = VM_FAULT_NOPAGE;
3786                 goto out;
3787         }
3788
3789         addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
3790         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
3791         if (!vmf->pte) {
3792                 folio_unlock(folio);
3793                 folio_put(folio);
3794                 goto out;
3795         }
3796
3797         file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
3798         if (end_pgoff > file_end)
3799                 end_pgoff = file_end;
3800
3801         folio_type = mm_counter_file(folio);
3802         do {
3803                 unsigned long end;
3804
3805                 addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
3806                 vmf->pte += xas.xa_index - last_pgoff;
3807                 last_pgoff = xas.xa_index;
3808                 end = folio_next_index(folio) - 1;
3809                 nr_pages = min(end, end_pgoff) - xas.xa_index + 1;
3810
3811                 if (!folio_test_large(folio))
3812                         ret |= filemap_map_order0_folio(vmf,
3813                                         folio, addr, &rss, &mmap_miss);
3814                 else
3815                         ret |= filemap_map_folio_range(vmf, folio,
3816                                         xas.xa_index - folio->index, addr,
3817                                         nr_pages, &rss, &mmap_miss);
3818
3819                 folio_unlock(folio);
3820                 folio_put(folio);
3821         } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
3822         add_mm_counter(vma->vm_mm, folio_type, rss);
3823         pte_unmap_unlock(vmf->pte, vmf->ptl);
3824         trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff);
3825 out:
3826         rcu_read_unlock();
3827
3828         mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
3829         if (mmap_miss >= mmap_miss_saved)
3830                 WRITE_ONCE(file->f_ra.mmap_miss, 0);
3831         else
3832                 WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);
3833
3834         return ret;
3835 }
3836 EXPORT_SYMBOL(filemap_map_pages);
3837
3838 vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
3839 {
3840         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
3841         struct folio *folio = page_folio(vmf->page);
3842         vm_fault_t ret = VM_FAULT_LOCKED;
3843
3844         sb_start_pagefault(mapping->host->i_sb);
3845         file_update_time(vmf->vma->vm_file);
3846         folio_lock(folio);
3847         if (folio->mapping != mapping) {
3848                 folio_unlock(folio);
3849                 ret = VM_FAULT_NOPAGE;
3850                 goto out;
3851         }
3852         /*
3853          * We mark the folio dirty already here so that when freeze is in
3854          * progress, we are guaranteed that writeback during freezing will
3855          * see the dirty folio and writeprotect it again.
3856          */
3857         folio_mark_dirty(folio);
3858         folio_wait_stable(folio);
3859 out:
3860         sb_end_pagefault(mapping->host->i_sb);
3861         return ret;
3862 }
3863
3864 const struct vm_operations_struct generic_file_vm_ops = {
3865         .fault          = filemap_fault,
3866         .map_pages      = filemap_map_pages,
3867         .page_mkwrite   = filemap_page_mkwrite,
3868 };
3869
3870 /* This is used for a general mmap of a disk file */
3871
3872 int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
3873 {
3874         struct address_space *mapping = file->f_mapping;
3875
3876         if (!mapping->a_ops->read_folio)
3877                 return -ENOEXEC;
3878         file_accessed(file);
3879         vma->vm_ops = &generic_file_vm_ops;
3880         return 0;
3881 }
3882
3883 /*
3884  * This is for filesystems which do not implement ->writepage.
3885  */
3886 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
3887 {
3888         if (vma_is_shared_maywrite(vma))
3889                 return -EINVAL;
3890         return generic_file_mmap(file, vma);
3891 }
3892 #else
3893 vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
3894 {
3895         return VM_FAULT_SIGBUS;
3896 }
3897 int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
3898 {
3899         return -ENOSYS;
3900 }
3901 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
3902 {
3903         return -ENOSYS;
3904 }
3905 #endif /* CONFIG_MMU */
3906
3907 EXPORT_SYMBOL(filemap_page_mkwrite);
3908 EXPORT_SYMBOL(generic_file_mmap);
3909 EXPORT_SYMBOL(generic_file_readonly_mmap);
3910
3911 static struct folio *do_read_cache_folio(struct address_space *mapping,
3912                 pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
3913 {
3914         struct folio *folio;
3915         int err;
3916
3917         if (!filler)
3918                 filler = mapping->a_ops->read_folio;
3919 repeat:
3920         folio = filemap_get_folio(mapping, index);
3921         if (IS_ERR(folio)) {
3922                 folio = filemap_alloc_folio(gfp,
3923                                             mapping_min_folio_order(mapping));
3924                 if (!folio)
3925                         return ERR_PTR(-ENOMEM);
3926                 index = mapping_align_index(mapping, index);
3927                 err = filemap_add_folio(mapping, folio, index, gfp);
3928                 if (unlikely(err)) {
3929                         folio_put(folio);
3930                         if (err == -EEXIST)
3931                                 goto repeat;
3932                         /* Presumably ENOMEM for xarray node */
3933                         return ERR_PTR(err);
3934                 }
3935
3936                 goto filler;
3937         }
3938         if (folio_test_uptodate(folio))
3939                 goto out;
3940
3941         if (!folio_trylock(folio)) {
3942                 folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
3943                 goto repeat;
3944         }
3945
3946         /* Folio was truncated from mapping */
3947         if (!folio->mapping) {
3948                 folio_unlock(folio);
3949                 folio_put(folio);
3950                 goto repeat;
3951         }
3952
3953         /* Someone else locked and filled the page in a very small window */
3954         if (folio_test_uptodate(folio)) {
3955                 folio_unlock(folio);
3956                 goto out;
3957         }
3958
3959 filler:
3960         err = filemap_read_folio(file, filler, folio);
3961         if (err) {
3962                 folio_put(folio);
3963                 if (err == AOP_TRUNCATED_PAGE)
3964                         goto repeat;
3965                 return ERR_PTR(err);
3966         }
3967
3968 out:
3969         folio_mark_accessed(folio);
3970         return folio;
3971 }
3972
3973 /**
3974  * read_cache_folio - Read into page cache, fill it if needed.
3975  * @mapping: The address_space to read from.
3976  * @index: The index to read.
3977  * @filler: Function to perform the read, or NULL to use aops->read_folio().
3978  * @file: Passed to filler function, may be NULL if not required.
3979  *
3980  * Read one page into the page cache.  If it succeeds, the folio returned
3981  * will contain @index, but it may not be the first page of the folio.
3982  *
3983  * If the filler function returns an error, it will be returned to the
3984  * caller.
3985  *
3986  * Context: May sleep.  Expects mapping->invalidate_lock to be held.
3987  * Return: An uptodate folio on success, ERR_PTR() on failure.
3988  */
3989 struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
3990                 filler_t filler, struct file *file)
3991 {
3992         return do_read_cache_folio(mapping, index, filler, file,
3993                         mapping_gfp_mask(mapping));
3994 }
3995 EXPORT_SYMBOL(read_cache_folio);
3996
3997 /**
3998  * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
3999  * @mapping:    The address_space for the folio.
4000  * @index:      The index that the allocated folio will contain.
4001  * @gfp:        The page allocator flags to use if allocating.
4002  *
4003  * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
4004  * any new memory allocations done using the specified allocation flags.
4005  *
4006  * The most likely error from this function is EIO, but ENOMEM is
4007  * possible and so is EINTR.  If ->read_folio returns another error,
4008  * that will be returned to the caller.
4009  *
4010  * The function expects mapping->invalidate_lock to be already held.
4011  *
4012  * Return: Uptodate folio on success, ERR_PTR() on failure.
4013  */
4014 struct folio *mapping_read_folio_gfp(struct address_space *mapping,
4015                 pgoff_t index, gfp_t gfp)
4016 {
4017         return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
4018 }
4019 EXPORT_SYMBOL(mapping_read_folio_gfp);
4020
4021 static struct page *do_read_cache_page(struct address_space *mapping,
4022                 pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
4023 {
4024         struct folio *folio;
4025
4026         folio = do_read_cache_folio(mapping, index, filler, file, gfp);
4027         if (IS_ERR(folio))
4028                 return &folio->page;
4029         return folio_file_page(folio, index);
4030 }
4031
4032 struct page *read_cache_page(struct address_space *mapping,
4033                         pgoff_t index, filler_t *filler, struct file *file)
4034 {
4035         return do_read_cache_page(mapping, index, filler, file,
4036                         mapping_gfp_mask(mapping));
4037 }
4038 EXPORT_SYMBOL(read_cache_page);
4039
4040 /**
4041  * read_cache_page_gfp - read into page cache, using specified page allocation flags.
4042  * @mapping:    the page's address_space
4043  * @index:      the page index
4044  * @gfp:        the page allocator flags to use if allocating
4045  *
4046  * This is the same as "read_mapping_page(mapping, index, NULL)", but with
4047  * any new page allocations done using the specified allocation flags.
4048  *
4049  * If the page does not get brought uptodate, return -EIO.
4050  *
4051  * The function expects mapping->invalidate_lock to be already held.
4052  *
4053  * Return: up to date page on success, ERR_PTR() on failure.
4054  */
4055 struct page *read_cache_page_gfp(struct address_space *mapping,
4056                                 pgoff_t index,
4057                                 gfp_t gfp)
4058 {
4059         return do_read_cache_page(mapping, index, NULL, NULL, gfp);
4060 }
4061 EXPORT_SYMBOL(read_cache_page_gfp);
4062
4063 /*
4064  * Warn about a page cache invalidation failure during a direct I/O write.
4065  */
4066 static void dio_warn_stale_pagecache(struct file *filp)
4067 {
4068         static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
4069         char pathname[128];
4070         char *path;
4071
4072         errseq_set(&filp->f_mapping->wb_err, -EIO);
4073         if (__ratelimit(&_rs)) {
4074                 path = file_path(filp, pathname, sizeof(pathname));
4075                 if (IS_ERR(path))
4076                         path = "(unknown)";
4077                 pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
4078                 pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
4079                         current->comm);
4080         }
4081 }
4082
4083 void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
4084 {
4085         struct address_space *mapping = iocb->ki_filp->f_mapping;
4086
4087         if (mapping->nrpages &&
4088             invalidate_inode_pages2_range(mapping,
4089                         iocb->ki_pos >> PAGE_SHIFT,
4090                         (iocb->ki_pos + count - 1) >> PAGE_SHIFT))
4091                 dio_warn_stale_pagecache(iocb->ki_filp);
4092 }
4093
4094 ssize_t
4095 generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
4096 {
4097         struct address_space *mapping = iocb->ki_filp->f_mapping;
4098         size_t write_len = iov_iter_count(from);
4099         ssize_t written;
4100
4101         /*
4102          * If a page can not be invalidated, return 0 to fall back
4103          * to buffered write.
4104          */
4105         written = kiocb_invalidate_pages(iocb, write_len);
4106         if (written) {
4107                 if (written == -EBUSY)
4108                         return 0;
4109                 return written;
4110         }
4111
4112         written = mapping->a_ops->direct_IO(iocb, from);
4113
4114         /*
4115          * Finally, try again to invalidate clean pages which might have been
4116          * cached by non-direct readahead, or faulted in by get_user_pages()
4117          * if the source of the write was an mmap'ed region of the file
4118          * we're writing.  Either one is a pretty crazy thing to do,
4119          * so we don't support it 100%.  If this invalidation
4120          * fails, tough, the write still worked...
4121          *
4122          * Most of the time we do not need this since dio_complete() will do
4123          * the invalidation for us. However there are some file systems that
4124          * do not end up with dio_complete() being called, so let's not break
4125          * them by removing it completely.
4126          *
4127          * Noticeable example is a blkdev_direct_IO().
4128          *
4129          * Skip invalidation for async writes or if mapping has no pages.
4130          */
4131         if (written > 0) {
4132                 struct inode *inode = mapping->host;
4133                 loff_t pos = iocb->ki_pos;
4134
4135                 kiocb_invalidate_post_direct_write(iocb, written);
4136                 pos += written;
4137                 write_len -= written;
4138                 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
4139                         i_size_write(inode, pos);
4140                         mark_inode_dirty(inode);
4141                 }
4142                 iocb->ki_pos = pos;
4143         }
4144         if (written != -EIOCBQUEUED)
4145                 iov_iter_revert(from, write_len - iov_iter_count(from));
4146         return written;
4147 }
4148 EXPORT_SYMBOL(generic_file_direct_write);
4149
4150 ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
4151 {
4152         struct file *file = iocb->ki_filp;
4153         loff_t pos = iocb->ki_pos;
4154         struct address_space *mapping = file->f_mapping;
4155         const struct address_space_operations *a_ops = mapping->a_ops;
4156         size_t chunk = mapping_max_folio_size(mapping);
4157         long status = 0;
4158         ssize_t written = 0;
4159
4160         do {
4161                 struct folio *folio;
4162                 size_t offset;          /* Offset into folio */
4163                 size_t bytes;           /* Bytes to write to folio */
4164                 size_t copied;          /* Bytes copied from user */
4165                 void *fsdata = NULL;
4166
4167                 bytes = iov_iter_count(i);
4168 retry:
4169                 offset = pos & (chunk - 1);
4170                 bytes = min(chunk - offset, bytes);
4171                 balance_dirty_pages_ratelimited(mapping);
4172
4173                 /*
4174                  * Bring in the user page that we will copy from _first_.
4175                  * Otherwise there's a nasty deadlock on copying from the
4176                  * same page as we're writing to, without it being marked
4177                  * up-to-date.
4178                  */
4179                 if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
4180                         status = -EFAULT;
4181                         break;
4182                 }
4183
4184                 if (fatal_signal_pending(current)) {
4185                         status = -EINTR;
4186                         break;
4187                 }
4188
4189                 status = a_ops->write_begin(file, mapping, pos, bytes,
4190                                                 &folio, &fsdata);
4191                 if (unlikely(status < 0))
4192                         break;
4193
4194                 offset = offset_in_folio(folio, pos);
4195                 if (bytes > folio_size(folio) - offset)
4196                         bytes = folio_size(folio) - offset;
4197
4198                 if (mapping_writably_mapped(mapping))
4199                         flush_dcache_folio(folio);
4200
4201                 copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
4202                 flush_dcache_folio(folio);
4203
4204                 status = a_ops->write_end(file, mapping, pos, bytes, copied,
4205                                                 folio, fsdata);
4206                 if (unlikely(status != copied)) {
4207                         iov_iter_revert(i, copied - max(status, 0L));
4208                         if (unlikely(status < 0))
4209                                 break;
4210                 }
4211                 cond_resched();
4212
4213                 if (unlikely(status == 0)) {
4214                         /*
4215                          * A short copy made ->write_end() reject the
4216                          * thing entirely.  Might be memory poisoning
4217                          * halfway through, might be a race with munmap,
4218                          * might be severe memory pressure.
4219                          */
4220                         if (chunk > PAGE_SIZE)
4221                                 chunk /= 2;
4222                         if (copied) {
4223                                 bytes = copied;
4224                                 goto retry;
4225                         }
4226                 } else {
4227                         pos += status;
4228                         written += status;
4229                 }
4230         } while (iov_iter_count(i));
4231
4232         if (!written)
4233                 return status;
4234         iocb->ki_pos += written;
4235         return written;
4236 }
4237 EXPORT_SYMBOL(generic_perform_write);
4238
4239 /**
4240  * __generic_file_write_iter - write data to a file
4241  * @iocb:       IO state structure (file, offset, etc.)
4242  * @from:       iov_iter with data to write
4243  *
4244  * This function does all the work needed for actually writing data to a
4245  * file. It does all basic checks, removes SUID from the file, updates
4246  * modification times and calls proper subroutines depending on whether we
4247  * do direct IO or a standard buffered write.
4248  *
4249  * It expects i_rwsem to be grabbed unless we work on a block device or similar
4250  * object which does not need locking at all.
4251  *
4252  * This function does *not* take care of syncing data in case of O_SYNC write.
4253  * A caller has to handle it. This is mainly due to the fact that we want to
4254  * avoid syncing under i_rwsem.
4255  *
4256  * Return:
4257  * * number of bytes written, even for truncated writes
4258  * * negative error code if no data has been written at all
4259  */
4260 ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
4261 {
4262         struct file *file = iocb->ki_filp;
4263         struct address_space *mapping = file->f_mapping;
4264         struct inode *inode = mapping->host;
4265         ssize_t ret;
4266
4267         ret = file_remove_privs(file);
4268         if (ret)
4269                 return ret;
4270
4271         ret = file_update_time(file);
4272         if (ret)
4273                 return ret;
4274
4275         if (iocb->ki_flags & IOCB_DIRECT) {
4276                 ret = generic_file_direct_write(iocb, from);
4277                 /*
4278                  * If the write stopped short of completing, fall back to
4279                  * buffered writes.  Some filesystems do this for writes to
4280                  * holes, for example.  For DAX files, a buffered write will
4281                  * not succeed (even if it did, DAX does not handle dirty
4282                  * page-cache pages correctly).
4283                  */
4284                 if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
4285                         return ret;
4286                 return direct_write_fallback(iocb, from, ret,
4287                                 generic_perform_write(iocb, from));
4288         }
4289
4290         return generic_perform_write(iocb, from);
4291 }
4292 EXPORT_SYMBOL(__generic_file_write_iter);
4293
4294 /**
4295  * generic_file_write_iter - write data to a file
4296  * @iocb:       IO state structure
4297  * @from:       iov_iter with data to write
4298  *
4299  * This is a wrapper around __generic_file_write_iter() to be used by most
4300  * filesystems. It takes care of syncing the file in case of O_SYNC file
4301  * and acquires i_rwsem as needed.
4302  * Return:
4303  * * negative error code if no data has been written at all of
4304  *   vfs_fsync_range() failed for a synchronous write
4305  * * number of bytes written, even for truncated writes
4306  */
4307 ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
4308 {
4309         struct file *file = iocb->ki_filp;
4310         struct inode *inode = file->f_mapping->host;
4311         ssize_t ret;
4312
4313         inode_lock(inode);
4314         ret = generic_write_checks(iocb, from);
4315         if (ret > 0)
4316                 ret = __generic_file_write_iter(iocb, from);
4317         inode_unlock(inode);
4318
4319         if (ret > 0)
4320                 ret = generic_write_sync(iocb, ret);
4321         return ret;
4322 }
4323 EXPORT_SYMBOL(generic_file_write_iter);
4324
4325 /**
4326  * filemap_release_folio() - Release fs-specific metadata on a folio.
4327  * @folio: The folio which the kernel is trying to free.
4328  * @gfp: Memory allocation flags (and I/O mode).
4329  *
4330  * The address_space is trying to release any data attached to a folio
4331  * (presumably at folio->private).
4332  *
4333  * This will also be called if the private_2 flag is set on a page,
4334  * indicating that the folio has other metadata associated with it.
4335  *
4336  * The @gfp argument specifies whether I/O may be performed to release
4337  * this page (__GFP_IO), and whether the call may block
4338  * (__GFP_RECLAIM & __GFP_FS).
4339  *
4340  * Return: %true if the release was successful, otherwise %false.
4341  */
4342 bool filemap_release_folio(struct folio *folio, gfp_t gfp)
4343 {
4344         struct address_space * const mapping = folio->mapping;
4345
4346         BUG_ON(!folio_test_locked(folio));
4347         if (!folio_needs_release(folio))
4348                 return true;
4349         if (folio_test_writeback(folio))
4350                 return false;
4351
4352         if (mapping && mapping->a_ops->release_folio)
4353                 return mapping->a_ops->release_folio(folio, gfp);
4354         return try_to_free_buffers(folio);
4355 }
4356 EXPORT_SYMBOL(filemap_release_folio);
4357
4358 /**
4359  * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache
4360  * @inode: The inode to flush
4361  * @flush: Set to write back rather than simply invalidate.
4362  * @start: First byte to in range.
4363  * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start
4364  *       onwards.
4365  *
4366  * Invalidate all the folios on an inode that contribute to the specified
4367  * range, possibly writing them back first.  Whilst the operation is
4368  * undertaken, the invalidate lock is held to prevent new folios from being
4369  * installed.
4370  */
4371 int filemap_invalidate_inode(struct inode *inode, bool flush,
4372                              loff_t start, loff_t end)
4373 {
4374         struct address_space *mapping = inode->i_mapping;
4375         pgoff_t first = start >> PAGE_SHIFT;
4376         pgoff_t last = end >> PAGE_SHIFT;
4377         pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1;
4378
4379         if (!mapping || !mapping->nrpages || end < start)
4380                 goto out;
4381
4382         /* Prevent new folios from being added to the inode. */
4383         filemap_invalidate_lock(mapping);
4384
4385         if (!mapping->nrpages)
4386                 goto unlock;
4387
4388         unmap_mapping_pages(mapping, first, nr, false);
4389
4390         /* Write back the data if we're asked to. */
4391         if (flush) {
4392                 struct writeback_control wbc = {
4393                         .sync_mode      = WB_SYNC_ALL,
4394                         .nr_to_write    = LONG_MAX,
4395                         .range_start    = start,
4396                         .range_end      = end,
4397                 };
4398
4399                 filemap_fdatawrite_wbc(mapping, &wbc);
4400         }
4401
4402         /* Wait for writeback to complete on all folios and discard. */
4403         invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);
4404
4405 unlock:
4406         filemap_invalidate_unlock(mapping);
4407 out:
4408         return filemap_check_errors(mapping);
4409 }
4410 EXPORT_SYMBOL_GPL(filemap_invalidate_inode);
4411
4412 #ifdef CONFIG_CACHESTAT_SYSCALL
4413 /**
4414  * filemap_cachestat() - compute the page cache statistics of a mapping
4415  * @mapping:    The mapping to compute the statistics for.
4416  * @first_index:        The starting page cache index.
4417  * @last_index: The final page index (inclusive).
4418  * @cs: the cachestat struct to write the result to.
4419  *
4420  * This will query the page cache statistics of a mapping in the
4421  * page range of [first_index, last_index] (inclusive). The statistics
4422  * queried include: number of dirty pages, number of pages marked for
4423  * writeback, and the number of (recently) evicted pages.
4424  */
4425 static void filemap_cachestat(struct address_space *mapping,
4426                 pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
4427 {
4428         XA_STATE(xas, &mapping->i_pages, first_index);
4429         struct folio *folio;
4430
4431         /* Flush stats (and potentially sleep) outside the RCU read section. */
4432         mem_cgroup_flush_stats_ratelimited(NULL);
4433
4434         rcu_read_lock();
4435         xas_for_each(&xas, folio, last_index) {
4436                 int order;
4437                 unsigned long nr_pages;
4438                 pgoff_t folio_first_index, folio_last_index;
4439
4440                 /*
4441                  * Don't deref the folio. It is not pinned, and might
4442                  * get freed (and reused) underneath us.
4443                  *
4444                  * We *could* pin it, but that would be expensive for
4445                  * what should be a fast and lightweight syscall.
4446                  *
4447                  * Instead, derive all information of interest from
4448                  * the rcu-protected xarray.
4449                  */
4450
4451                 if (xas_retry(&xas, folio))
4452                         continue;
4453
4454                 order = xas_get_order(&xas);
4455                 nr_pages = 1 << order;
4456                 folio_first_index = round_down(xas.xa_index, 1 << order);
4457                 folio_last_index = folio_first_index + nr_pages - 1;
4458
4459                 /* Folios might straddle the range boundaries, only count covered pages */
4460                 if (folio_first_index < first_index)
4461                         nr_pages -= first_index - folio_first_index;
4462
4463                 if (folio_last_index > last_index)
4464                         nr_pages -= folio_last_index - last_index;
4465
4466                 if (xa_is_value(folio)) {
4467                         /* page is evicted */
4468                         void *shadow = (void *)folio;
4469                         bool workingset; /* not used */
4470
4471                         cs->nr_evicted += nr_pages;
4472
4473 #ifdef CONFIG_SWAP /* implies CONFIG_MMU */
4474                         if (shmem_mapping(mapping)) {
4475                                 /* shmem file - in swap cache */
4476                                 swp_entry_t swp = radix_to_swp_entry(folio);
4477
4478                                 /* swapin error results in poisoned entry */
4479                                 if (non_swap_entry(swp))
4480                                         goto resched;
4481
4482                                 /*
4483                                  * Getting a swap entry from the shmem
4484                                  * inode means we beat
4485                                  * shmem_unuse(). rcu_read_lock()
4486                                  * ensures swapoff waits for us before
4487                                  * freeing the swapper space. However,
4488                                  * we can race with swapping and
4489                                  * invalidation, so there might not be
4490                                  * a shadow in the swapcache (yet).
4491                                  */
4492                                 shadow = get_shadow_from_swap_cache(swp);
4493                                 if (!shadow)
4494                                         goto resched;
4495                         }
4496 #endif
4497                         if (workingset_test_recent(shadow, true, &workingset, false))
4498                                 cs->nr_recently_evicted += nr_pages;
4499
4500                         goto resched;
4501                 }
4502
4503                 /* page is in cache */
4504                 cs->nr_cache += nr_pages;
4505
4506                 if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
4507                         cs->nr_dirty += nr_pages;
4508
4509                 if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
4510                         cs->nr_writeback += nr_pages;
4511
4512 resched:
4513                 if (need_resched()) {
4514                         xas_pause(&xas);
4515                         cond_resched_rcu();
4516                 }
4517         }
4518         rcu_read_unlock();
4519 }
4520
4521 /*
4522  * See mincore: reveal pagecache information only for files
4523  * that the calling process has write access to, or could (if
4524  * tried) open for writing.
4525  */
4526 static inline bool can_do_cachestat(struct file *f)
4527 {
4528         if (f->f_mode & FMODE_WRITE)
4529                 return true;
4530         if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f)))
4531                 return true;
4532         return file_permission(f, MAY_WRITE) == 0;
4533 }
4534
4535 /*
4536  * The cachestat(2) system call.
4537  *
4538  * cachestat() returns the page cache statistics of a file in the
4539  * bytes range specified by `off` and `len`: number of cached pages,
4540  * number of dirty pages, number of pages marked for writeback,
4541  * number of evicted pages, and number of recently evicted pages.
4542  *
4543  * An evicted page is a page that is previously in the page cache
4544  * but has been evicted since. A page is recently evicted if its last
4545  * eviction was recent enough that its reentry to the cache would
4546  * indicate that it is actively being used by the system, and that
4547  * there is memory pressure on the system.
4548  *
4549  * `off` and `len` must be non-negative integers. If `len` > 0,
4550  * the queried range is [`off`, `off` + `len`]. If `len` == 0,
4551  * we will query in the range from `off` to the end of the file.
4552  *
4553  * The `flags` argument is unused for now, but is included for future
4554  * extensibility. User should pass 0 (i.e no flag specified).
4555  *
4556  * Currently, hugetlbfs is not supported.
4557  *
4558  * Because the status of a page can change after cachestat() checks it
4559  * but before it returns to the application, the returned values may
4560  * contain stale information.
4561  *
4562  * return values:
4563  *  zero        - success
4564  *  -EFAULT     - cstat or cstat_range points to an illegal address
4565  *  -EINVAL     - invalid flags
4566  *  -EBADF      - invalid file descriptor
4567  *  -EOPNOTSUPP - file descriptor is of a hugetlbfs file
4568  */
4569 SYSCALL_DEFINE4(cachestat, unsigned int, fd,
4570                 struct cachestat_range __user *, cstat_range,
4571                 struct cachestat __user *, cstat, unsigned int, flags)
4572 {
4573         CLASS(fd, f)(fd);
4574         struct address_space *mapping;
4575         struct cachestat_range csr;
4576         struct cachestat cs;
4577         pgoff_t first_index, last_index;
4578
4579         if (fd_empty(f))
4580                 return -EBADF;
4581
4582         if (copy_from_user(&csr, cstat_range,
4583                         sizeof(struct cachestat_range)))
4584                 return -EFAULT;
4585
4586         /* hugetlbfs is not supported */
4587         if (is_file_hugepages(fd_file(f)))
4588                 return -EOPNOTSUPP;
4589
4590         if (!can_do_cachestat(fd_file(f)))
4591                 return -EPERM;
4592
4593         if (flags != 0)
4594                 return -EINVAL;
4595
4596         first_index = csr.off >> PAGE_SHIFT;
4597         last_index =
4598                 csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
4599         memset(&cs, 0, sizeof(struct cachestat));
4600         mapping = fd_file(f)->f_mapping;
4601         filemap_cachestat(mapping, first_index, last_index, &cs);
4602
4603         if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
4604                 return -EFAULT;
4605
4606         return 0;
4607 }
4608 #endif /* CONFIG_CACHESTAT_SYSCALL */