// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	const struct mm_walk_ops *ops = walk->ops;

	pte = pte_offset_map(pmd, addr);
	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		addr += PAGE_SIZE;
		if (addr == end)
			break;
		pte++;
	}

	pte_unmap(pte);
	return err;
}
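/*
 * Illustrative sketch, not part of this file: a minimal ->pte_entry()
 * handler of the shape walk_pte_range() invokes for every pte. The
 * function name and the use of walk->private as a counter are hypothetical.
 */
#if 0
static int count_present_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;

	return 0;	/* zero: keep walking */
}
#endif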
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) || !walk->vma) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if (!ops->pte_entry)
			continue;

		split_huge_pmd(walk->vma, pmd, addr);
		if (pmd_trans_unstable(pmd))
			goto again;
		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pud = pud_offset(p4d, addr);
	do {
 again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud) || !walk->vma) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}

		if (ops->pud_entry) {
			spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

			if (ptl) {
				err = ops->pud_entry(pud, addr, next, walk);
				spin_unlock(ptl);
				if (err)
					break;
				continue;
			}
		}

		split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		if (ops->pmd_entry || ops->pte_entry)
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}
static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}
static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pmd_entry || ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);

	return boundary < end ? boundary : end;
}
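/*
 * Worked example for hugetlb_entry_end() above, assuming a 2 MB huge page
 * size: with addr == 0x40201000, (addr & huge_page_mask(h)) == 0x40200000,
 * so the next boundary is 0x40400000, clamped to @end if that is smaller.
 */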
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);

		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, walk);

		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}
#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */
/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
	 * range, so we don't walk over it as we do for normal vmas. However,
	 * some callers are interested in handling hole ranges and they don't
	 * want to just ignore any single address range. Such users certainly
	 * define their ->pte_hole() callbacks, so let's delegate them to handle
	 * vma(VM_PFNMAP).
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;

		if (ops->pte_hole)
			err = ops->pte_hole(start, end, walk);
		return err ? err : 1;
	}
	return 0;
}
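/*
 * Illustrative sketch of a ->test_walk() callback following the convention
 * documented above: return 1 to skip the vma, 0 to walk it, a negative
 * errno to abort. The name and the VM_LOCKED policy are hypothetical.
 */
#if 0
static int example_test_walk(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	if (walk->vma->vm_flags & VM_LOCKED)
		return 1;	/* skip mlocked vmas, keep walking the rest */

	return 0;		/* walk this vma */
}
#endif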
static int __walk_page_range(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;

	if (vma && is_vm_hugetlb_page(vma)) {
		if (walk->ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	return err;
}
/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm:		mm_struct representing the target process of page table walk
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operations to call during the walk
 * @private:	private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : succeeded to handle the current entry, and if you don't reach the
 *         end address yet, continue to walk.
 *  - >0 : succeeded to handle the current entry, and return to the caller
 *         with caller specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() must hold @mm->mmap_sem,
 *   because these functions traverse the vma list and/or access vma data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	lockdep_assert_held(&walk.mm->mmap_sem);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
		} else { /* inside vma */
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = vma->vm_next;

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
		}
		if (walk.vma || walk.ops->pte_hole)
			err = __walk_page_range(start, next, &walk);
		if (err)
			break;
	} while (start = next, start < end);

	return err;
}
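/*
 * Illustrative sketch of a caller: build an mm_walk_ops, take mmap_sem as
 * required above, and let walk_page_range() drive the callbacks. The ops
 * structure, helper name and counting policy are hypothetical; the
 * ->pte_entry() sketch earlier in this file is reused here.
 */
#if 0
static const struct mm_walk_ops count_present_ops = {
	.pte_entry	= count_present_pte,
};

static unsigned long count_present_pages(struct mm_struct *mm,
					 unsigned long start,
					 unsigned long end)
{
	unsigned long count = 0;

	down_read(&mm->mmap_sem);
	walk_page_range(mm, start, end, &count_present_ops, &count);
	up_read(&mm->mmap_sem);

	return count;
}
#endif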
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};
	int err;

	if (!walk.mm)
		return -EINVAL;

	lockdep_assert_held(&walk.mm->mmap_sem);

	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
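/*
 * Illustrative sketch: walk_page_vma() is the single-vma variant. Under the
 * same locking rules a caller might do (hypothetical ops/data names):
 *
 *	err = walk_page_vma(vma, &count_present_ops, &count);
 */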