mm/page_reporting.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/mm.h>
   3 #include <linux/mmzone.h>
   4 #include <linux/page_reporting.h>
   5 #include <linux/gfp.h>
   6 #include <linux/export.h>
   7 #include <linux/module.h>
   8 #include <linux/delay.h>
   9 #include <linux/scatterlist.h>
  10
  11 #include "page_reporting.h"
  12 #include "internal.h"
  13
/*
 * Lowest buddy order that page reporting will process. It starts out as -1
 * (which wraps to the largest unsigned value), an order that can never be
 * valid; page_reporting_register() uses that value to detect that no order
 * has been configured yet.
 */
unsigned int page_reporting_order = -1;
  16
/*
 * Setter for the page_reporting_order parameter. Passes @val and @kp to
 * param_set_uint_minmax() with the bounds 0 and MAX_PAGE_ORDER and returns
 * its result unchanged.
 */
static int page_order_update_notify(const char *val, const struct kernel_param *kp)
{
        /*
         * NOTE(review): an earlier comment here said that a value beyond
         * the limit makes the order fall back to pageblock_order. Nothing
         * in this function does that; param_set_uint_minmax() is expected
         * to fail the write and leave the current value untouched -
         * confirm against its implementation.
         */
        return  param_set_uint_minmax(val, kp, 0, MAX_PAGE_ORDER);
}
  25
/* Accessor callbacks backing the page_reporting_order parameter. */
static const struct kernel_param_ops page_reporting_param_ops = {
        .set = &page_order_update_notify,
        /*
         * Read the value back with param_get_int rather than
         * param_get_uint, so that the initial "unset" value is shown as
         * -1 instead of as a large unsigned number.
         */
        .get = &param_get_int,
};

/* Register page_reporting_order as a parameter with mode 0644. */
module_param_cb(page_reporting_order, &page_reporting_param_ops,
                        &page_reporting_order, 0644);
MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");
  39
/*
 * page_reporting_order is also a kernel parameter. The symbol is exported
 * so that other drivers can access it to control the order value without
 * having to introduce another configurable parameter of their own. Only
 * one driver can register with the page_reporting driver for the service,
 * so a single control parameter, accessible from both drivers, is enough.
 */
EXPORT_SYMBOL_GPL(page_reporting_order);
  49
/* Delay applied every time the reporting work is (re)scheduled: 2 seconds. */
#define PAGE_REPORTING_DELAY    (2 * HZ)
/*
 * The registered reporting device, or NULL when there is none. Read with
 * rcu_dereference() on the notification path; written while holding
 * page_reporting_mutex.
 */
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
  52
/* Values stored in the state field of struct page_reporting_dev_info. */
enum {
        /* No pass is pending; a new request must schedule the work. */
        PAGE_REPORTING_IDLE = 0,
        /* A pass has been requested and has not been picked up yet. */
        PAGE_REPORTING_REQUESTED,
        /* Written by the worker when it starts a pass. */
        PAGE_REPORTING_ACTIVE
};
  58
  59 /* request page reporting */
  60 static void
  61 __page_reporting_request(struct page_reporting_dev_info *prdev)
  62 {
  63         unsigned int state;
  64
  65         /* Check to see if we are in desired state */
  66         state = atomic_read(&prdev->state);
  67         if (state == PAGE_REPORTING_REQUESTED)
  68                 return;
  69
  70         /*
  71          * If reporting is already active there is nothing we need to do.
  72          * Test against 0 as that represents PAGE_REPORTING_IDLE.
  73          */
  74         state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
  75         if (state != PAGE_REPORTING_IDLE)
  76                 return;
  77
  78         /*
  79          * Delay the start of work to allow a sizable queue to build. For
  80          * now we are limiting this to running no more than once every
  81          * couple of seconds.
  82          */
  83         schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
  84 }
  85
  86 /* notify prdev of free page reporting request */
  87 void __page_reporting_notify(void)
  88 {
  89         struct page_reporting_dev_info *prdev;
  90
  91         /*
  92          * We use RCU to protect the pr_dev_info pointer. In almost all
  93          * cases this should be present, however in the unlikely case of
  94          * a shutdown this will be NULL and we should exit.
  95          */
  96         rcu_read_lock();
  97         prdev = rcu_dereference(pr_dev_info);
  98         if (likely(prdev))
  99                 __page_reporting_request(prdev);
 100
 101         rcu_read_unlock();
 102 }
 103
 104 static void
 105 page_reporting_drain(struct page_reporting_dev_info *prdev,
 106                      struct scatterlist *sgl, unsigned int nents, bool reported)
 107 {
 108         struct scatterlist *sg = sgl;
 109
 110         /*
 111          * Drain the now reported pages back into their respective
 112          * free lists/areas. We assume at least one page is populated.
 113          */
 114         do {
 115                 struct page *page = sg_page(sg);
 116                 int mt = get_pageblock_migratetype(page);
 117                 unsigned int order = get_order(sg->length);
 118
 119                 __putback_isolated_page(page, order, mt);
 120
 121                 /* If the pages were not reported due to error skip flagging */
 122                 if (!reported)
 123                         continue;
 124
 125                 /*
 126                  * If page was not comingled with another page we can
 127                  * consider the result to be "reported" since the page
 128                  * hasn't been modified, otherwise we will need to
 129                  * report on the new larger page when we make our way
 130                  * up to that higher order.
 131                  */
 132                 if (PageBuddy(page) && buddy_order(page) == order)
 133                         __SetPageReported(page);
 134         } while ((sg = sg_next(sg)));
 135
 136         /* reinitialize scatterlist now that it is empty */
 137         sg_init_table(sgl, nents);
 138 }
 139
/*
 * The page reporting cycle consists of 4 stages: fill, report, drain, and
 * idle. We cycle through the first 3 stages until we cannot obtain a full
 * scatterlist of pages; in that case we switch to idle.
 *
 * This function runs that cycle over a single free list, selected by
 * @order and @mt within @zone. @sgl is filled from the back towards the
 * front: *@offset is the index of the lowest slot already in use, and is
 * PAGE_REPORTING_CAPACITY when the scatterlist holds no pages. Entries
 * still in @sgl on return are left there, with *@offset describing them.
 *
 * zone->lock is taken and released here, and is dropped around every call
 * to prdev->report().
 *
 * Returns 0, or the non-zero value handed back by prdev->report().
 */
static int
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
                     unsigned int order, unsigned int mt,
                     struct scatterlist *sgl, unsigned int *offset)
{
        struct free_area *area = &zone->free_area[order];
        struct list_head *list = &area->free_list[mt];
        unsigned int page_len = PAGE_SIZE << order;
        struct page *page, *next;
        long budget;
        int err = 0;

        /*
         * Perform an early check without taking the lock: if the free
         * list is empty there is nothing to process and we can skip it.
         */
        if (list_empty(list))
                return err;

        spin_lock_irq(&zone->lock);

        /*
         * Limit how many calls we will be making to the page reporting
         * device for this list. By doing this we avoid processing any
         * given list for too long.
         *
         * The current value used allows us enough calls to process over a
         * sixteenth of the current list plus one additional call to handle
         * any pages that may have already been present from the previous
         * list processed. This should result in us reporting all pages on
         * an idle system in about 30 seconds.
         *
         * The division here should be cheap since PAGE_REPORTING_CAPACITY
         * should always be a power of 2.
         */
        budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);

        /* loop through free list adding unreported pages to sg list */
        list_for_each_entry_safe(page, next, list, lru) {
                /* We are going to skip over the reported pages. */
                if (PageReported(page))
                        continue;

                /*
                 * If we fully consumed our budget then update our
                 * state to indicate that we are requesting additional
                 * processing and exit this list. Pointing next at the
                 * current page lets the rotation after the loop bring it
                 * to the front of the list.
                 */
                if (budget < 0) {
                        atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
                        next = page;
                        break;
                }

                /* Attempt to pull page from list and place in scatterlist */
                if (*offset) {
                        /* isolation failed: stop working on this list */
                        if (!__isolate_free_page(page, order)) {
                                next = page;
                                break;
                        }

                        /* Add page to scatter list, filling it back to front */
                        --(*offset);
                        sg_set_page(&sgl[*offset], page, page_len, 0);

                        continue;
                }

                /*
                 * The scatterlist is full. Make the first non-reported
                 * page in the free list the new head of the free list
                 * before we release the zone lock.
                 */
                if (!list_is_first(&page->lru, list))
                        list_rotate_to_front(&page->lru, list);

                /* release lock before waiting on report processing */
                spin_unlock_irq(&zone->lock);

                /* begin processing pages in local list */
                err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);

                /* reset offset since the full list was reported */
                *offset = PAGE_REPORTING_CAPACITY;

                /* update budget to reflect call to report function */
                budget--;

                /* reacquire zone lock and resume processing */
                spin_lock_irq(&zone->lock);

                /*
                 * flush reported pages from the sg list; they are only
                 * flagged as reported when the report call returned 0
                 */
                page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);

                /*
                 * Reset next to first entry, the old next isn't valid
                 * since we dropped the lock to report the pages
                 */
                next = list_first_entry(list, struct page, lru);

                /* exit on error */
                if (err)
                        break;
        }

        /*
         * Rotate any leftover pages to the head of the freelist. Nothing
         * is done when next is the list head itself or is already the
         * first entry.
         */
        if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
                list_rotate_to_front(&next->lru, list);

        spin_unlock_irq(&zone->lock);

        return err;
}
 258
 259 static int
 260 page_reporting_process_zone(struct page_reporting_dev_info *prdev,
 261                             struct scatterlist *sgl, struct zone *zone)
 262 {
 263         unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
 264         unsigned long watermark;
 265         int err = 0;
 266
 267         /* Generate minimum watermark to be able to guarantee progress */
 268         watermark = low_wmark_pages(zone) +
 269                     (PAGE_REPORTING_CAPACITY << page_reporting_order);
 270
 271         /*
 272          * Cancel request if insufficient free memory or if we failed
 273          * to allocate page reporting statistics for the zone.
 274          */
 275         if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
 276                 return err;
 277
 278         /* Process each free list starting from lowest order/mt */
 279         for (order = page_reporting_order; order < NR_PAGE_ORDERS; order++) {
 280                 for (mt = 0; mt < MIGRATE_TYPES; mt++) {
 281                         /* We do not pull pages from the isolate free list */
 282                         if (is_migrate_isolate(mt))
 283                                 continue;
 284
 285                         err = page_reporting_cycle(prdev, zone, order, mt,
 286                                                    sgl, &offset);
 287                         if (err)
 288                                 return err;
 289                 }
 290         }
 291
 292         /* report the leftover pages before going idle */
 293         leftover = PAGE_REPORTING_CAPACITY - offset;
 294         if (leftover) {
 295                 sgl = &sgl[offset];
 296                 err = prdev->report(prdev, sgl, leftover);
 297
 298                 /* flush any remaining pages out from the last report */
 299                 spin_lock_irq(&zone->lock);
 300                 page_reporting_drain(prdev, sgl, leftover, !err);
 301                 spin_unlock_irq(&zone->lock);
 302         }
 303
 304         return err;
 305 }
 306
 307 static void page_reporting_process(struct work_struct *work)
 308 {
 309         struct delayed_work *d_work = to_delayed_work(work);
 310         struct page_reporting_dev_info *prdev =
 311                 container_of(d_work, struct page_reporting_dev_info, work);
 312         int err = 0, state = PAGE_REPORTING_ACTIVE;
 313         struct scatterlist *sgl;
 314         struct zone *zone;
 315
 316         /*
 317          * Change the state to "Active" so that we can track if there is
 318          * anyone requests page reporting after we complete our pass. If
 319          * the state is not altered by the end of the pass we will switch
 320          * to idle and quit scheduling reporting runs.
 321          */
 322         atomic_set(&prdev->state, state);
 323
 324         /* allocate scatterlist to store pages being reported on */
 325         sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
 326         if (!sgl)
 327                 goto err_out;
 328
 329         sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
 330
 331         for_each_zone(zone) {
 332                 err = page_reporting_process_zone(prdev, sgl, zone);
 333                 if (err)
 334                         break;
 335         }
 336
 337         kfree(sgl);
 338 err_out:
 339         /*
 340          * If the state has reverted back to requested then there may be
 341          * additional pages to be processed. We will defer for 2s to allow
 342          * more pages to accumulate.
 343          */
 344         state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
 345         if (state == PAGE_REPORTING_REQUESTED)
 346                 schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
 347 }
 348
/* Held while a reporting device is being registered or unregistered. */
static DEFINE_MUTEX(page_reporting_mutex);
/*
 * Static key that starts out false. It is switched on by the first
 * successful page_reporting_register() and is not switched off again in
 * this file.
 */
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);
 351
 352 int page_reporting_register(struct page_reporting_dev_info *prdev)
 353 {
 354         int err = 0;
 355
 356         mutex_lock(&page_reporting_mutex);
 357
 358         /* nothing to do if already in use */
 359         if (rcu_dereference_protected(pr_dev_info,
 360                                 lockdep_is_held(&page_reporting_mutex))) {
 361                 err = -EBUSY;
 362                 goto err_out;
 363         }
 364
 365         /*
 366          * If the page_reporting_order value is not set, we check if
 367          * an order is provided from the driver that is performing the
 368          * registration. If that is not provided either, we default to
 369          * pageblock_order.
 370          */
 371
 372         if (page_reporting_order == -1) {
 373                 if (prdev->order > 0 && prdev->order <= MAX_PAGE_ORDER)
 374                         page_reporting_order = prdev->order;
 375                 else
 376                         page_reporting_order = pageblock_order;
 377         }
 378
 379         /* initialize state and work structures */
 380         atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
 381         INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
 382
 383         /* Begin initial flush of zones */
 384         __page_reporting_request(prdev);
 385
 386         /* Assign device to allow notifications */
 387         rcu_assign_pointer(pr_dev_info, prdev);
 388
 389         /* enable page reporting notification */
 390         if (!static_key_enabled(&page_reporting_enabled)) {
 391                 static_branch_enable(&page_reporting_enabled);
 392                 pr_info("Free page reporting enabled\n");
 393         }
 394 err_out:
 395         mutex_unlock(&page_reporting_mutex);
 396
 397         return err;
 398 }
 399 EXPORT_SYMBOL_GPL(page_reporting_register);
 400
 401 void page_reporting_unregister(struct page_reporting_dev_info *prdev)
 402 {
 403         mutex_lock(&page_reporting_mutex);
 404
 405         if (prdev == rcu_dereference_protected(pr_dev_info,
 406                                 lockdep_is_held(&page_reporting_mutex))) {
 407                 /* Disable page reporting notification */
 408                 RCU_INIT_POINTER(pr_dev_info, NULL);
 409                 synchronize_rcu();
 410
 411                 /* Flush any existing work, and lock it out */
 412                 cancel_delayed_work_sync(&prdev->work);
 413         }
 414
 415         mutex_unlock(&page_reporting_mutex);
 416 }
 417 EXPORT_SYMBOL_GPL(page_reporting_unregister);