#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

static unsigned long total_usage;

static void page_cgroup_lock_init(struct page_cgroup *pc, int nr_pages)
{
#ifdef CONFIG_PREEMPT_RT_BASE
	/*
	 * On PREEMPT_RT the per-entry pcg_lock is used instead of the bit
	 * spinlock and must be initialized explicitly for every entry.
	 */
	for (; nr_pages; nr_pages--, pc++)
		spin_lock_init(&pc->pcg_lock);
#endif
}

#if !defined(CONFIG_SPARSEMEM)

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	pgdat->node_page_cgroup = NULL;
}

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long offset;
	struct page_cgroup *base;

	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
#ifdef CONFIG_DEBUG_VM
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_cgroup arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (unlikely(!base))
		return NULL;
#endif
	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
	return base + offset;
}

static int __init alloc_node_page_cgroup(int nid)
{
	struct page_cgroup *base;
	unsigned long table_size;
	unsigned long nr_pages;

	nr_pages = NODE_DATA(nid)->node_spanned_pages;
	if (!nr_pages)
		return 0;

	table_size = sizeof(struct page_cgroup) * nr_pages;

	base = memblock_virt_alloc_try_nid_nopanic(
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
			BOOTMEM_ALLOC_ACCESSIBLE, nid);
	if (!base)
		return -ENOMEM;
	NODE_DATA(nid)->node_page_cgroup = base;
	total_usage += table_size;
	page_cgroup_lock_init(base, nr_pages);
	return 0;
}

void __init page_cgroup_init_flatmem(void)
{
	int nid, fail;

	if (mem_cgroup_disabled())
		return;

	for_each_online_node(nid) {
		fail = alloc_node_page_cgroup(nid);
		if (fail)
			goto fail;
	}
	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
			 " don't want memory cgroups\n");
	return;
fail:
	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
	panic("Out of memory");
}

#else /* CONFIG_FLAT_NODE_MEM_MAP */

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);
#ifdef CONFIG_DEBUG_VM
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_cgroup arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (!section->page_cgroup)
		return NULL;
#endif
	return section->page_cgroup + pfn;
}

static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
	void *addr = NULL;

	addr = alloc_pages_exact_nid(nid, size, flags);
	if (addr) {
		kmemleak_alloc(addr, size, 1, flags);
		return addr;
	}

	if (node_state(nid, N_HIGH_MEMORY))
		addr = vzalloc_node(size, nid);
	else
		addr = vzalloc(size);

	return addr;
}

static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
{
	struct mem_section *section;
	struct page_cgroup *base;
	unsigned long table_size;

	section = __pfn_to_section(pfn);

	if (section->page_cgroup)
		return 0;

	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
	base = alloc_page_cgroup(table_size, nid);

	/*
	 * The value stored in section->page_cgroup is (base - pfn)
	 * and it does not point to the memory block allocated above,
	 * causing kmemleak false positives.
	 */
	kmemleak_not_leak(base);

	if (!base) {
		printk(KERN_ERR "page cgroup allocation failure\n");
		return -ENOMEM;
	}

	page_cgroup_lock_init(base, PAGES_PER_SECTION);

	/*
	 * The passed "pfn" may not be aligned to SECTION. For the calculation
	 * we need to apply a mask.
	 */
	pfn &= PAGE_SECTION_MASK;
	section->page_cgroup = base - pfn;
	total_usage += table_size;
	return 0;
}

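/*
 * Illustrative sketch of the (base - pfn) bookkeeping above; the numbers
 * are assumed for the example, not taken from any particular config.
 * With PAGES_PER_SECTION == 0x8000 and a caller pfn of 0x2c123, the mask
 * yields pfn == 0x28000, so section->page_cgroup = base - 0x28000.
 * lookup_page_cgroup() later computes section->page_cgroup + page_pfn,
 * i.e. base + (page_pfn - 0x28000), the entry's offset within its section.
 * Because the stored pointer is offset away from the allocation itself,
 * kmemleak would otherwise report it as leaked, hence kmemleak_not_leak().
 */
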
#ifdef CONFIG_MEMORY_HOTPLUG
static void free_page_cgroup(void *addr)
{
	if (is_vmalloc_addr(addr)) {
		vfree(addr);
	} else {
		struct page *page = virt_to_page(addr);
		size_t table_size =
			sizeof(struct page_cgroup) * PAGES_PER_SECTION;

		BUG_ON(PageReserved(page));
		kmemleak_free(addr);
		free_pages_exact(addr, table_size);
	}
}

void __free_page_cgroup(unsigned long pfn)
{
	struct mem_section *ms;
	struct page_cgroup *base;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_cgroup)
		return;
	base = ms->page_cgroup + pfn;
	free_page_cgroup(base);
	ms->page_cgroup = NULL;
}

int __meminit online_page_cgroup(unsigned long start_pfn,
			unsigned long nr_pages,
			int nid)
{
	unsigned long start, end, pfn;
	int fail = 0;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	if (nid == -1) {
		/*
		 * In this case, "nid" already exists and contains valid memory.
		 * "start_pfn" passed to us is a pfn which is an arg for
		 * online_pages(), and start_pfn should exist.
		 */
		nid = pfn_to_nid(start_pfn);
		VM_BUG_ON(!node_state(nid, N_ONLINE));
	}

	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
		if (!pfn_present(pfn))
			continue;
		fail = init_section_page_cgroup(pfn, nid);
	}
	if (!fail)
		return 0;

	/* rollback */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);

	return -ENOMEM;
}

int __meminit offline_page_cgroup(unsigned long start_pfn,
		unsigned long nr_pages, int nid)
{
	unsigned long start, end, pfn;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);
	return 0;
}

static int __meminit page_cgroup_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = online_page_cgroup(mn->start_pfn,
				   mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_OFFLINE:
		offline_page_cgroup(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_CANCEL_ONLINE:
		offline_page_cgroup(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_GOING_OFFLINE:
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}

	return notifier_from_errno(ret);
}

#endif

void __init page_cgroup_init(void)
{
	unsigned long pfn;
	int nid;

	if (mem_cgroup_disabled())
		return;

	for_each_node_state(nid, N_MEMORY) {
		unsigned long start_pfn, end_pfn;

		start_pfn = node_start_pfn(nid);
		end_pfn = node_end_pfn(nid);
		/*
		 * start_pfn and end_pfn may not be aligned to SECTION and the
		 * page->flags of out-of-node pages are not initialized.  So we
		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
		 */
		for (pfn = start_pfn;
		     pfn < end_pfn;
		     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

			if (!pfn_valid(pfn))
				continue;
			/*
			 * Nodes' pfns can overlap.
			 * We know some arch can have a nodes layout such as
			 * -------------pfn-------------->
			 * N0 | N1 | N2 | N0 | N1 | N2|....
			 */
			if (pfn_to_nid(pfn) != nid)
				continue;
			if (init_section_page_cgroup(pfn, nid))
				goto oom;
		}
	}
	hotplug_memory_notifier(page_cgroup_callback, 0);
	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
			 "don't want memory cgroups\n");
	return;
oom:
	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
	panic("Out of memory");
}

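/*
 * Illustrative walk of the scan loop in page_cgroup_init() above, with
 * assumed values (not from any particular config): if PAGES_PER_SECTION
 * is 0x8000 and a node spans pfns 0x2c123..0x41fff, the loop visits
 * 0x2c123, then ALIGN(0x2c124, 0x8000) == 0x30000, then 0x38000, 0x40000,
 * i.e. one probe per section; sections whose probed pfn is invalid or
 * belongs to another node are skipped by the checks inside the loop.
 */
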
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	return;
}

#endif

#ifdef CONFIG_MEMCG_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
	struct page **map;
	unsigned long length;
	spinlock_t	lock;
};

static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
	unsigned short		id;
};
#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
 * against SwapCache. At swap_free(), this is accessed directly from swap.
 *
 * This means,
 *  - we have no race in "exchange" when we're accessed via SwapCache because
 *    SwapCache (and its swp_entry) is under lock.
 *  - When called via swap_free(), there is no user of this entry and no race.
 * Then, we don't need a lock around "exchange".
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */

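/*
 * Worked example of the map layout (sizes assumed for illustration only):
 * with PAGE_SIZE == 4096 and sizeof(struct swap_cgroup) == 2, SC_PER_PAGE
 * is 2048.  The id recorded for swap offset 5000 then lives in
 * ctrl->map[5000 / 2048] == ctrl->map[2], at index 5000 % 2048 == 904
 * within that page; this is exactly the arithmetic lookup_swap_cgroup()
 * performs below.
 */
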
/*
 * allocate buffer for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
	struct page *page;
	struct swap_cgroup_ctrl *ctrl;
	unsigned long idx, max;

	ctrl = &swap_cgroup_ctrl[type];

	for (idx = 0; idx < ctrl->length; idx++) {
		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			goto not_enough_page;
		ctrl->map[idx] = page;
	}
	return 0;
not_enough_page:
	max = idx;
	for (idx = 0; idx < max; idx++)
		__free_page(ctrl->map[idx]);

	return -ENOMEM;
}

static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
					struct swap_cgroup_ctrl **ctrlp)
{
	pgoff_t offset = swp_offset(ent);
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;

	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
	if (ctrlp)
		*ctrlp = ctrl;

	mappage = ctrl->map[offset / SC_PER_PAGE];
	sc = page_address(mappage);
	return sc + offset % SC_PER_PAGE;
}

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns old id at success, 0 at failure.
 * (There is no mem_cgroup using 0 as its id)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
					unsigned short old, unsigned short new)
{
	struct swap_cgroup_ctrl *ctrl;
	struct swap_cgroup *sc;
	unsigned long flags;
	unsigned short retval;

	sc = lookup_swap_cgroup(ent, &ctrl);

	spin_lock_irqsave(&ctrl->lock, flags);
	retval = sc->id;
	if (retval == old)
		sc->id = new;
	else
		retval = 0;
	spin_unlock_irqrestore(&ctrl->lock, flags);
	return retval;
}

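/*
 * Usage sketch for swap_cgroup_cmpxchg(), with hypothetical ids: if the
 * entry currently records id 7, swap_cgroup_cmpxchg(ent, 7, 9) stores 9
 * and returns 7; if the entry records anything else, nothing is stored
 * and 0 is returned, so the caller knows it lost the race.
 */
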
/**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup to be recorded
 *
 * Returns old value at success, 0 at failure.
 * (Of course, old value can be 0.)
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
	struct swap_cgroup_ctrl *ctrl;
	struct swap_cgroup *sc;
	unsigned short old;
	unsigned long flags;

	sc = lookup_swap_cgroup(ent, &ctrl);

	spin_lock_irqsave(&ctrl->lock, flags);
	old = sc->id;
	sc->id = id;
	spin_unlock_irqrestore(&ctrl->lock, flags);

	return old;
}

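/*
 * Usage sketch (hypothetical call site): at swap-out the owning memcg's id
 * is stored with swap_cgroup_record(ent, id), and the returned previous
 * value tells the caller what, if anything, was recorded before; recording
 * id 0 clears the entry, since 0 is never a valid mem_cgroup id.
 */
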
/**
 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
 */
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
	return lookup_swap_cgroup(ent, NULL)->id;
}

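/*
 * For example, a swap-in or uncharge path can read the recorded id with
 * lookup_swap_cgroup_id(ent) and treat a return of 0 as "nothing
 * recorded", since 0 is never a valid mem_cgroup id.
 */
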
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
	void *array;
	unsigned long array_size;
	unsigned long length;
	struct swap_cgroup_ctrl *ctrl;

	if (!do_swap_account)
		return 0;

	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
	array_size = length * sizeof(void *);

	array = vzalloc(array_size);
	if (!array)
		goto nomem;

	ctrl = &swap_cgroup_ctrl[type];
	mutex_lock(&swap_cgroup_mutex);
	ctrl->length = length;
	ctrl->map = array;
	spin_lock_init(&ctrl->lock);
	if (swap_cgroup_prepare(type)) {
		/* memory shortage */
		ctrl->map = NULL;
		ctrl->length = 0;
		mutex_unlock(&swap_cgroup_mutex);
		vfree(array);
		goto nomem;
	}
	mutex_unlock(&swap_cgroup_mutex);

	return 0;
nomem:
	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
	printk(KERN_INFO
		"swap_cgroup can be disabled by swapaccount=0 boot option\n");
	return -ENOMEM;
}

void swap_cgroup_swapoff(int type)
{
	struct page **map;
	unsigned long i, length;
	struct swap_cgroup_ctrl *ctrl;

	if (!do_swap_account)
		return;

	mutex_lock(&swap_cgroup_mutex);
	ctrl = &swap_cgroup_ctrl[type];
	map = ctrl->map;
	length = ctrl->length;
	ctrl->map = NULL;
	ctrl->length = 0;
	mutex_unlock(&swap_cgroup_mutex);

	if (map) {
		for (i = 0; i < length; i++) {
			struct page *page = map[i];