mm/page_cgroup.c

   1 #include <linux/mm.h>
   2 #include <linux/mmzone.h>
   3 #include <linux/bootmem.h>
   4 #include <linux/bit_spinlock.h>
   5 #include <linux/page_cgroup.h>
   6 #include <linux/hash.h>
   7 #include <linux/slab.h>
   8 #include <linux/memory.h>
   9 #include <linux/vmalloc.h>
  10 #include <linux/cgroup.h>
  11 #include <linux/swapops.h>
  12 #include <linux/kmemleak.h>
  13
  14 static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
  15 {
  16         pc->flags = 0;
  17         set_page_cgroup_array_id(pc, id);
  18         pc->mem_cgroup = NULL;
  19         INIT_LIST_HEAD(&pc->lru);
  20 }
  21 static unsigned long total_usage;
  22
  23 #if !defined(CONFIG_SPARSEMEM)
  24
  25
  26 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
  27 {
  28         pgdat->node_page_cgroup = NULL;
  29 }
  30
  31 struct page_cgroup *lookup_page_cgroup(struct page *page)
  32 {
  33         unsigned long pfn = page_to_pfn(page);
  34         unsigned long offset;
  35         struct page_cgroup *base;
  36
  37         base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
  38         if (unlikely(!base))
  39                 return NULL;
  40
  41         offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
  42         return base + offset;
  43 }
  44
  45 struct page *lookup_cgroup_page(struct page_cgroup *pc)
  46 {
  47         unsigned long pfn;
  48         struct page *page;
  49         pg_data_t *pgdat;
  50
  51         pgdat = NODE_DATA(page_cgroup_array_id(pc));
  52         pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
  53         page = pfn_to_page(pfn);
  54         VM_BUG_ON(pc != lookup_page_cgroup(page));
  55         return page;
  56 }
  57
  58 static int __init alloc_node_page_cgroup(int nid)
  59 {
  60         struct page_cgroup *base, *pc;
  61         unsigned long table_size;
  62         unsigned long start_pfn, nr_pages, index;
  63
  64         start_pfn = NODE_DATA(nid)->node_start_pfn;
  65         nr_pages = NODE_DATA(nid)->node_spanned_pages;
  66
  67         if (!nr_pages)
  68                 return 0;
  69
  70         table_size = sizeof(struct page_cgroup) * nr_pages;
  71
  72         base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
  73                         table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
  74         if (!base)
  75                 return -ENOMEM;
  76         for (index = 0; index < nr_pages; index++) {
  77                 pc = base + index;
  78                 init_page_cgroup(pc, nid);
  79         }
  80         NODE_DATA(nid)->node_page_cgroup = base;
  81         total_usage += table_size;
  82         return 0;
  83 }
  84
  85 void __init page_cgroup_init_flatmem(void)
  86 {
  87
  88         int nid, fail;
  89
  90         if (mem_cgroup_disabled())
  91                 return;
  92
  93         for_each_online_node(nid)  {
  94                 fail = alloc_node_page_cgroup(nid);
  95                 if (fail)
  96                         goto fail;
  97         }
  98         printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
  99         printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
 100         " don't want memory cgroups\n");
 101         return;
 102 fail:
 103         printk(KERN_CRIT "allocation of page_cgroup failed.\n");
 104         printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
 105         panic("Out of memory");
 106 }
 107
#else /* CONFIG_SPARSEMEM */
 109
 110 struct page_cgroup *lookup_page_cgroup(struct page *page)
 111 {
 112         unsigned long pfn = page_to_pfn(page);
 113         struct mem_section *section = __pfn_to_section(pfn);
 114
 115         if (!section->page_cgroup)
 116                 return NULL;
 117         return section->page_cgroup + pfn;
 118 }
 119
 120 struct page *lookup_cgroup_page(struct page_cgroup *pc)
 121 {
 122         struct mem_section *section;
 123         struct page *page;
 124         unsigned long nr;
 125
 126         nr = page_cgroup_array_id(pc);
 127         section = __nr_to_section(nr);
 128         page = pfn_to_page(pc - section->page_cgroup);
 129         VM_BUG_ON(pc != lookup_page_cgroup(page));
 130         return page;
 131 }
 132
 133 static void *__init_refok alloc_page_cgroup(size_t size, int nid)
 134 {
 135         void *addr = NULL;
 136
 137         addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN);
 138         if (addr)
 139                 return addr;
 140
 141         if (node_state(nid, N_HIGH_MEMORY))
 142                 addr = vmalloc_node(size, nid);
 143         else
 144                 addr = vmalloc(size);
 145
 146         return addr;
 147 }
 148
 149 #ifdef CONFIG_MEMORY_HOTPLUG
 150 static void free_page_cgroup(void *addr)
 151 {
 152         if (is_vmalloc_addr(addr)) {
 153                 vfree(addr);
 154         } else {
 155                 struct page *page = virt_to_page(addr);
 156                 size_t table_size =
 157                         sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 158
 159                 BUG_ON(PageReserved(page));
 160                 free_pages_exact(addr, table_size);
 161         }
 162 }
 163 #endif
 164
 165 static int __init_refok init_section_page_cgroup(unsigned long pfn)
 166 {
 167         struct page_cgroup *base, *pc;
 168         struct mem_section *section;
 169         unsigned long table_size;
 170         unsigned long nr;
 171         int nid, index;
 172
 173         nr = pfn_to_section_nr(pfn);
 174         section = __nr_to_section(nr);
 175
 176         if (section->page_cgroup)
 177                 return 0;
 178
 179         nid = page_to_nid(pfn_to_page(pfn));
 180         table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 181         base = alloc_page_cgroup(table_size, nid);
 182
 183         /*
 184          * The value stored in section->page_cgroup is (base - pfn)
 185          * and it does not point to the memory block allocated above,
 186          * causing kmemleak false positives.
 187          */
 188         kmemleak_not_leak(base);
 189
 190         if (!base) {
 191                 printk(KERN_ERR "page cgroup allocation failure\n");
 192                 return -ENOMEM;
 193         }
 194
 195         for (index = 0; index < PAGES_PER_SECTION; index++) {
 196                 pc = base + index;
 197                 init_page_cgroup(pc, nr);
 198         }
 199
 200         section->page_cgroup = base - pfn;
 201         total_usage += table_size;
 202         return 0;
 203 }
 204 #ifdef CONFIG_MEMORY_HOTPLUG
 205 void __free_page_cgroup(unsigned long pfn)
 206 {
 207         struct mem_section *ms;
 208         struct page_cgroup *base;
 209
 210         ms = __pfn_to_section(pfn);
 211         if (!ms || !ms->page_cgroup)
 212                 return;
 213         base = ms->page_cgroup + pfn;
 214         free_page_cgroup(base);
 215         ms->page_cgroup = NULL;
 216 }
 217
 218 int __meminit online_page_cgroup(unsigned long start_pfn,
 219                         unsigned long nr_pages,
 220                         int nid)
 221 {
 222         unsigned long start, end, pfn;
 223         int fail = 0;
 224
 225         start = start_pfn & ~(PAGES_PER_SECTION - 1);
 226         end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 227
 228         for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
 229                 if (!pfn_present(pfn))
 230                         continue;
 231                 fail = init_section_page_cgroup(pfn);
 232         }
 233         if (!fail)
 234                 return 0;
 235
 236         /* rollback */
 237         for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 238                 __free_page_cgroup(pfn);
 239
 240         return -ENOMEM;
 241 }
 242
 243 int __meminit offline_page_cgroup(unsigned long start_pfn,
 244                 unsigned long nr_pages, int nid)
 245 {
 246         unsigned long start, end, pfn;
 247
 248         start = start_pfn & ~(PAGES_PER_SECTION - 1);
 249         end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 250
 251         for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 252                 __free_page_cgroup(pfn);
 253         return 0;
 254
 255 }
 256
 257 static int __meminit page_cgroup_callback(struct notifier_block *self,
 258                                unsigned long action, void *arg)
 259 {
 260         struct memory_notify *mn = arg;
 261         int ret = 0;
 262         switch (action) {
 263         case MEM_GOING_ONLINE:
 264                 ret = online_page_cgroup(mn->start_pfn,
 265                                    mn->nr_pages, mn->status_change_nid);
 266                 break;
 267         case MEM_OFFLINE:
 268                 offline_page_cgroup(mn->start_pfn,
 269                                 mn->nr_pages, mn->status_change_nid);
 270                 break;
 271         case MEM_CANCEL_ONLINE:
 272         case MEM_GOING_OFFLINE:
 273                 break;
 274         case MEM_ONLINE:
 275         case MEM_CANCEL_OFFLINE:
 276                 break;
 277         }
 278
 279         return notifier_from_errno(ret);
 280 }
 281
 282 #endif
 283
 284 void __init page_cgroup_init(void)
 285 {
 286         unsigned long pfn;
 287         int fail = 0;
 288
 289         if (mem_cgroup_disabled())
 290                 return;
 291
 292         for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
 293                 if (!pfn_present(pfn))
 294                         continue;
 295                 fail = init_section_page_cgroup(pfn);
 296         }
 297         if (fail) {
 298                 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
 299                 panic("Out of memory");
 300         } else {
 301                 hotplug_memory_notifier(page_cgroup_callback, 0);
 302         }
 303         printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
 304         printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
 305         " want memory cgroups\n");
 306 }
 307
 308 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 309 {
 310         return;
 311 }
 312
 313 #endif
 314
 315
 316 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 317
 318 static DEFINE_MUTEX(swap_cgroup_mutex);
 319 struct swap_cgroup_ctrl {
 320         struct page **map;
 321         unsigned long length;
 322         spinlock_t      lock;
 323 };
 324
 325 struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
 326
 327 struct swap_cgroup {
 328         unsigned short          id;
 329 };
 330 #define SC_PER_PAGE     (PAGE_SIZE/sizeof(struct swap_cgroup))
 331 #define SC_POS_MASK     (SC_PER_PAGE - 1)
 332
 333 /*
 334  * SwapCgroup implements "lookup" and "exchange" operations.
 335  * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
 336  * against SwapCache. At swap_free(), this is accessed directly from swap.
 337  *
 338  * This means,
 339  *  - we have no race in "exchange" when we're accessed via SwapCache because
 340  *    SwapCache(and its swp_entry) is under lock.
 341  *  - When called via swap_free(), there is no user of this entry and no race.
 342  * Then, we don't need lock around "exchange".
 343  *
 344  * TODO: we can push these buffers out to HIGHMEM.
 345  */
 346
 347 /*
 348  * allocate buffer for swap_cgroup.
 349  */
 350 static int swap_cgroup_prepare(int type)
 351 {
 352         struct page *page;
 353         struct swap_cgroup_ctrl *ctrl;
 354         unsigned long idx, max;
 355
 356         ctrl = &swap_cgroup_ctrl[type];
 357
 358         for (idx = 0; idx < ctrl->length; idx++) {
 359                 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 360                 if (!page)
 361                         goto not_enough_page;
 362                 ctrl->map[idx] = page;
 363         }
 364         return 0;
 365 not_enough_page:
 366         max = idx;
 367         for (idx = 0; idx < max; idx++)
 368                 __free_page(ctrl->map[idx]);
 369
 370         return -ENOMEM;
 371 }
 372
 373 /**
 374  * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 375  * @end: swap entry to be cmpxchged
 376  * @old: old id
 377  * @new: new id
 378  *
 379  * Returns old id at success, 0 at failure.
 380  * (There is no mem_cgroup using 0 as its id)
 381  */
 382 unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
 383                                         unsigned short old, unsigned short new)
 384 {
 385         int type = swp_type(ent);
 386         unsigned long offset = swp_offset(ent);
 387         unsigned long idx = offset / SC_PER_PAGE;
 388         unsigned long pos = offset & SC_POS_MASK;
 389         struct swap_cgroup_ctrl *ctrl;
 390         struct page *mappage;
 391         struct swap_cgroup *sc;
 392         unsigned long flags;
 393         unsigned short retval;
 394
 395         ctrl = &swap_cgroup_ctrl[type];
 396
 397         mappage = ctrl->map[idx];
 398         sc = page_address(mappage);
 399         sc += pos;
 400         spin_lock_irqsave(&ctrl->lock, flags);
 401         retval = sc->id;
 402         if (retval == old)
 403                 sc->id = new;
 404         else
 405                 retval = 0;
 406         spin_unlock_irqrestore(&ctrl->lock, flags);
 407         return retval;
 408 }
 409
 410 /**
 411  * swap_cgroup_record - record mem_cgroup for this swp_entry.
 412  * @ent: swap entry to be recorded into
 413  * @mem: mem_cgroup to be recorded
 414  *
 415  * Returns old value at success, 0 at failure.
 416  * (Of course, old value can be 0.)
 417  */
 418 unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
 419 {
 420         int type = swp_type(ent);
 421         unsigned long offset = swp_offset(ent);
 422         unsigned long idx = offset / SC_PER_PAGE;
 423         unsigned long pos = offset & SC_POS_MASK;
 424         struct swap_cgroup_ctrl *ctrl;
 425         struct page *mappage;
 426         struct swap_cgroup *sc;
 427         unsigned short old;
 428         unsigned long flags;
 429
 430         ctrl = &swap_cgroup_ctrl[type];
 431
 432         mappage = ctrl->map[idx];
 433         sc = page_address(mappage);
 434         sc += pos;
 435         spin_lock_irqsave(&ctrl->lock, flags);
 436         old = sc->id;
 437         sc->id = id;
 438         spin_unlock_irqrestore(&ctrl->lock, flags);
 439
 440         return old;
 441 }
 442
 443 /**
 444  * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
 445  * @ent: swap entry to be looked up.
 446  *
 447  * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
 448  */
 449 unsigned short lookup_swap_cgroup(swp_entry_t ent)
 450 {
 451         int type = swp_type(ent);
 452         unsigned long offset = swp_offset(ent);
 453         unsigned long idx = offset / SC_PER_PAGE;
 454         unsigned long pos = offset & SC_POS_MASK;
 455         struct swap_cgroup_ctrl *ctrl;
 456         struct page *mappage;
 457         struct swap_cgroup *sc;
 458         unsigned short ret;
 459
 460         ctrl = &swap_cgroup_ctrl[type];
 461         mappage = ctrl->map[idx];
 462         sc = page_address(mappage);
 463         sc += pos;
 464         ret = sc->id;
 465         return ret;
 466 }
 467
 468 int swap_cgroup_swapon(int type, unsigned long max_pages)
 469 {
 470         void *array;
 471         unsigned long array_size;
 472         unsigned long length;
 473         struct swap_cgroup_ctrl *ctrl;
 474
 475         if (!do_swap_account)
 476                 return 0;
 477
 478         length = ((max_pages/SC_PER_PAGE) + 1);
 479         array_size = length * sizeof(void *);
 480
 481         array = vmalloc(array_size);
 482         if (!array)
 483                 goto nomem;
 484
 485         memset(array, 0, array_size);
 486         ctrl = &swap_cgroup_ctrl[type];
 487         mutex_lock(&swap_cgroup_mutex);
 488         ctrl->length = length;
 489         ctrl->map = array;
 490         spin_lock_init(&ctrl->lock);
 491         if (swap_cgroup_prepare(type)) {
 492                 /* memory shortage */
 493                 ctrl->map = NULL;
 494                 ctrl->length = 0;
 495                 vfree(array);
 496                 mutex_unlock(&swap_cgroup_mutex);
 497                 goto nomem;
 498         }
 499         mutex_unlock(&swap_cgroup_mutex);
 500
 501         return 0;
 502 nomem:
 503         printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
 504         printk(KERN_INFO
 505                 "swap_cgroup can be disabled by noswapaccount boot option\n");
 506         return -ENOMEM;
 507 }
 508
 509 void swap_cgroup_swapoff(int type)
 510 {
 511         int i;
 512         struct swap_cgroup_ctrl *ctrl;
 513
 514         if (!do_swap_account)
 515                 return;
 516
 517         mutex_lock(&swap_cgroup_mutex);
 518         ctrl = &swap_cgroup_ctrl[type];
 519         if (ctrl->map) {
 520                 for (i = 0; i < ctrl->length; i++) {
 521                         struct page *page = ctrl->map[i];
 522                         if (page)
 523                                 __free_page(page);
 524                 }
 525                 vfree(ctrl->map);
 526                 ctrl->map = NULL;
 527                 ctrl->length = 0;
 528         }
 529         mutex_unlock(&swap_cgroup_mutex);
 530 }
 531
 532 #endif