/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
struct cgroup_subsys mem_cgroup_subsys;
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 * TODO: Consider making these lists per zone
	 */
	struct list_head active_list;
	struct list_head inactive_list;
};
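
/*
 * Illustrative sketch only (not from the original file): the watermark TODO
 * above is unimplemented here. A high-watermark trigger would roughly be a
 * comparison of this shape; both parameters and the helper name are
 * hypothetical, and nothing in this file computes or stores them yet.
 */
static inline int example_over_high_watermark(unsigned long usage,
						unsigned long high_wmark)
{
	/* reclaim for the cgroup would be expected to start past the mark */
	return usage > high_wmark;
}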
/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock. We need to ensure that page->page_cgroup is at least two
 * byte aligned (based on comments from Nick Piggin).
 */
#define PAGE_CGROUP_LOCK_BIT	0x0
#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
/*
 * A page_cgroup is associated with every page descriptor. The
 * page_cgroup helps us identify information about the cgroup.
 */
struct page_cgroup {
	struct list_head lru;		/* per cgroup LRU list */
	struct page *page;
	struct mem_cgroup *mem_cgroup;
	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
					/* mapped and cached states     */
};
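
/*
 * Illustrative sketch (not part of the original file): the lock-bit trick
 * described above relies on page_cgroup pointers having their low bit clear,
 * i.e. on struct page_cgroup being at least two-byte aligned. A compile-time
 * check of that assumption could look like this; the helper name is made up
 * and it assumes BUILD_BUG_ON() from <linux/kernel.h> is available here.
 */
static inline void page_cgroup_alignment_check(void)
{
	/* if the low bit were part of the pointer, the lock bit would clobber it */
	BUILD_BUG_ON(__alignof__(struct page_cgroup) < 2);
}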
static inline
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

static inline
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}
void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
{
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_task(p);
	css_get(&mem->css);
	mm->mem_cgroup = mem;
}

void mm_free_cgroup(struct mm_struct *mm)
{
	css_put(&mm->mem_cgroup->css);
}
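
/*
 * Illustrative sketch (hypothetical, not part of this file): the intended
 * pairing of the two hooks above over an mm_struct's lifetime. In the tree
 * they are called from the fork and mm teardown paths; this helper exists
 * only to show the pairing and is not called from anywhere.
 */
static inline void example_mm_cgroup_lifetime(struct mm_struct *mm,
						struct task_struct *p)
{
	mm_init_cgroup(mm, p);		/* pin p's memory cgroup for this mm */
	/* ... the mm is used for the lifetime of the process ... */
	mm_free_cgroup(mm);		/* drop the reference when the mm is freed */
}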
static inline int page_cgroup_locked(struct page *page)
{
	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
					&page->page_cgroup);
}
void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
	int locked;

	/*
	 * While resetting the page_cgroup we might not hold the
	 * page_cgroup lock. free_hot_cold_page() is an example
	 * of such a scenario.
	 */
	if (pc)
		VM_BUG_ON(!page_cgroup_locked(page));
	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
	page->page_cgroup = ((unsigned long)pc | locked);
}
struct page_cgroup *page_get_page_cgroup(struct page *page)
{
	return (struct page_cgroup *)
		(page->page_cgroup & ~PAGE_CGROUP_LOCK);
}
void __always_inline lock_page_cgroup(struct page *page)
{
	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
	VM_BUG_ON(!page_cgroup_locked(page));
}
void __always_inline unlock_page_cgroup(struct page *page)
{
	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 * (An illustrative caller sketch follows mem_cgroup_uncharge() below.)
 */
int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc, *race_pc;

	/*
	 * Should page_cgroups go to their own slab?
	 * One could optimize the performance of the charging routine
	 * by saving a bit in the page_flags and using it as a lock
	 * to see if the cgroup page already has a page_cgroup associated
	 * with it.
	 */
	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	/*
	 * The page_cgroup exists and the page has already been accounted.
	 */
	if (pc) {
		atomic_inc(&pc->ref_cnt);
		goto done;
	}

	unlock_page_cgroup(page);

	pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
	if (pc == NULL)
		goto err;

	rcu_read_lock();
	/*
	 * We always charge the cgroup the mm_struct belongs to;
	 * the mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	if (!mm)
		mm = &init_mm;

	mem = rcu_dereference(mm->mem_cgroup);
	/*
	 * For every charge from the cgroup, increment reference
	 * count.
	 */
	css_get(&mem->css);
	rcu_read_unlock();

	/*
	 * If we created the page_cgroup, we should free it on exceeding
	 * the cgroup limit.
	 */
	if (res_counter_charge(&mem->res, 1)) {
		css_put(&mem->css);
		goto free_pc;
	}

	lock_page_cgroup(page);
	/*
	 * Check if somebody else beat us to allocating the page_cgroup.
	 */
	race_pc = page_get_page_cgroup(page);
	if (race_pc) {
		kfree(pc);
		pc = race_pc;
		atomic_inc(&pc->ref_cnt);
		res_counter_uncharge(&mem->res, 1);
		css_put(&mem->css);
		goto done;
	}

	atomic_set(&pc->ref_cnt, 1);
	pc->mem_cgroup = mem;
	pc->page = page;
	page_assign_page_cgroup(page, pc);

done:
	unlock_page_cgroup(page);
	return 0;
free_pc:
	kfree(pc);
err:
	/* the page_cgroup lock is not held on these failure paths */
	return -ENOMEM;
}
/*
 * Uncharging is always a welcome operation, we never complain, simply
 * uncharge.
 */
void mem_cgroup_uncharge(struct page_cgroup *pc)
{
	struct mem_cgroup *mem;
	struct page *page;

	if (!pc)
		return;

	if (atomic_dec_and_test(&pc->ref_cnt)) {
		page = pc->page;
		lock_page_cgroup(page);
		mem = pc->mem_cgroup;
		css_put(&mem->css);
		page_assign_page_cgroup(page, NULL);
		unlock_page_cgroup(page);
		res_counter_uncharge(&mem->res, 1);
		kfree(pc);
	}
}
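
/*
 * Illustrative sketch (hypothetical, not part of this file): how a caller
 * such as the page-cache or rmap code is expected to pair the charge and
 * uncharge paths above. The helper name and the elided middle are made up.
 */
static inline int example_charge_then_release(struct page *page,
						struct mm_struct *mm)
{
	int ret;

	ret = mem_cgroup_charge(page, mm);	/* may sleep; fails over the limit */
	if (ret)
		return ret;

	/* ... insert the page into the page cache or map it ... */

	/* on removal, drop the reference taken by the charge above */
	mem_cgroup_uncharge(page_get_page_cgroup(page));
	return 0;
}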
static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
			struct file *file, char __user *userbuf, size_t nbytes,
			loff_t *ppos)
{
	return res_counter_read(&mem_cgroup_from_cont(cont)->res,
				cft->private, userbuf, nbytes, ppos);
}
static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
				struct file *file, const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
				cft->private, userbuf, nbytes, ppos);
}
static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage",
		.private = RES_USAGE,
		.read = mem_cgroup_read,
	},
	{
		.name = "limit",
		.private = RES_LIMIT,
		.write = mem_cgroup_write,
		.read = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.read = mem_cgroup_read,
	},
};
static struct mem_cgroup init_mem_cgroup;
static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem;

	if (unlikely((cont->parent) == NULL)) {
		mem = &init_mem_cgroup;
		init_mm.mem_cgroup = mem;
	} else
		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);

	if (mem == NULL)
		return NULL;

	res_counter_init(&mem->res);
	INIT_LIST_HEAD(&mem->active_list);
	INIT_LIST_HEAD(&mem->inactive_list);
	return &mem->css;
}
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	kfree(mem_cgroup_from_cont(cont));
}
static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, mem_cgroup_files,
				ARRAY_SIZE(mem_cgroup_files));
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	if (mem == old_mem)
		goto out;

	/*
	 * Only thread group leaders are allowed to migrate, the mm_struct is
	 * in effect owned by the leader.
	 */
	if (p->tgid != p->pid)
		goto out;

	/* switch the mm's accounting target and move the css reference */
	css_get(&mem->css);
	rcu_assign_pointer(mm->mem_cgroup, mem);
	css_put(&old_mem->css);

out:
	mmput(mm);
}
struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 1,
};