/*
 * High memory handling common code and variables.
 *
 * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
 *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
 *
 * Redesigned the x86 32-bit VM architecture to deal with
 * 64-bit physical space. With current x86 CPUs this
 * means up to 64 Gigabytes physical RAM.
 *
 * Rewrote high memory support to move the page cache into
 * high memory. Implemented permanent (schedulable) kmaps
 * based on Linus' idea.
 *
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 *
 * Largely rewritten to get rid of all global locks
 *
 * Copyright (C) 2006 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/blktrace_api.h>
#include <linux/hardirq.h>

#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#ifdef CONFIG_HIGHMEM

static int __set_page_address(struct page *page, void *virtual, int pos);

unsigned long totalhigh_pages __read_mostly;
EXPORT_SYMBOL(totalhigh_pages);
unsigned int nr_free_highpages (void)
{
	pg_data_t *pgdat;
	unsigned int pages = 0;

	for_each_online_pgdat(pgdat) {
		pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
			NR_FREE_PAGES);
		if (zone_movable_is_highmem())
			pages += zone_page_state(
					&pgdat->node_zones[ZONE_MOVABLE],
					NR_FREE_PAGES);
	}

	return pages;
}
/*
 * count is not a pure "count".
 *	0 means it's owned exclusively by someone
 *	1 means it's free for use - either mapped or not.
 *	n means that there are (n-1) current users of it.
 */
static atomic_t pkmap_count[LAST_PKMAP];
static atomic_t pkmap_hand;
static atomic_t pkmap_free;
static atomic_t pkmap_users;

pte_t * pkmap_page_table;

static DECLARE_WAIT_QUEUE_HEAD(pkmap_wait);
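#if 0
/*
 * Illustrative sketch, kept under #if 0 and never built: the life cycle
 * of a single pkmap_count entry under the convention documented above.
 * The slot index and the call sequence are assumptions made purely for
 * illustration; they do not correspond to any real caller.
 */
static void pkmap_count_lifecycle_sketch(int slot)
{
	atomic_set(&pkmap_count[slot], 1);	/* free, possibly still mapped */
	atomic_set(&pkmap_count[slot], 0);	/* claimed exclusively */
	atomic_set(&pkmap_count[slot], 2);	/* mapped, one user (n - 1 == 1) */
	atomic_inc(&pkmap_count[slot]);		/* a second user kmaps it */
	atomic_dec(&pkmap_count[slot]);		/* one kunmap, one user left */
	atomic_dec(&pkmap_count[slot]);		/* last kunmap, free again (1) */
}
#endif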
/*
 * Try to free a given kmap slot.
 *
 * Returns:
 *  -1 - in use
 *   0 - free, no TLB flush needed
 *   1 - free, needs TLB flush
 */
static int pkmap_try_free(int pos)
{
	if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1)
		return -1;

	atomic_dec(&pkmap_free);
	/*
	 * TODO: add a young bit to make it CLOCK
	 */
	if (!pte_none(pkmap_page_table[pos])) {
		struct page *page = pte_page(pkmap_page_table[pos]);
		unsigned long addr = PKMAP_ADDR(pos);
		pte_t *ptep = &pkmap_page_table[pos];

		VM_BUG_ON(addr != (unsigned long)page_address(page));

		if (!__set_page_address(page, NULL, pos))
			BUG();
		flush_kernel_dcache_page(page);
		pte_clear(&init_mm, addr, ptep);

		return 1;
	}

	return 0;
}
static inline void pkmap_put(atomic_t *counter)
{
	switch (atomic_dec_return(counter)) {
	case 0:
		BUG();

	case 1:
		atomic_inc(&pkmap_free);
		wake_up(&pkmap_wait);
	}
}
#define TLB_BATCH	32

static int pkmap_get_free(void)
{
	int i, pos, flush;

restart:
	for (i = 0; i < LAST_PKMAP; i++) {
		pos = atomic_inc_return(&pkmap_hand) & LAST_PKMAP_MASK;
		flush = pkmap_try_free(pos);
		if (flush >= 0)
			goto got_one;
	}

	/*
	 * wait for somebody else to unmap their entries
	 */
	if (likely(!in_interrupt()))
		wait_event(pkmap_wait, atomic_read(&pkmap_free) != 0);

	goto restart;

got_one:
	if (flush) {
#if 0
		flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1));
#else
		int pos2 = (pos + 1) & LAST_PKMAP_MASK;
		int nr;
		int entries[TLB_BATCH];

		/*
		 * For those architectures that cannot help but flush the
		 * whole TLB, flush some more entries to make it worthwhile.
		 * Scan ahead of the hand to minimise search distances.
		 */
		for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH;
				i++, pos2 = (pos2 + 1) & LAST_PKMAP_MASK) {

			flush = pkmap_try_free(pos2);
			if (flush < 0)
				continue;

			if (!flush) {
				atomic_t *counter = &pkmap_count[pos2];
				VM_BUG_ON(atomic_read(counter) != 0);
				atomic_set(counter, 2);
				pkmap_put(counter);
			} else
				entries[nr++] = pos2;
		}
		flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));

		for (i = 0; i < nr; i++) {
			atomic_t *counter = &pkmap_count[entries[i]];
			VM_BUG_ON(atomic_read(counter) != 0);
			atomic_set(counter, 2);
			pkmap_put(counter);
		}
#endif
	}
	return pos;
}
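#if 0
/*
 * Worked example, never built: how the clock hand wraps.  Assuming
 * LAST_PKMAP is 1024 (so LAST_PKMAP_MASK is 1023), the expression
 * atomic_inc_return(&pkmap_hand) & LAST_PKMAP_MASK visits slots
 * 0, 1, ..., 1023 and then wraps back to 0, so every slot is
 * eventually revisited.
 */
static int pkmap_hand_wrap_example(void)
{
	int pos = 1023 & 1023;		/* last slot in the window */
	int next = (pos + 1) & 1023;	/* wraps back to slot 0 */

	return next;
}
#endif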
static unsigned long pkmap_insert(struct page *page)
{
	int pos = pkmap_get_free();
	unsigned long vaddr = PKMAP_ADDR(pos);
	pte_t *ptep = &pkmap_page_table[pos];
	pte_t entry = mk_pte(page, kmap_prot);
	atomic_t *counter = &pkmap_count[pos];

	VM_BUG_ON(atomic_read(counter) != 0);

	set_pte_at(&init_mm, vaddr, ptep, entry);
	if (unlikely(!__set_page_address(page, (void *)vaddr, pos))) {
		/*
		 * concurrent pkmap_inserts for this page -
		 * the other won the race, release this entry.
		 *
		 * we can still clear the pte without a tlb flush since
		 * it couldn't have been used yet.
		 */
		pte_clear(&init_mm, vaddr, ptep);
		VM_BUG_ON(atomic_read(counter) != 0);
		atomic_set(counter, 2);
		pkmap_put(counter);
		vaddr = 0;
	} else
		atomic_set(counter, 2);

	return vaddr;
}
/*
 * Flush all unused kmap mappings in order to remove stray mappings.
 */
void kmap_flush_unused(void)
{
	WARN_ON_ONCE(1);
}
/*
 * Avoid starvation deadlock by limiting the number of tasks that can obtain a
 * kmap to (LAST_PKMAP - KM_TYPE_NR*NR_CPUS)/2.
 */
static void kmap_account(void)
{
	int weight;

#ifndef CONFIG_PREEMPT_RT
	if (in_interrupt()) {
		/* irqs can always get them */
		weight = -1;
	} else
#endif
	if (current->flags & PF_KMAP) {
		current->flags &= ~PF_KMAP;
		/* we already accounted the second */
		weight = 0;
	} else {
		/* mark 1, account 2 */
		current->flags |= PF_KMAP;
		weight = 2;
	}

	if (weight > 0) {
		/*
		 * reserve KM_TYPE_NR maps per CPU for interrupt context
		 */
		const int target = LAST_PKMAP
#ifndef CONFIG_PREEMPT_RT
				- KM_TYPE_NR*NR_CPUS
#endif
			;

again:
		wait_event(pkmap_wait,
			atomic_read(&pkmap_users) + weight <= target);

		if (atomic_add_return(weight, &pkmap_users) > target) {
			atomic_sub(weight, &pkmap_users);
			goto again;
		}
	}
}
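#if 0
/*
 * Worked example of the limit above, never built.  The numbers are
 * assumptions, not taken from any particular config: with LAST_PKMAP set
 * to 1024, KM_TYPE_NR to 20 and NR_CPUS to 2, the non-RT target becomes
 * 1024 - 20*2 = 984.  Every sleeping task accounts a weight of 2, so at
 * most 984/2 = 492 tasks hold kmaps at once, matching the
 * (LAST_PKMAP - KM_TYPE_NR*NR_CPUS)/2 bound quoted above kmap_account().
 */
static int kmap_account_target_example(void)
{
	const int last_pkmap = 1024;	/* assumed size of the pkmap window */
	const int km_type_nr = 20;	/* assumed atomic kmap slots per CPU */
	const int nr_cpus = 2;		/* assumed number of CPUs */

	return (last_pkmap - km_type_nr * nr_cpus) / 2;	/* 492 tasks */
}
#endif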
static void kunmap_account(void)
{
	int weight;

#ifndef CONFIG_PREEMPT_RT
	if (in_interrupt()) {
		weight = -1;
	} else
#endif
	if (current->flags & PF_KMAP) {
		/* there was only 1 kmap, un-account both */
		current->flags &= ~PF_KMAP;
		weight = 2;
	} else {
		/* there were two kmaps, un-account per kunmap */
		weight = 1;
	}

	if (weight > 0)
		atomic_sub(weight, &pkmap_users);
	wake_up(&pkmap_wait);
}
void *kmap_high(struct page *page)
{
	unsigned long vaddr;

	kmap_account();
again:
	vaddr = (unsigned long)page_address(page);
	if (vaddr) {
		atomic_t *counter = &pkmap_count[PKMAP_NR(vaddr)];
		if (atomic_inc_not_zero(counter)) {
			/*
			 * atomic_inc_not_zero implies a (memory) barrier on success
			 * so page address will be reloaded.
			 */
			unsigned long vaddr2 = (unsigned long)page_address(page);
			if (likely(vaddr == vaddr2))
				return (void *)vaddr;

			/*
			 * Oops, we got someone else.
			 *
			 * This can happen if we get preempted after
			 * page_address() and before atomic_inc_not_zero()
			 * and during that preemption this slot is freed and
			 * reused.
			 */
			pkmap_put(counter);
			goto again;
		}
	}

	vaddr = pkmap_insert(page);
	if (!vaddr)
		goto again;

	return (void *)vaddr;
}

EXPORT_SYMBOL(kmap_high);
void kunmap_high(struct page *page)
{
	unsigned long vaddr = (unsigned long)page_address(page);

	BUG_ON(!vaddr);
	pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]);
	kunmap_account();
}

EXPORT_SYMBOL(kunmap_high);

#endif /* CONFIG_HIGHMEM */
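#if 0
/*
 * Illustrative sketch, never built: how a typical sleeping-context caller
 * reaches kmap_high()/kunmap_high() through the ordinary kmap()/kunmap()
 * wrappers.  The helper name and the memset() are hypothetical; only the
 * kmap()/kunmap() pairing is implied by the code above.
 */
static void kmap_high_usage_sketch(struct page *page)
{
	void *vaddr = kmap(page);	/* may sleep; uses kmap_high() for highmem */

	memset(vaddr, 0, PAGE_SIZE);	/* access the page via its kernel mapping */

	kunmap(page);			/* drops the pkmap_count reference */
}
#endif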
#if defined(HASHED_PAGE_VIRTUAL)

#define PA_HASH_ORDER	7

/*
 * Describes one page->virtual address association.
 */
static struct page_address_map {
	struct page *page;
	void *virtual;
	struct list_head list;
} page_address_maps[LAST_PKMAP];

/*
 * Hash table bucket
 */
static struct page_address_slot {
	struct list_head lh;		/* List of page_address_maps */
	spinlock_t lock;		/* Protect this bucket's list */
} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
static struct page_address_slot *page_slot(struct page *page)
{
	return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
}
/**
 * page_address - get the mapped virtual address of a page
 * @page: &struct page to get the virtual address of
 *
 * Returns the page's virtual address.
 */

static void *__page_address(struct page_address_slot *pas, struct page *page)
{
	void *ret = NULL;

	if (!list_empty(&pas->lh)) {
		struct page_address_map *pam;

		list_for_each_entry(pam, &pas->lh, list) {
			if (pam->page == page) {
				ret = pam->virtual;
				break;
			}
		}
	}

	return ret;
}
void *page_address(struct page *page)
{
	unsigned long flags;
	void *ret;
	struct page_address_slot *pas;

	if (!PageHighMem(page))
		return lowmem_page_address(page);

	pas = page_slot(page);
	spin_lock_irqsave(&pas->lock, flags);
	ret = __page_address(pas, page);
	spin_unlock_irqrestore(&pas->lock, flags);

	return ret;
}

EXPORT_SYMBOL(page_address);
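#if 0
/*
 * Illustrative sketch, never built: what the lookup above means for
 * callers.  A lowmem page always has a linear-map address, so
 * page_address() never returns NULL for it; a highmem page only has an
 * address while a mapping (such as a kmap) is installed, and NULL is
 * returned otherwise.  The helper name is hypothetical.
 */
static void page_address_sketch(struct page *lowmem_page,
				struct page *highmem_page)
{
	void *low = page_address(lowmem_page);		/* never NULL */
	void *high = page_address(highmem_page);	/* NULL if not mapped */

	(void)low;
	(void)high;
}
#endif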
/**
 * set_page_address - set a page's virtual address
 * @page: &struct page to set
 * @virtual: virtual address to use
 */
static int __set_page_address(struct page *page, void *virtual, int pos)
{
	int ret = 0;
	unsigned long flags;
	struct page_address_slot *pas;
	struct page_address_map *pam;

	VM_BUG_ON(!PageHighMem(page));
	VM_BUG_ON(atomic_read(&pkmap_count[pos]) != 0);
	VM_BUG_ON(pos < 0 || pos >= LAST_PKMAP);

	pas = page_slot(page);
	pam = &page_address_maps[pos];

	spin_lock_irqsave(&pas->lock, flags);
	if (virtual) { /* add */
		VM_BUG_ON(!list_empty(&pam->list));

		if (!__page_address(pas, page)) {
			pam->page = page;
			pam->virtual = virtual;
			list_add_tail(&pam->list, &pas->lh);
			ret = 1;
		}
	} else { /* remove */
		if (!list_empty(&pam->list)) {
			list_del_init(&pam->list);
			ret = 1;
		}
	}
	spin_unlock_irqrestore(&pas->lock, flags);

	return ret;
}
int set_page_address(struct page *page, void *virtual)
{
	/*
	 * set_page_address is not supposed to be called when using
	 * hashed virtual addresses.
	 */
	BUG();
	return 0;
}
void __init __page_address_init(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
		INIT_LIST_HEAD(&page_address_maps[i].list);

	for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
		INIT_LIST_HEAD(&page_address_htable[i].lh);
		spin_lock_init(&page_address_htable[i].lock);
	}
}
#elif defined(CONFIG_HIGHMEM) /* HASHED_PAGE_VIRTUAL */

static int __set_page_address(struct page *page, void *virtual, int pos)
{
	return set_page_address(page, virtual);
}

#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
#if defined(CONFIG_HIGHMEM) || defined(HASHED_PAGE_VIRTUAL)

void __init page_address_init(void)
{
#ifdef CONFIG_HIGHMEM
	int i;

	for (i = 0; i < ARRAY_SIZE(pkmap_count); i++)
		atomic_set(&pkmap_count[i], 1);
	atomic_set(&pkmap_hand, 0);
	atomic_set(&pkmap_free, LAST_PKMAP);
	atomic_set(&pkmap_users, 0);
#endif

#ifdef HASHED_PAGE_VIRTUAL
	__page_address_init();
#endif
}

#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */