/*
 * linux/mm/page_alloc.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */

#include <linux/config.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>

int nr_inactive_dirty_pages;
pg_data_t *pgdat_list;

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] = { 10, 10, 10, };
static int zone_balance_max[MAX_NR_ZONES] = { 255, 255, 255, };
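/*
 * Editor's note: these three arrays tune the per-zone watermarks.  In
 * free_area_init_core() below, zone->pages_min is set to
 * realsize / zone_balance_ratio[zone], clamped to the range
 * [zone_balance_min, zone_balance_max], and pages_low / pages_high are
 * derived as 2x and 3x that value.  The "memfrac=" boot option parsed by
 * setup_mem_frac() at the bottom of this file overrides zone_balance_ratio.
 */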
struct list_head active_list;
struct list_head inactive_dirty_list;

/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */

#define memlist_init(x) INIT_LIST_HEAD(x)
#define memlist_add_head list_add
#define memlist_add_tail list_add_tail
#define memlist_del list_del
#define memlist_entry list_entry
#define memlist_next(x) ((x)->next)
#define memlist_prev(x) ((x)->prev)

/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
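/*
 * Editor's note: BAD_RANGE() flags a page whose zone pointer does not
 * match the zone it is being freed to, or whose mem_map index lies
 * outside the [zone->offset, zone->offset + zone->size) window that the
 * zone covers.
 */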
/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
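/*
 * Editor's note, worked example: with mask = ~0UL << order, two's
 * complement gives -mask = 1 + ~mask = 1 << order.  For order 2,
 * mask = ...11111100 and -mask = 4, so page_idx ^ -mask flips bit 2 of
 * the index and yields the buddy block of the same size, e.g. index 8
 * pairs with index 12.
 */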
static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order));
static void __free_pages_ok (struct page *page, unsigned long order)
	unsigned long index, page_idx, mask, flags;
	/*
	 * Subtle. We do not want to test this in the inlined part of
	 * __free_page() - it's a rare condition and just increases
	 * cache footprint unnecessarily. So we do an 'incorrect'
	 * decrement on page->count for reserved pages, but this part
	 */
	if (PageReserved(page))
		return;

	if (!VALID_PAGE(page))
		BUG();
	if (PageSwapCache(page))
		BUG();
	if (PageDecrAfter(page))
		BUG();
	if (PageInactiveDirty(page))
		BUG();
	if (PageInactiveClean(page))
		BUG();

	page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
	page->age = PAGE_AGE_START;

	mask = (~0UL) << order;
	base = mem_map + zone->offset;
	page_idx = page - base;
	if (page_idx & ~mask)
		BUG();
	index = page_idx >> (1 + order);

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);

	zone->free_pages -= mask;

	while (mask + (1 << (MAX_ORDER-1))) {
		struct page *buddy1, *buddy2;

		if (area >= zone->free_area + MAX_ORDER)
			BUG();
		if (!test_and_change_bit(index, area->map))
			/*
			 * the buddy page is still allocated.
			 */
			break;
		/*
		 * Move the buddy up one level.
		 */
		buddy1 = base + (page_idx ^ -mask);
		buddy2 = base + page_idx;
		if (BAD_RANGE(zone,buddy1))
			BUG();
		if (BAD_RANGE(zone,buddy2))
			BUG();

		memlist_del(&buddy1->list);
	memlist_add_head(&(base + page_idx)->list, &area->free_list);

	spin_unlock_irqrestore(&zone->lock, flags);
	/*
	 * We don't want to protect this variable from race conditions
	 * since it's nothing important, but we do want to make sure
	 * it never gets negative.
	 */
	if (memory_pressure > NR_CPUS)
		memory_pressure--;
#define MARK_USED(index, order, area) \
	change_bit((index) >> (1+(order)), (area)->map)
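/*
 * Editor's note: each free_area keeps one bit per pair of buddies at its
 * order, and the bit is toggled both when a block is handed out
 * (MARK_USED in rmqueue/expand) and when one is freed (__free_pages_ok).
 * A set bit therefore means exactly one buddy of the pair is free, which
 * is what the test_and_change_bit() check in __free_pages_ok() relies on
 * when deciding whether the pair can be coalesced.
 */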
static inline struct page * expand (zone_t *zone, struct page *page,
	 unsigned long index, int low, int high, free_area_t * area)
	unsigned long size = 1 << high;

		if (BAD_RANGE(zone,page))
			BUG();
		memlist_add_head(&(page)->list, &(area)->free_list);
		MARK_USED(index, high, area);
	if (BAD_RANGE(zone,page))
		BUG();
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
static struct page * rmqueue(zone_t *zone, unsigned long order)
	free_area_t * area = zone->free_area + order;
	unsigned long curr_order = order;
	struct list_head *head, *curr;

	spin_lock_irqsave(&zone->lock, flags);
		head = &area->free_list;
		curr = memlist_next(head);

			page = memlist_entry(curr, struct page, list);
			if (BAD_RANGE(zone,page))
				BUG();
			index = (page - mem_map) - zone->offset;
			MARK_USED(index, curr_order, area);
			zone->free_pages -= 1 << order;

			page = expand(zone, page, index, order, curr_order, area);
			spin_unlock_irqrestore(&zone->lock, flags);

			set_page_count(page, 1);
			if (BAD_RANGE(zone,page))
				BUG();
	} while (curr_order < MAX_ORDER);
	spin_unlock_irqrestore(&zone->lock, flags);
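/*
 * Editor's note: rmqueue() takes the zone lock, scans the free lists from
 * the requested order upwards, and removes the first block it finds.  Any
 * unused remainder of a larger block is handed back to the lower-order
 * free lists by expand(), which also updates the buddy bitmaps, and the
 * page is returned with its count set to 1.  If no order up to MAX_ORDER
 * has a free block, the loop falls through and no page is returned.
 */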
/*
 * This function does the dirty work for __alloc_pages
 * and is separated out to keep the code size smaller.
 * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
 */
static struct page * __alloc_pages_limit(zonelist_t *zonelist,
			unsigned long order, int limit, int direct_reclaim)
	zone_t **zone = zonelist->zones;

		zone_t *z = *(zone++);
		unsigned long water_mark;

		/*
		 * We allocate if the number of free + inactive_clean
		 * pages is above the watermark.
		 */
		switch (limit) {
			default:
			case PAGES_MIN:
				water_mark = z->pages_min;
				break;
			case PAGES_LOW:
				water_mark = z->pages_low;
				break;
			case PAGES_HIGH:
				water_mark = z->pages_high;
		}
		if (z->free_pages + z->inactive_clean_pages > water_mark) {
			struct page *page = NULL;
			/* If possible, reclaim a page directly. */
			if (direct_reclaim && z->free_pages < z->pages_min + 8)
				page = reclaim_page(z);
			/* If that fails, fall back to rmqueue. */
			if (!page)
				page = rmqueue(z, order);
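/*
 * Editor's note: __alloc_pages() below calls this helper three times with
 * progressively less conservative limits - PAGES_HIGH, then PAGES_LOW,
 * then PAGES_MIN - so a zone is only pushed further below its watermarks
 * once every zone has already failed at the stricter limit.
 */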
/*
 * This is the 'heart' of the zoned buddy allocator:
 */
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
	int direct_reclaim = 0;
	unsigned int gfp_mask = zonelist->gfp_mask;
	/*
	 * Allocations put pressure on the VM subsystem.
	 */

	/*
	 * (If anyone calls gfp from interrupts nonatomically then it
	 * will sooner or later be tripped up by a schedule().)
	 *
	 * We fall back to lower-level zones if allocation
	 * in a higher zone fails.
	 */
	/*
	 * Can we take pages directly from the inactive_clean
	 * list?
	 */
	if (order == 0 && (gfp_mask & __GFP_WAIT) &&
			!(current->flags & PF_MEMALLOC))
		direct_reclaim = 1;
	/*
	 * If we are about to get low on free pages and we also have
	 * an inactive page shortage, wake up kswapd.
	 */
	if (inactive_shortage() > inactive_target / 2 && free_shortage())
	/*
	 * If we are about to get low on free pages and cleaning
	 * the inactive_dirty pages would fix the situation,
	 */
	else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
			&& nr_inactive_dirty_pages >= freepages.high)

	/*
	 * First, see if we have any zones with lots of free memory.
	 *
	 * We allocate free memory first because it doesn't contain
	 */
	zone = zonelist->zones;
		zone_t *z = *(zone++);

		if (z->free_pages >= z->pages_low) {
			page = rmqueue(z, order);
		} else if (z->free_pages < z->pages_min &&
				waitqueue_active(&kreclaimd_wait)) {
			wake_up_interruptible(&kreclaimd_wait);

	/*
	 * Try to allocate a page from a zone with a HIGH
	 * amount of free + inactive_clean pages.
	 *
	 * If there is a lot of activity, inactive_target
	 * will be high and we'll have a good chance of
	 * finding a page using the HIGH limit.
	 */
	page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);

	/*
	 * Then try to allocate a page from a zone with more
	 * than zone->pages_low free + inactive_clean pages.
	 *
	 * When the working set is very large and VM activity
	 * is low, we're most likely to have our allocation
	 */
	page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
	/*
	 * OK, none of the zones on our zonelist has lots
	 * of free pages.
	 *
	 * We wake up kswapd, in the hope that kswapd will
	 * resolve this situation before memory gets tight.
	 *
	 * We also yield the CPU, because that:
	 * - gives kswapd a chance to do something
	 * - slows down allocations, in particular the
	 *   allocations from the fast allocator that's
	 *   causing the problems ...
	 * - ... which minimises the impact the "bad guys"
	 *   have on the rest of the system
	 * - if we don't have __GFP_IO set, kswapd may be
	 *   able to free some memory we can't free ourselves
	 */
	if (gfp_mask & __GFP_WAIT) {
		__set_current_state(TASK_RUNNING);
		current->policy |= SCHED_YIELD;
		schedule();
	}
	/*
	 * After waking up kswapd, we try to allocate a page
	 * from any zone which isn't critical yet.
	 *
	 * Kswapd should, in most situations, bring the situation
	 * back to normal in no time.
	 */
	page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);

	/*
	 * Damn, we didn't succeed.
	 *
	 * This can be due to 2 reasons:
	 * - we're doing a higher-order allocation
	 *   --> move pages to the free list until we succeed
	 * - we're /really/ tight on memory
	 *   --> wait on the kswapd waitqueue until memory is freed
	 */
	if (!(current->flags & PF_MEMALLOC)) {
		/*
		 * Are we dealing with a higher order allocation?
		 *
		 * Move pages from the inactive_clean to the free list
		 * in the hope of creating a large, physically contiguous
		 * piece of free memory.
		 */
		if (order > 0 && (gfp_mask & __GFP_WAIT)) {
			zone = zonelist->zones;
			/* First, clean some dirty pages. */
			page_launder(gfp_mask, 1);
				zone_t *z = *(zone++);

				while (z->inactive_clean_pages) {
					/* Move one page to the free list. */
					page = reclaim_page(z);
					/* Try if the allocation succeeds. */
					page = rmqueue(z, order);

		/*
		 * When we arrive here, we are really tight on memory.
		 *
		 * We wake up kswapd and sleep until kswapd wakes us
		 * up again. After that we loop back to the start.
		 *
		 * We have to do this because something else might eat
		 * the memory kswapd frees for us and we need to be
		 * reliable. Note that we don't loop back for higher
		 * order allocations since it is possible that kswapd
		 * simply cannot free a large enough contiguous area
		 */
		if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {

		/*
		 * If __GFP_IO isn't set, we can't wait on kswapd because
		 * kswapd just might need some IO locks /we/ are holding ...
		 *
		 * SUBTLE: The scheduling point above makes sure that
		 * kswapd does get the chance to free memory we can't
		 */
		} else if (gfp_mask & __GFP_WAIT) {
			try_to_free_pages(gfp_mask);

	/*
	 * Final phase: allocate anything we can!
	 *
	 * Higher order allocations, GFP_ATOMIC allocations and
	 * recursive allocations (PF_MEMALLOC) end up here.
	 *
	 * Only recursive allocations can use the very last pages
	 * in the system, otherwise it would be just too easy to
	 * deadlock the system...
	 */
	zone = zonelist->zones;
		zone_t *z = *(zone++);
		struct page * page = NULL;
		/*
		 * SUBTLE: direct_reclaim is only possible if the task
		 * becomes PF_MEMALLOC while looping above. This will
		 * happen when the OOM killer selects this task for
		 * instant execution...
		 */
		if (direct_reclaim) {
			page = reclaim_page(z);

		/* XXX: is pages_min/4 a good amount to reserve for this? */
		if (z->free_pages < z->pages_min / 4 &&
				!(current->flags & PF_MEMALLOC))
			continue;
		page = rmqueue(z, order);

	printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
/*
 * Common helper functions.
 */
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
	page = alloc_pages(gfp_mask, order);
	return (unsigned long) page_address(page);

unsigned long get_zeroed_page(int gfp_mask)
	page = alloc_pages(gfp_mask, 0);
		void *address = page_address(page);
		return (unsigned long) address;

void __free_pages(struct page *page, unsigned long order)
	if (put_page_testzero(page))
		__free_pages_ok(page, order);
void free_pages(unsigned long addr, unsigned long order)
#ifdef CONFIG_DISCONTIGMEM
	if (addr == 0) return;
#endif
	fpage = virt_to_page(addr);
	if (VALID_PAGE(fpage))
		__free_pages(fpage, order);
/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
	pg_data_t *pgdat = pgdat_list;

		for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
			sum += zone->free_pages;
		pgdat = pgdat->node_next;

/*
 * Total amount of inactive_clean (allocatable) RAM:
 */
unsigned int nr_inactive_clean_pages (void)
	pg_data_t *pgdat = pgdat_list;

		for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
			sum += zone->inactive_clean_pages;
		pgdat = pgdat->node_next;

/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
	sum = nr_free_pages();
	sum += nr_inactive_clean_pages();
	sum += nr_inactive_dirty_pages;

	/*
	 * Keep our write behind queue filled, even if
	 * kswapd lags a bit right now.
	 */
	if (sum < freepages.high + inactive_target)
		sum = freepages.high + inactive_target;
	/*
	 * We don't want dirty page writebehind to put too
	 * much pressure on the working set, but we want it
	 * to be possible to have some dirty pages in the
	 * working set without upsetting the writebehind logic.
	 */
	sum += nr_active_pages >> 4;
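/*
 * Editor's note: the value reported here is free + inactive_clean +
 * inactive_dirty pages, floored at freepages.high + inactive_target so
 * the write-behind queue stays filled even when kswapd lags, plus 1/16
 * of the active pages as headroom for dirty pages that live in the
 * working set.
 */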
unsigned int nr_free_highpages (void)
	pg_data_t *pgdat = pgdat_list;
	unsigned int pages = 0;

		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
		pgdat = pgdat->node_next;

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(pg_data_t *pgdat)
	printk("Free pages: %6dkB (%6dkB HighMem)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		nr_free_highpages() << (PAGE_SHIFT-10));
667 printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n",
669 nr_inactive_dirty_pages
,
670 nr_inactive_clean_pages(),
	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = pgdat->node_zones + type;
		unsigned long nr, total, flags;

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			head = &(zone->free_area + order)->free_list;
				curr = memlist_next(curr);
			total += nr * (1 << order);
			printk("%lu*%lukB ", nr,
				(PAGE_SIZE>>10) << order);
		spin_unlock_irqrestore(&zone->lock, flags);
	printk("= %lukB)\n", total * (PAGE_SIZE>>10));
#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif

void show_free_areas(void)
	show_free_areas_core(pgdat_list);
/*
 * Builds allocation fallback zone lists.
 */
static inline void build_zonelists(pg_data_t *pgdat)
	for (i = 0; i < NR_GFPINDEX; i++) {
		zonelist_t *zonelist;

		zonelist = pgdat->node_zonelists + i;
		memset(zonelist, 0, sizeof(*zonelist));

		zonelist->gfp_mask = i;

		if (i & __GFP_HIGHMEM)

			zone = pgdat->node_zones + ZONE_HIGHMEM;
#ifndef CONFIG_HIGHMEM
				BUG();
#endif
				zonelist->zones[j++] = zone;
			zone = pgdat->node_zones + ZONE_NORMAL;
				zonelist->zones[j++] = zone;
			zone = pgdat->node_zones + ZONE_DMA;
				zonelist->zones[j++] = zone;

		zonelist->zones[j++] = NULL;
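/*
 * Editor's note: each gfp_mask value gets its own zonelist, built as a
 * NULL-terminated fallback chain.  A __GFP_HIGHMEM request falls back
 * from ZONE_HIGHMEM to ZONE_NORMAL to ZONE_DMA; __alloc_pages() simply
 * walks the array in order, so the lower zones are only touched when the
 * preferred ones cannot satisfy the allocation.
 */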
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
	unsigned long *zones_size, unsigned long zone_start_paddr,
	unsigned long *zholes_size, struct page *lmem_map)
	unsigned long map_size;
	unsigned long totalpages, offset, realtotalpages;
	unsigned int cumulative = 0;

	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long size = zones_size[i];
	realtotalpages = totalpages;
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];

	printk("On node %d totalpages: %lu\n", nid, realtotalpages);

	memlist_init(&active_list);
	memlist_init(&inactive_dirty_list);
	/*
	 * Some architectures (with lots of mem and discontinuous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem map array starts from
	 * PAGE_OFFSET; we need to align the actual array onto a mem map
	 * boundary, so that MAP_NR works.
	 */
	map_size = (totalpages + 1)*sizeof(struct page);
	if (lmem_map == (struct page *)0) {
		lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
		lmem_map = (struct page *)(PAGE_OFFSET +
			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
	}
	*gmap = pgdat->node_mem_map = lmem_map;
	pgdat->node_size = totalpages;
	pgdat->node_start_paddr = zone_start_paddr;
	pgdat->node_start_mapnr = (lmem_map - mem_map);
	/*
	 * Initially all pages are reserved - free ones are freed
	 * up by free_all_bootmem() once the early boot process is
	 * done.
	 */
	for (p = lmem_map; p < lmem_map + totalpages; p++) {
		set_page_count(p, 0);
		init_waitqueue_head(&p->wait);
		memlist_init(&p->list);
	offset = lmem_map - mem_map;
	for (j = 0; j < MAX_NR_ZONES; j++) {
		zone_t *zone = pgdat->node_zones + j;
		unsigned long size, realsize;

		realsize = size = zones_size[j];
		realsize -= zholes_size[j];

		printk("zone(%lu): %lu pages.\n", j, size);
		zone->name = zone_names[j];
		zone->lock = SPIN_LOCK_UNLOCKED;
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		zone->inactive_clean_pages = 0;
		zone->inactive_dirty_pages = 0;
		memlist_init(&zone->inactive_clean_list);
		zone->offset = offset;

		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
			mask = zone_balance_max[j];
		zone->pages_min = mask;
		zone->pages_low = mask*2;
		zone->pages_high = mask*3;
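/*
 * Editor's note, worked example: with the default zone_balance_ratio of
 * 128 for ZONE_NORMAL, a zone of 16384 pages (64 MB with 4 KB pages)
 * gets mask = 16384 / 128 = 128, so pages_min = 128, pages_low = 256 and
 * pages_high = 384.  The clamping above only matters for very small
 * zones (fewer than zone_balance_min * ratio pages) or very large ones
 * (more than zone_balance_max * ratio pages).
 */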
		/*
		 * Add these free targets to the global free target;
		 * we have to be SURE that freepages.high is higher
		 * than SUM [zone->pages_min] for all zones, otherwise
		 * we may have bad, bad problems.
		 *
		 * This means we cannot make the freepages array writable
		 * in /proc, but have to add a separate extra_free_target
		 * for people who require it to catch load spikes in e.g.
		 * gigabit ethernet routing...
		 */
		freepages.min += mask;
		freepages.low += mask*2;
		freepages.high += mask*3;
		zone->zone_mem_map = mem_map + offset;
		zone->zone_start_mapnr = offset;
		zone->zone_start_paddr = zone_start_paddr;

		for (i = 0; i < size; i++) {
			struct page *page = mem_map + offset + i;
			if (j != ZONE_HIGHMEM) {
				page->virtual = __va(zone_start_paddr);
				zone_start_paddr += PAGE_SIZE;

		for (i = 0; i < MAX_ORDER; i++) {
			unsigned long bitmap_size;

			memlist_init(&zone->free_area[i].free_list);
			size = (size + ~mask) & mask;
			bitmap_size = size >> i;
			bitmap_size = (bitmap_size + 7) >> 3;
			bitmap_size = LONG_ALIGN(bitmap_size);
			zone->free_area[i].map =
				(unsigned int *) alloc_bootmem_node(pgdat, bitmap_size);
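/*
 * Editor's note: the (size + ~mask) & mask idiom rounds the zone size up
 * so it divides evenly at this order, size >> i then counts the order-i
 * blocks, (x + 7) >> 3 turns that bit count into bytes, and LONG_ALIGN()
 * pads the byte count so the bootmem allocation for the buddy bitmap is
 * a whole number of longs.
 */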
	build_zonelists(pgdat);

void __init free_area_init(unsigned long *zones_size)
	free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);

static int __init setup_mem_frac(char *str)
	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);

__setup("memfrac=", setup_mem_frac);
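/*
 * Editor's note: a boot command line such as "memfrac=32,128,128"
 * (hypothetical values matching the defaults above) overrides
 * zone_balance_ratio for the DMA, Normal and HighMem zones;
 * get_option() keeps consuming comma-separated integers for as long as
 * it reports that another value follows.
 */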