/*
 * linux/mm/page_alloc.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */

#include <linux/config.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>

int nr_inactive_dirty_pages;
pg_data_t *pgdat_list;

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] = { 10, 10, 10, };
static int zone_balance_max[MAX_NR_ZONES] = { 255, 255, 255, };
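/*
 * Editor's note: these three arrays tune the per-zone watermarks.  In
 * free_area_init_core() below, zone->pages_min is set to
 * realsize / zone_balance_ratio[zone], clamped to the range
 * [zone_balance_min, zone_balance_max], and pages_low / pages_high are
 * derived as 2x and 3x that value.  The "memfrac=" boot option parsed by
 * setup_mem_frac() at the bottom of this file overrides zone_balance_ratio.
 */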
struct list_head active_list;
struct list_head inactive_dirty_list;

/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */

#define memlist_init(x) INIT_LIST_HEAD(x)
#define memlist_add_head list_add
#define memlist_add_tail list_add_tail
#define memlist_del list_del
#define memlist_entry list_entry
#define memlist_next(x) ((x)->next)
#define memlist_prev(x) ((x)->prev)

/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
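/*
 * Editor's note: BAD_RANGE() flags a page whose zone pointer does not
 * match the zone it is being freed to, or whose mem_map index lies
 * outside the [zone->offset, zone->offset + zone->size) window that the
 * zone covers.
 */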
/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
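/*
 * Editor's note, worked example: with mask = ~0UL << order, two's
 * complement gives -mask = 1 + ~mask = 1 << order.  For order 2,
 * mask = ...11111100 and -mask = 4, so page_idx ^ -mask flips bit 2 of
 * the index and yields the buddy block of the same size, e.g. index 8
 * pairs with index 12.
 */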
static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order));
static void __free_pages_ok (struct page *page, unsigned long order)
	unsigned long index, page_idx, mask, flags;
	/*
	 * Subtle. We do not want to test this in the inlined part of
	 * __free_page() - it's a rare condition and just increases
	 * cache footprint unnecessarily. So we do an 'incorrect'
	 * decrement on page->count for reserved pages, but this part
	 */
	if (PageReserved(page))
		return;

	if (!VALID_PAGE(page))
		BUG();
	if (PageSwapCache(page))
		BUG();
	if (PageDecrAfter(page))
		BUG();
	if (PageInactiveDirty(page))
		BUG();
	if (PageInactiveClean(page))
		BUG();

	page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
	page->age = PAGE_AGE_START;

	mask = (~0UL) << order;
	base = mem_map + zone->offset;
	page_idx = page - base;
	if (page_idx & ~mask)
		BUG();
	index = page_idx >> (1 + order);

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);

	zone->free_pages -= mask;

	while (mask + (1 << (MAX_ORDER-1))) {
		struct page *buddy1, *buddy2;

		if (area >= zone->free_area + MAX_ORDER)
			BUG();
		if (!test_and_change_bit(index, area->map))
			/*
			 * the buddy page is still allocated.
			 */
			break;
		/*
		 * Move the buddy up one level.
		 */
		buddy1 = base + (page_idx ^ -mask);
		buddy2 = base + page_idx;
		if (BAD_RANGE(zone,buddy1))
			BUG();
		if (BAD_RANGE(zone,buddy2))
			BUG();

		memlist_del(&buddy1->list);
	memlist_add_head(&(base + page_idx)->list, &area->free_list);

	spin_unlock_irqrestore(&zone->lock, flags);
	/*
	 * We don't want to protect this variable from race conditions
	 * since it's nothing important, but we do want to make sure
	 * it never gets negative.
	 */
	if (memory_pressure > NR_CPUS)
		memory_pressure--;
#define MARK_USED(index, order, area) \
	change_bit((index) >> (1+(order)), (area)->map)
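/*
 * Editor's note: each free_area keeps one bit per pair of buddies at its
 * order, and the bit is toggled both when a block is handed out
 * (MARK_USED in rmqueue/expand) and when one is freed (__free_pages_ok).
 * A set bit therefore means exactly one buddy of the pair is free, which
 * is what the test_and_change_bit() check in __free_pages_ok() relies on
 * when deciding whether the pair can be coalesced.
 */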
static inline struct page * expand (zone_t *zone, struct page *page,
	 unsigned long index, int low, int high, free_area_t * area)
	unsigned long size = 1 << high;

		if (BAD_RANGE(zone,page))
			BUG();
		memlist_add_head(&(page)->list, &(area)->free_list);
		MARK_USED(index, high, area);
	if (BAD_RANGE(zone,page))
		BUG();
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
static struct page * rmqueue(zone_t *zone, unsigned long order)
	free_area_t * area = zone->free_area + order;
	unsigned long curr_order = order;
	struct list_head *head, *curr;

	spin_lock_irqsave(&zone->lock, flags);
		head = &area->free_list;
		curr = memlist_next(head);

			page = memlist_entry(curr, struct page, list);
			if (BAD_RANGE(zone,page))
				BUG();
			index = (page - mem_map) - zone->offset;
			MARK_USED(index, curr_order, area);
			zone->free_pages -= 1 << order;

			page = expand(zone, page, index, order, curr_order, area);
			spin_unlock_irqrestore(&zone->lock, flags);

			set_page_count(page, 1);
			if (BAD_RANGE(zone,page))
				BUG();
	} while (curr_order < MAX_ORDER);
	spin_unlock_irqrestore(&zone->lock, flags);
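/*
 * Editor's note: rmqueue() takes the zone lock, scans the free lists from
 * the requested order upwards, and removes the first block it finds.  Any
 * unused remainder of a larger block is handed back to the lower-order
 * free lists by expand(), which also updates the buddy bitmaps, and the
 * page is returned with its count set to 1.  If no order up to MAX_ORDER
 * has a free block, the loop falls through and no page is returned.
 */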
/*
 * This function does the dirty work for __alloc_pages
 * and is separated out to keep the code size smaller.
 * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
 */
static struct page * __alloc_pages_limit(zonelist_t *zonelist,
			unsigned long order, int limit, int direct_reclaim)
	zone_t **zone = zonelist->zones;

		zone_t *z = *(zone++);
		unsigned long water_mark;

		/*
		 * We allocate if the number of free + inactive_clean
		 * pages is above the watermark.
		 */
		switch (limit) {
			default:
			case PAGES_MIN:
				water_mark = z->pages_min;
				break;
			case PAGES_LOW:
				water_mark = z->pages_low;
				break;
			case PAGES_HIGH:
				water_mark = z->pages_high;
		}
		if (z->free_pages + z->inactive_clean_pages > water_mark) {
			struct page *page = NULL;
			/* If possible, reclaim a page directly. */
			if (direct_reclaim && z->free_pages < z->pages_min + 8)
				page = reclaim_page(z);
			/* If that fails, fall back to rmqueue. */
			if (!page)
				page = rmqueue(z, order);
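/*
 * Editor's note: __alloc_pages() below calls this helper three times with
 * progressively less conservative limits - PAGES_HIGH, then PAGES_LOW,
 * then PAGES_MIN - so a zone is only pushed further below its watermarks
 * once every zone has already failed at the stricter limit.
 */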
/*
 * This is the 'heart' of the zoned buddy allocator:
 */
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
	int direct_reclaim = 0;
	unsigned int gfp_mask = zonelist->gfp_mask;
	/*
	 * Allocations put pressure on the VM subsystem.
	 */

	/*
	 * (If anyone calls gfp from interrupts nonatomically then it
	 * will sooner or later be tripped up by a schedule().)
	 *
	 * We fall back to lower-level zones if allocation
	 * in a higher zone fails.
	 */
	/*
	 * Can we take pages directly from the inactive_clean
	 * list?
	 */
	if (order == 0 && (gfp_mask & __GFP_WAIT) &&
			!(current->flags & PF_MEMALLOC))
		direct_reclaim = 1;
	/*
	 * If we are about to get low on free pages and we also have
	 * an inactive page shortage, wake up kswapd.
	 */
	if (inactive_shortage() > inactive_target / 2 && free_shortage())
	/*
	 * If we are about to get low on free pages and cleaning
	 * the inactive_dirty pages would fix the situation,
	 */
	else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
			&& nr_inactive_dirty_pages >= freepages.high)

	/*
	 * First, see if we have any zones with lots of free memory.
	 *
	 * We allocate free memory first because it doesn't contain
	 */
	zone = zonelist->zones;
		zone_t *z = *(zone++);

		if (z->free_pages >= z->pages_low) {
			page = rmqueue(z, order);
		} else if (z->free_pages < z->pages_min &&
				waitqueue_active(&kreclaimd_wait)) {
			wake_up_interruptible(&kreclaimd_wait);

	/*
	 * Try to allocate a page from a zone with a HIGH
	 * amount of free + inactive_clean pages.
	 *
	 * If there is a lot of activity, inactive_target
	 * will be high and we'll have a good chance of
	 * finding a page using the HIGH limit.
	 */
	page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);

	/*
	 * Then try to allocate a page from a zone with more
	 * than zone->pages_low free + inactive_clean pages.
	 *
	 * When the working set is very large and VM activity
	 * is low, we're most likely to have our allocation
	 */
	page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
	/*
	 * OK, none of the zones on our zonelist has lots
	 * of free pages.
	 *
	 * We wake up kswapd, in the hope that kswapd will
	 * resolve this situation before memory gets tight.
	 *
	 * We also yield the CPU, because that:
	 * - gives kswapd a chance to do something
	 * - slows down allocations, in particular the
	 *   allocations from the fast allocator that's
	 *   causing the problems ...
	 * - ... which minimises the impact the "bad guys"
	 *   have on the rest of the system
	 * - if we don't have __GFP_IO set, kswapd may be
	 *   able to free some memory we can't free ourselves
	 */
	if (gfp_mask & __GFP_WAIT) {
		__set_current_state(TASK_RUNNING);
		current->policy |= SCHED_YIELD;
		schedule();
	}
	/*
	 * After waking up kswapd, we try to allocate a page
	 * from any zone which isn't critical yet.
	 *
	 * Kswapd should, in most situations, bring the situation
	 * back to normal in no time.
	 */
	page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);

	/*
	 * Damn, we didn't succeed.
	 *
	 * This can be due to 2 reasons:
	 * - we're doing a higher-order allocation
	 *   --> move pages to the free list until we succeed
	 * - we're /really/ tight on memory
	 *   --> wait on the kswapd waitqueue until memory is freed
	 */
	if (!(current->flags & PF_MEMALLOC)) {
		/*
		 * Are we dealing with a higher order allocation?
		 *
		 * Move pages from the inactive_clean to the free list
		 * in the hope of creating a large, physically contiguous
		 * piece of free memory.
		 */
		if (order > 0 && (gfp_mask & __GFP_WAIT)) {
			zone = zonelist->zones;
			/* First, clean some dirty pages. */
			page_launder(gfp_mask, 1);
				zone_t *z = *(zone++);

				while (z->inactive_clean_pages) {
					/* Move one page to the free list. */
					page = reclaim_page(z);
					/* Try if the allocation succeeds. */
					page = rmqueue(z, order);

		/*
		 * When we arrive here, we are really tight on memory.
		 *
		 * We wake up kswapd and sleep until kswapd wakes us
		 * up again. After that we loop back to the start.
		 *
		 * We have to do this because something else might eat
		 * the memory kswapd frees for us and we need to be
		 * reliable. Note that we don't loop back for higher
		 * order allocations since it is possible that kswapd
		 * simply cannot free a large enough contiguous area
		 */
		if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {

		/*
		 * If __GFP_IO isn't set, we can't wait on kswapd because
		 * kswapd just might need some IO locks /we/ are holding ...
		 *
		 * SUBTLE: The scheduling point above makes sure that
		 * kswapd does get the chance to free memory we can't
		 */
		} else if (gfp_mask & __GFP_WAIT) {
			try_to_free_pages(gfp_mask);

	/*
	 * Final phase: allocate anything we can!
	 *
	 * Higher order allocations, GFP_ATOMIC allocations and
	 * recursive allocations (PF_MEMALLOC) end up here.
	 *
	 * Only recursive allocations can use the very last pages
	 * in the system, otherwise it would be just too easy to
	 * deadlock the system...
	 */
	zone = zonelist->zones;
		zone_t *z = *(zone++);
		struct page * page = NULL;
		/*
		 * SUBTLE: direct_reclaim is only possible if the task
		 * becomes PF_MEMALLOC while looping above. This will
		 * happen when the OOM killer selects this task for
		 * instant execution...
		 */
		if (direct_reclaim) {
			page = reclaim_page(z);

		/* XXX: is pages_min/4 a good amount to reserve for this? */
		if (z->free_pages < z->pages_min / 4 &&
				!(current->flags & PF_MEMALLOC))
			continue;
		page = rmqueue(z, order);

	printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
/*
 * Common helper functions.
 */
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
	page = alloc_pages(gfp_mask, order);
	return (unsigned long) page_address(page);

unsigned long get_zeroed_page(int gfp_mask)
	page = alloc_pages(gfp_mask, 0);
		void *address = page_address(page);
		return (unsigned long) address;

void __free_pages(struct page *page, unsigned long order)
	if (put_page_testzero(page))
		__free_pages_ok(page, order);
void free_pages(unsigned long addr, unsigned long order)
#ifdef CONFIG_DISCONTIGMEM
	if (addr == 0) return;
#endif
	fpage = virt_to_page(addr);
	if (VALID_PAGE(fpage))
		__free_pages(fpage, order);
/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
	pg_data_t *pgdat = pgdat_list;

		for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
			sum += zone->free_pages;
		pgdat = pgdat->node_next;

/*
 * Total amount of inactive_clean (allocatable) RAM:
 */
unsigned int nr_inactive_clean_pages (void)
	pg_data_t *pgdat = pgdat_list;

		for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
			sum += zone->inactive_clean_pages;
		pgdat = pgdat->node_next;

/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
	sum = nr_free_pages();
	sum += nr_inactive_clean_pages();
	sum += nr_inactive_dirty_pages;

	/*
	 * Keep our write behind queue filled, even if
	 * kswapd lags a bit right now.
	 */
	if (sum < freepages.high + inactive_target)
		sum = freepages.high + inactive_target;
	/*
	 * We don't want dirty page writebehind to put too
	 * much pressure on the working set, but we want it
	 * to be possible to have some dirty pages in the
	 * working set without upsetting the writebehind logic.
	 */
	sum += nr_active_pages >> 4;
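/*
 * Editor's note: the value reported here is free + inactive_clean +
 * inactive_dirty pages, floored at freepages.high + inactive_target so
 * the write-behind queue stays filled even when kswapd lags, plus 1/16
 * of the active pages as headroom for dirty pages that live in the
 * working set.
 */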
unsigned int nr_free_highpages (void)
	pg_data_t *pgdat = pgdat_list;
	unsigned int pages = 0;

		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
		pgdat = pgdat->node_next;

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(pg_data_t *pgdat)
	printk("Free pages: %6dkB (%6dkB HighMem)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		nr_free_highpages() << (PAGE_SHIFT-10));
667 printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n",
669 nr_inactive_dirty_pages
,
670 nr_inactive_clean_pages(),
	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = pgdat->node_zones + type;
		unsigned long nr, total, flags;

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			head = &(zone->free_area + order)->free_list;
				curr = memlist_next(curr);
			total += nr * (1 << order);
			printk("%lu*%lukB ", nr,
				(PAGE_SIZE>>10) << order);
		spin_unlock_irqrestore(&zone->lock, flags);
	printk("= %lukB)\n", total * (PAGE_SIZE>>10));
#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif

void show_free_areas(void)
	show_free_areas_core(pgdat_list);
/*
 * Builds allocation fallback zone lists.
 */
static inline void build_zonelists(pg_data_t *pgdat)
	for (i = 0; i < NR_GFPINDEX; i++) {
		zonelist_t *zonelist;

		zonelist = pgdat->node_zonelists + i;
		memset(zonelist, 0, sizeof(*zonelist));

		zonelist->gfp_mask = i;

		if (i & __GFP_HIGHMEM)

			zone = pgdat->node_zones + ZONE_HIGHMEM;
#ifndef CONFIG_HIGHMEM
				BUG();
#endif
				zonelist->zones[j++] = zone;
			zone = pgdat->node_zones + ZONE_NORMAL;
				zonelist->zones[j++] = zone;
			zone = pgdat->node_zones + ZONE_DMA;
				zonelist->zones[j++] = zone;

		zonelist->zones[j++] = NULL;
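/*
 * Editor's note: each gfp_mask value gets its own zonelist, built as a
 * NULL-terminated fallback chain.  A __GFP_HIGHMEM request falls back
 * from ZONE_HIGHMEM to ZONE_NORMAL to ZONE_DMA; __alloc_pages() simply
 * walks the array in order, so the lower zones are only touched when the
 * preferred ones cannot satisfy the allocation.
 */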
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
	unsigned long *zones_size, unsigned long zone_start_paddr,
	unsigned long *zholes_size, struct page *lmem_map)
	unsigned long map_size;
	unsigned long totalpages, offset, realtotalpages;
	unsigned int cumulative = 0;

	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long size = zones_size[i];
	realtotalpages = totalpages;
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];

	printk("On node %d totalpages: %lu\n", nid, realtotalpages);

	memlist_init(&active_list);
	memlist_init(&inactive_dirty_list);
	/*
	 * Some architectures (with lots of mem and discontinuous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem map array starts from
	 * PAGE_OFFSET; we need to align the actual array onto a mem map
	 * boundary, so that MAP_NR works.
	 */
	map_size = (totalpages + 1)*sizeof(struct page);
	if (lmem_map == (struct page *)0) {
		lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
		lmem_map = (struct page *)(PAGE_OFFSET +
			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
	}
	*gmap = pgdat->node_mem_map = lmem_map;
	pgdat->node_size = totalpages;
	pgdat->node_start_paddr = zone_start_paddr;
	pgdat->node_start_mapnr = (lmem_map - mem_map);
	/*
	 * Initially all pages are reserved - free ones are freed
	 * up by free_all_bootmem() once the early boot process is
	 * done.
	 */
	for (p = lmem_map; p < lmem_map + totalpages; p++) {
		set_page_count(p, 0);
		init_waitqueue_head(&p->wait);
		memlist_init(&p->list);
	offset = lmem_map - mem_map;
	for (j = 0; j < MAX_NR_ZONES; j++) {
		zone_t *zone = pgdat->node_zones + j;
		unsigned long size, realsize;

		realsize = size = zones_size[j];
		realsize -= zholes_size[j];

		printk("zone(%lu): %lu pages.\n", j, size);
		zone->name = zone_names[j];
		zone->lock = SPIN_LOCK_UNLOCKED;
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		zone->inactive_clean_pages = 0;
		zone->inactive_dirty_pages = 0;
		memlist_init(&zone->inactive_clean_list);
		zone->offset = offset;

		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
			mask = zone_balance_max[j];
		zone->pages_min = mask;
		zone->pages_low = mask*2;
		zone->pages_high = mask*3;
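/*
 * Editor's note, worked example: with the default zone_balance_ratio of
 * 128 for ZONE_NORMAL, a zone of 16384 pages (64 MB with 4 KB pages)
 * gets mask = 16384 / 128 = 128, so pages_min = 128, pages_low = 256 and
 * pages_high = 384.  The clamping above only matters for very small
 * zones (fewer than zone_balance_min * ratio pages) or very large ones
 * (more than zone_balance_max * ratio pages).
 */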
		/*
		 * Add these free targets to the global free target;
		 * we have to be SURE that freepages.high is higher
		 * than SUM [zone->pages_min] for all zones, otherwise
		 * we may have bad, bad problems.
		 *
		 * This means we cannot make the freepages array writable
		 * in /proc, but have to add a separate extra_free_target
		 * for people who require it to catch load spikes in e.g.
		 * gigabit ethernet routing...
		 */
		freepages.min += mask;
		freepages.low += mask*2;
		freepages.high += mask*3;
		zone->zone_mem_map = mem_map + offset;
		zone->zone_start_mapnr = offset;
		zone->zone_start_paddr = zone_start_paddr;

		for (i = 0; i < size; i++) {
			struct page *page = mem_map + offset + i;
			if (j != ZONE_HIGHMEM) {
				page->virtual = __va(zone_start_paddr);
				zone_start_paddr += PAGE_SIZE;

		for (i = 0; i < MAX_ORDER; i++) {
			unsigned long bitmap_size;

			memlist_init(&zone->free_area[i].free_list);
			size = (size + ~mask) & mask;
			bitmap_size = size >> i;
			bitmap_size = (bitmap_size + 7) >> 3;
			bitmap_size = LONG_ALIGN(bitmap_size);
			zone->free_area[i].map =
				(unsigned int *) alloc_bootmem_node(pgdat, bitmap_size);
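/*
 * Editor's note: the (size + ~mask) & mask idiom rounds the zone size up
 * so it divides evenly at this order, size >> i then counts the order-i
 * blocks, (x + 7) >> 3 turns that bit count into bytes, and LONG_ALIGN()
 * pads the byte count so the bootmem allocation for the buddy bitmap is
 * a whole number of longs.
 */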
	build_zonelists(pgdat);

void __init free_area_init(unsigned long *zones_size)
	free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);

static int __init setup_mem_frac(char *str)
	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);

__setup("memfrac=", setup_mem_frac);
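/*
 * Editor's note: a boot command line such as "memfrac=32,128,128"
 * (hypothetical values matching the defaults above) overrides
 * zone_balance_ratio for the DMA, Normal and HighMem zones;
 * get_option() keeps consuming comma-separated integers for as long as
 * it reports that another value follows.
 */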