1 diff -urN linux-2.6.38/arch/x86/kernel/entry_32.S uksm-2.6.38-zhang/arch/x86/kernel/entry_32.S
2 --- linux-2.6.38/arch/x86/kernel/entry_32.S 2011-03-15 09:20:32.000000000 +0800
3 +++ uksm-2.6.38-zhang/arch/x86/kernel/entry_32.S 2012-01-09 10:05:23.642269166 +0800
5 CFI_ADJUST_CFA_OFFSET 4
13 diff -urN linux-2.6.38/arch/x86/kernel/entry_64.S uksm-2.6.38-zhang/arch/x86/kernel/entry_64.S
14 --- linux-2.6.38/arch/x86/kernel/entry_64.S 2011-03-15 09:20:32.000000000 +0800
15 +++ uksm-2.6.38-zhang/arch/x86/kernel/entry_64.S 2012-01-09 10:05:23.642269166 +0800
17 decl PER_CPU_VAR(irq_count)
20 -END(do_hypervisor_callback)
21 +END(xen_do_hypervisor_callback)
24 * Hypervisor uses this for application faults while it executes.
25 diff -urN linux-2.6.38/fs/exec.c uksm-2.6.38-zhang/fs/exec.c
26 --- linux-2.6.38/fs/exec.c 2011-03-15 09:20:32.000000000 +0800
27 +++ uksm-2.6.38-zhang/fs/exec.c 2012-01-09 10:05:55.168936883 +0800
29 * current->executable is only used by the procfs. This allows a dispatch
30 * table to check for several different types of binary formats. We keep
31 * trying until we recognize the file or we run out of supported binary
36 #include <linux/slab.h>
38 #include <linux/fs_struct.h>
39 #include <linux/pipe_fs_i.h>
40 #include <linux/oom.h>
41 +#include <linux/ksm.h>
43 #include <asm/uaccess.h>
44 #include <asm/mmu_context.h>
46 insert ? list_add(&fmt->lh, &formats) :
47 list_add_tail(&fmt->lh, &formats);
48 write_unlock(&binfmt_lock);
53 EXPORT_SYMBOL(__register_binfmt);
57 current->self_exec_id++;
60 flush_signal_handlers(current, 0);
61 flush_old_files(current->files);
68 - * Fill the binprm structure from the inode.
70 + * Fill the binprm structure from the inode.
71 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
73 * This may be called multiple times for binary chains (scripts for example).
74 diff -urN linux-2.6.38/fs/proc/meminfo.c uksm-2.6.38-zhang/fs/proc/meminfo.c
75 --- linux-2.6.38/fs/proc/meminfo.c 2011-03-15 09:20:32.000000000 +0800
76 +++ uksm-2.6.38-zhang/fs/proc/meminfo.c 2012-01-09 10:05:56.362270256 +0800
78 "SUnreclaim: %8lu kB\n"
79 "KernelStack: %8lu kB\n"
80 "PageTables: %8lu kB\n"
82 + "KsmSharing: %8lu kB\n"
83 + "KsmZeroPages: %8lu kB\n"
85 #ifdef CONFIG_QUICKLIST
86 "Quicklists: %8lu kB\n"
89 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
90 global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
91 K(global_page_state(NR_PAGETABLE)),
93 + K(global_page_state(NR_KSM_PAGES_SHARING)),
94 + K(global_page_state(NR_KSM_ZERO_PAGES)),
96 #ifdef CONFIG_QUICKLIST
97 K(quicklist_total_size()),
100 diff -urN linux-2.6.38/include/linux/ksm.h uksm-2.6.38-zhang/include/linux/ksm.h
101 --- linux-2.6.38/include/linux/ksm.h 2011-03-15 09:20:32.000000000 +0800
102 +++ uksm-2.6.38-zhang/include/linux/ksm.h 2012-01-09 10:11:24.218947858 +0800
104 struct vm_area_struct *vma, unsigned long address);
107 -int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
108 - unsigned long end, int advice, unsigned long *vm_flags);
109 -int __ksm_enter(struct mm_struct *mm);
110 -void __ksm_exit(struct mm_struct *mm);
112 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
114 - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
115 - return __ksm_enter(mm);
119 -static inline void ksm_exit(struct mm_struct *mm)
121 - if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
124 +extern unsigned long zero_pfn __read_mostly;
125 +extern unsigned long ksm_zero_pfn __read_mostly;
126 +extern struct page *empty_ksm_zero_page;
129 * A KSM page is one of those write-protected "shared pages" or "merged pages"
131 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
134 +/* must be done before linked to mm */
135 +extern void ksm_vma_add_new(struct vm_area_struct *vma);
137 +extern void ksm_remove_vma(struct vm_area_struct *vma);
138 +extern int unmerge_ksm_pages(struct vm_area_struct *vma,
139 + unsigned long start, unsigned long end);
142 * When do_swap_page() first faults in from swap what used to be a KSM page,
143 * no problem, it will be assigned to this vma's anon_vma; but thereafter,
145 struct vm_area_struct *, unsigned long, void *), void *arg);
146 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
148 -#else /* !CONFIG_KSM */
149 +/* Each rung of this ladder is a list of VMAs having a same scan ratio */
151 + struct list_head vma_list;
152 + //spinlock_t vma_list_lock;
153 + //struct semaphore sem;
154 + struct list_head *current_scan;
155 + unsigned int pages_to_scan;
156 + unsigned char round_finished; /* rung is ready for the next round */
157 + unsigned char busy_searched;
158 + unsigned long fully_scanned_slots;
159 + unsigned long scan_ratio;
160 + unsigned long vma_num;
161 + //unsigned long vma_finished;
162 + unsigned long scan_turn;
166 + struct list_head ksm_list;
167 + struct list_head slot_list;
168 + unsigned long dedup_ratio;
169 + unsigned long dedup_num;
170 + int ksm_index; /* -1 if vma is not in inter-table,
171 + positive otherwise */
172 + unsigned long pages_scanned;
173 + unsigned long last_scanned;
174 + unsigned long pages_to_scan;
175 + struct scan_rung *rung;
176 + struct page **rmap_list_pool;
177 + unsigned long *pool_counts;
178 + unsigned long pool_size;
179 + struct vm_area_struct *vma;
180 + struct mm_struct *mm;
181 + unsigned long ctime_j;
182 + unsigned long pages;
183 + unsigned char need_sort;
184 + unsigned char need_rerand;
185 + unsigned long slot_scanned; /* It's scanned in this round */
186 + unsigned long fully_scanned; /* the above four to be merged to status bits */
187 + unsigned long pages_cowed; /* pages cowed this round */
188 + unsigned long pages_merged; /* pages merged this round */
190 + /* used for dup vma pair */
191 + struct radix_tree_root dup_tree;
194 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
199 + * A few notes about the KSM scanning process,
200 + * to make it easier to understand the data structures below:
202 + * In order to reduce excessive scanning, KSM sorts the memory pages by their
203 + * contents into a data structure that holds pointers to the pages' locations.
205 + * Since the contents of the pages may change at any moment, KSM cannot just
206 + * insert the pages into a normal sorted tree and expect it to find anything.
207 + * Therefore KSM uses two data structures - the stable and the unstable tree.
209 + * The stable tree holds pointers to all the merged pages (ksm pages), sorted
210 + * by their contents. Because each such page is write-protected, searching on
211 + * this tree is fully assured to be working (except when pages are unmapped),
212 + * and therefore this tree is called the stable tree.
214 + * In addition to the stable tree, KSM uses a second data structure called the
215 + * unstable tree: this tree holds pointers to pages which have been found to
216 + * be "unchanged for a period of time". The unstable tree sorts these pages
217 + * by their contents, but since they are not write-protected, KSM cannot rely
218 + * upon the unstable tree to work correctly - the unstable tree is liable to
219 + * be corrupted as its contents are modified, and so it is called unstable.
221 + * KSM solves this problem by several techniques:
223 + * 1) The unstable tree is flushed every time KSM completes scanning all
224 + * memory areas, and then the tree is rebuilt again from the beginning.
225 + * 2) KSM will only insert into the unstable tree, pages whose hash value
226 + * has not changed since the previous scan of all memory areas.
227 + * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
228 + * colors of the nodes and not on their contents, assuring that even when
229 + * the tree gets "corrupted" it won't get out of balance, so scanning time
230 + * remains the same (also, searching and inserting nodes in an rbtree uses
231 + * the same algorithm, so we have no overhead when we flush and rebuild).
232 + * 4) KSM never flushes the stable tree, which means that even if it were to
233 + * take 10 attempts to find a page in the unstable tree, once it is found,
234 + * it is secured in the stable tree. (When we scan a new page, we first
235 + * compare it against the stable tree, and then against the unstable tree.)
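
As an illustration of point 3 above (a minimal standalone sketch, not code from this patch; node and key names are hypothetical), the usual kernel rbtree pattern lets lookup and insertion share the same descent, which is why flushing and rebuilding the unstable tree costs no more than searching it:

#include <linux/rbtree.h>

struct sample_node {
	struct rb_node node;
	unsigned long key;		/* e.g. a page's hash value */
};

/* Walk down comparing keys; on a hit return the existing node, on a miss
 * link the new node exactly where the search fell off the tree. */
static struct sample_node *tree_search_insert(struct rb_root *root,
					      struct sample_node *new)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;

	while (*link) {
		struct sample_node *cur;

		cur = rb_entry(*link, struct sample_node, node);
		parent = *link;
		if (new->key < cur->key)
			link = &(*link)->rb_left;
		else if (new->key > cur->key)
			link = &(*link)->rb_right;
		else
			return cur;		/* search hit */
	}
	rb_link_node(&new->node, parent, link);	/* insert at the miss point */
	rb_insert_color(&new->node, root);
	return new;
}
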
238 -static inline void ksm_exit(struct mm_struct *mm)
243 + * node of either the stable or unstable rbtree
247 + struct rb_node node; /* link in the main (un)stable rbtree */
248 + struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
250 + unsigned long count; /* how many sublevel tree nodes */
251 + struct list_head all_list; /* all tree nodes in stable/unstable tree */
256 + * struct stable_node - node of the stable rbtree
257 + * @node: rb node of this ksm page in the stable tree
258 + * @hlist: hlist head of rmap_items using this ksm page
259 + * @kpfn: page frame number of this ksm page
261 +struct stable_node {
262 + struct rb_node node; /* link in sub-rbtree */
263 + struct tree_node *tree_node; /* its tree_node root in the stable tree, NULL if it's in the hell list */
264 + struct hlist_head hlist;
265 + unsigned long kpfn;
266 + u32 hash_max; /* if ==0 then it's not been calculated yet */
267 + //struct vm_area_struct *old_vma;
268 + struct list_head all_list; /* in a list for all stable nodes */
275 + * struct node_vma - group rmap_items linked in a same stable
280 + struct vma_slot *slot;
281 + unsigned long key; /* slot is used as key sorted on hlist */
283 + struct hlist_node hlist;
284 + struct hlist_head rmap_hlist;
285 + struct stable_node *head;
286 + unsigned long last_update;
290 + * struct rmap_item - reverse mapping item for virtual addresses
291 + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
292 + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
293 + * @mm: the memory structure this rmap_item is pointing into
294 + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
295 + * @node: rb node of this rmap_item in the unstable tree
296 + * @head: pointer to stable_node heading this list in the stable tree
297 + * @hlist: link into hlist of rmap_items hanging off that stable_node
300 + struct vma_slot *slot;
302 + unsigned long address; /* + low bits used for flags below */
303 + /* Appended to (un)stable tree on which scan round */
304 + unsigned long append_round;
306 + /* Which rung scan turn it was last scanned */
307 + //unsigned long last_scan;
308 + unsigned long entry_index;
310 + struct {/* when in unstable tree */
311 + struct rb_node node;
312 + struct tree_node *tree_node;
315 + struct { /* when in stable tree */
316 + struct node_vma *head;
317 + struct hlist_node hlist;
318 + struct anon_vma *anon_vma;
321 +} __attribute__((aligned(4)));
323 +struct rmap_list_entry {
325 + struct rmap_item *item;
326 + unsigned long addr;
328 + // lowest bit is used for is_addr tag
329 + //unsigned char is_addr;
330 +} __attribute__((aligned(4))); // 4-byte aligned to fit into pages
332 +//extern struct semaphore ksm_scan_sem;
333 +#else /* !CONFIG_KSM */
335 static inline int PageKsm(struct page *page)
341 -static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
342 - unsigned long end, int advice, unsigned long *vm_flags)
344 +extern inline int unmerge_ksm_pages(struct vm_area_struct *vma,
345 + unsigned long start, unsigned long end)
349 diff -urN linux-2.6.38/include/linux/mm_types.h uksm-2.6.38-zhang/include/linux/mm_types.h
350 --- linux-2.6.38/include/linux/mm_types.h 2011-03-15 09:20:32.000000000 +0800
351 +++ uksm-2.6.38-zhang/include/linux/mm_types.h 2012-01-09 10:05:57.562270296 +0800
354 struct mempolicy *vm_policy; /* NUMA policy for the VMA */
357 + struct vma_slot *ksm_vma_slot;
362 diff -urN linux-2.6.38/include/linux/mmzone.h uksm-2.6.38-zhang/include/linux/mmzone.h
363 --- linux-2.6.38/include/linux/mmzone.h 2011-03-15 09:20:32.000000000 +0800
364 +++ uksm-2.6.38-zhang/include/linux/mmzone.h 2012-01-09 10:05:57.562270296 +0800
366 NUMA_OTHER, /* allocation from other node */
368 NR_ANON_TRANSPARENT_HUGEPAGES,
370 + NR_KSM_PAGES_SHARING,
373 NR_VM_ZONE_STAT_ITEMS };
379 /* Fields commonly accessed by the page reclaim scanner */
380 - spinlock_t lru_lock;
381 + spinlock_t lru_lock;
383 struct list_head list;
389 - * is_highmem - helper function to quickly check if a struct zone is a
390 + * is_highmem - helper function to quickly check if a struct zone is a
391 * highmem zone or not. This is an attempt to keep references
392 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
393 * @zone - pointer to struct zone variable
394 diff -urN linux-2.6.38/include/linux/sched.h uksm-2.6.38-zhang/include/linux/sched.h
395 --- linux-2.6.38/include/linux/sched.h 2011-03-15 09:20:32.000000000 +0800
396 +++ uksm-2.6.38-zhang/include/linux/sched.h 2012-01-09 10:05:57.815603639 +0800
398 # define MMF_DUMP_MASK_DEFAULT_ELF 0
400 /* leave room for more dump flags */
401 -#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
402 #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
404 #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
405 @@ -1280,9 +1279,9 @@
406 unsigned long stack_canary;
411 * pointers to (original) parent process, youngest child, younger sibling,
412 - * older sibling, respectively. (p->father can be replaced with
413 + * older sibling, respectively. (p->father can be replaced with
414 * p->real_parent->pid)
416 struct task_struct *real_parent; /* real parent process */
417 @@ -2080,7 +2079,7 @@
418 spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
424 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
426 diff -urN linux-2.6.38/kernel/fork.c uksm-2.6.38-zhang/kernel/fork.c
427 --- linux-2.6.38/kernel/fork.c 2011-03-15 09:20:32.000000000 +0800
428 +++ uksm-2.6.38-zhang/kernel/fork.c 2012-01-09 10:05:59.635603699 +0800
430 rb_link = &mm->mm_rb.rb_node;
433 - retval = ksm_fork(mm, oldmm);
436 retval = khugepaged_fork(mm, oldmm);
443 - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
444 + tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
449 __vma_link_rb(mm, tmp, rb_link, rb_parent);
450 rb_link = &tmp->vm_rb.rb_right;
451 rb_parent = &tmp->vm_rb;
454 + ksm_vma_add_new(tmp);
457 retval = copy_page_range(mm, oldmm, mpnt);
461 if (atomic_dec_and_test(&mm->mm_users)) {
464 khugepaged_exit(mm); /* must run before exit_mmap */
466 set_mm_exe_file(mm, NULL);
467 diff -urN linux-2.6.38/mm/ksm.c uksm-2.6.38-zhang/mm/ksm.c
468 --- linux-2.6.38/mm/ksm.c 2011-03-15 09:20:32.000000000 +0800
469 +++ uksm-2.6.38-zhang/mm/ksm.c 2012-01-09 10:05:59.862270375 +0800
473 * This work is licensed under the terms of the GNU GPL, version 2.
477 + * Ultra KSM. Copyright (C) 2011 Nai Xia
479 + * This is an improvement upon KSM. Its features:
480 + * 1. Full system scan:
481 + * It automatically scans all user processes' anonymous VMAs. Kernel-user
482 + * interaction to submit a memory area to KSM is no longer needed.
484 + * 2. Rich area detection based on random sampling:
485 + * It automatically detects rich areas containing abundant duplicated
486 + * pages based on their randomly-sampled history. Rich areas are given
487 + * a full scan speed. Poor areas are sampled at a reasonable speed with
488 + * very low CPU consumption.
490 + * 3. Per-page scan speed improvement:
491 + * A new hash algorithm (random_sample_hash) is proposed. Quite often,
492 + * it's enough to distinguish pages by hashing their partial content
493 + * instead of full pages. This algorithm can automatically adapt to this
494 + * situation. For the best case, only one 32-bit-word/page is needed to
495 + * get the hash value for distinguishing pages. For the worst case, it's as
496 + * fast as SuperFastHash.
498 + * 4. Thrashing area avoidance:
499 + * A thrashing area (a VMA that has frequent KSM page break-outs) can be
500 + * filtered out. My benchmark shows it's more efficient than KSM's per-page
501 + * hash value based volatile page detection.
503 + * 5. Hash-value-based identical page detection:
504 + * It no longer uses memcmp-based page detection.
506 + * 6. Misc changes upon KSM:
507 + * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
508 + * comparison. It's much faster than the default C version on x86.
509 + * * rmap_item now has a struct page * member to loosely cache an
510 + * address->page mapping, which reduces the number of time-costly
512 + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
513 + * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
514 + * ksm is needed for this case.
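
As a rough illustration of feature 3 above (a standalone sketch, not the patch's random_sample_hash(); the page size, sampled offsets and mixing shifts are illustrative), hashing only a few randomly chosen 32-bit words of a page is usually enough to tell two pages apart without reading them in full:

#include <stdint.h>
#include <stddef.h>

#define WORDS_PER_PAGE (4096 / sizeof(uint32_t))

/* Hash 'strength' pre-chosen 32-bit words of a page instead of all 1024;
 * 'offsets' holds the randomly sampled word indexes. */
static uint32_t sample_hash(const uint32_t *page, const uint32_t *offsets,
			    unsigned int strength)
{
	uint32_t hash = 0xdeadbeef;
	unsigned int i;

	for (i = 0; i < strength; i++) {
		hash += page[offsets[i] % WORDS_PER_PAGE];
		hash += hash << 13;	/* mixing shifts are illustrative */
		hash ^= hash >> 15;
	}
	return hash;
}
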
517 #include <linux/errno.h>
518 @@ -33,142 +74,168 @@
519 #include <linux/mmu_notifier.h>
520 #include <linux/swap.h>
521 #include <linux/ksm.h>
522 -#include <linux/hash.h>
523 +#include <linux/crypto.h>
524 +#include <linux/scatterlist.h>
525 +#include <crypto/hash.h>
526 +#include <linux/random.h>
527 +#include <linux/math64.h>
528 +#include <linux/gcd.h>
529 #include <linux/freezer.h>
531 #include <asm/tlbflush.h>
532 #include "internal.h"
537 +#ifdef CONFIG_X86_32
538 +#define memcmp memcmpx86_32
540 - * A few notes about the KSM scanning process,
541 - * to make it easier to understand the data structures below:
543 - * In order to reduce excessive scanning, KSM sorts the memory pages by their
544 - * contents into a data structure that holds pointers to the pages' locations.
546 - * Since the contents of the pages may change at any moment, KSM cannot just
547 - * insert the pages into a normal sorted tree and expect it to find anything.
548 - * Therefore KSM uses two data structures - the stable and the unstable tree.
550 - * The stable tree holds pointers to all the merged pages (ksm pages), sorted
551 - * by their contents. Because each such page is write-protected, searching on
552 - * this tree is fully assured to be working (except when pages are unmapped),
553 - * and therefore this tree is called the stable tree.
555 - * In addition to the stable tree, KSM uses a second data structure called the
556 - * unstable tree: this tree holds pointers to pages which have been found to
557 - * be "unchanged for a period of time". The unstable tree sorts these pages
558 - * by their contents, but since they are not write-protected, KSM cannot rely
559 - * upon the unstable tree to work correctly - the unstable tree is liable to
560 - * be corrupted as its contents are modified, and so it is called unstable.
562 - * KSM solves this problem by several techniques:
564 - * 1) The unstable tree is flushed every time KSM completes scanning all
565 - * memory areas, and then the tree is rebuilt again from the beginning.
566 - * 2) KSM will only insert into the unstable tree, pages whose hash value
567 - * has not changed since the previous scan of all memory areas.
568 - * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
569 - * colors of the nodes and not on their contents, assuring that even when
570 - * the tree gets "corrupted" it won't get out of balance, so scanning time
571 - * remains the same (also, searching and inserting nodes in an rbtree uses
572 - * the same algorithm, so we have no overhead when we flush and rebuild).
573 - * 4) KSM never flushes the stable tree, which means that even if it were to
574 - * take 10 attempts to find a page in the unstable tree, once it is found,
575 - * it is secured in the stable tree. (When we scan a new page, we first
576 - * compare it against the stable tree, and then against the unstable tree.)
577 + * Compare 4-byte-aligned addresses s1 and s2, with length n
579 +int memcmpx86_32(void *s1, void *s2, size_t n)
581 + size_t num = n / 4;
585 - * struct mm_slot - ksm information per mm that is being scanned
586 - * @link: link to the mm_slots hash list
587 - * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
588 - * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
589 - * @mm: the mm that this information is valid for
592 - struct hlist_node link;
593 - struct list_head mm_list;
594 - struct rmap_item *rmap_list;
595 - struct mm_struct *mm;
597 + __asm__ __volatile__
605 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
610 - * struct ksm_scan - cursor for scanning
611 - * @mm_slot: the current mm_slot we are scanning
612 - * @address: the next address inside that to be scanned
613 - * @rmap_list: link to the next rmap to be scanned in the rmap_list
614 - * @seqnr: count of completed full scans (needed when removing unstable node)
616 - * There is only the one ksm_scan instance of this cursor structure.
621 + * Check whether the page is all zeroes.
624 - struct mm_slot *mm_slot;
625 - unsigned long address;
626 - struct rmap_item **rmap_list;
627 - unsigned long seqnr;
629 +static int is_full_zero(const void *s1, size_t len)
631 + unsigned char same;
634 - * struct stable_node - node of the stable rbtree
635 - * @node: rb node of this ksm page in the stable tree
636 - * @hlist: hlist head of rmap_items using this ksm page
637 - * @kpfn: page frame number of this ksm page
639 -struct stable_node {
640 - struct rb_node node;
641 - struct hlist_head hlist;
642 - unsigned long kpfn;
647 - * struct rmap_item - reverse mapping item for virtual addresses
648 - * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
649 - * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
650 - * @mm: the memory structure this rmap_item is pointing into
651 - * @address: the virtual address this rmap_item tracks (+ flags in low bits)
652 - * @oldchecksum: previous checksum of the page at that virtual address
653 - * @node: rb node of this rmap_item in the unstable tree
654 - * @head: pointer to stable_node heading this list in the stable tree
655 - * @hlist: link into hlist of rmap_items hanging off that stable_node
658 - struct rmap_item *rmap_list;
659 - struct anon_vma *anon_vma; /* when stable */
660 - struct mm_struct *mm;
661 - unsigned long address; /* + low bits used for flags below */
662 - unsigned int oldchecksum; /* when unstable */
664 - struct rb_node node; /* when node of unstable tree */
665 - struct { /* when listed from stable tree */
666 - struct stable_node *head;
667 - struct hlist_node hlist;
671 + __asm__ __volatile__
674 + : "=qm" (same), "+D" (s1), "+c" (len)
681 -#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
682 -#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
683 -#define STABLE_FLAG 0x200 /* is listed from the stable tree */
685 -/* The stable and unstable tree heads */
686 -static struct rb_root root_stable_tree = RB_ROOT;
687 -static struct rb_root root_unstable_tree = RB_ROOT;
688 +#elif defined(CONFIG_X86_64)
689 +#define memcmp memcmpx86_64
691 + * Compare 8-byte-aligned addresses s1 and s2, with length n
693 +int memcmpx86_64(void *s1, void *s2, size_t n)
695 + size_t num = n / 8;
698 -#define MM_SLOTS_HASH_SHIFT 10
699 -#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
700 -static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
701 + __asm__ __volatile__
703 + "testq %q3,%q3\n\t"
709 + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
713 -static struct mm_slot ksm_mm_head = {
714 - .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
716 -static struct ksm_scan ksm_scan = {
717 - .mm_slot = &ksm_mm_head,
722 +static int is_full_zero(const void *s1, size_t len)
724 + unsigned char same;
728 + __asm__ __volatile__
731 + : "=qm" (same), "+D" (s1), "+c" (len)
740 +static int is_full_zero(const void *s1, size_t len)
742 + unsigned long *src = s1;
745 + len /= sizeof(*src);
747 + for (i = 0; i < len; i++) {
756 +#define U64_MAX (~((u64)0))
760 + * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
761 + * The flags use the low bits of rmap_item.address
763 +#define UNSTABLE_FLAG 0x1
764 +#define STABLE_FLAG 0x2
765 +#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
768 + * rmap_list_entry helpers
770 +#define IS_ADDR_FLAG 1
771 +#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
772 +#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
773 +#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
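
Because struct rmap_item and struct rmap_list_entry are 4-byte aligned, the lowest bit of any value stored in an entry is normally clear, so it can carry the is_addr tag. A minimal standalone demonstration of the trick (hypothetical names, not code from the patch):

#include <assert.h>

#define IS_ADDR_FLAG 1UL

struct item { int dummy; } __attribute__((aligned(4)));

int main(void)
{
	struct item it;
	unsigned long entry;

	/* Tag the low bit so readers can tell a raw address apart from an
	 * item pointer, which is always at least 4-byte aligned. */
	entry = (unsigned long)&it | IS_ADDR_FLAG;
	assert(entry & IS_ADDR_FLAG);				/* is_addr() */
	assert((entry & ~IS_ADDR_FLAG) == (unsigned long)&it);	/* get_clean_addr() */
	return 0;
}
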
777 + * High speed caches for frequently allocated and freed structs
779 static struct kmem_cache *rmap_item_cache;
780 static struct kmem_cache *stable_node_cache;
781 -static struct kmem_cache *mm_slot_cache;
782 +static struct kmem_cache *node_vma_cache;
783 +static struct kmem_cache *vma_slot_cache;
784 +static struct kmem_cache *tree_node_cache;
785 +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
786 + sizeof(struct __struct), __alignof__(struct __struct),\
789 +/* The scan rounds ksmd is currently in */
790 +static unsigned long long ksm_scan_round = 1;
792 +/* The number of pages that have been scanned since startup */
793 +static u64 ksm_pages_scanned;
795 +/* The number of pages that had been scanned when the last scan round finished */
796 +static u64 ksm_pages_scanned_last;
798 +/* If the scanned number grows too large, we encode it here */
799 +static u64 pages_scanned_stored;
800 +static unsigned long pages_scanned_base;
802 /* The number of nodes in the stable tree */
803 static unsigned long ksm_pages_shared;
804 @@ -179,345 +246,403 @@
805 /* The number of nodes in the unstable tree */
806 static unsigned long ksm_pages_unshared;
808 -/* The number of rmap_items in use: to calculate pages_volatile */
809 -static unsigned long ksm_rmap_items;
811 -/* Number of pages ksmd should scan in one batch */
812 -static unsigned int ksm_thread_pages_to_scan = 100;
814 + * Number of pages ksmd should scan in one batch. This is the top speed for
815 + * richly duplicated areas.
817 +static unsigned long ksm_scan_batch_pages = 60000;
819 /* Milliseconds ksmd should sleep between batches */
820 -static unsigned int ksm_thread_sleep_millisecs = 20;
821 +static unsigned int ksm_sleep_jiffies = 2;
824 + * The threshold used to filter out thrashing areas.
825 + * If it is 0, filtering is disabled; otherwise it is the percentage upper bound
826 + * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
827 + * will be considered to have a zero duplication ratio.
829 +static unsigned int ksm_thrash_threshold = 50;
831 +/* To avoid floating point arithmetic, this is the scale of a
832 + * deduplication ratio number.
834 +#define KSM_DEDUP_RATIO_SCALE 100
837 +#define KSM_SCAN_RATIO_MAX 125
839 +/* minimum scan ratio for a vma, in unit of 1/KSM_SCAN_RATIO_MAX */
840 +static unsigned int ksm_min_scan_ratio = 1;
843 + * After each scan round, the scan ratio of an area with a big deduplication
844 + * ratio is upgraded by *=ksm_scan_ratio_delta
846 +static unsigned int ksm_scan_ratio_delta = 5;
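
A minimal sketch (assuming a simple proportional definition, which the patch may refine) of how a deduplication ratio can be kept as a scaled integer and used to upgrade a rung's scan ratio without floating point:

#define KSM_DEDUP_RATIO_SCALE	100
#define KSM_SCAN_RATIO_MAX	125

/* e.g. a return value of 37 means 37% of the scanned pages were merged */
static unsigned long calc_dedup_ratio(unsigned long pages_merged,
				      unsigned long pages_scanned)
{
	if (!pages_scanned)
		return 0;
	return pages_merged * KSM_DEDUP_RATIO_SCALE / pages_scanned;
}

/* Rich areas are promoted to a faster rung, capped at the maximum ratio. */
static unsigned long upgrade_scan_ratio(unsigned long scan_ratio,
					unsigned int delta)
{
	scan_ratio *= delta;
	if (scan_ratio > KSM_SCAN_RATIO_MAX)
		scan_ratio = KSM_SCAN_RATIO_MAX;
	return scan_ratio;
}
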
849 + * Inter-vma duplication number table page pointer array, initialized at
850 + * startup. Whenever ksmd finds that two areas have an identical page,
851 + * their corresponding table entry is increased. After each scan round
852 + * is finished, this table is scanned to calculate the estimated
853 + * duplication ratio for VMAs. Only a limited number (2048) of VMAs is
854 + * supported for now. We will migrate to more scalable data structures
857 +#define KSM_DUP_VMA_MAX 2048
859 +#define INDIRECT_OFFSET 1
862 + * For mapping of vma_slot and its index in inter-vma duplication number
865 +static struct radix_tree_root ksm_vma_tree;
866 +static unsigned long ksm_vma_tree_num;
867 +static unsigned long ksm_vma_tree_index_end;
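
A minimal sketch (assumed usage of the stock radix tree API, not code lifted from the patch; index allocation is simplified and names are hypothetical) of how a vma_slot can be mapped to an index in the duplication table and looked up again:

#include <linux/radix-tree.h>

static RADIX_TREE(vma_index_tree, GFP_KERNEL);	/* index -> vma_slot */
static unsigned long vma_index_end;

/* Hand the next free index to a slot and remember the mapping.
 * (Real code may need radix_tree_preload() when inserting under a lock.) */
static int vma_index_add(struct vma_slot *slot)
{
	int err;

	err = radix_tree_insert(&vma_index_tree, vma_index_end, slot);
	if (err)
		return err;
	slot->ksm_index = vma_index_end++;
	return 0;
}

/* Look a slot back up by its table index. */
static struct vma_slot *vma_index_get(unsigned long index)
{
	return radix_tree_lookup(&vma_index_tree, index);
}
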
869 +/* Array of all scan_rung, ksm_scan_ladder[0] having the minimum scan ratio */
870 +static struct scan_rung *ksm_scan_ladder;
871 +static unsigned int ksm_scan_ladder_size;
873 +/* The number of VMAs we are keeping track of */
874 +static unsigned long ksm_vma_slot_num;
876 +/* How many times the ksmd has slept since startup */
877 +static u64 ksm_sleep_times;
879 #define KSM_RUN_STOP 0
880 #define KSM_RUN_MERGE 1
881 -#define KSM_RUN_UNMERGE 2
882 -static unsigned int ksm_run = KSM_RUN_STOP;
883 +static unsigned int ksm_run = KSM_RUN_MERGE;
885 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
886 static DEFINE_MUTEX(ksm_thread_mutex);
887 -static DEFINE_SPINLOCK(ksm_mmlist_lock);
889 -#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
890 - sizeof(struct __struct), __alignof__(struct __struct),\
893 + * List vma_slot_new holds newly created vma_slots waiting to be added by
894 + * ksmd. If one cannot be added (e.g. because it is too small), it is moved to
895 + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
896 + * VMA has been removed/freed.
898 +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
899 +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
900 +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
901 +static DEFINE_SPINLOCK(vma_slot_list_lock);
903 -static int __init ksm_slab_init(void)
904 +/* The unstable tree heads */
905 +static struct rb_root root_unstable_tree = RB_ROOT;
908 + * All tree_nodes are in a list to be freed at once when unstable tree is
909 + * freed after each scan round.
911 +static struct list_head unstable_tree_node_list =
912 + LIST_HEAD_INIT(unstable_tree_node_list);
914 +/* List contains all stable nodes */
915 +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
918 + * When the hash strength is changed, the stable tree must be delta_hashed and
919 + * re-structured. We use two sets of the structs below to speed up the
920 + * re-structuring of the stable tree.
922 +static struct list_head
923 +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
924 + LIST_HEAD_INIT(stable_tree_node_list[1])};
926 +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
927 +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
928 +static struct rb_root *root_stable_treep = &root_stable_tree[0];
929 +static unsigned long stable_tree_index;
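
One plausible way to use the two roots/lists above (an assumption about the mechanism, not code from the patch): re-hash and re-insert stable nodes into the inactive set, then flip which set is active:

/* After every stable node has been delta-hashed and re-inserted into the
 * inactive root/list, make that set the active one. */
static void stable_tree_flip(void)
{
	stable_tree_index = !stable_tree_index;
	root_stable_treep = &root_stable_tree[stable_tree_index];
	stable_tree_node_listp = &stable_tree_node_list[stable_tree_index];
}
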
931 +/* The hash strength needed to hash a full page */
932 +#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
934 +/* The hash strength needed for loop-back hashing */
935 +#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
937 +/* The random offsets in a page */
938 +static u32 *random_nums;
940 +/* The hash strength */
941 +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
943 +/* The delta value each time the hash strength increases or decreases */
944 +static unsigned long hash_strength_delta;
945 +#define HASH_STRENGTH_DELTA_MAX 5
947 +/* The time we have saved due to random_sample_hash */
948 +static u64 rshash_pos;
950 +/* The time we have wasted due to hash collision */
951 +static u64 rshash_neg;
953 +struct ksm_benefit {
957 + unsigned long base;
961 + * The relative cost of memcmp, compared to 1 time unit of random sample
962 + * hash; this value is measured when the ksm module is initialized
964 +static unsigned long memcmp_cost;
966 +static unsigned long rshash_neg_cont_zero;
967 +static unsigned long rshash_cont_obscure;
969 +/* The possible states of hash strength adjustment heuristic */
970 +enum rshash_states {
978 +/* The possible direction we are about to adjust hash strength */
979 +enum rshash_direct {
986 +/* random sampling hash state machine */
988 + enum rshash_states state;
989 + enum rshash_direct pre_direct;
991 + /* Keep a lookup window of size 5; if above_count/below_count > 3
992 + * in this window, we stop trying.
994 + u8 lookup_window_index;
995 + u64 stable_benefit;
996 + unsigned long turn_point_down;
997 + unsigned long turn_benefit_down;
998 + unsigned long turn_point_up;
999 + unsigned long turn_benefit_up;
1000 + unsigned long stable_point;
1003 +/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/
1004 +static u32 *zero_hash_table;
1006 +static inline struct node_vma *alloc_node_vma(void)
1008 - rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
1009 - if (!rmap_item_cache)
1011 + struct node_vma *node_vma;
1012 + node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
1014 + INIT_HLIST_HEAD(&node_vma->rmap_hlist);
1015 + INIT_HLIST_NODE(&node_vma->hlist);
1016 + node_vma->last_update = 0;
1021 - stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
1022 - if (!stable_node_cache)
1024 +static inline void free_node_vma(struct node_vma *node_vma)
1026 + kmem_cache_free(node_vma_cache, node_vma);
1029 - mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
1030 - if (!mm_slot_cache)
1034 +static inline struct vma_slot *alloc_vma_slot(void)
1036 + struct vma_slot *slot;
1039 - kmem_cache_destroy(stable_node_cache);
1041 - kmem_cache_destroy(rmap_item_cache);
1045 + * In case ksm has not been initialized yet.
1046 + * Oops, we need to consider the call site of ksm_init() in the future.
1048 + if (!vma_slot_cache)
1051 + slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
1053 + INIT_LIST_HEAD(&slot->ksm_list);
1054 + INIT_LIST_HEAD(&slot->slot_list);
1055 + INIT_RADIX_TREE(&slot->dup_tree, GFP_KERNEL);
1056 + slot->ksm_index = -1;
1057 + slot->need_rerand = 1;
1062 -static void __init ksm_slab_free(void)
1063 +static inline void free_vma_slot(struct vma_slot *vma_slot)
1065 - kmem_cache_destroy(mm_slot_cache);
1066 - kmem_cache_destroy(stable_node_cache);
1067 - kmem_cache_destroy(rmap_item_cache);
1068 - mm_slot_cache = NULL;
1069 + kmem_cache_free(vma_slot_cache, vma_slot);
1074 static inline struct rmap_item *alloc_rmap_item(void)
1076 struct rmap_item *rmap_item;
1078 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
1082 + /* BUG if the lowest bit is not clear; it is reserved for flag use */
1083 + BUG_ON(is_addr(rmap_item));
1088 static inline void free_rmap_item(struct rmap_item *rmap_item)
1091 - rmap_item->mm = NULL; /* debug safety */
1092 + rmap_item->slot = NULL; /* debug safety */
1093 kmem_cache_free(rmap_item_cache, rmap_item);
1096 static inline struct stable_node *alloc_stable_node(void)
1098 - return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
1099 + struct stable_node *node;
1100 + node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
1104 + INIT_HLIST_HEAD(&node->hlist);
1105 + list_add(&node->all_list, &stable_node_list);
1109 static inline void free_stable_node(struct stable_node *stable_node)
1111 + list_del(&stable_node->all_list);
1112 kmem_cache_free(stable_node_cache, stable_node);
1115 -static inline struct mm_slot *alloc_mm_slot(void)
1116 +static inline struct tree_node *alloc_tree_node(struct list_head *list)
1118 - if (!mm_slot_cache) /* initialization failed */
1119 + struct tree_node *node;
1120 + node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
1123 - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1125 + list_add(&node->all_list, list);
1129 -static inline void free_mm_slot(struct mm_slot *mm_slot)
1130 +static inline void free_tree_node(struct tree_node *node)
1132 - kmem_cache_free(mm_slot_cache, mm_slot);
1133 + list_del(&node->all_list);
1134 + kmem_cache_free(tree_node_cache, node);
1137 -static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1138 +static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
1140 - struct mm_slot *mm_slot;
1141 - struct hlist_head *bucket;
1142 - struct hlist_node *node;
1143 + struct anon_vma *anon_vma = rmap_item->anon_vma;
1145 - bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
1146 - hlist_for_each_entry(mm_slot, node, bucket, link) {
1147 - if (mm == mm_slot->mm)
1151 + drop_anon_vma(anon_vma);
1154 -static void insert_to_mm_slots_hash(struct mm_struct *mm,
1155 - struct mm_slot *mm_slot)
1158 + * Remove a stable node from the stable tree; this may unlink it from its
1159 + * tree_node, and may remove that parent tree_node if no other stable node is pending.
1161 + * @stable_node The node to be removed
1162 + * @unlink_rb Will this node be unlinked from the rbtree?
1163 + * @remove_tree_node Will its tree_node be removed if empty?
1165 +static void remove_node_from_stable_tree(struct stable_node *stable_node,
1166 + int unlink_rb, int remove_tree_node)
1168 - struct hlist_head *bucket;
1169 + struct node_vma *node_vma;
1170 + struct rmap_item *rmap_item;
1171 + struct hlist_node *hlist, *rmap_hlist, *n;
1173 - bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
1175 - hlist_add_head(&mm_slot->link, bucket);
1177 + if (!hlist_empty(&stable_node->hlist)) {
1178 + hlist_for_each_entry_safe(node_vma, hlist, n,
1179 + &stable_node->hlist, hlist) {
1180 + hlist_for_each_entry(rmap_item, rmap_hlist,
1181 + &node_vma->rmap_hlist, hlist) {
1182 + ksm_pages_sharing--;
1184 -static inline int in_stable_tree(struct rmap_item *rmap_item)
1186 - return rmap_item->address & STABLE_FLAG;
1188 + ksm_drop_anon_vma(rmap_item);
1189 + rmap_item->address &= PAGE_MASK;
1191 + free_node_vma(node_vma);
1195 -static void hold_anon_vma(struct rmap_item *rmap_item,
1196 - struct anon_vma *anon_vma)
1198 - rmap_item->anon_vma = anon_vma;
1199 - get_anon_vma(anon_vma);
1201 + /* the last one is counted as shared */
1202 + ksm_pages_shared--;
1203 + ksm_pages_sharing++;
1206 -static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
1208 - struct anon_vma *anon_vma = rmap_item->anon_vma;
1209 + if (stable_node->tree_node && unlink_rb) {
1210 + rb_erase(&stable_node->node,
1211 + &stable_node->tree_node->sub_root);
1213 + if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
1214 + remove_tree_node) {
1215 + rb_erase(&stable_node->tree_node->node,
1216 + root_stable_treep);
1217 + free_tree_node(stable_node->tree_node);
1219 + stable_node->tree_node->count--;
1223 - drop_anon_vma(anon_vma);
1224 + free_stable_node(stable_node);
1228 - * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
1229 - * page tables after it has passed through ksm_exit() - which, if necessary,
1230 - * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
1231 - * a special flag: they can just back out as soon as mm_users goes to zero.
1232 - * ksm_test_exit() is used throughout to make this test for exit: in some
1233 - * places for correctness, in some places just to avoid unnecessary work.
1235 -static inline bool ksm_test_exit(struct mm_struct *mm)
1237 - return atomic_read(&mm->mm_users) == 0;
1241 - * We use break_ksm to break COW on a ksm page: it's a stripped down
1242 + * get_ksm_page: checks if the page indicated by the stable node
1243 + * is still its ksm page, despite having held no reference to it.
1244 + * In which case we can trust the content of the page, and it
1245 + * returns the gotten page; but if the page has now been zapped,
1246 + * remove the stale node from the stable tree and return NULL.
1248 - * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
1250 + * You would expect the stable_node to hold a reference to the ksm page.
1251 + * But if it increments the page's count, swapping out has to wait for
1252 + * ksmd to come around again before it can free the page, which may take
1253 + * seconds or even minutes: much too unresponsive. So instead we use a
1254 + * "keyhole reference": access to the ksm page from the stable node peeps
1255 + * out through its keyhole to see if that page still holds the right key,
1256 + * pointing back to this stable node. This relies on freeing a PageAnon
1257 + * page to reset its page->mapping to NULL, and relies on no other use of
1258 + * a page to put something that might look like our key in page->mapping.
1260 - * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
1261 - * in case the application has unmapped and remapped mm,addr meanwhile.
1262 - * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
1263 - * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
1264 + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
1265 + * but this is different - made simpler by ksm_thread_mutex being held, but
1266 + * interesting for assuming that no other use of the struct page could ever
1267 + * put our expected_mapping into page->mapping (or a field of the union which
1268 + * coincides with page->mapping). The RCU calls are not for KSM at all, but
1269 + * to keep the page_count protocol described with page_cache_get_speculative.
1271 + * Note: it is possible that get_ksm_page() will return NULL one moment,
1272 + * then page the next, if the page is in between page_freeze_refs() and
1273 + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
1274 + * is on its way to being freed; but it is an anomaly to bear in mind.
1276 + * @unlink_rb: whether the removal of this node should first unlink it from
1277 + * its rbtree. stable_node_reinsert prevents this when restructuring the
1278 + * node from its old tree.
1280 + * @remove_tree_node: if this is the last node of its tree_node, should the
1281 + * tree_node be freed? When we are inserting a stable node, this tree_node may
1282 + * be reused, so don't free it.
1284 -static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
1285 +static struct page *get_ksm_page(struct stable_node *stable_node,
1286 + int unlink_rb, int remove_tree_node)
1290 + void *expected_mapping;
1294 - page = follow_page(vma, addr, FOLL_GET);
1295 - if (IS_ERR_OR_NULL(page))
1297 - if (PageKsm(page))
1298 - ret = handle_mm_fault(vma->vm_mm, vma, addr,
1299 - FAULT_FLAG_WRITE);
1301 - ret = VM_FAULT_WRITE;
1303 - } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
1305 - * We must loop because handle_mm_fault() may back out if there's
1306 - * any difficulty e.g. if pte accessed bit gets updated concurrently.
1308 - * VM_FAULT_WRITE is what we have been hoping for: it indicates that
1309 - * COW has been broken, even if the vma does not permit VM_WRITE;
1310 - * but note that a concurrent fault might break PageKsm for us.
1312 - * VM_FAULT_SIGBUS could occur if we race with truncation of the
1313 - * backing file, which also invalidates anonymous pages: that's
1314 - * okay, that truncation will have unmapped the PageKsm for us.
1316 - * VM_FAULT_OOM: at the time of writing (late July 2009), setting
1317 - * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
1318 - * current task has TIF_MEMDIE set, and will be OOM killed on return
1319 - * to user; and ksmd, having no mm, would never be chosen for that.
1321 - * But if the mm is in a limited mem_cgroup, then the fault may fail
1322 - * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
1323 - * even ksmd can fail in this way - though it's usually breaking ksm
1324 - * just to undo a merge it made a moment before, so unlikely to oom.
1326 - * That's a pity: we might therefore have more kernel pages allocated
1327 - * than we're counting as nodes in the stable tree; but ksm_do_scan
1328 - * will retry to break_cow on each pass, so should recover the page
1329 - * in due course. The important thing is to not let VM_MERGEABLE
1330 - * be cleared while any such pages might remain in the area.
1332 - return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
1335 -static void break_cow(struct rmap_item *rmap_item)
1337 - struct mm_struct *mm = rmap_item->mm;
1338 - unsigned long addr = rmap_item->address;
1339 - struct vm_area_struct *vma;
1342 - * It is not an accident that whenever we want to break COW
1343 - * to undo, we also need to drop a reference to the anon_vma.
1345 - ksm_drop_anon_vma(rmap_item);
1347 - down_read(&mm->mmap_sem);
1348 - if (ksm_test_exit(mm))
1350 - vma = find_vma(mm, addr);
1351 - if (!vma || vma->vm_start > addr)
1353 - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1355 - break_ksm(vma, addr);
1357 - up_read(&mm->mmap_sem);
1360 -static struct page *page_trans_compound_anon(struct page *page)
1362 - if (PageTransCompound(page)) {
1363 - struct page *head = compound_trans_head(page);
1365 - * head may actually be splitted and freed from under
1366 - * us but it's ok here.
1368 - if (PageAnon(head))
1374 -static struct page *get_mergeable_page(struct rmap_item *rmap_item)
1376 - struct mm_struct *mm = rmap_item->mm;
1377 - unsigned long addr = rmap_item->address;
1378 - struct vm_area_struct *vma;
1379 - struct page *page;
1381 - down_read(&mm->mmap_sem);
1382 - if (ksm_test_exit(mm))
1384 - vma = find_vma(mm, addr);
1385 - if (!vma || vma->vm_start > addr)
1387 - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1390 - page = follow_page(vma, addr, FOLL_GET);
1391 - if (IS_ERR_OR_NULL(page))
1393 - if (PageAnon(page) || page_trans_compound_anon(page)) {
1394 - flush_anon_page(vma, page, addr);
1395 - flush_dcache_page(page);
1400 - up_read(&mm->mmap_sem);
1404 -static void remove_node_from_stable_tree(struct stable_node *stable_node)
1406 - struct rmap_item *rmap_item;
1407 - struct hlist_node *hlist;
1409 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1410 - if (rmap_item->hlist.next)
1411 - ksm_pages_sharing--;
1413 - ksm_pages_shared--;
1414 - ksm_drop_anon_vma(rmap_item);
1415 - rmap_item->address &= PAGE_MASK;
1419 - rb_erase(&stable_node->node, &root_stable_tree);
1420 - free_stable_node(stable_node);
1424 - * get_ksm_page: checks if the page indicated by the stable node
1425 - * is still its ksm page, despite having held no reference to it.
1426 - * In which case we can trust the content of the page, and it
1427 - * returns the gotten page; but if the page has now been zapped,
1428 - * remove the stale node from the stable tree and return NULL.
1430 - * You would expect the stable_node to hold a reference to the ksm page.
1431 - * But if it increments the page's count, swapping out has to wait for
1432 - * ksmd to come around again before it can free the page, which may take
1433 - * seconds or even minutes: much too unresponsive. So instead we use a
1434 - * "keyhole reference": access to the ksm page from the stable node peeps
1435 - * out through its keyhole to see if that page still holds the right key,
1436 - * pointing back to this stable node. This relies on freeing a PageAnon
1437 - * page to reset its page->mapping to NULL, and relies on no other use of
1438 - * a page to put something that might look like our key in page->mapping.
1440 - * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
1441 - * but this is different - made simpler by ksm_thread_mutex being held, but
1442 - * interesting for assuming that no other use of the struct page could ever
1443 - * put our expected_mapping into page->mapping (or a field of the union which
1444 - * coincides with page->mapping). The RCU calls are not for KSM at all, but
1445 - * to keep the page_count protocol described with page_cache_get_speculative.
1447 - * Note: it is possible that get_ksm_page() will return NULL one moment,
1448 - * then page the next, if the page is in between page_freeze_refs() and
1449 - * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
1450 - * is on its way to being freed; but it is an anomaly to bear in mind.
1452 -static struct page *get_ksm_page(struct stable_node *stable_node)
1454 - struct page *page;
1455 - void *expected_mapping;
1457 - page = pfn_to_page(stable_node->kpfn);
1458 - expected_mapping = (void *)stable_node +
1459 - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
1461 - if (page->mapping != expected_mapping)
1463 - if (!get_page_unless_zero(page))
1465 - if (page->mapping != expected_mapping) {
1466 + page = pfn_to_page(stable_node->kpfn);
1467 + expected_mapping = (void *)stable_node +
1468 + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
1470 + if (page->mapping != expected_mapping)
1472 + if (!get_page_unless_zero(page))
1474 + if (page->mapping != expected_mapping) {
1482 - remove_node_from_stable_tree(stable_node);
1483 + remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
1488 @@ -533,32 +659,46 @@
1489 * Removing rmap_item from stable or unstable tree.
1490 * This function will clean the information from the stable/unstable tree.
1492 -static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
1493 +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
1495 if (rmap_item->address & STABLE_FLAG) {
1496 struct stable_node *stable_node;
1497 + struct node_vma *node_vma;
1500 - stable_node = rmap_item->head;
1501 - page = get_ksm_page(stable_node);
1502 + node_vma = rmap_item->head;
1503 + stable_node = node_vma->head;
1504 + page = get_ksm_page(stable_node, 1, 1);
1509 + * page lock is needed because it's racing with
1510 + * try_to_unmap_ksm(), etc.
1513 hlist_del(&rmap_item->hlist);
1515 + if (hlist_empty(&node_vma->rmap_hlist)) {
1516 + hlist_del(&node_vma->hlist);
1517 + free_node_vma(node_vma);
1522 - if (stable_node->hlist.first)
1523 - ksm_pages_sharing--;
1526 + if (hlist_empty(&stable_node->hlist)) {
1527 + /* do NOT call remove_node_from_stable_tree() here,
1528 + * it's possible for a forked rmap_item not to be in the
1529 + * stable tree while the in-tree rmap_items were
1534 + ksm_pages_sharing--;
1536 - ksm_drop_anon_vma(rmap_item);
1537 - rmap_item->address &= PAGE_MASK;
1539 + ksm_drop_anon_vma(rmap_item);
1540 } else if (rmap_item->address & UNSTABLE_FLAG) {
1541 - unsigned char age;
1543 * Usually ksmd can and must skip the rb_erase, because
1544 * root_unstable_tree was already reset to RB_ROOT.
1545 @@ -566,169 +706,454 @@
1546 * if this rmap_item was inserted by this scan, rather
1547 * than left over from before.
1549 - age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
1552 - rb_erase(&rmap_item->node, &root_unstable_tree);
1554 + if (rmap_item->append_round == ksm_scan_round) {
1555 + rb_erase(&rmap_item->node,
1556 + &rmap_item->tree_node->sub_root);
1557 + if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
1558 + rb_erase(&rmap_item->tree_node->node,
1559 + &root_unstable_tree);
1561 + free_tree_node(rmap_item->tree_node);
1563 + rmap_item->tree_node->count--;
1565 ksm_pages_unshared--;
1566 - rmap_item->address &= PAGE_MASK;
1569 + rmap_item->address &= PAGE_MASK;
1570 + rmap_item->hash_max = 0;
1573 cond_resched(); /* we're called from many long loops */
1576 -static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
1577 - struct rmap_item **rmap_list)
1579 + * Need to do two things:
1580 + * 1. check if slot was moved to del list
1581 + * 2. make sure the mmap_sem is manipulated under valid vma.
1583 + * My concern here is that in some cases, this may cause
1584 + * vma_slot_list_lock() waiters to be serialized further by some
1585 + * sem->wait_lock; can this really be expensive?
1589 + * 0: if successfully locked mmap_sem
1590 + * -ENOENT: this slot was moved to del list
1591 + * -EBUSY: vma lock failed
1593 +static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
1595 - while (*rmap_list) {
1596 - struct rmap_item *rmap_item = *rmap_list;
1597 - *rmap_list = rmap_item->rmap_list;
1598 - remove_rmap_item_from_tree(rmap_item);
1599 - free_rmap_item(rmap_item);
1600 + struct vm_area_struct *vma;
1601 + struct mm_struct *mm;
1602 + struct rw_semaphore *sem;
1604 + spin_lock(&vma_slot_list_lock);
1606 + /* the slot_list was removed and re-initialized from the new list when it
1607 + * entered ksm_list. If it is not empty now, it must have been moved to the del list
1609 + if (!list_empty(&slot->slot_list)) {
1610 + spin_unlock(&vma_slot_list_lock);
1614 + BUG_ON(slot->pages != vma_pages(slot->vma));
1615 + /* Ok, vma still valid */
1618 + sem = &mm->mmap_sem;
1619 + if (down_read_trylock(sem)) {
1620 + spin_unlock(&vma_slot_list_lock);
1624 + spin_unlock(&vma_slot_list_lock);
1629 - * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
1630 - * than check every pte of a given vma, the locking doesn't quite work for
1631 - * that - an rmap_item is assigned to the stable tree after inserting ksm
1632 - * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
1633 - * rmap_items from parent to child at fork time (so as not to waste time
1634 - * if exit comes before the next scan reaches it).
1636 - * Similarly, although we'd like to remove rmap_items (so updating counts
1637 - * and freeing memory) when unmerging an area, it's easier to leave that
1638 - * to the next pass of ksmd - consider, for example, how ksmd might be
1639 - * in cmp_and_merge_page on one of the rmap_items we would be removing.
1641 -static int unmerge_ksm_pages(struct vm_area_struct *vma,
1642 - unsigned long start, unsigned long end)
1643 +static inline unsigned long
1644 +vma_page_address(struct page *page, struct vm_area_struct *vma)
1646 - unsigned long addr;
1648 + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1649 + unsigned long address;
1651 - for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
1652 - if (ksm_test_exit(vma->vm_mm))
1654 - if (signal_pending(current))
1655 - err = -ERESTARTSYS;
1657 - err = break_ksm(vma, addr);
1658 + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1659 + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
1660 + /* page should be within @vma mapping range */
1667 -#ifdef CONFIG_SYSFS
1669 - * Only called through the sysfs control interface:
1670 + * Test if the mm is exiting
1672 -static int unmerge_and_remove_all_rmap_items(void)
1673 +static inline bool ksm_test_exit(struct mm_struct *mm)
1675 + return atomic_read(&mm->mm_users) == 0;
1678 +/* return 0 on success with the item's mmap_sem locked */
1679 +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
1681 - struct mm_slot *mm_slot;
1682 struct mm_struct *mm;
1683 struct vm_area_struct *vma;
1685 + struct vma_slot *slot = item->slot;
1686 + int err = -EINVAL;
1688 - spin_lock(&ksm_mmlist_lock);
1689 - ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
1690 - struct mm_slot, mm_list);
1691 - spin_unlock(&ksm_mmlist_lock);
1693 - for (mm_slot = ksm_scan.mm_slot;
1694 - mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
1696 - down_read(&mm->mmap_sem);
1697 - for (vma = mm->mmap; vma; vma = vma->vm_next) {
1698 - if (ksm_test_exit(mm))
1700 - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
1702 - err = unmerge_ksm_pages(vma,
1703 - vma->vm_start, vma->vm_end);
1708 - remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
1710 - spin_lock(&ksm_mmlist_lock);
1711 - ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
1712 - struct mm_slot, mm_list);
1713 - if (ksm_test_exit(mm)) {
1714 - hlist_del(&mm_slot->link);
1715 - list_del(&mm_slot->mm_list);
1716 - spin_unlock(&ksm_mmlist_lock);
1718 - free_mm_slot(mm_slot);
1719 - clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1720 - up_read(&mm->mmap_sem);
1723 - spin_unlock(&ksm_mmlist_lock);
1724 - up_read(&mm->mmap_sem);
1726 + struct page *page;
1728 + BUG_ON(!item->slot);
1730 + * try_down_read_slot_mmap_sem() returns non-zero if the slot
1731 + * has been removed by ksm_remove_vma().
1733 + if (try_down_read_slot_mmap_sem(slot))
1736 + mm = slot->vma->vm_mm;
1739 + if (ksm_test_exit(mm))
1742 + page = item->page;
1744 + if (!get_page_unless_zero(page)) {
1745 + rcu_read_unlock();
1749 - ksm_scan.seqnr = 0;
1750 + /* No need to consider huge page here. */
1751 + if (item->slot->vma->anon_vma != page_anon_vma(page) ||
1752 + vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
1755 + * should we release this item because of its stale page
1759 + rcu_read_unlock();
1762 + rcu_read_unlock();
1767 up_read(&mm->mmap_sem);
1768 - spin_lock(&ksm_mmlist_lock);
1769 - ksm_scan.mm_slot = &ksm_mm_head;
1770 - spin_unlock(&ksm_mmlist_lock);
1773 -#endif /* CONFIG_SYSFS */
1775 -static u32 calc_checksum(struct page *page)
1777 + * What kinds of VMAs are considered?
1779 +static inline int vma_can_enter(struct vm_area_struct *vma)
1782 - void *addr = kmap_atomic(page, KM_USER0);
1783 - checksum = jhash2(addr, PAGE_SIZE / 4, 17);
1784 - kunmap_atomic(addr, KM_USER0);
1786 + return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1787 + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1788 + VM_NONLINEAR | VM_MIXEDMAP | VM_SAO |
1789 + VM_SHARED | VM_MAYSHARE | VM_GROWSUP
1793 -static int memcmp_pages(struct page *page1, struct page *page2)
1795 + * Called whenever a fresh new vma is created. A new vma_slot
1796 + * is created and inserted into a global list. Must be called
1797 + * after the vma is inserted into its mm.
1799 +inline void ksm_vma_add_new(struct vm_area_struct *vma)
1801 - char *addr1, *addr2;
1803 + struct vma_slot *slot;
1805 - addr1 = kmap_atomic(page1, KM_USER0);
1806 - addr2 = kmap_atomic(page2, KM_USER1);
1807 - ret = memcmp(addr1, addr2, PAGE_SIZE);
1808 - kunmap_atomic(addr2, KM_USER1);
1809 - kunmap_atomic(addr1, KM_USER0);
1812 + if (!vma_can_enter(vma)) {
1813 + vma->ksm_vma_slot = NULL;
1817 -static inline int pages_identical(struct page *page1, struct page *page2)
1819 - return !memcmp_pages(page1, page2);
1820 + slot = alloc_vma_slot();
1822 + vma->ksm_vma_slot = NULL;
1826 + vma->ksm_vma_slot = slot;
1828 + slot->mm = vma->vm_mm;
1829 + slot->ctime_j = jiffies;
1830 + slot->pages = vma_pages(vma);
1831 + spin_lock(&vma_slot_list_lock);
1832 + list_add_tail(&slot->slot_list, &vma_slot_new);
1833 + spin_unlock(&vma_slot_list_lock);
1836 -static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1839 + * Called after vma is unlinked from its mm
1841 +void ksm_remove_vma(struct vm_area_struct *vma)
1843 - struct mm_struct *mm = vma->vm_mm;
1844 - unsigned long addr;
1848 - int err = -EFAULT;
1849 + struct vma_slot *slot;
1851 - addr = page_address_in_vma(page, vma);
1852 - if (addr == -EFAULT)
1854 + if (!vma->ksm_vma_slot)
1857 - BUG_ON(PageTransCompound(page));
1858 - ptep = page_check_address(page, mm, addr, &ptl, 0);
1861 + slot = vma->ksm_vma_slot;
1862 + spin_lock(&vma_slot_list_lock);
1863 + if (list_empty(&slot->slot_list)) {
1865 + * This slot has been added by ksmd, so move it to the del list
1866 + * and wait for ksmd to free it.
1868 + list_add_tail(&slot->slot_list, &vma_slot_del);
1871 + * It's still on the new list. It's OK to free the slot directly.
1873 + list_del(&slot->slot_list);
1874 + free_vma_slot(slot);
1876 + spin_unlock(&vma_slot_list_lock);
1877 + vma->ksm_vma_slot = NULL;
1880 - if (pte_write(*ptep) || pte_dirty(*ptep)) {
1881 +/* shiftl and shiftr must satisfy: 32/3 < they < 32/2 */
1885 +#define HASH_FROM_TO(from, to) \
1886 +for (index = from; index < to; index++) { \
1887 + pos = random_nums[index]; \
1888 + hash += key[pos]; \
1889 + hash += (hash << shiftl); \
1890 + hash ^= (hash >> shiftr); \
1894 +#define HASH_FROM_DOWN_TO(from, to) \
1895 +for (index = from - 1; index >= to; index--) { \
1896 + hash ^= (hash >> shiftr); \
1897 + hash ^= (hash >> (shiftr*2)); \
1898 + hash -= (hash << shiftl); \
1899 + hash += (hash << (shiftl*2)); \
1900 + pos = random_nums[index]; \
1901 + hash -= key[pos]; \
1905 + * The main random sample hash function.
1907 +static u32 random_sample_hash(void *addr, u32 hash_strength)
1909 + u32 hash = 0xdeadbeef;
1910 + int index, pos, loop = hash_strength;
1911 + u32 *key = (u32 *)addr;
1913 + if (loop > HASH_STRENGTH_FULL)
1914 + loop = HASH_STRENGTH_FULL;
1916 + HASH_FROM_TO(0, loop);
1918 + if (hash_strength > HASH_STRENGTH_FULL) {
1919 + loop = hash_strength - HASH_STRENGTH_FULL;
1920 + HASH_FROM_TO(0, loop);
1928 + * It's used when hash strength is adjusted
1930 + * @addr The page's virtual address
1931 + * @from The original hash strength
1932 + * @to The hash strength changed to
1933 + * @hash The hash value generated with the "from" hash strength
1935 + * return the hash value calculated with the "to" hash strength
1937 +static u32 delta_hash(void *addr, int from, int to, u32 hash)
1939 + u32 *key = (u32 *)addr;
1940 + int index, pos; /* make sure they are int type */
1943 + if (from >= HASH_STRENGTH_FULL) {
1944 + from -= HASH_STRENGTH_FULL;
1945 + to -= HASH_STRENGTH_FULL;
1946 + HASH_FROM_TO(from, to);
1947 + } else if (to <= HASH_STRENGTH_FULL) {
1948 + HASH_FROM_TO(from, to);
1950 + HASH_FROM_TO(from, HASH_STRENGTH_FULL);
1951 + HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
1954 + if (from <= HASH_STRENGTH_FULL) {
1955 + HASH_FROM_DOWN_TO(from, to);
1956 + } else if (to >= HASH_STRENGTH_FULL) {
1957 + from -= HASH_STRENGTH_FULL;
1958 + to -= HASH_STRENGTH_FULL;
1959 + HASH_FROM_DOWN_TO(from, to);
1961 + HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
1962 + HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
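
The HASH_FROM_DOWN_TO steps above are an exact inverse of the HASH_FROM_TO
mixing, which is what lets delta_hash() move a hash value from one strength to
another without resampling every word: xor-with-shift and multiply-by-(1 + 2^l)
are both invertible in 32-bit arithmetic once the shift constant is at least 8.
A minimal standalone sketch of that round trip, with assumed shift values
(13 and 11) and a plain word array standing in for the page and random_nums[]:

#include <stdint.h>
#include <assert.h>

#define SHIFTL 13   /* assumed values; any shift >= 8 gives exact inversion */
#define SHIFTR 11

/* one forward mixing step, as in HASH_FROM_TO */
static uint32_t mix(uint32_t hash, uint32_t word)
{
    hash += word;
    hash += hash << SHIFTL;
    hash ^= hash >> SHIFTR;
    return hash;
}

/* one backward step, as in HASH_FROM_DOWN_TO */
static uint32_t unmix(uint32_t hash, uint32_t word)
{
    hash ^= hash >> SHIFTR;
    hash ^= hash >> (SHIFTR * 2);   /* undoes hash ^= hash >> SHIFTR */
    hash -= hash << SHIFTL;
    hash += hash << (SHIFTL * 2);   /* undoes hash += hash << SHIFTL */
    hash -= word;
    return hash;
}

int main(void)
{
    uint32_t words[4] = { 1, 2, 3, 4 };
    uint32_t h = 0xdeadbeef, saved = h;
    int i;

    for (i = 0; i < 4; i++)         /* hash up to "strength" 4 */
        h = mix(h, words[i]);
    for (i = 3; i >= 0; i--)        /* walk back down to strength 0 */
        h = unmix(h, words[i]);
    assert(h == saved);             /* exact round trip */
    return 0;
}
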
1972 +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
1976 + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
1980 +static inline void encode_benefit(void)
1982 + u64 scanned_delta, pos_delta, neg_delta;
1983 + unsigned long base = benefit.base;
1985 + scanned_delta = (ksm_pages_scanned - ksm_pages_scanned_last) >> base;
1986 + pos_delta = rshash_pos >> base;
1987 + neg_delta = rshash_neg >> base;
1989 + if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
1990 + CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
1991 + CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
1992 + benefit.scanned >>= 1;
1993 + benefit.neg >>= 1;
1994 + benefit.pos >>= 1;
1996 + scanned_delta >>= 1;
2001 + benefit.pos += pos_delta;
2002 + benefit.neg += neg_delta;
2003 + benefit.scanned += scanned_delta;
2005 + BUG_ON(!benefit.scanned);
2007 + rshash_pos = rshash_neg = 0;
2009 + /* -1 to make rshash_adjust() work */
2010 + ksm_pages_scanned_last = ksm_pages_scanned - 1;
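
encode_benefit() above folds each round's rshash_pos/rshash_neg/pages-scanned
deltas into long-running accumulators that share a single base exponent; when an
addition would overflow, everything is halved so the pos/scanned and neg/scanned
ratios stay meaningful. A simplified stand-in for that bookkeeping (it assumes,
as the elided lines suggest, that the remaining deltas are halved as well and
that benefit.base is bumped):

#include <stdint.h>

#define WOULD_OVERFLOW(x, d) (UINT64_MAX - (x) < (d))

/* all three counters share one "base" exponent, so only their ratios matter */
struct benefit_acc {
    uint64_t pos, neg, scanned;
    unsigned int base;
};

static void benefit_add(struct benefit_acc *b, uint64_t pos_delta,
                        uint64_t neg_delta, uint64_t scanned_delta)
{
    pos_delta >>= b->base;
    neg_delta >>= b->base;
    scanned_delta >>= b->base;

    while (WOULD_OVERFLOW(b->pos, pos_delta) ||
           WOULD_OVERFLOW(b->neg, neg_delta) ||
           WOULD_OVERFLOW(b->scanned, scanned_delta)) {
        /* halve everything; the ratios survive the rescaling */
        b->pos >>= 1;
        b->neg >>= 1;
        b->scanned >>= 1;
        pos_delta >>= 1;
        neg_delta >>= 1;
        scanned_delta >>= 1;
        b->base++;      /* future deltas will be pre-scaled by the new base */
    }

    b->pos += pos_delta;
    b->neg += neg_delta;
    b->scanned += scanned_delta;
}
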
2013 +static inline void reset_benefit(void)
2018 + benefit.scanned = 0;
2021 +static inline void inc_rshash_pos(unsigned long delta)
2023 + if (CAN_OVERFLOW_U64(rshash_pos, delta))
2026 + rshash_pos += delta;
2029 +static inline void inc_rshash_neg(unsigned long delta)
2031 + if (CAN_OVERFLOW_U64(rshash_neg, delta))
2034 + rshash_neg += delta;
2038 +static inline u32 page_hash(struct page *page, unsigned long hash_strength,
2039 + int cost_accounting)
2042 + unsigned long delta;
2044 + void *addr = kmap_atomic(page, KM_USER0);
2046 + val = random_sample_hash(addr, hash_strength);
2047 + kunmap_atomic(addr, KM_USER0);
2049 + if (cost_accounting) {
2050 + if (HASH_STRENGTH_FULL > hash_strength)
2051 + delta = HASH_STRENGTH_FULL - hash_strength;
2055 + inc_rshash_pos(delta);
2061 +static int memcmp_pages(struct page *page1, struct page *page2,
2062 + int cost_accounting)
2064 + char *addr1, *addr2;
2067 + addr1 = kmap_atomic(page1, KM_USER0);
2068 + addr2 = kmap_atomic(page2, KM_USER1);
2069 + ret = memcmp(addr1, addr2, PAGE_SIZE);
2070 + kunmap_atomic(addr2, KM_USER1);
2071 + kunmap_atomic(addr1, KM_USER0);
2073 + if (cost_accounting)
2074 + inc_rshash_neg(memcmp_cost);
2079 +static inline int pages_identical(struct page *page1, struct page *page2)
2081 + return !memcmp_pages(page1, page2, 0);
2084 +static inline int is_page_full_zero(struct page *page)
2089 + addr = kmap_atomic(page, KM_USER0);
2090 + ret = is_full_zero(addr, PAGE_SIZE);
2091 + kunmap_atomic(addr, KM_USER0);
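
is_full_zero() is provided elsewhere in this patch; a portable stand-in doing
the same word-by-word check would look like the following (hypothetical helper,
shown only to make the zero-page path easy to follow):

#include <stddef.h>

/* returns 1 if the buffer contains only zero bytes;
 * len is assumed to be a multiple of sizeof(unsigned long), as PAGE_SIZE is */
static int buf_is_full_zero(const void *buf, size_t len)
{
    const unsigned long *p = buf;
    size_t i;

    for (i = 0; i < len / sizeof(*p); i++)
        if (p[i])
            return 0;
    return 1;
}
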
2096 +static int write_protect_page(struct vm_area_struct *vma, struct page *page,
2097 + pte_t *orig_pte, pte_t *old_pte)
2099 + struct mm_struct *mm = vma->vm_mm;
2100 + unsigned long addr;
2104 + int err = -EFAULT;
2106 + addr = page_address_in_vma(page, vma);
2107 + if (addr == -EFAULT)
2110 + BUG_ON(PageTransCompound(page));
2111 + ptep = page_check_address(page, mm, addr, &ptl, 0);
2118 + if (pte_write(*ptep) || pte_dirty(*ptep)) {
2121 swapped = PageSwapCache(page);
2122 @@ -765,6 +1190,11 @@
2126 +#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */
2127 +#define MERGE_ERR_COLLI 2 /* there is a collision */
2128 +#define MERGE_ERR_CHANGED 3 /* the page has changed since last hash */
2132 * replace_page - replace page in vma by new ksm page
2133 * @vma: vma that holds the pte pointing to page
2134 @@ -772,7 +1202,7 @@
2135 * @kpage: the ksm page we replace page by
2136 * @orig_pte: the original value of the pte
2138 - * Returns 0 on success, -EFAULT on failure.
2139 + * Returns 0 on success, MERGE_ERR_PGERR on failure.
2141 static int replace_page(struct vm_area_struct *vma, struct page *page,
2142 struct page *kpage, pte_t orig_pte)
2143 @@ -783,8 +1213,10 @@
2150 - int err = -EFAULT;
2151 + int err = MERGE_ERR_PGERR;
2153 addr = page_address_in_vma(page, vma);
2154 if (addr == -EFAULT)
2155 @@ -809,12 +1241,20 @@
2160 - page_add_anon_rmap(kpage, vma, addr);
2162 flush_cache_page(vma, addr, pte_pfn(*ptep));
2163 ptep_clear_flush(vma, addr, ptep);
2164 - set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
2165 + entry = mk_pte(kpage, vma->vm_page_prot);
2167 + /* special treatment is needed for zero_page */
2168 + if ((page_to_pfn(kpage) == ksm_zero_pfn) ||
2169 + (page_to_pfn(kpage) == zero_pfn))
2170 + entry = pte_mkspecial(entry);
2173 + page_add_anon_rmap(kpage, vma, addr);
2176 + set_pte_at_notify(mm, addr, ptep, entry);
2178 page_remove_rmap(page);
2179 if (!page_mapped(page))
2180 @@ -827,6 +1267,85 @@
2186 + * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
2187 + * A zero hash value at HASH_STRENGTH_MAX is used to indicate that the
2188 + * hash_max member has not been calculated yet.
2190 + * @page The page needs to be hashed
2191 + * @hash_old The hash value calculated with current hash strength
2193 + * return the new hash value calculated at HASH_STRENGTH_MAX
2195 +static inline u32 page_hash_max(struct page *page, u32 hash_old)
2200 + addr = kmap_atomic(page, KM_USER0);
2201 + hash_max = delta_hash(addr, hash_strength,
2202 + HASH_STRENGTH_MAX, hash_old);
2204 + kunmap_atomic(addr, KM_USER0);
2209 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2214 + * We compare the hash again, to ensure that it is really a hash collision
2215 + * instead of being caused by a page write.
2217 +static inline int check_collision(struct rmap_item *rmap_item,
2221 + struct page *page = rmap_item->page;
2223 + /* if this rmap_item has already been hash_maxed, then the collision
2224 + * must have appeared in the second-level rbtree search. In this case we check
2225 + * if its hash_max value has been changed. Otherwise, the collision
2226 + * happens in the first-level rbtree search, so we check against its
2227 + * current hash value.
2229 + if (rmap_item->hash_max) {
2230 + inc_rshash_neg(memcmp_cost);
2231 + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2233 + if (rmap_item->hash_max == page_hash_max(page, hash))
2234 + err = MERGE_ERR_COLLI;
2236 + err = MERGE_ERR_CHANGED;
2238 + inc_rshash_neg(memcmp_cost + hash_strength);
2240 + if (page_hash(page, hash_strength, 0) == hash)
2241 + err = MERGE_ERR_COLLI;
2243 + err = MERGE_ERR_CHANGED;
2249 +static struct page *page_trans_compound_anon(struct page *page)
2251 + if (PageTransCompound(page)) {
2252 + struct page *head = compound_trans_head(page);
2254 + * head may actually be split and freed from under
2255 + * us but it's ok here.
2257 + if (PageAnon(head))
2263 static int page_trans_compound_anon_split(struct page *page)
2266 @@ -854,30 +1373,36 @@
2271 - * try_to_merge_one_page - take two pages and merge them into one
2272 - * @vma: the vma that holds the pte pointing to page
2273 - * @page: the PageAnon page that we want to replace with kpage
2274 - * @kpage: the PageKsm page that we want to map instead of page,
2275 - * or NULL the first time when we want to use page as kpage.
2277 + * Try to merge an rmap_item->page with a kpage in a stable node. kpage must
2278 + * already be a ksm page.
2280 - * This function returns 0 if the pages were merged, -EFAULT otherwise.
2281 + * @return 0 if the pages were merged, a MERGE_ERR_* code otherwise.
2283 -static int try_to_merge_one_page(struct vm_area_struct *vma,
2284 - struct page *page, struct page *kpage)
2285 +static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
2286 + struct page *kpage, u32 hash)
2288 + struct vm_area_struct *vma = rmap_item->slot->vma;
2289 + struct mm_struct *mm = vma->vm_mm;
2290 pte_t orig_pte = __pte(0);
2291 - int err = -EFAULT;
2292 + int err = MERGE_ERR_PGERR;
2293 + struct page *page;
2295 - if (page == kpage) /* ksm page forked */
2297 + if (ksm_test_exit(mm))
2300 + page = rmap_item->page;
2302 - if (!(vma->vm_flags & VM_MERGEABLE))
2303 + if (page == kpage) { /* ksm page forked */
2308 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
2310 BUG_ON(PageTransCompound(page));
2311 - if (!PageAnon(page))
2313 + if (!PageAnon(page) || !PageKsm(kpage))
2317 @@ -895,18 +1420,27 @@
2318 * ptes are necessarily already write-protected. But in either
2319 * case, we need to lock and check page_count is not raised.
2321 - if (write_protect_page(vma, page, &orig_pte) == 0) {
2322 + if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
2324 + long map_sharing = atomic_read(&page->_mapcount);
2326 * While we hold page lock, upgrade page from
2327 * PageAnon+anon_vma to PageKsm+NULL stable_node:
2328 * stable_tree_insert() will update stable_node.
2330 set_page_stable_node(page, NULL);
2332 + add_zone_page_state(page_zone(page),
2333 + NR_KSM_PAGES_SHARING,
2335 mark_page_accessed(page);
2337 - } else if (pages_identical(page, kpage))
2338 - err = replace_page(vma, page, kpage, orig_pte);
2340 + if (pages_identical(page, kpage))
2341 + err = replace_page(vma, page, kpage, orig_pte);
2343 + err = check_collision(rmap_item, hash);
2347 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
2348 @@ -924,378 +1458,2697 @@
2353 - * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
2354 - * but no new kernel page is allocated: kpage must already be a ksm page.
2358 + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
2359 + * to restore a page mapping that has been changed in try_to_merge_two_pages.
2361 - * This function returns 0 if the pages were merged, -EFAULT otherwise.
2362 + * @return 0 on success.
2364 -static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
2365 - struct page *page, struct page *kpage)
2366 +static int restore_ksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
2367 + pte_t orig_pte, pte_t wprt_pte)
2369 - struct mm_struct *mm = rmap_item->mm;
2370 - struct vm_area_struct *vma;
2371 + struct mm_struct *mm = vma->vm_mm;
2380 - down_read(&mm->mmap_sem);
2381 - if (ksm_test_exit(mm))
2383 - vma = find_vma(mm, rmap_item->address);
2384 - if (!vma || vma->vm_start > rmap_item->address)
2385 + pgd = pgd_offset(mm, addr);
2386 + if (!pgd_present(*pgd))
2389 + pud = pud_offset(pgd, addr);
2390 + if (!pud_present(*pud))
2393 + pmd = pmd_offset(pud, addr);
2394 + if (!pmd_present(*pmd))
2397 + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
2398 + if (!pte_same(*ptep, wprt_pte)) {
2399 + /* already copied, let it be */
2400 + pte_unmap_unlock(ptep, ptl);
2405 + * Good, still here. While we still hold the ksm page, it cannot
2406 + * return to the free page pool, so there is no way a pte was changed
2407 + * to another page and then back to this page. Also remember that ksm
2408 + * pages are not reused in do_wp_page(). So it's safe to restore the original
2411 + flush_cache_page(vma, addr, pte_pfn(*ptep));
2412 + ptep_clear_flush(vma, addr, ptep);
2413 + set_pte_at_notify(mm, addr, ptep, orig_pte);
2415 + pte_unmap_unlock(ptep, ptl);
2422 + * try_to_merge_two_pages() - take two identical pages and prepare
2423 + * them to be merged into one page (rmap_item->page)
2425 + * @return 0 if we successfully merged two identical pages into
2426 + * one ksm page. MERGE_ERR_COLLI if it is only a hash collision
2427 + * found by the rbtree search. MERGE_ERR_CHANGED if the rmap_item's page has
2428 + * changed since it was hashed. MERGE_ERR_PGERR otherwise.
2431 +static int try_to_merge_two_pages(struct rmap_item *rmap_item,
2432 + struct rmap_item *tree_rmap_item,
2435 + pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
2436 + pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
2437 + struct vm_area_struct *vma1 = rmap_item->slot->vma;
2438 + struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
2439 + struct page *page = rmap_item->page;
2440 + struct page *tree_page = tree_rmap_item->page;
2441 + int err = MERGE_ERR_PGERR;
2444 + struct address_space *saved_mapping;
2447 + if (rmap_item->page == tree_rmap_item->page)
2450 + if (PageTransCompound(page) && page_trans_compound_anon_split(page))
2452 + BUG_ON(PageTransCompound(page));
2454 + if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
2456 + BUG_ON(PageTransCompound(tree_page));
2458 + if (!PageAnon(page) || !PageAnon(tree_page))
2461 + if (!trylock_page(page))
2465 + if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
2466 + unlock_page(page);
2471 + * While we hold page lock, upgrade page from
2472 + * PageAnon+anon_vma to PageKsm+NULL stable_node:
2473 + * stable_tree_insert() will update stable_node.
2475 + saved_mapping = page->mapping;
2476 + map_sharing = atomic_read(&page->_mapcount);
2477 + set_page_stable_node(page, NULL);
2479 + add_zone_page_state(page_zone(page),
2480 + NR_KSM_PAGES_SHARING,
2482 + mark_page_accessed(page);
2483 + unlock_page(page);
2485 + if (!trylock_page(tree_page))
2488 + if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
2489 + unlock_page(tree_page);
2493 + if (pages_identical(page, tree_page)) {
2494 + err = replace_page(vma2, tree_page, page, wprt_pte2);
2498 + if ((vma2->vm_flags & VM_LOCKED)) {
2499 + munlock_vma_page(tree_page);
2500 + if (!PageMlocked(page)) {
2501 + unlock_page(tree_page);
2503 + mlock_vma_page(page);
2504 + tree_page = page; /* for final unlock */
2508 + unlock_page(tree_page);
2510 + goto out; /* success */
2513 + if (page_hash(page, hash_strength, 0) ==
2514 + page_hash(tree_page, hash_strength, 0)) {
2515 + inc_rshash_neg(memcmp_cost + hash_strength * 2);
2516 + err = MERGE_ERR_COLLI;
2518 + err = MERGE_ERR_CHANGED;
2520 + unlock_page(tree_page);
2525 + if (!restore_ksm_page_pte(vma1, get_rmap_addr(rmap_item),
2526 + orig_pte1, wprt_pte1))
2527 + page->mapping = saved_mapping;
2529 + unlock_page(page);
2534 +static inline int hash_cmp(u32 new_val, u32 node_val)
2536 + if (new_val > node_val)
2538 + else if (new_val < node_val)
2544 +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
2546 + u32 hash_max = item->hash_max;
2549 + hash_max = page_hash_max(item->page, hash);
2551 + item->hash_max = hash_max;
2560 + * stable_tree_search() - search the stable tree for a page
2562 + * @item: the rmap_item we are comparing with
2563 + * @hash: the hash value of this item->page already calculated
2565 + * @return the page we have found, NULL otherwise. The page returned has
2568 +static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
2570 + struct rb_node *node = root_stable_treep->rb_node;
2571 + struct tree_node *tree_node;
2572 + unsigned long hash_max;
2573 + struct page *page = item->page;
2574 + struct stable_node *stable_node;
2576 + stable_node = page_stable_node(page);
2577 + if (stable_node) {
2578 + /* ksm page forked, that is
2579 + * if (PageKsm(page) && !in_stable_tree(rmap_item))
2580 + * it's actually gotten once outside.
2589 + tree_node = rb_entry(node, struct tree_node, node);
2591 + cmp = hash_cmp(hash, tree_node->hash);
2594 + node = node->rb_left;
2596 + node = node->rb_right;
2604 + if (tree_node->count == 1) {
2605 + stable_node = rb_entry(tree_node->sub_root.rb_node,
2606 + struct stable_node, node);
2607 + BUG_ON(!stable_node);
2609 + goto get_page_out;
2613 + * ok, we have to search the second
2614 + * level subtree, hash the page to a
2617 + node = tree_node->sub_root.rb_node;
2619 + hash_max = rmap_item_hash_max(item, hash);
2624 + stable_node = rb_entry(node, struct stable_node, node);
2626 + cmp = hash_cmp(hash_max, stable_node->hash_max);
2629 + node = node->rb_left;
2631 + node = node->rb_right;
2633 + goto get_page_out;
2639 + page = get_ksm_page(stable_node, 1, 1);
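
stable_tree_search() above, like the unstable-tree search further down, is a
two-level lookup: the first level is keyed by the cheap sampled hash, and only
when a first-level node holds more than one entry is the expensive full-strength
hash_max computed and used to descend into the sub-tree. A compact illustration
of that lazy keying, using arrays and linear search in place of the rbtrees:

#include <stddef.h>
#include <stdint.h>

struct l2_entry {
    uint32_t hash_max;          /* second-level key (full-strength hash) */
    void *payload;
};

struct l1_bucket {
    uint32_t hash;              /* first-level key (sampled hash) */
    struct l2_entry *entries;
    size_t count;
};

static void *two_level_lookup(struct l1_bucket *buckets, size_t nbuckets,
                              uint32_t hash,
                              uint32_t (*compute_hash_max)(void *ctx), void *ctx)
{
    size_t i, j;
    uint32_t hash_max;

    for (i = 0; i < nbuckets; i++) {
        if (buckets[i].hash != hash)
            continue;
        if (buckets[i].count == 1)      /* no collision: cheap hash suffices */
            return buckets[i].entries[0].payload;

        /* collision at level one: pay for the full-strength hash only now */
        hash_max = compute_hash_max(ctx);
        for (j = 0; j < buckets[i].count; j++)
            if (buckets[i].entries[j].hash_max == hash_max)
                return buckets[i].entries[j].payload;
        return NULL;
    }
    return NULL;
}
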
2645 + * try_merge_with_stable() - when two rmap_items need to be inserted
2646 + * into the stable tree and their page is found to be identical to a stable
2647 + * ksm page, this is the last chance to merge them into one.
2649 + * @item1: the rmap_item holding the page which we wanted to insert
2650 + * into stable tree.
2651 + * @item2: the other rmap_item we found when unstable tree search
2652 + * @kpage: the page currently mapped by the two rmap_items
2653 + * @tree_page: the page we found identical in stable tree node
2654 + * @success1: reports whether item1 was successfully merged
2655 + * @success2: reports whether item2 was successfully merged
2657 +static void try_merge_with_stable(struct rmap_item *item1,
2658 + struct rmap_item *item2,
2659 + struct page **kpage,
2660 + struct page *tree_page,
2661 + int *success1, int *success2)
2663 + spinlock_t *ptl1, *ptl2;
2664 + pte_t *ptep1, *ptep2;
2665 + unsigned long addr1, addr2;
2666 + struct vm_area_struct *vma1 = item1->slot->vma;
2667 + struct vm_area_struct *vma2 = item2->slot->vma;
2672 + if (unlikely(*kpage == tree_page)) {
2673 + /* I don't think this can really happen */
2674 + goto success_both;
2677 + if (!PageAnon(*kpage) || !PageKsm(*kpage))
2680 + if (!trylock_page(tree_page))
2683 + /* If the oldpage is still ksm and still pointed
2684 + * to in the right place, and still write protected,
2685 + * we are confident it's not changed, no need to
2687 + * beware, we cannot take nested pte locks,
2690 + addr1 = get_rmap_addr(item1);
2692 + ptep1 = page_check_address(*kpage, vma1->vm_mm, addr1, &ptl1, 0);
2696 + if (pte_write(*ptep1)) {
2697 + /* has changed, abort! */
2698 + pte_unmap_unlock(ptep1, ptl1);
2702 + get_page(tree_page);
2703 + page_add_anon_rmap(tree_page, vma1, addr1);
2705 + flush_cache_page(vma1, addr1, pte_pfn(*ptep1));
2706 + ptep_clear_flush(vma1, addr1, ptep1);
2707 + set_pte_at_notify(vma1->vm_mm, addr1, ptep1,
2708 + mk_pte(tree_page, vma1->vm_page_prot));
2710 + page_remove_rmap(*kpage);
2713 + pte_unmap_unlock(ptep1, ptl1);
2716 + /* ok, then vma2; note that pte1 is already set */
2717 + addr2 = get_rmap_addr(item2);
2719 + ptep2 = page_check_address(*kpage, vma2->vm_mm, addr2, &ptl2, 0);
2723 + if (pte_write(*ptep2)) {
2724 + /* has changed, abort! */
2725 + pte_unmap_unlock(ptep2, ptl2);
2729 + get_page(tree_page);
2730 + page_add_anon_rmap(tree_page, vma2, addr2);
2732 + flush_cache_page(vma2, addr2, pte_pfn(*ptep2));
2733 + ptep_clear_flush(vma2, addr2, ptep2);
2734 + set_pte_at_notify(vma2->vm_mm, addr2, ptep2,
2735 + mk_pte(tree_page, vma2->vm_page_prot));
2737 + page_remove_rmap(*kpage);
2740 + pte_unmap_unlock(ptep2, ptl2);
2749 + if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
2750 + (*success2 && vma2->vm_flags & VM_LOCKED)) {
2751 + munlock_vma_page(*kpage);
2752 + if (!PageMlocked(tree_page))
2753 + mlock_vma_page(tree_page);
2757 + * We do not need oldpage any more in the caller, so we can drop its lock
2760 + unlock_page(*kpage);
2761 + *kpage = tree_page; /* Get unlocked outside. */
2766 +static inline void stable_node_hash_max(struct stable_node *node,
2767 + struct page *page, u32 hash)
2769 + u32 hash_max = node->hash_max;
2772 + hash_max = page_hash_max(page, hash);
2773 + node->hash_max = hash_max;
2778 +struct stable_node *new_stable_node(struct tree_node *tree_node,
2779 + struct page *kpage, u32 hash_max)
2781 + struct stable_node *new_stable_node;
2783 + new_stable_node = alloc_stable_node();
2784 + if (!new_stable_node)
2787 + new_stable_node->kpfn = page_to_pfn(kpage);
2788 + new_stable_node->hash_max = hash_max;
2789 + new_stable_node->tree_node = tree_node;
2790 + set_page_stable_node(kpage, new_stable_node);
2792 + return new_stable_node;
2796 +struct stable_node *first_level_insert(struct tree_node *tree_node,
2797 + struct rmap_item *rmap_item,
2798 + struct rmap_item *tree_rmap_item,
2799 + struct page **kpage, u32 hash,
2800 + int *success1, int *success2)
2803 + struct page *tree_page;
2805 + struct stable_node *stable_node, *new_snode;
2806 + struct rb_node *parent = NULL, **new;
2808 + /* this tree node contains no sub-tree yet */
2809 + stable_node = rb_entry(tree_node->sub_root.rb_node,
2810 + struct stable_node, node);
2812 + tree_page = get_ksm_page(stable_node, 1, 0);
2814 + cmp = memcmp_pages(*kpage, tree_page, 1);
2816 + try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
2817 + tree_page, success1, success2);
2818 + put_page(tree_page);
2819 + if (!*success1 && !*success2)
2822 + return stable_node;
2826 + * collision at the first level, try to create a subtree.
2827 + * A new node needs to be created.
2829 + put_page(tree_page);
2831 + stable_node_hash_max(stable_node, tree_page,
2833 + hash_max = rmap_item_hash_max(rmap_item, hash);
2834 + cmp = hash_cmp(hash_max, stable_node->hash_max);
2836 + parent = &stable_node->node;
2838 + new = &parent->rb_left;
2839 + } else if (cmp > 0) {
2840 + new = &parent->rb_right;
2847 + /* the only stable_node was deleted, we reuse its tree_node.
2850 + new = &tree_node->sub_root.rb_node;
2853 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
2857 + rb_link_node(&new_snode->node, parent, new);
2858 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
2859 + tree_node->count++;
2860 + *success1 = *success2 = 1;
2869 +struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
2870 + struct rmap_item *rmap_item,
2871 + struct rmap_item *tree_rmap_item,
2872 + struct page **kpage, u32 hash,
2873 + int *success1, int *success2)
2875 + struct page *tree_page;
2877 + struct stable_node *stable_node, *new_snode;
2878 + struct rb_node *parent, **new;
2882 + new = &tree_node->sub_root.rb_node;
2884 + hash_max = rmap_item_hash_max(rmap_item, hash);
2888 + stable_node = rb_entry(*new, struct stable_node, node);
2890 + cmp = hash_cmp(hash_max, stable_node->hash_max);
2894 + new = &parent->rb_left;
2895 + } else if (cmp > 0) {
2897 + new = &parent->rb_right;
2899 + tree_page = get_ksm_page(stable_node, 1, 0);
2901 + cmp = memcmp_pages(*kpage, tree_page, 1);
2903 + try_merge_with_stable(rmap_item,
2904 + tree_rmap_item, kpage,
2905 + tree_page, success1, success2);
2907 + put_page(tree_page);
2908 + if (!*success1 && !*success2)
2911 + * successfully merged with a stable
2914 + return stable_node;
2916 + put_page(tree_page);
2921 + * the stable node may be deleted,
2922 + * and the subtree may be
2923 + * restructured; cannot
2924 + * continue, re-search it.
2926 + if (tree_node->count) {
2929 + /* reuse the tree node */
2931 + new = &tree_node->sub_root.rb_node;
2937 + new_snode = new_stable_node(tree_node, *kpage, hash_max);
2941 + rb_link_node(&new_snode->node, parent, new);
2942 + rb_insert_color(&new_snode->node, &tree_node->sub_root);
2943 + tree_node->count++;
2944 + *success1 = *success2 = 1;
2954 + * stable_tree_insert() - try to insert a merged page in unstable tree to
2957 + * @kpage: the page that needs to be inserted
2958 + * @hash: the current hash of this page
2959 + * @rmap_item: the rmap_item being scanned
2960 + * @tree_rmap_item: the rmap_item found on unstable tree
2961 + * @success1: return if rmap_item is merged
2962 + * @success2: return if tree_rmap_item is merged
2964 + * @return the stable_node on stable tree if at least one
2965 + * rmap_item is inserted into stable tree, NULL
2968 +static struct stable_node *
2969 +stable_tree_insert(struct page **kpage, u32 hash,
2970 + struct rmap_item *rmap_item,
2971 + struct rmap_item *tree_rmap_item,
2972 + int *success1, int *success2)
2974 + struct rb_node **new = &root_stable_treep->rb_node;
2975 + struct rb_node *parent = NULL;
2976 + struct stable_node *stable_node;
2977 + struct tree_node *tree_node;
2980 + *success1 = *success2 = 0;
2985 + tree_node = rb_entry(*new, struct tree_node, node);
2987 + cmp = hash_cmp(hash, tree_node->hash);
2991 + new = &parent->rb_left;
2992 + } else if (cmp > 0) {
2994 + new = &parent->rb_right;
3000 + if (tree_node->count == 1) {
3001 + stable_node = first_level_insert(tree_node, rmap_item,
3002 + tree_rmap_item, kpage,
3003 + hash, success1, success2);
3005 + stable_node = stable_subtree_insert(tree_node,
3006 + rmap_item, tree_rmap_item, kpage,
3007 + hash, success1, success2);
3011 + /* no tree node found */
3012 + tree_node = alloc_tree_node(stable_tree_node_listp);
3014 + stable_node = NULL;
3018 + stable_node = new_stable_node(tree_node, *kpage, hash_max);
3019 + if (!stable_node) {
3020 + free_tree_node(tree_node);
3024 + tree_node->hash = hash;
3025 + rb_link_node(&tree_node->node, parent, new);
3026 + rb_insert_color(&tree_node->node, root_stable_treep);
3028 + new = &tree_node->sub_root.rb_node;
3030 + rb_link_node(&stable_node->node, parent, new);
3031 + rb_insert_color(&stable_node->node, &tree_node->sub_root);
3032 + tree_node->count++;
3033 + *success1 = *success2 = 1;
3037 + return stable_node;
3042 + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
3044 + * @return 0 on success, -EBUSY if unable to lock the mmap_sem,
3045 + * -EINVAL if the page mapping has been changed.
3047 +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
3051 + err = get_mergeable_page_lock_mmap(tree_rmap_item);
3053 + if (err == -EINVAL) {
3054 + /* its page mapping has been changed, remove it */
3055 + remove_rmap_item_from_tree(tree_rmap_item);
3058 + /* The page has been gotten and the mmap_sem is locked now. */
3064 + * unstable_tree_search_insert() - search the unstable tree for an rmap_item
3065 + * with the same hash value. Get its page and trylock the mmap_sem.
3068 +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
3072 + struct rb_node **new = &root_unstable_tree.rb_node;
3073 + struct rb_node *parent = NULL;
3074 + struct tree_node *tree_node;
3076 + struct rmap_item *tree_rmap_item;
3081 + tree_node = rb_entry(*new, struct tree_node, node);
3083 + cmp = hash_cmp(hash, tree_node->hash);
3087 + new = &parent->rb_left;
3088 + } else if (cmp > 0) {
3090 + new = &parent->rb_right;
3096 + /* got the tree_node */
3097 + if (tree_node->count == 1) {
3098 + tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
3099 + struct rmap_item, node);
3100 + BUG_ON(!tree_rmap_item);
3102 + goto get_page_out;
3105 + /* well, search the collision subtree */
3106 + new = &tree_node->sub_root.rb_node;
3108 + hash_max = rmap_item_hash_max(rmap_item, hash);
3113 + tree_rmap_item = rb_entry(*new, struct rmap_item,
3116 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3119 + new = &parent->rb_left;
3121 + new = &parent->rb_right;
3123 + goto get_page_out;
3126 + /* alloc a new tree_node */
3127 + tree_node = alloc_tree_node(&unstable_tree_node_list);
3131 + tree_node->hash = hash;
3132 + rb_link_node(&tree_node->node, parent, new);
3133 + rb_insert_color(&tree_node->node, &root_unstable_tree);
3135 + new = &tree_node->sub_root.rb_node;
3138 + /* not found even in the sub-tree */
3139 + rmap_item->tree_node = tree_node;
3140 + rmap_item->address |= UNSTABLE_FLAG;
3141 + rmap_item->append_round = ksm_scan_round;
3142 + rb_link_node(&rmap_item->node, parent, new);
3143 + rb_insert_color(&rmap_item->node, &tree_node->sub_root);
3145 + ksm_pages_unshared++;
3149 + if (tree_rmap_item->page == rmap_item->page)
3152 + if (get_tree_rmap_item_page(tree_rmap_item))
3155 + return tree_rmap_item;
3158 +static void enter_vma_tree(struct vma_slot *slot)
3163 + i = ksm_vma_tree_index_end;
3165 + ret = radix_tree_insert(&ksm_vma_tree, i, slot);
3168 + slot->ksm_index = i;
3169 + ksm_vma_tree_num++;
3170 + ksm_vma_tree_index_end++;
3173 +static inline void get_sub_dup_vma(struct vma_slot **slot,
3174 + struct vma_slot **sub_slot)
3176 + struct vma_slot *tmp;
3178 + if ((*slot)->ksm_index > (*sub_slot)->ksm_index) {
3180 + *slot = *sub_slot;
3186 + * Increment or decrement the dup page count stored in a slot; return the count after
3189 +static inline unsigned long dup_pages_mod(void **slot, int inc)
3191 + unsigned long item, ret;
3193 + item = (unsigned long)(*slot) >> INDIRECT_OFFSET;
3202 + item <<= INDIRECT_OFFSET;
3203 + *slot = (void *)item;
3208 +static void inc_dup_vma(struct vma_slot *slot, struct vma_slot *sub_slot)
3211 + unsigned long dup_pages;
3214 + if (slot->ksm_index == -1)
3215 + enter_vma_tree(slot);
3217 + if (sub_slot->ksm_index == -1)
3218 + enter_vma_tree(sub_slot);
3220 + get_sub_dup_vma(&slot, &sub_slot);
3222 + dup_slot = radix_tree_lookup_slot(&slot->dup_tree, sub_slot->ksm_index);
3227 + * In order to store dup_pages in radix tree, we must make
3228 + * radix_tree_is_indirect_ptr() happy.
3230 + dup_pages = 1 << INDIRECT_OFFSET;
3232 + /* no such entry yet, insert one */
3233 + ret = radix_tree_insert(&slot->dup_tree, sub_slot->ksm_index,
3234 + (void *)dup_pages);
3240 + dup_pages_mod(dup_slot, 1);
3243 +static void dec_dup_vma(struct vma_slot *slot, struct vma_slot *sub_slot)
3246 + unsigned long dup_pages;
3248 + BUG_ON(slot->ksm_index == -1 || sub_slot->ksm_index == -1);
3250 + get_sub_dup_vma(&slot, &sub_slot);
3252 + dup_slot = radix_tree_lookup_slot(&slot->dup_tree, sub_slot->ksm_index);
3253 + BUG_ON(!dup_slot);
3255 + dup_pages = dup_pages_mod(dup_slot, 0);
3257 + /* dup_pages == 0, we need to kick it out */
3259 + radix_tree_delete(&slot->dup_tree, sub_slot->ksm_index);
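
dup_pages_mod() above stores a small counter directly in a radix-tree slot: the
count is shifted left by INDIRECT_OFFSET so the stored value can never look like
an internal (indirect) radix-tree pointer. A sketch of just that encoding (the
offset value here is an assumption for illustration):

#define INDIRECT_OFFSET 1   /* assumed; must clear the radix tree's tag bit(s) */

static inline void *count_to_slot(unsigned long count)
{
    return (void *)(count << INDIRECT_OFFSET);
}

static inline unsigned long slot_to_count(void *slot)
{
    return (unsigned long)slot >> INDIRECT_OFFSET;
}

/* increment or decrement in place, mirroring dup_pages_mod() */
static inline unsigned long slot_count_mod(void **slot, int inc)
{
    unsigned long count = slot_to_count(*slot);

    if (inc)
        count++;
    else
        count--;
    *slot = count_to_slot(count);
    return count;
}
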
3262 +static void hold_anon_vma(struct rmap_item *rmap_item,
3263 + struct anon_vma *anon_vma)
3265 + rmap_item->anon_vma = anon_vma;
3266 + get_anon_vma(anon_vma);
3271 + * stable_tree_append() - append a rmap_item to a stable node. Deduplication
3272 + * ratio statistics are updated in this function.
3275 +static void stable_tree_append(struct rmap_item *rmap_item,
3276 + struct stable_node *stable_node)
3278 + struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_iter = NULL;
3279 + struct hlist_node *hlist, *cont_p = NULL;
3280 + unsigned long key = (unsigned long)rmap_item->slot;
3282 + BUG_ON(!stable_node);
3283 + rmap_item->address |= STABLE_FLAG;
3284 + rmap_item->append_round = ksm_scan_round;
3286 + if (hlist_empty(&stable_node->hlist)) {
3287 + ksm_pages_shared++;
3288 + goto node_vma_new;
3290 + ksm_pages_sharing++;
3293 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
3294 + if (node_vma->last_update == ksm_scan_round)
3295 + inc_dup_vma(rmap_item->slot, node_vma->slot);
3297 + if (node_vma->key >= key)
3303 + if (node_vma && node_vma->key == key) {
3304 + if (node_vma->last_update == ksm_scan_round) {
3306 + * we consider this page an inner duplicate, cancel
3309 + hlist_for_each_entry(node_vma_iter, hlist,
3310 + &stable_node->hlist, hlist) {
3311 + if (node_vma_iter->key == key)
3314 + /* only need to increase the same vma */
3315 + if (node_vma_iter->last_update ==
3317 + dec_dup_vma(rmap_item->slot,
3318 + node_vma_iter->slot);
3323 + * Although it's the same vma, it contains no duplicate for this
3324 + * round. Continue scanning the other vmas.
3326 + hlist_for_each_entry_continue(node_vma_iter,
3328 + if (node_vma_iter->last_update ==
3330 + inc_dup_vma(rmap_item->slot,
3331 + node_vma_iter->slot);
3341 + /* no same vma already in node, alloc a new node_vma */
3342 + new_node_vma = alloc_node_vma();
3343 + BUG_ON(!new_node_vma);
3344 + new_node_vma->head = stable_node;
3345 + new_node_vma->slot = rmap_item->slot;
3348 + hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
3349 + } else if (node_vma->key != key) {
3350 + if (node_vma->key < key)
3351 + hlist_add_after(&node_vma->hlist, &new_node_vma->hlist);
3353 + hlist_for_each_entry_continue(node_vma_iter, cont_p,
3355 + if (node_vma_iter->last_update ==
3357 + inc_dup_vma(rmap_item->slot,
3358 + node_vma_iter->slot);
3361 + hlist_add_before(&new_node_vma->hlist,
3362 + &node_vma->hlist);
3366 + node_vma = new_node_vma;
3368 +node_vma_ok: /* ok, ready to add to the list */
3369 + rmap_item->head = node_vma;
3370 + hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
3371 + node_vma->last_update = ksm_scan_round;
3372 + hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
3373 + rmap_item->slot->pages_merged++;
3377 + * We use break_ksm to break COW on a ksm page: it's a stripped down
3379 + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
3382 + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
3383 + * in case the application has unmapped and remapped mm,addr meanwhile.
3384 + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
3385 + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
3387 +static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
3389 + struct page *page;
3394 + page = follow_page(vma, addr, FOLL_GET);
3395 + if (IS_ERR_OR_NULL(page))
3397 + if (PageKsm(page)) {
3398 + ret = handle_mm_fault(vma->vm_mm, vma, addr,
3399 + FAULT_FLAG_WRITE);
3401 + ret = VM_FAULT_WRITE;
3403 + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
3405 + * We must loop because handle_mm_fault() may back out if there's
3406 + * any difficulty e.g. if pte accessed bit gets updated concurrently.
3408 + * VM_FAULT_WRITE is what we have been hoping for: it indicates that
3409 + * COW has been broken, even if the vma does not permit VM_WRITE;
3410 + * but note that a concurrent fault might break PageKsm for us.
3412 + * VM_FAULT_SIGBUS could occur if we race with truncation of the
3413 + * backing file, which also invalidates anonymous pages: that's
3414 + * okay, that truncation will have unmapped the PageKsm for us.
3416 + * VM_FAULT_OOM: at the time of writing (late July 2009), setting
3417 + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
3418 + * current task has TIF_MEMDIE set, and will be OOM killed on return
3419 + * to user; and ksmd, having no mm, would never be chosen for that.
3421 + * But if the mm is in a limited mem_cgroup, then the fault may fail
3422 + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
3423 + * even ksmd can fail in this way - though it's usually breaking ksm
3424 + * just to undo a merge it made a moment before, so unlikely to oom.
3426 + * That's a pity: we might therefore have more kernel pages allocated
3427 + * than we're counting as nodes in the stable tree; but ksm_do_scan
3428 + * will retry to break_cow on each pass, so should recover the page
3429 + * in due course. The important thing is to not let VM_MERGEABLE
3430 + * be cleared while any such pages might remain in the area.
3432 + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
3435 +static void break_cow(struct rmap_item *rmap_item)
3437 + struct vm_area_struct *vma = rmap_item->slot->vma;
3438 + struct mm_struct *mm = vma->vm_mm;
3439 + unsigned long addr = get_rmap_addr(rmap_item);
3441 + if (ksm_test_exit(mm))
3444 + break_ksm(vma, addr);
3450 + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
3451 + * than check every pte of a given vma, the locking doesn't quite work for
3452 + * that - an rmap_item is assigned to the stable tree after inserting ksm
3453 + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
3454 + * rmap_items from parent to child at fork time (so as not to waste time
3455 + * if exit comes before the next scan reaches it).
3457 + * Similarly, although we'd like to remove rmap_items (so updating counts
3458 + * and freeing memory) when unmerging an area, it's easier to leave that
3459 + * to the next pass of ksmd - consider, for example, how ksmd might be
3460 + * in cmp_and_merge_page on one of the rmap_items we would be removing.
3462 +inline int unmerge_ksm_pages(struct vm_area_struct *vma,
3463 + unsigned long start, unsigned long end)
3465 + unsigned long addr;
3468 + for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
3469 + if (ksm_test_exit(vma->vm_mm))
3471 + if (signal_pending(current))
3472 + err = -ERESTARTSYS;
3474 + err = break_ksm(vma, addr);
3479 +static inline void inc_ksm_pages_scanned(void)
3484 + if (ksm_pages_scanned == U64_MAX) {
3487 + delta = ksm_pages_scanned >> pages_scanned_base;
3489 + if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
3490 + pages_scanned_stored >>= 1;
3492 + pages_scanned_base++;
3495 + pages_scanned_stored += delta;
3497 + ksm_pages_scanned = ksm_pages_scanned_last = 0;
3500 + ksm_pages_scanned++;
3503 +static inline int find_zero_page_hash(int strength, u32 hash)
3505 + return (zero_hash_table[strength] == hash);
3509 +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
3511 + struct page *zero_page = empty_ksm_zero_page;
3512 + struct mm_struct *mm = vma->vm_mm;
3513 + pte_t orig_pte = __pte(0);
3514 + int err = -EFAULT;
3516 + if (ksm_test_exit(mm))
3519 + if (PageTransCompound(page) && page_trans_compound_anon_split(page))
3521 + BUG_ON(PageTransCompound(page));
3523 + if (!PageAnon(page))
3526 + if (!trylock_page(page))
3529 + if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
3530 + if (is_page_full_zero(page))
3531 + err = replace_page(vma, page, zero_page, orig_pte);
3534 + unlock_page(page);
3540 + * cmp_and_merge_page() - first see if page can be merged into the stable
3541 + * tree; if not, compare hash to previous and if it's the same, see if page
3542 + * can be inserted into the unstable tree, or merged with a page already there
3543 + * and both transferred to the stable tree.
3545 + * @page: the page that we are searching identical page to.
3546 + * @rmap_item: the reverse mapping into the virtual address of this page
3548 +static void cmp_and_merge_page(struct rmap_item *rmap_item)
3550 + struct rmap_item *tree_rmap_item;
3551 + struct page *page;
3552 + struct page *kpage = NULL;
3553 + u32 hash, hash_max;
3555 + unsigned int success1, success2;
3556 + struct stable_node *snode;
3558 + struct rb_node *parent = NULL, **new;
3560 + remove_rmap_item_from_tree(rmap_item);
3561 + inc_ksm_pages_scanned();
3563 + page = rmap_item->page;
3565 + hash = page_hash(page, hash_strength, 1);
3567 + /* if the page content is all zero, remap it to the zero page */
3568 + if (find_zero_page_hash(hash_strength, hash)) {
3569 + if (!cmp_and_merge_zero_page(rmap_item->slot->vma, page)) {
3570 + __inc_zone_page_state(page, NR_KSM_ZERO_PAGES);
3573 + inc_rshash_neg(memcmp_cost / 2);
3576 + //ksm_pages_scanned++;
3578 + /* We first start with searching the page inside the stable tree */
3579 + kpage = stable_tree_search(rmap_item, hash);
3581 + err = try_to_merge_with_ksm_page(rmap_item, kpage,
3585 + * The page was successfully merged, add
3586 + * its rmap_item to the stable tree.
3587 + * page lock is needed because it's
3588 + * racing with try_to_unmap_ksm(), etc.
3591 + stable_tree_append(rmap_item, page_stable_node(kpage));
3592 + unlock_page(kpage);
3594 + return; /* success */
3599 + * if it's a collision and it has been searched in the sub-rbtree
3600 + * (hash_max != 0), we want to abort, because if it is
3601 + * successfully merged in the unstable tree, the collision tends to
3604 + if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
3609 + unstable_tree_search_insert(rmap_item, hash);
3610 + if (tree_rmap_item) {
3611 + err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
3613 + * As soon as we merge this page, we want to remove the
3614 + * rmap_item of the page we have merged with from the unstable
3615 + * tree, and insert it instead as a new node in the stable tree.
3619 + remove_rmap_item_from_tree(tree_rmap_item);
3621 + snode = stable_tree_insert(&kpage, hash,
3622 + rmap_item, tree_rmap_item,
3623 + &success1, &success2);
3626 + stable_tree_append(rmap_item, snode);
3628 + break_cow(rmap_item);
3631 + stable_tree_append(tree_rmap_item, snode);
3633 + break_cow(tree_rmap_item);
3636 + * The original kpage may already have been unlocked
3637 + * inside stable_tree_insert().
3639 + unlock_page(kpage);
3641 + } else if (err == MERGE_ERR_COLLI) {
3642 + if (tree_rmap_item->tree_node->count == 1) {
3643 + rmap_item_hash_max(tree_rmap_item,
3644 + tree_rmap_item->tree_node->hash);
3646 + BUG_ON(!(tree_rmap_item->hash_max));
3648 + hash_max = rmap_item_hash_max(rmap_item, hash);
3649 + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3650 + parent = &tree_rmap_item->node;
3652 + new = &parent->rb_left;
3654 + new = &parent->rb_right;
3658 + rmap_item->tree_node = tree_rmap_item->tree_node;
3659 + rmap_item->address |= UNSTABLE_FLAG;
3660 + rmap_item->append_round = ksm_scan_round;
3661 + rb_link_node(&rmap_item->node, parent, new);
3662 + rb_insert_color(&rmap_item->node,
3663 + &tree_rmap_item->tree_node->sub_root);
3664 + rmap_item->tree_node->count++;
3667 + put_page(tree_rmap_item->page);
3668 + up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
3675 +static inline unsigned long get_pool_index(struct vma_slot *slot,
3676 + unsigned long index)
3678 + unsigned long pool_index;
3680 + pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
3681 + if (pool_index >= slot->pool_size)
3683 + return pool_index;
3686 +static inline unsigned long index_page_offset(unsigned long index)
3688 + return offset_in_page(sizeof(struct rmap_list_entry *) * index);
3692 +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
3693 + unsigned long index, int need_alloc)
3695 + unsigned long pool_index;
3699 + pool_index = get_pool_index(slot, index);
3700 + if (!slot->rmap_list_pool[pool_index]) {
3704 + slot->rmap_list_pool[pool_index] =
3705 + alloc_page(GFP_KERNEL | __GFP_ZERO);
3706 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3709 + addr = kmap(slot->rmap_list_pool[pool_index]);
3710 + addr += index_page_offset(index);
3715 +static inline void put_rmap_list_entry(struct vma_slot *slot,
3716 + unsigned long index)
3718 + unsigned long pool_index;
3720 + pool_index = get_pool_index(slot, index);
3721 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3722 + kunmap(slot->rmap_list_pool[pool_index]);
3725 +static inline int entry_is_new(struct rmap_list_entry *entry)
3727 + return !entry->item;
3730 +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
3731 + unsigned long index)
3733 + return slot->vma->vm_start + (index << PAGE_SHIFT);
3736 +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
3738 + unsigned long addr;
3740 + if (is_addr(entry->addr))
3741 + addr = get_clean_addr(entry->addr);
3742 + else if (entry->item)
3743 + addr = get_rmap_addr(entry->item);
3750 +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
3752 + if (is_addr(entry->addr))
3755 + return entry->item;
3758 +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
3759 + unsigned long index)
3761 + unsigned long pool_index;
3763 + pool_index = get_pool_index(slot, index);
3764 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3765 + slot->pool_counts[pool_index]++;
3768 +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
3769 + unsigned long index)
3771 + unsigned long pool_index;
3773 + pool_index = get_pool_index(slot, index);
3774 + BUG_ON(!slot->rmap_list_pool[pool_index]);
3775 + BUG_ON(!slot->pool_counts[pool_index]);
3776 + slot->pool_counts[pool_index]--;
3779 +static inline int entry_has_rmap(struct rmap_list_entry *entry)
3781 + return !is_addr(entry->addr) && entry->item;
3784 +static inline void swap_entries(struct rmap_list_entry *entry1,
3785 + unsigned long index1,
3786 + struct rmap_list_entry *entry2,
3787 + unsigned long index2)
3789 + struct rmap_list_entry tmp;
3791 + /* swapping two new entries is meaningless */
3792 + BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
3795 + *entry1 = *entry2;
3798 + if (entry_has_rmap(entry1))
3799 + entry1->item->entry_index = index1;
3801 + if (entry_has_rmap(entry2))
3802 + entry2->item->entry_index = index2;
3804 + if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
3805 + inc_rmap_list_pool_count(entry1->item->slot, index1);
3806 + dec_rmap_list_pool_count(entry1->item->slot, index2);
3807 + } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
3808 + inc_rmap_list_pool_count(entry2->item->slot, index2);
3809 + dec_rmap_list_pool_count(entry2->item->slot, index1);
3813 +static inline void free_entry_item(struct rmap_list_entry *entry)
3815 + unsigned long index;
3816 + struct rmap_item *item;
3818 + if (!is_addr(entry->addr)) {
3819 + BUG_ON(!entry->item);
3820 + item = entry->item;
3821 + entry->addr = get_rmap_addr(item);
3822 + set_is_addr(entry->addr);
3823 + index = item->entry_index;
3824 + remove_rmap_item_from_tree(item);
3825 + dec_rmap_list_pool_count(item->slot, index);
3826 + free_rmap_item(item);
3830 +static inline int pool_entry_boundary(unsigned long index)
3832 + unsigned long linear_addr;
3834 + linear_addr = sizeof(struct rmap_list_entry *) * index;
3835 + return index && !offset_in_page(linear_addr);
3838 +static inline void try_free_last_pool(struct vma_slot *slot,
3839 + unsigned long index)
3841 + unsigned long pool_index;
3843 + pool_index = get_pool_index(slot, index);
3844 + if (slot->rmap_list_pool[pool_index] &&
3845 + !slot->pool_counts[pool_index]) {
3846 + __free_page(slot->rmap_list_pool[pool_index]);
3847 + slot->rmap_list_pool[pool_index] = NULL;
3848 + slot->need_sort = 1;
3853 +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
3854 + struct rmap_item *item)
3856 + return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
3859 +static int within_same_pool(struct vma_slot *slot,
3860 + unsigned long i, unsigned long j)
3862 + unsigned long pool_i, pool_j;
3864 + pool_i = get_pool_index(slot, i);
3865 + pool_j = get_pool_index(slot, j);
3867 + return (pool_i == pool_j);
3870 +static void sort_rmap_entry_list(struct vma_slot *slot)
3872 + unsigned long i, j;
3873 + struct rmap_list_entry *entry, *swap_entry;
3875 + entry = get_rmap_list_entry(slot, 0, 0);
3876 + for (i = 0; i < slot->pages; ) {
3879 + goto skip_whole_pool;
3881 + if (entry_is_new(entry))
3884 + if (is_addr(entry->addr)) {
3889 + j = vma_item_index(slot->vma, entry->item);
3893 + if (within_same_pool(slot, i, j))
3894 + swap_entry = entry + j - i;
3896 + swap_entry = get_rmap_list_entry(slot, j, 1);
3898 + swap_entries(entry, i, swap_entry, j);
3899 + if (!within_same_pool(slot, i, j))
3900 + put_rmap_list_entry(slot, j);
3904 + i += PAGE_SIZE / sizeof(*entry);
3905 + if (i < slot->pages)
3906 + entry = get_rmap_list_entry(slot, i, 0);
3910 + if (i >= slot->pages - 1 ||
3911 + !within_same_pool(slot, i, i + 1)) {
3912 + put_rmap_list_entry(slot, i);
3913 + if (i + 1 < slot->pages)
3914 + entry = get_rmap_list_entry(slot, i + 1, 0);
3921 + /* free empty pool entries which contain no rmap_item */
3922 + /* Can be simplified to rely on pool_counts only, once verified bug-free. */
3923 + for (i = 0; i < slot->pool_size; i++) {
3924 + unsigned char has_rmap;
3927 + if (!slot->rmap_list_pool[i])
3931 + addr = kmap(slot->rmap_list_pool[i]);
3933 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
3934 + entry = (struct rmap_list_entry *)addr + j;
3935 + if (is_addr(entry->addr))
3941 + kunmap(slot->rmap_list_pool[i]);
3943 + BUG_ON(slot->pool_counts[i]);
3944 + __free_page(slot->rmap_list_pool[i]);
3945 + slot->rmap_list_pool[i] = NULL;
3949 + slot->need_sort = 0;
3953 + * vma_fully_scanned() - return true if all the pages in this slot have been scanned.
3955 +static inline int vma_fully_scanned(struct vma_slot *slot)
3957 + return slot->pages_scanned && !(slot->pages_scanned % slot->pages);
3961 + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
3962 + * its random permutation. This function also manages the random
3963 + * permutation index bookkeeping.
3965 +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot)
3967 + unsigned long rand_range, addr, swap_index, scan_index;
3968 + struct rmap_item *item = NULL;
3969 + struct rmap_list_entry *scan_entry, *swap_entry = NULL;
3970 + struct page *page;
3972 + scan_index = swap_index = slot->pages_scanned % slot->pages;
3974 + if (pool_entry_boundary(scan_index))
3975 + try_free_last_pool(slot, scan_index - 1);
3977 + if (vma_fully_scanned(slot)) {
3978 + slot->need_rerand = slot->need_sort;
3979 + if (slot->need_sort)
3980 + sort_rmap_entry_list(slot);
3983 + scan_entry = get_rmap_list_entry(slot, scan_index, 1);
3984 + if (entry_is_new(scan_entry)) {
3985 + scan_entry->addr = get_index_orig_addr(slot, scan_index);
3986 + set_is_addr(scan_entry->addr);
3989 + if (slot->need_rerand) {
3990 + rand_range = slot->pages - scan_index;
3991 + BUG_ON(!rand_range);
3992 + swap_index = scan_index + (random32() % rand_range);
3995 + if (swap_index != scan_index) {
3996 + swap_entry = get_rmap_list_entry(slot, swap_index, 1);
3997 + if (entry_is_new(swap_entry)) {
3998 + swap_entry->addr = get_index_orig_addr(slot,
4000 + set_is_addr(swap_entry->addr);
4002 + swap_entries(scan_entry, scan_index, swap_entry, swap_index);
4005 + addr = get_entry_address(scan_entry);
4006 + item = get_entry_item(scan_entry);
4007 + BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
4009 + page = follow_page(slot->vma, addr, FOLL_GET);
4010 + if (IS_ERR_OR_NULL(page))
4013 + if (!PageAnon(page) && !page_trans_compound_anon(page))
4016 + /* check whether this is the zero_page pfn or the ksm_zero_page pfn */
4017 + if ((page_to_pfn(page) == zero_pfn)
4018 + || (page_to_pfn(page) == ksm_zero_pfn))
4021 + flush_anon_page(slot->vma, page, addr);
4022 + flush_dcache_page(page);
4025 + item = alloc_rmap_item();
4027 + /* It has already been zeroed */
4028 + item->slot = slot;
4029 + item->address = addr;
4030 + item->entry_index = scan_index;
4031 + scan_entry->item = item;
4032 + inc_rmap_list_pool_count(slot, scan_index);
4037 + BUG_ON(item->slot != slot);
4038 + /* the page may have changed */
4039 + item->page = page;
4040 + put_rmap_list_entry(slot, scan_index);
4042 + put_rmap_list_entry(slot, swap_index);
4049 + /* no page, store addr back and free rmap_item if possible */
4050 + free_entry_item(scan_entry);
4051 + put_rmap_list_entry(slot, scan_index);
4053 + put_rmap_list_entry(slot, swap_index);
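
get_next_rmap_item() above visits a slot's pages in a fresh random order on each
full pass: right before scanning entry i it swaps that entry with a uniformly
chosen entry in [i, pages), i.e. an incremental Fisher-Yates shuffle spread
across the scan. The same ordering on a plain index array, with rand() standing
in for random32():

#include <stdlib.h>

static void scan_in_random_order(unsigned long *order, unsigned long n,
                                 void (*scan_one)(unsigned long page_index))
{
    unsigned long i, j, tmp;

    for (i = 0; i < n; i++) {
        j = i + (unsigned long)rand() % (n - i);   /* pick a not-yet-scanned entry */
        tmp = order[i];
        order[i] = order[j];
        order[j] = tmp;
        scan_one(order[i]);                        /* scan it, then move on */
    }
}
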
4057 +static inline int in_stable_tree(struct rmap_item *rmap_item)
4059 + return rmap_item->address & STABLE_FLAG;
4063 + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
4064 + * mmap_sem locked.
4066 +static void scan_vma_one_page(struct vma_slot *slot)
4068 + struct mm_struct *mm;
4069 + struct rmap_item *rmap_item = NULL;
4070 + struct vm_area_struct *vma = slot->vma;
4076 + rmap_item = get_next_rmap_item(slot);
4080 + if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
4083 + cmp_and_merge_page(rmap_item);
4085 + put_page(rmap_item->page);
4087 + slot->pages_scanned++;
4088 + slot->slot_scanned = 1;
4089 + if (vma_fully_scanned(slot)) {
4090 + slot->fully_scanned = 1;
4091 + slot->rung->fully_scanned_slots++;
4092 + BUG_ON(!slot->rung->fully_scanned_slots);
4096 +static unsigned long get_vma_random_scan_num(struct vma_slot *slot,
4097 + unsigned long scan_ratio)
4099 + return slot->pages * scan_ratio / KSM_SCAN_RATIO_MAX;
4102 +static inline void vma_rung_enter(struct vma_slot *slot,
4103 + struct scan_rung *rung)
4105 + unsigned long pages_to_scan;
4106 + struct scan_rung *old_rung = slot->rung;
4108 + /* leave the old rung it was in */
4109 + BUG_ON(list_empty(&slot->ksm_list));
4111 + if (old_rung->current_scan == &slot->ksm_list)
4112 + old_rung->current_scan = slot->ksm_list.next;
4113 + list_del_init(&slot->ksm_list);
4114 + old_rung->vma_num--;
4115 + if (slot->fully_scanned)
4116 + old_rung->fully_scanned_slots--;
4118 + if (old_rung->current_scan == &old_rung->vma_list) {
4119 + /* This rung finishes a round */
4120 + old_rung->round_finished = 1;
4121 + old_rung->current_scan = old_rung->vma_list.next;
4122 + BUG_ON(old_rung->current_scan == &old_rung->vma_list &&
4123 + !list_empty(&old_rung->vma_list));
4126 + /* enter the new rung */
4127 + while (!(pages_to_scan =
4128 + get_vma_random_scan_num(slot, rung->scan_ratio))) {
4130 + BUG_ON(rung > &ksm_scan_ladder[ksm_scan_ladder_size - 1]);
4132 + if (list_empty(&rung->vma_list))
4133 + rung->current_scan = &slot->ksm_list;
4134 + list_add(&slot->ksm_list, &rung->vma_list);
4135 + slot->rung = rung;
4136 + slot->pages_to_scan = pages_to_scan;
4137 + slot->rung->vma_num++;
4138 + if (slot->fully_scanned)
4139 + rung->fully_scanned_slots++;
4141 + BUG_ON(rung->current_scan == &rung->vma_list &&
4142 + !list_empty(&rung->vma_list));
4145 +static inline void vma_rung_up(struct vma_slot *slot)
4147 + if (slot->rung == &ksm_scan_ladder[ksm_scan_ladder_size-1])
4150 + vma_rung_enter(slot, slot->rung + 1);
4153 +static inline void vma_rung_down(struct vma_slot *slot)
4155 + if (slot->rung == &ksm_scan_ladder[0])
4158 + vma_rung_enter(slot, slot->rung - 1);
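
vma_rung_up()/vma_rung_down() above implement the scan ladder: each rung scans
its slots at a different ratio, and a slot is promoted or demoted between rungs
depending on how profitable its deduplication turns out to be. A toy sketch of
the ladder movement (the number of rungs and the ratio values are made up for
illustration):

struct rung_cfg {
    unsigned int scan_ratio;    /* higher rung => scanned more aggressively */
};

static const struct rung_cfg ladder[] = {
    { .scan_ratio = 1 }, { .scan_ratio = 4 },
    { .scan_ratio = 16 }, { .scan_ratio = 64 },
};
#define LADDER_SIZE (sizeof(ladder) / sizeof(ladder[0]))

/* clamp at the top and bottom rung, as vma_rung_up()/vma_rung_down() do */
static unsigned int rung_move(unsigned int cur, int up)
{
    if (up)
        return (cur + 1 < LADDER_SIZE) ? cur + 1 : cur;
    return cur ? cur - 1 : cur;
}
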
4162 + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
4164 +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
4166 + struct vma_slot *slot2;
4168 + unsigned long dup_pages;
4169 + unsigned long dedup_num, pages1, scanned1;
4170 + unsigned long ret;
4173 + if (!slot->pages_scanned)
4176 + pages1 = slot->pages;
4177 + scanned1 = slot->pages_scanned - slot->last_scanned;
4178 + BUG_ON(scanned1 > slot->pages_scanned);
4180 + for (i = slot->ksm_index; i < ksm_vma_tree_index_end; i++) {
4181 + unsigned long pages2, scanned2;
4183 + dup_slot = radix_tree_lookup_slot(&slot->dup_tree, i);
4187 + dup_pages = (unsigned long)(*dup_slot) >> INDIRECT_OFFSET;
4189 + slot2 = radix_tree_lookup(&ksm_vma_tree, i);
4190 + BUG_ON(!slot2 || !slot2->pages_scanned);
4192 + pages2 = slot2->pages;
4193 + scanned2 = slot2->pages_scanned - slot2->last_scanned;
4194 + BUG_ON(scanned2 > slot2->pages_scanned);
4196 + BUG_ON(!scanned1 || !scanned2);
4198 + dedup_num = dup_pages * pages1 / scanned1 * pages2 / scanned2;
4199 + slot->dedup_num += dedup_num;
4200 + slot2->dedup_num += dedup_num;
4203 + ret = (slot->dedup_num * KSM_DEDUP_RATIO_SCALE / pages1);
4205 + /* Thrashing area filtering */
4206 + if (ksm_thrash_threshold) {
4207 + if (slot->pages_cowed * 100 / slot->pages_merged
4208 + > ksm_thrash_threshold) {
4211 + ret = ret * (slot->pages_merged - slot->pages_cowed)
4212 + / slot->pages_merged;
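
cal_dedup_ratio() extrapolates the sampled dup counts to whole-VMA estimates
(dup_pages * pages1 / scanned1 * pages2 / scanned2, scaled by
KSM_DEDUP_RATIO_SCALE) and then applies the thrashing filter shown above. The
filter's arithmetic, factored into a standalone helper; the branch taken when
the COW percentage exceeds the threshold is elided in this hunk, so returning 0
there is an assumption:

static unsigned long apply_thrash_filter(unsigned long ratio,
                                         unsigned long pages_merged,
                                         unsigned long pages_cowed,
                                         unsigned long thrash_threshold)
{
    if (!thrash_threshold || !pages_merged)
        return ratio;

    if (pages_cowed * 100 / pages_merged > thrash_threshold)
        return 0;   /* assumed: a thrashing area earns no dedup credit */

    /* keep only the fraction of merged pages that did not COW-break */
    return ratio * (pages_merged - pages_cowed) / pages_merged;
}
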
4221 + * stable_node_reinsert() - When the hash_strength has been adjusted, the
4222 + * stable tree needs to be restructured; this is the function that re-inserts the
4225 +static inline void stable_node_reinsert(struct stable_node *new_node,
4226 + struct page *page,
4227 + struct rb_root *root_treep,
4228 + struct list_head *tree_node_listp,
4231 + struct rb_node **new = &root_treep->rb_node;
4232 + struct rb_node *parent = NULL;
4233 + struct stable_node *stable_node;
4234 + struct tree_node *tree_node;
4235 + struct page *tree_page;
4241 + tree_node = rb_entry(*new, struct tree_node, node);
4243 + cmp = hash_cmp(hash, tree_node->hash);
4247 + new = &parent->rb_left;
4248 + } else if (cmp > 0) {
4250 + new = &parent->rb_right;
4256 +		/* find a stable tree node with the same first-level hash value */
4257 + stable_node_hash_max(new_node, page, hash);
4258 + if (tree_node->count == 1) {
4259 + stable_node = rb_entry(tree_node->sub_root.rb_node,
4260 + struct stable_node, node);
4261 + tree_page = get_ksm_page(stable_node, 1, 0);
4263 + stable_node_hash_max(stable_node,
4265 + put_page(tree_page);
4267 + /* prepare for stable node insertion */
4269 + cmp = hash_cmp(new_node->hash_max,
4270 + stable_node->hash_max);
4271 + parent = &stable_node->node;
4273 + new = &parent->rb_left;
4275 + new = &parent->rb_right;
4281 +				/* the only stable_node was deleted; the tree node
4282 +				 * itself was not deleted.
4284 + goto tree_node_reuse;
4288 + /* well, search the collision subtree */
4289 + new = &tree_node->sub_root.rb_node;
4295 + stable_node = rb_entry(*new, struct stable_node, node);
4297 + cmp = hash_cmp(new_node->hash_max,
4298 + stable_node->hash_max);
4302 + new = &parent->rb_left;
4303 + } else if (cmp > 0) {
4305 + new = &parent->rb_right;
4307 + /* oh, no, still a collision */
4315 + /* no tree node found */
4316 + tree_node = alloc_tree_node(tree_node_listp);
4318 + printk(KERN_ERR "UKSM: memory allocation error!\n");
4321 + tree_node->hash = hash;
4322 + rb_link_node(&tree_node->node, parent, new);
4323 + rb_insert_color(&tree_node->node, root_treep);
4326 + /* prepare for stable node insertion */
4328 + new = &tree_node->sub_root.rb_node;
4332 + rb_link_node(&new_node->node, parent, new);
4333 + rb_insert_color(&new_node->node, &tree_node->sub_root);
4334 + new_node->tree_node = tree_node;
4335 + tree_node->count++;
4339 + /* This can only happen when two nodes have collided
4342 + new_node->tree_node = NULL;
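
The reinsert path above keeps the stable tree two-level: the outer rbtree of tree_nodes is
keyed by the sampled hash, and only when pages collide at that level is the stronger
hash_max computed and used to key the tree_node's sub_root. A simplified standalone sketch
of that lookup order; the types and the single strong-hash comparison below are stand-ins,
not the patch's:

#include <stddef.h>

/* Compiles on its own; the real code walks a second rbtree keyed by
 * hash_max instead of the single comparison used here. */
struct toy_node {
	unsigned int weak;		/* sampled (partial) hash */
	unsigned int strong;		/* full-page hash, computed lazily */
	struct toy_node *left, *right;
};

struct toy_node *toy_lookup(struct toy_node *root,
			    unsigned int weak, unsigned int strong)
{
	struct toy_node *n = root;

	while (n && n->weak != weak)		/* level 1: sampled hash */
		n = (weak < n->weak) ? n->left : n->right;
	if (!n)
		return NULL;
	return (n->strong == strong) ? n : NULL;	/* level 2: strong hash */
}
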
4346 +static inline void free_all_tree_nodes(struct list_head *list)
4348 + struct tree_node *node, *tmp;
4350 + list_for_each_entry_safe(node, tmp, list, all_list) {
4351 + free_tree_node(node);
4356 + * stable_tree_delta_hash() - Delta hash the stable tree from the previous hash
4357 + * strength to the current hash_strength. It re-structures the whole tree.
4359 +static inline void stable_tree_delta_hash(u32 prev_hash_strength)
4361 + struct stable_node *node, *tmp;
4362 + struct rb_root *root_new_treep;
4363 + struct list_head *new_tree_node_listp;
4365 + stable_tree_index = (stable_tree_index + 1) % 2;
4366 + root_new_treep = &root_stable_tree[stable_tree_index];
4367 + new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
4368 + *root_new_treep = RB_ROOT;
4369 + BUG_ON(!list_empty(new_tree_node_listp));
4372 +	 * we need to be safe; the node could be removed by get_ksm_page()
4374 + list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
4376 + struct page *node_page;
4380 + * We are completely re-structuring the stable nodes to a new
4381 +		 * stable tree. We don't bother unlinking from the old tree or
4382 +		 * touching its old tree_nodes; they will be freed all at once.
4384 + node_page = get_ksm_page(node, 0, 0);
4388 + if (node->tree_node) {
4389 + hash = node->tree_node->hash;
4391 + addr = kmap_atomic(node_page, KM_USER0);
4393 + hash = delta_hash(addr, prev_hash_strength,
4394 + hash_strength, hash);
4395 + kunmap_atomic(addr, KM_USER0);
4398 +			 * it was not inserted into the rbtree due to a collision in the last
4401 + hash = page_hash(node_page, hash_strength, 0);
4404 + stable_node_reinsert(node, node_page, root_new_treep,
4405 + new_tree_node_listp, hash);
4406 + put_page(node_page);
4409 + root_stable_treep = root_new_treep;
4410 + free_all_tree_nodes(stable_tree_node_listp);
4411 + BUG_ON(!list_empty(stable_tree_node_listp));
4412 + stable_tree_node_listp = new_tree_node_listp;
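
stable_tree_delta_hash() avoids re-hashing every stable page from scratch: since the
sampling order is a fixed permutation, a node that already carries a hash for the previous
strength only needs delta_hash() to fold the positions between the two strengths in (or
out), while nodes that never made it into the tree because of an earlier collision fall
back to a full page_hash(). A toy standalone illustration of the incremental idea, using a
plain additive hash; the patch's real hash and its sampling order are defined elsewhere and
are not this simple:

/* Toy "hash": sum of the first `strength` bytes in a fixed order.
 * Moving from strength s1 to s2 only touches bytes s1..s2-1. */
unsigned int toy_delta_hash(const unsigned char *page,
			    unsigned int s1, unsigned int s2,
			    unsigned int hash)
{
	unsigned int i;

	if (s2 > s1)
		for (i = s1; i < s2; i++)
			hash += page[i];
	else
		for (i = s2; i < s1; i++)
			hash -= page[i];
	return hash;
}
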
4415 +static inline void inc_hash_strength(unsigned long delta)
4417 + hash_strength += 1 << delta;
4418 + if (hash_strength > HASH_STRENGTH_MAX)
4419 + hash_strength = HASH_STRENGTH_MAX;
4422 +static inline void dec_hash_strength(unsigned long delta)
4424 + unsigned long change = 1 << delta;
4426 + if (hash_strength <= change + 1)
4427 + hash_strength = 1;
4429 + hash_strength -= change;
4432 +static inline void inc_hash_strength_delta(void)
4434 + hash_strength_delta++;
4435 + if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
4436 + hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
4440 +static inline unsigned long get_current_neg_ratio(void)
4442 + if (!rshash_pos || rshash_neg > rshash_pos)
4445 +	return div64_u64(100 * rshash_neg, rshash_pos);
4449 +static inline unsigned long get_current_neg_ratio(void)
4451 + u64 pos = benefit.pos;
4452 + u64 neg = benefit.neg;
4457 + if (!pos || neg > pos)
4460 + if (neg > div64_u64(U64_MAX, 100))
4461 + pos = div64_u64(pos, 100);
4465 + return div64_u64(neg, pos);
4468 +static inline unsigned long get_current_benefit(void)
4470 + u64 pos = benefit.pos;
4471 + u64 neg = benefit.neg;
4472 + u64 scanned = benefit.scanned;
4477 + return div64_u64((pos - neg), scanned);
4480 +static inline int judge_rshash_direction(void)
4482 + u64 current_neg_ratio, stable_benefit;
4483 + u64 current_benefit, delta = 0;
4486 +	/* In case the system stays still for a long time. */
4487 + if (ksm_scan_round % 1024 == 3) {
4492 + current_neg_ratio = get_current_neg_ratio();
4494 + if (current_neg_ratio == 0) {
4495 + rshash_neg_cont_zero++;
4496 + if (rshash_neg_cont_zero > 2)
4501 + rshash_neg_cont_zero = 0;
4503 + if (current_neg_ratio > 90) {
4508 - err = try_to_merge_one_page(vma, page, kpage);
4510 + current_benefit = get_current_benefit();
4511 + stable_benefit = rshash_state.stable_benefit;
4513 + if (!stable_benefit) {
4518 + if (current_benefit > stable_benefit)
4519 + delta = current_benefit - stable_benefit;
4520 + else if (current_benefit < stable_benefit)
4521 + delta = stable_benefit - current_benefit;
4523 +	delta = div64_u64(100 * delta, stable_benefit);
4526 + rshash_cont_obscure++;
4527 + if (rshash_cont_obscure > 2)
4533 - /* Must get reference to anon_vma while still holding mmap_sem */
4534 - hold_anon_vma(rmap_item, vma->anon_vma);
4536 - up_read(&mm->mmap_sem);
4538 + rshash_cont_obscure = 0;
4543 - * try_to_merge_two_pages - take two identical pages and prepare them
4544 - * to be merged into one page.
4546 - * This function returns the kpage if we successfully merged two identical
4547 - * pages into one ksm page, NULL otherwise.
4549 - * Note that this function upgrades page to ksm page: if one of the pages
4550 - * is already a ksm page, try_to_merge_with_ksm_page should be used.
4552 + * rshash_adjust() - The main function to control the random sampling state
4553 + * machine for hash strength adaptation.
4555 -static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
4556 - struct page *page,
4557 - struct rmap_item *tree_rmap_item,
4558 - struct page *tree_page)
4559 +static void rshash_adjust(void)
4562 + unsigned long prev_hash_strength = hash_strength;
4564 - err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
4566 - err = try_to_merge_with_ksm_page(tree_rmap_item,
4569 - * If that fails, we have a ksm page with only one pte
4570 - * pointing to it: so break it.
4573 - break_cow(rmap_item);
4575 - return err ? NULL : page;
4577 + if (ksm_pages_scanned == ksm_pages_scanned_last)
4581 - * stable_tree_search - search for page inside the stable tree
4583 - * This function checks if there is a page inside the stable tree
4584 - * with identical content to the page that we are scanning right now.
4586 - * This function returns the stable tree node of identical content if found,
4589 -static struct page *stable_tree_search(struct page *page)
4591 - struct rb_node *node = root_stable_tree.rb_node;
4592 - struct stable_node *stable_node;
4595 - stable_node = page_stable_node(page);
4596 - if (stable_node) { /* ksm page forked */
4599 + switch (rshash_state.state) {
4600 + case RSHASH_STILL:
4601 + switch (judge_rshash_direction()) {
4603 + if (rshash_state.pre_direct == GO_DOWN)
4604 + hash_strength_delta = 0;
4606 + inc_hash_strength(hash_strength_delta);
4607 + inc_hash_strength_delta();
4608 + rshash_state.stable_benefit = get_current_benefit();
4609 + rshash_state.pre_direct = GO_UP;
4613 + if (rshash_state.pre_direct == GO_UP)
4614 + hash_strength_delta = 0;
4616 + dec_hash_strength(hash_strength_delta);
4617 + inc_hash_strength_delta();
4618 + rshash_state.stable_benefit = get_current_benefit();
4619 + rshash_state.pre_direct = GO_DOWN;
4623 + rshash_state.stable_point = hash_strength;
4624 + rshash_state.turn_point_down = hash_strength;
4625 + rshash_state.turn_point_up = hash_strength;
4626 + rshash_state.turn_benefit_down = get_current_benefit();
4627 + rshash_state.turn_benefit_up = get_current_benefit();
4628 + rshash_state.lookup_window_index = 0;
4629 + rshash_state.state = RSHASH_TRYDOWN;
4630 + dec_hash_strength(hash_strength_delta);
4631 + inc_hash_strength_delta();
4641 + case RSHASH_TRYDOWN:
4642 + if (rshash_state.lookup_window_index++ % 5 == 0)
4643 + rshash_state.below_count = 0;
4645 + if (get_current_benefit() < rshash_state.stable_benefit)
4646 + rshash_state.below_count++;
4647 + else if (get_current_benefit() >
4648 + rshash_state.turn_benefit_down) {
4649 + rshash_state.turn_point_down = hash_strength;
4650 + rshash_state.turn_benefit_down = get_current_benefit();
4653 + if (rshash_state.below_count >= 3 ||
4654 + judge_rshash_direction() == GO_UP ||
4655 + hash_strength == 1) {
4656 + hash_strength = rshash_state.stable_point;
4657 + hash_strength_delta = 0;
4658 + inc_hash_strength(hash_strength_delta);
4659 + inc_hash_strength_delta();
4660 + rshash_state.lookup_window_index = 0;
4661 + rshash_state.state = RSHASH_TRYUP;
4662 + hash_strength_delta = 0;
4664 + dec_hash_strength(hash_strength_delta);
4665 + inc_hash_strength_delta();
4669 + case RSHASH_TRYUP:
4670 + if (rshash_state.lookup_window_index++ % 5 == 0)
4671 + rshash_state.below_count = 0;
4673 + if (get_current_benefit() < rshash_state.turn_benefit_down)
4674 + rshash_state.below_count++;
4675 + else if (get_current_benefit() > rshash_state.turn_benefit_up) {
4676 + rshash_state.turn_point_up = hash_strength;
4677 + rshash_state.turn_benefit_up = get_current_benefit();
4680 + if (rshash_state.below_count >= 3 ||
4681 + judge_rshash_direction() == GO_DOWN ||
4682 + hash_strength == HASH_STRENGTH_MAX) {
4683 + hash_strength = rshash_state.turn_benefit_up >
4684 + rshash_state.turn_benefit_down ?
4685 + rshash_state.turn_point_up :
4686 + rshash_state.turn_point_down;
4688 + rshash_state.state = RSHASH_PRE_STILL;
4690 + inc_hash_strength(hash_strength_delta);
4691 + inc_hash_strength_delta();
4697 + case RSHASH_PRE_STILL:
4698 + rshash_state.stable_benefit = get_current_benefit();
4699 + rshash_state.state = RSHASH_STILL;
4700 + hash_strength_delta = 0;
4707 - struct page *tree_page;
4709 + /* rshash_neg = rshash_pos = 0; */
4713 - stable_node = rb_entry(node, struct stable_node, node);
4714 - tree_page = get_ksm_page(stable_node);
4717 + if (prev_hash_strength != hash_strength)
4718 + stable_tree_delta_hash(prev_hash_strength);
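
Taken as a whole, rshash_adjust() is a small hill-climbing state machine: from STILL it
probes downward, then upward, keeps whichever turning point yielded the larger benefit, and
settles back to STILL through PRE_STILL. A compressed standalone restatement of just those
transitions; the state names mirror the patch, but the STILL case is simplified (in the
patch it can also keep nudging hash_strength in place while a clear up/down trend lasts):

enum toy_rshash_state { TOY_STILL, TOY_TRYDOWN, TOY_TRYUP, TOY_PRE_STILL };

enum toy_rshash_state toy_next_state(enum toy_rshash_state s,
				     int benefit_kept_dropping,
				     int hit_strength_limit)
{
	switch (s) {
	case TOY_STILL:		/* direction became obscure: start probing */
		return TOY_TRYDOWN;
	case TOY_TRYDOWN:	/* stop going down, try the other side */
		return (benefit_kept_dropping || hit_strength_limit) ?
			TOY_TRYUP : TOY_TRYDOWN;
	case TOY_TRYUP:		/* keep the better turning point, then settle */
		return (benefit_kept_dropping || hit_strength_limit) ?
			TOY_PRE_STILL : TOY_TRYUP;
	case TOY_PRE_STILL:	/* record the new stable benefit */
		return TOY_STILL;
	}
	return TOY_STILL;
}
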
4721 - ret = memcmp_pages(page, tree_page);
4722 +static void free_vma_dup_tree(struct vma_slot *slot)
4724 + struct vma_slot *tmp_slot;
4728 - put_page(tree_page);
4729 - node = node->rb_left;
4730 - } else if (ret > 0) {
4731 - put_page(tree_page);
4732 - node = node->rb_right;
4735 + /* step 1: free entries in smaller vmas' dup tree */
4736 + for (i = 0; i < slot->ksm_index; i++) {
4737 + tmp_slot = radix_tree_lookup(&ksm_vma_tree, i);
4739 + radix_tree_delete(&tmp_slot->dup_tree, slot->ksm_index);
4743 + /* step 2: free my own dup tree */
4744 + for (i = slot->ksm_index; i < ksm_vma_tree_index_end; i++)
4745 + radix_tree_delete(&slot->dup_tree, i);
4747 + BUG_ON(slot->dup_tree.rnode);
4751 - * stable_tree_insert - insert rmap_item pointing to new ksm page
4752 - * into the stable tree.
4754 - * This function returns the stable tree node just allocated on success,
4757 + * round_update_ladder() - The main function to update all the
4758 + * adjustments whenever a scan round is finished.
4760 -static struct stable_node *stable_tree_insert(struct page *kpage)
4761 +static void round_update_ladder(void)
4763 - struct rb_node **new = &root_stable_tree.rb_node;
4764 - struct rb_node *parent = NULL;
4765 - struct stable_node *stable_node;
4767 + struct vma_slot *slot, *tmp_slot;
4768 + unsigned long dedup_ratio_max = 0, dedup_ratio_mean = 0;
4769 + unsigned long threshold;
4771 + for (i = 0; i < ksm_vma_tree_index_end; i++) {
4772 + slot = radix_tree_lookup(&ksm_vma_tree, i);
4775 + slot->dedup_ratio = cal_dedup_ratio(slot);
4776 + if (dedup_ratio_max < slot->dedup_ratio)
4777 + dedup_ratio_max = slot->dedup_ratio;
4778 + dedup_ratio_mean += slot->dedup_ratio;
4783 - struct page *tree_page;
4785 + dedup_ratio_mean /= ksm_vma_slot_num;
4786 + threshold = dedup_ratio_mean;
4789 - stable_node = rb_entry(*new, struct stable_node, node);
4790 - tree_page = get_ksm_page(stable_node);
4793 + for (i = 0; i < ksm_vma_tree_index_end; i++) {
4794 + slot = radix_tree_lookup(&ksm_vma_tree, i);
4796 - ret = memcmp_pages(kpage, tree_page);
4797 - put_page(tree_page);
4799 + if (slot->dedup_ratio &&
4800 + slot->dedup_ratio >= threshold) {
4801 + vma_rung_up(slot);
4803 + vma_rung_down(slot);
4808 - new = &parent->rb_left;
4810 - new = &parent->rb_right;
4812 + free_vma_dup_tree(slot);
4813 + radix_tree_delete(&ksm_vma_tree, i);
4814 + ksm_vma_tree_num--;
4815 + slot->ksm_index = -1;
4816 + slot->slot_scanned = 0;
4817 + slot->dedup_ratio = 0;
4818 + slot->dedup_num = 0;
4822 + for (i = 0; i < ksm_scan_ladder_size; i++) {
4823 + list_for_each_entry_safe(slot, tmp_slot,
4824 + &ksm_scan_ladder[i].vma_list,
4827 - * It is not a bug that stable_tree_search() didn't
4828 - * find this node: because at that time our page was
4829 - * not yet write-protected, so may have changed since.
4830 +			 * These slots were scanned but are not in inter_tab, so their
4831 +			 * dedup ratio must be 0.
4834 + if (slot->slot_scanned) {
4835 + BUG_ON(slot->dedup_ratio != 0);
4836 + vma_rung_down(slot);
4839 + slot->dedup_ratio = 0;
4843 - stable_node = alloc_stable_node();
4846 + BUG_ON(ksm_vma_tree_num != 0);
4847 + ksm_vma_tree_index_end = 0;
4849 - rb_link_node(&stable_node->node, parent, new);
4850 - rb_insert_color(&stable_node->node, &root_stable_tree);
4851 + for (i = 0; i < ksm_scan_ladder_size; i++) {
4852 + ksm_scan_ladder[i].round_finished = 0;
4853 + ksm_scan_ladder[i].busy_searched = 0;
4855 + list_for_each_entry(slot, &ksm_scan_ladder[i].vma_list,
4857 + slot->last_scanned = slot->pages_scanned;
4858 + slot->slot_scanned = 0;
4859 + slot->pages_cowed = 0;
4860 + slot->pages_merged = 0;
4861 + if (slot->fully_scanned) {
4862 + slot->fully_scanned = 0;
4863 + ksm_scan_ladder[i].fully_scanned_slots--;
4865 + BUG_ON(slot->ksm_index != -1);
4868 - INIT_HLIST_HEAD(&stable_node->hlist);
4869 + BUG_ON(ksm_scan_ladder[i].fully_scanned_slots);
4872 - stable_node->kpfn = page_to_pfn(kpage);
4873 - set_page_stable_node(kpage, stable_node);
4876 - return stable_node;
4877 +	/* ksm_pages_scanned_last = ksm_pages_scanned; */
4881 - * unstable_tree_search_insert - search for identical page,
4882 - * else insert rmap_item into the unstable tree.
4884 - * This function searches for a page in the unstable tree identical to the
4885 - * page currently being scanned; and if no identical page is found in the
4886 - * tree, we insert rmap_item as a new object into the unstable tree.
4888 - * This function returns pointer to rmap_item found to be identical
4889 - * to the currently scanned page, NULL otherwise.
4891 - * This function does both searching and inserting, because they share
4892 - * the same walking algorithm in an rbtree.
4895 -struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
4896 - struct page *page,
4897 - struct page **tree_pagep)
4898 +static inline unsigned int ksm_pages_to_scan(unsigned int batch_pages)
4900 + return totalram_pages * batch_pages / 1000000;
4903 +static inline void cal_ladder_pages_to_scan(unsigned int num)
4905 - struct rb_node **new = &root_unstable_tree.rb_node;
4906 - struct rb_node *parent = NULL;
4910 - struct rmap_item *tree_rmap_item;
4911 - struct page *tree_page;
4913 + for (i = 0; i < ksm_scan_ladder_size; i++) {
4914 + ksm_scan_ladder[i].pages_to_scan = num
4915 + * ksm_scan_ladder[i].scan_ratio / KSM_SCAN_RATIO_MAX;
4917 + ksm_scan_ladder[0].pages_to_scan /= 16;
4918 + ksm_scan_ladder[1].pages_to_scan /= 4;
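
cal_ladder_pages_to_scan() splits the global batch in proportion to each rung's scan_ratio
and then throttles the two slowest rungs by an extra 16x and 4x. A standalone sketch of
that split; the ladder size, the ratios and the ratio maximum below are made-up inputs, not
the patch's defaults:

#include <stdio.h>

int main(void)
{
	unsigned long batch = 100000, ratio_max = 125;
	unsigned long ratio[4] = { 1, 5, 25, 125 };	/* a geometric ladder */
	unsigned long quota[4];
	int i;

	for (i = 0; i < 4; i++)
		quota[i] = batch * ratio[i] / ratio_max;
	quota[0] /= 16;		/* the slowest rungs get an extra throttle */
	quota[1] /= 4;
	for (i = 0; i < 4; i++)
		printf("rung %d: %lu pages per batch\n", i, quota[i]);
	return 0;
}
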
4922 - tree_rmap_item = rb_entry(*new, struct rmap_item, node);
4923 - tree_page = get_mergeable_page(tree_rmap_item);
4924 - if (IS_ERR_OR_NULL(tree_page))
4926 +static inline void ksm_del_vma_slot(struct vma_slot *slot)
4929 + struct rmap_list_entry *entry;
4930 + struct vma_slot *tmp;
4933 - * Don't substitute a ksm page for a forked page.
4935 - if (page == tree_page) {
4936 - put_page(tree_page);
4939 +	/* mutex lock contention may be intensive; any other idea? */
4940 + BUG_ON(list_empty(&slot->ksm_list) || !slot->rung);
4942 - ret = memcmp_pages(page, tree_page);
4943 + if (slot->rung->current_scan == &slot->ksm_list)
4944 + slot->rung->current_scan = slot->rung->current_scan->next;
4948 - put_page(tree_page);
4949 - new = &parent->rb_left;
4950 - } else if (ret > 0) {
4951 - put_page(tree_page);
4952 - new = &parent->rb_right;
4954 - *tree_pagep = tree_page;
4955 - return tree_rmap_item;
4957 + list_del_init(&slot->ksm_list);
4958 + slot->rung->vma_num--;
4959 + if (slot->fully_scanned)
4960 + slot->rung->fully_scanned_slots--;
4962 + if (slot->rung->current_scan == &slot->rung->vma_list) {
4963 + /* This rung finishes a round */
4964 + slot->rung->round_finished = 1;
4965 + slot->rung->current_scan = slot->rung->vma_list.next;
4966 + BUG_ON(slot->rung->current_scan == &slot->rung->vma_list
4967 + && !list_empty(&slot->rung->vma_list));
4970 - rmap_item->address |= UNSTABLE_FLAG;
4971 - rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
4972 - rb_link_node(&rmap_item->node, parent, new);
4973 - rb_insert_color(&rmap_item->node, &root_unstable_tree);
4974 + if (slot->ksm_index == -1)
4977 - ksm_pages_unshared++;
4979 + tmp = radix_tree_delete(&ksm_vma_tree, slot->ksm_index);
4980 + BUG_ON(!tmp || tmp != slot);
4981 + free_vma_dup_tree(slot);
4982 + ksm_vma_tree_num--;
4983 + if (slot->ksm_index == ksm_vma_tree_index_end - 1)
4984 + ksm_vma_tree_index_end--;
4987 + if (!slot->rmap_list_pool)
4990 + for (i = 0; i < slot->pool_size; i++) {
4993 + if (!slot->rmap_list_pool[i])
4996 + addr = kmap(slot->rmap_list_pool[i]);
4998 + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
4999 + entry = (struct rmap_list_entry *)addr + j;
5000 + if (is_addr(entry->addr))
5005 + remove_rmap_item_from_tree(entry->item);
5006 + free_rmap_item(entry->item);
5007 + slot->pool_counts[i]--;
5009 + BUG_ON(slot->pool_counts[i]);
5010 + kunmap(slot->rmap_list_pool[i]);
5011 + __free_page(slot->rmap_list_pool[i]);
5013 + kfree(slot->rmap_list_pool);
5014 + kfree(slot->pool_counts);
5017 + slot->rung = NULL;
5018 + free_vma_slot(slot);
5019 + BUG_ON(!ksm_vma_slot_num);
5020 + ksm_vma_slot_num--;
5024 - * stable_tree_append - add another rmap_item to the linked list of
5025 - * rmap_items hanging off a given node of the stable tree, all sharing
5026 - * the same ksm page.
5028 -static void stable_tree_append(struct rmap_item *rmap_item,
5029 - struct stable_node *stable_node)
5031 +static inline void cleanup_vma_slots(void)
5033 - rmap_item->head = stable_node;
5034 - rmap_item->address |= STABLE_FLAG;
5035 - hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
5036 + struct vma_slot *slot;
5038 - if (rmap_item->hlist.next)
5039 - ksm_pages_sharing++;
5041 - ksm_pages_shared++;
5042 + spin_lock(&vma_slot_list_lock);
5043 + while (!list_empty(&vma_slot_del)) {
5044 + slot = list_entry(vma_slot_del.next,
5045 + struct vma_slot, slot_list);
5046 + list_del(&slot->slot_list);
5047 + spin_unlock(&vma_slot_list_lock);
5048 + ksm_del_vma_slot(slot);
5049 + spin_lock(&vma_slot_list_lock);
5051 + spin_unlock(&vma_slot_list_lock);
5055 - * cmp_and_merge_page - first see if page can be merged into the stable tree;
5056 - * if not, compare checksum to previous and if it's the same, see if page can
5057 - * be inserted into the unstable tree, or merged with a page already there and
5058 - * both transferred to the stable tree.
5060 - * @page: the page that we are searching identical page to.
5061 - * @rmap_item: the reverse mapping into the virtual address of this page
5062 +static inline int rung_fully_scanned(struct scan_rung *rung)
5064 + return (rung->fully_scanned_slots == rung->vma_num &&
5065 + rung->fully_scanned_slots);
5069 + * ksm_do_scan() - the main worker function.
5071 -static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
5072 +static void ksm_do_scan(void)
5074 - struct rmap_item *tree_rmap_item;
5075 - struct page *tree_page = NULL;
5076 - struct stable_node *stable_node;
5077 - struct page *kpage;
5078 - unsigned int checksum;
5080 + struct vma_slot *slot, *iter;
5081 + struct list_head *next_scan, *iter_head;
5082 + struct mm_struct *busy_mm;
5083 +	unsigned char round_finished, all_rungs_empty;
5085 + unsigned long rest_pages;
5091 + for (i = ksm_scan_ladder_size - 1; i >= 0; i--) {
5092 + struct scan_rung *rung = &ksm_scan_ladder[i];
5094 - remove_rmap_item_from_tree(rmap_item);
5095 + if (!rung->pages_to_scan)
5098 - /* We first start with searching the page inside the stable tree */
5099 - kpage = stable_tree_search(page);
5101 - err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
5104 - * The page was successfully merged:
5105 - * add its rmap_item to the stable tree.
5108 - stable_tree_append(rmap_item, page_stable_node(kpage));
5109 - unlock_page(kpage);
5110 + if (list_empty(&rung->vma_list)) {
5111 + rung->pages_to_scan = 0;
5119 - * If the hash value of the page has changed from the last time
5120 - * we calculated it, this page is changing frequently: therefore we
5121 - * don't want to insert it in the unstable tree, and we don't want
5122 - * to waste our time searching for something identical to it there.
5124 - checksum = calc_checksum(page);
5125 - if (rmap_item->oldchecksum != checksum) {
5126 - rmap_item->oldchecksum = checksum;
5131 - unstable_tree_search_insert(rmap_item, page, &tree_page);
5132 - if (tree_rmap_item) {
5133 - kpage = try_to_merge_two_pages(rmap_item, page,
5134 - tree_rmap_item, tree_page);
5135 - put_page(tree_page);
5137 - * As soon as we merge this page, we want to remove the
5138 - * rmap_item of the page we have merged with from the unstable
5139 - * tree, and insert it instead as new node in the stable tree.
5140 +		 * If a higher rung is fully scanned, its leftover quota should be
5141 +		 * propagated to the lower rungs. This keeps a fully scanned
5142 +		 * rung from sitting on an unused pages_to_scan quota
5143 +		 * for a long time.
5147 - remove_rmap_item_from_tree(tree_rmap_item);
5148 + if (rung_fully_scanned(rung)) {
5149 + rest_pages += rung->pages_to_scan;
5150 + rung->pages_to_scan = 0;
5155 - stable_node = stable_tree_insert(kpage);
5156 - if (stable_node) {
5157 - stable_tree_append(tree_rmap_item, stable_node);
5158 - stable_tree_append(rmap_item, stable_node);
5159 + rung->pages_to_scan += rest_pages;
5161 + while (rung->pages_to_scan && likely(!freezing(current))) {
5163 + cleanup_vma_slots();
5165 + if (list_empty(&rung->vma_list))
5169 + BUG_ON(rung->current_scan == &rung->vma_list &&
5170 + !list_empty(&rung->vma_list));
5172 + slot = list_entry(rung->current_scan,
5173 + struct vma_slot, ksm_list);
5176 + if (slot->fully_scanned)
5179 + err = try_down_read_slot_mmap_sem(slot);
5180 + if (err == -ENOENT)
5183 + busy_mm = slot->mm;
5186 + if (err == -EBUSY) {
5187 + /* skip other vmas on the same mm */
5188 + rung->busy_searched = 1;
5190 + iter_head = slot->ksm_list.next;
5192 + while (iter_head != &rung->vma_list) {
5193 + iter = list_entry(iter_head,
5196 + if (iter->vma->vm_mm != busy_mm)
5198 + iter_head = iter_head->next;
5201 + if (iter->vma->vm_mm != busy_mm) {
5202 + rung->current_scan = &iter->ksm_list;
5205 + /* at the end, but still busy */
5206 + rung->current_scan = iter->ksm_list.next;
5211 - unlock_page(kpage);
5214 - * If we fail to insert the page into the stable tree,
5215 - * we will have 2 virtual addresses that are pointing
5216 - * to a ksm page left outside the stable tree,
5217 - * in which case we need to break_cow on both.
5219 - if (!stable_node) {
5220 - break_cow(tree_rmap_item);
5221 - break_cow(rmap_item);
5222 + BUG_ON(!vma_can_enter(slot->vma));
5223 + if (ksm_test_exit(slot->vma->vm_mm)) {
5224 + busy_mm = slot->vma->vm_mm;
5225 + up_read(&slot->vma->vm_mm->mmap_sem);
5230 + if (rung->busy_searched)
5231 + rung->busy_searched = 0;
5232 +			/* OK, we have taken the mmap_sem, ready to scan */
5233 + scan_vma_one_page(slot);
5234 + up_read(&slot->vma->vm_mm->mmap_sem);
5235 + rung->pages_to_scan--;
5237 + if ((slot->pages_scanned &&
5238 + slot->pages_scanned % slot->pages_to_scan == 0)
5239 + || slot->fully_scanned) {
5241 + next_scan = rung->current_scan->next;
5242 + if (next_scan == &rung->vma_list) {
5244 + * All the slots in this rung
5245 +					 * have been traversed in this
5248 + rung->round_finished = 1;
5249 + rung->current_scan =
5250 + rung->vma_list.next;
5251 + if (rung_fully_scanned(rung) ||
5252 + rung->busy_searched) {
5254 + * All the pages in all slots
5255 +						 * have been scanned, or we
5256 +						 * did not make any progress
5257 +						 * because of a busy mm.
5260 + rung->pages_to_scan;
5261 + rung->pages_to_scan = 0;
5265 + rung->current_scan = next_scan;
5272 + if (freezing(current))
5277 -static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
5278 - struct rmap_item **rmap_list,
5279 - unsigned long addr)
5281 - struct rmap_item *rmap_item;
5282 + if (freezing(current))
5285 - while (*rmap_list) {
5286 - rmap_item = *rmap_list;
5287 - if ((rmap_item->address & PAGE_MASK) == addr)
5289 - if (rmap_item->address > addr)
5290 + round_finished = 1;
5291 +	all_rungs_empty = 1;
5292 + for (i = 0; i < ksm_scan_ladder_size; i++) {
5293 + struct scan_rung *rung = &ksm_scan_ladder[i];
5295 + if (!list_empty(&rung->vma_list)) {
5296 +			all_rungs_empty = 0;
5297 + if (!rung->round_finished)
5298 + round_finished = 0;
5300 - *rmap_list = rmap_item->rmap_list;
5301 - remove_rmap_item_from_tree(rmap_item);
5302 - free_rmap_item(rmap_item);
5306 - rmap_item = alloc_rmap_item();
5308 - /* It has already been zeroed */
5309 - rmap_item->mm = mm_slot->mm;
5310 - rmap_item->address = addr;
5311 - rmap_item->rmap_list = *rmap_list;
5312 - *rmap_list = rmap_item;
5316 +	if (all_rungs_empty)
5317 + round_finished = 0;
5319 -static struct rmap_item *scan_get_next_rmap_item(struct page **page)
5321 - struct mm_struct *mm;
5322 - struct mm_slot *slot;
5323 - struct vm_area_struct *vma;
5324 - struct rmap_item *rmap_item;
5325 + cleanup_vma_slots();
5327 - if (list_empty(&ksm_mm_head.mm_list))
5329 + if (round_finished) {
5330 + round_update_ladder();
5332 - slot = ksm_scan.mm_slot;
5333 - if (slot == &ksm_mm_head) {
5335 * A number of pages can hang around indefinitely on per-cpu
5336 * pagevecs, raised page count preventing write_protect_page
5337 @@ -1308,266 +4161,160 @@
5339 lru_add_drain_all();
5341 + /* sync with ksm_remove_vma for rb_erase */
5343 root_unstable_tree = RB_ROOT;
5345 - spin_lock(&ksm_mmlist_lock);
5346 - slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
5347 - ksm_scan.mm_slot = slot;
5348 - spin_unlock(&ksm_mmlist_lock);
5350 - ksm_scan.address = 0;
5351 - ksm_scan.rmap_list = &slot->rmap_list;
5355 - down_read(&mm->mmap_sem);
5356 - if (ksm_test_exit(mm))
5359 - vma = find_vma(mm, ksm_scan.address);
5361 - for (; vma; vma = vma->vm_next) {
5362 - if (!(vma->vm_flags & VM_MERGEABLE))
5364 - if (ksm_scan.address < vma->vm_start)
5365 - ksm_scan.address = vma->vm_start;
5366 - if (!vma->anon_vma)
5367 - ksm_scan.address = vma->vm_end;
5369 - while (ksm_scan.address < vma->vm_end) {
5370 - if (ksm_test_exit(mm))
5372 - *page = follow_page(vma, ksm_scan.address, FOLL_GET);
5373 - if (IS_ERR_OR_NULL(*page)) {
5374 - ksm_scan.address += PAGE_SIZE;
5378 - if (PageAnon(*page) ||
5379 - page_trans_compound_anon(*page)) {
5380 - flush_anon_page(vma, *page, ksm_scan.address);
5381 - flush_dcache_page(*page);
5382 - rmap_item = get_next_rmap_item(slot,
5383 - ksm_scan.rmap_list, ksm_scan.address);
5385 - ksm_scan.rmap_list =
5386 - &rmap_item->rmap_list;
5387 - ksm_scan.address += PAGE_SIZE;
5390 - up_read(&mm->mmap_sem);
5394 - ksm_scan.address += PAGE_SIZE;
5399 - if (ksm_test_exit(mm)) {
5400 - ksm_scan.address = 0;
5401 - ksm_scan.rmap_list = &slot->rmap_list;
5404 - * Nuke all the rmap_items that are above this current rmap:
5405 - * because there were no VM_MERGEABLE vmas with such addresses.
5407 - remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
5409 - spin_lock(&ksm_mmlist_lock);
5410 - ksm_scan.mm_slot = list_entry(slot->mm_list.next,
5411 - struct mm_slot, mm_list);
5412 - if (ksm_scan.address == 0) {
5414 - * We've completed a full scan of all vmas, holding mmap_sem
5415 - * throughout, and found no VM_MERGEABLE: so do the same as
5416 - * __ksm_exit does to remove this mm from all our lists now.
5417 - * This applies either when cleaning up after __ksm_exit
5418 - * (but beware: we can reach here even before __ksm_exit),
5419 - * or when all VM_MERGEABLE areas have been unmapped (and
5420 - * mmap_sem then protects against race with MADV_MERGEABLE).
5422 - hlist_del(&slot->link);
5423 - list_del(&slot->mm_list);
5424 - spin_unlock(&ksm_mmlist_lock);
5426 - free_mm_slot(slot);
5427 - clear_bit(MMF_VM_MERGEABLE, &mm->flags);
5428 - up_read(&mm->mmap_sem);
5431 - spin_unlock(&ksm_mmlist_lock);
5432 - up_read(&mm->mmap_sem);
5433 + free_all_tree_nodes(&unstable_tree_node_list);
5436 - /* Repeat until we've completed scanning the whole list */
5437 - slot = ksm_scan.mm_slot;
5438 - if (slot != &ksm_mm_head)
5446 - * ksm_do_scan - the ksm scanner main worker function.
5447 - * @scan_npages - number of pages we want to scan before we return.
5449 -static void ksm_do_scan(unsigned int scan_npages)
5451 - struct rmap_item *rmap_item;
5452 - struct page *uninitialized_var(page);
5453 + for (i = 0; i < ksm_scan_ladder_size; i++) {
5454 + struct scan_rung *rung = &ksm_scan_ladder[i];
5456 - while (scan_npages-- && likely(!freezing(current))) {
5458 - rmap_item = scan_get_next_rmap_item(&page);
5461 - if (!PageKsm(page) || !in_stable_tree(rmap_item))
5462 - cmp_and_merge_page(page, rmap_item);
5465 +		 * Before we can go to sleep, we should make sure that all the
5466 +		 * pages_to_scan quota for this scan has been consumed
5468 + if (!list_empty(&rung->vma_list) && rung->pages_to_scan)
5472 + cal_ladder_pages_to_scan(ksm_scan_batch_pages);
5475 static int ksmd_should_run(void)
5477 - return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
5478 + return ksm_run & KSM_RUN_MERGE;
5481 -static int ksm_scan_thread(void *nothing)
5484 - set_user_nice(current, 5);
5486 - while (!kthread_should_stop()) {
5487 - mutex_lock(&ksm_thread_mutex);
5488 - if (ksmd_should_run())
5489 - ksm_do_scan(ksm_thread_pages_to_scan);
5490 - mutex_unlock(&ksm_thread_mutex);
5493 +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
5494 +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
5496 - if (ksmd_should_run()) {
5497 - schedule_timeout_interruptible(
5498 - msecs_to_jiffies(ksm_thread_sleep_millisecs));
5500 - wait_event_freezable(ksm_thread_wait,
5501 - ksmd_should_run() || kthread_should_stop());
5505 +static inline unsigned long vma_pool_size(struct vm_area_struct *vma)
5507 + return round_up(sizeof(struct rmap_list_entry) * vma_pages(vma),
5508 + PAGE_SIZE) >> PAGE_SHIFT;
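
vma_pool_size() answers one question: how many pool pages are needed to hold one
rmap_list_entry per page of the VMA. The same rounding, spelled out as a plain ceiling
division; the entry size below is an assumed placeholder, not taken from the patch:

#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL

int main(void)
{
	unsigned long entry_size = 32;		/* assumed sizeof(struct rmap_list_entry) */
	unsigned long nr_vma_pages = 1000;	/* a 1000-page VMA */
	unsigned long bytes = entry_size * nr_vma_pages;
	unsigned long pool_pages = (bytes + TOY_PAGE_SIZE - 1) / TOY_PAGE_SIZE;

	printf("%lu pool pages\n", pool_pages);	/* 32000 bytes -> 8 pages */
	return 0;
}
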
5511 -int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
5512 - unsigned long end, int advice, unsigned long *vm_flags)
5519 + * @return int, 1 on success, 0 on failure
5521 +static int ksm_vma_enter(struct vma_slot *slot)
5523 - struct mm_struct *mm = vma->vm_mm;
5527 - case MADV_MERGEABLE:
5529 - * Be somewhat over-protective for now!
5531 - if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
5532 - VM_PFNMAP | VM_IO | VM_DONTEXPAND |
5533 - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
5534 - VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
5535 - return 0; /* just ignore the advice */
5537 - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
5538 - err = __ksm_enter(mm);
5543 - *vm_flags |= VM_MERGEABLE;
5545 + struct scan_rung *rung;
5546 + unsigned long pages_to_scan, pool_size;
5548 - case MADV_UNMERGEABLE:
5549 - if (!(*vm_flags & VM_MERGEABLE))
5550 - return 0; /* just ignore the advice */
5551 + BUG_ON(slot->pages != vma_pages(slot->vma));
5552 + rung = &ksm_scan_ladder[0];
5554 - if (vma->anon_vma) {
5555 - err = unmerge_ksm_pages(vma, start, end);
5558 + pages_to_scan = get_vma_random_scan_num(slot, rung->scan_ratio);
5559 + if (pages_to_scan) {
5560 + if (list_empty(&rung->vma_list))
5561 + rung->current_scan = &slot->ksm_list;
5562 + BUG_ON(!list_empty(&slot->ksm_list));
5564 + list_add(&slot->ksm_list, &rung->vma_list);
5565 + slot->rung = rung;
5566 + slot->pages_to_scan = pages_to_scan;
5567 + slot->rung->vma_num++;
5568 + BUG_ON(PAGE_SIZE % sizeof(struct rmap_list_entry) != 0);
5570 + pool_size = vma_pool_size(slot->vma);
5572 + slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
5573 + pool_size, GFP_NOWAIT);
5574 + slot->pool_counts = kzalloc(sizeof(unsigned long) * pool_size,
5576 + slot->pool_size = pool_size;
5577 + if (!slot->rmap_list_pool)
5580 + if (!slot->pool_counts) {
5581 + kfree(slot->rmap_list_pool);
5585 - *vm_flags &= ~VM_MERGEABLE;
5587 + BUG_ON(rung->current_scan == &rung->vma_list &&
5588 + !list_empty(&rung->vma_list));
5590 + ksm_vma_slot_num++;
5591 + BUG_ON(!ksm_vma_slot_num);
5599 -int __ksm_enter(struct mm_struct *mm)
5601 - struct mm_slot *mm_slot;
5604 - mm_slot = alloc_mm_slot();
5608 - /* Check ksm_run too? Would need tighter locking */
5609 - needs_wakeup = list_empty(&ksm_mm_head.mm_list);
5611 - spin_lock(&ksm_mmlist_lock);
5612 - insert_to_mm_slots_hash(mm, mm_slot);
5614 - * Insert just behind the scanning cursor, to let the area settle
5615 - * down a little; when fork is followed by immediate exec, we don't
5616 - * want ksmd to waste time setting up and tearing down an rmap_list.
5618 - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
5619 - spin_unlock(&ksm_mmlist_lock);
5620 +static void ksm_enter_all_slots(void)
5622 + struct vma_slot *slot;
5625 - set_bit(MMF_VM_MERGEABLE, &mm->flags);
5626 - atomic_inc(&mm->mm_count);
5627 + spin_lock(&vma_slot_list_lock);
5628 + while (!list_empty(&vma_slot_new)) {
5629 + slot = list_entry(vma_slot_new.next,
5630 + struct vma_slot, slot_list);
5632 +		 * slots are sorted by ctime_j; if one is found to be too
5633 +		 * young, just stop scanning the rest.
5638 - wake_up_interruptible(&ksm_thread_wait);
5639 + if (time_before(jiffies, slot->ctime_j +
5640 + msecs_to_jiffies(1000))) {
5641 + spin_unlock(&vma_slot_list_lock);
5647 + list_del_init(&slot->slot_list);
5649 + if (vma_can_enter(slot->vma))
5650 + added = ksm_vma_enter(slot);
5653 +			/* Put back on the noadd list to be deleted by its creator */
5654 + slot->ctime_j = jiffies;
5655 + list_del(&slot->slot_list);
5656 + list_add_tail(&slot->slot_list, &vma_slot_noadd);
5658 + spin_unlock(&vma_slot_list_lock);
5660 + spin_lock(&vma_slot_list_lock);
5662 + spin_unlock(&vma_slot_list_lock);
5665 -void __ksm_exit(struct mm_struct *mm)
5666 +static int ksm_scan_thread(void *nothing)
5668 - struct mm_slot *mm_slot;
5669 - int easy_to_free = 0;
5671 + set_user_nice(current, 5);
5674 - * This process is exiting: if it's straightforward (as is the
5675 - * case when ksmd was never running), free mm_slot immediately.
5676 - * But if it's at the cursor or has rmap_items linked to it, use
5677 - * mmap_sem to synchronize with any break_cows before pagetables
5678 - * are freed, and leave the mm_slot on the list for ksmd to free.
5679 - * Beware: ksm may already have noticed it exiting and freed the slot.
5681 + while (!kthread_should_stop()) {
5682 + mutex_lock(&ksm_thread_mutex);
5683 + if (ksmd_should_run()) {
5684 + ksm_enter_all_slots();
5687 + mutex_unlock(&ksm_thread_mutex);
5691 - spin_lock(&ksm_mmlist_lock);
5692 - mm_slot = get_mm_slot(mm);
5693 - if (mm_slot && ksm_scan.mm_slot != mm_slot) {
5694 - if (!mm_slot->rmap_list) {
5695 - hlist_del(&mm_slot->link);
5696 - list_del(&mm_slot->mm_list);
5698 + if (ksmd_should_run()) {
5699 + schedule_timeout_interruptible(ksm_sleep_jiffies);
5700 + ksm_sleep_times++;
5702 - list_move(&mm_slot->mm_list,
5703 - &ksm_scan.mm_slot->mm_list);
5704 + wait_event_freezable(ksm_thread_wait,
5705 + ksmd_should_run() || kthread_should_stop());
5708 - spin_unlock(&ksm_mmlist_lock);
5710 - if (easy_to_free) {
5711 - free_mm_slot(mm_slot);
5712 - clear_bit(MMF_VM_MERGEABLE, &mm->flags);
5714 - } else if (mm_slot) {
5715 - down_write(&mm->mmap_sem);
5716 - up_write(&mm->mmap_sem);
5721 struct page *ksm_does_need_to_copy(struct page *page,
5722 @@ -1597,11 +4344,13 @@
5723 unsigned long *vm_flags)
5725 struct stable_node *stable_node;
5726 + struct node_vma *node_vma;
5727 struct rmap_item *rmap_item;
5728 - struct hlist_node *hlist;
5729 + struct hlist_node *hlist, *rmap_hlist;
5730 unsigned int mapcount = page_mapcount(page);
5732 int search_new_forks = 0;
5733 + unsigned long address;
5735 VM_BUG_ON(!PageKsm(page));
5736 VM_BUG_ON(!PageLocked(page));
5737 @@ -1609,38 +4358,51 @@
5738 stable_node = page_stable_node(page);
5742 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
5743 - struct anon_vma *anon_vma = rmap_item->anon_vma;
5744 - struct anon_vma_chain *vmac;
5745 - struct vm_area_struct *vma;
5747 - anon_vma_lock(anon_vma);
5748 - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
5750 - if (rmap_item->address < vma->vm_start ||
5751 - rmap_item->address >= vma->vm_end)
5754 - * Initially we examine only the vma which covers this
5755 - * rmap_item; but later, if there is still work to do,
5756 - * we examine covering vmas in other mms: in case they
5757 - * were forked from the original since ksmd passed.
5759 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
5762 - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
5765 - referenced += page_referenced_one(page, vma,
5766 - rmap_item->address, &mapcount, vm_flags);
5767 - if (!search_new_forks || !mapcount)
5770 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
5771 + hlist_for_each_entry(rmap_item, rmap_hlist,
5772 + &node_vma->rmap_hlist, hlist) {
5773 + struct anon_vma *anon_vma = rmap_item->anon_vma;
5774 + struct anon_vma_chain *vmac;
5775 + struct vm_area_struct *vma;
5777 + anon_vma_lock(anon_vma);
5778 + list_for_each_entry(vmac, &anon_vma->head,
5781 + address = get_rmap_addr(rmap_item);
5783 + if (address < vma->vm_start ||
5784 + address >= vma->vm_end)
5787 + * Initially we examine only the vma which
5788 + * covers this rmap_item; but later, if there
5789 + * is still work to do, we examine covering
5790 + * vmas in other mms: in case they were forked
5791 + * from the original since ksmd passed.
5793 + if ((rmap_item->slot->vma == vma) ==
5798 + !mm_match_cgroup(vma->vm_mm, memcg))
5802 + page_referenced_one(page, vma,
5803 + address, &mapcount, vm_flags);
5804 + if (!search_new_forks || !mapcount)
5808 + anon_vma_unlock(anon_vma);
5812 - anon_vma_unlock(anon_vma);
5816 if (!search_new_forks++)
5818 @@ -1651,10 +4413,12 @@
5819 int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
5821 struct stable_node *stable_node;
5822 - struct hlist_node *hlist;
5823 + struct node_vma *node_vma;
5824 + struct hlist_node *hlist, *rmap_hlist;
5825 struct rmap_item *rmap_item;
5826 int ret = SWAP_AGAIN;
5827 int search_new_forks = 0;
5828 + unsigned long address;
5830 VM_BUG_ON(!PageKsm(page));
5831 VM_BUG_ON(!PageLocked(page));
5832 @@ -1663,34 +4427,42 @@
5836 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
5837 - struct anon_vma *anon_vma = rmap_item->anon_vma;
5838 - struct anon_vma_chain *vmac;
5839 - struct vm_area_struct *vma;
5841 - anon_vma_lock(anon_vma);
5842 - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
5844 - if (rmap_item->address < vma->vm_start ||
5845 - rmap_item->address >= vma->vm_end)
5848 - * Initially we examine only the vma which covers this
5849 - * rmap_item; but later, if there is still work to do,
5850 - * we examine covering vmas in other mms: in case they
5851 - * were forked from the original since ksmd passed.
5853 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
5856 - ret = try_to_unmap_one(page, vma,
5857 - rmap_item->address, flags);
5858 - if (ret != SWAP_AGAIN || !page_mapped(page)) {
5859 - anon_vma_unlock(anon_vma);
5861 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
5862 + hlist_for_each_entry(rmap_item, rmap_hlist,
5863 + &node_vma->rmap_hlist, hlist) {
5864 + struct anon_vma *anon_vma = rmap_item->anon_vma;
5865 + struct anon_vma_chain *vmac;
5866 + struct vm_area_struct *vma;
5868 + anon_vma_lock(anon_vma);
5869 + list_for_each_entry(vmac, &anon_vma->head,
5872 + address = get_rmap_addr(rmap_item);
5874 + if (address < vma->vm_start ||
5875 + address >= vma->vm_end)
5878 + * Initially we examine only the vma which
5879 + * covers this rmap_item; but later, if there
5880 + * is still work to do, we examine covering
5881 + * vmas in other mms: in case they were forked
5882 + * from the original since ksmd passed.
5884 + if ((rmap_item->slot->vma == vma) ==
5888 + ret = try_to_unmap_one(page, vma,
5890 + if (ret != SWAP_AGAIN || !page_mapped(page)) {
5891 + anon_vma_unlock(anon_vma);
5895 + anon_vma_unlock(anon_vma);
5897 - anon_vma_unlock(anon_vma);
5899 if (!search_new_forks++)
5901 @@ -1703,10 +4475,12 @@
5902 struct vm_area_struct *, unsigned long, void *), void *arg)
5904 struct stable_node *stable_node;
5905 - struct hlist_node *hlist;
5906 + struct node_vma *node_vma;
5907 + struct hlist_node *hlist, *rmap_hlist;
5908 struct rmap_item *rmap_item;
5909 int ret = SWAP_AGAIN;
5910 int search_new_forks = 0;
5911 + unsigned long address;
5913 VM_BUG_ON(!PageKsm(page));
5914 VM_BUG_ON(!PageLocked(page));
5915 @@ -1715,33 +4489,35 @@
5919 - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
5920 - struct anon_vma *anon_vma = rmap_item->anon_vma;
5921 - struct anon_vma_chain *vmac;
5922 - struct vm_area_struct *vma;
5924 - anon_vma_lock(anon_vma);
5925 - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
5927 - if (rmap_item->address < vma->vm_start ||
5928 - rmap_item->address >= vma->vm_end)
5931 - * Initially we examine only the vma which covers this
5932 - * rmap_item; but later, if there is still work to do,
5933 - * we examine covering vmas in other mms: in case they
5934 - * were forked from the original since ksmd passed.
5936 - if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
5939 - ret = rmap_one(page, vma, rmap_item->address, arg);
5940 - if (ret != SWAP_AGAIN) {
5941 - anon_vma_unlock(anon_vma);
5943 + hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
5944 + hlist_for_each_entry(rmap_item, rmap_hlist,
5945 + &node_vma->rmap_hlist, hlist) {
5946 + struct anon_vma *anon_vma = rmap_item->anon_vma;
5947 + struct anon_vma_chain *vmac;
5948 + struct vm_area_struct *vma;
5950 + anon_vma_lock(anon_vma);
5951 + list_for_each_entry(vmac, &anon_vma->head,
5954 + address = get_rmap_addr(rmap_item);
5956 + if (address < vma->vm_start ||
5957 + address >= vma->vm_end)
5960 + if ((rmap_item->slot->vma == vma) ==
5964 + ret = rmap_one(page, vma, address, arg);
5965 + if (ret != SWAP_AGAIN) {
5966 + anon_vma_unlock(anon_vma);
5970 + anon_vma_unlock(anon_vma);
5972 - anon_vma_unlock(anon_vma);
5974 if (!search_new_forks++)
5976 @@ -1771,7 +4547,7 @@
5978 struct rb_node *node;
5980 - for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
5981 + for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
5982 struct stable_node *stable_node;
5984 stable_node = rb_entry(node, struct stable_node, node);
5985 @@ -1810,7 +4586,7 @@
5987 while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
5988 mn->start_pfn + mn->nr_pages)) != NULL)
5989 - remove_node_from_stable_tree(stable_node);
5990 + remove_node_from_stable_tree(stable_node, 1, 1);
5993 case MEM_CANCEL_OFFLINE:
5994 @@ -1835,7 +4611,7 @@
5995 static ssize_t sleep_millisecs_show(struct kobject *kobj,
5996 struct kobj_attribute *attr, char *buf)
5998 - return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
5999 + return sprintf(buf, "%u\n", jiffies_to_msecs(ksm_sleep_jiffies));
6002 static ssize_t sleep_millisecs_store(struct kobject *kobj,
6003 @@ -1849,34 +4625,58 @@
6004 if (err || msecs > UINT_MAX)
6007 - ksm_thread_sleep_millisecs = msecs;
6008 + ksm_sleep_jiffies = msecs_to_jiffies(msecs);
6012 KSM_ATTR(sleep_millisecs);
6014 -static ssize_t pages_to_scan_show(struct kobject *kobj,
6015 +static ssize_t min_scan_ratio_show(struct kobject *kobj,
6016 + struct kobj_attribute *attr, char *buf)
6018 + return sprintf(buf, "%u\n", ksm_min_scan_ratio);
6021 +static ssize_t min_scan_ratio_store(struct kobject *kobj,
6022 + struct kobj_attribute *attr,
6023 + const char *buf, size_t count)
6025 + unsigned long msr;
6028 + err = strict_strtoul(buf, 10, &msr);
6029 + if (err || msr > UINT_MAX)
6032 + ksm_min_scan_ratio = msr;
6036 +KSM_ATTR(min_scan_ratio);
6038 +static ssize_t scan_batch_pages_show(struct kobject *kobj,
6039 struct kobj_attribute *attr, char *buf)
6041 - return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
6042 + return sprintf(buf, "%lu\n", ksm_scan_batch_pages);
6045 -static ssize_t pages_to_scan_store(struct kobject *kobj,
6046 +static ssize_t scan_batch_pages_store(struct kobject *kobj,
6047 struct kobj_attribute *attr,
6048 const char *buf, size_t count)
6051 - unsigned long nr_pages;
6052 + unsigned long batch_pages;
6054 - err = strict_strtoul(buf, 10, &nr_pages);
6055 - if (err || nr_pages > UINT_MAX)
6056 + err = strict_strtoul(buf, 10, &batch_pages);
6057 + if (err || batch_pages > UINT_MAX)
6060 - ksm_thread_pages_to_scan = nr_pages;
6061 + ksm_scan_batch_pages = batch_pages;
6062 + cal_ladder_pages_to_scan(ksm_scan_batch_pages);
6066 -KSM_ATTR(pages_to_scan);
6067 +KSM_ATTR(scan_batch_pages);
6069 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
6071 @@ -1893,28 +4693,12 @@
6072 err = strict_strtoul(buf, 10, &flags);
6073 if (err || flags > UINT_MAX)
6075 - if (flags > KSM_RUN_UNMERGE)
6076 + if (flags > KSM_RUN_MERGE)
6080 - * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
6081 - * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
6082 - * breaking COW to free the pages_shared (but leaves mm_slots
6083 - * on the list for when ksmd may be set running again).
6086 mutex_lock(&ksm_thread_mutex);
6087 if (ksm_run != flags) {
6089 - if (flags & KSM_RUN_UNMERGE) {
6090 - current->flags |= PF_OOM_ORIGIN;
6091 - err = unmerge_and_remove_all_rmap_items();
6092 - current->flags &= ~PF_OOM_ORIGIN;
6094 - ksm_run = KSM_RUN_STOP;
6099 mutex_unlock(&ksm_thread_mutex);
6101 @@ -1925,6 +4709,30 @@
6106 +static ssize_t thrash_threshold_show(struct kobject *kobj,
6107 + struct kobj_attribute *attr, char *buf)
6109 + return sprintf(buf, "%u\n", ksm_thrash_threshold);
6112 +static ssize_t thrash_threshold_store(struct kobject *kobj,
6113 + struct kobj_attribute *attr,
6114 + const char *buf, size_t count)
6117 + unsigned long flags;
6119 + err = strict_strtoul(buf, 10, &flags);
6120 + if (err || flags > 99)
6123 + ksm_thrash_threshold = flags;
6127 +KSM_ATTR(thrash_threshold);
6129 static ssize_t pages_shared_show(struct kobject *kobj,
6130 struct kobj_attribute *attr, char *buf)
6132 @@ -1946,60 +4754,291 @@
6134 KSM_ATTR_RO(pages_unshared);
6136 -static ssize_t pages_volatile_show(struct kobject *kobj,
6137 - struct kobj_attribute *attr, char *buf)
6138 +static ssize_t full_scans_show(struct kobject *kobj,
6139 + struct kobj_attribute *attr, char *buf)
6141 - long ksm_pages_volatile;
6142 + return sprintf(buf, "%llu\n", ksm_scan_round);
6144 +KSM_ATTR_RO(full_scans);
6146 - ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
6147 - - ksm_pages_sharing - ksm_pages_unshared;
6149 - * It was not worth any locking to calculate that statistic,
6150 - * but it might therefore sometimes be negative: conceal that.
6152 - if (ksm_pages_volatile < 0)
6153 - ksm_pages_volatile = 0;
6154 - return sprintf(buf, "%ld\n", ksm_pages_volatile);
6155 +static ssize_t pages_scanned_show(struct kobject *kobj,
6156 + struct kobj_attribute *attr, char *buf)
6158 + unsigned long base = 0;
6161 + if (pages_scanned_stored) {
6162 + base = pages_scanned_base;
6163 + ret = pages_scanned_stored;
6164 + delta = ksm_pages_scanned >> base;
6165 + if (CAN_OVERFLOW_U64(ret, delta)) {
6172 + ret = ksm_pages_scanned;
6175 + while (ret > ULONG_MAX) {
6181 + return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
6183 + return sprintf(buf, "%lu\n", (unsigned long)ret);
6185 -KSM_ATTR_RO(pages_volatile);
6186 +KSM_ATTR_RO(pages_scanned);
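
The show routine above falls back to printing the scan counter as "mantissa * 2^exponent"
once it no longer fits an unsigned long, which matters on 32-bit builds. A standalone
sketch that is similar in spirit; the overflow threshold below stands in for ULONG_MAX on a
32-bit machine:

#include <stdio.h>

int main(void)
{
	unsigned long long scanned = 0xffffffffULL * 9;	/* pretend the counter grew past 32 bits */
	unsigned long base = 0;

	while (scanned > 0xffffffffULL) {	/* assumed 32-bit ULONG_MAX */
		scanned >>= 1;
		base++;
	}
	if (base)
		printf("%llu * 2^%lu\n", scanned, base);
	else
		printf("%llu\n", scanned);
	return 0;
}
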
6188 -static ssize_t full_scans_show(struct kobject *kobj,
6189 - struct kobj_attribute *attr, char *buf)
6190 +static ssize_t hash_strength_show(struct kobject *kobj,
6191 + struct kobj_attribute *attr, char *buf)
6193 - return sprintf(buf, "%lu\n", ksm_scan.seqnr);
6194 + return sprintf(buf, "%lu\n", hash_strength);
6196 -KSM_ATTR_RO(full_scans);
6197 +KSM_ATTR_RO(hash_strength);
6199 +static ssize_t sleep_times_show(struct kobject *kobj,
6200 + struct kobj_attribute *attr, char *buf)
6202 + return sprintf(buf, "%llu\n", ksm_sleep_times);
6204 +KSM_ATTR_RO(sleep_times);
6207 static struct attribute *ksm_attrs[] = {
6208 &sleep_millisecs_attr.attr,
6209 - &pages_to_scan_attr.attr,
6210 + &scan_batch_pages_attr.attr,
6212 &pages_shared_attr.attr,
6213 &pages_sharing_attr.attr,
6214 &pages_unshared_attr.attr,
6215 - &pages_volatile_attr.attr,
6216 &full_scans_attr.attr,
6217 + &min_scan_ratio_attr.attr,
6218 + &pages_scanned_attr.attr,
6219 + &hash_strength_attr.attr,
6220 + &sleep_times_attr.attr,
6221 + &thrash_threshold_attr.attr,
6225 static struct attribute_group ksm_attr_group = {
6230 #endif /* CONFIG_SYSFS */
6232 +static inline void init_scan_ladder(void)
6235 + unsigned long mul = 1;
6237 + unsigned long pages_to_scan;
6239 + pages_to_scan = ksm_scan_batch_pages;
6241 + for (i = 0; i < ksm_scan_ladder_size; i++,
6242 + mul *= ksm_scan_ratio_delta) {
6244 + ksm_scan_ladder[i].scan_ratio = ksm_min_scan_ratio * mul;
6245 + INIT_LIST_HEAD(&ksm_scan_ladder[i].vma_list);
6246 + ksm_scan_ladder[i].vma_num = 0;
6247 + ksm_scan_ladder[i].round_finished = 0;
6248 + ksm_scan_ladder[i].fully_scanned_slots = 0;
6249 + ksm_scan_ladder[i].busy_searched = 0;
6252 + cal_ladder_pages_to_scan(ksm_scan_batch_pages);
6255 +static inline int cal_positive_negative_costs(void)
6257 + struct page *p1, *p2;
6258 + unsigned char *addr1, *addr2;
6259 + unsigned long i, time_start, hash_cost;
6260 + unsigned long loopnum = 0;
6262 +	/* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
6263 + volatile u32 hash;
6266 + p1 = alloc_page(GFP_KERNEL);
6270 + p2 = alloc_page(GFP_KERNEL);
6274 + addr1 = kmap_atomic(p1, KM_USER0);
6275 + addr2 = kmap_atomic(p2, KM_USER1);
6276 + memset(addr1, random32(), PAGE_SIZE);
6277 + memcpy(addr2, addr1, PAGE_SIZE);
6279 + /* make sure that the two pages differ in last byte */
6280 + addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
6281 + kunmap_atomic(addr2, KM_USER1);
6282 + kunmap_atomic(addr1, KM_USER0);
6284 + time_start = jiffies;
6285 + while (jiffies - time_start < 100) {
6286 + for (i = 0; i < 100; i++)
6287 + hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
6290 + hash_cost = (jiffies - time_start);
6292 + time_start = jiffies;
6293 + for (i = 0; i < loopnum; i++)
6294 + ret = pages_identical(p1, p2);
6295 + memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
6296 + memcmp_cost /= hash_cost;
6297 + printk(KERN_INFO "UKSM: relative memcmp_cost = %lu.\n", memcmp_cost);
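
The calibration above ends up expressing one full-page memcmp in units of single hash
samples: memcmp_cost = HASH_STRENGTH_FULL * t_memcmp / t_hash, with both loops running a
comparable number of iterations. A worked standalone example with made-up timings;
HASH_STRENGTH_FULL is assumed here to mean one u32 sample per 4 bytes of a 4 KB page:

#include <stdio.h>

int main(void)
{
	unsigned long hash_strength_full = 4096 / 4;	/* assumed: 1024 samples */
	unsigned long t_hash = 100;	/* jiffies spent on N full-strength hashes */
	unsigned long t_memcmp = 25;	/* jiffies spent on N full-page memcmps */
	unsigned long memcmp_cost = hash_strength_full * t_memcmp / t_hash;

	/* here one memcmp costs about as much as 256 individual samples */
	printf("relative memcmp_cost = %lu\n", memcmp_cost);
	return 0;
}
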
6304 +static int init_zeropage_hash_table(void)
6306 + struct page *page;
6310 + page = alloc_page(GFP_KERNEL);
6314 + addr = kmap_atomic(page, KM_USER0);
6315 + memset(addr, 0, PAGE_SIZE);
6316 + kunmap_atomic(addr, KM_USER0);
6318 + zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
6320 + if (!zero_hash_table)
6323 + for (i = 0; i < HASH_STRENGTH_MAX; i++)
6324 + zero_hash_table[i] = page_hash(page, i, 0);
6326 + __free_page(page);
6331 +static inline int init_random_sampling(void)
6334 + random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
6338 + for (i = 0; i < HASH_STRENGTH_FULL; i++)
6339 + random_nums[i] = i;
6341 + for (i = 0; i < HASH_STRENGTH_FULL; i++) {
6342 + unsigned long rand_range, swap_index, tmp;
6344 + rand_range = HASH_STRENGTH_FULL - i;
6345 + swap_index = i + random32() % rand_range;
6346 + tmp = random_nums[i];
6347 + random_nums[i] = random_nums[swap_index];
6348 + random_nums[swap_index] = tmp;
6351 + rshash_state.state = RSHASH_NEW;
6352 + rshash_state.below_count = 0;
6353 + rshash_state.lookup_window_index = 0;
6355 + return cal_positive_negative_costs();
6358 +static int __init ksm_slab_init(void)
6360 + rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
6361 + if (!rmap_item_cache)
6364 + stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
6365 + if (!stable_node_cache)
6368 + node_vma_cache = KSM_KMEM_CACHE(node_vma, 0);
6369 + if (!node_vma_cache)
6372 + vma_slot_cache = KSM_KMEM_CACHE(vma_slot, 0);
6373 + if (!vma_slot_cache)
6376 + tree_node_cache = KSM_KMEM_CACHE(tree_node, 0);
6377 + if (!tree_node_cache)
6383 + kmem_cache_destroy(vma_slot_cache);
6385 + kmem_cache_destroy(node_vma_cache);
6387 + kmem_cache_destroy(stable_node_cache);
6389 + kmem_cache_destroy(rmap_item_cache);
6394 +static void __init ksm_slab_free(void)
6396 + kmem_cache_destroy(stable_node_cache);
6397 + kmem_cache_destroy(rmap_item_cache);
6398 + kmem_cache_destroy(node_vma_cache);
6399 + kmem_cache_destroy(vma_slot_cache);
6400 + kmem_cache_destroy(tree_node_cache);
6403 static int __init ksm_init(void)
6405 struct task_struct *ksm_thread;
6407 + unsigned int sr = ksm_min_scan_ratio;
6409 + ksm_scan_ladder_size = 1;
6410 + while (sr < KSM_SCAN_RATIO_MAX) {
6411 + sr *= ksm_scan_ratio_delta;
6412 + ksm_scan_ladder_size++;
6414 + ksm_scan_ladder = kzalloc(sizeof(struct scan_rung) *
6415 + ksm_scan_ladder_size, GFP_KERNEL);
6416 + if (!ksm_scan_ladder) {
6417 + printk(KERN_ERR "uksm scan ladder allocation failed, size=%d\n",
6418 + ksm_scan_ladder_size);
6422 + init_scan_ladder();
6424 + INIT_RADIX_TREE(&ksm_vma_tree, GFP_KERNEL);
6426 + err = init_random_sampling();
6430 err = ksm_slab_init();
6435 - ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
6436 + err = init_zeropage_hash_table();
6440 + ksm_thread = kthread_run(ksm_scan_thread, NULL, "uksmd");
6441 if (IS_ERR(ksm_thread)) {
6442 - printk(KERN_ERR "ksm: creating kthread failed\n");
6443 + printk(KERN_ERR "uksm: creating kthread failed\n");
6444 err = PTR_ERR(ksm_thread);
6447 @@ -2007,7 +5046,7 @@
6449 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
6451 - printk(KERN_ERR "ksm: register sysfs failed\n");
6452 + printk(KERN_ERR "uksm: register sysfs failed\n");
6453 kthread_stop(ksm_thread);
6456 @@ -2026,8 +5065,20 @@
6460 + kfree(zero_hash_table);
6464 + kfree(random_nums);
6466 + kfree(ksm_scan_ladder);
6472 module_init(ksm_init)
6474 +late_initcall(ksm_init);
6477 diff -urN linux-2.6.38/mm/madvise.c uksm-2.6.38-zhang/mm/madvise.c
6478 --- linux-2.6.38/mm/madvise.c 2011-03-15 09:20:32.000000000 +0800
6479 +++ uksm-2.6.38-zhang/mm/madvise.c 2012-01-09 10:05:59.862270375 +0800
6482 new_flags &= ~VM_DONTCOPY;
6484 - case MADV_MERGEABLE:
6485 - case MADV_UNMERGEABLE:
6486 - error = ksm_madvise(vma, start, end, behavior, &new_flags);
6491 case MADV_NOHUGEPAGE:
6492 error = hugepage_madvise(vma, &new_flags, behavior);
6493 @@ -285,10 +279,6 @@
6498 - case MADV_MERGEABLE:
6499 - case MADV_UNMERGEABLE:
6501 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6503 case MADV_NOHUGEPAGE:
6504 diff -urN linux-2.6.38/mm/memory.c uksm-2.6.38-zhang/mm/memory.c
6505 --- linux-2.6.38/mm/memory.c 2011-03-15 09:20:32.000000000 +0800
6506 +++ uksm-2.6.38-zhang/mm/memory.c 2012-01-09 10:09:28.235610655 +0800
6507 @@ -112,6 +112,37 @@
6508 unsigned long zero_pfn __read_mostly;
6509 unsigned long highest_memmap_pfn __read_mostly;
6512 +unsigned long ksm_zero_pfn __read_mostly;
6513 +struct page *empty_ksm_zero_page;
6515 +static int __init setup_ksm_zero_page(void)
6517 + unsigned long addr;
6518 + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0);
6519 + if (!addr)
6520 + panic("Oh boy, that early out of memory?");
6522 + empty_ksm_zero_page = virt_to_page((void *) addr);
6523 + SetPageReserved(empty_ksm_zero_page);
6525 + ksm_zero_pfn = page_to_pfn(empty_ksm_zero_page);
6529 +core_initcall(setup_ksm_zero_page);
6531 +static inline int is_ksm_zero_pfn(unsigned long pfn)
6533 + return pfn == ksm_zero_pfn;
6536 +static inline int is_ksm_zero_pfn(unsigned long pfn)
6543 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
6546 core_initcall(init_zero_pfn);
6550 #if defined(SPLIT_RSS_COUNTING)
6552 static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
6555 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
6557 - if (!is_zero_pfn(pfn))
6558 + if (!is_zero_pfn(pfn) && !is_ksm_zero_pfn(pfn))
6559 print_bad_pte(vma, addr, pte, NULL);
6566 - if (is_zero_pfn(pfn))
6567 + if (is_zero_pfn(pfn) || is_ksm_zero_pfn(pfn))
6570 if (unlikely(pfn > highest_memmap_pfn)) {
6571 @@ -719,6 +751,10 @@
6572 rss[MM_ANONPAGES]++;
6574 rss[MM_FILEPAGES]++;
6576 + if (PageKsm(page)) /* follows page_dup_rmap() */
6577 + inc_zone_page_state(page, NR_KSM_PAGES_SHARING);
6582 @@ -1341,7 +1377,8 @@
6583 page = vm_normal_page(vma, address, pte);
6584 if (unlikely(!page)) {
6585 if ((flags & FOLL_DUMP) ||
6586 - !is_zero_pfn(pte_pfn(pte)))
6587 + (!is_zero_pfn(pte_pfn(pte)) &&
6588 + !is_ksm_zero_pfn(pte_pfn(pte))))
6590 page = pte_page(pte);
6592 @@ -1423,7 +1460,7 @@
6594 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
6598 * Require read or write permissions.
6599 * If FOLL_FORCE is set, we only require the "MAY" flags.
6601 @@ -1470,7 +1507,8 @@
6602 page = vm_normal_page(gate_vma, start, *pte);
6604 if (!(gup_flags & FOLL_DUMP) &&
6605 - is_zero_pfn(pte_pfn(*pte)))
6606 + (is_zero_pfn(pte_pfn(*pte)) ||
6607 + is_ksm_zero_pfn(pte_pfn(*pte))))
6608 page = pte_page(*pte);
6611 @@ -2158,8 +2196,13 @@
6613 kunmap_atomic(kaddr, KM_USER0);
6614 flush_dcache_page(dst);
6617 copy_user_highpage(dst, src, va, vma);
6619 + if (vma->ksm_vma_slot && PageKsm(src))
6620 + vma->ksm_vma_slot->pages_cowed++;
6626 @@ -2353,10 +2396,15 @@
6627 if (unlikely(anon_vma_prepare(vma)))
6630 - if (is_zero_pfn(pte_pfn(orig_pte))) {
6631 + if (is_zero_pfn(pte_pfn(orig_pte))
6632 + || is_ksm_zero_pfn(pte_pfn(orig_pte))) {
6633 new_page = alloc_zeroed_user_highpage_movable(vma, address);
6637 + if (vma->ksm_vma_slot && is_ksm_zero_pfn(pte_pfn(orig_pte)))
6638 + vma->ksm_vma_slot->pages_cowed++;
6641 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
6643 @@ -2378,6 +2426,11 @@
6644 dec_mm_counter_fast(mm, MM_FILEPAGES);
6645 inc_mm_counter_fast(mm, MM_ANONPAGES);
6648 + if (is_ksm_zero_pfn(pte_pfn(orig_pte)))
6649 + __dec_zone_page_state(old_page,
6650 + NR_KSM_ZERO_PAGES);
6653 inc_mm_counter_fast(mm, MM_ANONPAGES);
6654 flush_cache_page(vma, address, pte_pfn(orig_pte));
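The memory.c hunks extend every place that special-cases the global zero page so that uksm's private zero page is treated the same way. Note that when the test is written in negated form, as in the follow_page() hunk, the two checks must be combined with &&, not ||: no pfn can equal both zero pfns at once, so the || form is a tautology and would reject the ordinary zero page as well. The tiny self-contained check below demonstrates that logic with invented pfn values (nothing here comes from the patch):

#include <assert.h>
#include <stdio.h>

/* Invented sentinel values standing in for zero_pfn and ksm_zero_pfn. */
static const unsigned long zero_pfn = 100;
static const unsigned long ksm_zero_pfn = 200;

static int is_zero_pfn(unsigned long pfn)     { return pfn == zero_pfn; }
static int is_ksm_zero_pfn(unsigned long pfn) { return pfn == ksm_zero_pfn; }

/* "Not any kind of zero page" must AND the negated tests together. */
static int is_not_any_zero(unsigned long pfn)
{
	return !is_zero_pfn(pfn) && !is_ksm_zero_pfn(pfn);
}

int main(void)
{
	assert(!is_not_any_zero(zero_pfn));      /* the shared zero page   */
	assert(!is_not_any_zero(ksm_zero_pfn));  /* uksm's zero page       */
	assert(is_not_any_zero(300));            /* an ordinary page frame */

	/* The OR form is always true, so it would mis-flag both zero pages. */
	assert(!is_zero_pfn(zero_pfn) || !is_ksm_zero_pfn(zero_pfn));

	printf("zero-pfn predicate behaves as expected\n");
	return 0;
}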
6656 diff -urN linux-2.6.38/mm/mmap.c uksm-2.6.38-zhang/mm/mmap.c
6657 --- linux-2.6.38/mm/mmap.c 2011-03-15 09:20:32.000000000 +0800
6658 +++ uksm-2.6.38-zhang/mm/mmap.c 2012-01-09 10:05:59.872270374 +0800
6660 #include <linux/perf_event.h>
6661 #include <linux/audit.h>
6662 #include <linux/khugepaged.h>
6663 +#include <linux/ksm.h>
6665 #include <asm/uaccess.h>
6666 #include <asm/cacheflush.h>
6668 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
6669 * w: (no) no w: (no) no w: (yes) yes w: (no) no
6670 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
6673 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
6674 * w: (no) no w: (no) no w: (copy) copy w: (no) no
6675 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
6677 removed_exe_file_vma(vma->vm_mm);
6679 mpol_put(vma_policy(vma));
6681 + ksm_remove_vma(vma);
6683 kmem_cache_free(vm_area_cachep, vma);
6686 @@ -529,9 +533,20 @@
6687 long adjust_next = 0;
6688 int remove_next = 0;
6691 + * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is taken.
6695 + ksm_remove_vma(vma);
6698 if (next && !insert) {
6699 struct vm_area_struct *exporter = NULL;
6702 + ksm_remove_vma(next);
6704 if (end >= next->vm_end) {
6706 * vma expands, overlapping all the next, and
6707 @@ -616,10 +631,10 @@
6709 vma_prio_tree_remove(next, root);
6712 vma->vm_start = start;
6714 vma->vm_pgoff = pgoff;
6717 next->vm_start += adjust_next << PAGE_SHIFT;
6718 next->vm_pgoff += adjust_next;
6719 @@ -672,10 +687,22 @@
6721 if (remove_next == 2) {
6722 next = vma->vm_next;
6724 + ksm_remove_vma(next);
6730 + if (next && !insert)
6731 + ksm_vma_add_new(next);
6736 + ksm_vma_add_new(vma);
6742 @@ -1352,6 +1379,9 @@
6744 vma_link(mm, vma, prev, rb_link, rb_parent);
6745 file = vma->vm_file;
6747 + ksm_vma_add_new(vma);
6750 /* Once vma denies write, undo our temporary denial count */
6752 @@ -1378,6 +1408,9 @@
6753 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
6757 + ksm_remove_vma(vma);
6759 kmem_cache_free(vm_area_cachep, vma);
6762 @@ -1453,7 +1486,7 @@
6769 void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
6771 @@ -2014,6 +2047,10 @@
6773 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
6776 + ksm_vma_add_new(new);
6782 @@ -2250,6 +2287,9 @@
6783 vma->vm_flags = flags;
6784 vma->vm_page_prot = vm_get_page_prot(flags);
6785 vma_link(mm, vma, prev, rb_link, rb_parent);
6787 + ksm_vma_add_new(vma);
6790 perf_event_mmap(vma);
6791 mm->total_vm += len >> PAGE_SHIFT;
6792 @@ -2273,6 +2313,12 @@
6793 /* mm's last user has gone, and its about to be pulled down */
6794 mmu_notifier_release(mm);
6797 + * Taking the write lock on mmap_sem does not harm others,
6798 + * but it is crucial for uksm to avoid races.
6800 + down_write(&mm->mmap_sem);
6802 if (mm->locked_vm) {
6805 @@ -2306,6 +2352,11 @@
6807 vma = remove_vma(vma);
6810 + mm->mm_rb = RB_ROOT;
6811 + mm->mmap_cache = NULL;
6812 + up_write(&mm->mmap_sem);
6814 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
6817 @@ -2397,6 +2448,9 @@
6818 if (new_vma->vm_ops && new_vma->vm_ops->open)
6819 new_vma->vm_ops->open(new_vma);
6820 vma_link(mm, new_vma, prev, rb_link, rb_parent);
6822 + ksm_vma_add_new(new_vma);
6827 @@ -2502,11 +2556,14 @@
6828 ret = insert_vm_struct(mm, vma);
6832 mm->total_vm += len >> PAGE_SHIFT;
6834 perf_event_mmap(vma);
6837 + ksm_vma_add_new(vma);
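The mmap.c changes all follow one rule: every path that creates, splits, merges or destroys a vm_area_struct must also call ksm_vma_add_new() or ksm_remove_vma(), so uksm's per-VMA bookkeeping never goes stale. The user-space sketch below models only that add-on-create / remove-on-destroy pairing with a toy registry and hypothetical names; it says nothing about the patch's actual data structures.

#include <stdio.h>
#include <stdlib.h>

struct vma {                      /* toy stand-in for vm_area_struct */
	unsigned long start, end;
	int tracked;              /* mirrors "known to the scanner"   */
};

static int tracked_vmas;          /* toy stand-in for uksm's registry */

static void tracker_add(struct vma *v)
{
	v->tracked = 1;
	tracked_vmas++;
}

static void tracker_remove(struct vma *v)
{
	if (v->tracked) {
		v->tracked = 0;
		tracked_vmas--;
	}
}

/* Creation and destruction always go through these helpers, so the
 * tracker is updated on every lifecycle transition. */
static struct vma *vma_create(unsigned long start, unsigned long end)
{
	struct vma *v = calloc(1, sizeof(*v));

	if (!v)
		return NULL;
	v->start = start;
	v->end = end;
	tracker_add(v);           /* like ksm_vma_add_new() after vma_link() */
	return v;
}

static void vma_destroy(struct vma *v)
{
	tracker_remove(v);        /* like ksm_remove_vma() before freeing */
	free(v);
}

int main(void)
{
	struct vma *a = vma_create(0x1000, 0x2000);
	struct vma *b = vma_create(0x3000, 0x5000);

	vma_destroy(a);
	vma_destroy(b);
	printf("tracked VMAs after teardown: %d\n", tracked_vmas); /* 0 */
	return 0;
}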
6843 diff -urN linux-2.6.38/mm/mremap.c uksm-2.6.38-zhang/mm/mremap.c
6844 --- linux-2.6.38/mm/mremap.c 2011-03-15 09:20:32.000000000 +0800
6845 +++ uksm-2.6.38-zhang/mm/mremap.c 2012-01-09 10:05:59.872270374 +0800
6847 * pages recently unmapped. But leave vma->vm_flags as it was,
6848 * so KSM can come around to merge on vma and new_vma afterwards.
6850 - err = ksm_madvise(vma, old_addr, old_addr + old_len,
6851 - MADV_UNMERGEABLE, &vm_flags);
6852 + err = unmerge_ksm_pages(vma, old_addr, old_addr + old_len);
6856 diff -urN linux-2.6.38/mm/rmap.c uksm-2.6.38-zhang/mm/rmap.c
6857 --- linux-2.6.38/mm/rmap.c 2011-03-15 09:20:32.000000000 +0800
6858 +++ uksm-2.6.38-zhang/mm/rmap.c 2012-01-09 10:05:59.875603707 +0800
6862 * __page_set_anon_rmap - set up new anonymous rmap
6863 - * @page: Page to add to rmap
6864 + * @page: Page to add to rmap
6865 * @vma: VM area to add page to.
6866 - * @address: User virtual address of the mapping
6867 + * @address: User virtual address of the mapping
6868 * @exclusive: the page is exclusively owned by the current process
6870 static void __page_set_anon_rmap(struct page *page,
6871 @@ -905,9 +905,12 @@
6872 __inc_zone_page_state(page,
6873 NR_ANON_TRANSPARENT_HUGEPAGES);
6875 - if (unlikely(PageKsm(page)))
6877 + if (unlikely(PageKsm(page))) {
6878 + __inc_zone_page_state(page, NR_KSM_PAGES_SHARING);
6883 VM_BUG_ON(!PageLocked(page));
6884 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
6886 @@ -965,6 +968,10 @@
6888 void page_remove_rmap(struct page *page)
6891 + if (PageKsm(page))
6892 + __dec_zone_page_state(page, NR_KSM_PAGES_SHARING);
6894 /* page still mapped by someone else? */
6895 if (!atomic_add_negative(-1, &page->_mapcount))
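The rmap.c hunks bump NR_KSM_PAGES_SHARING when a KSM page gains a mapping and drop it when a mapping goes away, so the KsmSharing field in /proc/meminfo tracks the number of live KSM mappings. The minimal model below only illustrates that paired-counter invariant; it uses plain C, made-up names and no kernel APIs.

#include <assert.h>
#include <stdio.h>

static long nr_ksm_pages_sharing;   /* models the per-zone counter */

struct page_model {
	int is_ksm;
	int mapcount;
};

static void page_add_rmap(struct page_model *p)
{
	p->mapcount++;
	if (p->is_ksm)
		nr_ksm_pages_sharing++;   /* like __inc_zone_page_state() */
}

static void page_remove_rmap(struct page_model *p)
{
	if (p->is_ksm)
		nr_ksm_pages_sharing--;   /* like __dec_zone_page_state() */
	p->mapcount--;
}

int main(void)
{
	struct page_model ksm_page = { .is_ksm = 1, .mapcount = 0 };

	/* Three processes map the shared page, then all unmap it. */
	for (int i = 0; i < 3; i++)
		page_add_rmap(&ksm_page);
	assert(nr_ksm_pages_sharing == 3);

	for (int i = 0; i < 3; i++)
		page_remove_rmap(&ksm_page);
	assert(nr_ksm_pages_sharing == 0);

	printf("counter balanced: %ld\n", nr_ksm_pages_sharing);
	return 0;
}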
6897 diff -urN linux-2.6.38/security/apparmor/capability_names.h uksm-2.6.38-zhang/security/apparmor/capability_names.h
6898 --- linux-2.6.38/security/apparmor/capability_names.h 1970-01-01 08:00:00.000000000 +0800
6899 +++ uksm-2.6.38-zhang/security/apparmor/capability_names.h 2012-01-10 09:30:37.569678996 +0800
6901 +static const char *capability_names[] = {
6903 +[1] = "dac_override",
6904 +[2] = "dac_read_search",
6911 +[9] = "linux_immutable",
6912 +[10] = "net_bind_service",
6913 +[11] = "net_broadcast",
6914 +[12] = "net_admin",
6917 +[15] = "ipc_owner",
6918 +[16] = "sys_module",
6919 +[17] = "sys_rawio",
6920 +[18] = "sys_chroot",
6921 +[19] = "sys_ptrace",
6922 +[20] = "sys_pacct",
6923 +[21] = "sys_admin",
6926 +[24] = "sys_resource",
6928 +[26] = "sys_tty_config",
6931 +[29] = "audit_write",
6932 +[30] = "audit_control",
6934 +[32] = "mac_override",
6935 +[33] = "mac_admin",
6938 diff -urN linux-2.6.38/security/apparmor/rlim_names.h uksm-2.6.38-zhang/security/apparmor/rlim_names.h
6939 --- linux-2.6.38/security/apparmor/rlim_names.h 1970-01-01 08:00:00.000000000 +0800
6940 +++ uksm-2.6.38-zhang/security/apparmor/rlim_names.h 2012-01-10 09:30:41.073012457 +0800
6942 +static const char *rlim_names[] = {
6954 +[11] = "sigpending",
6960 +static const int rlim_map[] = {