This source file includes the following definitions:
- ksm_slab_init
- ksm_slab_free
- is_stable_node_chain
- is_stable_node_dup
- stable_node_chain_add_dup
- __stable_node_dup_del
- stable_node_dup_del
- alloc_rmap_item
- free_rmap_item
- alloc_stable_node
- free_stable_node
- alloc_mm_slot
- free_mm_slot
- get_mm_slot
- insert_to_mm_slots_hash
- ksm_test_exit
- break_ksm
- find_mergeable_vma
- break_cow
- get_mergeable_page
- get_kpfn_nid
- alloc_stable_node_chain
- free_stable_node_chain
- remove_node_from_stable_tree
- get_ksm_page
- remove_rmap_item_from_tree
- remove_trailing_rmap_items
- unmerge_ksm_pages
- page_stable_node
- set_page_stable_node
- remove_stable_node
- remove_stable_node_chain
- remove_all_stable_nodes
- unmerge_and_remove_all_rmap_items
- calc_checksum
- write_protect_page
- replace_page
- try_to_merge_one_page
- try_to_merge_with_ksm_page
- try_to_merge_two_pages
- __is_page_sharing_candidate
- is_page_sharing_candidate
- stable_node_dup
- stable_node_dup_any
- __stable_node_chain
- chain_prune
- chain
- stable_tree_search
- stable_tree_insert
- unstable_tree_search_insert
- stable_tree_append
- cmp_and_merge_page
- get_next_rmap_item
- scan_get_next_rmap_item
- ksm_do_scan
- ksmd_should_run
- ksm_scan_thread
- ksm_madvise
- __ksm_enter
- __ksm_exit
- ksm_might_need_to_copy
- rmap_walk_ksm
- reuse_ksm_page
- ksm_migrate_page
- wait_while_offlining
- stable_node_dup_remove_range
- stable_node_chain_remove_range
- ksm_check_stable_tree
- ksm_memory_callback
- wait_while_offlining
- sleep_millisecs_show
- sleep_millisecs_store
- pages_to_scan_show
- pages_to_scan_store
- run_show
- run_store
- merge_across_nodes_show
- merge_across_nodes_store
- use_zero_pages_show
- use_zero_pages_store
- max_page_sharing_show
- max_page_sharing_store
- pages_shared_show
- pages_sharing_show
- pages_unshared_show
- pages_volatile_show
- stable_node_dups_show
- stable_node_chains_show
- stable_node_chains_prune_millisecs_show
- stable_node_chains_prune_millisecs_store
- full_scans_show
- ksm_init
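
KSM is driven from userspace: a process opts anonymous memory in with madvise(MADV_MERGEABLE) (handled by ksm_madvise() below) and the ksmd scanner is enabled by writing 1 to /sys/kernel/mm/ksm/run. A minimal, illustrative userspace sketch (not part of this file; assumes a kernel built with CONFIG_KSM and ksmd already enabled):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	size_t len = 64 * pg;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0x5a, len);			/* 64 identical pages */
	if (madvise(buf, len, MADV_MERGEABLE))	/* sets VM_MERGEABLE */
		perror("madvise(MADV_MERGEABLE)");
	pause();	/* let ksmd merge; watch pages_sharing in sysfs */
	return 0;
}
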
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 #include <linux/errno.h>
17 #include <linux/mm.h>
18 #include <linux/fs.h>
19 #include <linux/mman.h>
20 #include <linux/sched.h>
21 #include <linux/sched/mm.h>
22 #include <linux/sched/coredump.h>
23 #include <linux/rwsem.h>
24 #include <linux/pagemap.h>
25 #include <linux/rmap.h>
26 #include <linux/spinlock.h>
27 #include <linux/xxhash.h>
28 #include <linux/delay.h>
29 #include <linux/kthread.h>
30 #include <linux/wait.h>
31 #include <linux/slab.h>
32 #include <linux/rbtree.h>
33 #include <linux/memory.h>
34 #include <linux/mmu_notifier.h>
35 #include <linux/swap.h>
36 #include <linux/ksm.h>
37 #include <linux/hashtable.h>
38 #include <linux/freezer.h>
39 #include <linux/oom.h>
40 #include <linux/numa.h>
41
42 #include <asm/tlbflush.h>
43 #include "internal.h"
44
45 #ifdef CONFIG_NUMA
46 #define NUMA(x) (x)
47 #define DO_NUMA(x) do { (x); } while (0)
48 #else
49 #define NUMA(x) (0)
50 #define DO_NUMA(x) do { } while (0)
51 #endif
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120 struct mm_slot {
121 struct hlist_node link;
122 struct list_head mm_list;
123 struct rmap_item *rmap_list;
124 struct mm_struct *mm;
125 };
126
127
128
129
130
131
132
133
134
135
136 struct ksm_scan {
137 struct mm_slot *mm_slot;
138 unsigned long address;
139 struct rmap_item **rmap_list;
140 unsigned long seqnr;
141 };
142
143
144
145
146
147
148
149
150
151
152
153
154
155 struct stable_node {
156 union {
157 struct rb_node node;
158 struct {
159 struct list_head *head;
160 struct {
161 struct hlist_node hlist_dup;
162 struct list_head list;
163 };
164 };
165 };
166 struct hlist_head hlist;
167 union {
168 unsigned long kpfn;
169 unsigned long chain_prune_time;
170 };
171
172
173
174
175
176 #define STABLE_NODE_CHAIN -1024
177 int rmap_hlist_len;
178 #ifdef CONFIG_NUMA
179 int nid;
180 #endif
181 };
182
183
184
185
186
187
188
189
190
191
192
193
194
195 struct rmap_item {
196 struct rmap_item *rmap_list;
197 union {
198 struct anon_vma *anon_vma;
199 #ifdef CONFIG_NUMA
200 int nid;
201 #endif
202 };
203 struct mm_struct *mm;
204 unsigned long address;
205 unsigned int oldchecksum;
206 union {
207 struct rb_node node;
208 struct {
209 struct stable_node *head;
210 struct hlist_node hlist;
211 };
212 };
213 };
214
215 #define SEQNR_MASK 0x0ff
216 #define UNSTABLE_FLAG 0x100
217 #define STABLE_FLAG 0x200
218 #define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
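
The scanner reuses the low bits of rmap_item->address for these flags: the address itself is page aligned, so the offset bits can carry the scan sequence number plus the stable/unstable tree markers, and tree removal simply masks them away with PAGE_MASK. A standalone illustration of that packing (assumes 4 KiB pages; not part of this file):

#include <stdio.h>

#define PAGE_MASK	(~0xfffUL)	/* assumed 4 KiB page size */
#define SEQNR_MASK	0x0ff
#define UNSTABLE_FLAG	0x100

int main(void)
{
	unsigned long va = 0x7f1234567000UL;	/* page-aligned user address */
	unsigned long scan_seqnr = 0x42;

	/* insertion into the unstable tree stamps the current seqnr */
	unsigned long address = va | UNSTABLE_FLAG | (scan_seqnr & SEQNR_MASK);

	/* one full scan later, removal recovers how old the item is */
	scan_seqnr++;
	unsigned char age = (unsigned char)(scan_seqnr - address);

	printf("va=%#lx age=%u\n", address & PAGE_MASK, age);	/* age is 1 */
	return 0;
}
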
219
220
221
222 static struct rb_root one_stable_tree[1] = { RB_ROOT };
223 static struct rb_root one_unstable_tree[1] = { RB_ROOT };
224 static struct rb_root *root_stable_tree = one_stable_tree;
225 static struct rb_root *root_unstable_tree = one_unstable_tree;
226
227
228 static LIST_HEAD(migrate_nodes);
229 #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
230
231 #define MM_SLOTS_HASH_BITS 10
232 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
233
234 static struct mm_slot ksm_mm_head = {
235 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
236 };
237 static struct ksm_scan ksm_scan = {
238 .mm_slot = &ksm_mm_head,
239 };
240
241 static struct kmem_cache *rmap_item_cache;
242 static struct kmem_cache *stable_node_cache;
243 static struct kmem_cache *mm_slot_cache;
244
245
246 static unsigned long ksm_pages_shared;
247
248
249 static unsigned long ksm_pages_sharing;
250
251
252 static unsigned long ksm_pages_unshared;
253
254
255 static unsigned long ksm_rmap_items;
256
257
258 static unsigned long ksm_stable_node_chains;
259
260
261 static unsigned long ksm_stable_node_dups;
262
263
264 static int ksm_stable_node_chains_prune_millisecs = 2000;
265
266
267 static int ksm_max_page_sharing = 256;
268
269
270 static unsigned int ksm_thread_pages_to_scan = 100;
271
272
273 static unsigned int ksm_thread_sleep_millisecs = 20;
274
275
276 static unsigned int zero_checksum __read_mostly;
277
278
279 static bool ksm_use_zero_pages __read_mostly;
280
281 #ifdef CONFIG_NUMA
282
283 static unsigned int ksm_merge_across_nodes = 1;
284 static int ksm_nr_node_ids = 1;
285 #else
286 #define ksm_merge_across_nodes 1U
287 #define ksm_nr_node_ids 1
288 #endif
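
All of the counters and tunables above are exported through /sys/kernel/mm/ksm/ by the sysfs attribute handlers at the end of this file. A small monitoring sketch, assuming a CONFIG_SYSFS kernel (illustrative, not part of this file):

#include <stdio.h>

static unsigned long ksm_read(const char *name)
{
	char path[128];
	unsigned long val = 0;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%lu", &val) != 1)
			val = 0;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("pages_shared=%lu pages_sharing=%lu pages_unshared=%lu\n",
	       ksm_read("pages_shared"), ksm_read("pages_sharing"),
	       ksm_read("pages_unshared"));
	return 0;
}
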
289
290 #define KSM_RUN_STOP 0
291 #define KSM_RUN_MERGE 1
292 #define KSM_RUN_UNMERGE 2
293 #define KSM_RUN_OFFLINE 4
294 static unsigned long ksm_run = KSM_RUN_STOP;
295 static void wait_while_offlining(void);
296
297 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
298 static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
299 static DEFINE_MUTEX(ksm_thread_mutex);
300 static DEFINE_SPINLOCK(ksm_mmlist_lock);
301
302 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
303 sizeof(struct __struct), __alignof__(struct __struct),\
304 (__flags), NULL)
305
306 static int __init ksm_slab_init(void)
307 {
308 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
309 if (!rmap_item_cache)
310 goto out;
311
312 stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
313 if (!stable_node_cache)
314 goto out_free1;
315
316 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
317 if (!mm_slot_cache)
318 goto out_free2;
319
320 return 0;
321
322 out_free2:
323 kmem_cache_destroy(stable_node_cache);
324 out_free1:
325 kmem_cache_destroy(rmap_item_cache);
326 out:
327 return -ENOMEM;
328 }
329
330 static void __init ksm_slab_free(void)
331 {
332 kmem_cache_destroy(mm_slot_cache);
333 kmem_cache_destroy(stable_node_cache);
334 kmem_cache_destroy(rmap_item_cache);
335 mm_slot_cache = NULL;
336 }
337
338 static __always_inline bool is_stable_node_chain(struct stable_node *chain)
339 {
340 return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
341 }
342
343 static __always_inline bool is_stable_node_dup(struct stable_node *dup)
344 {
345 return dup->head == STABLE_NODE_DUP_HEAD;
346 }
347
348 static inline void stable_node_chain_add_dup(struct stable_node *dup,
349 struct stable_node *chain)
350 {
351 VM_BUG_ON(is_stable_node_dup(dup));
352 dup->head = STABLE_NODE_DUP_HEAD;
353 VM_BUG_ON(!is_stable_node_chain(chain));
354 hlist_add_head(&dup->hlist_dup, &chain->hlist);
355 ksm_stable_node_dups++;
356 }
357
358 static inline void __stable_node_dup_del(struct stable_node *dup)
359 {
360 VM_BUG_ON(!is_stable_node_dup(dup));
361 hlist_del(&dup->hlist_dup);
362 ksm_stable_node_dups--;
363 }
364
365 static inline void stable_node_dup_del(struct stable_node *dup)
366 {
367 VM_BUG_ON(is_stable_node_chain(dup));
368 if (is_stable_node_dup(dup))
369 __stable_node_dup_del(dup);
370 else
371 rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
372 #ifdef CONFIG_DEBUG_VM
373 dup->head = NULL;
374 #endif
375 }
376
377 static inline struct rmap_item *alloc_rmap_item(void)
378 {
379 struct rmap_item *rmap_item;
380
381 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
382 __GFP_NORETRY | __GFP_NOWARN);
383 if (rmap_item)
384 ksm_rmap_items++;
385 return rmap_item;
386 }
387
388 static inline void free_rmap_item(struct rmap_item *rmap_item)
389 {
390 ksm_rmap_items--;
391 rmap_item->mm = NULL;
392 kmem_cache_free(rmap_item_cache, rmap_item);
393 }
394
395 static inline struct stable_node *alloc_stable_node(void)
396 {
397
398
399
400
401
402 return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
403 }
404
405 static inline void free_stable_node(struct stable_node *stable_node)
406 {
407 VM_BUG_ON(stable_node->rmap_hlist_len &&
408 !is_stable_node_chain(stable_node));
409 kmem_cache_free(stable_node_cache, stable_node);
410 }
411
412 static inline struct mm_slot *alloc_mm_slot(void)
413 {
414 if (!mm_slot_cache)
415 return NULL;
416 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
417 }
418
419 static inline void free_mm_slot(struct mm_slot *mm_slot)
420 {
421 kmem_cache_free(mm_slot_cache, mm_slot);
422 }
423
424 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
425 {
426 struct mm_slot *slot;
427
428 hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
429 if (slot->mm == mm)
430 return slot;
431
432 return NULL;
433 }
434
435 static void insert_to_mm_slots_hash(struct mm_struct *mm,
436 struct mm_slot *mm_slot)
437 {
438 mm_slot->mm = mm;
439 hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
440 }
441
442
443
444
445
446
447
448
449
450 static inline bool ksm_test_exit(struct mm_struct *mm)
451 {
452 return atomic_read(&mm->mm_users) == 0;
453 }
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470 static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
471 {
472 struct page *page;
473 vm_fault_t ret = 0;
474
475 do {
476 cond_resched();
477 page = follow_page(vma, addr,
478 FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
479 if (IS_ERR_OR_NULL(page))
480 break;
481 if (PageKsm(page))
482 ret = handle_mm_fault(vma, addr,
483 FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
484 else
485 ret = VM_FAULT_WRITE;
486 put_page(page);
487 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
517 }
518
519 static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
520 unsigned long addr)
521 {
522 struct vm_area_struct *vma;
523 if (ksm_test_exit(mm))
524 return NULL;
525 vma = find_vma(mm, addr);
526 if (!vma || vma->vm_start > addr)
527 return NULL;
528 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
529 return NULL;
530 return vma;
531 }
532
533 static void break_cow(struct rmap_item *rmap_item)
534 {
535 struct mm_struct *mm = rmap_item->mm;
536 unsigned long addr = rmap_item->address;
537 struct vm_area_struct *vma;
538
539
540
541
542
543 put_anon_vma(rmap_item->anon_vma);
544
545 down_read(&mm->mmap_sem);
546 vma = find_mergeable_vma(mm, addr);
547 if (vma)
548 break_ksm(vma, addr);
549 up_read(&mm->mmap_sem);
550 }
551
552 static struct page *get_mergeable_page(struct rmap_item *rmap_item)
553 {
554 struct mm_struct *mm = rmap_item->mm;
555 unsigned long addr = rmap_item->address;
556 struct vm_area_struct *vma;
557 struct page *page;
558
559 down_read(&mm->mmap_sem);
560 vma = find_mergeable_vma(mm, addr);
561 if (!vma)
562 goto out;
563
564 page = follow_page(vma, addr, FOLL_GET);
565 if (IS_ERR_OR_NULL(page))
566 goto out;
567 if (PageAnon(page)) {
568 flush_anon_page(vma, page, addr);
569 flush_dcache_page(page);
570 } else {
571 put_page(page);
572 out:
573 page = NULL;
574 }
575 up_read(&mm->mmap_sem);
576 return page;
577 }
578
579
580
581
582
583
584
585 static inline int get_kpfn_nid(unsigned long kpfn)
586 {
587 return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
588 }
589
590 static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
591 struct rb_root *root)
592 {
593 struct stable_node *chain = alloc_stable_node();
594 VM_BUG_ON(is_stable_node_chain(dup));
595 if (likely(chain)) {
596 INIT_HLIST_HEAD(&chain->hlist);
597 chain->chain_prune_time = jiffies;
598 chain->rmap_hlist_len = STABLE_NODE_CHAIN;
599 #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
600 chain->nid = NUMA_NO_NODE;
601 #endif
602 ksm_stable_node_chains++;
603
604
605
606
607
608
609 rb_replace_node(&dup->node, &chain->node, root);
610
611
612
613
614
615
616
617
618 stable_node_chain_add_dup(dup, chain);
619 }
620 return chain;
621 }
622
623 static inline void free_stable_node_chain(struct stable_node *chain,
624 struct rb_root *root)
625 {
626 rb_erase(&chain->node, root);
627 free_stable_node(chain);
628 ksm_stable_node_chains--;
629 }
630
631 static void remove_node_from_stable_tree(struct stable_node *stable_node)
632 {
633 struct rmap_item *rmap_item;
634
635
636 BUG_ON(stable_node->rmap_hlist_len < 0);
637
638 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
639 if (rmap_item->hlist.next)
640 ksm_pages_sharing--;
641 else
642 ksm_pages_shared--;
643 VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
644 stable_node->rmap_hlist_len--;
645 put_anon_vma(rmap_item->anon_vma);
646 rmap_item->address &= PAGE_MASK;
647 cond_resched();
648 }
649
650
651
652
653
654
655
656
657 #if defined(GCC_VERSION) && GCC_VERSION >= 40903
658 BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
659 BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
660 #endif
661
662 if (stable_node->head == &migrate_nodes)
663 list_del(&stable_node->list);
664 else
665 stable_node_dup_del(stable_node);
666 free_stable_node(stable_node);
667 }
668
669 enum get_ksm_page_flags {
670 GET_KSM_PAGE_NOLOCK,
671 GET_KSM_PAGE_LOCK,
672 GET_KSM_PAGE_TRYLOCK
673 };
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694 static struct page *get_ksm_page(struct stable_node *stable_node,
695 enum get_ksm_page_flags flags)
696 {
697 struct page *page;
698 void *expected_mapping;
699 unsigned long kpfn;
700
701 expected_mapping = (void *)((unsigned long)stable_node |
702 PAGE_MAPPING_KSM);
703 again:
704 kpfn = READ_ONCE(stable_node->kpfn);
705 page = pfn_to_page(kpfn);
706 if (READ_ONCE(page->mapping) != expected_mapping)
707 goto stale;
708
709
710
711
712
713
714
715
716
717
718
719 while (!get_page_unless_zero(page)) {
720
721
722
723
724
725
726
727
728 if (!PageSwapCache(page))
729 goto stale;
730 cpu_relax();
731 }
732
733 if (READ_ONCE(page->mapping) != expected_mapping) {
734 put_page(page);
735 goto stale;
736 }
737
738 if (flags == GET_KSM_PAGE_TRYLOCK) {
739 if (!trylock_page(page)) {
740 put_page(page);
741 return ERR_PTR(-EBUSY);
742 }
743 } else if (flags == GET_KSM_PAGE_LOCK)
744 lock_page(page);
745
746 if (flags != GET_KSM_PAGE_NOLOCK) {
747 if (READ_ONCE(page->mapping) != expected_mapping) {
748 unlock_page(page);
749 put_page(page);
750 goto stale;
751 }
752 }
753 return page;
754
755 stale:
756
757
758
759
760
761
762 smp_rmb();
763 if (READ_ONCE(stable_node->kpfn) != kpfn)
764 goto again;
765 remove_node_from_stable_tree(stable_node);
766 return NULL;
767 }
768
769
770
771
772
773 static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
774 {
775 if (rmap_item->address & STABLE_FLAG) {
776 struct stable_node *stable_node;
777 struct page *page;
778
779 stable_node = rmap_item->head;
780 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
781 if (!page)
782 goto out;
783
784 hlist_del(&rmap_item->hlist);
785 unlock_page(page);
786 put_page(page);
787
788 if (!hlist_empty(&stable_node->hlist))
789 ksm_pages_sharing--;
790 else
791 ksm_pages_shared--;
792 VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
793 stable_node->rmap_hlist_len--;
794
795 put_anon_vma(rmap_item->anon_vma);
796 rmap_item->address &= PAGE_MASK;
797
798 } else if (rmap_item->address & UNSTABLE_FLAG) {
799 unsigned char age;
800
801
802
803
804
805
806
807 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
808 BUG_ON(age > 1);
809 if (!age)
810 rb_erase(&rmap_item->node,
811 root_unstable_tree + NUMA(rmap_item->nid));
812 ksm_pages_unshared--;
813 rmap_item->address &= PAGE_MASK;
814 }
815 out:
816 cond_resched();
817 }
818
819 static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
820 struct rmap_item **rmap_list)
821 {
822 while (*rmap_list) {
823 struct rmap_item *rmap_item = *rmap_list;
824 *rmap_list = rmap_item->rmap_list;
825 remove_rmap_item_from_tree(rmap_item);
826 free_rmap_item(rmap_item);
827 }
828 }
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843 static int unmerge_ksm_pages(struct vm_area_struct *vma,
844 unsigned long start, unsigned long end)
845 {
846 unsigned long addr;
847 int err = 0;
848
849 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
850 if (ksm_test_exit(vma->vm_mm))
851 break;
852 if (signal_pending(current))
853 err = -ERESTARTSYS;
854 else
855 err = break_ksm(vma, addr);
856 }
857 return err;
858 }
859
860 static inline struct stable_node *page_stable_node(struct page *page)
861 {
862 return PageKsm(page) ? page_rmapping(page) : NULL;
863 }
864
865 static inline void set_page_stable_node(struct page *page,
866 struct stable_node *stable_node)
867 {
868 page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
869 }
870
871 #ifdef CONFIG_SYSFS
872
873
874
875 static int remove_stable_node(struct stable_node *stable_node)
876 {
877 struct page *page;
878 int err;
879
880 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
881 if (!page) {
882
883
884
885 return 0;
886 }
887
888
889
890
891
892
893 err = -EBUSY;
894 if (!page_mapped(page)) {
895
896
897
898
899
900
901
902
903 set_page_stable_node(page, NULL);
904 remove_node_from_stable_tree(stable_node);
905 err = 0;
906 }
907
908 unlock_page(page);
909 put_page(page);
910 return err;
911 }
912
913 static int remove_stable_node_chain(struct stable_node *stable_node,
914 struct rb_root *root)
915 {
916 struct stable_node *dup;
917 struct hlist_node *hlist_safe;
918
919 if (!is_stable_node_chain(stable_node)) {
920 VM_BUG_ON(is_stable_node_dup(stable_node));
921 if (remove_stable_node(stable_node))
922 return true;
923 else
924 return false;
925 }
926
927 hlist_for_each_entry_safe(dup, hlist_safe,
928 &stable_node->hlist, hlist_dup) {
929 VM_BUG_ON(!is_stable_node_dup(dup));
930 if (remove_stable_node(dup))
931 return true;
932 }
933 BUG_ON(!hlist_empty(&stable_node->hlist));
934 free_stable_node_chain(stable_node, root);
935 return false;
936 }
937
938 static int remove_all_stable_nodes(void)
939 {
940 struct stable_node *stable_node, *next;
941 int nid;
942 int err = 0;
943
944 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
945 while (root_stable_tree[nid].rb_node) {
946 stable_node = rb_entry(root_stable_tree[nid].rb_node,
947 struct stable_node, node);
948 if (remove_stable_node_chain(stable_node,
949 root_stable_tree + nid)) {
950 err = -EBUSY;
951 break;
952 }
953 cond_resched();
954 }
955 }
956 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
957 if (remove_stable_node(stable_node))
958 err = -EBUSY;
959 cond_resched();
960 }
961 return err;
962 }
963
964 static int unmerge_and_remove_all_rmap_items(void)
965 {
966 struct mm_slot *mm_slot;
967 struct mm_struct *mm;
968 struct vm_area_struct *vma;
969 int err = 0;
970
971 spin_lock(&ksm_mmlist_lock);
972 ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
973 struct mm_slot, mm_list);
974 spin_unlock(&ksm_mmlist_lock);
975
976 for (mm_slot = ksm_scan.mm_slot;
977 mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
978 mm = mm_slot->mm;
979 down_read(&mm->mmap_sem);
980 for (vma = mm->mmap; vma; vma = vma->vm_next) {
981 if (ksm_test_exit(mm))
982 break;
983 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
984 continue;
985 err = unmerge_ksm_pages(vma,
986 vma->vm_start, vma->vm_end);
987 if (err)
988 goto error;
989 }
990
991 remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
992 up_read(&mm->mmap_sem);
993
994 spin_lock(&ksm_mmlist_lock);
995 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
996 struct mm_slot, mm_list);
997 if (ksm_test_exit(mm)) {
998 hash_del(&mm_slot->link);
999 list_del(&mm_slot->mm_list);
1000 spin_unlock(&ksm_mmlist_lock);
1001
1002 free_mm_slot(mm_slot);
1003 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1004 mmdrop(mm);
1005 } else
1006 spin_unlock(&ksm_mmlist_lock);
1007 }
1008
1009
1010 remove_all_stable_nodes();
1011 ksm_scan.seqnr = 0;
1012 return 0;
1013
1014 error:
1015 up_read(&mm->mmap_sem);
1016 spin_lock(&ksm_mmlist_lock);
1017 ksm_scan.mm_slot = &ksm_mm_head;
1018 spin_unlock(&ksm_mmlist_lock);
1019 return err;
1020 }
1021 #endif
1022
1023 static u32 calc_checksum(struct page *page)
1024 {
1025 u32 checksum;
1026 void *addr = kmap_atomic(page);
1027 checksum = xxhash(addr, PAGE_SIZE, 0);
1028 kunmap_atomic(addr);
1029 return checksum;
1030 }
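
calc_checksum() hashes the whole page with xxhash; cmp_and_merge_page() only considers a page for the unstable tree once this checksum is unchanged between two scans, which filters out pages that are still being written to. A userspace sketch of the same two-pass check, assuming the xxHash library (XXH32()) is installed; the kernel side uses the in-tree xxhash with a zero seed:

#include <stdint.h>
#include <stdio.h>
#include <xxhash.h>		/* libxxhash, assumed installed */

#define PAGE_SIZE 4096

int main(void)
{
	unsigned char page[PAGE_SIZE] = { 0 };
	uint32_t old_sum, new_sum;

	old_sum = XXH32(page, PAGE_SIZE, 0);	/* first scan */
	page[10] = 0xff;			/* page written in between */
	new_sum = XXH32(page, PAGE_SIZE, 0);	/* next scan */

	if (new_sum != old_sum)
		printf("page is volatile, skip merging this round\n");
	else
		printf("checksum stable, candidate for the unstable tree\n");
	return 0;
}
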
1031
1032 static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1033 pte_t *orig_pte)
1034 {
1035 struct mm_struct *mm = vma->vm_mm;
1036 struct page_vma_mapped_walk pvmw = {
1037 .page = page,
1038 .vma = vma,
1039 };
1040 int swapped;
1041 int err = -EFAULT;
1042 struct mmu_notifier_range range;
1043
1044 pvmw.address = page_address_in_vma(page, vma);
1045 if (pvmw.address == -EFAULT)
1046 goto out;
1047
1048 BUG_ON(PageTransCompound(page));
1049
1050 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
1051 pvmw.address,
1052 pvmw.address + PAGE_SIZE);
1053 mmu_notifier_invalidate_range_start(&range);
1054
1055 if (!page_vma_mapped_walk(&pvmw))
1056 goto out_mn;
1057 if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
1058 goto out_unlock;
1059
1060 if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
1061 (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
1062 mm_tlb_flush_pending(mm)) {
1063 pte_t entry;
1064
1065 swapped = PageSwapCache(page);
1066 flush_cache_page(vma, pvmw.address, page_to_pfn(page));
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081 entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
1082
1083
1084
1085
1086 if (page_mapcount(page) + 1 + swapped != page_count(page)) {
1087 set_pte_at(mm, pvmw.address, pvmw.pte, entry);
1088 goto out_unlock;
1089 }
1090 if (pte_dirty(entry))
1091 set_page_dirty(page);
1092
1093 if (pte_protnone(entry))
1094 entry = pte_mkclean(pte_clear_savedwrite(entry));
1095 else
1096 entry = pte_mkclean(pte_wrprotect(entry));
1097 set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
1098 }
1099 *orig_pte = *pvmw.pte;
1100 err = 0;
1101
1102 out_unlock:
1103 page_vma_mapped_walk_done(&pvmw);
1104 out_mn:
1105 mmu_notifier_invalidate_range_end(&range);
1106 out:
1107 return err;
1108 }
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119 static int replace_page(struct vm_area_struct *vma, struct page *page,
1120 struct page *kpage, pte_t orig_pte)
1121 {
1122 struct mm_struct *mm = vma->vm_mm;
1123 pmd_t *pmd;
1124 pte_t *ptep;
1125 pte_t newpte;
1126 spinlock_t *ptl;
1127 unsigned long addr;
1128 int err = -EFAULT;
1129 struct mmu_notifier_range range;
1130
1131 addr = page_address_in_vma(page, vma);
1132 if (addr == -EFAULT)
1133 goto out;
1134
1135 pmd = mm_find_pmd(mm, addr);
1136 if (!pmd)
1137 goto out;
1138
1139 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
1140 addr + PAGE_SIZE);
1141 mmu_notifier_invalidate_range_start(&range);
1142
1143 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
1144 if (!pte_same(*ptep, orig_pte)) {
1145 pte_unmap_unlock(ptep, ptl);
1146 goto out_mn;
1147 }
1148
1149
1150
1151
1152
1153 if (!is_zero_pfn(page_to_pfn(kpage))) {
1154 get_page(kpage);
1155 page_add_anon_rmap(kpage, vma, addr, false);
1156 newpte = mk_pte(kpage, vma->vm_page_prot);
1157 } else {
1158 newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
1159 vma->vm_page_prot));
1160
1161
1162
1163
1164
1165
1166 dec_mm_counter(mm, MM_ANONPAGES);
1167 }
1168
1169 flush_cache_page(vma, addr, pte_pfn(*ptep));
1170
1171
1172
1173
1174
1175
1176 ptep_clear_flush(vma, addr, ptep);
1177 set_pte_at_notify(mm, addr, ptep, newpte);
1178
1179 page_remove_rmap(page, false);
1180 if (!page_mapped(page))
1181 try_to_free_swap(page);
1182 put_page(page);
1183
1184 pte_unmap_unlock(ptep, ptl);
1185 err = 0;
1186 out_mn:
1187 mmu_notifier_invalidate_range_end(&range);
1188 out:
1189 return err;
1190 }
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201 static int try_to_merge_one_page(struct vm_area_struct *vma,
1202 struct page *page, struct page *kpage)
1203 {
1204 pte_t orig_pte = __pte(0);
1205 int err = -EFAULT;
1206
1207 if (page == kpage)
1208 return 0;
1209
1210 if (!PageAnon(page))
1211 goto out;
1212
1213
1214
1215
1216
1217
1218
1219
1220 if (!trylock_page(page))
1221 goto out;
1222
1223 if (PageTransCompound(page)) {
1224 if (split_huge_page(page))
1225 goto out_unlock;
1226 }
1227
1228
1229
1230
1231
1232
1233
1234 if (write_protect_page(vma, page, &orig_pte) == 0) {
1235 if (!kpage) {
1236
1237
1238
1239
1240
1241 set_page_stable_node(page, NULL);
1242 mark_page_accessed(page);
1243
1244
1245
1246
1247 if (!PageDirty(page))
1248 SetPageDirty(page);
1249 err = 0;
1250 } else if (pages_identical(page, kpage))
1251 err = replace_page(vma, page, kpage, orig_pte);
1252 }
1253
1254 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
1255 munlock_vma_page(page);
1256 if (!PageMlocked(kpage)) {
1257 unlock_page(page);
1258 lock_page(kpage);
1259 mlock_vma_page(kpage);
1260 page = kpage;
1261 }
1262 }
1263
1264 out_unlock:
1265 unlock_page(page);
1266 out:
1267 return err;
1268 }
1269
1270
1271
1272
1273
1274
1275
1276 static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
1277 struct page *page, struct page *kpage)
1278 {
1279 struct mm_struct *mm = rmap_item->mm;
1280 struct vm_area_struct *vma;
1281 int err = -EFAULT;
1282
1283 down_read(&mm->mmap_sem);
1284 vma = find_mergeable_vma(mm, rmap_item->address);
1285 if (!vma)
1286 goto out;
1287
1288 err = try_to_merge_one_page(vma, page, kpage);
1289 if (err)
1290 goto out;
1291
1292
1293 remove_rmap_item_from_tree(rmap_item);
1294
1295
1296 rmap_item->anon_vma = vma->anon_vma;
1297 get_anon_vma(vma->anon_vma);
1298 out:
1299 up_read(&mm->mmap_sem);
1300 return err;
1301 }
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313 static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
1314 struct page *page,
1315 struct rmap_item *tree_rmap_item,
1316 struct page *tree_page)
1317 {
1318 int err;
1319
1320 err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
1321 if (!err) {
1322 err = try_to_merge_with_ksm_page(tree_rmap_item,
1323 tree_page, page);
1324
1325
1326
1327
1328 if (err)
1329 break_cow(rmap_item);
1330 }
1331 return err ? NULL : page;
1332 }
1333
1334 static __always_inline
1335 bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
1336 {
1337 VM_BUG_ON(stable_node->rmap_hlist_len < 0);
1338
1339
1340
1341
1342
1343
1344 return stable_node->rmap_hlist_len &&
1345 stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
1346 }
1347
1348 static __always_inline
1349 bool is_page_sharing_candidate(struct stable_node *stable_node)
1350 {
1351 return __is_page_sharing_candidate(stable_node, 0);
1352 }
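
In other words, a dup is only a sharing candidate while it still has live mappings and adding "offset" more rmap_items would stay below ksm_max_page_sharing. A trivial standalone restatement of the predicate (illustrative; 256 is the default limit declared earlier in this file):

#include <stdbool.h>
#include <stdio.h>

static int ksm_max_page_sharing = 256;		/* default, tunable via sysfs */

static bool is_candidate(int rmap_hlist_len, int offset)
{
	return rmap_hlist_len &&
	       rmap_hlist_len + offset < ksm_max_page_sharing;
}

int main(void)
{
	printf("%d\n", is_candidate(255, 0));	/* 1: room for one more mapping */
	printf("%d\n", is_candidate(255, 1));	/* 0: would exceed the limit */
	printf("%d\n", is_candidate(0, 0));	/* 0: no mappings left at all */
	return 0;
}
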
1353
1354 static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
1355 struct stable_node **_stable_node,
1356 struct rb_root *root,
1357 bool prune_stale_stable_nodes)
1358 {
1359 struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
1360 struct hlist_node *hlist_safe;
1361 struct page *_tree_page, *tree_page = NULL;
1362 int nr = 0;
1363 int found_rmap_hlist_len;
1364
1365 if (!prune_stale_stable_nodes ||
1366 time_before(jiffies, stable_node->chain_prune_time +
1367 msecs_to_jiffies(
1368 ksm_stable_node_chains_prune_millisecs)))
1369 prune_stale_stable_nodes = false;
1370 else
1371 stable_node->chain_prune_time = jiffies;
1372
1373 hlist_for_each_entry_safe(dup, hlist_safe,
1374 &stable_node->hlist, hlist_dup) {
1375 cond_resched();
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386 _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
1387 if (!_tree_page)
1388 continue;
1389 nr += 1;
1390 if (is_page_sharing_candidate(dup)) {
1391 if (!found ||
1392 dup->rmap_hlist_len > found_rmap_hlist_len) {
1393 if (found)
1394 put_page(tree_page);
1395 found = dup;
1396 found_rmap_hlist_len = found->rmap_hlist_len;
1397 tree_page = _tree_page;
1398
1399
1400 if (!prune_stale_stable_nodes)
1401 break;
1402 continue;
1403 }
1404 }
1405 put_page(_tree_page);
1406 }
1407
1408 if (found) {
1409
1410
1411
1412
1413
1414
1415 if (prune_stale_stable_nodes && nr == 1) {
1416
1417
1418
1419
1420
1421
1422 BUG_ON(stable_node->hlist.first->next);
1423
1424
1425
1426
1427
1428 rb_replace_node(&stable_node->node, &found->node,
1429 root);
1430 free_stable_node(stable_node);
1431 ksm_stable_node_chains--;
1432 ksm_stable_node_dups--;
1433
1434
1435
1436
1437
1438 *_stable_node = found;
1439
1440
1441
1442
1443
1444
1445 stable_node = NULL;
1446 } else if (stable_node->hlist.first != &found->hlist_dup &&
1447 __is_page_sharing_candidate(found, 1)) {
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463 hlist_del(&found->hlist_dup);
1464 hlist_add_head(&found->hlist_dup,
1465 &stable_node->hlist);
1466 }
1467 }
1468
1469 *_stable_node_dup = found;
1470 return tree_page;
1471 }
1472
1473 static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
1474 struct rb_root *root)
1475 {
1476 if (!is_stable_node_chain(stable_node))
1477 return stable_node;
1478 if (hlist_empty(&stable_node->hlist)) {
1479 free_stable_node_chain(stable_node, root);
1480 return NULL;
1481 }
1482 return hlist_entry(stable_node->hlist.first,
1483 typeof(*stable_node), hlist_dup);
1484 }
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500 static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
1501 struct stable_node **_stable_node,
1502 struct rb_root *root,
1503 bool prune_stale_stable_nodes)
1504 {
1505 struct stable_node *stable_node = *_stable_node;
1506 if (!is_stable_node_chain(stable_node)) {
1507 if (is_page_sharing_candidate(stable_node)) {
1508 *_stable_node_dup = stable_node;
1509 return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
1510 }
1511
1512
1513
1514
1515 *_stable_node_dup = NULL;
1516 return NULL;
1517 }
1518 return stable_node_dup(_stable_node_dup, _stable_node, root,
1519 prune_stale_stable_nodes);
1520 }
1521
1522 static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
1523 struct stable_node **s_n,
1524 struct rb_root *root)
1525 {
1526 return __stable_node_chain(s_n_d, s_n, root, true);
1527 }
1528
1529 static __always_inline struct page *chain(struct stable_node **s_n_d,
1530 struct stable_node *s_n,
1531 struct rb_root *root)
1532 {
1533 struct stable_node *old_stable_node = s_n;
1534 struct page *tree_page;
1535
1536 tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
1537
1538 VM_BUG_ON(s_n != old_stable_node);
1539 return tree_page;
1540 }
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551 static struct page *stable_tree_search(struct page *page)
1552 {
1553 int nid;
1554 struct rb_root *root;
1555 struct rb_node **new;
1556 struct rb_node *parent;
1557 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1558 struct stable_node *page_node;
1559
1560 page_node = page_stable_node(page);
1561 if (page_node && page_node->head != &migrate_nodes) {
1562
1563 get_page(page);
1564 return page;
1565 }
1566
1567 nid = get_kpfn_nid(page_to_pfn(page));
1568 root = root_stable_tree + nid;
1569 again:
1570 new = &root->rb_node;
1571 parent = NULL;
1572
1573 while (*new) {
1574 struct page *tree_page;
1575 int ret;
1576
1577 cond_resched();
1578 stable_node = rb_entry(*new, struct stable_node, node);
1579 stable_node_any = NULL;
1580 tree_page = chain_prune(&stable_node_dup, &stable_node, root);
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593 if (!stable_node_dup) {
1594
1595
1596
1597
1598
1599 stable_node_any = stable_node_dup_any(stable_node,
1600 root);
1601 if (!stable_node_any) {
1602
1603 goto again;
1604 }
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614 tree_page = get_ksm_page(stable_node_any,
1615 GET_KSM_PAGE_NOLOCK);
1616 }
1617 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1618 if (!tree_page) {
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628 goto again;
1629 }
1630
1631 ret = memcmp_pages(page, tree_page);
1632 put_page(tree_page);
1633
1634 parent = *new;
1635 if (ret < 0)
1636 new = &parent->rb_left;
1637 else if (ret > 0)
1638 new = &parent->rb_right;
1639 else {
1640 if (page_node) {
1641 VM_BUG_ON(page_node->head != &migrate_nodes);
1642
1643
1644
1645
1646
1647
1648 if (page_mapcount(page) > 1)
1649 goto chain_append;
1650 }
1651
1652 if (!stable_node_dup) {
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665 return NULL;
1666 }
1667
1668
1669
1670
1671
1672
1673
1674
1675 tree_page = get_ksm_page(stable_node_dup,
1676 GET_KSM_PAGE_TRYLOCK);
1677
1678 if (PTR_ERR(tree_page) == -EBUSY)
1679 return ERR_PTR(-EBUSY);
1680
1681 if (unlikely(!tree_page))
1682
1683
1684
1685
1686 goto again;
1687 unlock_page(tree_page);
1688
1689 if (get_kpfn_nid(stable_node_dup->kpfn) !=
1690 NUMA(stable_node_dup->nid)) {
1691 put_page(tree_page);
1692 goto replace;
1693 }
1694 return tree_page;
1695 }
1696 }
1697
1698 if (!page_node)
1699 return NULL;
1700
1701 list_del(&page_node->list);
1702 DO_NUMA(page_node->nid = nid);
1703 rb_link_node(&page_node->node, parent, new);
1704 rb_insert_color(&page_node->node, root);
1705 out:
1706 if (is_page_sharing_candidate(page_node)) {
1707 get_page(page);
1708 return page;
1709 } else
1710 return NULL;
1711
1712 replace:
1713
1714
1715
1716
1717
1718
1719
1720
1721 if (stable_node_dup == stable_node) {
1722 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
1723 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1724
1725 if (page_node) {
1726 VM_BUG_ON(page_node->head != &migrate_nodes);
1727 list_del(&page_node->list);
1728 DO_NUMA(page_node->nid = nid);
1729 rb_replace_node(&stable_node_dup->node,
1730 &page_node->node,
1731 root);
1732 if (is_page_sharing_candidate(page_node))
1733 get_page(page);
1734 else
1735 page = NULL;
1736 } else {
1737 rb_erase(&stable_node_dup->node, root);
1738 page = NULL;
1739 }
1740 } else {
1741 VM_BUG_ON(!is_stable_node_chain(stable_node));
1742 __stable_node_dup_del(stable_node_dup);
1743 if (page_node) {
1744 VM_BUG_ON(page_node->head != &migrate_nodes);
1745 list_del(&page_node->list);
1746 DO_NUMA(page_node->nid = nid);
1747 stable_node_chain_add_dup(page_node, stable_node);
1748 if (is_page_sharing_candidate(page_node))
1749 get_page(page);
1750 else
1751 page = NULL;
1752 } else {
1753 page = NULL;
1754 }
1755 }
1756 stable_node_dup->head = &migrate_nodes;
1757 list_add(&stable_node_dup->list, stable_node_dup->head);
1758 return page;
1759
1760 chain_append:
1761
1762 if (!stable_node_dup)
1763 stable_node_dup = stable_node_any;
1764
1765
1766
1767
1768
1769
1770
1771
1772 if (stable_node_dup == stable_node) {
1773 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
1774 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1775
1776 stable_node = alloc_stable_node_chain(stable_node_dup,
1777 root);
1778 if (!stable_node)
1779 return NULL;
1780 }
1781
1782
1783
1784
1785
1786
1787 VM_BUG_ON(!is_stable_node_chain(stable_node));
1788 VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
1789 VM_BUG_ON(page_node->head != &migrate_nodes);
1790 list_del(&page_node->list);
1791 DO_NUMA(page_node->nid = nid);
1792 stable_node_chain_add_dup(page_node, stable_node);
1793 goto out;
1794 }
1795
1796
1797
1798
1799
1800
1801
1802
1803 static struct stable_node *stable_tree_insert(struct page *kpage)
1804 {
1805 int nid;
1806 unsigned long kpfn;
1807 struct rb_root *root;
1808 struct rb_node **new;
1809 struct rb_node *parent;
1810 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1811 bool need_chain = false;
1812
1813 kpfn = page_to_pfn(kpage);
1814 nid = get_kpfn_nid(kpfn);
1815 root = root_stable_tree + nid;
1816 again:
1817 parent = NULL;
1818 new = &root->rb_node;
1819
1820 while (*new) {
1821 struct page *tree_page;
1822 int ret;
1823
1824 cond_resched();
1825 stable_node = rb_entry(*new, struct stable_node, node);
1826 stable_node_any = NULL;
1827 tree_page = chain(&stable_node_dup, stable_node, root);
1828 if (!stable_node_dup) {
1829
1830
1831
1832
1833
1834 stable_node_any = stable_node_dup_any(stable_node,
1835 root);
1836 if (!stable_node_any) {
1837
1838 goto again;
1839 }
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849 tree_page = get_ksm_page(stable_node_any,
1850 GET_KSM_PAGE_NOLOCK);
1851 }
1852 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1853 if (!tree_page) {
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863 goto again;
1864 }
1865
1866 ret = memcmp_pages(kpage, tree_page);
1867 put_page(tree_page);
1868
1869 parent = *new;
1870 if (ret < 0)
1871 new = &parent->rb_left;
1872 else if (ret > 0)
1873 new = &parent->rb_right;
1874 else {
1875 need_chain = true;
1876 break;
1877 }
1878 }
1879
1880 stable_node_dup = alloc_stable_node();
1881 if (!stable_node_dup)
1882 return NULL;
1883
1884 INIT_HLIST_HEAD(&stable_node_dup->hlist);
1885 stable_node_dup->kpfn = kpfn;
1886 set_page_stable_node(kpage, stable_node_dup);
1887 stable_node_dup->rmap_hlist_len = 0;
1888 DO_NUMA(stable_node_dup->nid = nid);
1889 if (!need_chain) {
1890 rb_link_node(&stable_node_dup->node, parent, new);
1891 rb_insert_color(&stable_node_dup->node, root);
1892 } else {
1893 if (!is_stable_node_chain(stable_node)) {
1894 struct stable_node *orig = stable_node;
1895
1896 stable_node = alloc_stable_node_chain(orig, root);
1897 if (!stable_node) {
1898 free_stable_node(stable_node_dup);
1899 return NULL;
1900 }
1901 }
1902 stable_node_chain_add_dup(stable_node_dup, stable_node);
1903 }
1904
1905 return stable_node_dup;
1906 }
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922 static
1923 struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1924 struct page *page,
1925 struct page **tree_pagep)
1926 {
1927 struct rb_node **new;
1928 struct rb_root *root;
1929 struct rb_node *parent = NULL;
1930 int nid;
1931
1932 nid = get_kpfn_nid(page_to_pfn(page));
1933 root = root_unstable_tree + nid;
1934 new = &root->rb_node;
1935
1936 while (*new) {
1937 struct rmap_item *tree_rmap_item;
1938 struct page *tree_page;
1939 int ret;
1940
1941 cond_resched();
1942 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1943 tree_page = get_mergeable_page(tree_rmap_item);
1944 if (!tree_page)
1945 return NULL;
1946
1947
1948
1949
1950 if (page == tree_page) {
1951 put_page(tree_page);
1952 return NULL;
1953 }
1954
1955 ret = memcmp_pages(page, tree_page);
1956
1957 parent = *new;
1958 if (ret < 0) {
1959 put_page(tree_page);
1960 new = &parent->rb_left;
1961 } else if (ret > 0) {
1962 put_page(tree_page);
1963 new = &parent->rb_right;
1964 } else if (!ksm_merge_across_nodes &&
1965 page_to_nid(tree_page) != nid) {
1966
1967
1968
1969
1970
1971 put_page(tree_page);
1972 return NULL;
1973 } else {
1974 *tree_pagep = tree_page;
1975 return tree_rmap_item;
1976 }
1977 }
1978
1979 rmap_item->address |= UNSTABLE_FLAG;
1980 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1981 DO_NUMA(rmap_item->nid = nid);
1982 rb_link_node(&rmap_item->node, parent, new);
1983 rb_insert_color(&rmap_item->node, root);
1984
1985 ksm_pages_unshared++;
1986 return NULL;
1987 }
1988
1989
1990
1991
1992
1993
1994 static void stable_tree_append(struct rmap_item *rmap_item,
1995 struct stable_node *stable_node,
1996 bool max_page_sharing_bypass)
1997 {
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008 BUG_ON(stable_node->rmap_hlist_len < 0);
2009
2010 stable_node->rmap_hlist_len++;
2011 if (!max_page_sharing_bypass)
2012
2013 WARN_ON_ONCE(stable_node->rmap_hlist_len >
2014 ksm_max_page_sharing);
2015
2016 rmap_item->head = stable_node;
2017 rmap_item->address |= STABLE_FLAG;
2018 hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
2019
2020 if (rmap_item->hlist.next)
2021 ksm_pages_sharing++;
2022 else
2023 ksm_pages_shared++;
2024 }
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035 static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
2036 {
2037 struct mm_struct *mm = rmap_item->mm;
2038 struct rmap_item *tree_rmap_item;
2039 struct page *tree_page = NULL;
2040 struct stable_node *stable_node;
2041 struct page *kpage;
2042 unsigned int checksum;
2043 int err;
2044 bool max_page_sharing_bypass = false;
2045
2046 stable_node = page_stable_node(page);
2047 if (stable_node) {
2048 if (stable_node->head != &migrate_nodes &&
2049 get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
2050 NUMA(stable_node->nid)) {
2051 stable_node_dup_del(stable_node);
2052 stable_node->head = &migrate_nodes;
2053 list_add(&stable_node->list, stable_node->head);
2054 }
2055 if (stable_node->head != &migrate_nodes &&
2056 rmap_item->head == stable_node)
2057 return;
2058
2059
2060
2061
2062 if (!is_page_sharing_candidate(stable_node))
2063 max_page_sharing_bypass = true;
2064 }
2065
2066
2067 kpage = stable_tree_search(page);
2068 if (kpage == page && rmap_item->head == stable_node) {
2069 put_page(kpage);
2070 return;
2071 }
2072
2073 remove_rmap_item_from_tree(rmap_item);
2074
2075 if (kpage) {
2076 if (PTR_ERR(kpage) == -EBUSY)
2077 return;
2078
2079 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
2080 if (!err) {
2081
2082
2083
2084
2085 lock_page(kpage);
2086 stable_tree_append(rmap_item, page_stable_node(kpage),
2087 max_page_sharing_bypass);
2088 unlock_page(kpage);
2089 }
2090 put_page(kpage);
2091 return;
2092 }
2093
2094
2095
2096
2097
2098
2099
2100 checksum = calc_checksum(page);
2101 if (rmap_item->oldchecksum != checksum) {
2102 rmap_item->oldchecksum = checksum;
2103 return;
2104 }
2105
2106
2107
2108
2109
2110 if (ksm_use_zero_pages && (checksum == zero_checksum)) {
2111 struct vm_area_struct *vma;
2112
2113 down_read(&mm->mmap_sem);
2114 vma = find_mergeable_vma(mm, rmap_item->address);
2115 if (vma) {
2116 err = try_to_merge_one_page(vma, page,
2117 ZERO_PAGE(rmap_item->address));
2118 } else {
2119
2120
2121
2122
2123 err = 0;
2124 }
2125 up_read(&mm->mmap_sem);
2126
2127
2128
2129
2130 if (!err)
2131 return;
2132 }
2133 tree_rmap_item =
2134 unstable_tree_search_insert(rmap_item, page, &tree_page);
2135 if (tree_rmap_item) {
2136 bool split;
2137
2138 kpage = try_to_merge_two_pages(rmap_item, page,
2139 tree_rmap_item, tree_page);
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150 split = PageTransCompound(page)
2151 && compound_head(page) == compound_head(tree_page);
2152 put_page(tree_page);
2153 if (kpage) {
2154
2155
2156
2157
2158 lock_page(kpage);
2159 stable_node = stable_tree_insert(kpage);
2160 if (stable_node) {
2161 stable_tree_append(tree_rmap_item, stable_node,
2162 false);
2163 stable_tree_append(rmap_item, stable_node,
2164 false);
2165 }
2166 unlock_page(kpage);
2167
2168
2169
2170
2171
2172
2173
2174 if (!stable_node) {
2175 break_cow(tree_rmap_item);
2176 break_cow(rmap_item);
2177 }
2178 } else if (split) {
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188 if (!trylock_page(page))
2189 return;
2190 split_huge_page(page);
2191 unlock_page(page);
2192 }
2193 }
2194 }
2195
2196 static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
2197 struct rmap_item **rmap_list,
2198 unsigned long addr)
2199 {
2200 struct rmap_item *rmap_item;
2201
2202 while (*rmap_list) {
2203 rmap_item = *rmap_list;
2204 if ((rmap_item->address & PAGE_MASK) == addr)
2205 return rmap_item;
2206 if (rmap_item->address > addr)
2207 break;
2208 *rmap_list = rmap_item->rmap_list;
2209 remove_rmap_item_from_tree(rmap_item);
2210 free_rmap_item(rmap_item);
2211 }
2212
2213 rmap_item = alloc_rmap_item();
2214 if (rmap_item) {
2215
2216 rmap_item->mm = mm_slot->mm;
2217 rmap_item->address = addr;
2218 rmap_item->rmap_list = *rmap_list;
2219 *rmap_list = rmap_item;
2220 }
2221 return rmap_item;
2222 }
2223
2224 static struct rmap_item *scan_get_next_rmap_item(struct page **page)
2225 {
2226 struct mm_struct *mm;
2227 struct mm_slot *slot;
2228 struct vm_area_struct *vma;
2229 struct rmap_item *rmap_item;
2230 int nid;
2231
2232 if (list_empty(&ksm_mm_head.mm_list))
2233 return NULL;
2234
2235 slot = ksm_scan.mm_slot;
2236 if (slot == &ksm_mm_head) {
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247 lru_add_drain_all();
2248
2249
2250
2251
2252
2253
2254
2255 if (!ksm_merge_across_nodes) {
2256 struct stable_node *stable_node, *next;
2257 struct page *page;
2258
2259 list_for_each_entry_safe(stable_node, next,
2260 &migrate_nodes, list) {
2261 page = get_ksm_page(stable_node,
2262 GET_KSM_PAGE_NOLOCK);
2263 if (page)
2264 put_page(page);
2265 cond_resched();
2266 }
2267 }
2268
2269 for (nid = 0; nid < ksm_nr_node_ids; nid++)
2270 root_unstable_tree[nid] = RB_ROOT;
2271
2272 spin_lock(&ksm_mmlist_lock);
2273 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
2274 ksm_scan.mm_slot = slot;
2275 spin_unlock(&ksm_mmlist_lock);
2276
2277
2278
2279
2280 if (slot == &ksm_mm_head)
2281 return NULL;
2282 next_mm:
2283 ksm_scan.address = 0;
2284 ksm_scan.rmap_list = &slot->rmap_list;
2285 }
2286
2287 mm = slot->mm;
2288 down_read(&mm->mmap_sem);
2289 if (ksm_test_exit(mm))
2290 vma = NULL;
2291 else
2292 vma = find_vma(mm, ksm_scan.address);
2293
2294 for (; vma; vma = vma->vm_next) {
2295 if (!(vma->vm_flags & VM_MERGEABLE))
2296 continue;
2297 if (ksm_scan.address < vma->vm_start)
2298 ksm_scan.address = vma->vm_start;
2299 if (!vma->anon_vma)
2300 ksm_scan.address = vma->vm_end;
2301
2302 while (ksm_scan.address < vma->vm_end) {
2303 if (ksm_test_exit(mm))
2304 break;
2305 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
2306 if (IS_ERR_OR_NULL(*page)) {
2307 ksm_scan.address += PAGE_SIZE;
2308 cond_resched();
2309 continue;
2310 }
2311 if (PageAnon(*page)) {
2312 flush_anon_page(vma, *page, ksm_scan.address);
2313 flush_dcache_page(*page);
2314 rmap_item = get_next_rmap_item(slot,
2315 ksm_scan.rmap_list, ksm_scan.address);
2316 if (rmap_item) {
2317 ksm_scan.rmap_list =
2318 &rmap_item->rmap_list;
2319 ksm_scan.address += PAGE_SIZE;
2320 } else
2321 put_page(*page);
2322 up_read(&mm->mmap_sem);
2323 return rmap_item;
2324 }
2325 put_page(*page);
2326 ksm_scan.address += PAGE_SIZE;
2327 cond_resched();
2328 }
2329 }
2330
2331 if (ksm_test_exit(mm)) {
2332 ksm_scan.address = 0;
2333 ksm_scan.rmap_list = &slot->rmap_list;
2334 }
2335
2336
2337
2338
2339 remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
2340
2341 spin_lock(&ksm_mmlist_lock);
2342 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
2343 struct mm_slot, mm_list);
2344 if (ksm_scan.address == 0) {
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354 hash_del(&slot->link);
2355 list_del(&slot->mm_list);
2356 spin_unlock(&ksm_mmlist_lock);
2357
2358 free_mm_slot(slot);
2359 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2360 up_read(&mm->mmap_sem);
2361 mmdrop(mm);
2362 } else {
2363 up_read(&mm->mmap_sem);
2364
2365
2366
2367
2368
2369
2370
2371 spin_unlock(&ksm_mmlist_lock);
2372 }
2373
2374
2375 slot = ksm_scan.mm_slot;
2376 if (slot != &ksm_mm_head)
2377 goto next_mm;
2378
2379 ksm_scan.seqnr++;
2380 return NULL;
2381 }
2382
2383
2384
2385
2386
2387 static void ksm_do_scan(unsigned int scan_npages)
2388 {
2389 struct rmap_item *rmap_item;
2390 struct page *uninitialized_var(page);
2391
2392 while (scan_npages-- && likely(!freezing(current))) {
2393 cond_resched();
2394 rmap_item = scan_get_next_rmap_item(&page);
2395 if (!rmap_item)
2396 return;
2397 cmp_and_merge_page(page, rmap_item);
2398 put_page(page);
2399 }
2400 }
2401
2402 static int ksmd_should_run(void)
2403 {
2404 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
2405 }
2406
2407 static int ksm_scan_thread(void *nothing)
2408 {
2409 unsigned int sleep_ms;
2410
2411 set_freezable();
2412 set_user_nice(current, 5);
2413
2414 while (!kthread_should_stop()) {
2415 mutex_lock(&ksm_thread_mutex);
2416 wait_while_offlining();
2417 if (ksmd_should_run())
2418 ksm_do_scan(ksm_thread_pages_to_scan);
2419 mutex_unlock(&ksm_thread_mutex);
2420
2421 try_to_freeze();
2422
2423 if (ksmd_should_run()) {
2424 sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
2425 wait_event_interruptible_timeout(ksm_iter_wait,
2426 sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
2427 msecs_to_jiffies(sleep_ms));
2428 } else {
2429 wait_event_freezable(ksm_thread_wait,
2430 ksmd_should_run() || kthread_should_stop());
2431 }
2432 }
2433 return 0;
2434 }
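
Each ksmd wakeup therefore scans pages_to_scan pages and then sleeps sleep_millisecs; the sleep is interruptible, and sleep_millisecs_store() wakes ksm_iter_wait so a newly written value takes effect immediately. A tuning sketch using the sysfs files exported below (illustrative, needs root; not part of this file):

#include <stdio.h>

/* Write one KSM tunable under /sys/kernel/mm/ksm/. */
static int ksm_write(const char *name, unsigned long val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%lu\n", val);
	return fclose(f);
}

int main(void)
{
	ksm_write("pages_to_scan", 1000);	/* pages scanned per batch */
	ksm_write("sleep_millisecs", 50);	/* sleep between batches */
	ksm_write("run", 1);			/* KSM_RUN_MERGE */
	return 0;
}
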
2435
2436 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
2437 unsigned long end, int advice, unsigned long *vm_flags)
2438 {
2439 struct mm_struct *mm = vma->vm_mm;
2440 int err;
2441
2442 switch (advice) {
2443 case MADV_MERGEABLE:
2444
2445
2446
2447 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
2448 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
2449 VM_HUGETLB | VM_MIXEDMAP))
2450 return 0;
2451
2452 if (vma_is_dax(vma))
2453 return 0;
2454
2455 #ifdef VM_SAO
2456 if (*vm_flags & VM_SAO)
2457 return 0;
2458 #endif
2459 #ifdef VM_SPARC_ADI
2460 if (*vm_flags & VM_SPARC_ADI)
2461 return 0;
2462 #endif
2463
2464 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
2465 err = __ksm_enter(mm);
2466 if (err)
2467 return err;
2468 }
2469
2470 *vm_flags |= VM_MERGEABLE;
2471 break;
2472
2473 case MADV_UNMERGEABLE:
2474 if (!(*vm_flags & VM_MERGEABLE))
2475 return 0;
2476
2477 if (vma->anon_vma) {
2478 err = unmerge_ksm_pages(vma, start, end);
2479 if (err)
2480 return err;
2481 }
2482
2483 *vm_flags &= ~VM_MERGEABLE;
2484 break;
2485 }
2486
2487 return 0;
2488 }
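
The reverse operation: MADV_UNMERGEABLE reaches the second case above, and unmerge_ksm_pages()/break_ksm() copy-break every merged page in the range before VM_MERGEABLE is cleared, so the call can be slow on large ranges. A small userspace helper (illustrative, not part of this file):

#include <stdio.h>
#include <sys/mman.h>

/* Take a previously MADV_MERGEABLE range back out of KSM. */
int ksm_unmerge_range(void *addr, size_t len)
{
	if (madvise(addr, len, MADV_UNMERGEABLE)) {
		perror("madvise(MADV_UNMERGEABLE)");
		return -1;
	}
	return 0;
}
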
2489
2490 int __ksm_enter(struct mm_struct *mm)
2491 {
2492 struct mm_slot *mm_slot;
2493 int needs_wakeup;
2494
2495 mm_slot = alloc_mm_slot();
2496 if (!mm_slot)
2497 return -ENOMEM;
2498
2499
2500 needs_wakeup = list_empty(&ksm_mm_head.mm_list);
2501
2502 spin_lock(&ksm_mmlist_lock);
2503 insert_to_mm_slots_hash(mm, mm_slot);
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514 if (ksm_run & KSM_RUN_UNMERGE)
2515 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
2516 else
2517 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
2518 spin_unlock(&ksm_mmlist_lock);
2519
2520 set_bit(MMF_VM_MERGEABLE, &mm->flags);
2521 mmgrab(mm);
2522
2523 if (needs_wakeup)
2524 wake_up_interruptible(&ksm_thread_wait);
2525
2526 return 0;
2527 }
2528
2529 void __ksm_exit(struct mm_struct *mm)
2530 {
2531 struct mm_slot *mm_slot;
2532 int easy_to_free = 0;
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543 spin_lock(&ksm_mmlist_lock);
2544 mm_slot = get_mm_slot(mm);
2545 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
2546 if (!mm_slot->rmap_list) {
2547 hash_del(&mm_slot->link);
2548 list_del(&mm_slot->mm_list);
2549 easy_to_free = 1;
2550 } else {
2551 list_move(&mm_slot->mm_list,
2552 &ksm_scan.mm_slot->mm_list);
2553 }
2554 }
2555 spin_unlock(&ksm_mmlist_lock);
2556
2557 if (easy_to_free) {
2558 free_mm_slot(mm_slot);
2559 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2560 mmdrop(mm);
2561 } else if (mm_slot) {
2562 down_write(&mm->mmap_sem);
2563 up_write(&mm->mmap_sem);
2564 }
2565 }
2566
2567 struct page *ksm_might_need_to_copy(struct page *page,
2568 struct vm_area_struct *vma, unsigned long address)
2569 {
2570 struct anon_vma *anon_vma = page_anon_vma(page);
2571 struct page *new_page;
2572
2573 if (PageKsm(page)) {
2574 if (page_stable_node(page) &&
2575 !(ksm_run & KSM_RUN_UNMERGE))
2576 return page;
2577 } else if (!anon_vma) {
2578 return page;
2579 } else if (anon_vma->root == vma->anon_vma->root &&
2580 page->index == linear_page_index(vma, address)) {
2581 return page;
2582 }
2583 if (!PageUptodate(page))
2584 return page;
2585
2586 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2587 if (new_page) {
2588 copy_user_highpage(new_page, page, address, vma);
2589
2590 SetPageDirty(new_page);
2591 __SetPageUptodate(new_page);
2592 __SetPageLocked(new_page);
2593 }
2594
2595 return new_page;
2596 }
2597
2598 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
2599 {
2600 struct stable_node *stable_node;
2601 struct rmap_item *rmap_item;
2602 int search_new_forks = 0;
2603
2604 VM_BUG_ON_PAGE(!PageKsm(page), page);
2605
2606
2607
2608
2609
2610 VM_BUG_ON_PAGE(!PageLocked(page), page);
2611
2612 stable_node = page_stable_node(page);
2613 if (!stable_node)
2614 return;
2615 again:
2616 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
2617 struct anon_vma *anon_vma = rmap_item->anon_vma;
2618 struct anon_vma_chain *vmac;
2619 struct vm_area_struct *vma;
2620
2621 cond_resched();
2622 anon_vma_lock_read(anon_vma);
2623 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
2624 0, ULONG_MAX) {
2625 unsigned long addr;
2626
2627 cond_resched();
2628 vma = vmac->vma;
2629
2630
2631 addr = rmap_item->address & ~KSM_FLAG_MASK;
2632
2633 if (addr < vma->vm_start || addr >= vma->vm_end)
2634 continue;
2635
2636
2637
2638
2639
2640
2641 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
2642 continue;
2643
2644 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2645 continue;
2646
2647 if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
2648 anon_vma_unlock_read(anon_vma);
2649 return;
2650 }
2651 if (rwc->done && rwc->done(page)) {
2652 anon_vma_unlock_read(anon_vma);
2653 return;
2654 }
2655 }
2656 anon_vma_unlock_read(anon_vma);
2657 }
2658 if (!search_new_forks++)
2659 goto again;
2660 }
2661
2662 bool reuse_ksm_page(struct page *page,
2663 struct vm_area_struct *vma,
2664 unsigned long address)
2665 {
2666 #ifdef CONFIG_DEBUG_VM
2667 if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
2668 WARN_ON(!page_mapped(page)) ||
2669 WARN_ON(!PageLocked(page))) {
2670 dump_page(page, "reuse_ksm_page");
2671 return false;
2672 }
2673 #endif
2674
2675 if (PageSwapCache(page) || !page_stable_node(page))
2676 return false;
2677
2678 if (!page_ref_freeze(page, 1))
2679 return false;
2680
2681 page_move_anon_rmap(page, vma);
2682 page->index = linear_page_index(vma, address);
2683 page_ref_unfreeze(page, 1);
2684
2685 return true;
2686 }
2687 #ifdef CONFIG_MIGRATION
2688 void ksm_migrate_page(struct page *newpage, struct page *oldpage)
2689 {
2690 struct stable_node *stable_node;
2691
2692 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
2693 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
2694 VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
2695
2696 stable_node = page_stable_node(newpage);
2697 if (stable_node) {
2698 VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
2699 stable_node->kpfn = page_to_pfn(newpage);
2700
2701
2702
2703
2704
2705
2706 smp_wmb();
2707 set_page_stable_node(oldpage, NULL);
2708 }
2709 }
2710 #endif
2711
2712 #ifdef CONFIG_MEMORY_HOTREMOVE
2713 static void wait_while_offlining(void)
2714 {
2715 while (ksm_run & KSM_RUN_OFFLINE) {
2716 mutex_unlock(&ksm_thread_mutex);
2717 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
2718 TASK_UNINTERRUPTIBLE);
2719 mutex_lock(&ksm_thread_mutex);
2720 }
2721 }
2722
2723 static bool stable_node_dup_remove_range(struct stable_node *stable_node,
2724 unsigned long start_pfn,
2725 unsigned long end_pfn)
2726 {
2727 if (stable_node->kpfn >= start_pfn &&
2728 stable_node->kpfn < end_pfn) {
2729
2730
2731
2732
2733 remove_node_from_stable_tree(stable_node);
2734 return true;
2735 }
2736 return false;
2737 }
2738
2739 static bool stable_node_chain_remove_range(struct stable_node *stable_node,
2740 unsigned long start_pfn,
2741 unsigned long end_pfn,
2742 struct rb_root *root)
2743 {
2744 struct stable_node *dup;
2745 struct hlist_node *hlist_safe;
2746
2747 if (!is_stable_node_chain(stable_node)) {
2748 VM_BUG_ON(is_stable_node_dup(stable_node));
2749 return stable_node_dup_remove_range(stable_node, start_pfn,
2750 end_pfn);
2751 }
2752
2753 hlist_for_each_entry_safe(dup, hlist_safe,
2754 &stable_node->hlist, hlist_dup) {
2755 VM_BUG_ON(!is_stable_node_dup(dup));
2756 stable_node_dup_remove_range(dup, start_pfn, end_pfn);
2757 }
2758 if (hlist_empty(&stable_node->hlist)) {
2759 free_stable_node_chain(stable_node, root);
2760 return true;
2761 } else
2762 return false;
2763 }
2764
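/*
 * Walk every per-node stable tree (and the migrate_nodes list) and
 * remove any stable_node still pointing into the offlined pfn range.
 */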
2765 static void ksm_check_stable_tree(unsigned long start_pfn,
2766 unsigned long end_pfn)
2767 {
2768 struct stable_node *stable_node, *next;
2769 struct rb_node *node;
2770 int nid;
2771
2772 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
2773 node = rb_first(root_stable_tree + nid);
2774 while (node) {
2775 stable_node = rb_entry(node, struct stable_node, node);
2776 if (stable_node_chain_remove_range(stable_node,
2777 start_pfn, end_pfn,
2778 root_stable_tree +
2779 nid))
2780 node = rb_first(root_stable_tree + nid);
2781 else
2782 node = rb_next(node);
2783 cond_resched();
2784 }
2785 }
2786 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
2787 if (stable_node->kpfn >= start_pfn &&
2788 stable_node->kpfn < end_pfn)
2789 remove_node_from_stable_tree(stable_node);
2790 cond_resched();
2791 }
2792 }
2793
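/*
 * Memory hotplug notifier: quiesce KSM before a section goes offline,
 * and prune stale stable_nodes once it has gone (or offlining is
 * cancelled).
 */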
2794 static int ksm_memory_callback(struct notifier_block *self,
2795 unsigned long action, void *arg)
2796 {
2797 struct memory_notify *mn = arg;
2798
2799 switch (action) {
2800 case MEM_GOING_OFFLINE:
2801 /*
2802 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
2803 * and remove_all_stable_nodes() while memory is going offline:
2804 * it is unsafe for them to touch the stable tree at this time.
2805 * But unmerge_ksm_pages(), rmap lookups and other entry points
2806 * which do not need the ksm_thread_mutex are all safe.
2807 */
2808 mutex_lock(&ksm_thread_mutex);
2809 ksm_run |= KSM_RUN_OFFLINE;
2810 mutex_unlock(&ksm_thread_mutex);
2811 break;
2812
2813 case MEM_OFFLINE:
2814 /*
2815 * Most of the work is done by page migration; but there might
2816 * be a few stable_nodes left over, still pointing to struct
2817 * pages which have been offlined: prune those from the tree,
2818 * otherwise get_ksm_page() might later try to access a
2819 * non-existent struct page.
2820 */
2821 ksm_check_stable_tree(mn->start_pfn,
2822 mn->start_pfn + mn->nr_pages);
2823 /* fallthrough */
2824
2825 case MEM_CANCEL_OFFLINE:
2826 mutex_lock(&ksm_thread_mutex);
2827 ksm_run &= ~KSM_RUN_OFFLINE;
2828 mutex_unlock(&ksm_thread_mutex);
2829
2830 smp_mb();	/* wake_up_bit advises this */
2831 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
2832 break;
2833 }
2834 return NOTIFY_OK;
2835 }
2836 #else
2837 static void wait_while_offlining(void)
2838 {
2839 }
2840 #endif
2841
2842 #ifdef CONFIG_SYSFS
2843 /*
2844 * The tunables and statistics below are exposed under /sys/kernel/mm/ksm/.
2845 */
2846
2847 #define KSM_ATTR_RO(_name) \
2848 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
2849 #define KSM_ATTR(_name) \
2850 static struct kobj_attribute _name##_attr = \
2851 __ATTR(_name, 0644, _name##_show, _name##_store)
2852
2853 static ssize_t sleep_millisecs_show(struct kobject *kobj,
2854 struct kobj_attribute *attr, char *buf)
2855 {
2856 return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
2857 }
2858
2859 static ssize_t sleep_millisecs_store(struct kobject *kobj,
2860 struct kobj_attribute *attr,
2861 const char *buf, size_t count)
2862 {
2863 unsigned long msecs;
2864 int err;
2865
2866 err = kstrtoul(buf, 10, &msecs);
2867 if (err || msecs > UINT_MAX)
2868 return -EINVAL;
2869
2870 ksm_thread_sleep_millisecs = msecs;
2871 wake_up_interruptible(&ksm_iter_wait);
2872
2873 return count;
2874 }
2875 KSM_ATTR(sleep_millisecs);
2876
2877 static ssize_t pages_to_scan_show(struct kobject *kobj,
2878 struct kobj_attribute *attr, char *buf)
2879 {
2880 return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
2881 }
2882
2883 static ssize_t pages_to_scan_store(struct kobject *kobj,
2884 struct kobj_attribute *attr,
2885 const char *buf, size_t count)
2886 {
2887 int err;
2888 unsigned long nr_pages;
2889
2890 err = kstrtoul(buf, 10, &nr_pages);
2891 if (err || nr_pages > UINT_MAX)
2892 return -EINVAL;
2893
2894 ksm_thread_pages_to_scan = nr_pages;
2895
2896 return count;
2897 }
2898 KSM_ATTR(pages_to_scan);
2899
2900 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
2901 char *buf)
2902 {
2903 return sprintf(buf, "%lu\n", ksm_run);
2904 }
2905
2906 static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
2907 const char *buf, size_t count)
2908 {
2909 int err;
2910 unsigned long flags;
2911
2912 err = kstrtoul(buf, 10, &flags);
2913 if (err || flags > UINT_MAX)
2914 return -EINVAL;
2915 if (flags > KSM_RUN_UNMERGE)
2916 return -EINVAL;
2917
2918 /*
2919 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
2920 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
2921 * breaking COW to free the pages_shared (but leaves mm_slots
2922 * on the list for when ksmd may be set running again).
2923 */
2924
2925 mutex_lock(&ksm_thread_mutex);
2926 wait_while_offlining();
2927 if (ksm_run != flags) {
2928 ksm_run = flags;
2929 if (flags & KSM_RUN_UNMERGE) {
2930 set_current_oom_origin();
2931 err = unmerge_and_remove_all_rmap_items();
2932 clear_current_oom_origin();
2933 if (err) {
2934 ksm_run = KSM_RUN_STOP;
2935 count = err;
2936 }
2937 }
2938 }
2939 mutex_unlock(&ksm_thread_mutex);
2940
2941 if (flags & KSM_RUN_MERGE)
2942 wake_up_interruptible(&ksm_thread_wait);
2943
2944 return count;
2945 }
2946 KSM_ATTR(run);
2947
2948 #ifdef CONFIG_NUMA
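/*
 * merge_across_nodes: 1 keeps a single stable/unstable tree pair so
 * pages from any NUMA node may merge; 0 keeps one pair per node.  It
 * can only be changed while no pages are currently shared.
 */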
2949 static ssize_t merge_across_nodes_show(struct kobject *kobj,
2950 struct kobj_attribute *attr, char *buf)
2951 {
2952 return sprintf(buf, "%u\n", ksm_merge_across_nodes);
2953 }
2954
2955 static ssize_t merge_across_nodes_store(struct kobject *kobj,
2956 struct kobj_attribute *attr,
2957 const char *buf, size_t count)
2958 {
2959 int err;
2960 unsigned long knob;
2961
2962 err = kstrtoul(buf, 10, &knob);
2963 if (err)
2964 return err;
2965 if (knob > 1)
2966 return -EINVAL;
2967
2968 mutex_lock(&ksm_thread_mutex);
2969 wait_while_offlining();
2970 if (ksm_merge_across_nodes != knob) {
2971 if (ksm_pages_shared || remove_all_stable_nodes())
2972 err = -EBUSY;
2973 else if (root_stable_tree == one_stable_tree) {
2974 struct rb_root *buf;
2975 /*
2976 * This is the first time that we switch away from the
2977 * default of merging across nodes: must now allocate
2978 * a buffer to hold as many roots as may be needed.
2979 * Allocate stable and unstable together:
2980 * MAXSMP NODES_SHIFT 10 will use 16kB.
2981 */
2982 buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
2983 GFP_KERNEL);
2984 /* kcalloc() zeroes the buffer: a zeroed rb_root is a valid empty RB_ROOT */
2985 if (!buf)
2986 err = -ENOMEM;
2987 else {
2988 root_stable_tree = buf;
2989 root_unstable_tree = buf + nr_node_ids;
2990 /* Stable tree is empty but not the unstable */
2991 root_unstable_tree[0] = one_unstable_tree[0];
2992 }
2993 }
2994 if (!err) {
2995 ksm_merge_across_nodes = knob;
2996 ksm_nr_node_ids = knob ? 1 : nr_node_ids;
2997 }
2998 }
2999 mutex_unlock(&ksm_thread_mutex);
3000
3001 return err ? err : count;
3002 }
3003 KSM_ATTR(merge_across_nodes);
3004 #endif
3005
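/*
 * use_zero_pages: when enabled, empty pages are merged with the kernel
 * zero page(s) instead of with each other.
 */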
3006 static ssize_t use_zero_pages_show(struct kobject *kobj,
3007 struct kobj_attribute *attr, char *buf)
3008 {
3009 return sprintf(buf, "%u\n", ksm_use_zero_pages);
3010 }
3011 static ssize_t use_zero_pages_store(struct kobject *kobj,
3012 struct kobj_attribute *attr,
3013 const char *buf, size_t count)
3014 {
3015 int err;
3016 bool value;
3017
3018 err = kstrtobool(buf, &value);
3019 if (err)
3020 return -EINVAL;
3021
3022 ksm_use_zero_pages = value;
3023
3024 return count;
3025 }
3026 KSM_ATTR(use_zero_pages);
3027
3028 static ssize_t max_page_sharing_show(struct kobject *kobj,
3029 struct kobj_attribute *attr, char *buf)
3030 {
3031 return sprintf(buf, "%u\n", ksm_max_page_sharing);
3032 }
3033
3034 static ssize_t max_page_sharing_store(struct kobject *kobj,
3035 struct kobj_attribute *attr,
3036 const char *buf, size_t count)
3037 {
3038 int err;
3039 int knob;
3040
3041 err = kstrtoint(buf, 10, &knob);
3042 if (err)
3043 return err;
3044 /*
3045 * When a KSM page is created it is shared by 2 mappings. This
3046 * being a signed comparison, it implicitly verifies it's not
3047 * negative.
3048 */
3049 if (knob < 2)
3050 return -EINVAL;
3051
3052 if (READ_ONCE(ksm_max_page_sharing) == knob)
3053 return count;
3054
3055 mutex_lock(&ksm_thread_mutex);
3056 wait_while_offlining();
3057 if (ksm_max_page_sharing != knob) {
3058 if (ksm_pages_shared || remove_all_stable_nodes())
3059 err = -EBUSY;
3060 else
3061 ksm_max_page_sharing = knob;
3062 }
3063 mutex_unlock(&ksm_thread_mutex);
3064
3065 return err ? err : count;
3066 }
3067 KSM_ATTR(max_page_sharing);
3068
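/*
 * Read-only statistics below: pages_shared counts KSM pages in use,
 * pages_sharing counts the additional sites mapping them (the saving),
 * pages_unshared counts unique pages repeatedly checked for merging,
 * and pages_volatile is derived from the remaining rmap_items.
 */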
3069 static ssize_t pages_shared_show(struct kobject *kobj,
3070 struct kobj_attribute *attr, char *buf)
3071 {
3072 return sprintf(buf, "%lu\n", ksm_pages_shared);
3073 }
3074 KSM_ATTR_RO(pages_shared);
3075
3076 static ssize_t pages_sharing_show(struct kobject *kobj,
3077 struct kobj_attribute *attr, char *buf)
3078 {
3079 return sprintf(buf, "%lu\n", ksm_pages_sharing);
3080 }
3081 KSM_ATTR_RO(pages_sharing);
3082
3083 static ssize_t pages_unshared_show(struct kobject *kobj,
3084 struct kobj_attribute *attr, char *buf)
3085 {
3086 return sprintf(buf, "%lu\n", ksm_pages_unshared);
3087 }
3088 KSM_ATTR_RO(pages_unshared);
3089
3090 static ssize_t pages_volatile_show(struct kobject *kobj,
3091 struct kobj_attribute *attr, char *buf)
3092 {
3093 long ksm_pages_volatile;
3094
3095 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
3096 - ksm_pages_sharing - ksm_pages_unshared;
3097 /*
3098 * It was not worth any locking to calculate that statistic,
3099 * but it might therefore sometimes be negative: conceal that.
3100 */
3101 if (ksm_pages_volatile < 0)
3102 ksm_pages_volatile = 0;
3103 return sprintf(buf, "%ld\n", ksm_pages_volatile);
3104 }
3105 KSM_ATTR_RO(pages_volatile);
3106
3107 static ssize_t stable_node_dups_show(struct kobject *kobj,
3108 struct kobj_attribute *attr, char *buf)
3109 {
3110 return sprintf(buf, "%lu\n", ksm_stable_node_dups);
3111 }
3112 KSM_ATTR_RO(stable_node_dups);
3113
3114 static ssize_t stable_node_chains_show(struct kobject *kobj,
3115 struct kobj_attribute *attr, char *buf)
3116 {
3117 return sprintf(buf, "%lu\n", ksm_stable_node_chains);
3118 }
3119 KSM_ATTR_RO(stable_node_chains);
3120
3121 static ssize_t
3122 stable_node_chains_prune_millisecs_show(struct kobject *kobj,
3123 struct kobj_attribute *attr,
3124 char *buf)
3125 {
3126 return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
3127 }
3128
3129 static ssize_t
3130 stable_node_chains_prune_millisecs_store(struct kobject *kobj,
3131 struct kobj_attribute *attr,
3132 const char *buf, size_t count)
3133 {
3134 unsigned long msecs;
3135 int err;
3136
3137 err = kstrtoul(buf, 10, &msecs);
3138 if (err || msecs > UINT_MAX)
3139 return -EINVAL;
3140
3141 ksm_stable_node_chains_prune_millisecs = msecs;
3142
3143 return count;
3144 }
3145 KSM_ATTR(stable_node_chains_prune_millisecs);
3146
3147 static ssize_t full_scans_show(struct kobject *kobj,
3148 struct kobj_attribute *attr, char *buf)
3149 {
3150 return sprintf(buf, "%lu\n", ksm_scan.seqnr);
3151 }
3152 KSM_ATTR_RO(full_scans);
3153
3154 static struct attribute *ksm_attrs[] = {
3155 &sleep_millisecs_attr.attr,
3156 &pages_to_scan_attr.attr,
3157 &run_attr.attr,
3158 &pages_shared_attr.attr,
3159 &pages_sharing_attr.attr,
3160 &pages_unshared_attr.attr,
3161 &pages_volatile_attr.attr,
3162 &full_scans_attr.attr,
3163 #ifdef CONFIG_NUMA
3164 &merge_across_nodes_attr.attr,
3165 #endif
3166 &max_page_sharing_attr.attr,
3167 &stable_node_chains_attr.attr,
3168 &stable_node_dups_attr.attr,
3169 &stable_node_chains_prune_millisecs_attr.attr,
3170 &use_zero_pages_attr.attr,
3171 NULL,
3172 };
3173
3174 static const struct attribute_group ksm_attr_group = {
3175 .attrs = ksm_attrs,
3176 .name = "ksm",
3177 };
3178 #endif
3179
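/*
 * ksm_init(): compute the zero-page checksum, create the slab caches,
 * start the ksmd kthread, and register the sysfs group and (if
 * configured) the memory-hotremove notifier.
 */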
3180 static int __init ksm_init(void)
3181 {
3182 struct task_struct *ksm_thread;
3183 int err;
3184
3185 /* The correct value depends on page size and endianness */
3186 zero_checksum = calc_checksum(ZERO_PAGE(0));
3187 /* Default to false for backwards compatibility */
3188 ksm_use_zero_pages = false;
3189
3190 err = ksm_slab_init();
3191 if (err)
3192 goto out;
3193
3194 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
3195 if (IS_ERR(ksm_thread)) {
3196 pr_err("ksm: creating kthread failed\n");
3197 err = PTR_ERR(ksm_thread);
3198 goto out_free;
3199 }
3200
3201 #ifdef CONFIG_SYSFS
3202 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
3203 if (err) {
3204 pr_err("ksm: register sysfs failed\n");
3205 kthread_stop(ksm_thread);
3206 goto out_free;
3207 }
3208 #else
3209 ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */
3210
3211 #endif /* CONFIG_SYSFS */
3212
3213 #ifdef CONFIG_MEMORY_HOTREMOVE
3214 /* There is no significance to this priority 100 */
3215 hotplug_memory_notifier(ksm_memory_callback, 100);
3216 #endif
3217 return 0;
3218
3219 out_free:
3220 ksm_slab_free();
3221 out:
3222 return err;
3223 }
3224 subsys_initcall(ksm_init);