This source file includes the following definitions:
- disable_randmaps
- init_zero_pfn
- sync_mm_rss
- add_mm_counter_fast
- check_sync_rss_stat
- check_sync_rss_stat
- free_pte_range
- free_pmd_range
- free_pud_range
- free_p4d_range
- free_pgd_range
- free_pgtables
- __pte_alloc
- __pte_alloc_kernel
- init_rss_vec
- add_mm_rss_vec
- print_bad_pte
- vm_normal_page
- vm_normal_page_pmd
- copy_one_pte
- copy_pte_range
- copy_pmd_range
- copy_pud_range
- copy_p4d_range
- copy_page_range
- zap_pte_range
- zap_pmd_range
- zap_pud_range
- zap_p4d_range
- unmap_page_range
- unmap_single_vma
- unmap_vmas
- zap_page_range
- zap_page_range_single
- zap_vma_ptes
- __get_locked_pte
- insert_page
- vm_insert_page
- __vm_map_pages
- vm_map_pages
- vm_map_pages_zero
- insert_pfn
- vmf_insert_pfn_prot
- vmf_insert_pfn
- vm_mixed_ok
- __vm_insert_mixed
- vmf_insert_mixed
- vmf_insert_mixed_mkwrite
- remap_pte_range
- remap_pmd_range
- remap_pud_range
- remap_p4d_range
- remap_pfn_range
- vm_iomap_memory
- apply_to_pte_range
- apply_to_pmd_range
- apply_to_pud_range
- apply_to_p4d_range
- apply_to_page_range
- pte_unmap_same
- cow_user_page
- __get_fault_gfp_mask
- do_page_mkwrite
- fault_dirty_shared_page
- wp_page_reuse
- wp_page_copy
- finish_mkwrite_fault
- wp_pfn_shared
- wp_page_shared
- do_wp_page
- unmap_mapping_range_vma
- unmap_mapping_range_tree
- unmap_mapping_pages
- unmap_mapping_range
- do_swap_page
- do_anonymous_page
- __do_fault
- pmd_devmap_trans_unstable
- pte_alloc_one_map
- deposit_prealloc_pte
- do_set_pmd
- do_set_pmd
- alloc_set_pte
- finish_fault
- fault_around_bytes_get
- fault_around_bytes_set
- fault_around_debugfs
- do_fault_around
- do_read_fault
- do_cow_fault
- do_shared_fault
- do_fault
- numa_migrate_prep
- do_numa_page
- create_huge_pmd
- wp_huge_pmd
- vma_is_accessible
- create_huge_pud
- wp_huge_pud
- handle_pte_fault
- __handle_mm_fault
- handle_mm_fault
- __p4d_alloc
- __pud_alloc
- __pmd_alloc
- __follow_pte_pmd
- follow_pte
- follow_pte_pmd
- follow_pfn
- follow_phys
- generic_access_phys
- __access_remote_vm
- access_remote_vm
- access_process_vm
- print_vma_addr
- __might_fault
- process_huge_page
- clear_gigantic_page
- clear_subpage
- clear_huge_page
- copy_user_gigantic_page
- copy_subpage
- copy_user_huge_page
- copy_huge_page_from_user
- ptlock_cache_init
- ptlock_alloc
- ptlock_free
1-41: file header comment block (SPDX license identifier, copyright notice, and historical changelog); the comment text is not preserved in this listing.
42 #include <linux/kernel_stat.h>
43 #include <linux/mm.h>
44 #include <linux/sched/mm.h>
45 #include <linux/sched/coredump.h>
46 #include <linux/sched/numa_balancing.h>
47 #include <linux/sched/task.h>
48 #include <linux/hugetlb.h>
49 #include <linux/mman.h>
50 #include <linux/swap.h>
51 #include <linux/highmem.h>
52 #include <linux/pagemap.h>
53 #include <linux/memremap.h>
54 #include <linux/ksm.h>
55 #include <linux/rmap.h>
56 #include <linux/export.h>
57 #include <linux/delayacct.h>
58 #include <linux/init.h>
59 #include <linux/pfn_t.h>
60 #include <linux/writeback.h>
61 #include <linux/memcontrol.h>
62 #include <linux/mmu_notifier.h>
63 #include <linux/swapops.h>
64 #include <linux/elf.h>
65 #include <linux/gfp.h>
66 #include <linux/migrate.h>
67 #include <linux/string.h>
68 #include <linux/dma-debug.h>
69 #include <linux/debugfs.h>
70 #include <linux/userfaultfd_k.h>
71 #include <linux/dax.h>
72 #include <linux/oom.h>
73 #include <linux/numa.h>
74
75 #include <asm/io.h>
76 #include <asm/mmu_context.h>
77 #include <asm/pgalloc.h>
78 #include <linux/uaccess.h>
79 #include <asm/tlb.h>
80 #include <asm/tlbflush.h>
81 #include <asm/pgtable.h>
82
83 #include "internal.h"
84
85 #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
86 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
87 #endif
88
89 #ifndef CONFIG_NEED_MULTIPLE_NODES
90
91 unsigned long max_mapnr;
92 EXPORT_SYMBOL(max_mapnr);
93
94 struct page *mem_map;
95 EXPORT_SYMBOL(mem_map);
96 #endif
97
98
99
100
101
102
103
104
105 void *high_memory;
106 EXPORT_SYMBOL(high_memory);
107
108
109
110
111
112
113
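/*
 * Address-space randomization policy (stack, mmap base, brk, ...):
 * 0 disables randomization, 1 randomizes everything except brk, and
 * 2 (the default) also randomizes brk.  CONFIG_COMPAT_BRK keeps the
 * default at 1 so ancient libc5-era binaries that rely on a stable
 * brk area keep working.
 */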
114 int randomize_va_space __read_mostly =
115 #ifdef CONFIG_COMPAT_BRK
116 1;
117 #else
118 2;
119 #endif
120
121 static int __init disable_randmaps(char *s)
122 {
123 randomize_va_space = 0;
124 return 1;
125 }
126 __setup("norandmaps", disable_randmaps);
127
128 unsigned long zero_pfn __read_mostly;
129 EXPORT_SYMBOL(zero_pfn);
130
131 unsigned long highest_memmap_pfn __read_mostly;
132
133
134
135
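/* Cache the pfn of the shared zero page so is_zero_pfn() checks stay cheap. */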
136 static int __init init_zero_pfn(void)
137 {
138 zero_pfn = page_to_pfn(ZERO_PAGE(0));
139 return 0;
140 }
141 core_initcall(init_zero_pfn);
142
143
144 #if defined(SPLIT_RSS_COUNTING)
145
146 void sync_mm_rss(struct mm_struct *mm)
147 {
148 int i;
149
150 for (i = 0; i < NR_MM_COUNTERS; i++) {
151 if (current->rss_stat.count[i]) {
152 add_mm_counter(mm, i, current->rss_stat.count[i]);
153 current->rss_stat.count[i] = 0;
154 }
155 }
156 current->rss_stat.events = 0;
157 }
158
159 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
160 {
161 struct task_struct *task = current;
162
163 if (likely(task->mm == mm))
164 task->rss_stat.count[member] += val;
165 else
166 add_mm_counter(mm, member, val);
167 }
168 #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
169 #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
170
171
172 #define TASK_RSS_EVENTS_THRESH (64)
173 static void check_sync_rss_stat(struct task_struct *task)
174 {
175 if (unlikely(task != current))
176 return;
177 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
178 sync_mm_rss(task->mm);
179 }
180 #else
181
182 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
183 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
184
185 static void check_sync_rss_stat(struct task_struct *task)
186 {
187 }
188
189 #endif
190
191
192
193
194
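/*
 * Note: this does not free the actual pages themselves; that is done by
 * zap_page_range().  Only the page-table page hanging off the pmd is
 * released here.
 */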
195 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
196 unsigned long addr)
197 {
198 pgtable_t token = pmd_pgtable(*pmd);
199 pmd_clear(pmd);
200 pte_free_tlb(tlb, token, addr);
201 mm_dec_nr_ptes(tlb->mm);
202 }
203
204 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
205 unsigned long addr, unsigned long end,
206 unsigned long floor, unsigned long ceiling)
207 {
208 pmd_t *pmd;
209 unsigned long next;
210 unsigned long start;
211
212 start = addr;
213 pmd = pmd_offset(pud, addr);
214 do {
215 next = pmd_addr_end(addr, end);
216 if (pmd_none_or_clear_bad(pmd))
217 continue;
218 free_pte_range(tlb, pmd, addr);
219 } while (pmd++, addr = next, addr != end);
220
221 start &= PUD_MASK;
222 if (start < floor)
223 return;
224 if (ceiling) {
225 ceiling &= PUD_MASK;
226 if (!ceiling)
227 return;
228 }
229 if (end - 1 > ceiling - 1)
230 return;
231
232 pmd = pmd_offset(pud, start);
233 pud_clear(pud);
234 pmd_free_tlb(tlb, pmd, start);
235 mm_dec_nr_pmds(tlb->mm);
236 }
237
238 static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
239 unsigned long addr, unsigned long end,
240 unsigned long floor, unsigned long ceiling)
241 {
242 pud_t *pud;
243 unsigned long next;
244 unsigned long start;
245
246 start = addr;
247 pud = pud_offset(p4d, addr);
248 do {
249 next = pud_addr_end(addr, end);
250 if (pud_none_or_clear_bad(pud))
251 continue;
252 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
253 } while (pud++, addr = next, addr != end);
254
255 start &= P4D_MASK;
256 if (start < floor)
257 return;
258 if (ceiling) {
259 ceiling &= P4D_MASK;
260 if (!ceiling)
261 return;
262 }
263 if (end - 1 > ceiling - 1)
264 return;
265
266 pud = pud_offset(p4d, start);
267 p4d_clear(p4d);
268 pud_free_tlb(tlb, pud, start);
269 mm_dec_nr_puds(tlb->mm);
270 }
271
272 static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
273 unsigned long addr, unsigned long end,
274 unsigned long floor, unsigned long ceiling)
275 {
276 p4d_t *p4d;
277 unsigned long next;
278 unsigned long start;
279
280 start = addr;
281 p4d = p4d_offset(pgd, addr);
282 do {
283 next = p4d_addr_end(addr, end);
284 if (p4d_none_or_clear_bad(p4d))
285 continue;
286 free_pud_range(tlb, p4d, addr, next, floor, ceiling);
287 } while (p4d++, addr = next, addr != end);
288
289 start &= PGDIR_MASK;
290 if (start < floor)
291 return;
292 if (ceiling) {
293 ceiling &= PGDIR_MASK;
294 if (!ceiling)
295 return;
296 }
297 if (end - 1 > ceiling - 1)
298 return;
299
300 p4d = p4d_offset(pgd, start);
301 pgd_clear(pgd);
302 p4d_free_tlb(tlb, p4d, start);
303 }
304
305
306
307
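/*
 * Free a range of user page-table pages.  The floor and ceiling
 * arguments bound how far tables may be freed, so entries still needed
 * by neighbouring mappings (or by guard areas) are left in place.
 */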
308 void free_pgd_range(struct mmu_gather *tlb,
309 unsigned long addr, unsigned long end,
310 unsigned long floor, unsigned long ceiling)
311 {
312 pgd_t *pgd;
313 unsigned long next;
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341 addr &= PMD_MASK;
342 if (addr < floor) {
343 addr += PMD_SIZE;
344 if (!addr)
345 return;
346 }
347 if (ceiling) {
348 ceiling &= PMD_MASK;
349 if (!ceiling)
350 return;
351 }
352 if (end - 1 > ceiling - 1)
353 end -= PMD_SIZE;
354 if (addr > end - 1)
355 return;
356
357
358
359
360 tlb_change_page_size(tlb, PAGE_SIZE);
361 pgd = pgd_offset(tlb->mm, addr);
362 do {
363 next = pgd_addr_end(addr, end);
364 if (pgd_none_or_clear_bad(pgd))
365 continue;
366 free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
367 } while (pgd++, addr = next, addr != end);
368 }
369
370 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
371 unsigned long floor, unsigned long ceiling)
372 {
373 while (vma) {
374 struct vm_area_struct *next = vma->vm_next;
375 unsigned long addr = vma->vm_start;
376
377
378
379
380
381 unlink_anon_vmas(vma);
382 unlink_file_vma(vma);
383
384 if (is_vm_hugetlb_page(vma)) {
385 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
386 floor, next ? next->vm_start : ceiling);
387 } else {
388
389
390
391 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
392 && !is_vm_hugetlb_page(next)) {
393 vma = next;
394 next = vma->vm_next;
395 unlink_anon_vmas(vma);
396 unlink_file_vma(vma);
397 }
398 free_pgd_range(tlb, addr, vma->vm_end,
399 floor, next ? next->vm_start : ceiling);
400 }
401 vma = next;
402 }
403 }
404
405 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
406 {
407 spinlock_t *ptl;
408 pgtable_t new = pte_alloc_one(mm);
409 if (!new)
410 return -ENOMEM;
411
412
413
414
415
416
417
418
419
420
421
422
423
424
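/*
 * Ensure all initialisation of the new pte page (lock setup, page
 * clearing) is visible to other CPUs before the pmd is populated
 * and the page table becomes reachable.
 */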
425 smp_wmb();
426
427 ptl = pmd_lock(mm, pmd);
428 if (likely(pmd_none(*pmd))) {
429 mm_inc_nr_ptes(mm);
430 pmd_populate(mm, pmd, new);
431 new = NULL;
432 }
433 spin_unlock(ptl);
434 if (new)
435 pte_free(mm, new);
436 return 0;
437 }
438
439 int __pte_alloc_kernel(pmd_t *pmd)
440 {
441 pte_t *new = pte_alloc_one_kernel(&init_mm);
442 if (!new)
443 return -ENOMEM;
444
445 smp_wmb();
446
447 spin_lock(&init_mm.page_table_lock);
448 if (likely(pmd_none(*pmd))) {
449 pmd_populate_kernel(&init_mm, pmd, new);
450 new = NULL;
451 }
452 spin_unlock(&init_mm.page_table_lock);
453 if (new)
454 pte_free_kernel(&init_mm, new);
455 return 0;
456 }
457
458 static inline void init_rss_vec(int *rss)
459 {
460 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
461 }
462
463 static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
464 {
465 int i;
466
467 if (current->mm == mm)
468 sync_mm_rss(mm);
469 for (i = 0; i < NR_MM_COUNTERS; i++)
470 if (rss[i])
471 add_mm_counter(mm, i, rss[i]);
472 }
473
474
475
476
477
478
479
480
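/*
 * Report a pte that should never appear in this vma: dump the pte, the
 * page (if any) and the vma, rate-limit the output, and taint the
 * kernel with TAINT_BAD_PAGE.
 */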
481 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
482 pte_t pte, struct page *page)
483 {
484 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
485 p4d_t *p4d = p4d_offset(pgd, addr);
486 pud_t *pud = pud_offset(p4d, addr);
487 pmd_t *pmd = pmd_offset(pud, addr);
488 struct address_space *mapping;
489 pgoff_t index;
490 static unsigned long resume;
491 static unsigned long nr_shown;
492 static unsigned long nr_unshown;
493
494
495
496
497
498 if (nr_shown == 60) {
499 if (time_before(jiffies, resume)) {
500 nr_unshown++;
501 return;
502 }
503 if (nr_unshown) {
504 pr_alert("BUG: Bad page map: %lu messages suppressed\n",
505 nr_unshown);
506 nr_unshown = 0;
507 }
508 nr_shown = 0;
509 }
510 if (nr_shown++ == 0)
511 resume = jiffies + 60 * HZ;
512
513 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
514 index = linear_page_index(vma, addr);
515
516 pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
517 current->comm,
518 (long long)pte_val(pte), (long long)pmd_val(*pmd));
519 if (page)
520 dump_page(page, "bad pte");
521 pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
522 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
523 pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
524 vma->vm_file,
525 vma->vm_ops ? vma->vm_ops->fault : NULL,
526 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
527 mapping ? mapping->a_ops->readpage : NULL);
528 dump_stack();
529 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
530 }
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
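/*
 * vm_normal_page - return the "struct page" behind a pte, or NULL for
 * "special" mappings that carry no refcounted page: the zero page,
 * pte_special() entries, raw VM_PFNMAP ranges and invalid VM_MIXEDMAP
 * pfns.  A NULL return means there is no page the caller may take a
 * reference on.
 */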
574 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
575 pte_t pte)
576 {
577 unsigned long pfn = pte_pfn(pte);
578
579 if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
580 if (likely(!pte_special(pte)))
581 goto check_pfn;
582 if (vma->vm_ops && vma->vm_ops->find_special_page)
583 return vma->vm_ops->find_special_page(vma, addr);
584 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
585 return NULL;
586 if (is_zero_pfn(pfn))
587 return NULL;
588 if (pte_devmap(pte))
589 return NULL;
590
591 print_bad_pte(vma, addr, pte, NULL);
592 return NULL;
593 }
594
595
596
597 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
598 if (vma->vm_flags & VM_MIXEDMAP) {
599 if (!pfn_valid(pfn))
600 return NULL;
601 goto out;
602 } else {
603 unsigned long off;
604 off = (addr - vma->vm_start) >> PAGE_SHIFT;
605 if (pfn == vma->vm_pgoff + off)
606 return NULL;
607 if (!is_cow_mapping(vma->vm_flags))
608 return NULL;
609 }
610 }
611
612 if (is_zero_pfn(pfn))
613 return NULL;
614
615 check_pfn:
616 if (unlikely(pfn > highest_memmap_pfn)) {
617 print_bad_pte(vma, addr, pte, NULL);
618 return NULL;
619 }
620
621
622
623
624
625 out:
626 return pfn_to_page(pfn);
627 }
628
629 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
630 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
631 pmd_t pmd)
632 {
633 unsigned long pfn = pmd_pfn(pmd);
634
635
636
637
638
639
640 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
641 if (vma->vm_flags & VM_MIXEDMAP) {
642 if (!pfn_valid(pfn))
643 return NULL;
644 goto out;
645 } else {
646 unsigned long off;
647 off = (addr - vma->vm_start) >> PAGE_SHIFT;
648 if (pfn == vma->vm_pgoff + off)
649 return NULL;
650 if (!is_cow_mapping(vma->vm_flags))
651 return NULL;
652 }
653 }
654
655 if (pmd_devmap(pmd))
656 return NULL;
657 if (is_zero_pfn(pfn))
658 return NULL;
659 if (unlikely(pfn > highest_memmap_pfn))
660 return NULL;
661
662
663
664
665
666 out:
667 return pfn_to_page(pfn);
668 }
669 #endif
670
671
672
673
674
675
676
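/*
 * Copy one pte from the parent to the child at fork time: duplicate
 * swap and migration entries, take references on mapped pages, and
 * write-protect the source pte for copy-on-write mappings.
 */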
677 static inline unsigned long
678 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
679 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
680 unsigned long addr, int *rss)
681 {
682 unsigned long vm_flags = vma->vm_flags;
683 pte_t pte = *src_pte;
684 struct page *page;
685
686
687 if (unlikely(!pte_present(pte))) {
688 swp_entry_t entry = pte_to_swp_entry(pte);
689
690 if (likely(!non_swap_entry(entry))) {
691 if (swap_duplicate(entry) < 0)
692 return entry.val;
693
694
695 if (unlikely(list_empty(&dst_mm->mmlist))) {
696 spin_lock(&mmlist_lock);
697 if (list_empty(&dst_mm->mmlist))
698 list_add(&dst_mm->mmlist,
699 &src_mm->mmlist);
700 spin_unlock(&mmlist_lock);
701 }
702 rss[MM_SWAPENTS]++;
703 } else if (is_migration_entry(entry)) {
704 page = migration_entry_to_page(entry);
705
706 rss[mm_counter(page)]++;
707
708 if (is_write_migration_entry(entry) &&
709 is_cow_mapping(vm_flags)) {
710
711
712
713
714 make_migration_entry_read(&entry);
715 pte = swp_entry_to_pte(entry);
716 if (pte_swp_soft_dirty(*src_pte))
717 pte = pte_swp_mksoft_dirty(pte);
718 set_pte_at(src_mm, addr, src_pte, pte);
719 }
720 } else if (is_device_private_entry(entry)) {
721 page = device_private_entry_to_page(entry);
722
723
724
725
726
727
728
729
730
731
732 get_page(page);
733 rss[mm_counter(page)]++;
734 page_dup_rmap(page, false);
735
736
737
738
739
740
741
742
743 if (is_write_device_private_entry(entry) &&
744 is_cow_mapping(vm_flags)) {
745 make_device_private_entry_read(&entry);
746 pte = swp_entry_to_pte(entry);
747 set_pte_at(src_mm, addr, src_pte, pte);
748 }
749 }
750 goto out_set_pte;
751 }
752
753
754
755
756
757 if (is_cow_mapping(vm_flags) && pte_write(pte)) {
758 ptep_set_wrprotect(src_mm, addr, src_pte);
759 pte = pte_wrprotect(pte);
760 }
761
762
763
764
765
766 if (vm_flags & VM_SHARED)
767 pte = pte_mkclean(pte);
768 pte = pte_mkold(pte);
769
770 page = vm_normal_page(vma, addr, pte);
771 if (page) {
772 get_page(page);
773 page_dup_rmap(page, false);
774 rss[mm_counter(page)]++;
775 } else if (pte_devmap(pte)) {
776 page = pte_page(pte);
777 }
778
779 out_set_pte:
780 set_pte_at(dst_mm, addr, dst_pte, pte);
781 return 0;
782 }
783
784 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
785 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
786 unsigned long addr, unsigned long end)
787 {
788 pte_t *orig_src_pte, *orig_dst_pte;
789 pte_t *src_pte, *dst_pte;
790 spinlock_t *src_ptl, *dst_ptl;
791 int progress = 0;
792 int rss[NR_MM_COUNTERS];
793 swp_entry_t entry = (swp_entry_t){0};
794
795 again:
796 init_rss_vec(rss);
797
798 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
799 if (!dst_pte)
800 return -ENOMEM;
801 src_pte = pte_offset_map(src_pmd, addr);
802 src_ptl = pte_lockptr(src_mm, src_pmd);
803 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
804 orig_src_pte = src_pte;
805 orig_dst_pte = dst_pte;
806 arch_enter_lazy_mmu_mode();
807
808 do {
809
810
811
812
813 if (progress >= 32) {
814 progress = 0;
815 if (need_resched() ||
816 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
817 break;
818 }
819 if (pte_none(*src_pte)) {
820 progress++;
821 continue;
822 }
823 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
824 vma, addr, rss);
825 if (entry.val)
826 break;
827 progress += 8;
828 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
829
830 arch_leave_lazy_mmu_mode();
831 spin_unlock(src_ptl);
832 pte_unmap(orig_src_pte);
833 add_mm_rss_vec(dst_mm, rss);
834 pte_unmap_unlock(orig_dst_pte, dst_ptl);
835 cond_resched();
836
837 if (entry.val) {
838 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
839 return -ENOMEM;
840 progress = 0;
841 }
842 if (addr != end)
843 goto again;
844 return 0;
845 }
846
847 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
848 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
849 unsigned long addr, unsigned long end)
850 {
851 pmd_t *src_pmd, *dst_pmd;
852 unsigned long next;
853
854 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
855 if (!dst_pmd)
856 return -ENOMEM;
857 src_pmd = pmd_offset(src_pud, addr);
858 do {
859 next = pmd_addr_end(addr, end);
860 if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
861 || pmd_devmap(*src_pmd)) {
862 int err;
863 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
864 err = copy_huge_pmd(dst_mm, src_mm,
865 dst_pmd, src_pmd, addr, vma);
866 if (err == -ENOMEM)
867 return -ENOMEM;
868 if (!err)
869 continue;
870
871 }
872 if (pmd_none_or_clear_bad(src_pmd))
873 continue;
874 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
875 vma, addr, next))
876 return -ENOMEM;
877 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
878 return 0;
879 }
880
881 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
882 p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
883 unsigned long addr, unsigned long end)
884 {
885 pud_t *src_pud, *dst_pud;
886 unsigned long next;
887
888 dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
889 if (!dst_pud)
890 return -ENOMEM;
891 src_pud = pud_offset(src_p4d, addr);
892 do {
893 next = pud_addr_end(addr, end);
894 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
895 int err;
896
897 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
898 err = copy_huge_pud(dst_mm, src_mm,
899 dst_pud, src_pud, addr, vma);
900 if (err == -ENOMEM)
901 return -ENOMEM;
902 if (!err)
903 continue;
904
905 }
906 if (pud_none_or_clear_bad(src_pud))
907 continue;
908 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
909 vma, addr, next))
910 return -ENOMEM;
911 } while (dst_pud++, src_pud++, addr = next, addr != end);
912 return 0;
913 }
914
915 static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
916 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
917 unsigned long addr, unsigned long end)
918 {
919 p4d_t *src_p4d, *dst_p4d;
920 unsigned long next;
921
922 dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
923 if (!dst_p4d)
924 return -ENOMEM;
925 src_p4d = p4d_offset(src_pgd, addr);
926 do {
927 next = p4d_addr_end(addr, end);
928 if (p4d_none_or_clear_bad(src_p4d))
929 continue;
930 if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
931 vma, addr, next))
932 return -ENOMEM;
933 } while (dst_p4d++, src_p4d++, addr = next, addr != end);
934 return 0;
935 }
936
937 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
938 struct vm_area_struct *vma)
939 {
940 pgd_t *src_pgd, *dst_pgd;
941 unsigned long next;
942 unsigned long addr = vma->vm_start;
943 unsigned long end = vma->vm_end;
944 struct mmu_notifier_range range;
945 bool is_cow;
946 int ret;
947
948
949
950
951
952
953
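/*
 * Don't copy ptes that a later page fault can fill in correctly:
 * fork becomes much cheaper for big shared or private read-only
 * file mappings, which are simply refaulted in the child.
 */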
954 if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
955 !vma->anon_vma)
956 return 0;
957
958 if (is_vm_hugetlb_page(vma))
959 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
960
961 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
962
963
964
965
966 ret = track_pfn_copy(vma);
967 if (ret)
968 return ret;
969 }
970
971
972
973
974
975
976
977 is_cow = is_cow_mapping(vma->vm_flags);
978
979 if (is_cow) {
980 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
981 0, vma, src_mm, addr, end);
982 mmu_notifier_invalidate_range_start(&range);
983 }
984
985 ret = 0;
986 dst_pgd = pgd_offset(dst_mm, addr);
987 src_pgd = pgd_offset(src_mm, addr);
988 do {
989 next = pgd_addr_end(addr, end);
990 if (pgd_none_or_clear_bad(src_pgd))
991 continue;
992 if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
993 vma, addr, next))) {
994 ret = -ENOMEM;
995 break;
996 }
997 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
998
999 if (is_cow)
1000 mmu_notifier_invalidate_range_end(&range);
1001 return ret;
1002 }
1003
1004 static unsigned long zap_pte_range(struct mmu_gather *tlb,
1005 struct vm_area_struct *vma, pmd_t *pmd,
1006 unsigned long addr, unsigned long end,
1007 struct zap_details *details)
1008 {
1009 struct mm_struct *mm = tlb->mm;
1010 int force_flush = 0;
1011 int rss[NR_MM_COUNTERS];
1012 spinlock_t *ptl;
1013 pte_t *start_pte;
1014 pte_t *pte;
1015 swp_entry_t entry;
1016
1017 tlb_change_page_size(tlb, PAGE_SIZE);
1018 again:
1019 init_rss_vec(rss);
1020 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1021 pte = start_pte;
1022 flush_tlb_batched_pending(mm);
1023 arch_enter_lazy_mmu_mode();
1024 do {
1025 pte_t ptent = *pte;
1026 if (pte_none(ptent))
1027 continue;
1028
1029 if (need_resched())
1030 break;
1031
1032 if (pte_present(ptent)) {
1033 struct page *page;
1034
1035 page = vm_normal_page(vma, addr, ptent);
1036 if (unlikely(details) && page) {
1037
1038
1039
1040
1041
1042 if (details->check_mapping &&
1043 details->check_mapping != page_rmapping(page))
1044 continue;
1045 }
1046 ptent = ptep_get_and_clear_full(mm, addr, pte,
1047 tlb->fullmm);
1048 tlb_remove_tlb_entry(tlb, pte, addr);
1049 if (unlikely(!page))
1050 continue;
1051
1052 if (!PageAnon(page)) {
1053 if (pte_dirty(ptent)) {
1054 force_flush = 1;
1055 set_page_dirty(page);
1056 }
1057 if (pte_young(ptent) &&
1058 likely(!(vma->vm_flags & VM_SEQ_READ)))
1059 mark_page_accessed(page);
1060 }
1061 rss[mm_counter(page)]--;
1062 page_remove_rmap(page, false);
1063 if (unlikely(page_mapcount(page) < 0))
1064 print_bad_pte(vma, addr, ptent, page);
1065 if (unlikely(__tlb_remove_page(tlb, page))) {
1066 force_flush = 1;
1067 addr += PAGE_SIZE;
1068 break;
1069 }
1070 continue;
1071 }
1072
1073 entry = pte_to_swp_entry(ptent);
1074 if (non_swap_entry(entry) && is_device_private_entry(entry)) {
1075 struct page *page = device_private_entry_to_page(entry);
1076
1077 if (unlikely(details && details->check_mapping)) {
1078
1079
1080
1081
1082
1083 if (details->check_mapping !=
1084 page_rmapping(page))
1085 continue;
1086 }
1087
1088 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1089 rss[mm_counter(page)]--;
1090 page_remove_rmap(page, false);
1091 put_page(page);
1092 continue;
1093 }
1094
1095
1096 if (unlikely(details))
1097 continue;
1098
1099 if (!non_swap_entry(entry))
1100 rss[MM_SWAPENTS]--;
1101 else if (is_migration_entry(entry)) {
1102 struct page *page;
1103
1104 page = migration_entry_to_page(entry);
1105 rss[mm_counter(page)]--;
1106 }
1107 if (unlikely(!free_swap_and_cache(entry)))
1108 print_bad_pte(vma, addr, ptent, NULL);
1109 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1110 } while (pte++, addr += PAGE_SIZE, addr != end);
1111
1112 add_mm_rss_vec(mm, rss);
1113 arch_leave_lazy_mmu_mode();
1114
1115
1116 if (force_flush)
1117 tlb_flush_mmu_tlbonly(tlb);
1118 pte_unmap_unlock(start_pte, ptl);
1119
1120
1121
1122
1123
1124
1125
1126 if (force_flush) {
1127 force_flush = 0;
1128 tlb_flush_mmu(tlb);
1129 }
1130
1131 if (addr != end) {
1132 cond_resched();
1133 goto again;
1134 }
1135
1136 return addr;
1137 }
1138
1139 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1140 struct vm_area_struct *vma, pud_t *pud,
1141 unsigned long addr, unsigned long end,
1142 struct zap_details *details)
1143 {
1144 pmd_t *pmd;
1145 unsigned long next;
1146
1147 pmd = pmd_offset(pud, addr);
1148 do {
1149 next = pmd_addr_end(addr, end);
1150 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1151 if (next - addr != HPAGE_PMD_SIZE)
1152 __split_huge_pmd(vma, pmd, addr, false, NULL);
1153 else if (zap_huge_pmd(tlb, vma, pmd, addr))
1154 goto next;
1155
1156 }
1157
1158
1159
1160
1161
1162
1163
1164 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1165 goto next;
1166 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1167 next:
1168 cond_resched();
1169 } while (pmd++, addr = next, addr != end);
1170
1171 return addr;
1172 }
1173
1174 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1175 struct vm_area_struct *vma, p4d_t *p4d,
1176 unsigned long addr, unsigned long end,
1177 struct zap_details *details)
1178 {
1179 pud_t *pud;
1180 unsigned long next;
1181
1182 pud = pud_offset(p4d, addr);
1183 do {
1184 next = pud_addr_end(addr, end);
1185 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1186 if (next - addr != HPAGE_PUD_SIZE) {
1187 VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
1188 split_huge_pud(vma, pud, addr);
1189 } else if (zap_huge_pud(tlb, vma, pud, addr))
1190 goto next;
1191
1192 }
1193 if (pud_none_or_clear_bad(pud))
1194 continue;
1195 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1196 next:
1197 cond_resched();
1198 } while (pud++, addr = next, addr != end);
1199
1200 return addr;
1201 }
1202
1203 static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1204 struct vm_area_struct *vma, pgd_t *pgd,
1205 unsigned long addr, unsigned long end,
1206 struct zap_details *details)
1207 {
1208 p4d_t *p4d;
1209 unsigned long next;
1210
1211 p4d = p4d_offset(pgd, addr);
1212 do {
1213 next = p4d_addr_end(addr, end);
1214 if (p4d_none_or_clear_bad(p4d))
1215 continue;
1216 next = zap_pud_range(tlb, vma, p4d, addr, next, details);
1217 } while (p4d++, addr = next, addr != end);
1218
1219 return addr;
1220 }
1221
1222 void unmap_page_range(struct mmu_gather *tlb,
1223 struct vm_area_struct *vma,
1224 unsigned long addr, unsigned long end,
1225 struct zap_details *details)
1226 {
1227 pgd_t *pgd;
1228 unsigned long next;
1229
1230 BUG_ON(addr >= end);
1231 tlb_start_vma(tlb, vma);
1232 pgd = pgd_offset(vma->vm_mm, addr);
1233 do {
1234 next = pgd_addr_end(addr, end);
1235 if (pgd_none_or_clear_bad(pgd))
1236 continue;
1237 next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
1238 } while (pgd++, addr = next, addr != end);
1239 tlb_end_vma(tlb, vma);
1240 }
1241
1242
1243 static void unmap_single_vma(struct mmu_gather *tlb,
1244 struct vm_area_struct *vma, unsigned long start_addr,
1245 unsigned long end_addr,
1246 struct zap_details *details)
1247 {
1248 unsigned long start = max(vma->vm_start, start_addr);
1249 unsigned long end;
1250
1251 if (start >= vma->vm_end)
1252 return;
1253 end = min(vma->vm_end, end_addr);
1254 if (end <= vma->vm_start)
1255 return;
1256
1257 if (vma->vm_file)
1258 uprobe_munmap(vma, start, end);
1259
1260 if (unlikely(vma->vm_flags & VM_PFNMAP))
1261 untrack_pfn(vma, 0, 0);
1262
1263 if (start != end) {
1264 if (unlikely(is_vm_hugetlb_page(vma))) {
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276 if (vma->vm_file) {
1277 i_mmap_lock_write(vma->vm_file->f_mapping);
1278 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1279 i_mmap_unlock_write(vma->vm_file->f_mapping);
1280 }
1281 } else
1282 unmap_page_range(tlb, vma, start, end, details);
1283 }
1284 }
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
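/*
 * unmap_vmas - unmap a range of memory covered by a list of vmas,
 * dropping the mapped pages and batching TLB flushes through the
 * mmu_gather.  Only addresses below end_addr are unmapped; hugetlb
 * vmas are handed off to the hugetlb-specific teardown.
 */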
1304 void unmap_vmas(struct mmu_gather *tlb,
1305 struct vm_area_struct *vma, unsigned long start_addr,
1306 unsigned long end_addr)
1307 {
1308 struct mmu_notifier_range range;
1309
1310 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
1311 start_addr, end_addr);
1312 mmu_notifier_invalidate_range_start(&range);
1313 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1314 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1315 mmu_notifier_invalidate_range_end(&range);
1316 }
1317
1318
1319
1320
1321
1322
1323
1324
1325
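/*
 * zap_page_range - remove user pages in a given range.
 * @vma: the vm_area_struct holding the applicable pages
 * @start: starting address of the pages to zap
 * @size: number of bytes to zap
 *
 * The caller must protect the vma list; every vma overlapping the range
 * is walked.
 */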
1326 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1327 unsigned long size)
1328 {
1329 struct mmu_notifier_range range;
1330 struct mmu_gather tlb;
1331
1332 lru_add_drain();
1333 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1334 start, start + size);
1335 tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
1336 update_hiwater_rss(vma->vm_mm);
1337 mmu_notifier_invalidate_range_start(&range);
1338 for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
1339 unmap_single_vma(&tlb, vma, start, range.end, NULL);
1340 mmu_notifier_invalidate_range_end(&range);
1341 tlb_finish_mmu(&tlb, start, range.end);
1342 }
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353 static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1354 unsigned long size, struct zap_details *details)
1355 {
1356 struct mmu_notifier_range range;
1357 struct mmu_gather tlb;
1358
1359 lru_add_drain();
1360 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1361 address, address + size);
1362 tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
1363 update_hiwater_rss(vma->vm_mm);
1364 mmu_notifier_invalidate_range_start(&range);
1365 unmap_single_vma(&tlb, vma, address, range.end, details);
1366 mmu_notifier_invalidate_range_end(&range);
1367 tlb_finish_mmu(&tlb, address, range.end);
1368 }
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381 void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1382 unsigned long size)
1383 {
1384 if (address < vma->vm_start || address + size > vma->vm_end ||
1385 !(vma->vm_flags & VM_PFNMAP))
1386 return;
1387
1388 zap_page_range_single(vma, address, size, NULL);
1389 }
1390 EXPORT_SYMBOL_GPL(zap_vma_ptes);
1391
1392 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1393 spinlock_t **ptl)
1394 {
1395 pgd_t *pgd;
1396 p4d_t *p4d;
1397 pud_t *pud;
1398 pmd_t *pmd;
1399
1400 pgd = pgd_offset(mm, addr);
1401 p4d = p4d_alloc(mm, pgd, addr);
1402 if (!p4d)
1403 return NULL;
1404 pud = pud_alloc(mm, p4d, addr);
1405 if (!pud)
1406 return NULL;
1407 pmd = pmd_alloc(mm, pud, addr);
1408 if (!pmd)
1409 return NULL;
1410
1411 VM_BUG_ON(pmd_trans_huge(*pmd));
1412 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1413 }
1414
1415
1416
1417
1418
1419
1420
1421
1422 static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1423 struct page *page, pgprot_t prot)
1424 {
1425 struct mm_struct *mm = vma->vm_mm;
1426 int retval;
1427 pte_t *pte;
1428 spinlock_t *ptl;
1429
1430 retval = -EINVAL;
1431 if (PageAnon(page) || PageSlab(page) || page_has_type(page))
1432 goto out;
1433 retval = -ENOMEM;
1434 flush_dcache_page(page);
1435 pte = get_locked_pte(mm, addr, &ptl);
1436 if (!pte)
1437 goto out;
1438 retval = -EBUSY;
1439 if (!pte_none(*pte))
1440 goto out_unlock;
1441
1442
1443 get_page(page);
1444 inc_mm_counter_fast(mm, mm_counter_file(page));
1445 page_add_file_rmap(page, false);
1446 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1447
1448 retval = 0;
1449 out_unlock:
1450 pte_unmap_unlock(pte, ptl);
1451 out:
1452 return retval;
1453 }
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
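/*
 * vm_insert_page - insert a single refcounted page into a user vma.
 *
 * Lets drivers map pages they allocated into userspace while keeping
 * normal rmap and refcount semantics (no special PFN pte).  The first
 * call flags the vma VM_MIXEDMAP, so it is usually made from an mmap()
 * handler with mmap_sem held for write; callers that insert pages later
 * (e.g. from a fault handler) must set VM_MIXEDMAP themselves.
 */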
1484 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1485 struct page *page)
1486 {
1487 if (addr < vma->vm_start || addr >= vma->vm_end)
1488 return -EFAULT;
1489 if (!page_count(page))
1490 return -EINVAL;
1491 if (!(vma->vm_flags & VM_MIXEDMAP)) {
1492 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
1493 BUG_ON(vma->vm_flags & VM_PFNMAP);
1494 vma->vm_flags |= VM_MIXEDMAP;
1495 }
1496 return insert_page(vma, addr, page, vma->vm_page_prot);
1497 }
1498 EXPORT_SYMBOL(vm_insert_page);
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511 static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1512 unsigned long num, unsigned long offset)
1513 {
1514 unsigned long count = vma_pages(vma);
1515 unsigned long uaddr = vma->vm_start;
1516 int ret, i;
1517
1518
1519 if (offset >= num)
1520 return -ENXIO;
1521
1522
1523 if (count > num - offset)
1524 return -ENXIO;
1525
1526 for (i = 0; i < count; i++) {
1527 ret = vm_insert_page(vma, uaddr, pages[offset + i]);
1528 if (ret < 0)
1529 return ret;
1530 uaddr += PAGE_SIZE;
1531 }
1532
1533 return 0;
1534 }
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554 int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
1555 unsigned long num)
1556 {
1557 return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
1558 }
1559 EXPORT_SYMBOL(vm_map_pages);
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574 int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
1575 unsigned long num)
1576 {
1577 return __vm_map_pages(vma, pages, num, 0);
1578 }
1579 EXPORT_SYMBOL(vm_map_pages_zero);
1580
1581 static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1582 pfn_t pfn, pgprot_t prot, bool mkwrite)
1583 {
1584 struct mm_struct *mm = vma->vm_mm;
1585 pte_t *pte, entry;
1586 spinlock_t *ptl;
1587
1588 pte = get_locked_pte(mm, addr, &ptl);
1589 if (!pte)
1590 return VM_FAULT_OOM;
1591 if (!pte_none(*pte)) {
1592 if (mkwrite) {
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603 if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
1604 WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
1605 goto out_unlock;
1606 }
1607 entry = pte_mkyoung(*pte);
1608 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1609 if (ptep_set_access_flags(vma, addr, pte, entry, 1))
1610 update_mmu_cache(vma, addr, pte);
1611 }
1612 goto out_unlock;
1613 }
1614
1615
1616 if (pfn_t_devmap(pfn))
1617 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1618 else
1619 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1620
1621 if (mkwrite) {
1622 entry = pte_mkyoung(entry);
1623 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1624 }
1625
1626 set_pte_at(mm, addr, pte, entry);
1627 update_mmu_cache(vma, addr, pte);
1628
1629 out_unlock:
1630 pte_unmap_unlock(pte, ptl);
1631 return VM_FAULT_NOPAGE;
1632 }
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652 vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1653 unsigned long pfn, pgprot_t pgprot)
1654 {
1655
1656
1657
1658
1659
1660
1661 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1662 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1663 (VM_PFNMAP|VM_MIXEDMAP));
1664 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1665 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1666
1667 if (addr < vma->vm_start || addr >= vma->vm_end)
1668 return VM_FAULT_SIGBUS;
1669
1670 if (!pfn_modify_allowed(pfn, pgprot))
1671 return VM_FAULT_SIGBUS;
1672
1673 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
1674
1675 return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
1676 false);
1677 }
1678 EXPORT_SYMBOL(vmf_insert_pfn_prot);
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700 vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1701 unsigned long pfn)
1702 {
1703 return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
1704 }
1705 EXPORT_SYMBOL(vmf_insert_pfn);
1706
1707 static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
1708 {
1709
1710 if (vma->vm_flags & VM_MIXEDMAP)
1711 return true;
1712 if (pfn_t_devmap(pfn))
1713 return true;
1714 if (pfn_t_special(pfn))
1715 return true;
1716 if (is_zero_pfn(pfn_t_to_pfn(pfn)))
1717 return true;
1718 return false;
1719 }
1720
1721 static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
1722 unsigned long addr, pfn_t pfn, bool mkwrite)
1723 {
1724 pgprot_t pgprot = vma->vm_page_prot;
1725 int err;
1726
1727 BUG_ON(!vm_mixed_ok(vma, pfn));
1728
1729 if (addr < vma->vm_start || addr >= vma->vm_end)
1730 return VM_FAULT_SIGBUS;
1731
1732 track_pfn_insert(vma, &pgprot, pfn);
1733
1734 if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
1735 return VM_FAULT_SIGBUS;
1736
1737
1738
1739
1740
1741
1742
1743
1744 if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
1745 !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
1746 struct page *page;
1747
1748
1749
1750
1751
1752
1753 page = pfn_to_page(pfn_t_to_pfn(pfn));
1754 err = insert_page(vma, addr, page, pgprot);
1755 } else {
1756 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
1757 }
1758
1759 if (err == -ENOMEM)
1760 return VM_FAULT_OOM;
1761 if (err < 0 && err != -EBUSY)
1762 return VM_FAULT_SIGBUS;
1763
1764 return VM_FAULT_NOPAGE;
1765 }
1766
1767 vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1768 pfn_t pfn)
1769 {
1770 return __vm_insert_mixed(vma, addr, pfn, false);
1771 }
1772 EXPORT_SYMBOL(vmf_insert_mixed);
1773
1774
1775
1776
1777
1778
1779 vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
1780 unsigned long addr, pfn_t pfn)
1781 {
1782 return __vm_insert_mixed(vma, addr, pfn, true);
1783 }
1784 EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
1785
1786
1787
1788
1789
1790
1791 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1792 unsigned long addr, unsigned long end,
1793 unsigned long pfn, pgprot_t prot)
1794 {
1795 pte_t *pte;
1796 spinlock_t *ptl;
1797 int err = 0;
1798
1799 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1800 if (!pte)
1801 return -ENOMEM;
1802 arch_enter_lazy_mmu_mode();
1803 do {
1804 BUG_ON(!pte_none(*pte));
1805 if (!pfn_modify_allowed(pfn, prot)) {
1806 err = -EACCES;
1807 break;
1808 }
1809 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1810 pfn++;
1811 } while (pte++, addr += PAGE_SIZE, addr != end);
1812 arch_leave_lazy_mmu_mode();
1813 pte_unmap_unlock(pte - 1, ptl);
1814 return err;
1815 }
1816
1817 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1818 unsigned long addr, unsigned long end,
1819 unsigned long pfn, pgprot_t prot)
1820 {
1821 pmd_t *pmd;
1822 unsigned long next;
1823 int err;
1824
1825 pfn -= addr >> PAGE_SHIFT;
1826 pmd = pmd_alloc(mm, pud, addr);
1827 if (!pmd)
1828 return -ENOMEM;
1829 VM_BUG_ON(pmd_trans_huge(*pmd));
1830 do {
1831 next = pmd_addr_end(addr, end);
1832 err = remap_pte_range(mm, pmd, addr, next,
1833 pfn + (addr >> PAGE_SHIFT), prot);
1834 if (err)
1835 return err;
1836 } while (pmd++, addr = next, addr != end);
1837 return 0;
1838 }
1839
1840 static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
1841 unsigned long addr, unsigned long end,
1842 unsigned long pfn, pgprot_t prot)
1843 {
1844 pud_t *pud;
1845 unsigned long next;
1846 int err;
1847
1848 pfn -= addr >> PAGE_SHIFT;
1849 pud = pud_alloc(mm, p4d, addr);
1850 if (!pud)
1851 return -ENOMEM;
1852 do {
1853 next = pud_addr_end(addr, end);
1854 err = remap_pmd_range(mm, pud, addr, next,
1855 pfn + (addr >> PAGE_SHIFT), prot);
1856 if (err)
1857 return err;
1858 } while (pud++, addr = next, addr != end);
1859 return 0;
1860 }
1861
1862 static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
1863 unsigned long addr, unsigned long end,
1864 unsigned long pfn, pgprot_t prot)
1865 {
1866 p4d_t *p4d;
1867 unsigned long next;
1868 int err;
1869
1870 pfn -= addr >> PAGE_SHIFT;
1871 p4d = p4d_alloc(mm, pgd, addr);
1872 if (!p4d)
1873 return -ENOMEM;
1874 do {
1875 next = p4d_addr_end(addr, end);
1876 err = remap_pud_range(mm, p4d, addr, next,
1877 pfn + (addr >> PAGE_SHIFT), prot);
1878 if (err)
1879 return err;
1880 } while (p4d++, addr = next, addr != end);
1881 return 0;
1882 }
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
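/*
 * remap_pfn_range - remap kernel memory to userspace.
 * @vma: the user vma to map to
 * @addr: target user address to start at
 * @pfn: page frame number of the kernel memory
 * @size: size of the mapping, in bytes
 * @prot: page protection flags for this mapping
 *
 * The range is installed as special (non-refcounted) ptes and the vma
 * is marked VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP.
 */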
1896 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1897 unsigned long pfn, unsigned long size, pgprot_t prot)
1898 {
1899 pgd_t *pgd;
1900 unsigned long next;
1901 unsigned long end = addr + PAGE_ALIGN(size);
1902 struct mm_struct *mm = vma->vm_mm;
1903 unsigned long remap_pfn = pfn;
1904 int err;
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924 if (is_cow_mapping(vma->vm_flags)) {
1925 if (addr != vma->vm_start || end != vma->vm_end)
1926 return -EINVAL;
1927 vma->vm_pgoff = pfn;
1928 }
1929
1930 err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
1931 if (err)
1932 return -EINVAL;
1933
1934 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1935
1936 BUG_ON(addr >= end);
1937 pfn -= addr >> PAGE_SHIFT;
1938 pgd = pgd_offset(mm, addr);
1939 flush_cache_range(vma, addr, end);
1940 do {
1941 next = pgd_addr_end(addr, end);
1942 err = remap_p4d_range(mm, pgd, addr, next,
1943 pfn + (addr >> PAGE_SHIFT), prot);
1944 if (err)
1945 break;
1946 } while (pgd++, addr = next, addr != end);
1947
1948 if (err)
1949 untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
1950
1951 return err;
1952 }
1953 EXPORT_SYMBOL(remap_pfn_range);
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
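/*
 * vm_iomap_memory - remap memory to userspace.
 * @vma: the user vma to map to
 * @start: start of the physical memory region to be mapped
 * @len: size of the region, in bytes
 *
 * A simplified io_remap_pfn_range() for common driver mmap handlers:
 * it validates the vma against the region and maps whatever portion of
 * the region the vma covers.
 */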
1970 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1971 {
1972 unsigned long vm_len, pfn, pages;
1973
1974
1975 if (start + len < start)
1976 return -EINVAL;
1977
1978
1979
1980
1981
1982 len += start & ~PAGE_MASK;
1983 pfn = start >> PAGE_SHIFT;
1984 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
1985 if (pfn + pages < pfn)
1986 return -EINVAL;
1987
1988
1989 if (vma->vm_pgoff > pages)
1990 return -EINVAL;
1991 pfn += vma->vm_pgoff;
1992 pages -= vma->vm_pgoff;
1993
1994
1995 vm_len = vma->vm_end - vma->vm_start;
1996 if (vm_len >> PAGE_SHIFT > pages)
1997 return -EINVAL;
1998
1999
2000 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2001 }
2002 EXPORT_SYMBOL(vm_iomap_memory);
2003
2004 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2005 unsigned long addr, unsigned long end,
2006 pte_fn_t fn, void *data)
2007 {
2008 pte_t *pte;
2009 int err;
2010 spinlock_t *uninitialized_var(ptl);
2011
2012 pte = (mm == &init_mm) ?
2013 pte_alloc_kernel(pmd, addr) :
2014 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2015 if (!pte)
2016 return -ENOMEM;
2017
2018 BUG_ON(pmd_huge(*pmd));
2019
2020 arch_enter_lazy_mmu_mode();
2021
2022 do {
2023 err = fn(pte++, addr, data);
2024 if (err)
2025 break;
2026 } while (addr += PAGE_SIZE, addr != end);
2027
2028 arch_leave_lazy_mmu_mode();
2029
2030 if (mm != &init_mm)
2031 pte_unmap_unlock(pte-1, ptl);
2032 return err;
2033 }
2034
2035 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2036 unsigned long addr, unsigned long end,
2037 pte_fn_t fn, void *data)
2038 {
2039 pmd_t *pmd;
2040 unsigned long next;
2041 int err;
2042
2043 BUG_ON(pud_huge(*pud));
2044
2045 pmd = pmd_alloc(mm, pud, addr);
2046 if (!pmd)
2047 return -ENOMEM;
2048 do {
2049 next = pmd_addr_end(addr, end);
2050 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2051 if (err)
2052 break;
2053 } while (pmd++, addr = next, addr != end);
2054 return err;
2055 }
2056
2057 static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
2058 unsigned long addr, unsigned long end,
2059 pte_fn_t fn, void *data)
2060 {
2061 pud_t *pud;
2062 unsigned long next;
2063 int err;
2064
2065 pud = pud_alloc(mm, p4d, addr);
2066 if (!pud)
2067 return -ENOMEM;
2068 do {
2069 next = pud_addr_end(addr, end);
2070 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2071 if (err)
2072 break;
2073 } while (pud++, addr = next, addr != end);
2074 return err;
2075 }
2076
2077 static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2078 unsigned long addr, unsigned long end,
2079 pte_fn_t fn, void *data)
2080 {
2081 p4d_t *p4d;
2082 unsigned long next;
2083 int err;
2084
2085 p4d = p4d_alloc(mm, pgd, addr);
2086 if (!p4d)
2087 return -ENOMEM;
2088 do {
2089 next = p4d_addr_end(addr, end);
2090 err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
2091 if (err)
2092 break;
2093 } while (p4d++, addr = next, addr != end);
2094 return err;
2095 }
2096
2097
2098
2099
2100
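/*
 * Scan a region of virtual memory, filling in page tables as necessary,
 * and call a provided function on each leaf page-table entry.
 */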
2101 int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2102 unsigned long size, pte_fn_t fn, void *data)
2103 {
2104 pgd_t *pgd;
2105 unsigned long next;
2106 unsigned long end = addr + size;
2107 int err;
2108
2109 if (WARN_ON(addr >= end))
2110 return -EINVAL;
2111
2112 pgd = pgd_offset(mm, addr);
2113 do {
2114 next = pgd_addr_end(addr, end);
2115 err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
2116 if (err)
2117 break;
2118 } while (pgd++, addr = next, addr != end);
2119
2120 return err;
2121 }
2122 EXPORT_SYMBOL_GPL(apply_to_page_range);
2123
2124
2125
2126
2127
2128
2129
2130
2131
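/*
 * handle_pte_fault reads the pte without the page-table lock, which on
 * 32-bit configurations with 64-bit ptes (e.g. i386 PAE) is not atomic.
 * Before committing to a swap fault we therefore re-check the pte under
 * the lock; pte_unmap_same() unmaps the pte and returns whether it still
 * matches the value read earlier.
 */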
2132 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2133 pte_t *page_table, pte_t orig_pte)
2134 {
2135 int same = 1;
2136 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2137 if (sizeof(pte_t) > sizeof(unsigned long)) {
2138 spinlock_t *ptl = pte_lockptr(mm, pmd);
2139 spin_lock(ptl);
2140 same = pte_same(*page_table, orig_pte);
2141 spin_unlock(ptl);
2142 }
2143 #endif
2144 pte_unmap(page_table);
2145 return same;
2146 }
2147
2148 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2149 {
2150 debug_dma_assert_idle(src);
2151
2152
2153
2154
2155
2156
2157
2158 if (unlikely(!src)) {
2159 void *kaddr = kmap_atomic(dst);
2160 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2161
2162
2163
2164
2165
2166
2167
2168 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2169 clear_page(kaddr);
2170 kunmap_atomic(kaddr);
2171 flush_dcache_page(dst);
2172 } else
2173 copy_user_highpage(dst, src, va, vma);
2174 }
2175
2176 static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2177 {
2178 struct file *vm_file = vma->vm_file;
2179
2180 if (vm_file)
2181 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
2182
2183
2184
2185
2186
2187 return GFP_KERNEL;
2188 }
2189
2190
2191
2192
2193
2194
2195
2196 static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
2197 {
2198 vm_fault_t ret;
2199 struct page *page = vmf->page;
2200 unsigned int old_flags = vmf->flags;
2201
2202 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2203
2204 if (vmf->vma->vm_file &&
2205 IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
2206 return VM_FAULT_SIGBUS;
2207
2208 ret = vmf->vma->vm_ops->page_mkwrite(vmf);
2209
2210 vmf->flags = old_flags;
2211 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2212 return ret;
2213 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
2214 lock_page(page);
2215 if (!page->mapping) {
2216 unlock_page(page);
2217 return 0;
2218 }
2219 ret |= VM_FAULT_LOCKED;
2220 } else
2221 VM_BUG_ON_PAGE(!PageLocked(page), page);
2222 return ret;
2223 }
2224
2225
2226
2227
2228
2229
2230 static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
2231 {
2232 struct vm_area_struct *vma = vmf->vma;
2233 struct address_space *mapping;
2234 struct page *page = vmf->page;
2235 bool dirtied;
2236 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2237
2238 dirtied = set_page_dirty(page);
2239 VM_BUG_ON_PAGE(PageAnon(page), page);
2240
2241
2242
2243
2244
2245
2246 mapping = page_rmapping(page);
2247 unlock_page(page);
2248
2249 if (!page_mkwrite)
2250 file_update_time(vma->vm_file);
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261 if ((dirtied || page_mkwrite) && mapping) {
2262 struct file *fpin;
2263
2264 fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2265 balance_dirty_pages_ratelimited(mapping);
2266 if (fpin) {
2267 fput(fpin);
2268 return VM_FAULT_RETRY;
2269 }
2270 }
2271
2272 return 0;
2273 }
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283 static inline void wp_page_reuse(struct vm_fault *vmf)
2284 __releases(vmf->ptl)
2285 {
2286 struct vm_area_struct *vma = vmf->vma;
2287 struct page *page = vmf->page;
2288 pte_t entry;
2289
2290
2291
2292
2293
2294 if (page)
2295 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2296
2297 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2298 entry = pte_mkyoung(vmf->orig_pte);
2299 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2300 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2301 update_mmu_cache(vma, vmf->address, vmf->pte);
2302 pte_unmap_unlock(vmf->pte, vmf->ptl);
2303 }
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
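/*
 * Handle the case where we actually have to copy the page for a write:
 * the classic copy-on-write break.  Called with mmap_sem held and a
 * reference on the old page, but without the page-table lock; the lock
 * is re-taken after allocating and copying, and the pte is switched to
 * the new page only if it is still unchanged.
 */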
2321 static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2322 {
2323 struct vm_area_struct *vma = vmf->vma;
2324 struct mm_struct *mm = vma->vm_mm;
2325 struct page *old_page = vmf->page;
2326 struct page *new_page = NULL;
2327 pte_t entry;
2328 int page_copied = 0;
2329 struct mem_cgroup *memcg;
2330 struct mmu_notifier_range range;
2331
2332 if (unlikely(anon_vma_prepare(vma)))
2333 goto oom;
2334
2335 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2336 new_page = alloc_zeroed_user_highpage_movable(vma,
2337 vmf->address);
2338 if (!new_page)
2339 goto oom;
2340 } else {
2341 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2342 vmf->address);
2343 if (!new_page)
2344 goto oom;
2345 cow_user_page(new_page, old_page, vmf->address, vma);
2346 }
2347
2348 if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
2349 goto oom_free_new;
2350
2351 __SetPageUptodate(new_page);
2352
2353 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
2354 vmf->address & PAGE_MASK,
2355 (vmf->address & PAGE_MASK) + PAGE_SIZE);
2356 mmu_notifier_invalidate_range_start(&range);
2357
2358
2359
2360
2361 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
2362 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2363 if (old_page) {
2364 if (!PageAnon(old_page)) {
2365 dec_mm_counter_fast(mm,
2366 mm_counter_file(old_page));
2367 inc_mm_counter_fast(mm, MM_ANONPAGES);
2368 }
2369 } else {
2370 inc_mm_counter_fast(mm, MM_ANONPAGES);
2371 }
2372 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2373 entry = mk_pte(new_page, vma->vm_page_prot);
2374 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2375
2376
2377
2378
2379
2380
2381 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2382 page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2383 mem_cgroup_commit_charge(new_page, memcg, false, false);
2384 lru_cache_add_active_or_unevictable(new_page, vma);
2385
2386
2387
2388
2389
2390 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
2391 update_mmu_cache(vma, vmf->address, vmf->pte);
2392 if (old_page) {
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415 page_remove_rmap(old_page, false);
2416 }
2417
2418
2419 new_page = old_page;
2420 page_copied = 1;
2421 } else {
2422 mem_cgroup_cancel_charge(new_page, memcg, false);
2423 }
2424
2425 if (new_page)
2426 put_page(new_page);
2427
2428 pte_unmap_unlock(vmf->pte, vmf->ptl);
2429
2430
2431
2432
2433 mmu_notifier_invalidate_range_only_end(&range);
2434 if (old_page) {
2435
2436
2437
2438
2439 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2440 lock_page(old_page);
2441 if (PageMlocked(old_page))
2442 munlock_vma_page(old_page);
2443 unlock_page(old_page);
2444 }
2445 put_page(old_page);
2446 }
2447 return page_copied ? VM_FAULT_WRITE : 0;
2448 oom_free_new:
2449 put_page(new_page);
2450 oom:
2451 if (old_page)
2452 put_page(old_page);
2453 return VM_FAULT_OOM;
2454 }
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
2473 {
2474 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
2475 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
2476 &vmf->ptl);
2477
2478
2479
2480
2481 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2482 pte_unmap_unlock(vmf->pte, vmf->ptl);
2483 return VM_FAULT_NOPAGE;
2484 }
2485 wp_page_reuse(vmf);
2486 return 0;
2487 }
2488
2489
2490
2491
2492
2493 static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
2494 {
2495 struct vm_area_struct *vma = vmf->vma;
2496
2497 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2498 vm_fault_t ret;
2499
2500 pte_unmap_unlock(vmf->pte, vmf->ptl);
2501 vmf->flags |= FAULT_FLAG_MKWRITE;
2502 ret = vma->vm_ops->pfn_mkwrite(vmf);
2503 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
2504 return ret;
2505 return finish_mkwrite_fault(vmf);
2506 }
2507 wp_page_reuse(vmf);
2508 return VM_FAULT_WRITE;
2509 }
2510
2511 static vm_fault_t wp_page_shared(struct vm_fault *vmf)
2512 __releases(vmf->ptl)
2513 {
2514 struct vm_area_struct *vma = vmf->vma;
2515 vm_fault_t ret = VM_FAULT_WRITE;
2516
2517 get_page(vmf->page);
2518
2519 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2520 vm_fault_t tmp;
2521
2522 pte_unmap_unlock(vmf->pte, vmf->ptl);
2523 tmp = do_page_mkwrite(vmf);
2524 if (unlikely(!tmp || (tmp &
2525 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2526 put_page(vmf->page);
2527 return tmp;
2528 }
2529 tmp = finish_mkwrite_fault(vmf);
2530 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2531 unlock_page(vmf->page);
2532 put_page(vmf->page);
2533 return tmp;
2534 }
2535 } else {
2536 wp_page_reuse(vmf);
2537 lock_page(vmf->page);
2538 }
2539 ret |= fault_dirty_shared_page(vmf);
2540 put_page(vmf->page);
2541
2542 return ret;
2543 }
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
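/*
 * Handles a write to a present but write-protected pte: shared writable
 * mappings are made writable again (calling ->pfn_mkwrite or
 * ->page_mkwrite where provided), exclusively mapped anonymous pages are
 * reused in place, and everything else gets a copy-on-write break via
 * wp_page_copy().  Entered with the page-table lock held; the lock is
 * released before returning.
 */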
2563 static vm_fault_t do_wp_page(struct vm_fault *vmf)
2564 __releases(vmf->ptl)
2565 {
2566 struct vm_area_struct *vma = vmf->vma;
2567
2568 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
2569 if (!vmf->page) {
2570
2571
2572
2573
2574
2575
2576
2577 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2578 (VM_WRITE|VM_SHARED))
2579 return wp_pfn_shared(vmf);
2580
2581 pte_unmap_unlock(vmf->pte, vmf->ptl);
2582 return wp_page_copy(vmf);
2583 }
2584
2585
2586
2587
2588
2589 if (PageAnon(vmf->page)) {
2590 int total_map_swapcount;
2591 if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) ||
2592 page_count(vmf->page) != 1))
2593 goto copy;
2594 if (!trylock_page(vmf->page)) {
2595 get_page(vmf->page);
2596 pte_unmap_unlock(vmf->pte, vmf->ptl);
2597 lock_page(vmf->page);
2598 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2599 vmf->address, &vmf->ptl);
2600 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2601 unlock_page(vmf->page);
2602 pte_unmap_unlock(vmf->pte, vmf->ptl);
2603 put_page(vmf->page);
2604 return 0;
2605 }
2606 put_page(vmf->page);
2607 }
2608 if (PageKsm(vmf->page)) {
2609 bool reused = reuse_ksm_page(vmf->page, vmf->vma,
2610 vmf->address);
2611 unlock_page(vmf->page);
2612 if (!reused)
2613 goto copy;
2614 wp_page_reuse(vmf);
2615 return VM_FAULT_WRITE;
2616 }
2617 if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
2618 if (total_map_swapcount == 1) {
2619
2620
2621
2622
2623
2624
2625
2626 page_move_anon_rmap(vmf->page, vma);
2627 }
2628 unlock_page(vmf->page);
2629 wp_page_reuse(vmf);
2630 return VM_FAULT_WRITE;
2631 }
2632 unlock_page(vmf->page);
2633 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2634 (VM_WRITE|VM_SHARED))) {
2635 return wp_page_shared(vmf);
2636 }
2637 copy:
2638
2639
2640
2641 get_page(vmf->page);
2642
2643 pte_unmap_unlock(vmf->pte, vmf->ptl);
2644 return wp_page_copy(vmf);
2645 }
2646
2647 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2648 unsigned long start_addr, unsigned long end_addr,
2649 struct zap_details *details)
2650 {
2651 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2652 }
2653
2654 static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
2655 struct zap_details *details)
2656 {
2657 struct vm_area_struct *vma;
2658 pgoff_t vba, vea, zba, zea;
2659
2660 vma_interval_tree_foreach(vma, root,
2661 details->first_index, details->last_index) {
2662
2663 vba = vma->vm_pgoff;
2664 vea = vba + vma_pages(vma) - 1;
2665 zba = details->first_index;
2666 if (zba < vba)
2667 zba = vba;
2668 zea = details->last_index;
2669 if (zea > vea)
2670 zea = vea;
2671
2672 unmap_mapping_range_vma(vma,
2673 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2674 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2675 details);
2676 }
2677 }
2678
2679 /**
2680  * unmap_mapping_pages() - Unmap pages from processes.
2681  * @mapping: The address space containing pages to be unmapped.
2682  * @start: Index of first page to be unmapped.
2683  * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
2684  * @even_cows: Whether to unmap even private COWed pages.
2685  *
2686  * Unmap the pages in this address space from any userspace process which
2687  * has them mmaped.  Generally, you want to remove COWed pages as well when
2688  * a file is being truncated, but not when invalidating pages from the page
2689  * cache.
2690  */
2691 void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
2692 pgoff_t nr, bool even_cows)
2693 {
2694 struct zap_details details = { };
2695
2696 details.check_mapping = even_cows ? NULL : mapping;
2697 details.first_index = start;
2698 details.last_index = start + nr - 1;
2699 if (details.last_index < details.first_index)
2700 details.last_index = ULONG_MAX;
2701
2702 i_mmap_lock_write(mapping);
2703 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
2704 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2705 i_mmap_unlock_write(mapping);
2706 }
2707
2708 /**
2709  * unmap_mapping_range - unmap the portion of all mmaps in the specified
2710  * address_space corresponding to the specified byte range in the
2711  * underlying file.
2712  *
2713  * @mapping: the address space containing mmaps to be unmapped.
2714  * @holebegin: byte in first page to unmap, relative to the start of
2715  * the underlying file.  This will be rounded down to a PAGE_SIZE
2716  * boundary.  Note that this is different from truncate_pagecache(), which
2717  * must keep the partial page.  In contrast, we must get rid of
2718  * partial pages.
2719  * @holelen: size of prospective hole in bytes.  This will be rounded
2720  * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
2721  * end of the file.
2722  * @even_cows: 1 when truncating a file, unmap even private COWed pages;
2723  * but 0 when invalidating pagecache, don't throw away private data.
2724  */
2725 void unmap_mapping_range(struct address_space *mapping,
2726 loff_t const holebegin, loff_t const holelen, int even_cows)
2727 {
2728 pgoff_t hba = holebegin >> PAGE_SHIFT;
2729 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2730
2731
2732 if (sizeof(holelen) > sizeof(hlen)) {
2733 long long holeend =
2734 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2735 if (holeend & ~(long long)ULONG_MAX)
2736 hlen = ULONG_MAX - hba + 1;
2737 }
2738
2739 unmap_mapping_pages(mapping, hba, hlen, even_cows);
2740 }
2741 EXPORT_SYMBOL(unmap_mapping_range);
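/*
 * Usage sketch: roughly how a truncation path uses unmap_mapping_range()
 * before dropping page cache, mirroring what mm/truncate.c's
 * truncate_pagecache() does.  The helper name below is hypothetical and is
 * shown only to illustrate the calling convention, not a definitive
 * implementation.
 */
#if 0
static void example_truncate_pagecache(struct inode *inode, loff_t newsize)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t holebegin = round_up(newsize, PAGE_SIZE);

	/*
	 * Unmap before and after truncating the page cache: the first call
	 * removes existing user mappings of the dying range (even_cows == 1,
	 * so private COWed copies go too), the second catches any mapping
	 * that raced in while the pages were being truncated.  holelen == 0
	 * means "to the end of the file".
	 */
	unmap_mapping_range(mapping, holebegin, 0, 1);
	truncate_inode_pages(mapping, newsize);
	unmap_mapping_range(mapping, holebegin, 0, 1);
}
#endif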
2742
2743 /*
2744  * We enter with non-exclusive mmap_sem (to exclude vma changes,
2745  * but allow concurrent faults), and pte mapped but not yet locked.
2746  * We return with pte unmapped and unlocked.
2747  *
2748  * We return with the mmap_sem locked or unlocked in the same cases
2749  * as does filemap_fault().
2750  */
2751 vm_fault_t do_swap_page(struct vm_fault *vmf)
2752 {
2753 struct vm_area_struct *vma = vmf->vma;
2754 struct page *page = NULL, *swapcache;
2755 struct mem_cgroup *memcg;
2756 swp_entry_t entry;
2757 pte_t pte;
2758 int locked;
2759 int exclusive = 0;
2760 vm_fault_t ret = 0;
2761
2762 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
2763 goto out;
2764
2765 entry = pte_to_swp_entry(vmf->orig_pte);
2766 if (unlikely(non_swap_entry(entry))) {
2767 if (is_migration_entry(entry)) {
2768 migration_entry_wait(vma->vm_mm, vmf->pmd,
2769 vmf->address);
2770 } else if (is_device_private_entry(entry)) {
2771 vmf->page = device_private_entry_to_page(entry);
2772 ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
2773 } else if (is_hwpoison_entry(entry)) {
2774 ret = VM_FAULT_HWPOISON;
2775 } else {
2776 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
2777 ret = VM_FAULT_SIGBUS;
2778 }
2779 goto out;
2780 }
2781
2782
2783 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2784 page = lookup_swap_cache(entry, vma, vmf->address);
2785 swapcache = page;
2786
2787 if (!page) {
2788 struct swap_info_struct *si = swp_swap_info(entry);
2789
2790 if (si->flags & SWP_SYNCHRONOUS_IO &&
2791 __swap_count(entry) == 1) {
2792
2793 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2794 vmf->address);
2795 if (page) {
2796 __SetPageLocked(page);
2797 __SetPageSwapBacked(page);
2798 set_page_private(page, entry.val);
2799 lru_cache_add_anon(page);
2800 swap_readpage(page, true);
2801 }
2802 } else {
2803 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2804 vmf);
2805 swapcache = page;
2806 }
2807
2808 if (!page) {
2809
2810
2811
2812
2813 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2814 vmf->address, &vmf->ptl);
2815 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
2816 ret = VM_FAULT_OOM;
2817 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2818 goto unlock;
2819 }
2820
2821
2822 ret = VM_FAULT_MAJOR;
2823 count_vm_event(PGMAJFAULT);
2824 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
2825 } else if (PageHWPoison(page)) {
2826
2827
2828
2829
2830 ret = VM_FAULT_HWPOISON;
2831 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2832 goto out_release;
2833 }
2834
2835 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
2836
2837 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2838 if (!locked) {
2839 ret |= VM_FAULT_RETRY;
2840 goto out_release;
2841 }
2842
2843
2844
2845
2846
2847
2848
2849 if (unlikely((!PageSwapCache(page) ||
2850 page_private(page) != entry.val)) && swapcache)
2851 goto out_page;
2852
2853 page = ksm_might_need_to_copy(page, vma, vmf->address);
2854 if (unlikely(!page)) {
2855 ret = VM_FAULT_OOM;
2856 page = swapcache;
2857 goto out_page;
2858 }
2859
2860 if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
2861 &memcg, false)) {
2862 ret = VM_FAULT_OOM;
2863 goto out_page;
2864 }
2865
2866
2867
2868
2869 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2870 &vmf->ptl);
2871 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
2872 goto out_nomap;
2873
2874 if (unlikely(!PageUptodate(page))) {
2875 ret = VM_FAULT_SIGBUS;
2876 goto out_nomap;
2877 }
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2890 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
2891 pte = mk_pte(page, vma->vm_page_prot);
2892 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
2893 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2894 vmf->flags &= ~FAULT_FLAG_WRITE;
2895 ret |= VM_FAULT_WRITE;
2896 exclusive = RMAP_EXCLUSIVE;
2897 }
2898 flush_icache_page(vma, page);
2899 if (pte_swp_soft_dirty(vmf->orig_pte))
2900 pte = pte_mksoft_dirty(pte);
2901 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
2902 arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
2903 vmf->orig_pte = pte;
2904
2905
2906 if (unlikely(page != swapcache && swapcache)) {
2907 page_add_new_anon_rmap(page, vma, vmf->address, false);
2908 mem_cgroup_commit_charge(page, memcg, false, false);
2909 lru_cache_add_active_or_unevictable(page, vma);
2910 } else {
2911 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
2912 mem_cgroup_commit_charge(page, memcg, true, false);
2913 activate_page(page);
2914 }
2915
2916 swap_free(entry);
2917 if (mem_cgroup_swap_full(page) ||
2918 (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2919 try_to_free_swap(page);
2920 unlock_page(page);
2921 if (page != swapcache && swapcache) {
2922
2923
2924
2925
2926
2927
2928
2929
2930 unlock_page(swapcache);
2931 put_page(swapcache);
2932 }
2933
2934 if (vmf->flags & FAULT_FLAG_WRITE) {
2935 ret |= do_wp_page(vmf);
2936 if (ret & VM_FAULT_ERROR)
2937 ret &= VM_FAULT_ERROR;
2938 goto out;
2939 }
2940
2941
2942 update_mmu_cache(vma, vmf->address, vmf->pte);
2943 unlock:
2944 pte_unmap_unlock(vmf->pte, vmf->ptl);
2945 out:
2946 return ret;
2947 out_nomap:
2948 mem_cgroup_cancel_charge(page, memcg, false);
2949 pte_unmap_unlock(vmf->pte, vmf->ptl);
2950 out_page:
2951 unlock_page(page);
2952 out_release:
2953 put_page(page);
2954 if (page != swapcache && swapcache) {
2955 unlock_page(swapcache);
2956 put_page(swapcache);
2957 }
2958 return ret;
2959 }
2960
2961 /*
2962  * We enter with non-exclusive mmap_sem (to exclude vma changes,
2963  * but allow concurrent faults), and pte mapped but not yet locked.
2964  * We return with mmap_sem still held, but pte unmapped and unlocked.
2965  */
2966 static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
2967 {
2968 struct vm_area_struct *vma = vmf->vma;
2969 struct mem_cgroup *memcg;
2970 struct page *page;
2971 vm_fault_t ret = 0;
2972 pte_t entry;
2973
2974
2975 if (vma->vm_flags & VM_SHARED)
2976 return VM_FAULT_SIGBUS;
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988 if (pte_alloc(vma->vm_mm, vmf->pmd))
2989 return VM_FAULT_OOM;
2990
2991
2992 if (unlikely(pmd_trans_unstable(vmf->pmd)))
2993 return 0;
2994
2995
2996 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
2997 !mm_forbids_zeropage(vma->vm_mm)) {
2998 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
2999 vma->vm_page_prot));
3000 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3001 vmf->address, &vmf->ptl);
3002 if (!pte_none(*vmf->pte))
3003 goto unlock;
3004 ret = check_stable_address_space(vma->vm_mm);
3005 if (ret)
3006 goto unlock;
3007
3008 if (userfaultfd_missing(vma)) {
3009 pte_unmap_unlock(vmf->pte, vmf->ptl);
3010 return handle_userfault(vmf, VM_UFFD_MISSING);
3011 }
3012 goto setpte;
3013 }
3014
3015
3016 if (unlikely(anon_vma_prepare(vma)))
3017 goto oom;
3018 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
3019 if (!page)
3020 goto oom;
3021
3022 if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
3023 false))
3024 goto oom_free_page;
3025
3026
3027
3028
3029
3030
3031 __SetPageUptodate(page);
3032
3033 entry = mk_pte(page, vma->vm_page_prot);
3034 if (vma->vm_flags & VM_WRITE)
3035 entry = pte_mkwrite(pte_mkdirty(entry));
3036
3037 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3038 &vmf->ptl);
3039 if (!pte_none(*vmf->pte))
3040 goto release;
3041
3042 ret = check_stable_address_space(vma->vm_mm);
3043 if (ret)
3044 goto release;
3045
3046
3047 if (userfaultfd_missing(vma)) {
3048 pte_unmap_unlock(vmf->pte, vmf->ptl);
3049 mem_cgroup_cancel_charge(page, memcg, false);
3050 put_page(page);
3051 return handle_userfault(vmf, VM_UFFD_MISSING);
3052 }
3053
3054 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3055 page_add_new_anon_rmap(page, vma, vmf->address, false);
3056 mem_cgroup_commit_charge(page, memcg, false, false);
3057 lru_cache_add_active_or_unevictable(page, vma);
3058 setpte:
3059 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3060
3061
3062 update_mmu_cache(vma, vmf->address, vmf->pte);
3063 unlock:
3064 pte_unmap_unlock(vmf->pte, vmf->ptl);
3065 return ret;
3066 release:
3067 mem_cgroup_cancel_charge(page, memcg, false);
3068 put_page(page);
3069 goto unlock;
3070 oom_free_page:
3071 put_page(page);
3072 oom:
3073 return VM_FAULT_OOM;
3074 }
3075
3076 /*
3077  * The mmap_sem must have been held on entry, and may have been
3078  * released depending on flags and vma->vm_ops->fault() return value.
3079  * See filemap_fault() and __lock_page_or_retry().
3080  */
3081 static vm_fault_t __do_fault(struct vm_fault *vmf)
3082 {
3083 struct vm_area_struct *vma = vmf->vma;
3084 vm_fault_t ret;
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
3102 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3103 if (!vmf->prealloc_pte)
3104 return VM_FAULT_OOM;
3105 smp_wmb();
3106 }
3107
3108 ret = vma->vm_ops->fault(vmf);
3109 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3110 VM_FAULT_DONE_COW)))
3111 return ret;
3112
3113 if (unlikely(PageHWPoison(vmf->page))) {
3114 if (ret & VM_FAULT_LOCKED)
3115 unlock_page(vmf->page);
3116 put_page(vmf->page);
3117 vmf->page = NULL;
3118 return VM_FAULT_HWPOISON;
3119 }
3120
3121 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3122 lock_page(vmf->page);
3123 else
3124 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
3125
3126 return ret;
3127 }
3128
3129
3130
3131
3132
3133
3134
3135 static int pmd_devmap_trans_unstable(pmd_t *pmd)
3136 {
3137 return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
3138 }
3139
3140 static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
3141 {
3142 struct vm_area_struct *vma = vmf->vma;
3143
3144 if (!pmd_none(*vmf->pmd))
3145 goto map_pte;
3146 if (vmf->prealloc_pte) {
3147 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3148 if (unlikely(!pmd_none(*vmf->pmd))) {
3149 spin_unlock(vmf->ptl);
3150 goto map_pte;
3151 }
3152
3153 mm_inc_nr_ptes(vma->vm_mm);
3154 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3155 spin_unlock(vmf->ptl);
3156 vmf->prealloc_pte = NULL;
3157 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
3158 return VM_FAULT_OOM;
3159 }
3160 map_pte:
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172 if (pmd_devmap_trans_unstable(vmf->pmd))
3173 return VM_FAULT_NOPAGE;
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3185 &vmf->ptl);
3186 return 0;
3187 }
3188
3189 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3190 static void deposit_prealloc_pte(struct vm_fault *vmf)
3191 {
3192 struct vm_area_struct *vma = vmf->vma;
3193
3194 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3195
3196
3197
3198
3199 mm_inc_nr_ptes(vma->vm_mm);
3200 vmf->prealloc_pte = NULL;
3201 }
3202
3203 static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3204 {
3205 struct vm_area_struct *vma = vmf->vma;
3206 bool write = vmf->flags & FAULT_FLAG_WRITE;
3207 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3208 pmd_t entry;
3209 int i;
3210 vm_fault_t ret;
3211
3212 if (!transhuge_vma_suitable(vma, haddr))
3213 return VM_FAULT_FALLBACK;
3214
3215 ret = VM_FAULT_FALLBACK;
3216 page = compound_head(page);
3217
3218
3219
3220
3221
3222 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3223 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3224 if (!vmf->prealloc_pte)
3225 return VM_FAULT_OOM;
3226 smp_wmb();
3227 }
3228
3229 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3230 if (unlikely(!pmd_none(*vmf->pmd)))
3231 goto out;
3232
3233 for (i = 0; i < HPAGE_PMD_NR; i++)
3234 flush_icache_page(vma, page + i);
3235
3236 entry = mk_huge_pmd(page, vma->vm_page_prot);
3237 if (write)
3238 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3239
3240 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
3241 page_add_file_rmap(page, true);
3242
3243
3244
3245 if (arch_needs_pgtable_deposit())
3246 deposit_prealloc_pte(vmf);
3247
3248 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
3249
3250 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
3251
3252
3253 ret = 0;
3254 count_vm_event(THP_FILE_MAPPED);
3255 out:
3256 spin_unlock(vmf->ptl);
3257 return ret;
3258 }
3259 #else
3260 static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3261 {
3262 BUILD_BUG();
3263 return 0;
3264 }
3265 #endif
3266
3267 /**
3268  * alloc_set_pte - setup new PTE entry for given page and add reverse page
3269  * mapping.  If needed, the function allocates a page table or uses the
3270  * one pre-allocated in @vmf->prealloc_pte.
3271  *
3272  * @vmf: fault environment
3273  * @memcg: memcg to charge page (only for private mappings)
3274  * @page: page to map
3275  *
3276  * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on return.
3277  *
3278  * Target users are the page fault handler itself and implementations of
3279  * vm_ops->map_pages.
3280  *
3281  * Return: %0 on success, %VM_FAULT_ code in case of error.
3282  */
3283 vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3284 struct page *page)
3285 {
3286 struct vm_area_struct *vma = vmf->vma;
3287 bool write = vmf->flags & FAULT_FLAG_WRITE;
3288 pte_t entry;
3289 vm_fault_t ret;
3290
3291 if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
3292 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
3293
3294 VM_BUG_ON_PAGE(memcg, page);
3295
3296 ret = do_set_pmd(vmf, page);
3297 if (ret != VM_FAULT_FALLBACK)
3298 return ret;
3299 }
3300
3301 if (!vmf->pte) {
3302 ret = pte_alloc_one_map(vmf);
3303 if (ret)
3304 return ret;
3305 }
3306
3307
3308 if (unlikely(!pte_none(*vmf->pte)))
3309 return VM_FAULT_NOPAGE;
3310
3311 flush_icache_page(vma, page);
3312 entry = mk_pte(page, vma->vm_page_prot);
3313 if (write)
3314 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3315
3316 if (write && !(vma->vm_flags & VM_SHARED)) {
3317 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3318 page_add_new_anon_rmap(page, vma, vmf->address, false);
3319 mem_cgroup_commit_charge(page, memcg, false, false);
3320 lru_cache_add_active_or_unevictable(page, vma);
3321 } else {
3322 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3323 page_add_file_rmap(page, false);
3324 }
3325 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3326
3327
3328 update_mmu_cache(vma, vmf->address, vmf->pte);
3329
3330 return 0;
3331 }
3332
3333
3334
3335 /**
3336  * finish_fault - finish page fault once we have prepared the page to fault
3337  *
3338  * @vmf: structure describing the fault
3339  *
3340  * This function handles all that is needed to finish a page fault once the
3341  * page to fault in is prepared. It handles locking of PTEs, adds the mapping
3342  * into the page tables and does the necessary accounting.
3343  *
3344  * The function expects the page to be locked and on success it consumes a
3345  * reference of a page being mapped (for the PTE which maps it).
3346  *
3347  * Return: %0 on success, %VM_FAULT_ code in case of error.
3348  */
3349 vm_fault_t finish_fault(struct vm_fault *vmf)
3350 {
3351 struct page *page;
3352 vm_fault_t ret = 0;
3353
3354
3355 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3356 !(vmf->vma->vm_flags & VM_SHARED))
3357 page = vmf->cow_page;
3358 else
3359 page = vmf->page;
3360
3361
3362
3363
3364
3365 if (!(vmf->vma->vm_flags & VM_SHARED))
3366 ret = check_stable_address_space(vmf->vma->vm_mm);
3367 if (!ret)
3368 ret = alloc_set_pte(vmf, vmf->memcg, page);
3369 if (vmf->pte)
3370 pte_unmap_unlock(vmf->pte, vmf->ptl);
3371 return ret;
3372 }
3373
3374 static unsigned long fault_around_bytes __read_mostly =
3375 rounddown_pow_of_two(65536);
3376
3377 #ifdef CONFIG_DEBUG_FS
3378 static int fault_around_bytes_get(void *data, u64 *val)
3379 {
3380 *val = fault_around_bytes;
3381 return 0;
3382 }
3383
3384
3385
3386
3387
3388 static int fault_around_bytes_set(void *data, u64 val)
3389 {
3390 if (val / PAGE_SIZE > PTRS_PER_PTE)
3391 return -EINVAL;
3392 if (val > PAGE_SIZE)
3393 fault_around_bytes = rounddown_pow_of_two(val);
3394 else
3395 fault_around_bytes = PAGE_SIZE;
3396 return 0;
3397 }
3398 DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
3399 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
3400
3401 static int __init fault_around_debugfs(void)
3402 {
3403 debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3404 &fault_around_bytes_fops);
3405 return 0;
3406 }
3407 late_initcall(fault_around_debugfs);
3408 #endif
3409
3410
3411
3412 /*
3413  * do_fault_around() tries to map a few pages around the faulting address.
3414  * The hope is that those pages will be needed soon and mapping them now
3415  * avoids taking a separate fault for each of them.
3416  *
3417  * It uses vm_ops->map_pages() to map the pages, which skips any page that
3418  * is not ready to be mapped (not up-to-date, locked, etc.).
3419  *
3420  * The function does not cross the VMA boundaries and never maps more than
3421  * one page table, so map_pages() is called at most once per fault.
3422  *
3423  * fault_around_bytes defines how many bytes we will try to map: it is a
3424  * power of two between PAGE_SIZE and PTRS_PER_PTE * PAGE_SIZE, tunable
3425  * through the "fault_around_bytes" debugfs file.
3426  *
3427  * The virtual address of the area that we map is naturally aligned to
3428  * fault_around_bytes (rounded down to the machine page size), which makes
3429  * it easier to guarantee that we do not cross page table boundaries.
3430  *
3431  * Return: VM_FAULT_NOPAGE if the faulting address ended up mapped (so the
3432  * caller can return immediately), 0 if it still has to fault in the page.
3433  */
3434 static vm_fault_t do_fault_around(struct vm_fault *vmf)
3435 {
3436 unsigned long address = vmf->address, nr_pages, mask;
3437 pgoff_t start_pgoff = vmf->pgoff;
3438 pgoff_t end_pgoff;
3439 int off;
3440 vm_fault_t ret = 0;
3441
3442 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3443 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3444
3445 vmf->address = max(address & mask, vmf->vma->vm_start);
3446 off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3447 start_pgoff -= off;
3448
3449
3450
3451
3452
3453 end_pgoff = start_pgoff -
3454 ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3455 PTRS_PER_PTE - 1;
3456 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3457 start_pgoff + nr_pages - 1);
3458
3459 if (pmd_none(*vmf->pmd)) {
3460 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3461 if (!vmf->prealloc_pte)
3462 goto out;
3463 smp_wmb();
3464 }
3465
3466 vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3467
3468
3469 if (pmd_trans_huge(*vmf->pmd)) {
3470 ret = VM_FAULT_NOPAGE;
3471 goto out;
3472 }
3473
3474
3475 if (!vmf->pte)
3476 goto out;
3477
3478
3479 vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3480 if (!pte_none(*vmf->pte))
3481 ret = VM_FAULT_NOPAGE;
3482 pte_unmap_unlock(vmf->pte, vmf->ptl);
3483 out:
3484 vmf->address = address;
3485 vmf->pte = NULL;
3486 return ret;
3487 }
3488
3489 static vm_fault_t do_read_fault(struct vm_fault *vmf)
3490 {
3491 struct vm_area_struct *vma = vmf->vma;
3492 vm_fault_t ret = 0;
3493
3494
3495
3496
3497
3498
3499 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3500 ret = do_fault_around(vmf);
3501 if (ret)
3502 return ret;
3503 }
3504
3505 ret = __do_fault(vmf);
3506 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3507 return ret;
3508
3509 ret |= finish_fault(vmf);
3510 unlock_page(vmf->page);
3511 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3512 put_page(vmf->page);
3513 return ret;
3514 }
3515
3516 static vm_fault_t do_cow_fault(struct vm_fault *vmf)
3517 {
3518 struct vm_area_struct *vma = vmf->vma;
3519 vm_fault_t ret;
3520
3521 if (unlikely(anon_vma_prepare(vma)))
3522 return VM_FAULT_OOM;
3523
3524 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
3525 if (!vmf->cow_page)
3526 return VM_FAULT_OOM;
3527
3528 if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
3529 &vmf->memcg, false)) {
3530 put_page(vmf->cow_page);
3531 return VM_FAULT_OOM;
3532 }
3533
3534 ret = __do_fault(vmf);
3535 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3536 goto uncharge_out;
3537 if (ret & VM_FAULT_DONE_COW)
3538 return ret;
3539
3540 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
3541 __SetPageUptodate(vmf->cow_page);
3542
3543 ret |= finish_fault(vmf);
3544 unlock_page(vmf->page);
3545 put_page(vmf->page);
3546 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3547 goto uncharge_out;
3548 return ret;
3549 uncharge_out:
3550 mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
3551 put_page(vmf->cow_page);
3552 return ret;
3553 }
3554
3555 static vm_fault_t do_shared_fault(struct vm_fault *vmf)
3556 {
3557 struct vm_area_struct *vma = vmf->vma;
3558 vm_fault_t ret, tmp;
3559
3560 ret = __do_fault(vmf);
3561 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3562 return ret;
3563
3564
3565
3566
3567
3568 if (vma->vm_ops->page_mkwrite) {
3569 unlock_page(vmf->page);
3570 tmp = do_page_mkwrite(vmf);
3571 if (unlikely(!tmp ||
3572 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3573 put_page(vmf->page);
3574 return tmp;
3575 }
3576 }
3577
3578 ret |= finish_fault(vmf);
3579 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3580 VM_FAULT_RETRY))) {
3581 unlock_page(vmf->page);
3582 put_page(vmf->page);
3583 return ret;
3584 }
3585
3586 ret |= fault_dirty_shared_page(vmf);
3587 return ret;
3588 }
3589
3590
3591
3592 /*
3593  * We enter with non-exclusive mmap_sem (to exclude vma changes,
3594  * but allow concurrent faults).
3595  * The mmap_sem may have been released depending on flags and our
3596  * return value.  See filemap_fault() and __lock_page_or_retry().
3597  */
3598 static vm_fault_t do_fault(struct vm_fault *vmf)
3599 {
3600 struct vm_area_struct *vma = vmf->vma;
3601 struct mm_struct *vm_mm = vma->vm_mm;
3602 vm_fault_t ret;
3603
3604
3605
3606
3607 if (!vma->vm_ops->fault) {
3608
3609
3610
3611
3612 if (unlikely(!pmd_present(*vmf->pmd)))
3613 ret = VM_FAULT_SIGBUS;
3614 else {
3615 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
3616 vmf->pmd,
3617 vmf->address,
3618 &vmf->ptl);
3619
3620
3621
3622
3623
3624
3625
3626 if (unlikely(pte_none(*vmf->pte)))
3627 ret = VM_FAULT_SIGBUS;
3628 else
3629 ret = VM_FAULT_NOPAGE;
3630
3631 pte_unmap_unlock(vmf->pte, vmf->ptl);
3632 }
3633 } else if (!(vmf->flags & FAULT_FLAG_WRITE))
3634 ret = do_read_fault(vmf);
3635 else if (!(vma->vm_flags & VM_SHARED))
3636 ret = do_cow_fault(vmf);
3637 else
3638 ret = do_shared_fault(vmf);
3639
3640
3641 if (vmf->prealloc_pte) {
3642 pte_free(vm_mm, vmf->prealloc_pte);
3643 vmf->prealloc_pte = NULL;
3644 }
3645 return ret;
3646 }
3647
3648 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3649 unsigned long addr, int page_nid,
3650 int *flags)
3651 {
3652 get_page(page);
3653
3654 count_vm_numa_event(NUMA_HINT_FAULTS);
3655 if (page_nid == numa_node_id()) {
3656 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3657 *flags |= TNF_FAULT_LOCAL;
3658 }
3659
3660 return mpol_misplaced(page, vma, addr);
3661 }
3662
3663 static vm_fault_t do_numa_page(struct vm_fault *vmf)
3664 {
3665 struct vm_area_struct *vma = vmf->vma;
3666 struct page *page = NULL;
3667 int page_nid = NUMA_NO_NODE;
3668 int last_cpupid;
3669 int target_nid;
3670 bool migrated = false;
3671 pte_t pte, old_pte;
3672 bool was_writable = pte_savedwrite(vmf->orig_pte);
3673 int flags = 0;
3674
3675
3676
3677
3678
3679
3680 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
3681 spin_lock(vmf->ptl);
3682 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
3683 pte_unmap_unlock(vmf->pte, vmf->ptl);
3684 goto out;
3685 }
3686
3687
3688
3689
3690
3691 old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
3692 pte = pte_modify(old_pte, vma->vm_page_prot);
3693 pte = pte_mkyoung(pte);
3694 if (was_writable)
3695 pte = pte_mkwrite(pte);
3696 ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
3697 update_mmu_cache(vma, vmf->address, vmf->pte);
3698
3699 page = vm_normal_page(vma, vmf->address, pte);
3700 if (!page) {
3701 pte_unmap_unlock(vmf->pte, vmf->ptl);
3702 return 0;
3703 }
3704
3705
3706 if (PageCompound(page)) {
3707 pte_unmap_unlock(vmf->pte, vmf->ptl);
3708 return 0;
3709 }
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719 if (!pte_write(pte))
3720 flags |= TNF_NO_GROUP;
3721
3722
3723
3724
3725
3726 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3727 flags |= TNF_SHARED;
3728
3729 last_cpupid = page_cpupid_last(page);
3730 page_nid = page_to_nid(page);
3731 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
3732 &flags);
3733 pte_unmap_unlock(vmf->pte, vmf->ptl);
3734 if (target_nid == NUMA_NO_NODE) {
3735 put_page(page);
3736 goto out;
3737 }
3738
3739
3740 migrated = migrate_misplaced_page(page, vma, target_nid);
3741 if (migrated) {
3742 page_nid = target_nid;
3743 flags |= TNF_MIGRATED;
3744 } else
3745 flags |= TNF_MIGRATE_FAIL;
3746
3747 out:
3748 if (page_nid != NUMA_NO_NODE)
3749 task_numa_fault(last_cpupid, page_nid, 1, flags);
3750 return 0;
3751 }
3752
3753 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
3754 {
3755 if (vma_is_anonymous(vmf->vma))
3756 return do_huge_pmd_anonymous_page(vmf);
3757 if (vmf->vma->vm_ops->huge_fault)
3758 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3759 return VM_FAULT_FALLBACK;
3760 }
3761
3762
3763 static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
3764 {
3765 if (vma_is_anonymous(vmf->vma))
3766 return do_huge_pmd_wp_page(vmf, orig_pmd);
3767 if (vmf->vma->vm_ops->huge_fault)
3768 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
3769
3770
3771 VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
3772 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
3773
3774 return VM_FAULT_FALLBACK;
3775 }
3776
3777 static inline bool vma_is_accessible(struct vm_area_struct *vma)
3778 {
3779 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
3780 }
3781
3782 static vm_fault_t create_huge_pud(struct vm_fault *vmf)
3783 {
3784 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3785
3786 if (vma_is_anonymous(vmf->vma))
3787 return VM_FAULT_FALLBACK;
3788 if (vmf->vma->vm_ops->huge_fault)
3789 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3790 #endif
3791 return VM_FAULT_FALLBACK;
3792 }
3793
3794 static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
3795 {
3796 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3797
3798 if (vma_is_anonymous(vmf->vma))
3799 return VM_FAULT_FALLBACK;
3800 if (vmf->vma->vm_ops->huge_fault)
3801 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
3802 #endif
3803 return VM_FAULT_FALLBACK;
3804 }
3805
3806 /*
3807  * These routines also need to handle stuff like marking pages dirty
3808  * and/or accessed for architectures that don't do it in hardware (most
3809  * RISC architectures).  The early dirtying is also good on the i386.
3810  *
3811  * There is also a hook called "update_mmu_cache()" that architectures
3812  * with external mmu caches can use to update those (ie the Sparc or
3813  * PowerPC hashed page tables that act as extended TLBs).
3814  *
3815  * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
3816  * concurrent faults).
3817  *
3818  * The mmap_sem may have been released depending on flags and our return
3819  * value.  See filemap_fault() and __lock_page_or_retry().
3820  */
3821 static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
3822 {
3823 pte_t entry;
3824
3825 if (unlikely(pmd_none(*vmf->pmd))) {
3826
3827
3828
3829
3830
3831
3832 vmf->pte = NULL;
3833 } else {
3834
3835 if (pmd_devmap_trans_unstable(vmf->pmd))
3836 return 0;
3837
3838
3839
3840
3841
3842
3843 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
3844 vmf->orig_pte = *vmf->pte;
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854 barrier();
3855 if (pte_none(vmf->orig_pte)) {
3856 pte_unmap(vmf->pte);
3857 vmf->pte = NULL;
3858 }
3859 }
3860
3861 if (!vmf->pte) {
3862 if (vma_is_anonymous(vmf->vma))
3863 return do_anonymous_page(vmf);
3864 else
3865 return do_fault(vmf);
3866 }
3867
3868 if (!pte_present(vmf->orig_pte))
3869 return do_swap_page(vmf);
3870
3871 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
3872 return do_numa_page(vmf);
3873
3874 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
3875 spin_lock(vmf->ptl);
3876 entry = vmf->orig_pte;
3877 if (unlikely(!pte_same(*vmf->pte, entry)))
3878 goto unlock;
3879 if (vmf->flags & FAULT_FLAG_WRITE) {
3880 if (!pte_write(entry))
3881 return do_wp_page(vmf);
3882 entry = pte_mkdirty(entry);
3883 }
3884 entry = pte_mkyoung(entry);
3885 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
3886 vmf->flags & FAULT_FLAG_WRITE)) {
3887 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
3888 } else {
3889
3890
3891
3892
3893
3894
3895 if (vmf->flags & FAULT_FLAG_WRITE)
3896 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
3897 }
3898 unlock:
3899 pte_unmap_unlock(vmf->pte, vmf->ptl);
3900 return 0;
3901 }
3902
3903 /*
3904  * By the time we get here, we already hold the mm semaphore.
3905  *
3906  * The mmap_sem may have been released depending on flags and our
3907  * return value.  See filemap_fault() and __lock_page_or_retry().
3908  */
3909 static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
3910 unsigned long address, unsigned int flags)
3911 {
3912 struct vm_fault vmf = {
3913 .vma = vma,
3914 .address = address & PAGE_MASK,
3915 .flags = flags,
3916 .pgoff = linear_page_index(vma, address),
3917 .gfp_mask = __get_fault_gfp_mask(vma),
3918 };
3919 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3920 struct mm_struct *mm = vma->vm_mm;
3921 pgd_t *pgd;
3922 p4d_t *p4d;
3923 vm_fault_t ret;
3924
3925 pgd = pgd_offset(mm, address);
3926 p4d = p4d_alloc(mm, pgd, address);
3927 if (!p4d)
3928 return VM_FAULT_OOM;
3929
3930 vmf.pud = pud_alloc(mm, p4d, address);
3931 if (!vmf.pud)
3932 return VM_FAULT_OOM;
3933 if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
3934 ret = create_huge_pud(&vmf);
3935 if (!(ret & VM_FAULT_FALLBACK))
3936 return ret;
3937 } else {
3938 pud_t orig_pud = *vmf.pud;
3939
3940 barrier();
3941 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
3942
3943
3944
3945 if (dirty && !pud_write(orig_pud)) {
3946 ret = wp_huge_pud(&vmf, orig_pud);
3947 if (!(ret & VM_FAULT_FALLBACK))
3948 return ret;
3949 } else {
3950 huge_pud_set_accessed(&vmf, orig_pud);
3951 return 0;
3952 }
3953 }
3954 }
3955
3956 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
3957 if (!vmf.pmd)
3958 return VM_FAULT_OOM;
3959 if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
3960 ret = create_huge_pmd(&vmf);
3961 if (!(ret & VM_FAULT_FALLBACK))
3962 return ret;
3963 } else {
3964 pmd_t orig_pmd = *vmf.pmd;
3965
3966 barrier();
3967 if (unlikely(is_swap_pmd(orig_pmd))) {
3968 VM_BUG_ON(thp_migration_supported() &&
3969 !is_pmd_migration_entry(orig_pmd));
3970 if (is_pmd_migration_entry(orig_pmd))
3971 pmd_migration_entry_wait(mm, vmf.pmd);
3972 return 0;
3973 }
3974 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
3975 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
3976 return do_huge_pmd_numa_page(&vmf, orig_pmd);
3977
3978 if (dirty && !pmd_write(orig_pmd)) {
3979 ret = wp_huge_pmd(&vmf, orig_pmd);
3980 if (!(ret & VM_FAULT_FALLBACK))
3981 return ret;
3982 } else {
3983 huge_pmd_set_accessed(&vmf, orig_pmd);
3984 return 0;
3985 }
3986 }
3987 }
3988
3989 return handle_pte_fault(&vmf);
3990 }
3991
3992 /*
3993  * By the time we get here, we already hold the mm semaphore.
3994  *
3995  * The mmap_sem may have been released depending on flags and our
3996  * return value.  See filemap_fault() and __lock_page_or_retry().
3997  */
3998 vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3999 unsigned int flags)
4000 {
4001 vm_fault_t ret;
4002
4003 __set_current_state(TASK_RUNNING);
4004
4005 count_vm_event(PGFAULT);
4006 count_memcg_event_mm(vma->vm_mm, PGFAULT);
4007
4008
4009 check_sync_rss_stat(current);
4010
4011 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
4012 flags & FAULT_FLAG_INSTRUCTION,
4013 flags & FAULT_FLAG_REMOTE))
4014 return VM_FAULT_SIGSEGV;
4015
4016
4017
4018
4019
4020 if (flags & FAULT_FLAG_USER)
4021 mem_cgroup_enter_user_fault();
4022
4023 if (unlikely(is_vm_hugetlb_page(vma)))
4024 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
4025 else
4026 ret = __handle_mm_fault(vma, address, flags);
4027
4028 if (flags & FAULT_FLAG_USER) {
4029 mem_cgroup_exit_user_fault();
4030
4031
4032
4033
4034
4035
4036 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
4037 mem_cgroup_oom_synchronize(false);
4038 }
4039
4040 return ret;
4041 }
4042 EXPORT_SYMBOL_GPL(handle_mm_fault);
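/*
 * Usage sketch: the core of a typical architecture page-fault handler built
 * on handle_mm_fault().  Hypothetical and heavily simplified; real handlers
 * (arch/<arch>/mm/fault.c) also deal with kernel-mode faults, stack
 * expansion, signal delivery and the FAULT_FLAG_TRIED retry pass.
 */
#if 0
static vm_fault_t example_handle_user_fault(struct mm_struct *mm,
					    unsigned long address, bool write)
{
	struct vm_area_struct *vma;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
			     FAULT_FLAG_USER | (write ? FAULT_FLAG_WRITE : 0);
	vm_fault_t fault;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address) {
		up_read(&mm->mmap_sem);
		return VM_FAULT_SIGSEGV;
	}

	fault = handle_mm_fault(vma, address, flags);
	/*
	 * When FAULT_FLAG_ALLOW_RETRY is set and VM_FAULT_RETRY is returned,
	 * mmap_sem has already been dropped for us.
	 */
	if (!(fault & VM_FAULT_RETRY))
		up_read(&mm->mmap_sem);
	return fault;
}
#endif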
4043
4044 #ifndef __PAGETABLE_P4D_FOLDED
4045 /*
4046  * Allocate p4d page table.
4047  * We've already handled the fast path, so we are careful.
4048  */
4049 int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
4050 {
4051 p4d_t *new = p4d_alloc_one(mm, address);
4052 if (!new)
4053 return -ENOMEM;
4054
4055 smp_wmb();
4056
4057 spin_lock(&mm->page_table_lock);
4058 if (pgd_present(*pgd))
4059 p4d_free(mm, new);
4060 else
4061 pgd_populate(mm, pgd, new);
4062 spin_unlock(&mm->page_table_lock);
4063 return 0;
4064 }
4065 #endif
4066
4067 #ifndef __PAGETABLE_PUD_FOLDED
4068 /*
4069  * Allocate page upper directory.
4070  * We've already handled the fast path, so we are careful.
4071  */
4072 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
4073 {
4074 pud_t *new = pud_alloc_one(mm, address);
4075 if (!new)
4076 return -ENOMEM;
4077
4078 smp_wmb();
4079
4080 spin_lock(&mm->page_table_lock);
4081 #ifndef __ARCH_HAS_5LEVEL_HACK
4082 if (!p4d_present(*p4d)) {
4083 mm_inc_nr_puds(mm);
4084 p4d_populate(mm, p4d, new);
4085 } else
4086 pud_free(mm, new);
4087 #else
4088 if (!pgd_present(*p4d)) {
4089 mm_inc_nr_puds(mm);
4090 pgd_populate(mm, p4d, new);
4091 } else
4092 pud_free(mm, new);
4093 #endif
4094 spin_unlock(&mm->page_table_lock);
4095 return 0;
4096 }
4097 #endif
4098
4099 #ifndef __PAGETABLE_PMD_FOLDED
4100 /*
4101  * Allocate page middle directory.
4102  * We've already handled the fast path, so we are careful.
4103  */
4104 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4105 {
4106 spinlock_t *ptl;
4107 pmd_t *new = pmd_alloc_one(mm, address);
4108 if (!new)
4109 return -ENOMEM;
4110
4111 smp_wmb();
4112
4113 ptl = pud_lock(mm, pud);
4114 #ifndef __ARCH_HAS_4LEVEL_HACK
4115 if (!pud_present(*pud)) {
4116 mm_inc_nr_pmds(mm);
4117 pud_populate(mm, pud, new);
4118 } else
4119 pmd_free(mm, new);
4120 #else
4121 if (!pgd_present(*pud)) {
4122 mm_inc_nr_pmds(mm);
4123 pgd_populate(mm, pud, new);
4124 } else
4125 pmd_free(mm, new);
4126 #endif
4127 spin_unlock(ptl);
4128 return 0;
4129 }
4130 #endif
4131
4132 static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4133 struct mmu_notifier_range *range,
4134 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4135 {
4136 pgd_t *pgd;
4137 p4d_t *p4d;
4138 pud_t *pud;
4139 pmd_t *pmd;
4140 pte_t *ptep;
4141
4142 pgd = pgd_offset(mm, address);
4143 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
4144 goto out;
4145
4146 p4d = p4d_offset(pgd, address);
4147 if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
4148 goto out;
4149
4150 pud = pud_offset(p4d, address);
4151 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
4152 goto out;
4153
4154 pmd = pmd_offset(pud, address);
4155 VM_BUG_ON(pmd_trans_huge(*pmd));
4156
4157 if (pmd_huge(*pmd)) {
4158 if (!pmdpp)
4159 goto out;
4160
4161 if (range) {
4162 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
4163 NULL, mm, address & PMD_MASK,
4164 (address & PMD_MASK) + PMD_SIZE);
4165 mmu_notifier_invalidate_range_start(range);
4166 }
4167 *ptlp = pmd_lock(mm, pmd);
4168 if (pmd_huge(*pmd)) {
4169 *pmdpp = pmd;
4170 return 0;
4171 }
4172 spin_unlock(*ptlp);
4173 if (range)
4174 mmu_notifier_invalidate_range_end(range);
4175 }
4176
4177 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4178 goto out;
4179
4180 if (range) {
4181 mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
4182 address & PAGE_MASK,
4183 (address & PAGE_MASK) + PAGE_SIZE);
4184 mmu_notifier_invalidate_range_start(range);
4185 }
4186 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4187 if (!pte_present(*ptep))
4188 goto unlock;
4189 *ptepp = ptep;
4190 return 0;
4191 unlock:
4192 pte_unmap_unlock(ptep, *ptlp);
4193 if (range)
4194 mmu_notifier_invalidate_range_end(range);
4195 out:
4196 return -EINVAL;
4197 }
4198
4199 static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4200 pte_t **ptepp, spinlock_t **ptlp)
4201 {
4202 int res;
4203
4204
4205 (void) __cond_lock(*ptlp,
4206 !(res = __follow_pte_pmd(mm, address, NULL,
4207 ptepp, NULL, ptlp)));
4208 return res;
4209 }
4210
4211 int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4212 struct mmu_notifier_range *range,
4213 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4214 {
4215 int res;
4216
4217
4218 (void) __cond_lock(*ptlp,
4219 !(res = __follow_pte_pmd(mm, address, range,
4220 ptepp, pmdpp, ptlp)));
4221 return res;
4222 }
4223 EXPORT_SYMBOL(follow_pte_pmd);
4224
4225 /**
4226  * follow_pfn - look up PFN at a user virtual address
4227  * @vma: memory mapping
4228  * @address: user virtual address
4229  * @pfn: location to store found PFN
4230  *
4231  * Only IO mappings and raw PFN mappings are allowed.
4232  *
4233  * Return: zero and the pfn at @pfn on success, -ve otherwise.
4234  */
4235 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4236 unsigned long *pfn)
4237 {
4238 int ret = -EINVAL;
4239 spinlock_t *ptl;
4240 pte_t *ptep;
4241
4242 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4243 return ret;
4244
4245 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
4246 if (ret)
4247 return ret;
4248 *pfn = pte_pfn(*ptep);
4249 pte_unmap_unlock(ptep, ptl);
4250 return 0;
4251 }
4252 EXPORT_SYMBOL(follow_pfn);
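/*
 * Usage sketch: resolving the PFN behind a VM_IO/VM_PFNMAP mapping, e.g. to
 * validate a user address handed to a driver ioctl.  The helper name is
 * hypothetical and is shown only to illustrate the locking and calling
 * convention.
 */
#if 0
static int example_uaddr_to_pfn(struct mm_struct *mm, unsigned long uaddr,
				unsigned long *pfn)
{
	struct vm_area_struct *vma;
	int ret = -EINVAL;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, uaddr);
	if (vma && vma->vm_start <= uaddr)
		ret = follow_pfn(vma, uaddr, pfn); /* -EINVAL unless VM_IO/VM_PFNMAP */
	up_read(&mm->mmap_sem);

	return ret;
}
#endif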
4253
4254 #ifdef CONFIG_HAVE_IOREMAP_PROT
4255 int follow_phys(struct vm_area_struct *vma,
4256 unsigned long address, unsigned int flags,
4257 unsigned long *prot, resource_size_t *phys)
4258 {
4259 int ret = -EINVAL;
4260 pte_t *ptep, pte;
4261 spinlock_t *ptl;
4262
4263 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
4264 goto out;
4265
4266 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
4267 goto out;
4268 pte = *ptep;
4269
4270 if ((flags & FOLL_WRITE) && !pte_write(pte))
4271 goto unlock;
4272
4273 *prot = pgprot_val(pte_pgprot(pte));
4274 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
4275
4276 ret = 0;
4277 unlock:
4278 pte_unmap_unlock(ptep, ptl);
4279 out:
4280 return ret;
4281 }
4282
4283 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
4284 void *buf, int len, int write)
4285 {
4286 resource_size_t phys_addr;
4287 unsigned long prot = 0;
4288 void __iomem *maddr;
4289 int offset = addr & (PAGE_SIZE-1);
4290
4291 if (follow_phys(vma, addr, write, &prot, &phys_addr))
4292 return -EINVAL;
4293
4294 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
4295 if (!maddr)
4296 return -ENOMEM;
4297
4298 if (write)
4299 memcpy_toio(maddr + offset, buf, len);
4300 else
4301 memcpy_fromio(buf, maddr + offset, len);
4302 iounmap(maddr);
4303
4304 return len;
4305 }
4306 EXPORT_SYMBOL_GPL(generic_access_phys);
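/*
 * Usage sketch: generic_access_phys() is intended to be wired up as the
 * ->access() method of an I/O mapping's vm_operations_struct, so that
 * ptrace()/access_process_vm() can read and write the mapping through the
 * ->access fallback in __access_remote_vm() below.  The names here are
 * hypothetical; /dev/mem does essentially this for its physical mappings.
 */
#if 0
static const struct vm_operations_struct example_io_vm_ops = {
	.access = generic_access_phys,
};

static int example_mmap_io(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &example_io_vm_ops;
	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot);
}
#endif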
4307 #endif
4308
4309 /*
4310  * Access another process' address space as given in mm.  If non-NULL, use the
4311  * given task for page fault accounting.
4312  */
4313 int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
4314 unsigned long addr, void *buf, int len, unsigned int gup_flags)
4315 {
4316 struct vm_area_struct *vma;
4317 void *old_buf = buf;
4318 int write = gup_flags & FOLL_WRITE;
4319
4320 if (down_read_killable(&mm->mmap_sem))
4321 return 0;
4322
4323
4324 while (len) {
4325 int bytes, ret, offset;
4326 void *maddr;
4327 struct page *page = NULL;
4328
4329 ret = get_user_pages_remote(tsk, mm, addr, 1,
4330 gup_flags, &page, &vma, NULL);
4331 if (ret <= 0) {
4332 #ifndef CONFIG_HAVE_IOREMAP_PROT
4333 break;
4334 #else
4335
4336
4337
4338
4339 vma = find_vma(mm, addr);
4340 if (!vma || vma->vm_start > addr)
4341 break;
4342 if (vma->vm_ops && vma->vm_ops->access)
4343 ret = vma->vm_ops->access(vma, addr, buf,
4344 len, write);
4345 if (ret <= 0)
4346 break;
4347 bytes = ret;
4348 #endif
4349 } else {
4350 bytes = len;
4351 offset = addr & (PAGE_SIZE-1);
4352 if (bytes > PAGE_SIZE-offset)
4353 bytes = PAGE_SIZE-offset;
4354
4355 maddr = kmap(page);
4356 if (write) {
4357 copy_to_user_page(vma, page, addr,
4358 maddr + offset, buf, bytes);
4359 set_page_dirty_lock(page);
4360 } else {
4361 copy_from_user_page(vma, page, addr,
4362 buf, maddr + offset, bytes);
4363 }
4364 kunmap(page);
4365 put_page(page);
4366 }
4367 len -= bytes;
4368 buf += bytes;
4369 addr += bytes;
4370 }
4371 up_read(&mm->mmap_sem);
4372
4373 return buf - old_buf;
4374 }
4375
4376 /**
4377  * access_remote_vm - access another process' address space
4378  * @mm:		the mm_struct of the target address space
4379  * @addr:	start address to access
4380  * @buf:	source or destination buffer
4381  * @len:	number of bytes to transfer
4382  * @gup_flags:	flags modifying lookup behaviour
4383  *
4384  * The caller must hold a reference on @mm.
4385  *
4386  * Return: number of bytes copied from source to destination.
4387  */
4388 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4389 void *buf, int len, unsigned int gup_flags)
4390 {
4391 return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
4392 }
4393
4394 /*
4395  * Access another process' address space.
4396  * The source/target buffer must be in kernel space; do not walk the page
4397  * tables directly, use get_user_pages().
4398  */
4399 int access_process_vm(struct task_struct *tsk, unsigned long addr,
4400 void *buf, int len, unsigned int gup_flags)
4401 {
4402 struct mm_struct *mm;
4403 int ret;
4404
4405 mm = get_task_mm(tsk);
4406 if (!mm)
4407 return 0;
4408
4409 ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
4410
4411 mmput(mm);
4412
4413 return ret;
4414 }
4415 EXPORT_SYMBOL_GPL(access_process_vm);
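/*
 * Usage sketch: reading a few bytes of another task's address space, much
 * like ptrace(PTRACE_PEEKDATA) does internally.  Purely illustrative; the
 * helper name is hypothetical.
 */
#if 0
static int example_peek_task(struct task_struct *tsk, unsigned long addr,
			     void *buf, int len)
{
	int copied;

	/* FOLL_FORCE lets a ptracer read memory mapped without PROT_READ. */
	copied = access_process_vm(tsk, addr, buf, len, FOLL_FORCE);
	return copied == len ? 0 : -EIO;
}
#endif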
4416
4417 /*
4418  * Print the name of a VMA.
4419  */
4420 void print_vma_addr(char *prefix, unsigned long ip)
4421 {
4422 struct mm_struct *mm = current->mm;
4423 struct vm_area_struct *vma;
4424
4425
4426
4427
4428 if (!down_read_trylock(&mm->mmap_sem))
4429 return;
4430
4431 vma = find_vma(mm, ip);
4432 if (vma && vma->vm_file) {
4433 struct file *f = vma->vm_file;
4434 char *buf = (char *)__get_free_page(GFP_NOWAIT);
4435 if (buf) {
4436 char *p;
4437
4438 p = file_path(f, buf, PAGE_SIZE);
4439 if (IS_ERR(p))
4440 p = "?";
4441 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
4442 vma->vm_start,
4443 vma->vm_end - vma->vm_start);
4444 free_page((unsigned long)buf);
4445 }
4446 }
4447 up_read(&mm->mmap_sem);
4448 }
4449
4450 #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4451 void __might_fault(const char *file, int line)
4452 {
4453
4454
4455
4456
4457
4458
4459 if (uaccess_kernel())
4460 return;
4461 if (pagefault_disabled())
4462 return;
4463 __might_sleep(file, line, 0);
4464 #if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4465 if (current->mm)
4466 might_lock_read(&current->mm->mmap_sem);
4467 #endif
4468 }
4469 EXPORT_SYMBOL(__might_fault);
4470 #endif
4471
4472 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
4473 /*
4474  * Process all subpages of the specified huge page with the specified
4475  * function, handling the subpages around @addr_hint last so that their
4476  * cachelines are still hot when the faulting task touches that address.
4477  */
4478 static inline void process_huge_page(
4479 unsigned long addr_hint, unsigned int pages_per_huge_page,
4480 void (*process_subpage)(unsigned long addr, int idx, void *arg),
4481 void *arg)
4482 {
4483 int i, n, base, l;
4484 unsigned long addr = addr_hint &
4485 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4486
4487
4488 might_sleep();
4489 n = (addr_hint - addr) / PAGE_SIZE;
4490 if (2 * n <= pages_per_huge_page) {
4491
4492 base = 0;
4493 l = n;
4494
4495 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
4496 cond_resched();
4497 process_subpage(addr + i * PAGE_SIZE, i, arg);
4498 }
4499 } else {
4500
4501 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
4502 l = pages_per_huge_page - n;
4503
4504 for (i = 0; i < base; i++) {
4505 cond_resched();
4506 process_subpage(addr + i * PAGE_SIZE, i, arg);
4507 }
4508 }
4509
4510
4511
4512
4513 for (i = 0; i < l; i++) {
4514 int left_idx = base + i;
4515 int right_idx = base + 2 * l - 1 - i;
4516
4517 cond_resched();
4518 process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
4519 cond_resched();
4520 process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
4521 }
4522 }
4523
4524 static void clear_gigantic_page(struct page *page,
4525 unsigned long addr,
4526 unsigned int pages_per_huge_page)
4527 {
4528 int i;
4529 struct page *p = page;
4530
4531 might_sleep();
4532 for (i = 0; i < pages_per_huge_page;
4533 i++, p = mem_map_next(p, page, i)) {
4534 cond_resched();
4535 clear_user_highpage(p, addr + i * PAGE_SIZE);
4536 }
4537 }
4538
4539 static void clear_subpage(unsigned long addr, int idx, void *arg)
4540 {
4541 struct page *page = arg;
4542
4543 clear_user_highpage(page + idx, addr);
4544 }
4545
4546 void clear_huge_page(struct page *page,
4547 unsigned long addr_hint, unsigned int pages_per_huge_page)
4548 {
4549 unsigned long addr = addr_hint &
4550 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4551
4552 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4553 clear_gigantic_page(page, addr, pages_per_huge_page);
4554 return;
4555 }
4556
4557 process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
4558 }
4559
4560 static void copy_user_gigantic_page(struct page *dst, struct page *src,
4561 unsigned long addr,
4562 struct vm_area_struct *vma,
4563 unsigned int pages_per_huge_page)
4564 {
4565 int i;
4566 struct page *dst_base = dst;
4567 struct page *src_base = src;
4568
4569 for (i = 0; i < pages_per_huge_page; ) {
4570 cond_resched();
4571 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
4572
4573 i++;
4574 dst = mem_map_next(dst, dst_base, i);
4575 src = mem_map_next(src, src_base, i);
4576 }
4577 }
4578
4579 struct copy_subpage_arg {
4580 struct page *dst;
4581 struct page *src;
4582 struct vm_area_struct *vma;
4583 };
4584
4585 static void copy_subpage(unsigned long addr, int idx, void *arg)
4586 {
4587 struct copy_subpage_arg *copy_arg = arg;
4588
4589 copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
4590 addr, copy_arg->vma);
4591 }
4592
4593 void copy_user_huge_page(struct page *dst, struct page *src,
4594 unsigned long addr_hint, struct vm_area_struct *vma,
4595 unsigned int pages_per_huge_page)
4596 {
4597 unsigned long addr = addr_hint &
4598 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4599 struct copy_subpage_arg arg = {
4600 .dst = dst,
4601 .src = src,
4602 .vma = vma,
4603 };
4604
4605 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4606 copy_user_gigantic_page(dst, src, addr, vma,
4607 pages_per_huge_page);
4608 return;
4609 }
4610
4611 process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
4612 }
4613
4614 long copy_huge_page_from_user(struct page *dst_page,
4615 const void __user *usr_src,
4616 unsigned int pages_per_huge_page,
4617 bool allow_pagefault)
4618 {
4619 void *src = (void *)usr_src;
4620 void *page_kaddr;
4621 unsigned long i, rc = 0;
4622 unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
4623
4624 for (i = 0; i < pages_per_huge_page; i++) {
4625 if (allow_pagefault)
4626 page_kaddr = kmap(dst_page + i);
4627 else
4628 page_kaddr = kmap_atomic(dst_page + i);
4629 rc = copy_from_user(page_kaddr,
4630 (const void __user *)(src + i * PAGE_SIZE),
4631 PAGE_SIZE);
4632 if (allow_pagefault)
4633 kunmap(dst_page + i);
4634 else
4635 kunmap_atomic(page_kaddr);
4636
4637 ret_val -= (PAGE_SIZE - rc);
4638 if (rc)
4639 break;
4640
4641 cond_resched();
4642 }
4643 return ret_val;
4644 }
4645 #endif
4646
4647 #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
4648
4649 static struct kmem_cache *page_ptl_cachep;
4650
4651 void __init ptlock_cache_init(void)
4652 {
4653 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4654 SLAB_PANIC, NULL);
4655 }
4656
4657 bool ptlock_alloc(struct page *page)
4658 {
4659 spinlock_t *ptl;
4660
4661 ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
4662 if (!ptl)
4663 return false;
4664 page->ptl = ptl;
4665 return true;
4666 }
4667
4668 void ptlock_free(struct page *page)
4669 {
4670 kmem_cache_free(page_ptl_cachep, page->ptl);
4671 }
4672 #endif