This source file includes the following definitions; a brief userspace usage sketch follows the list.
- madvise_need_mmap_write
- madvise_behavior
- swapin_walk_pmd_entry
- force_shm_swapin_readahead
- madvise_willneed
- madvise_cold_or_pageout_pte_range
- madvise_cold_page_range
- madvise_cold
- madvise_pageout_page_range
- can_do_pageout
- madvise_pageout
- madvise_free_pte_range
- madvise_free_single_vma
- madvise_dontneed_single_vma
- madvise_dontneed_free
- madvise_remove
- madvise_inject_error
- madvise_vma
- madvise_behavior_valid
- SYSCALL_DEFINE3
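
The listing below is the kernel-side implementation of madvise(2). As a reminder of the userspace interface it serves, here is a minimal, hypothetical usage sketch (not part of this file; assumes a Linux/glibc environment, error handling abbreviated) that maps an anonymous region and then advises the kernel the contents are no longer needed:

#define _DEFAULT_SOURCE		/* for MAP_ANONYMOUS on older glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096 * 16;

	/* Anonymous private mapping, demand-zeroed by the kernel. */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	memset(buf, 0xaa, len);		/* fault the pages in */

	/*
	 * Tell the kernel the contents are no longer needed; the pages are
	 * dropped and later accesses see zero-filled memory (handled by
	 * madvise_dontneed_free() in this file).
	 */
	if (madvise(buf, len, MADV_DONTNEED) != 0)
		perror("madvise(MADV_DONTNEED)");

	munmap(buf, len);
	return 0;
}
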
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *	linux/mm/madvise.c
4  *
5  * Copyright (C) 1999  Linus Torvalds
6  * Copyright (C) 2002  Christoph Hellwig
7  */
8
9 #include <linux/mman.h>
10 #include <linux/pagemap.h>
11 #include <linux/syscalls.h>
12 #include <linux/mempolicy.h>
13 #include <linux/page-isolation.h>
14 #include <linux/page_idle.h>
15 #include <linux/userfaultfd_k.h>
16 #include <linux/hugetlb.h>
17 #include <linux/falloc.h>
18 #include <linux/fadvise.h>
19 #include <linux/sched.h>
20 #include <linux/ksm.h>
21 #include <linux/fs.h>
22 #include <linux/file.h>
23 #include <linux/blkdev.h>
24 #include <linux/backing-dev.h>
25 #include <linux/pagewalk.h>
26 #include <linux/swap.h>
27 #include <linux/swapops.h>
28 #include <linux/shmem_fs.h>
29 #include <linux/mmu_notifier.h>
30
31 #include <asm/tlb.h>
32
33 #include "internal.h"
34
35 struct madvise_walk_private {
36 struct mmu_gather *tlb;
37 bool pageout;
38 };
39
40 /*
41  * Behaviours that only read or drop page-table state can run under a
42  * shared mmap_sem; anything that updates vma->vm_flags needs it held
43  * for write, which is what the default case reports.
44  */
45 static int madvise_need_mmap_write(int behavior)
46 {
47 switch (behavior) {
48 case MADV_REMOVE:
49 case MADV_WILLNEED:
50 case MADV_DONTNEED:
51 case MADV_COLD:
52 case MADV_PAGEOUT:
53 case MADV_FREE:
54 return 0;
55 default:
56 /* be safe: require mmap_sem held for write by default */
57 return 1;
58 }
59 }
60
61 /*
62  * We can potentially split a vm area into separate areas, each with
63  * its own behavior.
64  */
65 static long madvise_behavior(struct vm_area_struct *vma,
66 struct vm_area_struct **prev,
67 unsigned long start, unsigned long end, int behavior)
68 {
69 struct mm_struct *mm = vma->vm_mm;
70 int error = 0;
71 pgoff_t pgoff;
72 unsigned long new_flags = vma->vm_flags;
73
74 switch (behavior) {
75 case MADV_NORMAL:
76 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
77 break;
78 case MADV_SEQUENTIAL:
79 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
80 break;
81 case MADV_RANDOM:
82 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
83 break;
84 case MADV_DONTFORK:
85 new_flags |= VM_DONTCOPY;
86 break;
87 case MADV_DOFORK:
88 if (vma->vm_flags & VM_IO) {
89 error = -EINVAL;
90 goto out;
91 }
92 new_flags &= ~VM_DONTCOPY;
93 break;
94 case MADV_WIPEONFORK:
95 /* MADV_WIPEONFORK is only supported on anonymous private memory. */
96 if (vma->vm_file || vma->vm_flags & VM_SHARED) {
97 error = -EINVAL;
98 goto out;
99 }
100 new_flags |= VM_WIPEONFORK;
101 break;
102 case MADV_KEEPONFORK:
103 new_flags &= ~VM_WIPEONFORK;
104 break;
105 case MADV_DONTDUMP:
106 new_flags |= VM_DONTDUMP;
107 break;
108 case MADV_DODUMP:
109 if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
110 error = -EINVAL;
111 goto out;
112 }
113 new_flags &= ~VM_DONTDUMP;
114 break;
115 case MADV_MERGEABLE:
116 case MADV_UNMERGEABLE:
117 error = ksm_madvise(vma, start, end, behavior, &new_flags);
118 if (error)
119 goto out_convert_errno;
120 break;
121 case MADV_HUGEPAGE:
122 case MADV_NOHUGEPAGE:
123 error = hugepage_madvise(vma, &new_flags, behavior);
124 if (error)
125 goto out_convert_errno;
126 break;
127 }
128
129 if (new_flags == vma->vm_flags) {
130 *prev = vma;
131 goto out;
132 }
133
134 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
135 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
136 vma->vm_file, pgoff, vma_policy(vma),
137 vma->vm_userfaultfd_ctx);
138 if (*prev) {
139 vma = *prev;
140 goto success;
141 }
142
143 *prev = vma;
144
145 if (start != vma->vm_start) {
146 if (unlikely(mm->map_count >= sysctl_max_map_count)) {
147 error = -ENOMEM;
148 goto out;
149 }
150 error = __split_vma(mm, vma, start, 1);
151 if (error)
152 goto out_convert_errno;
153 }
154
155 if (end != vma->vm_end) {
156 if (unlikely(mm->map_count >= sysctl_max_map_count)) {
157 error = -ENOMEM;
158 goto out;
159 }
160 error = __split_vma(mm, vma, end, 0);
161 if (error)
162 goto out_convert_errno;
163 }
164
165 success:
166 /*
167  * vma->vm_flags is protected by the mmap_sem held in write mode.
168  */
169 vma->vm_flags = new_flags;
170
171 out_convert_errno:
172 /*
173  * madvise() returns EAGAIN if kernel resources, such as slab, are
174  * temporarily unavailable.
175  */
176 if (error == -ENOMEM)
177 error = -EAGAIN;
178 out:
179 return error;
180 }
181
182 #ifdef CONFIG_SWAP
183 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
184 unsigned long end, struct mm_walk *walk)
185 {
186 pte_t *orig_pte;
187 struct vm_area_struct *vma = walk->private;
188 unsigned long index;
189
190 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
191 return 0;
192
193 for (index = start; index != end; index += PAGE_SIZE) {
194 pte_t pte;
195 swp_entry_t entry;
196 struct page *page;
197 spinlock_t *ptl;
198
199 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
200 pte = *(orig_pte + ((index - start) / PAGE_SIZE));
201 pte_unmap_unlock(orig_pte, ptl);
202
203 if (pte_present(pte) || pte_none(pte))
204 continue;
205 entry = pte_to_swp_entry(pte);
206 if (unlikely(non_swap_entry(entry)))
207 continue;
208
209 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
210 vma, index, false);
211 if (page)
212 put_page(page);
213 }
214
215 return 0;
216 }
217
218 static const struct mm_walk_ops swapin_walk_ops = {
219 .pmd_entry = swapin_walk_pmd_entry,
220 };
221
222 static void force_shm_swapin_readahead(struct vm_area_struct *vma,
223 unsigned long start, unsigned long end,
224 struct address_space *mapping)
225 {
226 pgoff_t index;
227 struct page *page;
228 swp_entry_t swap;
229
230 for (; start < end; start += PAGE_SIZE) {
231 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
232
233 page = find_get_entry(mapping, index);
234 if (!xa_is_value(page)) {
235 if (page)
236 put_page(page);
237 continue;
238 }
239 swap = radix_to_swp_entry(page);
240 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
241 NULL, 0, false);
242 if (page)
243 put_page(page);
244 }
245
246 lru_add_drain();
247 }
248 #endif
249
250 /*
251  * Schedule all required I/O operations.  Do not wait for completion.
252  */
253 static long madvise_willneed(struct vm_area_struct *vma,
254 struct vm_area_struct **prev,
255 unsigned long start, unsigned long end)
256 {
257 struct file *file = vma->vm_file;
258 loff_t offset;
259
260 *prev = vma;
261 #ifdef CONFIG_SWAP
262 if (!file) {
263 walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
264 lru_add_drain();
265 return 0;
266 }
267
268 if (shmem_mapping(file->f_mapping)) {
269 force_shm_swapin_readahead(vma, start, end,
270 file->f_mapping);
271 return 0;
272 }
273 #else
274 if (!file)
275 return -EBADF;
276 #endif
277
278 if (IS_DAX(file_inode(file))) {
279 /* no bad return value, but ignore advice */
280 return 0;
281 }
282
283
284 /*
285  * The filesystem's fadvise implementation may block.  Take a reference
286  * on the file and drop mmap_sem so that the vma (and hence its file
287  * reference) cannot go away underneath vfs_fadvise().
288  */
289 *prev = NULL;
290 get_file(file);
291 up_read(&current->mm->mmap_sem);
292 offset = (loff_t)(start - vma->vm_start)
293 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
294 vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
295 fput(file);
296 down_read(&current->mm->mmap_sem);
297 return 0;
298 }
299
300 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
301 unsigned long addr, unsigned long end,
302 struct mm_walk *walk)
303 {
304 struct madvise_walk_private *private = walk->private;
305 struct mmu_gather *tlb = private->tlb;
306 bool pageout = private->pageout;
307 struct mm_struct *mm = tlb->mm;
308 struct vm_area_struct *vma = walk->vma;
309 pte_t *orig_pte, *pte, ptent;
310 spinlock_t *ptl;
311 struct page *page = NULL;
312 LIST_HEAD(page_list);
313
314 if (fatal_signal_pending(current))
315 return -EINTR;
316
317 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
318 if (pmd_trans_huge(*pmd)) {
319 pmd_t orig_pmd;
320 unsigned long next = pmd_addr_end(addr, end);
321
322 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
323 ptl = pmd_trans_huge_lock(pmd, vma);
324 if (!ptl)
325 return 0;
326
327 orig_pmd = *pmd;
328 if (is_huge_zero_pmd(orig_pmd))
329 goto huge_unlock;
330
331 if (unlikely(!pmd_present(orig_pmd))) {
332 VM_BUG_ON(thp_migration_supported() &&
333 !is_pmd_migration_entry(orig_pmd));
334 goto huge_unlock;
335 }
336
337 page = pmd_page(orig_pmd);
338
339 /* Do not interfere with other mappings of this page. */
340 if (page_mapcount(page) != 1)
341 goto huge_unlock;
342
343 if (next - addr != HPAGE_PMD_SIZE) {
344 int err;
345
346 get_page(page);
347 spin_unlock(ptl);
348 lock_page(page);
349 err = split_huge_page(page);
350 unlock_page(page);
351 put_page(page);
352 if (!err)
353 goto regular_page;
354 return 0;
355 }
356
357 if (pmd_young(orig_pmd)) {
358 pmdp_invalidate(vma, addr, pmd);
359 orig_pmd = pmd_mkold(orig_pmd);
360
361 set_pmd_at(mm, addr, pmd, orig_pmd);
362 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
363 }
364
365 ClearPageReferenced(page);
366 test_and_clear_page_young(page);
367 if (pageout) {
368 if (!isolate_lru_page(page)) {
369 if (PageUnevictable(page))
370 putback_lru_page(page);
371 else
372 list_add(&page->lru, &page_list);
373 }
374 } else
375 deactivate_page(page);
376 huge_unlock:
377 spin_unlock(ptl);
378 if (pageout)
379 reclaim_pages(&page_list);
380 return 0;
381 }
382
383 if (pmd_trans_unstable(pmd))
384 return 0;
385 regular_page:
386 #endif
387 tlb_change_page_size(tlb, PAGE_SIZE);
388 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
389 flush_tlb_batched_pending(mm);
390 arch_enter_lazy_mmu_mode();
391 for (; addr < end; pte++, addr += PAGE_SIZE) {
392 ptent = *pte;
393
394 if (pte_none(ptent))
395 continue;
396
397 if (!pte_present(ptent))
398 continue;
399
400 page = vm_normal_page(vma, addr, ptent);
401 if (!page)
402 continue;
403
404 /*
405  * Creating a THP page is expensive, so split it only if we are the
406  * sole owner; splitting a shared THP would not be worth it.
407  */
408 if (PageTransCompound(page)) {
409 if (page_mapcount(page) != 1)
410 break;
411 get_page(page);
412 if (!trylock_page(page)) {
413 put_page(page);
414 break;
415 }
416 pte_unmap_unlock(orig_pte, ptl);
417 if (split_huge_page(page)) {
418 unlock_page(page);
419 put_page(page);
420 pte_offset_map_lock(mm, pmd, addr, &ptl);
421 break;
422 }
423 unlock_page(page);
424 put_page(page);
425 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
426 pte--;
427 addr -= PAGE_SIZE;
428 continue;
429 }
430
431 /* Do not interfere with other mappings of this page. */
432 if (page_mapcount(page) != 1)
433 continue;
434
435 VM_BUG_ON_PAGE(PageTransCompound(page), page);
436
437 if (pte_young(ptent)) {
438 ptent = ptep_get_and_clear_full(mm, addr, pte,
439 tlb->fullmm);
440 ptent = pte_mkold(ptent);
441 set_pte_at(mm, addr, pte, ptent);
442 tlb_remove_tlb_entry(tlb, pte, addr);
443 }
444
445
446 /*
447  * Clear the young/referenced state so that reclaim will treat the
448  * page as cold; for MADV_PAGEOUT the page is then isolated and
449  * handed to reclaim_pages() below.
450  */
451 ClearPageReferenced(page);
452 test_and_clear_page_young(page);
453 if (pageout) {
454 if (!isolate_lru_page(page)) {
455 if (PageUnevictable(page))
456 putback_lru_page(page);
457 else
458 list_add(&page->lru, &page_list);
459 }
460 } else
461 deactivate_page(page);
462 }
463
464 arch_leave_lazy_mmu_mode();
465 pte_unmap_unlock(orig_pte, ptl);
466 if (pageout)
467 reclaim_pages(&page_list);
468 cond_resched();
469
470 return 0;
471 }
472
473 static const struct mm_walk_ops cold_walk_ops = {
474 .pmd_entry = madvise_cold_or_pageout_pte_range,
475 };
476
477 static void madvise_cold_page_range(struct mmu_gather *tlb,
478 struct vm_area_struct *vma,
479 unsigned long addr, unsigned long end)
480 {
481 struct madvise_walk_private walk_private = {
482 .pageout = false,
483 .tlb = tlb,
484 };
485
486 tlb_start_vma(tlb, vma);
487 walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
488 tlb_end_vma(tlb, vma);
489 }
490
491 static long madvise_cold(struct vm_area_struct *vma,
492 struct vm_area_struct **prev,
493 unsigned long start_addr, unsigned long end_addr)
494 {
495 struct mm_struct *mm = vma->vm_mm;
496 struct mmu_gather tlb;
497
498 *prev = vma;
499 if (!can_madv_lru_vma(vma))
500 return -EINVAL;
501
502 lru_add_drain();
503 tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
504 madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
505 tlb_finish_mmu(&tlb, start_addr, end_addr);
506
507 return 0;
508 }
509
510 static void madvise_pageout_page_range(struct mmu_gather *tlb,
511 struct vm_area_struct *vma,
512 unsigned long addr, unsigned long end)
513 {
514 struct madvise_walk_private walk_private = {
515 .pageout = true,
516 .tlb = tlb,
517 };
518
519 tlb_start_vma(tlb, vma);
520 walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
521 tlb_end_vma(tlb, vma);
522 }
523
524 static inline bool can_do_pageout(struct vm_area_struct *vma)
525 {
526 if (vma_is_anonymous(vma))
527 return true;
528 if (!vma->vm_file)
529 return false;
530
531 /*
532  * Only allow paging out page cache for mappings of files the caller
533  * could open for writing; otherwise MADV_PAGEOUT on shared read-only
534  * mappings would open a reclaim side channel.
535  */
536 return inode_owner_or_capable(file_inode(vma->vm_file)) ||
537 inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
538 }
539
540 static long madvise_pageout(struct vm_area_struct *vma,
541 struct vm_area_struct **prev,
542 unsigned long start_addr, unsigned long end_addr)
543 {
544 struct mm_struct *mm = vma->vm_mm;
545 struct mmu_gather tlb;
546
547 *prev = vma;
548 if (!can_madv_lru_vma(vma))
549 return -EINVAL;
550
551 if (!can_do_pageout(vma))
552 return 0;
553
554 lru_add_drain();
555 tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
556 madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
557 tlb_finish_mmu(&tlb, start_addr, end_addr);
558
559 return 0;
560 }
561
562 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
563 unsigned long end, struct mm_walk *walk)
564
565 {
566 struct mmu_gather *tlb = walk->private;
567 struct mm_struct *mm = tlb->mm;
568 struct vm_area_struct *vma = walk->vma;
569 spinlock_t *ptl;
570 pte_t *orig_pte, *pte, ptent;
571 struct page *page;
572 int nr_swap = 0;
573 unsigned long next;
574
575 next = pmd_addr_end(addr, end);
576 if (pmd_trans_huge(*pmd))
577 if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
578 goto next;
579
580 if (pmd_trans_unstable(pmd))
581 return 0;
582
583 tlb_change_page_size(tlb, PAGE_SIZE);
584 orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
585 flush_tlb_batched_pending(mm);
586 arch_enter_lazy_mmu_mode();
587 for (; addr != end; pte++, addr += PAGE_SIZE) {
588 ptent = *pte;
589
590 if (pte_none(ptent))
591 continue;
592
593 /*
594  * If the pte holds a swap entry, free the swap slot and clear the
595  * entry rather than paying for a swap-in that would be discarded.
596  */
597 if (!pte_present(ptent)) {
598 swp_entry_t entry;
599
600 entry = pte_to_swp_entry(ptent);
601 if (non_swap_entry(entry))
602 continue;
603 nr_swap--;
604 free_swap_and_cache(entry);
605 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
606 continue;
607 }
608
609 page = vm_normal_page(vma, addr, ptent);
610 if (!page)
611 continue;
612
613
614 /*
615  * If the pmd is not transhuge but the page is a THP mapped only by
616  * this process, split it so the individual pages can be freed.
617  */
618 if (PageTransCompound(page)) {
619 if (page_mapcount(page) != 1)
620 goto out;
621 get_page(page);
622 if (!trylock_page(page)) {
623 put_page(page);
624 goto out;
625 }
626 pte_unmap_unlock(orig_pte, ptl);
627 if (split_huge_page(page)) {
628 unlock_page(page);
629 put_page(page);
630 pte_offset_map_lock(mm, pmd, addr, &ptl);
631 goto out;
632 }
633 unlock_page(page);
634 put_page(page);
635 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
636 pte--;
637 addr -= PAGE_SIZE;
638 continue;
639 }
640
641 VM_BUG_ON_PAGE(PageTransCompound(page), page);
642
643 if (PageSwapCache(page) || PageDirty(page)) {
644 if (!trylock_page(page))
645 continue;
646
647 /*
648  * If the page is mapped elsewhere we must not clear its dirty state.
649  */
650 if (page_mapcount(page) != 1) {
651 unlock_page(page);
652 continue;
653 }
654
655 if (PageSwapCache(page) && !try_to_free_swap(page)) {
656 unlock_page(page);
657 continue;
658 }
659
660 ClearPageDirty(page);
661 unlock_page(page);
662 }
663
664 if (pte_young(ptent) || pte_dirty(ptent)) {
665 /*
666  * Some architectures (e.g. powerpc) do not update the TLB via
667  * set_pte_at() and tlb_remove_tlb_entry(), so for portability clear
668  * the pte and then rewrite it with the young and dirty bits dropped
669  * rather than modifying it in place.
670  */
671 ptent = ptep_get_and_clear_full(mm, addr, pte,
672 tlb->fullmm);
673
674 ptent = pte_mkold(ptent);
675 ptent = pte_mkclean(ptent);
676 set_pte_at(mm, addr, pte, ptent);
677 tlb_remove_tlb_entry(tlb, pte, addr);
678 }
679 mark_page_lazyfree(page);
680 }
681 out:
682 if (nr_swap) {
683 if (current->mm == mm)
684 sync_mm_rss(mm);
685
686 add_mm_counter(mm, MM_SWAPENTS, nr_swap);
687 }
688 arch_leave_lazy_mmu_mode();
689 pte_unmap_unlock(orig_pte, ptl);
690 cond_resched();
691 next:
692 return 0;
693 }
694
695 static const struct mm_walk_ops madvise_free_walk_ops = {
696 .pmd_entry = madvise_free_pte_range,
697 };
698
699 static int madvise_free_single_vma(struct vm_area_struct *vma,
700 unsigned long start_addr, unsigned long end_addr)
701 {
702 struct mm_struct *mm = vma->vm_mm;
703 struct mmu_notifier_range range;
704 struct mmu_gather tlb;
705
706 /* MADV_FREE works for only anon vma at the moment */
707 if (!vma_is_anonymous(vma))
708 return -EINVAL;
709
710 range.start = max(vma->vm_start, start_addr);
711 if (range.start >= vma->vm_end)
712 return -EINVAL;
713 range.end = min(vma->vm_end, end_addr);
714 if (range.end <= vma->vm_start)
715 return -EINVAL;
716 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
717 range.start, range.end);
718
719 lru_add_drain();
720 tlb_gather_mmu(&tlb, mm, range.start, range.end);
721 update_hiwater_rss(mm);
722
723 mmu_notifier_invalidate_range_start(&range);
724 tlb_start_vma(&tlb, vma);
725 walk_page_range(vma->vm_mm, range.start, range.end,
726 &madvise_free_walk_ops, &tlb);
727 tlb_end_vma(&tlb, vma);
728 mmu_notifier_invalidate_range_end(&range);
729 tlb_finish_mmu(&tlb, range.start, range.end);
730
731 return 0;
732 }
733
734 /*
735  * MADV_DONTNEED: the application no longer needs these pages.  Dirty
736  * contents may simply be discarded and any swap backing is released;
737  * a later access faults in fresh zero-filled pages (or re-reads the
738  * backing file for file mappings).
739  *
740  * Unlike MADV_FREE, which defers the work and lets the pages survive
741  * until memory pressure, MADV_DONTNEED drops the range immediately
742  * via zap_page_range().
743  */
753 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
754 unsigned long start, unsigned long end)
755 {
756 zap_page_range(vma, start, end - start);
757 return 0;
758 }
759
760 static long madvise_dontneed_free(struct vm_area_struct *vma,
761 struct vm_area_struct **prev,
762 unsigned long start, unsigned long end,
763 int behavior)
764 {
765 *prev = vma;
766 if (!can_madv_lru_vma(vma))
767 return -EINVAL;
768
769 if (!userfaultfd_remove(vma, start, end)) {
770 *prev = NULL;
771 /* mmap_sem was dropped by userfaultfd_remove(); retake and revalidate */
772 down_read(&current->mm->mmap_sem);
773 vma = find_vma(current->mm, start);
774 if (!vma)
775 return -ENOMEM;
776 if (start < vma->vm_start) {
777 /*
778  * The vma we re-found is the one with the lowest vm_start that still
779  * has start < vm_end.  If start < vma->vm_start, a hole materialized
780  * in the address range while mmap_sem was dropped, so the advice
781  * cannot be applied and we report -ENOMEM.
782  */
786 return -ENOMEM;
787 }
788 if (!can_madv_lru_vma(vma))
789 return -EINVAL;
790 if (end > vma->vm_end) {
791 /*
792  * Don't fail if end > vma->vm_end.  The old vma may have been split
793  * or unmapped while mmap_sem was dropped; apply the advice only up to
794  * the current vm_end, since a concurrent modification of the range
795  * leaves the outcome for the remainder undefined in any case.
796  */
803 end = vma->vm_end;
804 }
805 VM_WARN_ON(start >= end);
806 }
807
808 if (behavior == MADV_DONTNEED)
809 return madvise_dontneed_single_vma(vma, start, end);
810 else if (behavior == MADV_FREE)
811 return madvise_free_single_vma(vma, start, end);
812 else
813 return -EINVAL;
814 }
815
816 /*
817  * Application wants to free up the pages and associated backing store.
818  * This is effectively punching a hole into the middle of a file.
819  */
820 static long madvise_remove(struct vm_area_struct *vma,
821 struct vm_area_struct **prev,
822 unsigned long start, unsigned long end)
823 {
824 loff_t offset;
825 int error;
826 struct file *f;
827
828 *prev = NULL;
829
830 if (vma->vm_flags & VM_LOCKED)
831 return -EINVAL;
832
833 f = vma->vm_file;
834
835 if (!f || !f->f_mapping || !f->f_mapping->host) {
836 return -EINVAL;
837 }
838
839 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
840 return -EACCES;
841
842 offset = (loff_t)(start - vma->vm_start)
843 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
844
845 /*
846  * The filesystem's fallocate may need to take i_mutex.  Grab a file
847  * reference and drop mmap_sem so the vma (and hence its file
848  * reference) can't go away while the hole is being punched.
849  */
850
851 get_file(f);
852 if (userfaultfd_remove(vma, start, end)) {
853 /* mmap_sem was not released by userfaultfd_remove() */
854 up_read(&current->mm->mmap_sem);
855 }
856 error = vfs_fallocate(f,
857 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
858 offset, end - start);
859 fput(f);
860 down_read(&current->mm->mmap_sem);
861 return error;
862 }
863
864 #ifdef CONFIG_MEMORY_FAILURE
865 /*
866  * Error injection support for memory error handling.
867  */
868 static int madvise_inject_error(int behavior,
869 unsigned long start, unsigned long end)
870 {
871 struct page *page;
872 struct zone *zone;
873 unsigned int order;
874
875 if (!capable(CAP_SYS_ADMIN))
876 return -EPERM;
877
878
879 for (; start < end; start += PAGE_SIZE << order) {
880 unsigned long pfn;
881 int ret;
882
883 ret = get_user_pages_fast(start, 1, 0, &page);
884 if (ret != 1)
885 return ret;
886 pfn = page_to_pfn(page);
887
888 /*
889  * When soft offlining hugepages, after migrating the page we dissolve
890  * it, so on the next loop iteration "page" will no longer be a
891  * compound page; recompute the order each time around.
892  */
893 order = compound_order(compound_head(page));
894
895 if (PageHWPoison(page)) {
896 put_page(page);
897 continue;
898 }
899
900 if (behavior == MADV_SOFT_OFFLINE) {
901 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
902 pfn, start);
903
904 ret = soft_offline_page(page, MF_COUNT_INCREASED);
905 if (ret)
906 return ret;
907 continue;
908 }
909
910 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
911 pfn, start);
912
913
914 /*
915  * Drop the reference taken by get_user_pages_fast().  Without
916  * MF_COUNT_INCREASED, memory_failure() is responsible for pinning
917  * the page to keep it from being released back to the allocator.
918  */
919 put_page(page);
920 ret = memory_failure(pfn, 0);
921 if (ret)
922 return ret;
923 }
924
925 /* Ensure that all poisoned pages are removed from per-cpu lists */
926 for_each_populated_zone(zone)
927 drain_all_pages(zone);
928
929 return 0;
930 }
931 #endif
932
933 static long
934 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
935 unsigned long start, unsigned long end, int behavior)
936 {
937 switch (behavior) {
938 case MADV_REMOVE:
939 return madvise_remove(vma, prev, start, end);
940 case MADV_WILLNEED:
941 return madvise_willneed(vma, prev, start, end);
942 case MADV_COLD:
943 return madvise_cold(vma, prev, start, end);
944 case MADV_PAGEOUT:
945 return madvise_pageout(vma, prev, start, end);
946 case MADV_FREE:
947 case MADV_DONTNEED:
948 return madvise_dontneed_free(vma, prev, start, end, behavior);
949 default:
950 return madvise_behavior(vma, prev, start, end, behavior);
951 }
952 }
953
954 static bool
955 madvise_behavior_valid(int behavior)
956 {
957 switch (behavior) {
958 case MADV_DOFORK:
959 case MADV_DONTFORK:
960 case MADV_NORMAL:
961 case MADV_SEQUENTIAL:
962 case MADV_RANDOM:
963 case MADV_REMOVE:
964 case MADV_WILLNEED:
965 case MADV_DONTNEED:
966 case MADV_FREE:
967 case MADV_COLD:
968 case MADV_PAGEOUT:
969 #ifdef CONFIG_KSM
970 case MADV_MERGEABLE:
971 case MADV_UNMERGEABLE:
972 #endif
973 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
974 case MADV_HUGEPAGE:
975 case MADV_NOHUGEPAGE:
976 #endif
977 case MADV_DONTDUMP:
978 case MADV_DODUMP:
979 case MADV_WIPEONFORK:
980 case MADV_KEEPONFORK:
981 #ifdef CONFIG_MEMORY_FAILURE
982 case MADV_SOFT_OFFLINE:
983 case MADV_HWPOISON:
984 #endif
985 return true;
986
987 default:
988 return false;
989 }
990 }
991
992 /*
993  * The madvise(2) system call.
994  *
995  * Applications can use madvise() to advise the kernel how it should
996  * handle paging I/O in this VM area.  The idea is to help the kernel
997  * discard unneeded pages earlier, or read pages in ahead of time.
998  *
999  * behavior values:
1000  *  MADV_NORMAL - no special treatment; default readahead behaviour.
1001  *  MADV_RANDOM - expect random page references; read the minimum
1002  *		amount of data on any fault.
1003  *  MADV_SEQUENTIAL - expect sequential page references; read ahead
1004  *		aggressively.
1005  *  MADV_WILLNEED - the application will need these pages soon; start
1006  *		reading them in (madvise_willneed()).
1007  *  MADV_DONTNEED - the application is finished with these pages; zap
1008  *		them now (madvise_dontneed_free()).
1009  *  MADV_FREE - mark the pages lazily freeable; they may be reclaimed
1010  *		without writeback unless they are redirtied first.
1011  *  MADV_COLD - deactivate the pages so reclaim treats them as cold.
1012  *  MADV_PAGEOUT - reclaim the pages immediately (madvise_pageout()).
1013  *  MADV_REMOVE - free the pages and their backing store, punching a
1014  *		hole in the underlying file (madvise_remove()).
1015  *  MADV_DONTFORK - do not copy this vma into child processes.
1016  *  MADV_DOFORK - undo MADV_DONTFORK.
1017  *  MADV_WIPEONFORK - present zero-filled memory to the child after
1018  *		fork() (anonymous private mappings only).
1019  *  MADV_KEEPONFORK - undo MADV_WIPEONFORK.
1020  *  MADV_MERGEABLE/MADV_UNMERGEABLE - enable or disable KSM merging
1021  *		for the range (CONFIG_KSM).
1022  *  MADV_HUGEPAGE/MADV_NOHUGEPAGE - enable or disable transparent
1023  *		hugepages for the range (CONFIG_TRANSPARENT_HUGEPAGE).
1024  *  MADV_DONTDUMP/MADV_DODUMP - exclude from or include in core dumps.
1025  *  MADV_HWPOISON/MADV_SOFT_OFFLINE - memory-failure injection for
1026  *		testing (CONFIG_MEMORY_FAILURE, CAP_SYS_ADMIN only).
1027  *
1028  * return values:
1029  *  zero    - success
1030  *  -EINVAL - start is not page-aligned, behavior is not valid, or the
1031  *		advice cannot be applied to this kind of vma.
1032  *  -ENOMEM - part of the range is not mapped, or splitting the vma
1033  *		would exceed the mapping limit.
1034  *  -EIO    - an I/O error occurred while paging in data.
1035  *  -EBADF  - the map exists, but the area maps something that isn't
1036  *		a file.
1037  *  -EAGAIN - a kernel resource was temporarily unavailable.
1038  */
1053 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
1054 {
1055 unsigned long end, tmp;
1056 struct vm_area_struct *vma, *prev;
1057 int unmapped_error = 0;
1058 int error = -EINVAL;
1059 int write;
1060 size_t len;
1061 struct blk_plug plug;
1062
1063 start = untagged_addr(start);
1064
1065 if (!madvise_behavior_valid(behavior))
1066 return error;
1067
1068 if (start & ~PAGE_MASK)
1069 return error;
1070 len = (len_in + ~PAGE_MASK) & PAGE_MASK;
1071
1072 /* Check to see whether len was rounded up from small -ve to zero */
1073 if (len_in && !len)
1074 return error;
1075
1076 end = start + len;
1077 if (end < start)
1078 return error;
1079
1080 error = 0;
1081 if (end == start)
1082 return error;
1083
1084 #ifdef CONFIG_MEMORY_FAILURE
1085 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
1086 return madvise_inject_error(behavior, start, start + len_in);
1087 #endif
1088
1089 write = madvise_need_mmap_write(behavior);
1090 if (write) {
1091 if (down_write_killable(&current->mm->mmap_sem))
1092 return -EINTR;
1093 } else {
1094 down_read(&current->mm->mmap_sem);
1095 }
1096
1097 /*
1098  * If the interval [start, end) covers some unmapped address
1099  * ranges, just ignore them, but return -ENOMEM at the end
1100  * (different from the handling in mlock etc.).
1101  */
1102 vma = find_vma_prev(current->mm, start, &prev);
1103 if (vma && start > vma->vm_start)
1104 prev = vma;
1105
1106 blk_start_plug(&plug);
1107 for (;;) {
1108 /* Still start < end. */
1109 error = -ENOMEM;
1110 if (!vma)
1111 goto out;
1112
1113 /* Here start < (end|vma->vm_end). */
1114 if (start < vma->vm_start) {
1115 unmapped_error = -ENOMEM;
1116 start = vma->vm_start;
1117 if (start >= end)
1118 goto out;
1119 }
1120
1121 /* Here vma->vm_start <= start < (end|vma->vm_end). */
1122 tmp = vma->vm_end;
1123 if (end < tmp)
1124 tmp = end;
1125
1126 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
1127 error = madvise_vma(vma, &prev, start, tmp, behavior);
1128 if (error)
1129 goto out;
1130 start = tmp;
1131 if (prev && start < prev->vm_end)
1132 start = prev->vm_end;
1133 error = unmapped_error;
1134 if (start >= end)
1135 goto out;
1136 if (prev)
1137 vma = prev->vm_next;
1138 else
1139 vma = find_vma(current->mm, start);
1140 }
1141 out:
1142 blk_finish_plug(&plug);
1143 if (write)
1144 up_write(&current->mm->mmap_sem);
1145 else
1146 up_read(&current->mm->mmap_sem);
1147
1148 return error;
1149 }