/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/swapops.h>

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
				vma->vm_file, pgoff, vma_policy(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto out;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto out;
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;

out:
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}
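
/*
 * Illustration only (not part of the kernel): advising a single page in
 * the middle of a vma that cannot be merged with its neighbours, e.g.
 *
 *	madvise(base + 2 * PAGE_SIZE, PAGE_SIZE, MADV_DONTFORK);
 *
 * is expected to call split_vma() twice above, leaving that page in a
 * vma of its own with VM_DONTCOPY set while the flanking vmas keep the
 * old flags.
 */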

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;
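		/*
		 * Take the pte lock only long enough to sample the pte;
		 * it is dropped before read_swap_cache_async(), which can
		 * sleep.
		 */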
		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
								vma, index);
		if (page)
			page_cache_release(page);
	}

	return 0;
}

static void force_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end)
{
	struct mm_walk walk = {
		.mm = vma->vm_mm,
		.pmd_entry = swapin_walk_pmd_entry,
		.private = vma,
	};

	walk_page_range(start, end, &walk);

	lru_add_drain();	/* Push any new pages onto the LRU now */
}

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	pgoff_t index;
	struct page *page;
	swp_entry_t swap;

	for (; start < end; start += PAGE_SIZE) {
		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

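		/*
		 * Swapped-out shmem pages are represented in the page cache
		 * radix tree by exceptional (swap) entries; anything else is
		 * either absent or already resident, so there is nothing to
		 * read ahead for it.
		 */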
		page = find_get_entry(mapping, index);
		if (!radix_tree_exceptional_entry(page)) {
			if (page)
				page_cache_release(page);
			continue;
		}
		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
								NULL, 0);
		if (page)
			page_cache_release(page);
	}

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif		/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;

#ifdef CONFIG_SWAP
	if (!file) {
		*prev = vma;
		force_swapin_readahead(vma, start, end);
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		*prev = vma;
		force_shm_swapin_readahead(vma, start, end,
					file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	*prev = vma;
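	/* Convert the byte range into page offsets within the file. */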
	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
		end = vma->vm_end;
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	force_page_cache_readahead(file->f_mapping, file, start, end - start);
	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	*prev = vma;
	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
		return -EINVAL;

	zap_page_range(vma, start, end - start, NULL);
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host)
		return -EINVAL;

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

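	/* Translate the start address into a byte offset within the file. */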
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_mutex.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_sem.
	 */
	get_file(f);
	up_read(&current->mm->mmap_sem);
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	down_read(&current->mm->mmap_sem);
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
	struct page *p;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
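	/*
	 * Each iteration advances by the size of the (possibly compound)
	 * page handled in that pass; p is set by get_user_pages_fast()
	 * before the increment expression is first evaluated.
	 */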
	for (; start < end; start += PAGE_SIZE <<
				compound_order(compound_head(p))) {
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &p);
		if (ret != 1)
			return ret;

		if (PageHWPoison(p)) {
			put_page(p);
			continue;
		}
		if (bhv == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining page %#lx at %#lx\n",
				page_to_pfn(p), start);
			ret = soft_offline_page(p, MF_COUNT_INCREASED);
			if (ret)
				return ret;
			continue;
		}
		pr_info("Injecting memory failure for page %#lx at %#lx\n",
		       page_to_pfn(p), start);
		/* Ignore return value for now */
		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
	}
	return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_DONTNEED:
		return madvise_dontneed(vma, prev, start, end);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static int
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
		return 1;

	default:
		return 0;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the
 *		application will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range with
 *		transparent huge pages where possible.
 *  MADV_NOHUGEPAGE - mark the given range as not worth backing with
 *		transparent huge pages.
 *  MADV_DONTDUMP - exclude the given range from a core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: include the range in core dumps again.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
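
/*
 * Illustrative userspace usage (not kernel code), assuming a scratch
 * anonymous buffer that is no longer needed:
 *
 *	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	... use buf ...
 *	madvise(buf, len, MADV_DONTNEED);
 *
 * After the call the kernel may discard the pages; the range stays
 * mapped and reads back as zero-filled pages on the next access.
 */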
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;
	struct blk_plug plug;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_hwpoison(behavior, start, start+len_in);
#endif
	if (!madvise_behavior_valid(behavior))
		return error;

	if (start & ~PAGE_MASK)
		return error;
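	/* Round len up to a whole number of pages. */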
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return error;

	end = start + len;
	if (end < start)
		return error;

	error = 0;
	if (end == start)
		return error;

	write = madvise_need_mmap_write(behavior);
	if (write)
		down_write(&current->mm->mmap_sem);
	else
		down_read(&current->mm->mmap_sem);

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
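	/*
	 * If start falls inside vma, the part of vma below start serves
	 * as the predecessor for any later merge attempts.
	 */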
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}