1 /*
2  * hugetlbpage-backed filesystem.  Based on ramfs.
3  *
4  * Nadia Yvette Chambers, 2002
5  *
6  * Copyright (C) 2002 Linus Torvalds.
7  */
8 
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10 
11 #include <linux/module.h>
12 #include <linux/thread_info.h>
13 #include <asm/current.h>
14 #include <linux/sched.h>		/* remove ASAP */
15 #include <linux/falloc.h>
16 #include <linux/fs.h>
17 #include <linux/mount.h>
18 #include <linux/file.h>
19 #include <linux/kernel.h>
20 #include <linux/writeback.h>
21 #include <linux/pagemap.h>
22 #include <linux/highmem.h>
23 #include <linux/init.h>
24 #include <linux/string.h>
25 #include <linux/capability.h>
26 #include <linux/ctype.h>
27 #include <linux/backing-dev.h>
28 #include <linux/hugetlb.h>
29 #include <linux/pagevec.h>
30 #include <linux/parser.h>
31 #include <linux/mman.h>
32 #include <linux/slab.h>
33 #include <linux/dnotify.h>
34 #include <linux/statfs.h>
35 #include <linux/security.h>
36 #include <linux/magic.h>
37 #include <linux/migrate.h>
38 #include <linux/uio.h>
39 
40 #include <asm/uaccess.h>
41 
42 static const struct super_operations hugetlbfs_ops;
43 static const struct address_space_operations hugetlbfs_aops;
44 const struct file_operations hugetlbfs_file_operations;
45 static const struct inode_operations hugetlbfs_dir_inode_operations;
46 static const struct inode_operations hugetlbfs_inode_operations;
47 
48 struct hugetlbfs_config {
49 	kuid_t   uid;
50 	kgid_t   gid;
51 	umode_t mode;
52 	long	max_hpages;
53 	long	nr_inodes;
54 	struct hstate *hstate;
55 	long    min_hpages;
56 };
57 
58 struct hugetlbfs_inode_info {
59 	struct shared_policy policy;
60 	struct inode vfs_inode;
61 };
62 
63 static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
64 {
65 	return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
66 }
67 
68 int sysctl_hugetlb_shm_group;
69 
70 enum {
71 	Opt_size, Opt_nr_inodes,
72 	Opt_mode, Opt_uid, Opt_gid,
73 	Opt_pagesize, Opt_min_size,
74 	Opt_err,
75 };
76 
77 static const match_table_t tokens = {
78 	{Opt_size,	"size=%s"},
79 	{Opt_nr_inodes,	"nr_inodes=%s"},
80 	{Opt_mode,	"mode=%o"},
81 	{Opt_uid,	"uid=%u"},
82 	{Opt_gid,	"gid=%u"},
83 	{Opt_pagesize,	"pagesize=%s"},
84 	{Opt_min_size,	"min_size=%s"},
85 	{Opt_err,	NULL},
86 };
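
/*
 * Illustrative only -- the mount point and values below are made up, not
 * taken from this file.  A mount using a few of the options parsed by
 * hugetlbfs_parse_options() might look like:
 *
 *	mount -t hugetlbfs -o size=1G,min_size=512M,pagesize=2M,mode=1770 \
 *		none /mnt/huge
 *
 * size= and min_size= accept either a byte value (with an optional K/M/G
 * suffix) or a percentage of the huge page pool, e.g. size=50%.
 */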
87 
88 #ifdef CONFIG_NUMA
89 static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
90 					struct inode *inode, pgoff_t index)
91 {
92 	vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
93 							index);
94 }
95 
96 static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
97 {
98 	mpol_cond_put(vma->vm_policy);
99 }
100 #else
101 static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
102 					struct inode *inode, pgoff_t index)
103 {
104 }
105 
106 static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
107 {
108 }
109 #endif
110 
111 static void huge_pagevec_release(struct pagevec *pvec)
112 {
113 	int i;
114 
115 	for (i = 0; i < pagevec_count(pvec); ++i)
116 		put_page(pvec->pages[i]);
117 
118 	pagevec_reinit(pvec);
119 }
120 
121 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
122 {
123 	struct inode *inode = file_inode(file);
124 	loff_t len, vma_len;
125 	int ret;
126 	struct hstate *h = hstate_file(file);
127 
128 	/*
129 	 * vma address alignment (but not the pgoff alignment) has
130 	 * already been checked by prepare_hugepage_range.  If you add
131 	 * any error returns here, do so after setting VM_HUGETLB, so
132 	 * is_vm_hugetlb_page tests below unmap_region go the right
133 	 * way when do_mmap_pgoff unwinds (may be important on powerpc
134 	 * and ia64).
135 	 */
136 	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
137 	vma->vm_ops = &hugetlb_vm_ops;
138 
139 	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
140 		return -EINVAL;
141 
142 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
143 
144 	mutex_lock(&inode->i_mutex);
145 	file_accessed(file);
146 
147 	ret = -ENOMEM;
148 	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
149 
150 	if (hugetlb_reserve_pages(inode,
151 				vma->vm_pgoff >> huge_page_order(h),
152 				len >> huge_page_shift(h), vma,
153 				vma->vm_flags))
154 		goto out;
155 
156 	ret = 0;
157 	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
158 		inode->i_size = len;
159 out:
160 	mutex_unlock(&inode->i_mutex);
161 
162 	return ret;
163 }
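
/*
 * Illustrative consequence (not a claim from this file's authors): mmap()
 * of a hugetlbfs file fails with -EINVAL unless the file offset is a
 * multiple of the huge page size, because of the vm_pgoff alignment check
 * in hugetlbfs_file_mmap() above.
 */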
164 
165 /*
166  * Called under down_write(mmap_sem).
167  */
168 
169 #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
170 static unsigned long
171 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
172 		unsigned long len, unsigned long pgoff, unsigned long flags)
173 {
174 	struct mm_struct *mm = current->mm;
175 	struct vm_area_struct *vma;
176 	struct hstate *h = hstate_file(file);
177 	struct vm_unmapped_area_info info;
178 
179 	if (len & ~huge_page_mask(h))
180 		return -EINVAL;
181 	if (len > TASK_SIZE)
182 		return -ENOMEM;
183 
184 	if (flags & MAP_FIXED) {
185 		if (prepare_hugepage_range(file, addr, len))
186 			return -EINVAL;
187 		return addr;
188 	}
189 
190 	if (addr) {
191 		addr = ALIGN(addr, huge_page_size(h));
192 		vma = find_vma(mm, addr);
193 		if (TASK_SIZE - len >= addr &&
194 		    (!vma || addr + len <= vma->vm_start))
195 			return addr;
196 	}
197 
198 	info.flags = 0;
199 	info.length = len;
200 	info.low_limit = TASK_UNMAPPED_BASE;
201 	info.high_limit = TASK_SIZE;
202 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
203 	info.align_offset = 0;
204 	return vm_unmapped_area(&info);
205 }
206 #endif
207 
208 static size_t
209 hugetlbfs_read_actor(struct page *page, unsigned long offset,
210 			struct iov_iter *to, unsigned long size)
211 {
212 	size_t copied = 0;
213 	int i, chunksize;
214 
215 	/* Find which 4k chunk and offset within that chunk */
216 	i = offset >> PAGE_CACHE_SHIFT;
217 	offset = offset & ~PAGE_CACHE_MASK;
218 
219 	while (size) {
220 		size_t n;
221 		chunksize = PAGE_CACHE_SIZE;
222 		if (offset)
223 			chunksize -= offset;
224 		if (chunksize > size)
225 			chunksize = size;
226 		n = copy_page_to_iter(&page[i], offset, chunksize, to);
227 		copied += n;
228 		if (n != chunksize)
229 			return copied;
230 		offset = 0;
231 		size -= chunksize;
232 		i++;
233 	}
234 	return copied;
235 }
236 
237 /*
238  * Support for read() - Find the page attached to f_mapping and copy out the
239  * data. It's *very* similar to do_generic_mapping_read(), but we can't use that
240  * since it has PAGE_CACHE_SIZE assumptions.
241  */
242 static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
243 {
244 	struct file *file = iocb->ki_filp;
245 	struct hstate *h = hstate_file(file);
246 	struct address_space *mapping = file->f_mapping;
247 	struct inode *inode = mapping->host;
248 	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
249 	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
250 	unsigned long end_index;
251 	loff_t isize;
252 	ssize_t retval = 0;
253 
254 	while (iov_iter_count(to)) {
255 		struct page *page;
256 		size_t nr, copied;
257 
258 		/* nr is the maximum number of bytes to copy from this page */
259 		nr = huge_page_size(h);
260 		isize = i_size_read(inode);
261 		if (!isize)
262 			break;
263 		end_index = (isize - 1) >> huge_page_shift(h);
264 		if (index > end_index)
265 			break;
266 		if (index == end_index) {
267 			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
268 			if (nr <= offset)
269 				break;
270 		}
271 		nr = nr - offset;
272 
273 		/* Find the page */
274 		page = find_lock_page(mapping, index);
275 		if (unlikely(page == NULL)) {
276 			/*
277 			 * We have a HOLE, zero out the user-buffer for the
278 			 * length of the hole or request.
279 			 */
280 			copied = iov_iter_zero(nr, to);
281 		} else {
282 			unlock_page(page);
283 
284 			/*
285 			 * We have the page, copy it to user space buffer.
286 			 */
287 			copied = hugetlbfs_read_actor(page, offset, to, nr);
288 			page_cache_release(page);
289 		}
290 		offset += copied;
291 		retval += copied;
292 		if (copied != nr && iov_iter_count(to)) {
293 			if (!retval)
294 				retval = -EFAULT;
295 			break;
296 		}
297 		index += offset >> huge_page_shift(h);
298 		offset &= ~huge_page_mask(h);
299 	}
300 	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
301 	return retval;
302 }
303 
304 static int hugetlbfs_write_begin(struct file *file,
305 			struct address_space *mapping,
306 			loff_t pos, unsigned len, unsigned flags,
307 			struct page **pagep, void **fsdata)
308 {
309 	return -EINVAL;
310 }
311 
312 static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
313 			loff_t pos, unsigned len, unsigned copied,
314 			struct page *page, void *fsdata)
315 {
316 	BUG();
317 	return -EINVAL;
318 }
319 
320 static void remove_huge_page(struct page *page)
321 {
322 	ClearPageDirty(page);
323 	ClearPageUptodate(page);
324 	delete_from_page_cache(page);
325 }
326 
327 
328 /*
329  * remove_inode_hugepages handles two distinct cases: truncation and hole
330  * punch.  There are subtle differences in operation for each case.
331  *
332  * truncation is indicated by end of range being LLONG_MAX
333  *	In this case, we first scan the range and release found pages.
334  *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
335  *	maps and global counts.  Page faults can not race with truncation
336  *	in this routine.  hugetlb_no_page() prevents page faults in the
337  *	truncated range.  It checks i_size before allocation, and again after
338  *	with the page table lock for the page held.  The same lock must be
339  *	acquired to unmap a page.
340  * hole punch is indicated if end is not LLONG_MAX
341  *	In the hole punch case we scan the range and release found pages.
342  *	Only when releasing a page is the associated region/reserv map
343  *	deleted.  Region/reserv map entries for ranges without associated
344  *	pages are not modified.  Page faults can race with hole punch.
345  *	This is indicated if we find a mapped page.
346  * Note: If the passed end of range value is beyond the end of file, but
347  * not LLONG_MAX, this routine still performs a hole punch operation.
348  */
349 static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
350 				   loff_t lend)
351 {
352 	struct hstate *h = hstate_inode(inode);
353 	struct address_space *mapping = &inode->i_data;
354 	const pgoff_t start = lstart >> huge_page_shift(h);
355 	const pgoff_t end = lend >> huge_page_shift(h);
356 	struct vm_area_struct pseudo_vma;
357 	struct pagevec pvec;
358 	pgoff_t next;
359 	int i, freed = 0;
360 	long lookup_nr = PAGEVEC_SIZE;
361 	bool truncate_op = (lend == LLONG_MAX);
362 
363 	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
364 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
365 	pagevec_init(&pvec, 0);
366 	next = start;
367 	while (next < end) {
368 		/*
369 		 * Don't grab more pages than the number left in the range.
370 		 */
371 		if (end - next < lookup_nr)
372 			lookup_nr = end - next;
373 
374 		/*
375 		 * When no more pages are found, we are done.
376 		 */
377 		if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
378 			break;
379 
380 		for (i = 0; i < pagevec_count(&pvec); ++i) {
381 			struct page *page = pvec.pages[i];
382 			u32 hash;
383 
384 			/*
385 			 * The page (index) could be beyond end.  This is
386 			 * only possible in the punch hole case as end is
387 			 * max page offset in the truncate case.
388 			 */
389 			next = page->index;
390 			if (next >= end)
391 				break;
392 
393 			hash = hugetlb_fault_mutex_hash(h, current->mm,
394 							&pseudo_vma,
395 							mapping, next, 0);
396 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
397 
398 			lock_page(page);
399 			if (likely(!page_mapped(page))) {
400 				bool rsv_on_error = !PagePrivate(page);
401 				/*
402 				 * We must free the huge page and remove
403 				 * from page cache (remove_huge_page) BEFORE
404 				 * removing the region/reserve map
405 				 * (hugetlb_unreserve_pages).  In rare out
406 				 * of memory conditions, removal of the
407 				 * region/reserve map could fail.  Before
408 				 * freeing the page, note PagePrivate which
409 				 * is used in case of error.
410 				 */
411 				remove_huge_page(page);
412 				freed++;
413 				if (!truncate_op) {
414 					if (unlikely(hugetlb_unreserve_pages(
415 							inode, next,
416 							next + 1, 1)))
417 						hugetlb_fix_reserve_counts(
418 							inode, rsv_on_error);
419 				}
420 			} else {
421 				/*
422 				 * If page is mapped, it was faulted in after
423 				 * being unmapped.  It indicates a race between
424 				 * hole punch and page fault.  Do nothing in
425 				 * this case.  Getting here in a truncate
426 				 * operation is a bug.
427 				 */
428 				BUG_ON(truncate_op);
429 			}
430 
431 			unlock_page(page);
432 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
433 		}
434 		++next;
435 		huge_pagevec_release(&pvec);
436 		cond_resched();
437 	}
438 
439 	if (truncate_op)
440 		(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
441 }
442 
443 static void hugetlbfs_evict_inode(struct inode *inode)
444 {
445 	struct resv_map *resv_map;
446 
447 	remove_inode_hugepages(inode, 0, LLONG_MAX);
448 	resv_map = (struct resv_map *)inode->i_mapping->private_data;
449 	/* root inode doesn't have the resv_map, so we should check it */
450 	if (resv_map)
451 		resv_map_release(&resv_map->refs);
452 	clear_inode(inode);
453 }
454 
455 static inline void
456 hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
457 {
458 	struct vm_area_struct *vma;
459 
460 	/*
461 	 * end == 0 indicates that the entire range after
462 	 * start should be unmapped.
463 	 */
464 	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
465 		unsigned long v_offset;
466 		unsigned long v_end;
467 
468 		/*
469 		 * Can the expression below overflow on 32-bit arches?
470 		 * No, because the interval tree returns us only those vmas
471 		 * which overlap the truncated area starting at pgoff,
472  * and no vma on a 32-bit arch can span beyond 4GB.
473 		 */
474 		if (vma->vm_pgoff < start)
475 			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
476 		else
477 			v_offset = 0;
478 
479 		if (!end)
480 			v_end = vma->vm_end;
481 		else {
482 			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
483 							+ vma->vm_start;
484 			if (v_end > vma->vm_end)
485 				v_end = vma->vm_end;
486 		}
487 
488 		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
489 									NULL);
490 	}
491 }
492 
493 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
494 {
495 	pgoff_t pgoff;
496 	struct address_space *mapping = inode->i_mapping;
497 	struct hstate *h = hstate_inode(inode);
498 
499 	BUG_ON(offset & ~huge_page_mask(h));
500 	pgoff = offset >> PAGE_SHIFT;
501 
502 	i_size_write(inode, offset);
503 	i_mmap_lock_write(mapping);
504 	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
505 		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
506 	i_mmap_unlock_write(mapping);
507 	remove_inode_hugepages(inode, offset, LLONG_MAX);
508 	return 0;
509 }
510 
511 static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
512 {
513 	struct hstate *h = hstate_inode(inode);
514 	loff_t hpage_size = huge_page_size(h);
515 	loff_t hole_start, hole_end;
516 
517 	/*
518 	 * For hole punch round up the beginning offset of the hole and
519 	 * round down the end.
520 	 */
521 	hole_start = round_up(offset, hpage_size);
522 	hole_end = round_down(offset + len, hpage_size);
523 
524 	if (hole_end > hole_start) {
525 		struct address_space *mapping = inode->i_mapping;
526 
527 		mutex_lock(&inode->i_mutex);
528 		i_mmap_lock_write(mapping);
529 		if (!RB_EMPTY_ROOT(&mapping->i_mmap))
530 			hugetlb_vmdelete_list(&mapping->i_mmap,
531 						hole_start >> PAGE_SHIFT,
532 						hole_end  >> PAGE_SHIFT);
533 		i_mmap_unlock_write(mapping);
534 		remove_inode_hugepages(inode, hole_start, hole_end);
535 		mutex_unlock(&inode->i_mutex);
536 	}
537 
538 	return 0;
539 }
540 
541 static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
542 				loff_t len)
543 {
544 	struct inode *inode = file_inode(file);
545 	struct address_space *mapping = inode->i_mapping;
546 	struct hstate *h = hstate_inode(inode);
547 	struct vm_area_struct pseudo_vma;
548 	struct mm_struct *mm = current->mm;
549 	loff_t hpage_size = huge_page_size(h);
550 	unsigned long hpage_shift = huge_page_shift(h);
551 	pgoff_t start, index, end;
552 	int error;
553 	u32 hash;
554 
555 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
556 		return -EOPNOTSUPP;
557 
558 	if (mode & FALLOC_FL_PUNCH_HOLE)
559 		return hugetlbfs_punch_hole(inode, offset, len);
560 
561 	/*
562 	 * Default preallocate case.
563 	 * For this range, start is rounded down and end is rounded up
564 	 * as well as being converted to page offsets.
565 	 */
566 	start = offset >> hpage_shift;
567 	end = (offset + len + hpage_size - 1) >> hpage_shift;
568 
569 	mutex_lock(&inode->i_mutex);
570 
571 	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
572 	error = inode_newsize_ok(inode, offset + len);
573 	if (error)
574 		goto out;
575 
576 	/*
577 	 * Initialize a pseudo vma as this is required by the huge page
578 	 * allocation routines.  If NUMA is configured, use page index
579 	 * as input to create an allocation policy.
580 	 */
581 	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
582 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
583 	pseudo_vma.vm_file = file;
584 
585 	for (index = start; index < end; index++) {
586 		/*
587 		 * This is supposed to be the vaddr where the page is being
588 		 * faulted in, but we have no vaddr here.
589 		 */
590 		struct page *page;
591 		unsigned long addr;
592 		int avoid_reserve = 0;
593 
594 		cond_resched();
595 
596 		/*
597 		 * fallocate(2) manpage permits EINTR; we may have been
598 		 * interrupted because we are using up too much memory.
599 		 */
600 		if (signal_pending(current)) {
601 			error = -EINTR;
602 			break;
603 		}
604 
605 		/* Set numa allocation policy based on index */
606 		hugetlb_set_vma_policy(&pseudo_vma, inode, index);
607 
608 		/* addr is the offset within the file (zero based) */
609 		addr = index * hpage_size;
610 
611 		/* mutex taken here, fault path and hole punch */
612 		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
613 						index, addr);
614 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
615 
616 		/* See if already present in mapping to avoid alloc/free */
617 		page = find_get_page(mapping, index);
618 		if (page) {
619 			put_page(page);
620 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
621 			hugetlb_drop_vma_policy(&pseudo_vma);
622 			continue;
623 		}
624 
625 		/* Allocate page and add to page cache */
626 		page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
627 		hugetlb_drop_vma_policy(&pseudo_vma);
628 		if (IS_ERR(page)) {
629 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
630 			error = PTR_ERR(page);
631 			goto out;
632 		}
633 		clear_huge_page(page, addr, pages_per_huge_page(h));
634 		__SetPageUptodate(page);
635 		error = huge_add_to_page_cache(page, mapping, index);
636 		if (unlikely(error)) {
637 			put_page(page);
638 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
639 			goto out;
640 		}
641 
642 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
643 
644 		/*
645 		 * put_page due to reference from alloc_huge_page()
646 		 * unlock_page because locked by add_to_page_cache()
647 		 */
648 		put_page(page);
649 		unlock_page(page);
650 	}
651 
652 	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
653 		i_size_write(inode, offset + len);
654 	inode->i_ctime = CURRENT_TIME;
655 out:
656 	mutex_unlock(&inode->i_mutex);
657 	return error;
658 }
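
/*
 * Userspace sketch (illustrative, assuming fd refers to an open hugetlbfs
 * file): the two fallocate(2) modes handled above are reached with
 *
 *	fallocate(fd, 0, offset, len);			preallocate pages
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE |
 *		      FALLOC_FL_KEEP_SIZE, offset, len);	punch a hole
 *
 * Offsets and lengths need not be huge page aligned: the preallocation
 * range is rounded outward and the punched hole is rounded inward to huge
 * page boundaries, as done above and in hugetlbfs_punch_hole().  Any other
 * mode bits are rejected with -EOPNOTSUPP.
 */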
659 
660 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
661 {
662 	struct inode *inode = d_inode(dentry);
663 	struct hstate *h = hstate_inode(inode);
664 	int error;
665 	unsigned int ia_valid = attr->ia_valid;
666 
667 	BUG_ON(!inode);
668 
669 	error = inode_change_ok(inode, attr);
670 	if (error)
671 		return error;
672 
673 	if (ia_valid & ATTR_SIZE) {
674 		error = -EINVAL;
675 		if (attr->ia_size & ~huge_page_mask(h))
676 			return -EINVAL;
677 		error = hugetlb_vmtruncate(inode, attr->ia_size);
678 		if (error)
679 			return error;
680 	}
681 
682 	setattr_copy(inode, attr);
683 	mark_inode_dirty(inode);
684 	return 0;
685 }
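
/*
 * Illustrative consequence of the ATTR_SIZE check above: ftruncate(fd, len)
 * on a hugetlbfs file returns -EINVAL unless len is a multiple of the huge
 * page size.
 */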
686 
687 static struct inode *hugetlbfs_get_root(struct super_block *sb,
688 					struct hugetlbfs_config *config)
689 {
690 	struct inode *inode;
691 
692 	inode = new_inode(sb);
693 	if (inode) {
694 		struct hugetlbfs_inode_info *info;
695 		inode->i_ino = get_next_ino();
696 		inode->i_mode = S_IFDIR | config->mode;
697 		inode->i_uid = config->uid;
698 		inode->i_gid = config->gid;
699 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
700 		info = HUGETLBFS_I(inode);
701 		mpol_shared_policy_init(&info->policy, NULL);
702 		inode->i_op = &hugetlbfs_dir_inode_operations;
703 		inode->i_fop = &simple_dir_operations;
704 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
705 		inc_nlink(inode);
706 		lockdep_annotate_inode_mutex_key(inode);
707 	}
708 	return inode;
709 }
710 
711 /*
712  * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
713  * be taken from reclaim -- unlike regular filesystems. This needs an
714  * annotation because huge_pmd_share() does an allocation under
715  * i_mmap_rwsem.
716  */
717 static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
718 
719 static struct inode *hugetlbfs_get_inode(struct super_block *sb,
720 					struct inode *dir,
721 					umode_t mode, dev_t dev)
722 {
723 	struct inode *inode;
724 	struct resv_map *resv_map;
725 
726 	resv_map = resv_map_alloc();
727 	if (!resv_map)
728 		return NULL;
729 
730 	inode = new_inode(sb);
731 	if (inode) {
732 		struct hugetlbfs_inode_info *info;
733 		inode->i_ino = get_next_ino();
734 		inode_init_owner(inode, dir, mode);
735 		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
736 				&hugetlbfs_i_mmap_rwsem_key);
737 		inode->i_mapping->a_ops = &hugetlbfs_aops;
738 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
739 		inode->i_mapping->private_data = resv_map;
740 		info = HUGETLBFS_I(inode);
741 		/*
742 		 * The policy is initialized here even if we are creating a
743 		 * private inode because initialization simply creates an
744 		 * empty rb tree and calls spin_lock_init(), later when we
745 		 * call mpol_free_shared_policy() it will just return because
746 		 * the rb tree will still be empty.
747 		 */
748 		mpol_shared_policy_init(&info->policy, NULL);
749 		switch (mode & S_IFMT) {
750 		default:
751 			init_special_inode(inode, mode, dev);
752 			break;
753 		case S_IFREG:
754 			inode->i_op = &hugetlbfs_inode_operations;
755 			inode->i_fop = &hugetlbfs_file_operations;
756 			break;
757 		case S_IFDIR:
758 			inode->i_op = &hugetlbfs_dir_inode_operations;
759 			inode->i_fop = &simple_dir_operations;
760 
761 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
762 			inc_nlink(inode);
763 			break;
764 		case S_IFLNK:
765 			inode->i_op = &page_symlink_inode_operations;
766 			break;
767 		}
768 		lockdep_annotate_inode_mutex_key(inode);
769 	} else
770 		kref_put(&resv_map->refs, resv_map_release);
771 
772 	return inode;
773 }
774 
775 /*
776  * File creation. Allocate an inode, and we're done.
777  */
778 static int hugetlbfs_mknod(struct inode *dir,
779 			struct dentry *dentry, umode_t mode, dev_t dev)
780 {
781 	struct inode *inode;
782 	int error = -ENOSPC;
783 
784 	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
785 	if (inode) {
786 		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
787 		d_instantiate(dentry, inode);
788 		dget(dentry);	/* Extra count - pin the dentry in core */
789 		error = 0;
790 	}
791 	return error;
792 }
793 
794 static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
795 {
796 	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
797 	if (!retval)
798 		inc_nlink(dir);
799 	return retval;
800 }
801 
802 static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
803 {
804 	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
805 }
806 
807 static int hugetlbfs_symlink(struct inode *dir,
808 			struct dentry *dentry, const char *symname)
809 {
810 	struct inode *inode;
811 	int error = -ENOSPC;
812 
813 	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
814 	if (inode) {
815 		int l = strlen(symname)+1;
816 		error = page_symlink(inode, symname, l);
817 		if (!error) {
818 			d_instantiate(dentry, inode);
819 			dget(dentry);
820 		} else
821 			iput(inode);
822 	}
823 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
824 
825 	return error;
826 }
827 
828 /*
829  * mark the head page dirty
830  */
831 static int hugetlbfs_set_page_dirty(struct page *page)
832 {
833 	struct page *head = compound_head(page);
834 
835 	SetPageDirty(head);
836 	return 0;
837 }
838 
839 static int hugetlbfs_migrate_page(struct address_space *mapping,
840 				struct page *newpage, struct page *page,
841 				enum migrate_mode mode)
842 {
843 	int rc;
844 
845 	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
846 	if (rc != MIGRATEPAGE_SUCCESS)
847 		return rc;
848 	migrate_page_copy(newpage, page);
849 
850 	return MIGRATEPAGE_SUCCESS;
851 }
852 
853 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
854 {
855 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
856 	struct hstate *h = hstate_inode(d_inode(dentry));
857 
858 	buf->f_type = HUGETLBFS_MAGIC;
859 	buf->f_bsize = huge_page_size(h);
860 	if (sbinfo) {
861 		spin_lock(&sbinfo->stat_lock);
862 		/* If no limits set, just report 0 for max/free/used
863 		 * blocks, like simple_statfs() */
864 		if (sbinfo->spool) {
865 			long free_pages;
866 
867 			spin_lock(&sbinfo->spool->lock);
868 			buf->f_blocks = sbinfo->spool->max_hpages;
869 			free_pages = sbinfo->spool->max_hpages
870 				- sbinfo->spool->used_hpages;
871 			buf->f_bavail = buf->f_bfree = free_pages;
872 			spin_unlock(&sbinfo->spool->lock);
873 			buf->f_files = sbinfo->max_inodes;
874 			buf->f_ffree = sbinfo->free_inodes;
875 		}
876 		spin_unlock(&sbinfo->stat_lock);
877 	}
878 	buf->f_namelen = NAME_MAX;
879 	return 0;
880 }
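
/*
 * Illustrative: statfs(2)/df on a hugetlbfs mount reports f_bsize equal to
 * the huge page size; block and inode counts stay zero unless a size= or
 * min_size= limit was given at mount time, since the counts above are only
 * filled in when sbinfo->spool exists.
 */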
881 
882 static void hugetlbfs_put_super(struct super_block *sb)
883 {
884 	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
885 
886 	if (sbi) {
887 		sb->s_fs_info = NULL;
888 
889 		if (sbi->spool)
890 			hugepage_put_subpool(sbi->spool);
891 
892 		kfree(sbi);
893 	}
894 }
895 
896 static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
897 {
898 	if (sbinfo->free_inodes >= 0) {
899 		spin_lock(&sbinfo->stat_lock);
900 		if (unlikely(!sbinfo->free_inodes)) {
901 			spin_unlock(&sbinfo->stat_lock);
902 			return 0;
903 		}
904 		sbinfo->free_inodes--;
905 		spin_unlock(&sbinfo->stat_lock);
906 	}
907 
908 	return 1;
909 }
910 
911 static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
912 {
913 	if (sbinfo->free_inodes >= 0) {
914 		spin_lock(&sbinfo->stat_lock);
915 		sbinfo->free_inodes++;
916 		spin_unlock(&sbinfo->stat_lock);
917 	}
918 }
919 
920 
921 static struct kmem_cache *hugetlbfs_inode_cachep;
922 
923 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
924 {
925 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
926 	struct hugetlbfs_inode_info *p;
927 
928 	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
929 		return NULL;
930 	p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
931 	if (unlikely(!p)) {
932 		hugetlbfs_inc_free_inodes(sbinfo);
933 		return NULL;
934 	}
935 	return &p->vfs_inode;
936 }
937 
938 static void hugetlbfs_i_callback(struct rcu_head *head)
939 {
940 	struct inode *inode = container_of(head, struct inode, i_rcu);
941 	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
942 }
943 
944 static void hugetlbfs_destroy_inode(struct inode *inode)
945 {
946 	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
947 	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
948 	call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
949 }
950 
951 static const struct address_space_operations hugetlbfs_aops = {
952 	.write_begin	= hugetlbfs_write_begin,
953 	.write_end	= hugetlbfs_write_end,
954 	.set_page_dirty	= hugetlbfs_set_page_dirty,
955 	.migratepage    = hugetlbfs_migrate_page,
956 };
957 
958 
959 static void init_once(void *foo)
960 {
961 	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
962 
963 	inode_init_once(&ei->vfs_inode);
964 }
965 
966 const struct file_operations hugetlbfs_file_operations = {
967 	.read_iter		= hugetlbfs_read_iter,
968 	.mmap			= hugetlbfs_file_mmap,
969 	.fsync			= noop_fsync,
970 	.get_unmapped_area	= hugetlb_get_unmapped_area,
971 	.llseek			= default_llseek,
972 	.fallocate		= hugetlbfs_fallocate,
973 };
974 
975 static const struct inode_operations hugetlbfs_dir_inode_operations = {
976 	.create		= hugetlbfs_create,
977 	.lookup		= simple_lookup,
978 	.link		= simple_link,
979 	.unlink		= simple_unlink,
980 	.symlink	= hugetlbfs_symlink,
981 	.mkdir		= hugetlbfs_mkdir,
982 	.rmdir		= simple_rmdir,
983 	.mknod		= hugetlbfs_mknod,
984 	.rename		= simple_rename,
985 	.setattr	= hugetlbfs_setattr,
986 };
987 
988 static const struct inode_operations hugetlbfs_inode_operations = {
989 	.setattr	= hugetlbfs_setattr,
990 };
991 
992 static const struct super_operations hugetlbfs_ops = {
993 	.alloc_inode    = hugetlbfs_alloc_inode,
994 	.destroy_inode  = hugetlbfs_destroy_inode,
995 	.evict_inode	= hugetlbfs_evict_inode,
996 	.statfs		= hugetlbfs_statfs,
997 	.put_super	= hugetlbfs_put_super,
998 	.show_options	= generic_show_options,
999 };
1000 
1001 enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };
1002 
1003 /*
1004  * Convert size option passed from command line to number of huge pages
1005  * in the pool specified by hstate.  Size option could be in bytes
1006  * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
1007  */
1008 static long long
1009 hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
1010 								int val_type)
1011 {
1012 	if (val_type == NO_SIZE)
1013 		return -1;
1014 
1015 	if (val_type == SIZE_PERCENT) {
1016 		size_opt <<= huge_page_shift(h);
1017 		size_opt *= h->max_huge_pages;
1018 		do_div(size_opt, 100);
1019 	}
1020 
1021 	size_opt >>= huge_page_shift(h);
1022 	return size_opt;
1023 }
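
/*
 * Worked example (illustrative numbers): with 2 MB huge pages
 * (huge_page_shift == 21), size_opt = 1G and val_type == SIZE_STD yields
 * 1G >> 21 = 512 pages.  With a pool of 1024 huge pages, size_opt = 25 and
 * val_type == SIZE_PERCENT yields ((25 << 21) * 1024 / 100) >> 21 = 256
 * pages.
 */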
1024 
1025 static int
1026 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
1027 {
1028 	char *p, *rest;
1029 	substring_t args[MAX_OPT_ARGS];
1030 	int option;
1031 	unsigned long long max_size_opt = 0, min_size_opt = 0;
1032 	int max_val_type = NO_SIZE, min_val_type = NO_SIZE;
1033 
1034 	if (!options)
1035 		return 0;
1036 
1037 	while ((p = strsep(&options, ",")) != NULL) {
1038 		int token;
1039 		if (!*p)
1040 			continue;
1041 
1042 		token = match_token(p, tokens, args);
1043 		switch (token) {
1044 		case Opt_uid:
1045 			if (match_int(&args[0], &option))
1046  				goto bad_val;
1047 			pconfig->uid = make_kuid(current_user_ns(), option);
1048 			if (!uid_valid(pconfig->uid))
1049 				goto bad_val;
1050 			break;
1051 
1052 		case Opt_gid:
1053 			if (match_int(&args[0], &option))
1054  				goto bad_val;
1055 			pconfig->gid = make_kgid(current_user_ns(), option);
1056 			if (!gid_valid(pconfig->gid))
1057 				goto bad_val;
1058 			break;
1059 
1060 		case Opt_mode:
1061 			if (match_octal(&args[0], &option))
1062  				goto bad_val;
1063 			pconfig->mode = option & 01777U;
1064 			break;
1065 
1066 		case Opt_size: {
1067 			/* memparse() will accept a K/M/G without a digit */
1068 			if (!isdigit(*args[0].from))
1069 				goto bad_val;
1070 			max_size_opt = memparse(args[0].from, &rest);
1071 			max_val_type = SIZE_STD;
1072 			if (*rest == '%')
1073 				max_val_type = SIZE_PERCENT;
1074 			break;
1075 		}
1076 
1077 		case Opt_nr_inodes:
1078 			/* memparse() will accept a K/M/G without a digit */
1079 			if (!isdigit(*args[0].from))
1080 				goto bad_val;
1081 			pconfig->nr_inodes = memparse(args[0].from, &rest);
1082 			break;
1083 
1084 		case Opt_pagesize: {
1085 			unsigned long ps;
1086 			ps = memparse(args[0].from, &rest);
1087 			pconfig->hstate = size_to_hstate(ps);
1088 			if (!pconfig->hstate) {
1089 				pr_err("Unsupported page size %lu MB\n",
1090 					ps >> 20);
1091 				return -EINVAL;
1092 			}
1093 			break;
1094 		}
1095 
1096 		case Opt_min_size: {
1097 			/* memparse() will accept a K/M/G without a digit */
1098 			if (!isdigit(*args[0].from))
1099 				goto bad_val;
1100 			min_size_opt = memparse(args[0].from, &rest);
1101 			min_val_type = SIZE_STD;
1102 			if (*rest == '%')
1103 				min_val_type = SIZE_PERCENT;
1104 			break;
1105 		}
1106 
1107 		default:
1108 			pr_err("Bad mount option: \"%s\"\n", p);
1109 			return -EINVAL;
1110 			break;
1111 		}
1112 	}
1113 
1114 	/*
1115 	 * Use huge page pool size (in hstate) to convert the size
1116 	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
1117 	 */
1118 	pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
1119 						max_size_opt, max_val_type);
1120 	pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
1121 						min_size_opt, min_val_type);
1122 
1123 	/*
1124 	 * If max_size was specified, then min_size must not be larger
1125 	 */
1126 	if (max_val_type > NO_SIZE &&
1127 	    pconfig->min_hpages > pconfig->max_hpages) {
1128 		pr_err("minimum size can not be greater than maximum size\n");
1129 		return -EINVAL;
1130 	}
1131 
1132 	return 0;
1133 
1134 bad_val:
1135 	pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
1136  	return -EINVAL;
1137 }
1138 
1139 static int
1140 hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
1141 {
1142 	int ret;
1143 	struct hugetlbfs_config config;
1144 	struct hugetlbfs_sb_info *sbinfo;
1145 
1146 	save_mount_options(sb, data);
1147 
1148 	config.max_hpages = -1; /* No limit on size by default */
1149 	config.nr_inodes = -1; /* No limit on number of inodes by default */
1150 	config.uid = current_fsuid();
1151 	config.gid = current_fsgid();
1152 	config.mode = 0755;
1153 	config.hstate = &default_hstate;
1154 	config.min_hpages = -1; /* No default minimum size */
1155 	ret = hugetlbfs_parse_options(data, &config);
1156 	if (ret)
1157 		return ret;
1158 
1159 	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
1160 	if (!sbinfo)
1161 		return -ENOMEM;
1162 	sb->s_fs_info = sbinfo;
1163 	sbinfo->hstate = config.hstate;
1164 	spin_lock_init(&sbinfo->stat_lock);
1165 	sbinfo->max_inodes = config.nr_inodes;
1166 	sbinfo->free_inodes = config.nr_inodes;
1167 	sbinfo->spool = NULL;
1168 	/*
1169 	 * Allocate and initialize subpool if maximum or minimum size is
1170 	 * specified.  Any needed reservations (for minimum size) are taken
1171 	 * when the subpool is created.
1172 	 */
1173 	if (config.max_hpages != -1 || config.min_hpages != -1) {
1174 		sbinfo->spool = hugepage_new_subpool(config.hstate,
1175 							config.max_hpages,
1176 							config.min_hpages);
1177 		if (!sbinfo->spool)
1178 			goto out_free;
1179 	}
1180 	sb->s_maxbytes = MAX_LFS_FILESIZE;
1181 	sb->s_blocksize = huge_page_size(config.hstate);
1182 	sb->s_blocksize_bits = huge_page_shift(config.hstate);
1183 	sb->s_magic = HUGETLBFS_MAGIC;
1184 	sb->s_op = &hugetlbfs_ops;
1185 	sb->s_time_gran = 1;
1186 	sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
1187 	if (!sb->s_root)
1188 		goto out_free;
1189 	return 0;
1190 out_free:
1191 	kfree(sbinfo->spool);
1192 	kfree(sbinfo);
1193 	return -ENOMEM;
1194 }
1195 
1196 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
1197 	int flags, const char *dev_name, void *data)
1198 {
1199 	return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
1200 }
1201 
1202 static struct file_system_type hugetlbfs_fs_type = {
1203 	.name		= "hugetlbfs",
1204 	.mount		= hugetlbfs_mount,
1205 	.kill_sb	= kill_litter_super,
1206 };
1207 MODULE_ALIAS_FS("hugetlbfs");
1208 
1209 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
1210 
1211 static int can_do_hugetlb_shm(void)
1212 {
1213 	kgid_t shm_group;
1214 	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
1215 	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
1216 }
1217 
1218 static int get_hstate_idx(int page_size_log)
1219 {
1220 	struct hstate *h = hstate_sizelog(page_size_log);
1221 
1222 	if (!h)
1223 		return -1;
1224 	return h - hstates;
1225 }
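
/*
 * For example (illustrative): page_size_log == 0 selects the default
 * hstate, while page_size_log == 21 selects the 2 MB hstate where such a
 * pool exists (see hstate_sizelog()); -1 is returned when no matching
 * hstate is found.
 */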
1226 
1227 static const struct dentry_operations anon_ops = {
1228 	.d_dname = simple_dname
1229 };
1230 
1231 /*
1232  * Note that size should be aligned to the proper hugepage size by the caller,
1233  * otherwise hugetlb_reserve_pages() reserves one fewer huge page than intended.
1234  */
1235 struct file *hugetlb_file_setup(const char *name, size_t size,
1236 				vm_flags_t acctflag, struct user_struct **user,
1237 				int creat_flags, int page_size_log)
1238 {
1239 	struct file *file = ERR_PTR(-ENOMEM);
1240 	struct inode *inode;
1241 	struct path path;
1242 	struct super_block *sb;
1243 	struct qstr quick_string;
1244 	int hstate_idx;
1245 
1246 	hstate_idx = get_hstate_idx(page_size_log);
1247 	if (hstate_idx < 0)
1248 		return ERR_PTR(-ENODEV);
1249 
1250 	*user = NULL;
1251 	if (!hugetlbfs_vfsmount[hstate_idx])
1252 		return ERR_PTR(-ENOENT);
1253 
1254 	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
1255 		*user = current_user();
1256 		if (user_shm_lock(size, *user)) {
1257 			task_lock(current);
1258 			pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
1259 				current->comm, current->pid);
1260 			task_unlock(current);
1261 		} else {
1262 			*user = NULL;
1263 			return ERR_PTR(-EPERM);
1264 		}
1265 	}
1266 
1267 	sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
1268 	quick_string.name = name;
1269 	quick_string.len = strlen(quick_string.name);
1270 	quick_string.hash = 0;
1271 	path.dentry = d_alloc_pseudo(sb, &quick_string);
1272 	if (!path.dentry)
1273 		goto out_shm_unlock;
1274 
1275 	d_set_d_op(path.dentry, &anon_ops);
1276 	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
1277 	file = ERR_PTR(-ENOSPC);
1278 	inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
1279 	if (!inode)
1280 		goto out_dentry;
1281 	if (creat_flags == HUGETLB_SHMFS_INODE)
1282 		inode->i_flags |= S_PRIVATE;
1283 
1284 	file = ERR_PTR(-ENOMEM);
1285 	if (hugetlb_reserve_pages(inode, 0,
1286 			size >> huge_page_shift(hstate_inode(inode)), NULL,
1287 			acctflag))
1288 		goto out_inode;
1289 
1290 	d_instantiate(path.dentry, inode);
1291 	inode->i_size = size;
1292 	clear_nlink(inode);
1293 
1294 	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
1295 			&hugetlbfs_file_operations);
1296 	if (IS_ERR(file))
1297 		goto out_dentry; /* inode is already attached */
1298 
1299 	return file;
1300 
1301 out_inode:
1302 	iput(inode);
1303 out_dentry:
1304 	path_put(&path);
1305 out_shm_unlock:
1306 	if (*user) {
1307 		user_shm_unlock(size, *user);
1308 		*user = NULL;
1309 	}
1310 	return file;
1311 }
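
/*
 * Typical caller sketch (illustrative; the real call sites live outside
 * this file): mmap(MAP_HUGETLB) and shmget(SHM_HUGETLB) obtain their
 * backing file through hugetlb_file_setup(), along the lines of
 *
 *	file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
 *				  &user, HUGETLB_ANONHUGE_INODE,
 *				  (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
 */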
1312 
1313 static int __init init_hugetlbfs_fs(void)
1314 {
1315 	struct hstate *h;
1316 	int error;
1317 	int i;
1318 
1319 	if (!hugepages_supported()) {
1320 		pr_info("disabling because there are no supported hugepage sizes\n");
1321 		return -ENOTSUPP;
1322 	}
1323 
1324 	error = -ENOMEM;
1325 	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1326 					sizeof(struct hugetlbfs_inode_info),
1327 					0, 0, init_once);
1328 	if (hugetlbfs_inode_cachep == NULL)
1329 		goto out2;
1330 
1331 	error = register_filesystem(&hugetlbfs_fs_type);
1332 	if (error)
1333 		goto out;
1334 
1335 	i = 0;
1336 	for_each_hstate(h) {
1337 		char buf[50];
1338 		unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
1339 
1340 		snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
1341 		hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
1342 							buf);
1343 
1344 		if (IS_ERR(hugetlbfs_vfsmount[i])) {
1345 			pr_err("Cannot mount internal hugetlbfs for "
1346 				"page size %uK", ps_kb);
1347 			error = PTR_ERR(hugetlbfs_vfsmount[i]);
1348 			hugetlbfs_vfsmount[i] = NULL;
1349 		}
1350 		i++;
1351 	}
1352 	/* Non default hstates are optional */
1353 	if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
1354 		return 0;
1355 
1356  out:
1357 	kmem_cache_destroy(hugetlbfs_inode_cachep);
1358  out2:
1359 	return error;
1360 }
1361 
1362 static void __exit exit_hugetlbfs_fs(void)
1363 {
1364 	struct hstate *h;
1365 	int i;
1366 
1367 
1368 	/*
1369 	 * Make sure all delayed rcu free inodes are flushed before we
1370 	 * destroy cache.
1371 	 */
1372 	rcu_barrier();
1373 	kmem_cache_destroy(hugetlbfs_inode_cachep);
1374 	i = 0;
1375 	for_each_hstate(h)
1376 		kern_unmount(hugetlbfs_vfsmount[i++]);
1377 	unregister_filesystem(&hugetlbfs_fs_type);
1378 }
1379 
1380 module_init(init_hugetlbfs_fs)
1381 module_exit(exit_hugetlbfs_fs)
1382 
1383 MODULE_LICENSE("GPL");
1384