Blame - marvell/linux/fs/hugetlbfs/inode.c - T108

blob: 47b292f9b4f805572c8a896dcd362ad016f988fa [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	/*
				2	* hugetlbpage-backed filesystem. Based on ramfs.
				3	*
				4	* Nadia Yvette Chambers, 2002
				5	*
				6	* Copyright (C) 2002 Linus Torvalds.
				7	* License: GPL
				8	*/
				9
				10	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				11
				12	#include <linux/thread_info.h>
				13	#include <asm/current.h>
				14	#include <linux/sched/signal.h> /* remove ASAP */
				15	#include <linux/falloc.h>
				16	#include <linux/fs.h>
				17	#include <linux/mount.h>
				18	#include <linux/file.h>
				19	#include <linux/kernel.h>
				20	#include <linux/writeback.h>
				21	#include <linux/pagemap.h>
				22	#include <linux/highmem.h>
				23	#include <linux/init.h>
				24	#include <linux/string.h>
				25	#include <linux/capability.h>
				26	#include <linux/ctype.h>
				27	#include <linux/backing-dev.h>
				28	#include <linux/hugetlb.h>
				29	#include <linux/pagevec.h>
				30	#include <linux/fs_parser.h>
				31	#include <linux/mman.h>
				32	#include <linux/slab.h>
				33	#include <linux/dnotify.h>
				34	#include <linux/statfs.h>
				35	#include <linux/security.h>
				36	#include <linux/magic.h>
				37	#include <linux/migrate.h>
				38	#include <linux/uio.h>
				39
				40	#include <linux/uaccess.h>
				41	#include <linux/sched/mm.h>
				42
				43	static const struct super_operations hugetlbfs_ops;
				44	static const struct address_space_operations hugetlbfs_aops;
				45	const struct file_operations hugetlbfs_file_operations;
				46	static const struct inode_operations hugetlbfs_dir_inode_operations;
				47	static const struct inode_operations hugetlbfs_inode_operations;
				48
				49	enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
				50
/*
 * Per-mount configuration for hugetlbfs.  hugetlbfs_get_root() reads uid,
 * gid and mode from this structure when it builds the root inode.
 *
 * NOTE(review): the remaining fields are presumably filled in from the
 * mount options listed in hugetlb_param_specs[]; the parser is not visible
 * here, so confirm before relying on the per-field notes below.
 */
struct hugetlbfs_fs_context {
	struct hstate *hstate;		/* huge page size state for the mount */
	unsigned long long max_size_opt; /* presumably raw "size" option value */
	unsigned long long min_size_opt; /* presumably raw "min_size" option value */
	long max_hpages;
	long nr_inodes;
	long min_hpages;
	enum hugetlbfs_size_type max_val_type; /* how max_size_opt is expressed */
	enum hugetlbfs_size_type min_val_type; /* how min_size_opt is expressed */
	kuid_t uid;			/* owner of the root inode */
	kgid_t gid;			/* group of the root inode */
	umode_t mode;			/* permission bits of the root inode */
};
				64
				65	int sysctl_hugetlb_shm_group;
				66
/* Identifiers for the mount options listed in hugetlb_param_specs[]. */
enum hugetlb_param {
	Opt_gid,
	Opt_min_size,
	Opt_mode,
	Opt_nr_inodes,
	Opt_pagesize,
	Opt_size,
	Opt_uid,
};
				76
/*
 * Mount option table: each entry binds an option name and its value kind
 * (u32, octal u32 or string) to an enum hugetlb_param identifier.  The
 * empty entry terminates the table.
 */
static const struct fs_parameter_spec hugetlb_param_specs[] = {
	fsparam_u32 ("gid", Opt_gid),
	fsparam_string("min_size", Opt_min_size),
	fsparam_u32oct("mode", Opt_mode),
	fsparam_string("nr_inodes", Opt_nr_inodes),
	fsparam_string("pagesize", Opt_pagesize),
	fsparam_string("size", Opt_size),
	fsparam_u32 ("uid", Opt_uid),
	{}
};
				87
/* Parameter description: the filesystem name plus its option table. */
static const struct fs_parameter_description hugetlb_fs_parameters = {
	.name = "hugetlbfs",
	.specs = hugetlb_param_specs,
};
				92
#ifdef CONFIG_NUMA
/*
 * Look up the shared memory policy stored in @inode for page @index and
 * install it as @vma's policy.
 */
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
		struct inode *inode, pgoff_t index)
{
	vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
							index);
}

/* Release the policy installed by hugetlb_set_vma_policy(). */
static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
{
	mpol_cond_put(vma->vm_policy);
}
#else
/* Without CONFIG_NUMA there is no policy to set or drop: both are no-ops. */
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
		struct inode *inode, pgoff_t index)
{
}

static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
{
}
#endif
				115
				116	static void huge_pagevec_release(struct pagevec *pvec)
				117	{
				118	int i;
				119
				120	for (i = 0; i < pagevec_count(pvec); ++i)
				121	put_page(pvec->pages[i]);
				122
				123	pagevec_reinit(pvec);
				124	}
				125
				126	/*
				127	* Mask used when checking the page offset value passed in via system
				128	* calls. This value will be converted to a loff_t which is signed.
				129	* Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
				130	* value. The extra bit (- 1 in the shift value) is to take the sign
				131	* bit into account.
				132	*/
				133	#define PGOFF_LOFFT_MAX \
				134	(((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))
				135
				136	static int hugetlbfs_file_mmap(struct file file, struct vm_area_struct vma)
				137	{
				138	struct inode *inode = file_inode(file);
				139	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
				140	loff_t len, vma_len;
				141	int ret;
				142	struct hstate *h = hstate_file(file);
				143
				144	/*
				145	* vma address alignment (but not the pgoff alignment) has
				146	* already been checked by prepare_hugepage_range. If you add
				147	* any error returns here, do so after setting VM_HUGETLB, so
				148	* is_vm_hugetlb_page tests below unmap_region go the right
				149	* way when do_mmap_pgoff unwinds (may be important on powerpc
				150	* and ia64).
				151	*/
				152	vma->vm_flags \|= VM_HUGETLB \| VM_DONTEXPAND;
				153	vma->vm_ops = &hugetlb_vm_ops;
				154
				155	ret = seal_check_future_write(info->seals, vma);
				156	if (ret)
				157	return ret;
				158
				159	/*
				160	* page based offset in vm_pgoff could be sufficiently large to
				161	* overflow a loff_t when converted to byte offset. This can
				162	* only happen on architectures where sizeof(loff_t) ==
				163	* sizeof(unsigned long). So, only check in those instances.
				164	*/
				165	if (sizeof(unsigned long) == sizeof(loff_t)) {
				166	if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
				167	return -EINVAL;
				168	}
				169
				170	/* must be huge page aligned */
				171	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
				172	return -EINVAL;
				173
				174	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
				175	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
				176	/* check for overflow */
				177	if (len < vma_len)
				178	return -EINVAL;
				179
				180	inode_lock(inode);
				181	file_accessed(file);
				182
				183	ret = -ENOMEM;
				184	if (hugetlb_reserve_pages(inode,
				185	vma->vm_pgoff >> huge_page_order(h),
				186	len >> huge_page_shift(h), vma,
				187	vma->vm_flags))
				188	goto out;
				189
				190	ret = 0;
				191	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
				192	i_size_write(inode, len);
				193	out:
				194	inode_unlock(inode);
				195
				196	return ret;
				197	}
				198
				199	/*
				200	* Called under down_write(mmap_sem).
				201	*/
				202
				203	#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
				204	static unsigned long
				205	hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
				206	unsigned long len, unsigned long pgoff, unsigned long flags)
				207	{
				208	struct hstate *h = hstate_file(file);
				209	struct vm_unmapped_area_info info;
				210
				211	info.flags = 0;
				212	info.length = len;
				213	info.low_limit = current->mm->mmap_base;
				214	info.high_limit = arch_get_mmap_end(addr);
				215	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
				216	info.align_offset = 0;
				217	return vm_unmapped_area(&info);
				218	}
				219
				220	static unsigned long
				221	hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
				222	unsigned long len, unsigned long pgoff, unsigned long flags)
				223	{
				224	struct hstate *h = hstate_file(file);
				225	struct vm_unmapped_area_info info;
				226
				227	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
				228	info.length = len;
				229	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
				230	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
				231	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
				232	info.align_offset = 0;
				233	addr = vm_unmapped_area(&info);
				234
				235	/*
				236	* A failed mmap() very likely causes application failure,
				237	* so fall back to the bottom-up function here. This scenario
				238	* can happen with large stack limits and large mmap()
				239	* allocations.
				240	*/
				241	if (unlikely(offset_in_page(addr))) {
				242	VM_BUG_ON(addr != -ENOMEM);
				243	info.flags = 0;
				244	info.low_limit = current->mm->mmap_base;
				245	info.high_limit = arch_get_mmap_end(addr);
				246	addr = vm_unmapped_area(&info);
				247	}
				248
				249	return addr;
				250	}
				251
				252	static unsigned long
				253	hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
				254	unsigned long len, unsigned long pgoff, unsigned long flags)
				255	{
				256	struct mm_struct *mm = current->mm;
				257	struct vm_area_struct *vma;
				258	struct hstate *h = hstate_file(file);
				259	const unsigned long mmap_end = arch_get_mmap_end(addr);
				260
				261	if (len & ~huge_page_mask(h))
				262	return -EINVAL;
				263	if (len > TASK_SIZE)
				264	return -ENOMEM;
				265
				266	if (flags & MAP_FIXED) {
				267	if (prepare_hugepage_range(file, addr, len))
				268	return -EINVAL;
				269	return addr;
				270	}
				271
				272	if (addr) {
				273	addr = ALIGN(addr, huge_page_size(h));
				274	vma = find_vma(mm, addr);
				275	if (mmap_end - len >= addr &&
				276	(!vma \|\| addr + len <= vm_start_gap(vma)))
				277	return addr;
				278	}
				279
				280	/*
				281	* Use mm->get_unmapped_area value as a hint to use topdown routine.
				282	* If architectures have special needs, they should define their own
				283	* version of hugetlb_get_unmapped_area.
				284	*/
				285	if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
				286	return hugetlb_get_unmapped_area_topdown(file, addr, len,
				287	pgoff, flags);
				288	return hugetlb_get_unmapped_area_bottomup(file, addr, len,
				289	pgoff, flags);
				290	}
				291	#endif
				292
				293	static size_t
				294	hugetlbfs_read_actor(struct page *page, unsigned long offset,
				295	struct iov_iter *to, unsigned long size)
				296	{
				297	size_t copied = 0;
				298	int i, chunksize;
				299
				300	/* Find which 4k chunk and offset with in that chunk */
				301	i = offset >> PAGE_SHIFT;
				302	offset = offset & ~PAGE_MASK;
				303
				304	while (size) {
				305	size_t n;
				306	chunksize = PAGE_SIZE;
				307	if (offset)
				308	chunksize -= offset;
				309	if (chunksize > size)
				310	chunksize = size;
				311	n = copy_page_to_iter(&page[i], offset, chunksize, to);
				312	copied += n;
				313	if (n != chunksize)
				314	return copied;
				315	offset = 0;
				316	size -= chunksize;
				317	i++;
				318	}
				319	return copied;
				320	}
				321
				322	/*
				323	* Support for read() - Find the page attached to f_mapping and copy out the
				324	* data. Its very similar to do_generic_mapping_read(), we can't use that
				325	* since it has PAGE_SIZE assumptions.
				326	*/
				327	static ssize_t hugetlbfs_read_iter(struct kiocb iocb, struct iov_iter to)
				328	{
				329	struct file *file = iocb->ki_filp;
				330	struct hstate *h = hstate_file(file);
				331	struct address_space *mapping = file->f_mapping;
				332	struct inode *inode = mapping->host;
				333	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
				334	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
				335	unsigned long end_index;
				336	loff_t isize;
				337	ssize_t retval = 0;
				338
				339	while (iov_iter_count(to)) {
				340	struct page *page;
				341	size_t nr, copied;
				342
				343	/* nr is the maximum number of bytes to copy from this page */
				344	nr = huge_page_size(h);
				345	isize = i_size_read(inode);
				346	if (!isize)
				347	break;
				348	end_index = (isize - 1) >> huge_page_shift(h);
				349	if (index > end_index)
				350	break;
				351	if (index == end_index) {
				352	nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
				353	if (nr <= offset)
				354	break;
				355	}
				356	nr = nr - offset;
				357
				358	/* Find the page */
				359	page = find_lock_page(mapping, index);
				360	if (unlikely(page == NULL)) {
				361	/*
				362	* We have a HOLE, zero out the user-buffer for the
				363	* length of the hole or request.
				364	*/
				365	copied = iov_iter_zero(nr, to);
				366	} else {
				367	unlock_page(page);
				368
				369	/*
				370	* We have the page, copy it to user space buffer.
				371	*/
				372	copied = hugetlbfs_read_actor(page, offset, to, nr);
				373	put_page(page);
				374	}
				375	offset += copied;
				376	retval += copied;
				377	if (copied != nr && iov_iter_count(to)) {
				378	if (!retval)
				379	retval = -EFAULT;
				380	break;
				381	}
				382	index += offset >> huge_page_shift(h);
				383	offset &= ~huge_page_mask(h);
				384	}
				385	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
				386	return retval;
				387	}
				388
				389	static int hugetlbfs_write_begin(struct file *file,
				390	struct address_space *mapping,
				391	loff_t pos, unsigned len, unsigned flags,
				392	struct page pagep, void fsdata)
				393	{
				394	return -EINVAL;
				395	}
				396
				397	static int hugetlbfs_write_end(struct file file, struct address_space mapping,
				398	loff_t pos, unsigned len, unsigned copied,
				399	struct page page, void fsdata)
				400	{
				401	BUG();
				402	return -EINVAL;
				403	}
				404
/*
 * Clear the dirty and uptodate flags of @page and delete it from the page
 * cache.
 */
static void remove_huge_page(struct page *page)
{
	ClearPageDirty(page);
	ClearPageUptodate(page);
	delete_from_page_cache(page);
}
				411
				412	static void
				413	hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
				414	{
				415	struct vm_area_struct *vma;
				416
				417	/*
				418	* end == 0 indicates that the entire range after
				419	* start should be unmapped.
				420	*/
				421	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
				422	unsigned long v_offset;
				423	unsigned long v_end;
				424
				425	/*
				426	* Can the expression below overflow on 32-bit arches?
				427	* No, because the interval tree returns us only those vmas
				428	* which overlap the truncated area starting at pgoff,
				429	* and no vma on a 32-bit arch can span beyond the 4GB.
				430	*/
				431	if (vma->vm_pgoff < start)
				432	v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
				433	else
				434	v_offset = 0;
				435
				436	if (!end)
				437	v_end = vma->vm_end;
				438	else {
				439	v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
				440	+ vma->vm_start;
				441	if (v_end > vma->vm_end)
				442	v_end = vma->vm_end;
				443	}
				444
				445	unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
				446	NULL);
				447	}
				448	}
				449
				450	/*
				451	* remove_inode_hugepages handles two distinct cases: truncation and hole
				452	* punch. There are subtle differences in operation for each case.
				453	*
				454	* truncation is indicated by end of range being LLONG_MAX
				455	* In this case, we first scan the range and release found pages.
				456	* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
				457	* maps and global counts. Page faults can not race with truncation
				458	* in this routine. hugetlb_no_page() prevents page faults in the
				459	* truncated range. It checks i_size before allocation, and again after
				460	* with the page table lock for the page held. The same lock must be
				461	* acquired to unmap a page.
				462	* hole punch is indicated if end is not LLONG_MAX
				463	* In the hole punch case we scan the range and release found pages.
				464	* Only when releasing a page is the associated region/reserv map
				465	* deleted. The region/reserv map for ranges without associated
				466	* pages are not modified. Page faults can race with hole punch.
				467	* This is indicated if we find a mapped page.
				468	* Note: If the passed end of range value is beyond the end of file, but
				469	* not LLONG_MAX this routine still performs a hole punch operation.
				470	*/
				471	static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
				472	loff_t lend)
				473	{
				474	struct hstate *h = hstate_inode(inode);
				475	struct address_space *mapping = &inode->i_data;
				476	const pgoff_t start = lstart >> huge_page_shift(h);
				477	const pgoff_t end = lend >> huge_page_shift(h);
				478	struct vm_area_struct pseudo_vma;
				479	struct pagevec pvec;
				480	pgoff_t next, index;
				481	int i, freed = 0;
				482	bool truncate_op = (lend == LLONG_MAX);
				483
				484	vma_init(&pseudo_vma, current->mm);
				485	pseudo_vma.vm_flags = (VM_HUGETLB \| VM_MAYSHARE \| VM_SHARED);
				486	pagevec_init(&pvec);
				487	next = start;
				488	while (next < end) {
				489	/*
				490	* When no more pages are found, we are done.
				491	*/
				492	if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1))
				493	break;
				494
				495	for (i = 0; i < pagevec_count(&pvec); ++i) {
				496	struct page *page = pvec.pages[i];
				497	u32 hash;
				498
				499	index = page->index;
				500	hash = hugetlb_fault_mutex_hash(h, mapping, index);
				501	mutex_lock(&hugetlb_fault_mutex_table[hash]);
				502
				503	/*
				504	* If page is mapped, it was faulted in after being
				505	* unmapped in caller. Unmap (again) now after taking
				506	* the fault mutex. The mutex will prevent faults
				507	* until we finish removing the page.
				508	*
				509	* This race can only happen in the hole punch case.
				510	* Getting here in a truncate operation is a bug.
				511	*/
				512	if (unlikely(page_mapped(page))) {
				513	BUG_ON(truncate_op);
				514
				515	i_mmap_lock_write(mapping);
				516	hugetlb_vmdelete_list(&mapping->i_mmap,
				517	index * pages_per_huge_page(h),
				518	(index + 1) * pages_per_huge_page(h));
				519	i_mmap_unlock_write(mapping);
				520	}
				521
				522	lock_page(page);
				523	/*
				524	* We must free the huge page and remove from page
				525	* cache (remove_huge_page) BEFORE removing the
				526	* region/reserve map (hugetlb_unreserve_pages). In
				527	* rare out of memory conditions, removal of the
				528	* region/reserve map could fail. Correspondingly,
				529	* the subpool and global reserve usage count can need
				530	* to be adjusted.
				531	*/
				532	VM_BUG_ON(PagePrivate(page));
				533	remove_huge_page(page);
				534	freed++;
				535	if (!truncate_op) {
				536	if (unlikely(hugetlb_unreserve_pages(inode,
				537	index, index + 1, 1)))
				538	hugetlb_fix_reserve_counts(inode);
				539	}
				540
				541	unlock_page(page);
				542	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
				543	}
				544	huge_pagevec_release(&pvec);
				545	cond_resched();
				546	}
				547
				548	if (truncate_op)
				549	(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
				550	}
				551
				552	static void hugetlbfs_evict_inode(struct inode *inode)
				553	{
				554	struct resv_map *resv_map;
				555
				556	remove_inode_hugepages(inode, 0, LLONG_MAX);
				557
				558	/*
				559	* Get the resv_map from the address space embedded in the inode.
				560	* This is the address space which points to any resv_map allocated
				561	* at inode creation time. If this is a device special inode,
				562	* i_mapping may not point to the original address space.
				563	*/
				564	resv_map = (struct resv_map *)(&inode->i_data)->private_data;
				565	/* Only regular and link inodes have associated reserve maps */
				566	if (resv_map)
				567	resv_map_release(&resv_map->refs);
				568	clear_inode(inode);
				569	}
				570
				571	static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
				572	{
				573	pgoff_t pgoff;
				574	struct address_space *mapping = inode->i_mapping;
				575	struct hstate *h = hstate_inode(inode);
				576
				577	BUG_ON(offset & ~huge_page_mask(h));
				578	pgoff = offset >> PAGE_SHIFT;
				579
				580	i_size_write(inode, offset);
				581	i_mmap_lock_write(mapping);
				582	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
				583	hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
				584	i_mmap_unlock_write(mapping);
				585	remove_inode_hugepages(inode, offset, LLONG_MAX);
				586	return 0;
				587	}
				588
				589	static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
				590	{
				591	struct hstate *h = hstate_inode(inode);
				592	loff_t hpage_size = huge_page_size(h);
				593	loff_t hole_start, hole_end;
				594
				595	/*
				596	* For hole punch round up the beginning offset of the hole and
				597	* round down the end.
				598	*/
				599	hole_start = round_up(offset, hpage_size);
				600	hole_end = round_down(offset + len, hpage_size);
				601
				602	if (hole_end > hole_start) {
				603	struct address_space *mapping = inode->i_mapping;
				604	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
				605
				606	inode_lock(inode);
				607
				608	/* protected by i_mutex */
				609	if (info->seals & (F_SEAL_WRITE \| F_SEAL_FUTURE_WRITE)) {
				610	inode_unlock(inode);
				611	return -EPERM;
				612	}
				613
				614	i_mmap_lock_write(mapping);
				615	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
				616	hugetlb_vmdelete_list(&mapping->i_mmap,
				617	hole_start >> PAGE_SHIFT,
				618	hole_end >> PAGE_SHIFT);
				619	i_mmap_unlock_write(mapping);
				620	remove_inode_hugepages(inode, hole_start, hole_end);
				621	inode_unlock(inode);
				622	}
				623
				624	return 0;
				625	}
				626
				627	static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
				628	loff_t len)
				629	{
				630	struct inode *inode = file_inode(file);
				631	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
				632	struct address_space *mapping = inode->i_mapping;
				633	struct hstate *h = hstate_inode(inode);
				634	struct vm_area_struct pseudo_vma;
				635	struct mm_struct *mm = current->mm;
				636	loff_t hpage_size = huge_page_size(h);
				637	unsigned long hpage_shift = huge_page_shift(h);
				638	pgoff_t start, index, end;
				639	int error;
				640	u32 hash;
				641
				642	if (mode & ~(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE))
				643	return -EOPNOTSUPP;
				644
				645	if (mode & FALLOC_FL_PUNCH_HOLE)
				646	return hugetlbfs_punch_hole(inode, offset, len);
				647
				648	/*
				649	* Default preallocate case.
				650	* For this range, start is rounded down and end is rounded up
				651	* as well as being converted to page offsets.
				652	*/
				653	start = offset >> hpage_shift;
				654	end = (offset + len + hpage_size - 1) >> hpage_shift;
				655
				656	inode_lock(inode);
				657
				658	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
				659	error = inode_newsize_ok(inode, offset + len);
				660	if (error)
				661	goto out;
				662
				663	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
				664	error = -EPERM;
				665	goto out;
				666	}
				667
				668	/*
				669	* Initialize a pseudo vma as this is required by the huge page
				670	* allocation routines. If NUMA is configured, use page index
				671	* as input to create an allocation policy.
				672	*/
				673	vma_init(&pseudo_vma, mm);
				674	pseudo_vma.vm_flags = (VM_HUGETLB \| VM_MAYSHARE \| VM_SHARED);
				675	pseudo_vma.vm_file = file;
				676
				677	for (index = start; index < end; index++) {
				678	/*
				679	* This is supposed to be the vaddr where the page is being
				680	* faulted in, but we have no vaddr here.
				681	*/
				682	struct page *page;
				683	unsigned long addr;
				684	int avoid_reserve = 0;
				685
				686	cond_resched();
				687
				688	/*
				689	* fallocate(2) manpage permits EINTR; we may have been
				690	* interrupted because we are using up too much memory.
				691	*/
				692	if (signal_pending(current)) {
				693	error = -EINTR;
				694	break;
				695	}
				696
				697	/* Set numa allocation policy based on index */
				698	hugetlb_set_vma_policy(&pseudo_vma, inode, index);
				699
				700	/* addr is the offset within the file (zero based) */
				701	addr = index * hpage_size;
				702
				703	/* mutex taken here, fault path and hole punch */
				704	hash = hugetlb_fault_mutex_hash(h, mapping, index);
				705	mutex_lock(&hugetlb_fault_mutex_table[hash]);
				706
				707	/* See if already present in mapping to avoid alloc/free */
				708	page = find_get_page(mapping, index);
				709	if (page) {
				710	put_page(page);
				711	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
				712	hugetlb_drop_vma_policy(&pseudo_vma);
				713	continue;
				714	}
				715
				716	/* Allocate page and add to page cache */
				717	page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
				718	hugetlb_drop_vma_policy(&pseudo_vma);
				719	if (IS_ERR(page)) {
				720	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
				721	error = PTR_ERR(page);
				722	goto out;
				723	}
				724	clear_huge_page(page, addr, pages_per_huge_page(h));
				725	__SetPageUptodate(page);
				726	error = huge_add_to_page_cache(page, mapping, index);
				727	if (unlikely(error)) {
				728	put_page(page);
				729	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
				730	goto out;
				731	}
				732
				733	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
				734
				735	set_page_huge_active(page);
				736	/*
				737	* unlock_page because locked by add_to_page_cache()
				738	* put_page() due to reference from alloc_huge_page()
				739	*/
				740	unlock_page(page);
				741	put_page(page);
				742	}
				743
				744	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
				745	i_size_write(inode, offset + len);
				746	inode->i_ctime = current_time(inode);
				747	out:
				748	inode_unlock(inode);
				749	return error;
				750	}
				751
				752	static int hugetlbfs_setattr(struct dentry dentry, struct iattr attr)
				753	{
				754	struct inode *inode = d_inode(dentry);
				755	struct hstate *h = hstate_inode(inode);
				756	int error;
				757	unsigned int ia_valid = attr->ia_valid;
				758	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
				759
				760	BUG_ON(!inode);
				761
				762	error = setattr_prepare(dentry, attr);
				763	if (error)
				764	return error;
				765
				766	if (ia_valid & ATTR_SIZE) {
				767	loff_t oldsize = inode->i_size;
				768	loff_t newsize = attr->ia_size;
				769
				770	if (newsize & ~huge_page_mask(h))
				771	return -EINVAL;
				772	/* protected by i_mutex */
				773	if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) \|\|
				774	(newsize > oldsize && (info->seals & F_SEAL_GROW)))
				775	return -EPERM;
				776	error = hugetlb_vmtruncate(inode, newsize);
				777	if (error)
				778	return error;
				779	}
				780
				781	setattr_copy(inode, attr);
				782	mark_inode_dirty(inode);
				783	return 0;
				784	}
				785
				786	static struct inode hugetlbfs_get_root(struct super_block sb,
				787	struct hugetlbfs_fs_context *ctx)
				788	{
				789	struct inode *inode;
				790
				791	inode = new_inode(sb);
				792	if (inode) {
				793	inode->i_ino = get_next_ino();
				794	inode->i_mode = S_IFDIR \| ctx->mode;
				795	inode->i_uid = ctx->uid;
				796	inode->i_gid = ctx->gid;
				797	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
				798	inode->i_op = &hugetlbfs_dir_inode_operations;
				799	inode->i_fop = &simple_dir_operations;
				800	/* directory inodes start off with i_nlink == 2 (for "." entry) */
				801	inc_nlink(inode);
				802	lockdep_annotate_inode_mutex_key(inode);
				803	}
				804	return inode;
				805	}
				806
				807	/*
				808	* Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
				809	* be taken from reclaim -- unlike regular filesystems. This needs an
				810	* annotation because huge_pmd_share() does an allocation under hugetlb's
				811	* i_mmap_rwsem.
				812	*/
				813	static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
				814
				815	static struct inode hugetlbfs_get_inode(struct super_block sb,
				816	struct inode *dir,
				817	umode_t mode, dev_t dev)
				818	{
				819	struct inode *inode;
				820	struct resv_map *resv_map = NULL;
				821
				822	/*
				823	* Reserve maps are only needed for inodes that can have associated
				824	* page allocations.
				825	*/
				826	if (S_ISREG(mode) \|\| S_ISLNK(mode)) {
				827	resv_map = resv_map_alloc();
				828	if (!resv_map)
				829	return NULL;
				830	}
				831
				832	inode = new_inode(sb);
				833	if (inode) {
				834	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
				835
				836	inode->i_ino = get_next_ino();
				837	inode_init_owner(inode, dir, mode);
				838	lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
				839	&hugetlbfs_i_mmap_rwsem_key);
				840	inode->i_mapping->a_ops = &hugetlbfs_aops;
				841	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
				842	inode->i_mapping->private_data = resv_map;
				843	info->seals = F_SEAL_SEAL;
				844	switch (mode & S_IFMT) {
				845	default:
				846	init_special_inode(inode, mode, dev);
				847	break;
				848	case S_IFREG:
				849	inode->i_op = &hugetlbfs_inode_operations;
				850	inode->i_fop = &hugetlbfs_file_operations;
				851	break;
				852	case S_IFDIR:
				853	inode->i_op = &hugetlbfs_dir_inode_operations;
				854	inode->i_fop = &simple_dir_operations;
				855
				856	/* directory inodes start off with i_nlink == 2 (for "." entry) */
				857	inc_nlink(inode);
				858	break;
				859	case S_IFLNK:
				860	inode->i_op = &page_symlink_inode_operations;
				861	inode_nohighmem(inode);
				862	break;
				863	}
				864	lockdep_annotate_inode_mutex_key(inode);
				865	} else {
				866	if (resv_map)
				867	kref_put(&resv_map->refs, resv_map_release);
				868	}
				869
				870	return inode;
				871	}
				872
				873	/*
				874	* File creation. Allocate an inode, and we're done..
				875	*/
				876	static int hugetlbfs_mknod(struct inode *dir,
				877	struct dentry *dentry, umode_t mode, dev_t dev)
				878	{
				879	struct inode *inode;
				880	int error = -ENOSPC;
				881
				882	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
				883	if (inode) {
				884	dir->i_ctime = dir->i_mtime = current_time(dir);
				885	d_instantiate(dentry, inode);
				886	dget(dentry); /* Extra count - pin the dentry in core */
				887	error = 0;
				888	}
				889	return error;
				890	}
				891
				892	static int hugetlbfs_mkdir(struct inode dir, struct dentry dentry, umode_t mode)
				893	{
				894	int retval = hugetlbfs_mknod(dir, dentry, mode \| S_IFDIR, 0);
				895	if (!retval)
				896	inc_nlink(dir);
				897	return retval;
				898	}
				899
				900	static int hugetlbfs_create(struct inode dir, struct dentry dentry, umode_t mode, bool excl)
				901	{
				902	return hugetlbfs_mknod(dir, dentry, mode \| S_IFREG, 0);
				903	}
				904
				905	static int hugetlbfs_symlink(struct inode *dir,
				906	struct dentry dentry, const char symname)
				907	{
				908	struct inode *inode;
				909	int error = -ENOSPC;
				910
				911	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK\|S_IRWXUGO, 0);
				912	if (inode) {
				913	int l = strlen(symname)+1;
				914	error = page_symlink(inode, symname, l);
				915	if (!error) {
				916	d_instantiate(dentry, inode);
				917	dget(dentry);
				918	} else
				919	iput(inode);
				920	}
				921	dir->i_ctime = dir->i_mtime = current_time(dir);
				922
				923	return error;
				924	}
				925
				926	/*
				927	* mark the head page dirty
				928	*/
				929	static int hugetlbfs_set_page_dirty(struct page *page)
				930	{
				931	struct page *head = compound_head(page);
				932
				933	SetPageDirty(head);
				934	return 0;
				935	}
				936
				937	static int hugetlbfs_migrate_page(struct address_space *mapping,
				938	struct page newpage, struct page page,
				939	enum migrate_mode mode)
				940	{
				941	int rc;
				942
				943	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
				944	if (rc != MIGRATEPAGE_SUCCESS)
				945	return rc;
				946
				947	/*
				948	* page_private is subpool pointer in hugetlb pages. Transfer to
				949	* new page. PagePrivate is not associated with page_private for
				950	* hugetlb pages and can not be set here as only page_huge_active
				951	* pages can be migrated.
				952	*/
				953	if (page_private(page)) {
				954	set_page_private(newpage, page_private(page));
				955	set_page_private(page, 0);
				956	}
				957
				958	if (mode != MIGRATE_SYNC_NO_COPY)
				959	migrate_page_copy(newpage, page);
				960	else
				961	migrate_page_states(newpage, page);
				962
				963	return MIGRATEPAGE_SUCCESS;
				964	}
				965
				966	static int hugetlbfs_error_remove_page(struct address_space *mapping,
				967	struct page *page)
				968	{
				969	struct inode *inode = mapping->host;
				970	pgoff_t index = page->index;
				971
				972	remove_huge_page(page);
				973	if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
				974	hugetlb_fix_reserve_counts(inode);
				975
				976	return 0;
				977	}
				978
				979	/*
				980	* Display the mount options in /proc/mounts.
				981	*/
				982	static int hugetlbfs_show_options(struct seq_file m, struct dentry root)
				983	{
				984	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb);
				985	struct hugepage_subpool *spool = sbinfo->spool;
				986	unsigned long hpage_size = huge_page_size(sbinfo->hstate);
				987	unsigned hpage_shift = huge_page_shift(sbinfo->hstate);
				988	char mod;
				989
				990	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
				991	seq_printf(m, ",uid=%u",
				992	from_kuid_munged(&init_user_ns, sbinfo->uid));
				993	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
				994	seq_printf(m, ",gid=%u",
				995	from_kgid_munged(&init_user_ns, sbinfo->gid));
				996	if (sbinfo->mode != 0755)
				997	seq_printf(m, ",mode=%o", sbinfo->mode);
				998	if (sbinfo->max_inodes != -1)
				999	seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes);
				1000
				1001	hpage_size /= 1024;
				1002	mod = 'K';
				1003	if (hpage_size >= 1024) {
				1004	hpage_size /= 1024;
				1005	mod = 'M';
				1006	}
				1007	seq_printf(m, ",pagesize=%lu%c", hpage_size, mod);
				1008	if (spool) {
				1009	if (spool->max_hpages != -1)
				1010	seq_printf(m, ",size=%llu",
				1011	(unsigned long long)spool->max_hpages << hpage_shift);
				1012	if (spool->min_hpages != -1)
				1013	seq_printf(m, ",min_size=%llu",
				1014	(unsigned long long)spool->min_hpages << hpage_shift);
				1015	}
				1016	return 0;
				1017	}
				1018
				1019	static int hugetlbfs_statfs(struct dentry dentry, struct kstatfs buf)
				1020	{
				1021	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
				1022	struct hstate *h = hstate_inode(d_inode(dentry));
				1023
				1024	buf->f_type = HUGETLBFS_MAGIC;
				1025	buf->f_bsize = huge_page_size(h);
				1026	if (sbinfo) {
				1027	spin_lock(&sbinfo->stat_lock);
				1028	/* If no limits set, just report 0 for max/free/used
				1029	* blocks, like simple_statfs() */
				1030	if (sbinfo->spool) {
				1031	long free_pages;
				1032
				1033	spin_lock(&sbinfo->spool->lock);
				1034	buf->f_blocks = sbinfo->spool->max_hpages;
				1035	free_pages = sbinfo->spool->max_hpages
				1036	- sbinfo->spool->used_hpages;
				1037	buf->f_bavail = buf->f_bfree = free_pages;
				1038	spin_unlock(&sbinfo->spool->lock);
				1039	buf->f_files = sbinfo->max_inodes;
				1040	buf->f_ffree = sbinfo->free_inodes;
				1041	}
				1042	spin_unlock(&sbinfo->stat_lock);
				1043	}
				1044	buf->f_namelen = NAME_MAX;
				1045	return 0;
				1046	}
				1047
				1048	static void hugetlbfs_put_super(struct super_block *sb)
				1049	{
				1050	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
				1051
				1052	if (sbi) {
				1053	sb->s_fs_info = NULL;
				1054
				1055	if (sbi->spool)
				1056	hugepage_put_subpool(sbi->spool);
				1057
				1058	kfree(sbi);
				1059	}
				1060	}
				1061
				1062	static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
				1063	{
				1064	if (sbinfo->free_inodes >= 0) {
				1065	spin_lock(&sbinfo->stat_lock);
				1066	if (unlikely(!sbinfo->free_inodes)) {
				1067	spin_unlock(&sbinfo->stat_lock);
				1068	return 0;
				1069	}
				1070	sbinfo->free_inodes--;
				1071	spin_unlock(&sbinfo->stat_lock);
				1072	}
				1073
				1074	return 1;
				1075	}
				1076
				1077	static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
				1078	{
				1079	if (sbinfo->free_inodes >= 0) {
				1080	spin_lock(&sbinfo->stat_lock);
				1081	sbinfo->free_inodes++;
				1082	spin_unlock(&sbinfo->stat_lock);
				1083	}
				1084	}
				1085
				1086
				1087	static struct kmem_cache *hugetlbfs_inode_cachep;
				1088
				1089	static struct inode hugetlbfs_alloc_inode(struct super_block sb)
				1090	{
				1091	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
				1092	struct hugetlbfs_inode_info *p;
				1093
				1094	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
				1095	return NULL;
				1096	p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
				1097	if (unlikely(!p)) {
				1098	hugetlbfs_inc_free_inodes(sbinfo);
				1099	return NULL;
				1100	}
				1101
				1102	/*
				1103	* Any time after allocation, hugetlbfs_destroy_inode can be called
				1104	* for the inode. mpol_free_shared_policy is unconditionally called
				1105	* as part of hugetlbfs_destroy_inode. So, initialize policy here
				1106	* in case of a quick call to destroy.
				1107	*
				1108	* Note that the policy is initialized even if we are creating a
				1109	* private inode. This simplifies hugetlbfs_destroy_inode.
				1110	*/
				1111	mpol_shared_policy_init(&p->policy, NULL);
				1112
				1113	return &p->vfs_inode;
				1114	}
				1115
				1116	static void hugetlbfs_free_inode(struct inode *inode)
				1117	{
				1118	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
				1119	}
				1120
				1121	static void hugetlbfs_destroy_inode(struct inode *inode)
				1122	{
				1123	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
				1124	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
				1125	}
				1126
				1127	static const struct address_space_operations hugetlbfs_aops = {
				1128	.write_begin = hugetlbfs_write_begin,
				1129	.write_end = hugetlbfs_write_end,
				1130	.set_page_dirty = hugetlbfs_set_page_dirty,
				1131	.migratepage = hugetlbfs_migrate_page,
				1132	.error_remove_page = hugetlbfs_error_remove_page,
				1133	};
				1134
				1135
				1136	static void init_once(void *foo)
				1137	{
				1138	struct hugetlbfs_inode_info ei = (struct hugetlbfs_inode_info )foo;
				1139
				1140	inode_init_once(&ei->vfs_inode);
				1141	}
				1142
				1143	const struct file_operations hugetlbfs_file_operations = {
				1144	.read_iter = hugetlbfs_read_iter,
				1145	.mmap = hugetlbfs_file_mmap,
				1146	.fsync = noop_fsync,
				1147	.get_unmapped_area = hugetlb_get_unmapped_area,
				1148	.llseek = default_llseek,
				1149	.fallocate = hugetlbfs_fallocate,
				1150	};
				1151
				1152	static const struct inode_operations hugetlbfs_dir_inode_operations = {
				1153	.create = hugetlbfs_create,
				1154	.lookup = simple_lookup,
				1155	.link = simple_link,
				1156	.unlink = simple_unlink,
				1157	.symlink = hugetlbfs_symlink,
				1158	.mkdir = hugetlbfs_mkdir,
				1159	.rmdir = simple_rmdir,
				1160	.mknod = hugetlbfs_mknod,
				1161	.rename = simple_rename,
				1162	.setattr = hugetlbfs_setattr,
				1163	};
				1164
				1165	static const struct inode_operations hugetlbfs_inode_operations = {
				1166	.setattr = hugetlbfs_setattr,
				1167	};
				1168
				1169	static const struct super_operations hugetlbfs_ops = {
				1170	.alloc_inode = hugetlbfs_alloc_inode,
				1171	.free_inode = hugetlbfs_free_inode,
				1172	.destroy_inode = hugetlbfs_destroy_inode,
				1173	.evict_inode = hugetlbfs_evict_inode,
				1174	.statfs = hugetlbfs_statfs,
				1175	.put_super = hugetlbfs_put_super,
				1176	.show_options = hugetlbfs_show_options,
				1177	};
				1178
				1179	/*
				1180	* Convert size option passed from command line to number of huge pages
				1181	* in the pool specified by hstate. Size option could be in bytes
				1182	* (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
				1183	*/
				1184	static long
				1185	hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
				1186	enum hugetlbfs_size_type val_type)
				1187	{
				1188	if (val_type == NO_SIZE)
				1189	return -1;
				1190
				1191	if (val_type == SIZE_PERCENT) {
				1192	size_opt <<= huge_page_shift(h);
				1193	size_opt *= h->max_huge_pages;
				1194	do_div(size_opt, 100);
				1195	}
				1196
				1197	size_opt >>= huge_page_shift(h);
				1198	return size_opt;
				1199	}
				1200
				1201	/*
				1202	* Parse one mount parameter.
				1203	*/
				1204	static int hugetlbfs_parse_param(struct fs_context fc, struct fs_parameter param)
				1205	{
				1206	struct hugetlbfs_fs_context *ctx = fc->fs_private;
				1207	struct fs_parse_result result;
				1208	struct hstate *h;
				1209	char *rest;
				1210	unsigned long ps;
				1211	int opt;
				1212
				1213	opt = fs_parse(fc, &hugetlb_fs_parameters, param, &result);
				1214	if (opt < 0)
				1215	return opt;
				1216
				1217	switch (opt) {
				1218	case Opt_uid:
				1219	ctx->uid = make_kuid(current_user_ns(), result.uint_32);
				1220	if (!uid_valid(ctx->uid))
				1221	goto bad_val;
				1222	return 0;
				1223
				1224	case Opt_gid:
				1225	ctx->gid = make_kgid(current_user_ns(), result.uint_32);
				1226	if (!gid_valid(ctx->gid))
				1227	goto bad_val;
				1228	return 0;
				1229
				1230	case Opt_mode:
				1231	ctx->mode = result.uint_32 & 01777U;
				1232	return 0;
				1233
				1234	case Opt_size:
				1235	/* memparse() will accept a K/M/G without a digit */
				1236	if (!param->string \|\| !isdigit(param->string[0]))
				1237	goto bad_val;
				1238	ctx->max_size_opt = memparse(param->string, &rest);
				1239	ctx->max_val_type = SIZE_STD;
				1240	if (*rest == '%')
				1241	ctx->max_val_type = SIZE_PERCENT;
				1242	return 0;
				1243
				1244	case Opt_nr_inodes:
				1245	/* memparse() will accept a K/M/G without a digit */
				1246	if (!param->string \|\| !isdigit(param->string[0]))
				1247	goto bad_val;
				1248	ctx->nr_inodes = memparse(param->string, &rest);
				1249	return 0;
				1250
				1251	case Opt_pagesize:
				1252	ps = memparse(param->string, &rest);
				1253	h = size_to_hstate(ps);
				1254	if (!h) {
				1255	pr_err("Unsupported page size %lu MB\n", ps >> 20);
				1256	return -EINVAL;
				1257	}
				1258	ctx->hstate = h;
				1259	return 0;
				1260
				1261	case Opt_min_size:
				1262	/* memparse() will accept a K/M/G without a digit */
				1263	if (!param->string \|\| !isdigit(param->string[0]))
				1264	goto bad_val;
				1265	ctx->min_size_opt = memparse(param->string, &rest);
				1266	ctx->min_val_type = SIZE_STD;
				1267	if (*rest == '%')
				1268	ctx->min_val_type = SIZE_PERCENT;
				1269	return 0;
				1270
				1271	default:
				1272	return -EINVAL;
				1273	}
				1274
				1275	bad_val:
				1276	return invalf(fc, "hugetlbfs: Bad value '%s' for mount option '%s'\n",
				1277	param->string, param->key);
				1278	}
				1279
				1280	/*
				1281	* Validate the parsed options.
				1282	*/
				1283	static int hugetlbfs_validate(struct fs_context *fc)
				1284	{
				1285	struct hugetlbfs_fs_context *ctx = fc->fs_private;
				1286
				1287	/*
				1288	* Use huge page pool size (in hstate) to convert the size
				1289	* options to number of huge pages. If NO_SIZE, -1 is returned.
				1290	*/
				1291	ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
				1292	ctx->max_size_opt,
				1293	ctx->max_val_type);
				1294	ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
				1295	ctx->min_size_opt,
				1296	ctx->min_val_type);
				1297
				1298	/*
				1299	* If max_size was specified, then min_size must be smaller
				1300	*/
				1301	if (ctx->max_val_type > NO_SIZE &&
				1302	ctx->min_hpages > ctx->max_hpages) {
				1303	pr_err("Minimum size can not be greater than maximum size\n");
				1304	return -EINVAL;
				1305	}
				1306
				1307	return 0;
				1308	}
				1309
				1310	static int
				1311	hugetlbfs_fill_super(struct super_block sb, struct fs_context fc)
				1312	{
				1313	struct hugetlbfs_fs_context *ctx = fc->fs_private;
				1314	struct hugetlbfs_sb_info *sbinfo;
				1315
				1316	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
				1317	if (!sbinfo)
				1318	return -ENOMEM;
				1319	sb->s_fs_info = sbinfo;
				1320	spin_lock_init(&sbinfo->stat_lock);
				1321	sbinfo->hstate = ctx->hstate;
				1322	sbinfo->max_inodes = ctx->nr_inodes;
				1323	sbinfo->free_inodes = ctx->nr_inodes;
				1324	sbinfo->spool = NULL;
				1325	sbinfo->uid = ctx->uid;
				1326	sbinfo->gid = ctx->gid;
				1327	sbinfo->mode = ctx->mode;
				1328
				1329	/*
				1330	* Allocate and initialize subpool if maximum or minimum size is
				1331	* specified. Any needed reservations (for minimim size) are taken
				1332	* taken when the subpool is created.
				1333	*/
				1334	if (ctx->max_hpages != -1 \|\| ctx->min_hpages != -1) {
				1335	sbinfo->spool = hugepage_new_subpool(ctx->hstate,
				1336	ctx->max_hpages,
				1337	ctx->min_hpages);
				1338	if (!sbinfo->spool)
				1339	goto out_free;
				1340	}
				1341	sb->s_maxbytes = MAX_LFS_FILESIZE;
				1342	sb->s_blocksize = huge_page_size(ctx->hstate);
				1343	sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
				1344	sb->s_magic = HUGETLBFS_MAGIC;
				1345	sb->s_op = &hugetlbfs_ops;
				1346	sb->s_time_gran = 1;
				1347
				1348	/*
				1349	* Due to the special and limited functionality of hugetlbfs, it does
				1350	* not work well as a stacking filesystem.
				1351	*/
				1352	sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
				1353	sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
				1354	if (!sb->s_root)
				1355	goto out_free;
				1356	return 0;
				1357	out_free:
				1358	kfree(sbinfo->spool);
				1359	kfree(sbinfo);
				1360	return -ENOMEM;
				1361	}
				1362
				1363	static int hugetlbfs_get_tree(struct fs_context *fc)
				1364	{
				1365	int err = hugetlbfs_validate(fc);
				1366	if (err)
				1367	return err;
				1368	return get_tree_nodev(fc, hugetlbfs_fill_super);
				1369	}
				1370
				1371	static void hugetlbfs_fs_context_free(struct fs_context *fc)
				1372	{
				1373	kfree(fc->fs_private);
				1374	}
				1375
				1376	static const struct fs_context_operations hugetlbfs_fs_context_ops = {
				1377	.free = hugetlbfs_fs_context_free,
				1378	.parse_param = hugetlbfs_parse_param,
				1379	.get_tree = hugetlbfs_get_tree,
				1380	};
				1381
				1382	static int hugetlbfs_init_fs_context(struct fs_context *fc)
				1383	{
				1384	struct hugetlbfs_fs_context *ctx;
				1385
				1386	ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL);
				1387	if (!ctx)
				1388	return -ENOMEM;
				1389
				1390	ctx->max_hpages = -1; /* No limit on size by default */
				1391	ctx->nr_inodes = -1; /* No limit on number of inodes by default */
				1392	ctx->uid = current_fsuid();
				1393	ctx->gid = current_fsgid();
				1394	ctx->mode = 0755;
				1395	ctx->hstate = &default_hstate;
				1396	ctx->min_hpages = -1; /* No default minimum size */
				1397	ctx->max_val_type = NO_SIZE;
				1398	ctx->min_val_type = NO_SIZE;
				1399	fc->fs_private = ctx;
				1400	fc->ops = &hugetlbfs_fs_context_ops;
				1401	return 0;
				1402	}
				1403
				1404	static struct file_system_type hugetlbfs_fs_type = {
				1405	.name = "hugetlbfs",
				1406	.init_fs_context = hugetlbfs_init_fs_context,
				1407	.parameters = &hugetlb_fs_parameters,
				1408	.kill_sb = kill_litter_super,
				1409	};
				1410
				1411	static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
				1412
				1413	static int can_do_hugetlb_shm(void)
				1414	{
				1415	kgid_t shm_group;
				1416	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
				1417	return capable(CAP_IPC_LOCK) \|\| in_group_p(shm_group);
				1418	}
				1419
				1420	static int get_hstate_idx(int page_size_log)
				1421	{
				1422	struct hstate *h = hstate_sizelog(page_size_log);
				1423
				1424	if (!h)
				1425	return -1;
				1426	return h - hstates;
				1427	}
				1428
				1429	/*
				1430	* Note that size should be aligned to proper hugepage size in caller side,
				1431	* otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
				1432	*/
				1433	struct file hugetlb_file_setup(const char name, size_t size,
				1434	vm_flags_t acctflag, struct user_struct **user,
				1435	int creat_flags, int page_size_log)
				1436	{
				1437	struct inode *inode;
				1438	struct vfsmount *mnt;
				1439	int hstate_idx;
				1440	struct file *file;
				1441
				1442	hstate_idx = get_hstate_idx(page_size_log);
				1443	if (hstate_idx < 0)
				1444	return ERR_PTR(-ENODEV);
				1445
				1446	*user = NULL;
				1447	mnt = hugetlbfs_vfsmount[hstate_idx];
				1448	if (!mnt)
				1449	return ERR_PTR(-ENOENT);
				1450
				1451	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
				1452	*user = current_user();
				1453	if (user_shm_lock(size, *user)) {
				1454	task_lock(current);
				1455	pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
				1456	current->comm, current->pid);
				1457	task_unlock(current);
				1458	} else {
				1459	*user = NULL;
				1460	return ERR_PTR(-EPERM);
				1461	}
				1462	}
				1463
				1464	file = ERR_PTR(-ENOSPC);
				1465	inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG \| S_IRWXUGO, 0);
				1466	if (!inode)
				1467	goto out;
				1468	if (creat_flags == HUGETLB_SHMFS_INODE)
				1469	inode->i_flags \|= S_PRIVATE;
				1470
				1471	inode->i_size = size;
				1472	clear_nlink(inode);
				1473
				1474	if (hugetlb_reserve_pages(inode, 0,
				1475	size >> huge_page_shift(hstate_inode(inode)), NULL,
				1476	acctflag))
				1477	file = ERR_PTR(-ENOMEM);
				1478	else
				1479	file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
				1480	&hugetlbfs_file_operations);
				1481	if (!IS_ERR(file))
				1482	return file;
				1483
				1484	iput(inode);
				1485	out:
				1486	if (*user) {
				1487	user_shm_unlock(size, *user);
				1488	*user = NULL;
				1489	}
				1490	return file;
				1491	}
				1492
				1493	static struct vfsmount __init mount_one_hugetlbfs(struct hstate h)
				1494	{
				1495	struct fs_context *fc;
				1496	struct vfsmount *mnt;
				1497
				1498	fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT);
				1499	if (IS_ERR(fc)) {
				1500	mnt = ERR_CAST(fc);
				1501	} else {
				1502	struct hugetlbfs_fs_context *ctx = fc->fs_private;
				1503	ctx->hstate = h;
				1504	mnt = fc_mount(fc);
				1505	put_fs_context(fc);
				1506	}
				1507	if (IS_ERR(mnt))
				1508	pr_err("Cannot mount internal hugetlbfs for page size %uK",
				1509	1U << (h->order + PAGE_SHIFT - 10));
				1510	return mnt;
				1511	}
				1512
				1513	static int __init init_hugetlbfs_fs(void)
				1514	{
				1515	struct vfsmount *mnt;
				1516	struct hstate *h;
				1517	int error;
				1518	int i;
				1519
				1520	if (!hugepages_supported()) {
				1521	pr_info("disabling because there are no supported hugepage sizes\n");
				1522	return -ENOTSUPP;
				1523	}
				1524
				1525	error = -ENOMEM;
				1526	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
				1527	sizeof(struct hugetlbfs_inode_info),
				1528	0, SLAB_ACCOUNT, init_once);
				1529	if (hugetlbfs_inode_cachep == NULL)
				1530	goto out;
				1531
				1532	error = register_filesystem(&hugetlbfs_fs_type);
				1533	if (error)
				1534	goto out_free;
				1535
				1536	/* default hstate mount is required */
				1537	mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]);
				1538	if (IS_ERR(mnt)) {
				1539	error = PTR_ERR(mnt);
				1540	goto out_unreg;
				1541	}
				1542	hugetlbfs_vfsmount[default_hstate_idx] = mnt;
				1543
				1544	/* other hstates are optional */
				1545	i = 0;
				1546	for_each_hstate(h) {
				1547	if (i == default_hstate_idx) {
				1548	i++;
				1549	continue;
				1550	}
				1551
				1552	mnt = mount_one_hugetlbfs(h);
				1553	if (IS_ERR(mnt))
				1554	hugetlbfs_vfsmount[i] = NULL;
				1555	else
				1556	hugetlbfs_vfsmount[i] = mnt;
				1557	i++;
				1558	}
				1559
				1560	return 0;
				1561
				1562	out_unreg:
				1563	(void)unregister_filesystem(&hugetlbfs_fs_type);
				1564	out_free:
				1565	kmem_cache_destroy(hugetlbfs_inode_cachep);
				1566	out:
				1567	return error;
				1568	}
				1569	fs_initcall(init_hugetlbfs_fs)