Blame - marvell/linux/fs/namespace.c - T108

blob: 281f08eaba5b9bf83370ec3c2a558ba4eaa7209e [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* linux/fs/namespace.c
				4	*
				5	* (C) Copyright Al Viro 2000, 2001
				6	*
				7	* Based on code from fs/super.c, copyright Linus Torvalds and others.
				8	* Heavily rewritten.
				9	*/
				10
				11	#include <linux/syscalls.h>
				12	#include <linux/export.h>
				13	#include <linux/capability.h>
				14	#include <linux/mnt_namespace.h>
				15	#include <linux/user_namespace.h>
				16	#include <linux/namei.h>
				17	#include <linux/security.h>
				18	#include <linux/cred.h>
				19	#include <linux/idr.h>
				20	#include <linux/init.h> /* init_rootfs */
				21	#include <linux/fs_struct.h> /* get_fs_root et.al. */
				22	#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
				23	#include <linux/file.h>
				24	#include <linux/uaccess.h>
				25	#include <linux/proc_ns.h>
				26	#include <linux/magic.h>
				27	#include <linux/memblock.h>
				28	#include <linux/task_work.h>
				29	#include <linux/sched/task.h>
				30	#include <uapi/linux/mount.h>
				31	#include <linux/fs_context.h>
				32	#include <linux/shmem_fs.h>
				33
				34	#include "pnode.h"
				35	#include "internal.h"
				36
				37	/* Maximum number of mounts in a mount namespace */
				38	unsigned int sysctl_mount_max __read_mostly = 100000;
				39
				40	static unsigned int m_hash_mask __read_mostly; /* bucket index mask applied by m_hash() */
				41	static unsigned int m_hash_shift __read_mostly; /* fold shift applied by m_hash() */
				42	static unsigned int mp_hash_mask __read_mostly; /* bucket index mask applied by mp_hash() */
				43	static unsigned int mp_hash_shift __read_mostly; /* fold shift applied by mp_hash() */
				44
				45	static __initdata unsigned long mhash_entries;
				46	static int __init set_mhash_entries(char *str)
				47	{
				48	if (!str)
				49	return 0;
				50	mhash_entries = simple_strtoul(str, &str, 0);
				51	return 1;
				52	}
				53	__setup("mhash_entries=", set_mhash_entries);
				54
				55	static __initdata unsigned long mphash_entries;
				56	static int __init set_mphash_entries(char *str)
				57	{
				58	if (!str)
				59	return 0;
				60	mphash_entries = simple_strtoul(str, &str, 0);
				61	return 1;
				62	}
				63	__setup("mphash_entries=", set_mphash_entries);
				64
				65	static u64 event; /* global counter: bumped by touch_mnt_namespace(), copied into ns->event */
				66	static DEFINE_IDA(mnt_id_ida); /* allocator behind mnt->mnt_id */
				67	static DEFINE_IDA(mnt_group_ida); /* allocator behind mnt->mnt_group_id (allocated from 1 upwards) */
				68
				69	static struct hlist_head *mount_hashtable __read_mostly; /* buckets selected by m_hash() */
				70	static struct hlist_head *mountpoint_hashtable __read_mostly; /* buckets selected by mp_hash() */
				71	static struct kmem_cache *mnt_cache __read_mostly; /* cache that struct mount objects are allocated from */
				72	static DECLARE_RWSEM(namespace_sem);
				73	static HLIST_HEAD(unmounted); /* protected by namespace_sem */
				74	static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
				75
				76	/* /sys/fs */
				77	struct kobject *fs_kobj;
				78	EXPORT_SYMBOL_GPL(fs_kobj);
				79
				80	/*
				81	* vfsmount lock may be taken for read to prevent changes to the
				82	* vfsmount hash, ie. during mountpoint lookups or walking back
				83	* up the tree.
				84	*
				85	* It should be taken for write in all cases where the vfsmount
				86	* tree or hash is modified or when a vfsmount structure is modified.
				87	*/
				88	__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); /* sampled with read_seqbegin()/read_seqretry() by lookup_mnt() and __legitimize_mnt() */
				89
				90	static inline struct hlist_head m_hash(struct vfsmount mnt, struct dentry *dentry)
				91	{
				92	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
				93	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
				94	tmp = tmp + (tmp >> m_hash_shift);
				95	return &mount_hashtable[tmp & m_hash_mask];
				96	}
				97
				98	static inline struct hlist_head mp_hash(struct dentry dentry)
				99	{
				100	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
				101	tmp = tmp + (tmp >> mp_hash_shift);
				102	return &mountpoint_hashtable[tmp & mp_hash_mask];
				103	}
				104
				105	static int mnt_alloc_id(struct mount *mnt)
				106	{
				107	int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
				108
				109	if (res < 0)
				110	return res;
				111	mnt->mnt_id = res;
				112	return 0;
				113	}
				114
				115	static void mnt_free_id(struct mount *mnt)
				116	{
				117	ida_free(&mnt_id_ida, mnt->mnt_id);
				118	}
				119
				120	/*
				121	* Allocate a new peer group ID
				122	*/
				123	static int mnt_alloc_group_id(struct mount *mnt)
				124	{
				125	int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
				126
				127	if (res < 0)
				128	return res;
				129	mnt->mnt_group_id = res;
				130	return 0;
				131	}
				132
				133	/*
				134	* Release a peer group ID
				135	*/
				136	void mnt_release_group_id(struct mount *mnt)
				137	{
				138	ida_free(&mnt_group_ida, mnt->mnt_group_id);
				139	mnt->mnt_group_id = 0;
				140	}
				141
				142	/*
				143	* vfsmount lock must be held for read
				144	*/
				145	static inline void mnt_add_count(struct mount *mnt, int n)
				146	{
				147	#ifdef CONFIG_SMP
				148	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
				149	#else
				150	preempt_disable();
				151	mnt->mnt_count += n;
				152	preempt_enable();
				153	#endif
				154	}
				155
				156	/*
				157	* vfsmount lock must be held for write
				158	*/
				159	int mnt_get_count(struct mount *mnt)
				160	{
				161	#ifdef CONFIG_SMP
				162	int count = 0;
				163	int cpu;
				164
				165	for_each_possible_cpu(cpu) {
				166	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
				167	}
				168
				169	return count;
				170	#else
				171	return mnt->mnt_count;
				172	#endif
				173	}
				174
				175	static struct mount alloc_vfsmnt(const char name)
				176	{
				177	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
				178	if (mnt) {
				179	int err;
				180
				181	err = mnt_alloc_id(mnt);
				182	if (err)
				183	goto out_free_cache;
				184
				185	if (name) {
				186	mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
				187	if (!mnt->mnt_devname)
				188	goto out_free_id;
				189	}
				190
				191	#ifdef CONFIG_SMP
				192	mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
				193	if (!mnt->mnt_pcp)
				194	goto out_free_devname;
				195
				196	this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
				197	#else
				198	mnt->mnt_count = 1;
				199	mnt->mnt_writers = 0;
				200	#endif
				201
				202	INIT_HLIST_NODE(&mnt->mnt_hash);
				203	INIT_LIST_HEAD(&mnt->mnt_child);
				204	INIT_LIST_HEAD(&mnt->mnt_mounts);
				205	INIT_LIST_HEAD(&mnt->mnt_list);
				206	INIT_LIST_HEAD(&mnt->mnt_expire);
				207	INIT_LIST_HEAD(&mnt->mnt_share);
				208	INIT_LIST_HEAD(&mnt->mnt_slave_list);
				209	INIT_LIST_HEAD(&mnt->mnt_slave);
				210	INIT_HLIST_NODE(&mnt->mnt_mp_list);
				211	INIT_LIST_HEAD(&mnt->mnt_umounting);
				212	INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
				213	}
				214	return mnt;
				215
				216	#ifdef CONFIG_SMP
				217	out_free_devname:
				218	kfree_const(mnt->mnt_devname);
				219	#endif
				220	out_free_id:
				221	mnt_free_id(mnt);
				222	out_free_cache:
				223	kmem_cache_free(mnt_cache, mnt);
				224	return NULL;
				225	}
				226
				227	/*
				228	* Most r/o checks on a fs are for operations that take
				229	* discrete amounts of time, like a write() or unlink().
				230	* We must keep track of when those operations start
				231	* (for permission checks) and when they end, so that
				232	* we can determine when writes are able to occur to
				233	* a filesystem.
				234	*/
				235	/*
				236	* __mnt_is_readonly: check whether a mount is read-only
				237	* @mnt: the mount to check for its write status
				238	*
				239	* This shouldn't be used directly outside of the VFS.
				240	* It does not guarantee that the filesystem will stay
				241	* r/w, just that it is right now. This can not and
				242	* should not be used in place of IS_RDONLY(inode).
				243	* mnt_want/drop_write() will _keep_ the filesystem
				244	* r/w.
				245	*/
				246	bool __mnt_is_readonly(struct vfsmount *mnt)
				247	{
				248	return (mnt->mnt_flags & MNT_READONLY) \|\| sb_rdonly(mnt->mnt_sb);
				249	}
				250	EXPORT_SYMBOL_GPL(__mnt_is_readonly);
				251
				252	static inline void mnt_inc_writers(struct mount *mnt)
				253	{
				254	#ifdef CONFIG_SMP
				255	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
				256	#else
				257	mnt->mnt_writers++;
				258	#endif
				259	}
				260
				261	static inline void mnt_dec_writers(struct mount *mnt)
				262	{
				263	#ifdef CONFIG_SMP
				264	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
				265	#else
				266	mnt->mnt_writers--;
				267	#endif
				268	}
				269
				270	static unsigned int mnt_get_writers(struct mount *mnt)
				271	{
				272	#ifdef CONFIG_SMP
				273	unsigned int count = 0;
				274	int cpu;
				275
				276	for_each_possible_cpu(cpu) {
				277	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
				278	}
				279
				280	return count;
				281	#else
				282	return mnt->mnt_writers;
				283	#endif
				284	}
				285
				286	static int mnt_is_readonly(struct vfsmount *mnt)
				287	{
				288	if (mnt->mnt_sb->s_readonly_remount)
				289	return 1;
				290	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
				291	smp_rmb();
				292	return __mnt_is_readonly(mnt);
				293	}
				294
				295	/*
				296	* Most r/o & frozen checks on a fs are for operations that take discrete
				297	* amounts of time, like a write() or unlink(). We must keep track of when
				298	* those operations start (for permission checks) and when they end, so that we
				299	* can determine when writes are able to occur to a filesystem.
				300	*/
				301	/**
				302	* __mnt_want_write - get write access to a mount without freeze protection
				303	* @m: the mount on which to take a write
				304	*
				305	* This tells the low-level filesystem that a write is about to be performed to
				306	* it, and makes sure that writes are allowed (mnt is read-write) before
				307	* returning success. This operation does not protect against filesystem being
				308	* frozen. When the write operation is finished, __mnt_drop_write() must be
				309	* called. This is effectively a refcount.
				310	*/
				311	int __mnt_want_write(struct vfsmount *m)
				312	{
				313	struct mount *mnt = real_mount(m);
				314	int ret = 0;
				315
				316	preempt_disable();
				317	mnt_inc_writers(mnt);
				318	/*
				319	* The store to mnt_inc_writers must be visible before we pass
				320	* MNT_WRITE_HOLD loop below, so that the slowpath can see our
				321	* incremented count after it has set MNT_WRITE_HOLD.
				322	*/
				323	smp_mb();
				324	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
				325	cpu_relax();
				326	/*
				327	* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
				328	* be set to match its requirements. So we must not load that until
				329	* MNT_WRITE_HOLD is cleared.
				330	*/
				331	smp_rmb();
				332	if (mnt_is_readonly(m)) {
				333	mnt_dec_writers(mnt);
				334	ret = -EROFS;
				335	}
				336	preempt_enable();
				337
				338	return ret;
				339	}
				340
				341	/**
				342	* mnt_want_write - get write access to a mount
				343	* @m: the mount on which to take a write
				344	*
				345	* This tells the low-level filesystem that a write is about to be performed to
				346	* it, and makes sure that writes are allowed (mount is read-write, filesystem
				347	* is not frozen) before returning success. When the write operation is
				348	* finished, mnt_drop_write() must be called. This is effectively a refcount.
				349	*/
				350	int mnt_want_write(struct vfsmount *m)
				351	{
				352	int ret;
				353
				354	sb_start_write(m->mnt_sb);
				355	ret = __mnt_want_write(m);
				356	if (ret)
				357	sb_end_write(m->mnt_sb);
				358	return ret;
				359	}
				360	EXPORT_SYMBOL_GPL(mnt_want_write);
				361
				362	/**
				363	* mnt_clone_write - get write access to a mount
				364	* @mnt: the mount on which to take a write
				365	*
				366	* This is effectively like mnt_want_write, except
				367	* it must only be used to take an extra write reference
				368	* on a mountpoint that we already know has a write reference
				369	* on it. This allows some optimisation.
				370	*
				371	* After finished, mnt_drop_write must be called as usual to
				372	* drop the reference.
				373	*/
				374	int mnt_clone_write(struct vfsmount *mnt)
				375	{
				376	/* superblock may be r/o */
				377	if (__mnt_is_readonly(mnt))
				378	return -EROFS;
				379	preempt_disable();
				380	mnt_inc_writers(real_mount(mnt));
				381	preempt_enable();
				382	return 0;
				383	}
				384	EXPORT_SYMBOL_GPL(mnt_clone_write);
				385
				386	/**
				387	* __mnt_want_write_file - get write access to a file's mount
				388	* @file: the file whose mount on which to take a write
				389	*
				390	* This is like __mnt_want_write, but it takes a file and can
				391	* do some optimisations if the file is open for write already
				392	*/
				393	int __mnt_want_write_file(struct file *file)
				394	{
				395	if (!(file->f_mode & FMODE_WRITER))
				396	return __mnt_want_write(file->f_path.mnt);
				397	else
				398	return mnt_clone_write(file->f_path.mnt);
				399	}
				400
				401	/**
				402	* mnt_want_write_file - get write access to a file's mount
				403	* @file: the file whose mount on which to take a write
				404	*
				405	* This is like mnt_want_write, but it takes a file and can
				406	* do some optimisations if the file is open for write already
				407	*/
				408	int mnt_want_write_file(struct file *file)
				409	{
				410	int ret;
				411
				412	sb_start_write(file_inode(file)->i_sb);
				413	ret = __mnt_want_write_file(file);
				414	if (ret)
				415	sb_end_write(file_inode(file)->i_sb);
				416	return ret;
				417	}
				418	EXPORT_SYMBOL_GPL(mnt_want_write_file);
				419
				420	/**
				421	* __mnt_drop_write - give up write access to a mount
				422	* @mnt: the mount on which to give up write access
				423	*
				424	* Tells the low-level filesystem that we are done
				425	* performing writes to it. Must be matched with
				426	* __mnt_want_write() call above.
				427	*/
				/*
				 * Drop one write reference on @mnt: decrements the writer count with
				 * preemption disabled.
				 */
				void __mnt_drop_write(struct vfsmount *mnt)
				{
					preempt_disable();

					mnt_dec_writers(real_mount(mnt));

					preempt_enable();
				}
				434
				435	/**
				436	* mnt_drop_write - give up write access to a mount
				437	* @mnt: the mount on which to give up write access
				438	*
				439	* Tells the low-level filesystem that we are done performing writes to it and
				440	* also allows filesystem to be frozen again. Must be matched with
				441	* mnt_want_write() call above.
				442	*/
				443	void mnt_drop_write(struct vfsmount *mnt)
				444	{
				445	__mnt_drop_write(mnt);
				446	sb_end_write(mnt->mnt_sb);
				447	}
				448	EXPORT_SYMBOL_GPL(mnt_drop_write);
				449
				450	void __mnt_drop_write_file(struct file *file)
				451	{
				452	__mnt_drop_write(file->f_path.mnt);
				453	}
				454
				455	void mnt_drop_write_file(struct file *file)
				456	{
				457	__mnt_drop_write_file(file);
				458	sb_end_write(file_inode(file)->i_sb);
				459	}
				460	EXPORT_SYMBOL(mnt_drop_write_file);
				461
				462	static int mnt_make_readonly(struct mount *mnt)
				463	{
				464	int ret = 0;
				465
				466	lock_mount_hash();
				467	mnt->mnt.mnt_flags \|= MNT_WRITE_HOLD;
				468	/*
				469	* After storing MNT_WRITE_HOLD, we'll read the counters. This store
				470	* should be visible before we do.
				471	*/
				472	smp_mb();
				473
				474	/*
				475	* With writers on hold, if this value is zero, then there are
				476	* definitely no active writers (although held writers may subsequently
				477	* increment the count, they'll have to wait, and decrement it after
				478	* seeing MNT_READONLY).
				479	*
				480	* It is OK to have counter incremented on one CPU and decremented on
				481	* another: the sum will add up correctly. The danger would be when we
				482	* sum up each counter, if we read a counter before it is incremented,
				483	* but then read another CPU's count which it has been subsequently
				484	* decremented from -- we would see more decrements than we should.
				485	* MNT_WRITE_HOLD protects against this scenario, because
				486	* mnt_want_write first increments count, then smp_mb, then spins on
				487	* MNT_WRITE_HOLD, so it can't be decremented by another CPU while
				488	* we're counting up here.
				489	*/
				490	if (mnt_get_writers(mnt) > 0)
				491	ret = -EBUSY;
				492	else
				493	mnt->mnt.mnt_flags \|= MNT_READONLY;
				494	/*
				495	* MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
				496	* that become unheld will see MNT_READONLY.
				497	*/
				498	smp_wmb();
				499	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
				500	unlock_mount_hash();
				501	return ret;
				502	}
				503
				504	static int __mnt_unmake_readonly(struct mount *mnt)
				505	{
				506	lock_mount_hash();
				507	mnt->mnt.mnt_flags &= ~MNT_READONLY;
				508	unlock_mount_hash();
				509	return 0;
				510	}
				511
				512	int sb_prepare_remount_readonly(struct super_block *sb)
				513	{
				514	struct mount *mnt;
				515	int err = 0;
				516
				517	/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
				518	if (atomic_long_read(&sb->s_remove_count))
				519	return -EBUSY;
				520
				521	lock_mount_hash();
				522	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
				523	if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
				524	mnt->mnt.mnt_flags \|= MNT_WRITE_HOLD;
				525	smp_mb();
				526	if (mnt_get_writers(mnt) > 0) {
				527	err = -EBUSY;
				528	break;
				529	}
				530	}
				531	}
				532	if (!err && atomic_long_read(&sb->s_remove_count))
				533	err = -EBUSY;
				534
				535	if (!err) {
				536	sb->s_readonly_remount = 1;
				537	smp_wmb();
				538	}
				539	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
				540	if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
				541	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
				542	}
				543	unlock_mount_hash();
				544
				545	return err;
				546	}
				547
				548	static void free_vfsmnt(struct mount *mnt)
				549	{
				550	kfree_const(mnt->mnt_devname);
				551	#ifdef CONFIG_SMP
				552	free_percpu(mnt->mnt_pcp);
				553	#endif
				554	kmem_cache_free(mnt_cache, mnt);
				555	}
				556
				557	static void delayed_free_vfsmnt(struct rcu_head *head)
				558	{
				559	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
				560	}
				561
				562	/* call under rcu_read_lock */
				563	int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
				564	{
				565	struct mount *mnt;
				566	if (read_seqretry(&mount_lock, seq))
				567	return 1;
				568	if (bastard == NULL)
				569	return 0;
				570	mnt = real_mount(bastard);
				571	mnt_add_count(mnt, 1);
				572	smp_mb(); // see mntput_no_expire()
				573	if (likely(!read_seqretry(&mount_lock, seq)))
				574	return 0;
				575	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
				576	mnt_add_count(mnt, -1);
				577	return 1;
				578	}
				579	lock_mount_hash();
				580	if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
				581	mnt_add_count(mnt, -1);
				582	unlock_mount_hash();
				583	return 1;
				584	}
				585	unlock_mount_hash();
				586	/* caller will mntput() */
				587	return -1;
				588	}
				589
				590	/* call under rcu_read_lock */
				591	bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
				592	{
				593	int res = __legitimize_mnt(bastard, seq);
				594	if (likely(!res))
				595	return true;
				596	if (unlikely(res < 0)) {
				597	rcu_read_unlock();
				598	mntput(bastard);
				599	rcu_read_lock();
				600	}
				601	return false;
				602	}
				603
				604	/*
				605	* find the first mount at @dentry on vfsmount @mnt.
				606	* call under rcu_read_lock()
				607	*/
				608	struct mount __lookup_mnt(struct vfsmount mnt, struct dentry *dentry)
				609	{
				610	struct hlist_head *head = m_hash(mnt, dentry);
				611	struct mount *p;
				612
				613	hlist_for_each_entry_rcu(p, head, mnt_hash)
				614	if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
				615	return p;
				616	return NULL;
				617	}
				618
				619	/*
				620	* lookup_mnt - Return the first child mount mounted at path
				621	*
				622	* "First" means first mounted chronologically. If you create the
				623	* following mounts:
				624	*
				625	* mount /dev/sda1 /mnt
				626	* mount /dev/sda2 /mnt
				627	* mount /dev/sda3 /mnt
				628	*
				629	* Then lookup_mnt() on the base /mnt dentry in the root mount will
				630	* return successively the root dentry and vfsmount of /dev/sda1, then
				631	* /dev/sda2, then /dev/sda3, then NULL.
				632	*
				633	* lookup_mnt takes a reference to the found vfsmount.
				634	*/
				635	struct vfsmount lookup_mnt(const struct path path)
				636	{
				637	struct mount *child_mnt;
				638	struct vfsmount *m;
				639	unsigned seq;
				640
				641	rcu_read_lock();
				642	do {
				643	seq = read_seqbegin(&mount_lock);
				644	child_mnt = __lookup_mnt(path->mnt, path->dentry);
				645	m = child_mnt ? &child_mnt->mnt : NULL;
				646	} while (!legitimize_mnt(m, seq));
				647	rcu_read_unlock();
				648	return m;
				649	}
				650
				651	/*
				652	* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
				653	* current mount namespace.
				654	*
				655	* The common case is dentries are not mountpoints at all and that
				656	* test is handled inline. For the slow case when we are actually
				657	* dealing with a mountpoint of some kind, walk through all of the
				658	* mounts in the current mount namespace and test to see if the dentry
				659	* is a mountpoint.
				660	*
				661	* The mount_hashtable is not usable in the context because we
				662	* need to identify all mounts that may be in the current mount
				663	* namespace not just a mount that happens to have some specified
				664	* parent mount.
				665	*/
				666	bool __is_local_mountpoint(struct dentry *dentry)
				667	{
				668	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
				669	struct mount *mnt;
				670	bool is_covered = false;
				671
				672	if (!d_mountpoint(dentry))
				673	goto out;
				674
				675	down_read(&namespace_sem);
				676	list_for_each_entry(mnt, &ns->list, mnt_list) {
				677	is_covered = (mnt->mnt_mountpoint == dentry);
				678	if (is_covered)
				679	break;
				680	}
				681	up_read(&namespace_sem);
				682	out:
				683	return is_covered;
				684	}
				685
				686	static struct mountpoint lookup_mountpoint(struct dentry dentry)
				687	{
				688	struct hlist_head *chain = mp_hash(dentry);
				689	struct mountpoint *mp;
				690
				691	hlist_for_each_entry(mp, chain, m_hash) {
				692	if (mp->m_dentry == dentry) {
				693	mp->m_count++;
				694	return mp;
				695	}
				696	}
				697	return NULL;
				698	}
				699
				700	static struct mountpoint get_mountpoint(struct dentry dentry)
				701	{
				702	struct mountpoint mp, new = NULL;
				703	int ret;
				704
				705	if (d_mountpoint(dentry)) {
				706	/* might be worth a WARN_ON() */
				707	if (d_unlinked(dentry))
				708	return ERR_PTR(-ENOENT);
				709	mountpoint:
				710	read_seqlock_excl(&mount_lock);
				711	mp = lookup_mountpoint(dentry);
				712	read_sequnlock_excl(&mount_lock);
				713	if (mp)
				714	goto done;
				715	}
				716
				717	if (!new)
				718	new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
				719	if (!new)
				720	return ERR_PTR(-ENOMEM);
				721
				722
				723	/* Exactly one processes may set d_mounted */
				724	ret = d_set_mounted(dentry);
				725
				726	/* Someone else set d_mounted? */
				727	if (ret == -EBUSY)
				728	goto mountpoint;
				729
				730	/* The dentry is not available as a mountpoint? */
				731	mp = ERR_PTR(ret);
				732	if (ret)
				733	goto done;
				734
				735	/* Add the new mountpoint to the hash table */
				736	read_seqlock_excl(&mount_lock);
				737	new->m_dentry = dget(dentry);
				738	new->m_count = 1;
				739	hlist_add_head(&new->m_hash, mp_hash(dentry));
				740	INIT_HLIST_HEAD(&new->m_list);
				741	read_sequnlock_excl(&mount_lock);
				742
				743	mp = new;
				744	new = NULL;
				745	done:
				746	kfree(new);
				747	return mp;
				748	}
				749
				750	/*
				751	* vfsmount lock must be held. Additionally, the caller is responsible
				752	* for serializing calls for given disposal list.
				753	*/
				754	static void __put_mountpoint(struct mountpoint mp, struct list_head list)
				755	{
				756	if (!--mp->m_count) {
				757	struct dentry *dentry = mp->m_dentry;
				758	BUG_ON(!hlist_empty(&mp->m_list));
				759	spin_lock(&dentry->d_lock);
				760	dentry->d_flags &= ~DCACHE_MOUNTED;
				761	spin_unlock(&dentry->d_lock);
				762	dput_to_list(dentry, list);
				763	hlist_del(&mp->m_hash);
				764	kfree(mp);
				765	}
				766	}
				767
				768	/* called with namespace_lock and vfsmount lock */
				769	static void put_mountpoint(struct mountpoint *mp)
				770	{
				771	__put_mountpoint(mp, &ex_mountpoints);
				772	}
				773
				774	static inline int check_mnt(struct mount *mnt)
				775	{
				776	return mnt->mnt_ns == current->nsproxy->mnt_ns;
				777	}
				778
				779	/*
				780	* vfsmount lock must be held for write
				781	*/
				782	static void touch_mnt_namespace(struct mnt_namespace *ns)
				783	{
				784	if (ns) {
				785	ns->event = ++event;
				786	wake_up_interruptible(&ns->poll);
				787	}
				788	}
				789
				790	/*
				791	* vfsmount lock must be held for write
				792	*/
				793	static void __touch_mnt_namespace(struct mnt_namespace *ns)
				794	{
				795	if (ns && ns->event != event) {
				796	ns->event = event;
				797	wake_up_interruptible(&ns->poll);
				798	}
				799	}
				800
				801	/*
				802	* vfsmount lock must be held for write
				803	*/
				804	static struct mountpoint unhash_mnt(struct mount mnt)
				805	{
				806	struct mountpoint *mp;
				807	mnt->mnt_parent = mnt;
				808	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
				809	list_del_init(&mnt->mnt_child);
				810	hlist_del_init_rcu(&mnt->mnt_hash);
				811	hlist_del_init(&mnt->mnt_mp_list);
				812	mp = mnt->mnt_mp;
				813	mnt->mnt_mp = NULL;
				814	return mp;
				815	}
				816
				817	/*
				818	* vfsmount lock must be held for write
				819	*/
				/*
				 * Detach @mnt with unhash_mnt() and drop the mountpoint reference it
				 * returns.
				 */
				static void umount_mnt(struct mount *mnt)
				{
					struct mountpoint *mp = unhash_mnt(mnt);

					put_mountpoint(mp);
				}
				824
				825	/*
				826	* vfsmount lock must be held for write
				827	*/
				828	void mnt_set_mountpoint(struct mount *mnt,
				829	struct mountpoint *mp,
				830	struct mount *child_mnt)
				831	{
				832	mp->m_count++;
				833	mnt_add_count(mnt, 1); /* essentially, that's mntget */
				834	child_mnt->mnt_mountpoint = mp->m_dentry;
				835	child_mnt->mnt_parent = mnt;
				836	child_mnt->mnt_mp = mp;
				837	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
				838	}
				839
				840	static void __attach_mnt(struct mount mnt, struct mount parent)
				841	{
				842	hlist_add_head_rcu(&mnt->mnt_hash,
				843	m_hash(&parent->mnt, mnt->mnt_mountpoint));
				844	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
				845	}
				846
				847	/*
				848	* vfsmount lock must be held for write
				849	*/
				/*
				 * Attach @mnt to @parent at @mp: first record the mountpoint linkage,
				 * then hash the mount and add it to the parent's children.
				 */
				static void attach_mnt(struct mount *mnt,
						       struct mount *parent,
						       struct mountpoint *mp)
				{
					mnt_set_mountpoint(parent, mp, mnt);

					__attach_mnt(mnt, parent);
				}
				857
				858	void mnt_change_mountpoint(struct mount parent, struct mountpoint mp, struct mount *mnt)
				859	{
				860	struct mountpoint *old_mp = mnt->mnt_mp;
				861	struct mount *old_parent = mnt->mnt_parent;
				862
				863	list_del_init(&mnt->mnt_child);
				864	hlist_del_init(&mnt->mnt_mp_list);
				865	hlist_del_init_rcu(&mnt->mnt_hash);
				866
				867	attach_mnt(mnt, parent, mp);
				868
				869	put_mountpoint(old_mp);
				870	mnt_add_count(old_parent, -1);
				871	}
				872
				873	/*
				874	* vfsmount lock must be held for write
				875	*/
				876	static void commit_tree(struct mount *mnt)
				877	{
				878	struct mount *parent = mnt->mnt_parent;
				879	struct mount *m;
				880	LIST_HEAD(head);
				881	struct mnt_namespace *n = parent->mnt_ns;
				882
				883	BUG_ON(parent == mnt);
				884
				885	list_add_tail(&head, &mnt->mnt_list);
				886	list_for_each_entry(m, &head, mnt_list)
				887	m->mnt_ns = n;
				888
				889	list_splice(&head, n->list.prev);
				890
				891	n->mounts += n->pending_mounts;
				892	n->pending_mounts = 0;
				893
				894	__attach_mnt(mnt, parent);
				895	touch_mnt_namespace(n);
				896	}
				897
				898	static struct mount next_mnt(struct mount p, struct mount *root)
				899	{
				900	struct list_head *next = p->mnt_mounts.next;
				901	if (next == &p->mnt_mounts) {
				902	while (1) {
				903	if (p == root)
				904	return NULL;
				905	next = p->mnt_child.next;
				906	if (next != &p->mnt_parent->mnt_mounts)
				907	break;
				908	p = p->mnt_parent;
				909	}
				910	}
				911	return list_entry(next, struct mount, mnt_child);
				912	}
				913
				914	static struct mount skip_mnt_tree(struct mount p)
				915	{
				916	struct list_head *prev = p->mnt_mounts.prev;
				917	while (prev != &p->mnt_mounts) {
				918	p = list_entry(prev, struct mount, mnt_child);
				919	prev = p->mnt_mounts.prev;
				920	}
				921	return p;
				922	}
				923
				924	/**
				925	* vfs_create_mount - Create a mount for a configured superblock
				926	* @fc: The configuration context with the superblock attached
				927	*
				928	* Create a mount to an already configured superblock. If necessary, the
				929	* caller should invoke vfs_get_tree() before calling this.
				930	*
				931	* Note that this does not attach the mount to anything.
				932	*/
				933	struct vfsmount vfs_create_mount(struct fs_context fc)
				934	{
				935	struct mount *mnt;
				936
				937	if (!fc->root)
				938	return ERR_PTR(-EINVAL);
				939
				940	mnt = alloc_vfsmnt(fc->source ?: "none");
				941	if (!mnt)
				942	return ERR_PTR(-ENOMEM);
				943
				944	if (fc->sb_flags & SB_KERNMOUNT)
				945	mnt->mnt.mnt_flags = MNT_INTERNAL;
				946
				947	atomic_inc(&fc->root->d_sb->s_active);
				948	mnt->mnt.mnt_sb = fc->root->d_sb;
				949	mnt->mnt.mnt_root = dget(fc->root);
				950	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
				951	mnt->mnt_parent = mnt;
				952
				953	lock_mount_hash();
				954	list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
				955	unlock_mount_hash();
				956	return &mnt->mnt;
				957	}
				958	EXPORT_SYMBOL(vfs_create_mount);
				959
				960	struct vfsmount fc_mount(struct fs_context fc)
				961	{
				962	int err = vfs_get_tree(fc);
				963	if (!err) {
				964	up_write(&fc->root->d_sb->s_umount);
				965	return vfs_create_mount(fc);
				966	}
				967	return ERR_PTR(err);
				968	}
				969	EXPORT_SYMBOL(fc_mount);
				970
				971	struct vfsmount vfs_kern_mount(struct file_system_type type,
				972	int flags, const char *name,
				973	void *data)
				974	{
				975	struct fs_context *fc;
				976	struct vfsmount *mnt;
				977	int ret = 0;
				978
				979	if (!type)
				980	return ERR_PTR(-EINVAL);
				981
				982	fc = fs_context_for_mount(type, flags);
				983	if (IS_ERR(fc))
				984	return ERR_CAST(fc);
				985
				986	if (name)
				987	ret = vfs_parse_fs_string(fc, "source",
				988	name, strlen(name));
				989	if (!ret)
				990	ret = parse_monolithic_mount_data(fc, data);
				991	if (!ret)
				992	mnt = fc_mount(fc);
				993	else
				994	mnt = ERR_PTR(ret);
				995
				996	put_fs_context(fc);
				997	return mnt;
				998	}
				999	EXPORT_SYMBOL_GPL(vfs_kern_mount);
				1000
				1001	struct vfsmount *
				1002	vfs_submount(const struct dentry mountpoint, struct file_system_type type,
				1003	const char name, void data)
				1004	{
				1005	/* Until it is worked out how to pass the user namespace
				1006	* through from the parent mount to the submount don't support
				1007	* unprivileged mounts with submounts.
				1008	*/
				1009	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
				1010	return ERR_PTR(-EPERM);
				1011
				1012	return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
				1013	}
				1014	EXPORT_SYMBOL_GPL(vfs_submount);
				1015
				1016	static struct mount clone_mnt(struct mount old, struct dentry *root,
				1017	int flag)
				1018	{
				1019	struct super_block *sb = old->mnt.mnt_sb;
				1020	struct mount *mnt;
				1021	int err;
				1022
				1023	mnt = alloc_vfsmnt(old->mnt_devname);
				1024	if (!mnt)
				1025	return ERR_PTR(-ENOMEM);
				1026
				1027	if (flag & (CL_SLAVE \| CL_PRIVATE \| CL_SHARED_TO_SLAVE))
				1028	mnt->mnt_group_id = 0; /* not a peer of original */
				1029	else
				1030	mnt->mnt_group_id = old->mnt_group_id;
				1031
				1032	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
				1033	err = mnt_alloc_group_id(mnt);
				1034	if (err)
				1035	goto out_free;
				1036	}
				1037
				1038	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
				1039	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD\|MNT_MARKED\|MNT_INTERNAL);
				1040
				1041	atomic_inc(&sb->s_active);
				1042	mnt->mnt.mnt_sb = sb;
				1043	mnt->mnt.mnt_root = dget(root);
				1044	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
				1045	mnt->mnt_parent = mnt;
				1046	lock_mount_hash();
				1047	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
				1048	unlock_mount_hash();
				1049
				1050	if ((flag & CL_SLAVE) \|\|
				1051	((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
				1052	list_add(&mnt->mnt_slave, &old->mnt_slave_list);
				1053	mnt->mnt_master = old;
				1054	CLEAR_MNT_SHARED(mnt);
				1055	} else if (!(flag & CL_PRIVATE)) {
				1056	if ((flag & CL_MAKE_SHARED) \|\| IS_MNT_SHARED(old))
				1057	list_add(&mnt->mnt_share, &old->mnt_share);
				1058	if (IS_MNT_SLAVE(old))
				1059	list_add(&mnt->mnt_slave, &old->mnt_slave);
				1060	mnt->mnt_master = old->mnt_master;
				1061	} else {
				1062	CLEAR_MNT_SHARED(mnt);
				1063	}
				1064	if (flag & CL_MAKE_SHARED)
				1065	set_mnt_shared(mnt);
				1066
				1067	/* stick the duplicate mount on the same expiry list
				1068	* as the original if that was on one */
				1069	if (flag & CL_EXPIRE) {
				1070	if (!list_empty(&old->mnt_expire))
				1071	list_add(&mnt->mnt_expire, &old->mnt_expire);
				1072	}
				1073
				1074	return mnt;
				1075
				1076	out_free:
				1077	mnt_free_id(mnt);
				1078	free_vfsmnt(mnt);
				1079	return ERR_PTR(err);
				1080	}
				1081
				1082	static void cleanup_mnt(struct mount *mnt)
				1083	{
				1084	struct hlist_node *p;
				1085	struct mount *m;
				1086	/*
				1087	* The warning here probably indicates that somebody messed
				1088	* up a mnt_want/drop_write() pair. If this happens, the
				1089	* filesystem was probably unable to make r/w->r/o transitions.
				1090	* The locking used to deal with mnt_count decrement provides barriers,
				1091	* so mnt_get_writers() below is safe.
				1092	*/
				1093	WARN_ON(mnt_get_writers(mnt));
				1094	if (unlikely(mnt->mnt_pins.first))
				1095	mnt_pin_kill(mnt);
				1096	hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
				1097	hlist_del(&m->mnt_umount);
				1098	mntput(&m->mnt);
				1099	}
				1100	fsnotify_vfsmount_delete(&mnt->mnt);
				1101	dput(mnt->mnt.mnt_root);
				1102	deactivate_super(mnt->mnt.mnt_sb);
				1103	mnt_free_id(mnt);
				1104	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
				1105	}
				1106
				1107	static void __cleanup_mnt(struct rcu_head *head)
				1108	{
				1109	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
				1110	}
				1111
				1112	static LLIST_HEAD(delayed_mntput_list);
				1113	static void delayed_mntput(struct work_struct *unused)
				1114	{
				1115	struct llist_node *node = llist_del_all(&delayed_mntput_list);
				1116	struct mount m, t;
				1117
				1118	llist_for_each_entry_safe(m, t, node, mnt_llist)
				1119	cleanup_mnt(m);
				1120	}
				1121	static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
				1122
				1123	static void mntput_no_expire(struct mount *mnt)
				1124	{
				1125	LIST_HEAD(list);
				1126	int count;
				1127
				1128	rcu_read_lock();
				1129	if (likely(READ_ONCE(mnt->mnt_ns))) {
				1130	/*
				1131	* Since we don't do lock_mount_hash() here,
				1132	* ->mnt_ns can change under us. However, if it's
				1133	* non-NULL, then there's a reference that won't
				1134	* be dropped until after an RCU delay done after
				1135	* turning ->mnt_ns NULL. So if we observe it
				1136	* non-NULL under rcu_read_lock(), the reference
				1137	* we are dropping is not the final one.
				1138	*/
				1139	mnt_add_count(mnt, -1);
				1140	rcu_read_unlock();
				1141	return;
				1142	}
				1143	lock_mount_hash();
				1144	/*
				1145	* make sure that if __legitimize_mnt() has not seen us grab
				1146	* mount_lock, we'll see their refcount increment here.
				1147	*/
				1148	smp_mb();
				1149	mnt_add_count(mnt, -1);
				1150	count = mnt_get_count(mnt);
				1151	if (count != 0) {
				1152	WARN_ON(count < 0);
				1153	rcu_read_unlock();
				1154	unlock_mount_hash();
				1155	return;
				1156	}
				1157	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
				1158	rcu_read_unlock();
				1159	unlock_mount_hash();
				1160	return;
				1161	}
				1162	mnt->mnt.mnt_flags \|= MNT_DOOMED;
				1163	rcu_read_unlock();
				1164
				1165	list_del(&mnt->mnt_instance);
				1166
				1167	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
				1168	struct mount p, tmp;
				1169	list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
				1170	__put_mountpoint(unhash_mnt(p), &list);
				1171	hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
				1172	}
				1173	}
				1174	unlock_mount_hash();
				1175	shrink_dentry_list(&list);
				1176
				1177	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
				1178	struct task_struct *task = current;
				1179	if (likely(!(task->flags & PF_KTHREAD))) {
				1180	init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
				1181	if (!task_work_add(task, &mnt->mnt_rcu, true))
				1182	return;
				1183	}
				1184	if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
				1185	schedule_delayed_work(&delayed_mntput_work, 1);
				1186	return;
				1187	}
				1188	cleanup_mnt(mnt);
				1189	}
				1190
				1191	void mntput(struct vfsmount *mnt)
				1192	{
				1193	if (mnt) {
				1194	struct mount *m = real_mount(mnt);
				1195	/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
				1196	if (unlikely(m->mnt_expiry_mark))
				1197	m->mnt_expiry_mark = 0;
				1198	mntput_no_expire(m);
				1199	}
				1200	}
				1201	EXPORT_SYMBOL(mntput);
				1202
				1203	struct vfsmount mntget(struct vfsmount mnt)
				1204	{
				1205	if (mnt)
				1206	mnt_add_count(real_mount(mnt), 1);
				1207	return mnt;
				1208	}
				1209	EXPORT_SYMBOL(mntget);
				1210
				1211	/* path_is_mountpoint() - Check if path is a mount in the current
				1212	* namespace.
				1213	*
				1214	* d_mountpoint() can only be used reliably to establish if a dentry is
				1215	* not mounted in any namespace and that common case is handled inline.
				1216	* d_mountpoint() isn't aware of the possibility there may be multiple
				1217	* mounts using a given dentry in a different namespace. This function
				1218	* checks if the passed in path is a mountpoint rather than the dentry
				1219	* alone.
				1220	*/
				1221	bool path_is_mountpoint(const struct path *path)
				1222	{
				1223	unsigned seq;
				1224	bool res;
				1225
				1226	if (!d_mountpoint(path->dentry))
				1227	return false;
				1228
				1229	rcu_read_lock();
				1230	do {
				1231	seq = read_seqbegin(&mount_lock);
				1232	res = __path_is_mountpoint(path);
				1233	} while (read_seqretry(&mount_lock, seq));
				1234	rcu_read_unlock();
				1235
				1236	return res;
				1237	}
				1238	EXPORT_SYMBOL(path_is_mountpoint);
				1239
				1240	struct vfsmount mnt_clone_internal(const struct path path)
				1241	{
				1242	struct mount *p;
				1243	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
				1244	if (IS_ERR(p))
				1245	return ERR_CAST(p);
				1246	p->mnt.mnt_flags \|= MNT_INTERNAL;
				1247	return &p->mnt;
				1248	}
				1249
				1250	#ifdef CONFIG_PROC_FS
				1251	/* iterator; we want it to have access to namespace_sem, thus here... */
				1252	static void m_start(struct seq_file m, loff_t *pos)
				1253	{
				1254	struct proc_mounts *p = m->private;
				1255
				1256	down_read(&namespace_sem);
				1257	if (p->cached_event == p->ns->event) {
				1258	void *v = p->cached_mount;
				1259	if (*pos == p->cached_index)
				1260	return v;
				1261	if (*pos == p->cached_index + 1) {
				1262	v = seq_list_next(v, &p->ns->list, &p->cached_index);
				1263	return p->cached_mount = v;
				1264	}
				1265	}
				1266
				1267	p->cached_event = p->ns->event;
				1268	p->cached_mount = seq_list_start(&p->ns->list, *pos);
				1269	p->cached_index = *pos;
				1270	return p->cached_mount;
				1271	}
				1272
				1273	static void m_next(struct seq_file m, void v, loff_t pos)
				1274	{
				1275	struct proc_mounts *p = m->private;
				1276
				1277	p->cached_mount = seq_list_next(v, &p->ns->list, pos);
				1278	p->cached_index = *pos;
				1279	return p->cached_mount;
				1280	}
				1281
				1282	static void m_stop(struct seq_file m, void v)
				1283	{
				1284	up_read(&namespace_sem);
				1285	}
				1286
				1287	static int m_show(struct seq_file m, void v)
				1288	{
				1289	struct proc_mounts *p = m->private;
				1290	struct mount *r = list_entry(v, struct mount, mnt_list);
				1291	return p->show(m, &r->mnt);
				1292	}
				1293
				1294	const struct seq_operations mounts_op = {
				1295	.start = m_start,
				1296	.next = m_next,
				1297	.stop = m_stop,
				1298	.show = m_show,
				1299	};
				1300	#endif /* CONFIG_PROC_FS */
				1301
				1302	/**
				1303	* may_umount_tree - check if a mount tree is busy
				1304	* @mnt: root of mount tree
				1305	*
				1306	* This is called to check if a tree of mounts has any
				1307	* open files, pwds, chroots or sub mounts that are
				1308	* busy.
				1309	*/
				1310	int may_umount_tree(struct vfsmount *m)
				1311	{
				1312	struct mount *mnt = real_mount(m);
				1313	int actual_refs = 0;
				1314	int minimum_refs = 0;
				1315	struct mount *p;
				1316	BUG_ON(!m);
				1317
				1318	/* write lock needed for mnt_get_count */
				1319	lock_mount_hash();
				1320	for (p = mnt; p; p = next_mnt(p, mnt)) {
				1321	actual_refs += mnt_get_count(p);
				1322	minimum_refs += 2;
				1323	}
				1324	unlock_mount_hash();
				1325
				1326	if (actual_refs > minimum_refs)
				1327	return 0;
				1328
				1329	return 1;
				1330	}
				1331
				1332	EXPORT_SYMBOL(may_umount_tree);
				1333
				1334	/**
				1335	* may_umount - check if a mount point is busy
				1336	* @mnt: root of mount
				1337	*
				1338	* This is called to check if a mount point has any
				1339	* open files, pwds, chroots or sub mounts. If the
				1340	* mount has sub mounts this will return busy
				1341	* regardless of whether the sub mounts are busy.
				1342	*
				1343	* Doesn't take quota and stuff into account. IOW, in some cases it will
				1344	* give false negatives. The main reason why it's here is that we need
				1345	* a non-destructive way to look for easily umountable filesystems.
				1346	*/
				1347	int may_umount(struct vfsmount *mnt)
				1348	{
				1349	int ret = 1;
				1350	down_read(&namespace_sem);
				1351	lock_mount_hash();
				1352	if (propagate_mount_busy(real_mount(mnt), 2))
				1353	ret = 0;
				1354	unlock_mount_hash();
				1355	up_read(&namespace_sem);
				1356	return ret;
				1357	}
				1358
				1359	EXPORT_SYMBOL(may_umount);
				1360
				1361	static void namespace_unlock(void)
				1362	{
				1363	struct hlist_head head;
				1364	struct hlist_node *p;
				1365	struct mount *m;
				1366	LIST_HEAD(list);
				1367
				1368	hlist_move_list(&unmounted, &head);
				1369	list_splice_init(&ex_mountpoints, &list);
				1370
				1371	up_write(&namespace_sem);
				1372
				1373	shrink_dentry_list(&list);
				1374
				1375	if (likely(hlist_empty(&head)))
				1376	return;
				1377
				1378	synchronize_rcu_expedited();
				1379
				1380	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
				1381	hlist_del(&m->mnt_umount);
				1382	mntput(&m->mnt);
				1383	}
				1384	}
				1385
				1386	static inline void namespace_lock(void)
				1387	{
				1388	down_write(&namespace_sem);
				1389	}
				1390
				1391	enum umount_tree_flags {
				1392	UMOUNT_SYNC = 1,
				1393	UMOUNT_PROPAGATE = 2,
				1394	UMOUNT_CONNECTED = 4,
				1395	};
				1396
				1397	static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
				1398	{
				1399	/* Leaving mounts connected is only valid for lazy umounts */
				1400	if (how & UMOUNT_SYNC)
				1401	return true;
				1402
				1403	/* A mount without a parent has nothing to be connected to */
				1404	if (!mnt_has_parent(mnt))
				1405	return true;
				1406
				1407	/* Because the reference counting rules change when mounts are
				1408	* unmounted and connected, umounted mounts may not be
				1409	* connected to mounted mounts.
				1410	*/
				1411	if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
				1412	return true;
				1413
				1414	/* Has it been requested that the mount remain connected? */
				1415	if (how & UMOUNT_CONNECTED)
				1416	return false;
				1417
				1418	/* Is the mount locked such that it needs to remain connected? */
				1419	if (IS_MNT_LOCKED(mnt))
				1420	return false;
				1421
				1422	/* By default disconnect the mount */
				1423	return true;
				1424	}
				1425
				1426	/*
				1427	* mount_lock must be held
				1428	* namespace_sem must be held for write
				1429	*/
				1430	static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
				1431	{
				1432	LIST_HEAD(tmp_list);
				1433	struct mount *p;
				1434
				1435	if (how & UMOUNT_PROPAGATE)
				1436	propagate_mount_unlock(mnt);
				1437
				1438	/* Gather the mounts to umount */
				1439	for (p = mnt; p; p = next_mnt(p, mnt)) {
				1440	p->mnt.mnt_flags \|= MNT_UMOUNT;
				1441	list_move(&p->mnt_list, &tmp_list);
				1442	}
				1443
				1444	/* Hide the mounts from mnt_mounts */
				1445	list_for_each_entry(p, &tmp_list, mnt_list) {
				1446	list_del_init(&p->mnt_child);
				1447	}
				1448
				1449	/* Add propogated mounts to the tmp_list */
				1450	if (how & UMOUNT_PROPAGATE)
				1451	propagate_umount(&tmp_list);
				1452
				1453	while (!list_empty(&tmp_list)) {
				1454	struct mnt_namespace *ns;
				1455	bool disconnect;
				1456	p = list_first_entry(&tmp_list, struct mount, mnt_list);
				1457	list_del_init(&p->mnt_expire);
				1458	list_del_init(&p->mnt_list);
				1459	ns = p->mnt_ns;
				1460	if (ns) {
				1461	ns->mounts--;
				1462	__touch_mnt_namespace(ns);
				1463	}
				1464	p->mnt_ns = NULL;
				1465	if (how & UMOUNT_SYNC)
				1466	p->mnt.mnt_flags \|= MNT_SYNC_UMOUNT;
				1467
				1468	disconnect = disconnect_mount(p, how);
				1469	if (mnt_has_parent(p)) {
				1470	mnt_add_count(p->mnt_parent, -1);
				1471	if (!disconnect) {
				1472	/* Don't forget about p */
				1473	list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
				1474	} else {
				1475	umount_mnt(p);
				1476	}
				1477	}
				1478	change_mnt_propagation(p, MS_PRIVATE);
				1479	if (disconnect)
				1480	hlist_add_head(&p->mnt_umount, &unmounted);
				1481	}
				1482	}
				1483
				1484	static void shrink_submounts(struct mount *mnt);
				1485
				1486	static int do_umount_root(struct super_block *sb)
				1487	{
				1488	int ret = 0;
				1489
				1490	down_write(&sb->s_umount);
				1491	if (!sb_rdonly(sb)) {
				1492	struct fs_context *fc;
				1493
				1494	fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
				1495	SB_RDONLY);
				1496	if (IS_ERR(fc)) {
				1497	ret = PTR_ERR(fc);
				1498	} else {
				1499	ret = parse_monolithic_mount_data(fc, NULL);
				1500	if (!ret)
				1501	ret = reconfigure_super(fc);
				1502	put_fs_context(fc);
				1503	}
				1504	}
				1505	up_write(&sb->s_umount);
				1506	return ret;
				1507	}
				1508
				1509	static int do_umount(struct mount *mnt, int flags)
				1510	{
				1511	struct super_block *sb = mnt->mnt.mnt_sb;
				1512	int retval;
				1513
				1514	retval = security_sb_umount(&mnt->mnt, flags);
				1515	if (retval)
				1516	return retval;
				1517
				1518	/*
				1519	* Allow userspace to request a mountpoint be expired rather than
				1520	* unmounting unconditionally. Unmount only happens if:
				1521	* (1) the mark is already set (the mark is cleared by mntput())
				1522	* (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
				1523	*/
				1524	if (flags & MNT_EXPIRE) {
				1525	if (&mnt->mnt == current->fs->root.mnt \|\|
				1526	flags & (MNT_FORCE \| MNT_DETACH))
				1527	return -EINVAL;
				1528
				1529	/*
				1530	* probably don't strictly need the lock here if we examined
				1531	* all race cases, but it's a slowpath.
				1532	*/
				1533	lock_mount_hash();
				1534	if (mnt_get_count(mnt) != 2) {
				1535	unlock_mount_hash();
				1536	return -EBUSY;
				1537	}
				1538	unlock_mount_hash();
				1539
				1540	if (!xchg(&mnt->mnt_expiry_mark, 1))
				1541	return -EAGAIN;
				1542	}
				1543
				1544	/*
				1545	* If we may have to abort operations to get out of this
				1546	* mount, and they will themselves hold resources we must
				1547	* allow the fs to do things. In the Unix tradition of
				1548	* 'Gee thats tricky lets do it in userspace' the umount_begin
				1549	* might fail to complete on the first run through as other tasks
				1550	* must return, and the like. Thats for the mount program to worry
				1551	* about for the moment.
				1552	*/
				1553
				1554	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
				1555	sb->s_op->umount_begin(sb);
				1556	}
				1557
				1558	/*
				1559	* No sense to grab the lock for this test, but test itself looks
				1560	* somewhat bogus. Suggestions for better replacement?
				1561	* Ho-hum... In principle, we might treat that as umount + switch
				1562	* to rootfs. GC would eventually take care of the old vfsmount.
				1563	* Actually it makes sense, especially if rootfs would contain a
				1564	* /reboot - static binary that would close all descriptors and
				1565	* call reboot(9). Then init(8) could umount root and exec /reboot.
				1566	*/
				1567	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
				1568	/*
				1569	* Special case for "unmounting" root ...
				1570	* we just try to remount it readonly.
				1571	*/
				1572	if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
				1573	return -EPERM;
				1574	return do_umount_root(sb);
				1575	}
				1576
				1577	namespace_lock();
				1578	lock_mount_hash();
				1579
				1580	/* Recheck MNT_LOCKED with the locks held */
				1581	retval = -EINVAL;
				1582	if (mnt->mnt.mnt_flags & MNT_LOCKED)
				1583	goto out;
				1584
				1585	event++;
				1586	if (flags & MNT_DETACH) {
				1587	if (!list_empty(&mnt->mnt_list))
				1588	umount_tree(mnt, UMOUNT_PROPAGATE);
				1589	retval = 0;
				1590	} else {
				1591	shrink_submounts(mnt);
				1592	retval = -EBUSY;
				1593	if (!propagate_mount_busy(mnt, 2)) {
				1594	if (!list_empty(&mnt->mnt_list))
				1595	umount_tree(mnt, UMOUNT_PROPAGATE\|UMOUNT_SYNC);
				1596	retval = 0;
				1597	}
				1598	}
				1599	out:
				1600	unlock_mount_hash();
				1601	namespace_unlock();
				1602	return retval;
				1603	}
				1604
				1605	/*
				1606	* __detach_mounts - lazily unmount all mounts on the specified dentry
				1607	*
				1608	* During unlink, rmdir, and d_drop it is possible to loose the path
				1609	* to an existing mountpoint, and wind up leaking the mount.
				1610	* detach_mounts allows lazily unmounting those mounts instead of
				1611	* leaking them.
				1612	*
				1613	* The caller may hold dentry->d_inode->i_mutex.
				1614	*/
				1615	void __detach_mounts(struct dentry *dentry)
				1616	{
				1617	struct mountpoint *mp;
				1618	struct mount *mnt;
				1619
				1620	namespace_lock();
				1621	lock_mount_hash();
				1622	mp = lookup_mountpoint(dentry);
				1623	if (!mp)
				1624	goto out_unlock;
				1625
				1626	event++;
				1627	while (!hlist_empty(&mp->m_list)) {
				1628	mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
				1629	if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
				1630	umount_mnt(mnt);
				1631	hlist_add_head(&mnt->mnt_umount, &unmounted);
				1632	}
				1633	else umount_tree(mnt, UMOUNT_CONNECTED);
				1634	}
				1635	put_mountpoint(mp);
				1636	out_unlock:
				1637	unlock_mount_hash();
				1638	namespace_unlock();
				1639	}
				1640
				1641	/*
				1642	* Is the caller allowed to modify his namespace?
				1643	*/
				1644	static inline bool may_mount(void)
				1645	{
				1646	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
				1647	}
				1648
				1649	#ifdef CONFIG_MANDATORY_FILE_LOCKING
				1650	static bool may_mandlock(void)
				1651	{
				1652	pr_warn_once("======================================================\n"
				1653	"WARNING: the mand mount option is being deprecated and\n"
				1654	" will be removed in v5.15!\n"
				1655	"======================================================\n");
				1656	return capable(CAP_SYS_ADMIN);
				1657	}
				1658	#else
				1659	static inline bool may_mandlock(void)
				1660	{
				1661	pr_warn("VFS: \"mand\" mount option not supported");
				1662	return false;
				1663	}
				1664	#endif
				1665
				1666	/*
				1667	* Now umount can handle mount points as well as block devices.
				1668	* This is important for filesystems which use unnamed block devices.
				1669	*
				1670	* We now support a flag for forced unmount like the other 'big iron'
				1671	* unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
				1672	*/
				1673
				1674	int ksys_umount(char __user *name, int flags)
				1675	{
				1676	struct path path;
				1677	struct mount *mnt;
				1678	int retval;
				1679	int lookup_flags = 0;
				1680
				1681	if (flags & ~(MNT_FORCE \| MNT_DETACH \| MNT_EXPIRE \| UMOUNT_NOFOLLOW))
				1682	return -EINVAL;
				1683
				1684	if (!may_mount())
				1685	return -EPERM;
				1686
				1687	if (!(flags & UMOUNT_NOFOLLOW))
				1688	lookup_flags \|= LOOKUP_FOLLOW;
				1689
				1690	retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
				1691	if (retval)
				1692	goto out;
				1693	mnt = real_mount(path.mnt);
				1694	retval = -EINVAL;
				1695	if (path.dentry != path.mnt->mnt_root)
				1696	goto dput_and_out;
				1697	if (!check_mnt(mnt))
				1698	goto dput_and_out;
				1699	if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
				1700	goto dput_and_out;
				1701	retval = -EPERM;
				1702	if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
				1703	goto dput_and_out;
				1704
				1705	retval = do_umount(mnt, flags);
				1706	dput_and_out:
				1707	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
				1708	dput(path.dentry);
				1709	mntput_no_expire(mnt);
				1710	out:
				1711	return retval;
				1712	}
				1713
				1714	SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
				1715	{
				1716	return ksys_umount(name, flags);
				1717	}
				1718
				1719	#ifdef __ARCH_WANT_SYS_OLDUMOUNT
				1720
				1721	/*
				1722	* The 2.0 compatible umount. No flags.
				1723	*/
				1724	SYSCALL_DEFINE1(oldumount, char __user *, name)
				1725	{
				1726	return ksys_umount(name, 0);
				1727	}
				1728
				1729	#endif
				1730
				1731	static bool is_mnt_ns_file(struct dentry *dentry)
				1732	{
				1733	/* Is this a proxy for a mount namespace? */
				1734	return dentry->d_op == &ns_dentry_operations &&
				1735	dentry->d_fsdata == &mntns_operations;
				1736	}
				1737
				1738	struct mnt_namespace to_mnt_ns(struct ns_common ns)
				1739	{
				1740	return container_of(ns, struct mnt_namespace, ns);
				1741	}
				1742
				1743	static bool mnt_ns_loop(struct dentry *dentry)
				1744	{
				1745	/* Could bind mounting the mount namespace inode cause a
				1746	* mount namespace loop?
				1747	*/
				1748	struct mnt_namespace *mnt_ns;
				1749	if (!is_mnt_ns_file(dentry))
				1750	return false;
				1751
				1752	mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
				1753	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
				1754	}
				1755
				1756	struct mount copy_tree(struct mount mnt, struct dentry *dentry,
				1757	int flag)
				1758	{
				1759	struct mount res, p, q, r, *parent;
				1760
				1761	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
				1762	return ERR_PTR(-EINVAL);
				1763
				1764	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
				1765	return ERR_PTR(-EINVAL);
				1766
				1767	res = q = clone_mnt(mnt, dentry, flag);
				1768	if (IS_ERR(q))
				1769	return q;
				1770
				1771	q->mnt_mountpoint = mnt->mnt_mountpoint;
				1772
				1773	p = mnt;
				1774	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
				1775	struct mount *s;
				1776	if (!is_subdir(r->mnt_mountpoint, dentry))
				1777	continue;
				1778
				1779	for (s = r; s; s = next_mnt(s, r)) {
				1780	if (!(flag & CL_COPY_UNBINDABLE) &&
				1781	IS_MNT_UNBINDABLE(s)) {
				1782	if (s->mnt.mnt_flags & MNT_LOCKED) {
				1783	/* Both unbindable and locked. */
				1784	q = ERR_PTR(-EPERM);
				1785	goto out;
				1786	} else {
				1787	s = skip_mnt_tree(s);
				1788	continue;
				1789	}
				1790	}
				1791	if (!(flag & CL_COPY_MNT_NS_FILE) &&
				1792	is_mnt_ns_file(s->mnt.mnt_root)) {
				1793	s = skip_mnt_tree(s);
				1794	continue;
				1795	}
				1796	while (p != s->mnt_parent) {
				1797	p = p->mnt_parent;
				1798	q = q->mnt_parent;
				1799	}
				1800	p = s;
				1801	parent = q;
				1802	q = clone_mnt(p, p->mnt.mnt_root, flag);
				1803	if (IS_ERR(q))
				1804	goto out;
				1805	lock_mount_hash();
				1806	list_add_tail(&q->mnt_list, &res->mnt_list);
				1807	attach_mnt(q, parent, p->mnt_mp);
				1808	unlock_mount_hash();
				1809	}
				1810	}
				1811	return res;
				1812	out:
				1813	if (res) {
				1814	lock_mount_hash();
				1815	umount_tree(res, UMOUNT_SYNC);
				1816	unlock_mount_hash();
				1817	}
				1818	return q;
				1819	}
				1820
				1821	/* Caller should check returned pointer for errors */
				1822
				1823	struct vfsmount collect_mounts(const struct path path)
				1824	{
				1825	struct mount *tree;
				1826	namespace_lock();
				1827	if (!check_mnt(real_mount(path->mnt)))
				1828	tree = ERR_PTR(-EINVAL);
				1829	else
				1830	tree = copy_tree(real_mount(path->mnt), path->dentry,
				1831	CL_COPY_ALL \| CL_PRIVATE);
				1832	namespace_unlock();
				1833	if (IS_ERR(tree))
				1834	return ERR_CAST(tree);
				1835	return &tree->mnt;
				1836	}
				1837
				1838	static void free_mnt_ns(struct mnt_namespace *);
				1839	static struct mnt_namespace alloc_mnt_ns(struct user_namespace , bool);
				1840
				1841	void dissolve_on_fput(struct vfsmount *mnt)
				1842	{
				1843	struct mnt_namespace *ns;
				1844	namespace_lock();
				1845	lock_mount_hash();
				1846	ns = real_mount(mnt)->mnt_ns;
				1847	if (ns) {
				1848	if (is_anon_ns(ns))
				1849	umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
				1850	else
				1851	ns = NULL;
				1852	}
				1853	unlock_mount_hash();
				1854	namespace_unlock();
				1855	if (ns)
				1856	free_mnt_ns(ns);
				1857	}
				1858
				1859	void drop_collected_mounts(struct vfsmount *mnt)
				1860	{
				1861	namespace_lock();
				1862	lock_mount_hash();
				1863	umount_tree(real_mount(mnt), 0);
				1864	unlock_mount_hash();
				1865	namespace_unlock();
				1866	}
				1867
				1868	static bool has_locked_children(struct mount mnt, struct dentry dentry)
				1869	{
				1870	struct mount *child;
				1871
				1872	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
				1873	if (!is_subdir(child->mnt_mountpoint, dentry))
				1874	continue;
				1875
				1876	if (child->mnt.mnt_flags & MNT_LOCKED)
				1877	return true;
				1878	}
				1879	return false;
				1880	}
				1881
				1882	/**
				1883	* clone_private_mount - create a private clone of a path
				1884	*
				1885	* This creates a new vfsmount, which will be the clone of @path. The new will
				1886	* not be attached anywhere in the namespace and will be private (i.e. changes
				1887	* to the originating mount won't be propagated into this).
				1888	*
				1889	* Release with mntput().
				1890	*/
				1891	struct vfsmount clone_private_mount(const struct path path)
				1892	{
				1893	struct mount *old_mnt = real_mount(path->mnt);
				1894	struct mount *new_mnt;
				1895
				1896	down_read(&namespace_sem);
				1897	if (IS_MNT_UNBINDABLE(old_mnt))
				1898	goto invalid;
				1899
				1900	if (!check_mnt(old_mnt))
				1901	goto invalid;
				1902
				1903	if (has_locked_children(old_mnt, path->dentry))
				1904	goto invalid;
				1905
				1906	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
				1907	up_read(&namespace_sem);
				1908
				1909	if (IS_ERR(new_mnt))
				1910	return ERR_CAST(new_mnt);
				1911
				1912	return &new_mnt->mnt;
				1913
				1914	invalid:
				1915	up_read(&namespace_sem);
				1916	return ERR_PTR(-EINVAL);
				1917	}
				1918	EXPORT_SYMBOL_GPL(clone_private_mount);
				1919
				1920	int iterate_mounts(int (f)(struct vfsmount , void ), void arg,
				1921	struct vfsmount *root)
				1922	{
				1923	struct mount *mnt;
				1924	int res = f(root, arg);
				1925	if (res)
				1926	return res;
				1927	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
				1928	res = f(&mnt->mnt, arg);
				1929	if (res)
				1930	return res;
				1931	}
				1932	return 0;
				1933	}
				1934
				1935	static void lock_mnt_tree(struct mount *mnt)
				1936	{
				1937	struct mount *p;
				1938
				1939	for (p = mnt; p; p = next_mnt(p, mnt)) {
				1940	int flags = p->mnt.mnt_flags;
				1941	/* Don't allow unprivileged users to change mount flags */
				1942	flags \|= MNT_LOCK_ATIME;
				1943
				1944	if (flags & MNT_READONLY)
				1945	flags \|= MNT_LOCK_READONLY;
				1946
				1947	if (flags & MNT_NODEV)
				1948	flags \|= MNT_LOCK_NODEV;
				1949
				1950	if (flags & MNT_NOSUID)
				1951	flags \|= MNT_LOCK_NOSUID;
				1952
				1953	if (flags & MNT_NOEXEC)
				1954	flags \|= MNT_LOCK_NOEXEC;
				1955	/* Don't allow unprivileged users to reveal what is under a mount */
				1956	if (list_empty(&p->mnt_expire))
				1957	flags \|= MNT_LOCKED;
				1958	p->mnt.mnt_flags = flags;
				1959	}
				1960	}
				1961
				1962	static void cleanup_group_ids(struct mount mnt, struct mount end)
				1963	{
				1964	struct mount *p;
				1965
				1966	for (p = mnt; p != end; p = next_mnt(p, mnt)) {
				1967	if (p->mnt_group_id && !IS_MNT_SHARED(p))
				1968	mnt_release_group_id(p);
				1969	}
				1970	}
				1971
				1972	static int invent_group_ids(struct mount *mnt, bool recurse)
				1973	{
				1974	struct mount *p;
				1975
				1976	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
				1977	if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
				1978	int err = mnt_alloc_group_id(p);
				1979	if (err) {
				1980	cleanup_group_ids(mnt, p);
				1981	return err;
				1982	}
				1983	}
				1984	}
				1985
				1986	return 0;
				1987	}
				1988
				1989	int count_mounts(struct mnt_namespace ns, struct mount mnt)
				1990	{
				1991	unsigned int max = READ_ONCE(sysctl_mount_max);
				1992	unsigned int mounts = 0, old, pending, sum;
				1993	struct mount *p;
				1994
				1995	for (p = mnt; p; p = next_mnt(p, mnt))
				1996	mounts++;
				1997
				1998	old = ns->mounts;
				1999	pending = ns->pending_mounts;
				2000	sum = old + pending;
				2001	if ((old > sum) \|\|
				2002	(pending > sum) \|\|
				2003	(max < sum) \|\|
				2004	(mounts > (max - sum)))
				2005	return -ENOSPC;
				2006
				2007	ns->pending_mounts = pending + mounts;
				2008	return 0;
				2009	}
				2010
				2011	/*
				2012	* @source_mnt : mount tree to be attached
				2013	* @nd : place the mount tree @source_mnt is attached
				2014	* @parent_nd : if non-null, detach the source_mnt from its parent and
				2015	* store the parent mount and mountpoint dentry.
				2016	* (done when source_mnt is moved)
				2017	*
				2018	* NOTE: in the table below explains the semantics when a source mount
				2019	* of a given type is attached to a destination mount of a given type.
				2020	* ---------------------------------------------------------------------------
				2021	* \| BIND MOUNT OPERATION \|
				2022	* \|**************************************************************************
				2023	* \| source-->\| shared \| private \| slave \| unbindable \|
				2024	* \| dest \| \| \| \| \|
				2025	* \| \| \| \| \| \| \|
				2026	* \| v \| \| \| \| \|
				2027	* \|**************************************************************************
				2028	* \| shared \| shared (++) \| shared (+) \| shared(+++)\| invalid \|
				2029	* \| \| \| \| \| \|
				2030	* \|non-shared\| shared (+) \| private \| slave (*) \| invalid \|
				2031	* ***************************************************************************
				2032	* A bind operation clones the source mount and mounts the clone on the
				2033	* destination mount.
				2034	*
				2035	* (++) the cloned mount is propagated to all the mounts in the propagation
				2036	* tree of the destination mount and the cloned mount is added to
				2037	* the peer group of the source mount.
				2038	* (+) the cloned mount is created under the destination mount and is marked
				2039	* as shared. The cloned mount is added to the peer group of the source
				2040	* mount.
				2041	* (+++) the mount is propagated to all the mounts in the propagation tree
				2042	* of the destination mount and the cloned mount is made slave
				2043	* of the same master as that of the source mount. The cloned mount
				2044	* is marked as 'shared and slave'.
				2045	* (*) the cloned mount is made a slave of the same master as that of the
				2046	* source mount.
				2047	*
				2048	* ---------------------------------------------------------------------------
				2049	* \| MOVE MOUNT OPERATION \|
				2050	* \|**************************************************************************
				2051	* \| source-->\| shared \| private \| slave \| unbindable \|
				2052	* \| dest \| \| \| \| \|
				2053	* \| \| \| \| \| \| \|
				2054	* \| v \| \| \| \| \|
				2055	* \|**************************************************************************
				2056	* \| shared \| shared (+) \| shared (+) \| shared(+++) \| invalid \|
				2057	* \| \| \| \| \| \|
				2058	* \|non-shared\| shared (+) \| private \| slave () \| unbindable \|
				2059	* ***************************************************************************
				2060	*
				2061	* (+) the mount is moved to the destination. And is then propagated to
				2062	* all the mounts in the propagation tree of the destination mount.
				2063	* (+*) the mount is moved to the destination.
				2064	* (+++) the mount is moved to the destination and is then propagated to
				2065	* all the mounts belonging to the destination mount's propagation tree.
				2066	* the mount is marked as 'shared and slave'.
				2067	* (*) the mount continues to be a slave at the new location.
				2068	*
				2069	* if the source mount is a tree, the operations explained above is
				2070	* applied to each mount in the tree.
				2071	* Must be called without spinlocks held, since this function can sleep
				2072	* in allocations.
				2073	*/
				2074	static int attach_recursive_mnt(struct mount *source_mnt,
				2075	struct mount *dest_mnt,
				2076	struct mountpoint *dest_mp,
				2077	bool moving)
				2078	{
				2079	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
				2080	HLIST_HEAD(tree_list);
				2081	struct mnt_namespace *ns = dest_mnt->mnt_ns;
				2082	struct mountpoint *smp;
				2083	struct mount child, p;
				2084	struct hlist_node *n;
				2085	int err;
				2086
				2087	/* Preallocate a mountpoint in case the new mounts need
				2088	* to be tucked under other mounts.
				2089	*/
				2090	smp = get_mountpoint(source_mnt->mnt.mnt_root);
				2091	if (IS_ERR(smp))
				2092	return PTR_ERR(smp);
				2093
				2094	/* Is there space to add these mounts to the mount namespace? */
				2095	if (!moving) {
				2096	err = count_mounts(ns, source_mnt);
				2097	if (err)
				2098	goto out;
				2099	}
				2100
				2101	if (IS_MNT_SHARED(dest_mnt)) {
				2102	err = invent_group_ids(source_mnt, true);
				2103	if (err)
				2104	goto out;
				2105	err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
				2106	lock_mount_hash();
				2107	if (err)
				2108	goto out_cleanup_ids;
				2109	for (p = source_mnt; p; p = next_mnt(p, source_mnt))
				2110	set_mnt_shared(p);
				2111	} else {
				2112	lock_mount_hash();
				2113	}
				2114	if (moving) {
				2115	unhash_mnt(source_mnt);
				2116	attach_mnt(source_mnt, dest_mnt, dest_mp);
				2117	touch_mnt_namespace(source_mnt->mnt_ns);
				2118	} else {
				2119	if (source_mnt->mnt_ns) {
				2120	/* move from anon - the caller will destroy */
				2121	list_del_init(&source_mnt->mnt_ns->list);
				2122	}
				2123	mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
				2124	commit_tree(source_mnt);
				2125	}
				2126
				2127	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
				2128	struct mount *q;
				2129	hlist_del_init(&child->mnt_hash);
				2130	q = __lookup_mnt(&child->mnt_parent->mnt,
				2131	child->mnt_mountpoint);
				2132	if (q)
				2133	mnt_change_mountpoint(child, smp, q);
				2134	/* Notice when we are propagating across user namespaces */
				2135	if (child->mnt_parent->mnt_ns->user_ns != user_ns)
				2136	lock_mnt_tree(child);
				2137	child->mnt.mnt_flags &= ~MNT_LOCKED;
				2138	commit_tree(child);
				2139	}
				2140	put_mountpoint(smp);
				2141	unlock_mount_hash();
				2142
				2143	return 0;
				2144
				2145	out_cleanup_ids:
				2146	while (!hlist_empty(&tree_list)) {
				2147	child = hlist_entry(tree_list.first, struct mount, mnt_hash);
				2148	child->mnt_parent->mnt_ns->pending_mounts = 0;
				2149	umount_tree(child, UMOUNT_SYNC);
				2150	}
				2151	unlock_mount_hash();
				2152	cleanup_group_ids(source_mnt, NULL);
				2153	out:
				2154	ns->pending_mounts = 0;
				2155
				2156	read_seqlock_excl(&mount_lock);
				2157	put_mountpoint(smp);
				2158	read_sequnlock_excl(&mount_lock);
				2159
				2160	return err;
				2161	}
				2162
				2163	static struct mountpoint lock_mount(struct path path)
				2164	{
				2165	struct vfsmount *mnt;
				2166	struct dentry *dentry = path->dentry;
				2167	retry:
				2168	inode_lock(dentry->d_inode);
				2169	if (unlikely(cant_mount(dentry))) {
				2170	inode_unlock(dentry->d_inode);
				2171	return ERR_PTR(-ENOENT);
				2172	}
				2173	namespace_lock();
				2174	mnt = lookup_mnt(path);
				2175	if (likely(!mnt)) {
				2176	struct mountpoint *mp = get_mountpoint(dentry);
				2177	if (IS_ERR(mp)) {
				2178	namespace_unlock();
				2179	inode_unlock(dentry->d_inode);
				2180	return mp;
				2181	}
				2182	return mp;
				2183	}
				2184	namespace_unlock();
				2185	inode_unlock(path->dentry->d_inode);
				2186	path_put(path);
				2187	path->mnt = mnt;
				2188	dentry = path->dentry = dget(mnt->mnt_root);
				2189	goto retry;
				2190	}
				2191
				2192	static void unlock_mount(struct mountpoint *where)
				2193	{
				2194	struct dentry *dentry = where->m_dentry;
				2195
				2196	read_seqlock_excl(&mount_lock);
				2197	put_mountpoint(where);
				2198	read_sequnlock_excl(&mount_lock);
				2199
				2200	namespace_unlock();
				2201	inode_unlock(dentry->d_inode);
				2202	}
				2203
				2204	static int graft_tree(struct mount mnt, struct mount p, struct mountpoint *mp)
				2205	{
				2206	if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
				2207	return -EINVAL;
				2208
				2209	if (d_is_dir(mp->m_dentry) !=
				2210	d_is_dir(mnt->mnt.mnt_root))
				2211	return -ENOTDIR;
				2212
				2213	return attach_recursive_mnt(mnt, p, mp, false);
				2214	}
				2215
				2216	/*
				2217	* Sanity check the flags to change_mnt_propagation.
				2218	*/
				2219
				2220	static int flags_to_propagation_type(int ms_flags)
				2221	{
				2222	int type = ms_flags & ~(MS_REC \| MS_SILENT);
				2223
				2224	/* Fail if any non-propagation flags are set */
				2225	if (type & ~(MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
				2226	return 0;
				2227	/* Only one propagation flag should be set */
				2228	if (!is_power_of_2(type))
				2229	return 0;
				2230	return type;
				2231	}
				2232
				2233	/*
				2234	* recursively change the type of the mountpoint.
				2235	*/
				2236	static int do_change_type(struct path *path, int ms_flags)
				2237	{
				2238	struct mount *m;
				2239	struct mount *mnt = real_mount(path->mnt);
				2240	int recurse = ms_flags & MS_REC;
				2241	int type;
				2242	int err = 0;
				2243
				2244	if (path->dentry != path->mnt->mnt_root)
				2245	return -EINVAL;
				2246
				2247	type = flags_to_propagation_type(ms_flags);
				2248	if (!type)
				2249	return -EINVAL;
				2250
				2251	namespace_lock();
				2252	if (type == MS_SHARED) {
				2253	err = invent_group_ids(mnt, recurse);
				2254	if (err)
				2255	goto out_unlock;
				2256	}
				2257
				2258	lock_mount_hash();
				2259	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
				2260	change_mnt_propagation(m, type);
				2261	unlock_mount_hash();
				2262
				2263	out_unlock:
				2264	namespace_unlock();
				2265	return err;
				2266	}
				2267
				2268	static struct mount __do_loopback(struct path old_path, int recurse)
				2269	{
				2270	struct mount mnt = ERR_PTR(-EINVAL), old = real_mount(old_path->mnt);
				2271
				2272	if (IS_MNT_UNBINDABLE(old))
				2273	return mnt;
				2274
				2275	if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
				2276	return mnt;
				2277
				2278	if (!recurse && has_locked_children(old, old_path->dentry))
				2279	return mnt;
				2280
				2281	if (recurse)
				2282	mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
				2283	else
				2284	mnt = clone_mnt(old, old_path->dentry, 0);
				2285
				2286	if (!IS_ERR(mnt))
				2287	mnt->mnt.mnt_flags &= ~MNT_LOCKED;
				2288
				2289	return mnt;
				2290	}
				2291
				2292	/*
				2293	* do loopback mount.
				2294	*/
				2295	static int do_loopback(struct path path, const char old_name,
				2296	int recurse)
				2297	{
				2298	struct path old_path;
				2299	struct mount mnt = NULL, parent;
				2300	struct mountpoint *mp;
				2301	int err;
				2302	if (!old_name \|\| !*old_name)
				2303	return -EINVAL;
				2304	err = kern_path(old_name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &old_path);
				2305	if (err)
				2306	return err;
				2307
				2308	err = -EINVAL;
				2309	if (mnt_ns_loop(old_path.dentry))
				2310	goto out;
				2311
				2312	mp = lock_mount(path);
				2313	if (IS_ERR(mp)) {
				2314	err = PTR_ERR(mp);
				2315	goto out;
				2316	}
				2317
				2318	parent = real_mount(path->mnt);
				2319	if (!check_mnt(parent))
				2320	goto out2;
				2321
				2322	mnt = __do_loopback(&old_path, recurse);
				2323	if (IS_ERR(mnt)) {
				2324	err = PTR_ERR(mnt);
				2325	goto out2;
				2326	}
				2327
				2328	err = graft_tree(mnt, parent, mp);
				2329	if (err) {
				2330	lock_mount_hash();
				2331	umount_tree(mnt, UMOUNT_SYNC);
				2332	unlock_mount_hash();
				2333	}
				2334	out2:
				2335	unlock_mount(mp);
				2336	out:
				2337	path_put(&old_path);
				2338	return err;
				2339	}
				2340
				2341	static struct file open_detached_copy(struct path path, bool recursive)
				2342	{
				2343	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
				2344	struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
				2345	struct mount mnt, p;
				2346	struct file *file;
				2347
				2348	if (IS_ERR(ns))
				2349	return ERR_CAST(ns);
				2350
				2351	namespace_lock();
				2352	mnt = __do_loopback(path, recursive);
				2353	if (IS_ERR(mnt)) {
				2354	namespace_unlock();
				2355	free_mnt_ns(ns);
				2356	return ERR_CAST(mnt);
				2357	}
				2358
				2359	lock_mount_hash();
				2360	for (p = mnt; p; p = next_mnt(p, mnt)) {
				2361	p->mnt_ns = ns;
				2362	ns->mounts++;
				2363	}
				2364	ns->root = mnt;
				2365	list_add_tail(&ns->list, &mnt->mnt_list);
				2366	mntget(&mnt->mnt);
				2367	unlock_mount_hash();
				2368	namespace_unlock();
				2369
				2370	mntput(path->mnt);
				2371	path->mnt = &mnt->mnt;
				2372	file = dentry_open(path, O_PATH, current_cred());
				2373	if (IS_ERR(file))
				2374	dissolve_on_fput(path->mnt);
				2375	else
				2376	file->f_mode \|= FMODE_NEED_UNMOUNT;
				2377	return file;
				2378	}
				2379
				2380	SYSCALL_DEFINE3(open_tree, int, dfd, const char *, filename, unsigned, flags)
				2381	{
				2382	struct file *file;
				2383	struct path path;
				2384	int lookup_flags = LOOKUP_AUTOMOUNT \| LOOKUP_FOLLOW;
				2385	bool detached = flags & OPEN_TREE_CLONE;
				2386	int error;
				2387	int fd;
				2388
				2389	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
				2390
				2391	if (flags & ~(AT_EMPTY_PATH \| AT_NO_AUTOMOUNT \| AT_RECURSIVE \|
				2392	AT_SYMLINK_NOFOLLOW \| OPEN_TREE_CLONE \|
				2393	OPEN_TREE_CLOEXEC))
				2394	return -EINVAL;
				2395
				2396	if ((flags & (AT_RECURSIVE \| OPEN_TREE_CLONE)) == AT_RECURSIVE)
				2397	return -EINVAL;
				2398
				2399	if (flags & AT_NO_AUTOMOUNT)
				2400	lookup_flags &= ~LOOKUP_AUTOMOUNT;
				2401	if (flags & AT_SYMLINK_NOFOLLOW)
				2402	lookup_flags &= ~LOOKUP_FOLLOW;
				2403	if (flags & AT_EMPTY_PATH)
				2404	lookup_flags \|= LOOKUP_EMPTY;
				2405
				2406	if (detached && !may_mount())
				2407	return -EPERM;
				2408
				2409	fd = get_unused_fd_flags(flags & O_CLOEXEC);
				2410	if (fd < 0)
				2411	return fd;
				2412
				2413	error = user_path_at(dfd, filename, lookup_flags, &path);
				2414	if (unlikely(error)) {
				2415	file = ERR_PTR(error);
				2416	} else {
				2417	if (detached)
				2418	file = open_detached_copy(&path, flags & AT_RECURSIVE);
				2419	else
				2420	file = dentry_open(&path, O_PATH, current_cred());
				2421	path_put(&path);
				2422	}
				2423	if (IS_ERR(file)) {
				2424	put_unused_fd(fd);
				2425	return PTR_ERR(file);
				2426	}
				2427	fd_install(fd, file);
				2428	return fd;
				2429	}
				2430
				2431	/*
				2432	* Don't allow locked mount flags to be cleared.
				2433	*
				2434	* No locks need to be held here while testing the various MNT_LOCK
				2435	* flags because those flags can never be cleared once they are set.
				2436	*/
				2437	static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
				2438	{
				2439	unsigned int fl = mnt->mnt.mnt_flags;
				2440
				2441	if ((fl & MNT_LOCK_READONLY) &&
				2442	!(mnt_flags & MNT_READONLY))
				2443	return false;
				2444
				2445	if ((fl & MNT_LOCK_NODEV) &&
				2446	!(mnt_flags & MNT_NODEV))
				2447	return false;
				2448
				2449	if ((fl & MNT_LOCK_NOSUID) &&
				2450	!(mnt_flags & MNT_NOSUID))
				2451	return false;
				2452
				2453	if ((fl & MNT_LOCK_NOEXEC) &&
				2454	!(mnt_flags & MNT_NOEXEC))
				2455	return false;
				2456
				2457	if ((fl & MNT_LOCK_ATIME) &&
				2458	((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
				2459	return false;
				2460
				2461	return true;
				2462	}
				2463
				2464	static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
				2465	{
				2466	bool readonly_request = (mnt_flags & MNT_READONLY);
				2467
				2468	if (readonly_request == __mnt_is_readonly(&mnt->mnt))
				2469	return 0;
				2470
				2471	if (readonly_request)
				2472	return mnt_make_readonly(mnt);
				2473
				2474	return __mnt_unmake_readonly(mnt);
				2475	}
				2476
				2477	/*
				2478	* Update the user-settable attributes on a mount. The caller must hold
				2479	* sb->s_umount for writing.
				2480	*/
				2481	static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
				2482	{
				2483	lock_mount_hash();
				2484	mnt_flags \|= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
				2485	mnt->mnt.mnt_flags = mnt_flags;
				2486	touch_mnt_namespace(mnt->mnt_ns);
				2487	unlock_mount_hash();
				2488	}
				2489
				2490	static void mnt_warn_timestamp_expiry(struct path mountpoint, struct vfsmount mnt)
				2491	{
				2492	struct super_block *sb = mnt->mnt_sb;
				2493
				2494	if (!__mnt_is_readonly(mnt) &&
				2495	(!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
				2496	(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
				2497	char buf, mntpath;
				2498
				2499	buf = (char *)__get_free_page(GFP_KERNEL);
				2500	if (buf)
				2501	mntpath = d_path(mountpoint, buf, PAGE_SIZE);
				2502	else
				2503	mntpath = ERR_PTR(-ENOMEM);
				2504	if (IS_ERR(mntpath))
				2505	mntpath = "(unknown)";
				2506
				2507	pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
				2508	sb->s_type->name,
				2509	is_mounted(mnt) ? "remounted" : "mounted",
				2510	mntpath, &sb->s_time_max,
				2511	(unsigned long long)sb->s_time_max);
				2512
				2513	sb->s_iflags \|= SB_I_TS_EXPIRY_WARNED;
				2514	if (buf)
				2515	free_page((unsigned long)buf);
				2516	}
				2517	}
				2518
				2519	/*
				2520	* Handle reconfiguration of the mountpoint only without alteration of the
				2521	* superblock it refers to. This is triggered by specifying MS_REMOUNT\|MS_BIND
				2522	* to mount(2).
				2523	*/
				2524	static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
				2525	{
				2526	struct super_block *sb = path->mnt->mnt_sb;
				2527	struct mount *mnt = real_mount(path->mnt);
				2528	int ret;
				2529
				2530	if (!check_mnt(mnt))
				2531	return -EINVAL;
				2532
				2533	if (path->dentry != mnt->mnt.mnt_root)
				2534	return -EINVAL;
				2535
				2536	if (!can_change_locked_flags(mnt, mnt_flags))
				2537	return -EPERM;
				2538
				2539	down_write(&sb->s_umount);
				2540	ret = change_mount_ro_state(mnt, mnt_flags);
				2541	if (ret == 0)
				2542	set_mount_attributes(mnt, mnt_flags);
				2543	up_write(&sb->s_umount);
				2544
				2545	mnt_warn_timestamp_expiry(path, &mnt->mnt);
				2546
				2547	return ret;
				2548	}
				2549
				2550	/*
				2551	* change filesystem flags. dir should be a physical root of filesystem.
				2552	* If you've mounted a non-root directory somewhere and want to do remount
				2553	* on it - tough luck.
				2554	*/
				2555	static int do_remount(struct path *path, int ms_flags, int sb_flags,
				2556	int mnt_flags, void *data)
				2557	{
				2558	int err;
				2559	struct super_block *sb = path->mnt->mnt_sb;
				2560	struct mount *mnt = real_mount(path->mnt);
				2561	struct fs_context *fc;
				2562
				2563	if (!check_mnt(mnt))
				2564	return -EINVAL;
				2565
				2566	if (path->dentry != path->mnt->mnt_root)
				2567	return -EINVAL;
				2568
				2569	if (!can_change_locked_flags(mnt, mnt_flags))
				2570	return -EPERM;
				2571
				2572	fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
				2573	if (IS_ERR(fc))
				2574	return PTR_ERR(fc);
				2575
				2576	err = parse_monolithic_mount_data(fc, data);
				2577	if (!err) {
				2578	down_write(&sb->s_umount);
				2579	err = -EPERM;
				2580	if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
				2581	err = reconfigure_super(fc);
				2582	if (!err)
				2583	set_mount_attributes(mnt, mnt_flags);
				2584	}
				2585	up_write(&sb->s_umount);
				2586	}
				2587
				2588	mnt_warn_timestamp_expiry(path, &mnt->mnt);
				2589
				2590	put_fs_context(fc);
				2591	return err;
				2592	}
				2593
				2594	static inline int tree_contains_unbindable(struct mount *mnt)
				2595	{
				2596	struct mount *p;
				2597	for (p = mnt; p; p = next_mnt(p, mnt)) {
				2598	if (IS_MNT_UNBINDABLE(p))
				2599	return 1;
				2600	}
				2601	return 0;
				2602	}
				2603
				2604	/*
				2605	* Check that there aren't references to earlier/same mount namespaces in the
				2606	* specified subtree. Such references can act as pins for mount namespaces
				2607	* that aren't checked by the mount-cycle checking code, thereby allowing
				2608	* cycles to be made.
				2609	*/
				2610	static bool check_for_nsfs_mounts(struct mount *subtree)
				2611	{
				2612	struct mount *p;
				2613	bool ret = false;
				2614
				2615	lock_mount_hash();
				2616	for (p = subtree; p; p = next_mnt(p, subtree))
				2617	if (mnt_ns_loop(p->mnt.mnt_root))
				2618	goto out;
				2619
				2620	ret = true;
				2621	out:
				2622	unlock_mount_hash();
				2623	return ret;
				2624	}
				2625
				2626	static int do_move_mount(struct path old_path, struct path new_path)
				2627	{
				2628	struct mnt_namespace *ns;
				2629	struct mount *p;
				2630	struct mount *old;
				2631	struct mount *parent;
				2632	struct mountpoint mp, old_mp;
				2633	int err;
				2634	bool attached;
				2635
				2636	mp = lock_mount(new_path);
				2637	if (IS_ERR(mp))
				2638	return PTR_ERR(mp);
				2639
				2640	old = real_mount(old_path->mnt);
				2641	p = real_mount(new_path->mnt);
				2642	parent = old->mnt_parent;
				2643	attached = mnt_has_parent(old);
				2644	old_mp = old->mnt_mp;
				2645	ns = old->mnt_ns;
				2646
				2647	err = -EINVAL;
				2648	/* The mountpoint must be in our namespace. */
				2649	if (!check_mnt(p))
				2650	goto out;
				2651
				2652	/* The thing moved must be mounted... */
				2653	if (!is_mounted(&old->mnt))
				2654	goto out;
				2655
				2656	/* ... and either ours or the root of anon namespace */
				2657	if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
				2658	goto out;
				2659
				2660	if (old->mnt.mnt_flags & MNT_LOCKED)
				2661	goto out;
				2662
				2663	if (old_path->dentry != old_path->mnt->mnt_root)
				2664	goto out;
				2665
				2666	if (d_is_dir(new_path->dentry) !=
				2667	d_is_dir(old_path->dentry))
				2668	goto out;
				2669	/*
				2670	* Don't move a mount residing in a shared parent.
				2671	*/
				2672	if (attached && IS_MNT_SHARED(parent))
				2673	goto out;
				2674	/*
				2675	* Don't move a mount tree containing unbindable mounts to a destination
				2676	* mount which is shared.
				2677	*/
				2678	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
				2679	goto out;
				2680	err = -ELOOP;
				2681	if (!check_for_nsfs_mounts(old))
				2682	goto out;
				2683	for (; mnt_has_parent(p); p = p->mnt_parent)
				2684	if (p == old)
				2685	goto out;
				2686
				2687	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
				2688	attached);
				2689	if (err)
				2690	goto out;
				2691
				2692	/* if the mount is moved, it should no longer be expire
				2693	* automatically */
				2694	list_del_init(&old->mnt_expire);
				2695	if (attached)
				2696	put_mountpoint(old_mp);
				2697	out:
				2698	unlock_mount(mp);
				2699	if (!err) {
				2700	if (attached)
				2701	mntput_no_expire(parent);
				2702	else
				2703	free_mnt_ns(ns);
				2704	}
				2705	return err;
				2706	}
				2707
				2708	static int do_move_mount_old(struct path path, const char old_name)
				2709	{
				2710	struct path old_path;
				2711	int err;
				2712
				2713	if (!old_name \|\| !*old_name)
				2714	return -EINVAL;
				2715
				2716	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
				2717	if (err)
				2718	return err;
				2719
				2720	err = do_move_mount(&old_path, path);
				2721	path_put(&old_path);
				2722	return err;
				2723	}
				2724
				2725	/*
				2726	* add a mount into a namespace's mount tree
				2727	*/
				2728	static int do_add_mount(struct mount newmnt, struct path path, int mnt_flags)
				2729	{
				2730	struct mountpoint *mp;
				2731	struct mount *parent;
				2732	int err;
				2733
				2734	mnt_flags &= ~MNT_INTERNAL_FLAGS;
				2735
				2736	mp = lock_mount(path);
				2737	if (IS_ERR(mp))
				2738	return PTR_ERR(mp);
				2739
				2740	parent = real_mount(path->mnt);
				2741	err = -EINVAL;
				2742	if (unlikely(!check_mnt(parent))) {
				2743	/* that's acceptable only for automounts done in private ns */
				2744	if (!(mnt_flags & MNT_SHRINKABLE))
				2745	goto unlock;
				2746	/* ... and for those we'd better have mountpoint still alive */
				2747	if (!parent->mnt_ns)
				2748	goto unlock;
				2749	}
				2750
				2751	/* Refuse the same filesystem on the same mount point */
				2752	err = -EBUSY;
				2753	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
				2754	path->mnt->mnt_root == path->dentry)
				2755	goto unlock;
				2756
				2757	err = -EINVAL;
				2758	if (d_is_symlink(newmnt->mnt.mnt_root))
				2759	goto unlock;
				2760
				2761	newmnt->mnt.mnt_flags = mnt_flags;
				2762	err = graft_tree(newmnt, parent, mp);
				2763
				2764	unlock:
				2765	unlock_mount(mp);
				2766	return err;
				2767	}
				2768
				2769	static bool mount_too_revealing(const struct super_block sb, int new_mnt_flags);
				2770
				2771	/*
				2772	* Create a new mount using a superblock configuration and request it
				2773	* be added to the namespace tree.
				2774	*/
				2775	static int do_new_mount_fc(struct fs_context fc, struct path mountpoint,
				2776	unsigned int mnt_flags)
				2777	{
				2778	struct vfsmount *mnt;
				2779	struct super_block *sb = fc->root->d_sb;
				2780	int error;
				2781
				2782	error = security_sb_kern_mount(sb);
				2783	if (!error && mount_too_revealing(sb, &mnt_flags))
				2784	error = -EPERM;
				2785
				2786	if (unlikely(error)) {
				2787	fc_drop_locked(fc);
				2788	return error;
				2789	}
				2790
				2791	up_write(&sb->s_umount);
				2792
				2793	mnt = vfs_create_mount(fc);
				2794	if (IS_ERR(mnt))
				2795	return PTR_ERR(mnt);
				2796
				2797	mnt_warn_timestamp_expiry(mountpoint, mnt);
				2798
				2799	error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
				2800	if (error < 0)
				2801	mntput(mnt);
				2802	return error;
				2803	}
				2804
				2805	/*
				2806	* create a new mount for userspace and request it to be added into the
				2807	* namespace's tree
				2808	*/
				2809	static int do_new_mount(struct path path, const char fstype, int sb_flags,
				2810	int mnt_flags, const char name, void data)
				2811	{
				2812	struct file_system_type *type;
				2813	struct fs_context *fc;
				2814	const char *subtype = NULL;
				2815	int err = 0;
				2816
				2817	if (!fstype)
				2818	return -EINVAL;
				2819
				2820	type = get_fs_type(fstype);
				2821	if (!type)
				2822	return -ENODEV;
				2823
				2824	if (type->fs_flags & FS_HAS_SUBTYPE) {
				2825	subtype = strchr(fstype, '.');
				2826	if (subtype) {
				2827	subtype++;
				2828	if (!*subtype) {
				2829	put_filesystem(type);
				2830	return -EINVAL;
				2831	}
				2832	}
				2833	}
				2834
				2835	fc = fs_context_for_mount(type, sb_flags);
				2836	put_filesystem(type);
				2837	if (IS_ERR(fc))
				2838	return PTR_ERR(fc);
				2839
				2840	if (subtype)
				2841	err = vfs_parse_fs_string(fc, "subtype",
				2842	subtype, strlen(subtype));
				2843	if (!err && name)
				2844	err = vfs_parse_fs_string(fc, "source", name, strlen(name));
				2845	if (!err)
				2846	err = parse_monolithic_mount_data(fc, data);
				2847	if (!err && !mount_capable(fc))
				2848	err = -EPERM;
				2849	if (!err)
				2850	err = vfs_get_tree(fc);
				2851	if (!err)
				2852	err = do_new_mount_fc(fc, path, mnt_flags);
				2853
				2854	put_fs_context(fc);
				2855	return err;
				2856	}
				2857
				2858	int finish_automount(struct vfsmount m, struct path path)
				2859	{
				2860	struct mount *mnt = real_mount(m);
				2861	int err;
				2862	/* The new mount record should have at least 2 refs to prevent it being
				2863	* expired before we get a chance to add it
				2864	*/
				2865	BUG_ON(mnt_get_count(mnt) < 2);
				2866
				2867	if (m->mnt_sb == path->mnt->mnt_sb &&
				2868	m->mnt_root == path->dentry) {
				2869	err = -ELOOP;
				2870	goto fail;
				2871	}
				2872
				2873	err = do_add_mount(mnt, path, path->mnt->mnt_flags \| MNT_SHRINKABLE);
				2874	if (!err)
				2875	return 0;
				2876	fail:
				2877	/* remove m from any expiration list it may be on */
				2878	if (!list_empty(&mnt->mnt_expire)) {
				2879	namespace_lock();
				2880	list_del_init(&mnt->mnt_expire);
				2881	namespace_unlock();
				2882	}
				2883	mntput(m);
				2884	mntput(m);
				2885	return err;
				2886	}
				2887
				2888	/**
				2889	* mnt_set_expiry - Put a mount on an expiration list
				2890	* @mnt: The mount to list.
				2891	* @expiry_list: The list to add the mount to.
				2892	*/
				2893	void mnt_set_expiry(struct vfsmount mnt, struct list_head expiry_list)
				2894	{
				2895	namespace_lock();
				2896
				2897	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
				2898
				2899	namespace_unlock();
				2900	}
				2901	EXPORT_SYMBOL(mnt_set_expiry);
				2902
				2903	/*
				2904	* process a list of expirable mountpoints with the intent of discarding any
				2905	* mountpoints that aren't in use and haven't been touched since last we came
				2906	* here
				2907	*/
				2908	void mark_mounts_for_expiry(struct list_head *mounts)
				2909	{
				2910	struct mount mnt, next;
				2911	LIST_HEAD(graveyard);
				2912
				2913	if (list_empty(mounts))
				2914	return;
				2915
				2916	namespace_lock();
				2917	lock_mount_hash();
				2918
				2919	/* extract from the expiration list every vfsmount that matches the
				2920	* following criteria:
				2921	* - only referenced by its parent vfsmount
				2922	* - still marked for expiry (marked on the last call here; marks are
				2923	* cleared by mntput())
				2924	*/
				2925	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
				2926	if (!xchg(&mnt->mnt_expiry_mark, 1) \|\|
				2927	propagate_mount_busy(mnt, 1))
				2928	continue;
				2929	list_move(&mnt->mnt_expire, &graveyard);
				2930	}
				2931	while (!list_empty(&graveyard)) {
				2932	mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
				2933	touch_mnt_namespace(mnt->mnt_ns);
				2934	umount_tree(mnt, UMOUNT_PROPAGATE\|UMOUNT_SYNC);
				2935	}
				2936	unlock_mount_hash();
				2937	namespace_unlock();
				2938	}
				2939
				2940	EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
				2941
				2942	/*
				2943	* Ripoff of 'select_parent()'
				2944	*
				2945	* search the list of submounts for a given mountpoint, and move any
				2946	* shrinkable submounts to the 'graveyard' list.
				2947	*/
				2948	static int select_submounts(struct mount parent, struct list_head graveyard)
				2949	{
				2950	struct mount *this_parent = parent;
				2951	struct list_head *next;
				2952	int found = 0;
				2953
				2954	repeat:
				2955	next = this_parent->mnt_mounts.next;
				2956	resume:
				2957	while (next != &this_parent->mnt_mounts) {
				2958	struct list_head *tmp = next;
				2959	struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
				2960
				2961	next = tmp->next;
				2962	if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
				2963	continue;
				2964	/*
				2965	* Descend a level if the d_mounts list is non-empty.
				2966	*/
				2967	if (!list_empty(&mnt->mnt_mounts)) {
				2968	this_parent = mnt;
				2969	goto repeat;
				2970	}
				2971
				2972	if (!propagate_mount_busy(mnt, 1)) {
				2973	list_move_tail(&mnt->mnt_expire, graveyard);
				2974	found++;
				2975	}
				2976	}
				2977	/*
				2978	* All done at this level ... ascend and resume the search
				2979	*/
				2980	if (this_parent != parent) {
				2981	next = this_parent->mnt_child.next;
				2982	this_parent = this_parent->mnt_parent;
				2983	goto resume;
				2984	}
				2985	return found;
				2986	}
				2987
				2988	/*
				2989	* process a list of expirable mountpoints with the intent of discarding any
				2990	* submounts of a specific parent mountpoint
				2991	*
				2992	* mount_lock must be held for write
				2993	*/
				2994	static void shrink_submounts(struct mount *mnt)
				2995	{
				2996	LIST_HEAD(graveyard);
				2997	struct mount *m;
				2998
				2999	/* extract submounts of 'mountpoint' from the expiration list */
				3000	while (select_submounts(mnt, &graveyard)) {
				3001	while (!list_empty(&graveyard)) {
				3002	m = list_first_entry(&graveyard, struct mount,
				3003	mnt_expire);
				3004	touch_mnt_namespace(m->mnt_ns);
				3005	umount_tree(m, UMOUNT_PROPAGATE\|UMOUNT_SYNC);
				3006	}
				3007	}
				3008	}
				3009
				3010	/*
				3011	* Some copy_from_user() implementations do not return the exact number of
				3012	* bytes remaining to copy on a fault. But copy_mount_options() requires that.
				3013	* Note that this function differs from copy_from_user() in that it will oops
				3014	* on bad values of `to', rather than returning a short copy.
				3015	*/
				3016	static long exact_copy_from_user(void to, const void __user from,
				3017	unsigned long n)
				3018	{
				3019	char *t = to;
				3020	const char __user *f = from;
				3021	char c;
				3022
				3023	if (!access_ok(from, n))
				3024	return n;
				3025
				3026	while (n) {
				3027	if (__get_user(c, f)) {
				3028	memset(t, 0, n);
				3029	break;
				3030	}
				3031	*t++ = c;
				3032	f++;
				3033	n--;
				3034	}
				3035	return n;
				3036	}
				3037
				3038	void copy_mount_options(const void __user data)
				3039	{
				3040	int i;
				3041	unsigned long size;
				3042	char *copy;
				3043
				3044	if (!data)
				3045	return NULL;
				3046
				3047	copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
				3048	if (!copy)
				3049	return ERR_PTR(-ENOMEM);
				3050
				3051	/* We only care that some data at the address the user
				3052	* gave us is valid. Just in case, we'll zero
				3053	* the remainder of the page.
				3054	*/
				3055	/* copy_from_user cannot cross TASK_SIZE ! */
				3056	size = TASK_SIZE - (unsigned long)untagged_addr(data);
				3057	if (size > PAGE_SIZE)
				3058	size = PAGE_SIZE;
				3059
				3060	i = size - exact_copy_from_user(copy, data, size);
				3061	if (!i) {
				3062	kfree(copy);
				3063	return ERR_PTR(-EFAULT);
				3064	}
				3065	if (i != PAGE_SIZE)
				3066	memset(copy + i, 0, PAGE_SIZE - i);
				3067	return copy;
				3068	}
				3069
				3070	char copy_mount_string(const void __user data)
				3071	{
				3072	return data ? strndup_user(data, PATH_MAX) : NULL;
				3073	}
				3074
				3075	/*
				3076	* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
				3077	* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
				3078	*
				3079	* data is a (void *) that can point to any structure up to
				3080	* PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
				3081	* information (or be NULL).
				3082	*
				3083	* Pre-0.97 versions of mount() didn't have a flags word.
				3084	* When the flags word was introduced its top half was required
				3085	* to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
				3086	* Therefore, if this magic number is present, it carries no information
				3087	* and must be discarded.
				3088	*/
				3089	long do_mount(const char dev_name, const char __user dir_name,
				3090	const char type_page, unsigned long flags, void data_page)
				3091	{
				3092	struct path path;
				3093	unsigned int mnt_flags = 0, sb_flags;
				3094	int retval = 0;
				3095
				3096	/* Discard magic */
				3097	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
				3098	flags &= ~MS_MGC_MSK;
				3099
				3100	/* Basic sanity checks */
				3101	if (data_page)
				3102	((char *)data_page)[PAGE_SIZE - 1] = 0;
				3103
				3104	if (flags & MS_NOUSER)
				3105	return -EINVAL;
				3106
				3107	/* ... and get the mountpoint */
				3108	retval = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
				3109	if (retval)
				3110	return retval;
				3111
				3112	retval = security_sb_mount(dev_name, &path,
				3113	type_page, flags, data_page);
				3114	if (!retval && !may_mount())
				3115	retval = -EPERM;
				3116	if (!retval && (flags & SB_MANDLOCK) && !may_mandlock())
				3117	retval = -EPERM;
				3118	if (retval)
				3119	goto dput_out;
				3120
				3121	/* Default to relatime unless overriden */
				3122	if (!(flags & MS_NOATIME))
				3123	mnt_flags \|= MNT_RELATIME;
				3124
				3125	/* Separate the per-mountpoint flags */
				3126	if (flags & MS_NOSUID)
				3127	mnt_flags \|= MNT_NOSUID;
				3128	if (flags & MS_NODEV)
				3129	mnt_flags \|= MNT_NODEV;
				3130	if (flags & MS_NOEXEC)
				3131	mnt_flags \|= MNT_NOEXEC;
				3132	if (flags & MS_NOATIME)
				3133	mnt_flags \|= MNT_NOATIME;
				3134	if (flags & MS_NODIRATIME)
				3135	mnt_flags \|= MNT_NODIRATIME;
				3136	if (flags & MS_STRICTATIME)
				3137	mnt_flags &= ~(MNT_RELATIME \| MNT_NOATIME);
				3138	if (flags & MS_RDONLY)
				3139	mnt_flags \|= MNT_READONLY;
				3140
				3141	/* The default atime for remount is preservation */
				3142	if ((flags & MS_REMOUNT) &&
				3143	((flags & (MS_NOATIME \| MS_NODIRATIME \| MS_RELATIME \|
				3144	MS_STRICTATIME)) == 0)) {
				3145	mnt_flags &= ~MNT_ATIME_MASK;
				3146	mnt_flags \|= path.mnt->mnt_flags & MNT_ATIME_MASK;
				3147	}
				3148
				3149	sb_flags = flags & (SB_RDONLY \|
				3150	SB_SYNCHRONOUS \|
				3151	SB_MANDLOCK \|
				3152	SB_DIRSYNC \|
				3153	SB_SILENT \|
				3154	SB_POSIXACL \|
				3155	SB_LAZYTIME \|
				3156	SB_I_VERSION);
				3157
				3158	if ((flags & (MS_REMOUNT \| MS_BIND)) == (MS_REMOUNT \| MS_BIND))
				3159	retval = do_reconfigure_mnt(&path, mnt_flags);
				3160	else if (flags & MS_REMOUNT)
				3161	retval = do_remount(&path, flags, sb_flags, mnt_flags,
				3162	data_page);
				3163	else if (flags & MS_BIND)
				3164	retval = do_loopback(&path, dev_name, flags & MS_REC);
				3165	else if (flags & (MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
				3166	retval = do_change_type(&path, flags);
				3167	else if (flags & MS_MOVE)
				3168	retval = do_move_mount_old(&path, dev_name);
				3169	else
				3170	retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
				3171	dev_name, data_page);
				3172	dput_out:
				3173	path_put(&path);
				3174	return retval;
				3175	}
				3176
				3177	static struct ucounts inc_mnt_namespaces(struct user_namespace ns)
				3178	{
				3179	return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
				3180	}
				3181
				3182	static void dec_mnt_namespaces(struct ucounts *ucounts)
				3183	{
				3184	dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
				3185	}
				3186
				3187	static void free_mnt_ns(struct mnt_namespace *ns)
				3188	{
				3189	if (!is_anon_ns(ns))
				3190	ns_free_inum(&ns->ns);
				3191	dec_mnt_namespaces(ns->ucounts);
				3192	put_user_ns(ns->user_ns);
				3193	kfree(ns);
				3194	}
				3195
				3196	/*
				3197	* Assign a sequence number so we can detect when we attempt to bind
				3198	* mount a reference to an older mount namespace into the current
				3199	* mount namespace, preventing reference counting loops. A 64bit
				3200	* number incrementing at 10Ghz will take 12,427 years to wrap which
				3201	* is effectively never, so we can ignore the possibility.
				3202	*/
				3203	static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
				3204
				3205	static struct mnt_namespace alloc_mnt_ns(struct user_namespace user_ns, bool anon)
				3206	{
				3207	struct mnt_namespace *new_ns;
				3208	struct ucounts *ucounts;
				3209	int ret;
				3210
				3211	ucounts = inc_mnt_namespaces(user_ns);
				3212	if (!ucounts)
				3213	return ERR_PTR(-ENOSPC);
				3214
				3215	new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
				3216	if (!new_ns) {
				3217	dec_mnt_namespaces(ucounts);
				3218	return ERR_PTR(-ENOMEM);
				3219	}
				3220	if (!anon) {
				3221	ret = ns_alloc_inum(&new_ns->ns);
				3222	if (ret) {
				3223	kfree(new_ns);
				3224	dec_mnt_namespaces(ucounts);
				3225	return ERR_PTR(ret);
				3226	}
				3227	}
				3228	new_ns->ns.ops = &mntns_operations;
				3229	if (!anon)
				3230	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
				3231	atomic_set(&new_ns->count, 1);
				3232	INIT_LIST_HEAD(&new_ns->list);
				3233	init_waitqueue_head(&new_ns->poll);
				3234	new_ns->user_ns = get_user_ns(user_ns);
				3235	new_ns->ucounts = ucounts;
				3236	return new_ns;
				3237	}
				3238
				3239	__latent_entropy
				3240	struct mnt_namespace copy_mnt_ns(unsigned long flags, struct mnt_namespace ns,
				3241	struct user_namespace user_ns, struct fs_struct new_fs)
				3242	{
				3243	struct mnt_namespace *new_ns;
				3244	struct vfsmount rootmnt = NULL, pwdmnt = NULL;
				3245	struct mount p, q;
				3246	struct mount *old;
				3247	struct mount *new;
				3248	int copy_flags;
				3249
				3250	BUG_ON(!ns);
				3251
				3252	if (likely(!(flags & CLONE_NEWNS))) {
				3253	get_mnt_ns(ns);
				3254	return ns;
				3255	}
				3256
				3257	old = ns->root;
				3258
				3259	new_ns = alloc_mnt_ns(user_ns, false);
				3260	if (IS_ERR(new_ns))
				3261	return new_ns;
				3262
				3263	namespace_lock();
				3264	/* First pass: copy the tree topology */
				3265	copy_flags = CL_COPY_UNBINDABLE \| CL_EXPIRE;
				3266	if (user_ns != ns->user_ns)
				3267	copy_flags \|= CL_SHARED_TO_SLAVE;
				3268	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
				3269	if (IS_ERR(new)) {
				3270	namespace_unlock();
				3271	free_mnt_ns(new_ns);
				3272	return ERR_CAST(new);
				3273	}
				3274	if (user_ns != ns->user_ns) {
				3275	lock_mount_hash();
				3276	lock_mnt_tree(new);
				3277	unlock_mount_hash();
				3278	}
				3279	new_ns->root = new;
				3280	list_add_tail(&new_ns->list, &new->mnt_list);
				3281
				3282	/*
				3283	* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
				3284	* as belonging to new namespace. We have already acquired a private
				3285	* fs_struct, so tsk->fs->lock is not needed.
				3286	*/
				3287	p = old;
				3288	q = new;
				3289	while (p) {
				3290	q->mnt_ns = new_ns;
				3291	new_ns->mounts++;
				3292	if (new_fs) {
				3293	if (&p->mnt == new_fs->root.mnt) {
				3294	new_fs->root.mnt = mntget(&q->mnt);
				3295	rootmnt = &p->mnt;
				3296	}
				3297	if (&p->mnt == new_fs->pwd.mnt) {
				3298	new_fs->pwd.mnt = mntget(&q->mnt);
				3299	pwdmnt = &p->mnt;
				3300	}
				3301	}
				3302	p = next_mnt(p, old);
				3303	q = next_mnt(q, new);
				3304	if (!q)
				3305	break;
				3306	while (p->mnt.mnt_root != q->mnt.mnt_root)
				3307	p = next_mnt(p, old);
				3308	}
				3309	namespace_unlock();
				3310
				3311	if (rootmnt)
				3312	mntput(rootmnt);
				3313	if (pwdmnt)
				3314	mntput(pwdmnt);
				3315
				3316	return new_ns;
				3317	}
				3318
				3319	struct dentry mount_subtree(struct vfsmount m, const char *name)
				3320	{
				3321	struct mount *mnt = real_mount(m);
				3322	struct mnt_namespace *ns;
				3323	struct super_block *s;
				3324	struct path path;
				3325	int err;
				3326
				3327	ns = alloc_mnt_ns(&init_user_ns, true);
				3328	if (IS_ERR(ns)) {
				3329	mntput(m);
				3330	return ERR_CAST(ns);
				3331	}
				3332	mnt->mnt_ns = ns;
				3333	ns->root = mnt;
				3334	ns->mounts++;
				3335	list_add(&mnt->mnt_list, &ns->list);
				3336
				3337	err = vfs_path_lookup(m->mnt_root, m,
				3338	name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &path);
				3339
				3340	put_mnt_ns(ns);
				3341
				3342	if (err)
				3343	return ERR_PTR(err);
				3344
				3345	/* trade a vfsmount reference for active sb one */
				3346	s = path.mnt->mnt_sb;
				3347	atomic_inc(&s->s_active);
				3348	mntput(path.mnt);
				3349	/* lock the sucker */
				3350	down_write(&s->s_umount);
				3351	/* ... and return the root of (sub)tree on it */
				3352	return path.dentry;
				3353	}
				3354	EXPORT_SYMBOL(mount_subtree);
				3355
				3356	int ksys_mount(const char __user dev_name, const char __user dir_name,
				3357	const char __user type, unsigned long flags, void __user data)
				3358	{
				3359	int ret;
				3360	char *kernel_type;
				3361	char *kernel_dev;
				3362	void *options;
				3363
				3364	kernel_type = copy_mount_string(type);
				3365	ret = PTR_ERR(kernel_type);
				3366	if (IS_ERR(kernel_type))
				3367	goto out_type;
				3368
				3369	kernel_dev = copy_mount_string(dev_name);
				3370	ret = PTR_ERR(kernel_dev);
				3371	if (IS_ERR(kernel_dev))
				3372	goto out_dev;
				3373
				3374	options = copy_mount_options(data);
				3375	ret = PTR_ERR(options);
				3376	if (IS_ERR(options))
				3377	goto out_data;
				3378
				3379	ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
				3380
				3381	kfree(options);
				3382	out_data:
				3383	kfree(kernel_dev);
				3384	out_dev:
				3385	kfree(kernel_type);
				3386	out_type:
				3387	return ret;
				3388	}
				3389
				3390	SYSCALL_DEFINE5(mount, char __user , dev_name, char __user , dir_name,
				3391	char __user , type, unsigned long, flags, void __user , data)
				3392	{
				3393	return ksys_mount(dev_name, dir_name, type, flags, data);
				3394	}
				3395
				3396	/*
				3397	* Create a kernel mount representation for a new, prepared superblock
				3398	* (specified by fs_fd) and attach to an open_tree-like file descriptor.
				3399	*/
				3400	SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
				3401	unsigned int, attr_flags)
				3402	{
				3403	struct mnt_namespace *ns;
				3404	struct fs_context *fc;
				3405	struct file *file;
				3406	struct path newmount;
				3407	struct mount *mnt;
				3408	struct fd f;
				3409	unsigned int mnt_flags = 0;
				3410	long ret;
				3411
				3412	if (!may_mount())
				3413	return -EPERM;
				3414
				3415	if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
				3416	return -EINVAL;
				3417
				3418	if (attr_flags & ~(MOUNT_ATTR_RDONLY \|
				3419	MOUNT_ATTR_NOSUID \|
				3420	MOUNT_ATTR_NODEV \|
				3421	MOUNT_ATTR_NOEXEC \|
				3422	MOUNT_ATTR__ATIME \|
				3423	MOUNT_ATTR_NODIRATIME))
				3424	return -EINVAL;
				3425
				3426	if (attr_flags & MOUNT_ATTR_RDONLY)
				3427	mnt_flags \|= MNT_READONLY;
				3428	if (attr_flags & MOUNT_ATTR_NOSUID)
				3429	mnt_flags \|= MNT_NOSUID;
				3430	if (attr_flags & MOUNT_ATTR_NODEV)
				3431	mnt_flags \|= MNT_NODEV;
				3432	if (attr_flags & MOUNT_ATTR_NOEXEC)
				3433	mnt_flags \|= MNT_NOEXEC;
				3434	if (attr_flags & MOUNT_ATTR_NODIRATIME)
				3435	mnt_flags \|= MNT_NODIRATIME;
				3436
				3437	switch (attr_flags & MOUNT_ATTR__ATIME) {
				3438	case MOUNT_ATTR_STRICTATIME:
				3439	break;
				3440	case MOUNT_ATTR_NOATIME:
				3441	mnt_flags \|= MNT_NOATIME;
				3442	break;
				3443	case MOUNT_ATTR_RELATIME:
				3444	mnt_flags \|= MNT_RELATIME;
				3445	break;
				3446	default:
				3447	return -EINVAL;
				3448	}
				3449
				3450	f = fdget(fs_fd);
				3451	if (!f.file)
				3452	return -EBADF;
				3453
				3454	ret = -EINVAL;
				3455	if (f.file->f_op != &fscontext_fops)
				3456	goto err_fsfd;
				3457
				3458	fc = f.file->private_data;
				3459
				3460	ret = mutex_lock_interruptible(&fc->uapi_mutex);
				3461	if (ret < 0)
				3462	goto err_fsfd;
				3463
				3464	/* There must be a valid superblock or we can't mount it */
				3465	ret = -EINVAL;
				3466	if (!fc->root)
				3467	goto err_unlock;
				3468
				3469	ret = -EPERM;
				3470	if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
				3471	pr_warn("VFS: Mount too revealing\n");
				3472	goto err_unlock;
				3473	}
				3474
				3475	ret = -EBUSY;
				3476	if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
				3477	goto err_unlock;
				3478
				3479	ret = -EPERM;
				3480	if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock())
				3481	goto err_unlock;
				3482
				3483	newmount.mnt = vfs_create_mount(fc);
				3484	if (IS_ERR(newmount.mnt)) {
				3485	ret = PTR_ERR(newmount.mnt);
				3486	goto err_unlock;
				3487	}
				3488	newmount.dentry = dget(fc->root);
				3489	newmount.mnt->mnt_flags = mnt_flags;
				3490
				3491	/* We've done the mount bit - now move the file context into more or
				3492	* less the same state as if we'd done an fspick(). We don't want to
				3493	* do any memory allocation or anything like that at this point as we
				3494	* don't want to have to handle any errors incurred.
				3495	*/
				3496	vfs_clean_context(fc);
				3497
				3498	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
				3499	if (IS_ERR(ns)) {
				3500	ret = PTR_ERR(ns);
				3501	goto err_path;
				3502	}
				3503	mnt = real_mount(newmount.mnt);
				3504	mnt->mnt_ns = ns;
				3505	ns->root = mnt;
				3506	ns->mounts = 1;
				3507	list_add(&mnt->mnt_list, &ns->list);
				3508	mntget(newmount.mnt);
				3509
				3510	/* Attach to an apparent O_PATH fd with a note that we need to unmount
				3511	* it, not just simply put it.
				3512	*/
				3513	file = dentry_open(&newmount, O_PATH, fc->cred);
				3514	if (IS_ERR(file)) {
				3515	dissolve_on_fput(newmount.mnt);
				3516	ret = PTR_ERR(file);
				3517	goto err_path;
				3518	}
				3519	file->f_mode \|= FMODE_NEED_UNMOUNT;
				3520
				3521	ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
				3522	if (ret >= 0)
				3523	fd_install(ret, file);
				3524	else
				3525	fput(file);
				3526
				3527	err_path:
				3528	path_put(&newmount);
				3529	err_unlock:
				3530	mutex_unlock(&fc->uapi_mutex);
				3531	err_fsfd:
				3532	fdput(f);
				3533	return ret;
				3534	}
				3535
				3536	/*
				3537	* Move a mount from one place to another. In combination with
				3538	* fsopen()/fsmount() this is used to install a new mount and in combination
				3539	* with open_tree(OPEN_TREE_CLONE [\| AT_RECURSIVE]) it can be used to copy
				3540	* a mount subtree.
				3541	*
				3542	* Note the flags value is a combination of MOVE_MOUNT_* flags.
				3543	*/
				3544	SYSCALL_DEFINE5(move_mount,
				3545	int, from_dfd, const char *, from_pathname,
				3546	int, to_dfd, const char *, to_pathname,
				3547	unsigned int, flags)
				3548	{
				3549	struct path from_path, to_path;
				3550	unsigned int lflags;
				3551	int ret = 0;
				3552
				3553	if (!may_mount())
				3554	return -EPERM;
				3555
				3556	if (flags & ~MOVE_MOUNT__MASK)
				3557	return -EINVAL;
				3558
				3559	/* If someone gives a pathname, they aren't permitted to move
				3560	* from an fd that requires unmount as we can't get at the flag
				3561	* to clear it afterwards.
				3562	*/
				3563	lflags = 0;
				3564	if (flags & MOVE_MOUNT_F_SYMLINKS) lflags \|= LOOKUP_FOLLOW;
				3565	if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags \|= LOOKUP_AUTOMOUNT;
				3566	if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags \|= LOOKUP_EMPTY;
				3567
				3568	ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
				3569	if (ret < 0)
				3570	return ret;
				3571
				3572	lflags = 0;
				3573	if (flags & MOVE_MOUNT_T_SYMLINKS) lflags \|= LOOKUP_FOLLOW;
				3574	if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags \|= LOOKUP_AUTOMOUNT;
				3575	if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags \|= LOOKUP_EMPTY;
				3576
				3577	ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
				3578	if (ret < 0)
				3579	goto out_from;
				3580
				3581	ret = security_move_mount(&from_path, &to_path);
				3582	if (ret < 0)
				3583	goto out_to;
				3584
				3585	ret = do_move_mount(&from_path, &to_path);
				3586
				3587	out_to:
				3588	path_put(&to_path);
				3589	out_from:
				3590	path_put(&from_path);
				3591	return ret;
				3592	}
				3593
				3594	/*
				3595	* Return true if path is reachable from root
				3596	*
				3597	* namespace_sem or mount_lock is held
				3598	*/
				3599	bool is_path_reachable(struct mount mnt, struct dentry dentry,
				3600	const struct path *root)
				3601	{
				3602	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
				3603	dentry = mnt->mnt_mountpoint;
				3604	mnt = mnt->mnt_parent;
				3605	}
				3606	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
				3607	}
				3608
				3609	bool path_is_under(const struct path path1, const struct path path2)
				3610	{
				3611	bool res;
				3612	read_seqlock_excl(&mount_lock);
				3613	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
				3614	read_sequnlock_excl(&mount_lock);
				3615	return res;
				3616	}
				3617	EXPORT_SYMBOL(path_is_under);
				3618
				3619	/*
				3620	* pivot_root Semantics:
				3621	* Moves the root file system of the current process to the directory put_old,
				3622	* makes new_root as the new root file system of the current process, and sets
				3623	* root/cwd of all processes which had them on the current root to new_root.
				3624	*
				3625	* Restrictions:
				3626	* The new_root and put_old must be directories, and must not be on the
				3627	* same file system as the current process root. The put_old must be
				3628	* underneath new_root, i.e. adding a non-zero number of /.. to the string
				3629	* pointed to by put_old must yield the same directory as new_root. No other
				3630	* file system may be mounted on put_old. After all, new_root is a mountpoint.
				3631	*
				3632	* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
				3633	* See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
				3634	* in this situation.
				3635	*
				3636	* Notes:
				3637	* - we don't move root/cwd if they are not at the root (reason: if something
				3638	* cared enough to change them, it's probably wrong to force them elsewhere)
				3639	* - it's okay to pick a root that isn't the root of a file system, e.g.
				3640	* /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
				3641	* though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
				3642	* first.
				3643	*/
				3644	SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
				3645	const char __user *, put_old)
				3646	{
				3647	struct path new, old, root;
				3648	struct mount new_mnt, root_mnt, old_mnt, root_parent, *ex_parent;
				3649	struct mountpoint old_mp, root_mp;
				3650	int error;
				3651
				3652	if (!may_mount())
				3653	return -EPERM;
				3654
				3655	error = user_path_at(AT_FDCWD, new_root,
				3656	LOOKUP_FOLLOW \| LOOKUP_DIRECTORY, &new);
				3657	if (error)
				3658	goto out0;
				3659
				3660	error = user_path_at(AT_FDCWD, put_old,
				3661	LOOKUP_FOLLOW \| LOOKUP_DIRECTORY, &old);
				3662	if (error)
				3663	goto out1;
				3664
				3665	error = security_sb_pivotroot(&old, &new);
				3666	if (error)
				3667	goto out2;
				3668
				3669	get_fs_root(current->fs, &root);
				3670	old_mp = lock_mount(&old);
				3671	error = PTR_ERR(old_mp);
				3672	if (IS_ERR(old_mp))
				3673	goto out3;
				3674
				3675	error = -EINVAL;
				3676	new_mnt = real_mount(new.mnt);
				3677	root_mnt = real_mount(root.mnt);
				3678	old_mnt = real_mount(old.mnt);
				3679	ex_parent = new_mnt->mnt_parent;
				3680	root_parent = root_mnt->mnt_parent;
				3681	if (IS_MNT_SHARED(old_mnt) \|\|
				3682	IS_MNT_SHARED(ex_parent) \|\|
				3683	IS_MNT_SHARED(root_parent))
				3684	goto out4;
				3685	if (!check_mnt(root_mnt) \|\| !check_mnt(new_mnt))
				3686	goto out4;
				3687	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
				3688	goto out4;
				3689	error = -ENOENT;
				3690	if (d_unlinked(new.dentry))
				3691	goto out4;
				3692	error = -EBUSY;
				3693	if (new_mnt == root_mnt \|\| old_mnt == root_mnt)
				3694	goto out4; /* loop, on the same file system */
				3695	error = -EINVAL;
				3696	if (root.mnt->mnt_root != root.dentry)
				3697	goto out4; /* not a mountpoint */
				3698	if (!mnt_has_parent(root_mnt))
				3699	goto out4; /* not attached */
				3700	if (new.mnt->mnt_root != new.dentry)
				3701	goto out4; /* not a mountpoint */
				3702	if (!mnt_has_parent(new_mnt))
				3703	goto out4; /* not attached */
				3704	/* make sure we can reach put_old from new_root */
				3705	if (!is_path_reachable(old_mnt, old.dentry, &new))
				3706	goto out4;
				3707	/* make certain new is below the root */
				3708	if (!is_path_reachable(new_mnt, new.dentry, &root))
				3709	goto out4;
				3710	lock_mount_hash();
				3711	umount_mnt(new_mnt);
				3712	root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */
				3713	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
				3714	new_mnt->mnt.mnt_flags \|= MNT_LOCKED;
				3715	root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
				3716	}
				3717	/* mount old root on put_old */
				3718	attach_mnt(root_mnt, old_mnt, old_mp);
				3719	/* mount new_root on / */
				3720	attach_mnt(new_mnt, root_parent, root_mp);
				3721	mnt_add_count(root_parent, -1);
				3722	touch_mnt_namespace(current->nsproxy->mnt_ns);
				3723	/* A moved mount should not expire automatically */
				3724	list_del_init(&new_mnt->mnt_expire);
				3725	put_mountpoint(root_mp);
				3726	unlock_mount_hash();
				3727	chroot_fs_refs(&root, &new);
				3728	error = 0;
				3729	out4:
				3730	unlock_mount(old_mp);
				3731	if (!error)
				3732	mntput_no_expire(ex_parent);
				3733	out3:
				3734	path_put(&root);
				3735	out2:
				3736	path_put(&old);
				3737	out1:
				3738	path_put(&new);
				3739	out0:
				3740	return error;
				3741	}
				3742
				3743	static void __init init_mount_tree(void)
				3744	{
				3745	struct vfsmount *mnt;
				3746	struct mount *m;
				3747	struct mnt_namespace *ns;
				3748	struct path root;
				3749
				3750	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
				3751	if (IS_ERR(mnt))
				3752	panic("Can't create rootfs");
				3753
				3754	ns = alloc_mnt_ns(&init_user_ns, false);
				3755	if (IS_ERR(ns))
				3756	panic("Can't allocate initial namespace");
				3757	m = real_mount(mnt);
				3758	m->mnt_ns = ns;
				3759	ns->root = m;
				3760	ns->mounts = 1;
				3761	list_add(&m->mnt_list, &ns->list);
				3762	init_task.nsproxy->mnt_ns = ns;
				3763	get_mnt_ns(ns);
				3764
				3765	root.mnt = mnt;
				3766	root.dentry = mnt->mnt_root;
				3767	mnt->mnt_flags \|= MNT_LOCKED;
				3768
				3769	set_fs_pwd(current->fs, &root);
				3770	set_fs_root(current->fs, &root);
				3771	}
				3772
				3773	void __init mnt_init(void)
				3774	{
				3775	int err;
				3776
				3777	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
				3778	0, SLAB_HWCACHE_ALIGN \| SLAB_PANIC, NULL);
				3779
				3780	mount_hashtable = alloc_large_system_hash("Mount-cache",
				3781	sizeof(struct hlist_head),
				3782	mhash_entries, 19,
				3783	HASH_ZERO,
				3784	&m_hash_shift, &m_hash_mask, 0, 0);
				3785	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
				3786	sizeof(struct hlist_head),
				3787	mphash_entries, 19,
				3788	HASH_ZERO,
				3789	&mp_hash_shift, &mp_hash_mask, 0, 0);
				3790
				3791	if (!mount_hashtable \|\| !mountpoint_hashtable)
				3792	panic("Failed to allocate mount hash table\n");
				3793
				3794	kernfs_init();
				3795
				3796	err = sysfs_init();
				3797	if (err)
				3798	printk(KERN_WARNING "%s: sysfs_init error: %d\n",
				3799	__func__, err);
				3800	fs_kobj = kobject_create_and_add("fs", NULL);
				3801	if (!fs_kobj)
				3802	printk(KERN_WARNING "%s: kobj create error\n", __func__);
				3803	shmem_init();
				3804	init_rootfs();
				3805	init_mount_tree();
				3806	}
				3807
				3808	void put_mnt_ns(struct mnt_namespace *ns)
				3809	{
				3810	if (!atomic_dec_and_test(&ns->count))
				3811	return;
				3812	drop_collected_mounts(&ns->root->mnt);
				3813	free_mnt_ns(ns);
				3814	}
				3815
				3816	struct vfsmount kern_mount(struct file_system_type type)
				3817	{
				3818	struct vfsmount *mnt;
				3819	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
				3820	if (!IS_ERR(mnt)) {
				3821	/*
				3822	* it is a longterm mount, don't release mnt until
				3823	* we unmount before file sys is unregistered
				3824	*/
				3825	real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
				3826	}
				3827	return mnt;
				3828	}
				3829	EXPORT_SYMBOL_GPL(kern_mount);
				3830
				3831	void kern_unmount(struct vfsmount *mnt)
				3832	{
				3833	/* release long term mount so mount point can be released */
				3834	if (!IS_ERR_OR_NULL(mnt)) {
				3835	real_mount(mnt)->mnt_ns = NULL;
				3836	synchronize_rcu(); /* yecchhh... */
				3837	mntput(mnt);
				3838	}
				3839	}
				3840	EXPORT_SYMBOL(kern_unmount);
				3841
				3842	bool our_mnt(struct vfsmount *mnt)
				3843	{
				3844	return check_mnt(real_mount(mnt));
				3845	}
				3846
				3847	bool current_chrooted(void)
				3848	{
				3849	/* Does the current process have a non-standard root */
				3850	struct path ns_root;
				3851	struct path fs_root;
				3852	bool chrooted;
				3853
				3854	/* Find the namespace root */
				3855	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
				3856	ns_root.dentry = ns_root.mnt->mnt_root;
				3857	path_get(&ns_root);
				3858	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
				3859	;
				3860
				3861	get_fs_root(current->fs, &fs_root);
				3862
				3863	chrooted = !path_equal(&fs_root, &ns_root);
				3864
				3865	path_put(&fs_root);
				3866	path_put(&ns_root);
				3867
				3868	return chrooted;
				3869	}
				3870
				3871	static bool mnt_already_visible(struct mnt_namespace *ns,
				3872	const struct super_block *sb,
				3873	int *new_mnt_flags)
				3874	{
				3875	int new_flags = *new_mnt_flags;
				3876	struct mount *mnt;
				3877	bool visible = false;
				3878
				3879	down_read(&namespace_sem);
				3880	list_for_each_entry(mnt, &ns->list, mnt_list) {
				3881	struct mount *child;
				3882	int mnt_flags;
				3883
				3884	if (mnt->mnt.mnt_sb->s_type != sb->s_type)
				3885	continue;
				3886
				3887	/* This mount is not fully visible if it's root directory
				3888	* is not the root directory of the filesystem.
				3889	*/
				3890	if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
				3891	continue;
				3892
				3893	/* A local view of the mount flags */
				3894	mnt_flags = mnt->mnt.mnt_flags;
				3895
				3896	/* Don't miss readonly hidden in the superblock flags */
				3897	if (sb_rdonly(mnt->mnt.mnt_sb))
				3898	mnt_flags \|= MNT_LOCK_READONLY;
				3899
				3900	/* Verify the mount flags are equal to or more permissive
				3901	* than the proposed new mount.
				3902	*/
				3903	if ((mnt_flags & MNT_LOCK_READONLY) &&
				3904	!(new_flags & MNT_READONLY))
				3905	continue;
				3906	if ((mnt_flags & MNT_LOCK_ATIME) &&
				3907	((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
				3908	continue;
				3909
				3910	/* This mount is not fully visible if there are any
				3911	* locked child mounts that cover anything except for
				3912	* empty directories.
				3913	*/
				3914	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
				3915	struct inode *inode = child->mnt_mountpoint->d_inode;
				3916	/* Only worry about locked mounts */
				3917	if (!(child->mnt.mnt_flags & MNT_LOCKED))
				3918	continue;
				3919	/* Is the directory permanetly empty? */
				3920	if (!is_empty_dir_inode(inode))
				3921	goto next;
				3922	}
				3923	/* Preserve the locked attributes */
				3924	*new_mnt_flags \|= mnt_flags & (MNT_LOCK_READONLY \| \
				3925	MNT_LOCK_ATIME);
				3926	visible = true;
				3927	goto found;
				3928	next: ;
				3929	}
				3930	found:
				3931	up_read(&namespace_sem);
				3932	return visible;
				3933	}
				3934
				3935	static bool mount_too_revealing(const struct super_block sb, int new_mnt_flags)
				3936	{
				3937	const unsigned long required_iflags = SB_I_NOEXEC \| SB_I_NODEV;
				3938	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
				3939	unsigned long s_iflags;
				3940
				3941	if (ns->user_ns == &init_user_ns)
				3942	return false;
				3943
				3944	/* Can this filesystem be too revealing? */
				3945	s_iflags = sb->s_iflags;
				3946	if (!(s_iflags & SB_I_USERNS_VISIBLE))
				3947	return false;
				3948
				3949	if ((s_iflags & required_iflags) != required_iflags) {
				3950	WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
				3951	required_iflags);
				3952	return true;
				3953	}
				3954
				3955	return !mnt_already_visible(ns, sb, new_mnt_flags);
				3956	}
				3957
				3958	bool mnt_may_suid(struct vfsmount *mnt)
				3959	{
				3960	/*
				3961	* Foreign mounts (accessed via fchdir or through /proc
				3962	* symlinks) are always treated as if they are nosuid. This
				3963	* prevents namespaces from trusting potentially unsafe
				3964	* suid/sgid bits, file caps, or security labels that originate
				3965	* in other namespaces.
				3966	*/
				3967	return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
				3968	current_in_userns(mnt->mnt_sb->s_user_ns);
				3969	}
				3970
				3971	static struct ns_common mntns_get(struct task_struct task)
				3972	{
				3973	struct ns_common *ns = NULL;
				3974	struct nsproxy *nsproxy;
				3975
				3976	task_lock(task);
				3977	nsproxy = task->nsproxy;
				3978	if (nsproxy) {
				3979	ns = &nsproxy->mnt_ns->ns;
				3980	get_mnt_ns(to_mnt_ns(ns));
				3981	}
				3982	task_unlock(task);
				3983
				3984	return ns;
				3985	}
				3986
				/* proc_ns_operations ->put: drop a mount namespace reference. */
				static void mntns_put(struct ns_common *ns)
				{
					struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

					put_mnt_ns(mnt_ns);
				}
				3991
				3992	static int mntns_install(struct nsproxy nsproxy, struct ns_common ns)
				3993	{
				3994	struct fs_struct *fs = current->fs;
				3995	struct mnt_namespace mnt_ns = to_mnt_ns(ns), old_mnt_ns;
				3996	struct path root;
				3997	int err;
				3998
				3999	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) \|\|
				4000	!ns_capable(current_user_ns(), CAP_SYS_CHROOT) \|\|
				4001	!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
				4002	return -EPERM;
				4003
				4004	if (is_anon_ns(mnt_ns))
				4005	return -EINVAL;
				4006
				4007	if (fs->users != 1)
				4008	return -EINVAL;
				4009
				4010	get_mnt_ns(mnt_ns);
				4011	old_mnt_ns = nsproxy->mnt_ns;
				4012	nsproxy->mnt_ns = mnt_ns;
				4013
				4014	/* Find the root */
				4015	err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
				4016	"/", LOOKUP_DOWN, &root);
				4017	if (err) {
				4018	/* revert to old namespace */
				4019	nsproxy->mnt_ns = old_mnt_ns;
				4020	put_mnt_ns(mnt_ns);
				4021	return err;
				4022	}
				4023
				4024	put_mnt_ns(old_mnt_ns);
				4025
				4026	/* Update the pwd and root */
				4027	set_fs_pwd(fs, &root);
				4028	set_fs_root(fs, &root);
				4029
				4030	path_put(&root);
				4031	return 0;
				4032	}
				4033
				4034	static struct user_namespace mntns_owner(struct ns_common ns)
				4035	{
				4036	return to_mnt_ns(ns)->user_ns;
				4037	}
				4038
				4039	const struct proc_ns_operations mntns_operations = {
				4040	.name = "mnt",
				4041	.type = CLONE_NEWNS,
				4042	.get = mntns_get,
				4043	.put = mntns_put,
				4044	.install = mntns_install,
				4045	.owner = mntns_owner,
				4046	};