Blame - src/kernel/linux/v4.19/fs/namespace.c - T800

blob: 3347c1d1decfc137df2806a09b42ab0cb1b5f0d2 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* linux/fs/namespace.c
				3	*
				4	* (C) Copyright Al Viro 2000, 2001
				5	* Released under GPL v2.
				6	*
				7	* Based on code from fs/super.c, copyright Linus Torvalds and others.
				8	* Heavily rewritten.
				9	*/
				10
				11	#include <linux/syscalls.h>
				12	#include <linux/export.h>
				13	#include <linux/capability.h>
				14	#include <linux/mnt_namespace.h>
				15	#include <linux/user_namespace.h>
				16	#include <linux/namei.h>
				17	#include <linux/security.h>
				18	#include <linux/cred.h>
				19	#include <linux/idr.h>
				20	#include <linux/init.h> /* init_rootfs */
				21	#include <linux/fs_struct.h> /* get_fs_root et.al. */
				22	#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
				23	#include <linux/uaccess.h>
				24	#include <linux/proc_ns.h>
				25	#include <linux/magic.h>
				26	#include <linux/bootmem.h>
				27	#include <linux/task_work.h>
				28	#include <linux/sched/task.h>
				29
				30	#include "pnode.h"
				31	#include "internal.h"
				32
				33	/* Maximum number of mounts in a mount namespace */
				34	unsigned int sysctl_mount_max __read_mostly = 100000;
				35
				/* Mask and shift that m_hash() applies when indexing mount_hashtable. */
				36	static unsigned int m_hash_mask __read_mostly;
				37	static unsigned int m_hash_shift __read_mostly;
				/* Mask and shift that mp_hash() applies when indexing mountpoint_hashtable. */
				38	static unsigned int mp_hash_mask __read_mostly;
				39	static unsigned int mp_hash_shift __read_mostly;
				40
				41	static __initdata unsigned long mhash_entries;
				42	static int __init set_mhash_entries(char *str)
				43	{
				44	if (!str)
				45	return 0;
				46	mhash_entries = simple_strtoul(str, &str, 0);
				47	return 1;
				48	}
				49	__setup("mhash_entries=", set_mhash_entries);
				50
				51	static __initdata unsigned long mphash_entries;
				52	static int __init set_mphash_entries(char *str)
				53	{
				54	if (!str)
				55	return 0;
				56	mphash_entries = simple_strtoul(str, &str, 0);
				57	return 1;
				58	}
				59	__setup("mphash_entries=", set_mphash_entries);
				60
				/* Global event counter; touch_mnt_namespace() increments it and copies it into ns->event. */
				61	static u64 event;
				/* ID allocator behind mnt_alloc_id()/mnt_free_id() (mnt->mnt_id). */
				62	static DEFINE_IDA(mnt_id_ida);
				/* ID allocator for peer group IDs (mnt->mnt_group_id); allocation starts at 1. */
				63	static DEFINE_IDA(mnt_group_ida);
				64
				/* Bucket array indexed by m_hash(). */
				65	static struct hlist_head *mount_hashtable __read_mostly;
				/* Bucket array indexed by mp_hash(). */
				66	static struct hlist_head *mountpoint_hashtable __read_mostly;
				/* Slab cache from which alloc_vfsmnt() allocates struct mount. */
				67	static struct kmem_cache *mnt_cache __read_mostly;
				/* Taken for read by __is_local_mountpoint() while walking a namespace's mount list. */
				68	static DECLARE_RWSEM(namespace_sem);
				69
				70	/* /sys/fs */
				71	struct kobject *fs_kobj;
				72	EXPORT_SYMBOL_GPL(fs_kobj);
				73
				74	/*
				75	* vfsmount lock may be taken for read to prevent changes to the
				76	* vfsmount hash, ie. during mountpoint lookups or walking back
				77	* up the tree.
				78	*
				79	* It should be taken for write in all cases where the vfsmount
				80	* tree or hash is modified or when a vfsmount structure is modified.
				81	*/
				82	__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
				83
				84	static inline struct hlist_head m_hash(struct vfsmount mnt, struct dentry *dentry)
				85	{
				86	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
				87	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
				88	tmp = tmp + (tmp >> m_hash_shift);
				89	return &mount_hashtable[tmp & m_hash_mask];
				90	}
				91
				92	static inline struct hlist_head mp_hash(struct dentry dentry)
				93	{
				94	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
				95	tmp = tmp + (tmp >> mp_hash_shift);
				96	return &mountpoint_hashtable[tmp & mp_hash_mask];
				97	}
				98
				99	static int mnt_alloc_id(struct mount *mnt)
				100	{
				101	int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
				102
				103	if (res < 0)
				104	return res;
				105	mnt->mnt_id = res;
				106	return 0;
				107	}
				108
				109	static void mnt_free_id(struct mount *mnt)
				110	{
				111	ida_free(&mnt_id_ida, mnt->mnt_id);
				112	}
				113
				114	/*
				115	* Allocate a new peer group ID
				116	*/
				117	static int mnt_alloc_group_id(struct mount *mnt)
				118	{
				119	int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
				120
				121	if (res < 0)
				122	return res;
				123	mnt->mnt_group_id = res;
				124	return 0;
				125	}
				126
				127	/*
				128	* Release a peer group ID
				129	*/
				130	void mnt_release_group_id(struct mount *mnt)
				131	{
				132	ida_free(&mnt_group_ida, mnt->mnt_group_id);
				133	mnt->mnt_group_id = 0;
				134	}
				135
				136	/*
				137	* vfsmount lock must be held for read
				138	*/
				139	static inline void mnt_add_count(struct mount *mnt, int n)
				140	{
				141	#ifdef CONFIG_SMP
				142	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
				143	#else
				144	preempt_disable();
				145	mnt->mnt_count += n;
				146	preempt_enable();
				147	#endif
				148	}
				149
				150	/*
				151	* vfsmount lock must be held for write
				152	*/
				153	unsigned int mnt_get_count(struct mount *mnt)
				154	{
				155	#ifdef CONFIG_SMP
				156	unsigned int count = 0;
				157	int cpu;
				158
				159	for_each_possible_cpu(cpu) {
				160	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
				161	}
				162
				163	return count;
				164	#else
				165	return mnt->mnt_count;
				166	#endif
				167	}
				168
				169	static void drop_mountpoint(struct fs_pin *p)
				170	{
				171	struct mount *m = container_of(p, struct mount, mnt_umount);
				172	dput(m->mnt_ex_mountpoint);
				173	pin_remove(p);
				174	mntput(&m->mnt);
				175	}
				176
				177	static struct mount alloc_vfsmnt(const char name)
				178	{
				179	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
				180	if (mnt) {
				181	int err;
				182
				183	err = mnt_alloc_id(mnt);
				184	if (err)
				185	goto out_free_cache;
				186
				187	if (name) {
				188	mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
				189	if (!mnt->mnt_devname)
				190	goto out_free_id;
				191	}
				192
				193	#ifdef CONFIG_SMP
				194	mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
				195	if (!mnt->mnt_pcp)
				196	goto out_free_devname;
				197
				198	this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
				199	#else
				200	mnt->mnt_count = 1;
				201	mnt->mnt_writers = 0;
				202	#endif
				203	mnt->mnt.data = NULL;
				204
				205	INIT_HLIST_NODE(&mnt->mnt_hash);
				206	INIT_LIST_HEAD(&mnt->mnt_child);
				207	INIT_LIST_HEAD(&mnt->mnt_mounts);
				208	INIT_LIST_HEAD(&mnt->mnt_list);
				209	INIT_LIST_HEAD(&mnt->mnt_expire);
				210	INIT_LIST_HEAD(&mnt->mnt_share);
				211	INIT_LIST_HEAD(&mnt->mnt_slave_list);
				212	INIT_LIST_HEAD(&mnt->mnt_slave);
				213	INIT_HLIST_NODE(&mnt->mnt_mp_list);
				214	INIT_LIST_HEAD(&mnt->mnt_umounting);
				215	init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
				216	}
				217	return mnt;
				218
				219	#ifdef CONFIG_SMP
				220	out_free_devname:
				221	kfree_const(mnt->mnt_devname);
				222	#endif
				223	out_free_id:
				224	mnt_free_id(mnt);
				225	out_free_cache:
				226	kmem_cache_free(mnt_cache, mnt);
				227	return NULL;
				228	}
				229
				230	/*
				231	* Most r/o checks on a fs are for operations that take
				232	* discrete amounts of time, like a write() or unlink().
				233	* We must keep track of when those operations start
				234	* (for permission checks) and when they end, so that
				235	* we can determine when writes are able to occur to
				236	* a filesystem.
				237	*/
				238	/*
				239	* __mnt_is_readonly: check whether a mount is read-only
				240	* @mnt: the mount to check for its write status
				241	*
				242	* This shouldn't be used directly outside of the VFS.
				243	* It does not guarantee that the filesystem will stay
				244	* r/w, just that it is right now. This can not and
				245	* should not be used in place of IS_RDONLY(inode).
				246	* mnt_want/drop_write() will _keep_ the filesystem
				247	* r/w.
				248	*/
				249	int __mnt_is_readonly(struct vfsmount *mnt)
				250	{
				251	if (mnt->mnt_flags & MNT_READONLY)
				252	return 1;
				253	if (sb_rdonly(mnt->mnt_sb))
				254	return 1;
				255	return 0;
				256	}
				257	EXPORT_SYMBOL_GPL(__mnt_is_readonly);
				258
				259	static inline void mnt_inc_writers(struct mount *mnt)
				260	{
				261	#ifdef CONFIG_SMP
				262	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
				263	#else
				264	mnt->mnt_writers++;
				265	#endif
				266	}
				267
				268	static inline void mnt_dec_writers(struct mount *mnt)
				269	{
				270	#ifdef CONFIG_SMP
				271	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
				272	#else
				273	mnt->mnt_writers--;
				274	#endif
				275	}
				276
				277	static unsigned int mnt_get_writers(struct mount *mnt)
				278	{
				279	#ifdef CONFIG_SMP
				280	unsigned int count = 0;
				281	int cpu;
				282
				283	for_each_possible_cpu(cpu) {
				284	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
				285	}
				286
				287	return count;
				288	#else
				289	return mnt->mnt_writers;
				290	#endif
				291	}
				292
				293	static int mnt_is_readonly(struct vfsmount *mnt)
				294	{
				295	if (mnt->mnt_sb->s_readonly_remount)
				296	return 1;
				297	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
				298	smp_rmb();
				299	return __mnt_is_readonly(mnt);
				300	}
				301
				302	/*
				303	* Most r/o & frozen checks on a fs are for operations that take discrete
				304	* amounts of time, like a write() or unlink(). We must keep track of when
				305	* those operations start (for permission checks) and when they end, so that we
				306	* can determine when writes are able to occur to a filesystem.
				307	*/
				308	/**
				309	* __mnt_want_write - get write access to a mount without freeze protection
				310	* @m: the mount on which to take a write
				311	*
				312	* This tells the low-level filesystem that a write is about to be performed to
				313	* it, and makes sure that writes are allowed (mnt is read-write) before
				314	* returning success. This operation does not protect against filesystem being
				315	* frozen. When the write operation is finished, __mnt_drop_write() must be
				316	* called. This is effectively a refcount.
				317	*/
				318	int __mnt_want_write(struct vfsmount *m)
				319	{
				320	struct mount *mnt = real_mount(m);
				321	int ret = 0;
				322
				323	preempt_disable();
				324	mnt_inc_writers(mnt);
				325	/*
				326	* The store to mnt_inc_writers must be visible before we pass
				327	* MNT_WRITE_HOLD loop below, so that the slowpath can see our
				328	* incremented count after it has set MNT_WRITE_HOLD.
				329	*/
				330	smp_mb();
				331	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
				332	cpu_relax();
				333	/*
				334	* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
				335	* be set to match its requirements. So we must not load that until
				336	* MNT_WRITE_HOLD is cleared.
				337	*/
				338	smp_rmb();
				339	if (mnt_is_readonly(m)) {
				340	mnt_dec_writers(mnt);
				341	ret = -EROFS;
				342	}
				343	preempt_enable();
				344
				345	return ret;
				346	}
				347
				348	/**
				349	* mnt_want_write - get write access to a mount
				350	* @m: the mount on which to take a write
				351	*
				352	* This tells the low-level filesystem that a write is about to be performed to
				353	* it, and makes sure that writes are allowed (mount is read-write, filesystem
				354	* is not frozen) before returning success. When the write operation is
				355	* finished, mnt_drop_write() must be called. This is effectively a refcount.
				356	*/
				357	int mnt_want_write(struct vfsmount *m)
				358	{
				359	int ret;
				360
				361	sb_start_write(m->mnt_sb);
				362	ret = __mnt_want_write(m);
				363	if (ret)
				364	sb_end_write(m->mnt_sb);
				365	return ret;
				366	}
				367	EXPORT_SYMBOL_GPL(mnt_want_write);
				368
				369	/**
				370	* mnt_clone_write - get write access to a mount
				371	* @mnt: the mount on which to take a write
				372	*
				373	* This is effectively like mnt_want_write, except
				374	* it must only be used to take an extra write reference
				375	* on a mountpoint that we already know has a write reference
				376	* on it. This allows some optimisation.
				377	*
				378	* After finished, mnt_drop_write must be called as usual to
				379	* drop the reference.
				380	*/
				381	int mnt_clone_write(struct vfsmount *mnt)
				382	{
				383	/* superblock may be r/o */
				384	if (__mnt_is_readonly(mnt))
				385	return -EROFS;
				386	preempt_disable();
				387	mnt_inc_writers(real_mount(mnt));
				388	preempt_enable();
				389	return 0;
				390	}
				391	EXPORT_SYMBOL_GPL(mnt_clone_write);
				392
				393	/**
				394	* __mnt_want_write_file - get write access to a file's mount
				395	* @file: the file whose mount on which to take a write
				396	*
				397	* This is like __mnt_want_write, but it takes a file and can
				398	* do some optimisations if the file is open for write already
				399	*/
				400	int __mnt_want_write_file(struct file *file)
				401	{
				402	if (!(file->f_mode & FMODE_WRITER))
				403	return __mnt_want_write(file->f_path.mnt);
				404	else
				405	return mnt_clone_write(file->f_path.mnt);
				406	}
				407
				408	/**
				409	* mnt_want_write_file - get write access to a file's mount
				410	* @file: the file whose mount on which to take a write
				411	*
				412	* This is like mnt_want_write, but it takes a file and can
				413	* do some optimisations if the file is open for write already
				414	*/
				415	int mnt_want_write_file(struct file *file)
				416	{
				417	int ret;
				418
				419	sb_start_write(file_inode(file)->i_sb);
				420	ret = __mnt_want_write_file(file);
				421	if (ret)
				422	sb_end_write(file_inode(file)->i_sb);
				423	return ret;
				424	}
				425	EXPORT_SYMBOL_GPL(mnt_want_write_file);
				426
				427	/**
				428	* __mnt_drop_write - give up write access to a mount
				429	* @mnt: the mount on which to give up write access
				430	*
				431	* Tells the low-level filesystem that we are done
				432	* performing writes to it. Must be matched with
				433	* __mnt_want_write() call above.
				434	*/
				435	void __mnt_drop_write(struct vfsmount *mnt)
				436	{
				437	preempt_disable();
				438	mnt_dec_writers(real_mount(mnt));
				439	preempt_enable();
				440	}
				441
				442	/**
				443	* mnt_drop_write - give up write access to a mount
				444	* @mnt: the mount on which to give up write access
				445	*
				446	* Tells the low-level filesystem that we are done performing writes to it and
				447	* also allows filesystem to be frozen again. Must be matched with
				448	* mnt_want_write() call above.
				449	*/
				450	void mnt_drop_write(struct vfsmount *mnt)
				451	{
				452	__mnt_drop_write(mnt);
				453	sb_end_write(mnt->mnt_sb);
				454	}
				455	EXPORT_SYMBOL_GPL(mnt_drop_write);
				456
				457	void __mnt_drop_write_file(struct file *file)
				458	{
				459	__mnt_drop_write(file->f_path.mnt);
				460	}
				461
				462	void mnt_drop_write_file(struct file *file)
				463	{
				464	__mnt_drop_write_file(file);
				465	sb_end_write(file_inode(file)->i_sb);
				466	}
				467	EXPORT_SYMBOL(mnt_drop_write_file);
				468
				469	static int mnt_make_readonly(struct mount *mnt)
				470	{
				471	int ret = 0;
				472
				473	lock_mount_hash();
				474	mnt->mnt.mnt_flags \|= MNT_WRITE_HOLD;
				475	/*
				476	* After storing MNT_WRITE_HOLD, we'll read the counters. This store
				477	* should be visible before we do.
				478	*/
				479	smp_mb();
				480
				481	/*
				482	* With writers on hold, if this value is zero, then there are
				483	* definitely no active writers (although held writers may subsequently
				484	* increment the count, they'll have to wait, and decrement it after
				485	* seeing MNT_READONLY).
				486	*
				487	* It is OK to have counter incremented on one CPU and decremented on
				488	* another: the sum will add up correctly. The danger would be when we
				489	* sum up each counter, if we read a counter before it is incremented,
				490	* but then read another CPU's count which it has been subsequently
				491	* decremented from -- we would see more decrements than we should.
				492	* MNT_WRITE_HOLD protects against this scenario, because
				493	* mnt_want_write first increments count, then smp_mb, then spins on
				494	* MNT_WRITE_HOLD, so it can't be decremented by another CPU while
				495	* we're counting up here.
				496	*/
				497	if (mnt_get_writers(mnt) > 0)
				498	ret = -EBUSY;
				499	else
				500	mnt->mnt.mnt_flags \|= MNT_READONLY;
				501	/*
				502	* MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
				503	* that become unheld will see MNT_READONLY.
				504	*/
				505	smp_wmb();
				506	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
				507	unlock_mount_hash();
				508	return ret;
				509	}
				510
				511	static void __mnt_unmake_readonly(struct mount *mnt)
				512	{
				513	lock_mount_hash();
				514	mnt->mnt.mnt_flags &= ~MNT_READONLY;
				515	unlock_mount_hash();
				516	}
				517
				518	int sb_prepare_remount_readonly(struct super_block *sb)
				519	{
				520	struct mount *mnt;
				521	int err = 0;
				522
				523	/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
				524	if (atomic_long_read(&sb->s_remove_count))
				525	return -EBUSY;
				526
				527	lock_mount_hash();
				528	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
				529	if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
				530	mnt->mnt.mnt_flags \|= MNT_WRITE_HOLD;
				531	smp_mb();
				532	if (mnt_get_writers(mnt) > 0) {
				533	err = -EBUSY;
				534	break;
				535	}
				536	}
				537	}
				538	if (!err && atomic_long_read(&sb->s_remove_count))
				539	err = -EBUSY;
				540
				541	if (!err) {
				542	sb->s_readonly_remount = 1;
				543	smp_wmb();
				544	}
				545	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
				546	if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
				547	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
				548	}
				549	unlock_mount_hash();
				550
				551	return err;
				552	}
				553
				554	static void free_vfsmnt(struct mount *mnt)
				555	{
				556	kfree(mnt->mnt.data);
				557	kfree_const(mnt->mnt_devname);
				558	#ifdef CONFIG_SMP
				559	free_percpu(mnt->mnt_pcp);
				560	#endif
				561	kmem_cache_free(mnt_cache, mnt);
				562	}
				563
				564	static void delayed_free_vfsmnt(struct rcu_head *head)
				565	{
				566	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
				567	}
				568
				569	/* call under rcu_read_lock */
				570	int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
				571	{
				572	struct mount *mnt;
				573	if (read_seqretry(&mount_lock, seq))
				574	return 1;
				575	if (bastard == NULL)
				576	return 0;
				577	mnt = real_mount(bastard);
				578	mnt_add_count(mnt, 1);
				579	smp_mb(); // see mntput_no_expire()
				580	if (likely(!read_seqretry(&mount_lock, seq)))
				581	return 0;
				582	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
				583	mnt_add_count(mnt, -1);
				584	return 1;
				585	}
				586	lock_mount_hash();
				587	if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
				588	mnt_add_count(mnt, -1);
				589	unlock_mount_hash();
				590	return 1;
				591	}
				592	unlock_mount_hash();
				593	/* caller will mntput() */
				594	return -1;
				595	}
				596
				597	/* call under rcu_read_lock */
				598	bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
				599	{
				600	int res = __legitimize_mnt(bastard, seq);
				601	if (likely(!res))
				602	return true;
				603	if (unlikely(res < 0)) {
				604	rcu_read_unlock();
				605	mntput(bastard);
				606	rcu_read_lock();
				607	}
				608	return false;
				609	}
				610
				611	/*
				612	* find the first mount at @dentry on vfsmount @mnt.
				613	* call under rcu_read_lock()
				614	*/
				615	struct mount __lookup_mnt(struct vfsmount mnt, struct dentry *dentry)
				616	{
				617	struct hlist_head *head = m_hash(mnt, dentry);
				618	struct mount *p;
				619
				620	hlist_for_each_entry_rcu(p, head, mnt_hash)
				621	if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
				622	return p;
				623	return NULL;
				624	}
				625
				626	/*
				627	* lookup_mnt - Return the first child mount mounted at path
				628	*
				629	* "First" means first mounted chronologically. If you create the
				630	* following mounts:
				631	*
				632	* mount /dev/sda1 /mnt
				633	* mount /dev/sda2 /mnt
				634	* mount /dev/sda3 /mnt
				635	*
				636	* Then lookup_mnt() on the base /mnt dentry in the root mount will
				637	* return successively the root dentry and vfsmount of /dev/sda1, then
				638	* /dev/sda2, then /dev/sda3, then NULL.
				639	*
				640	* lookup_mnt takes a reference to the found vfsmount.
				641	*/
				642	struct vfsmount lookup_mnt(const struct path path)
				643	{
				644	struct mount *child_mnt;
				645	struct vfsmount *m;
				646	unsigned seq;
				647
				648	rcu_read_lock();
				649	do {
				650	seq = read_seqbegin(&mount_lock);
				651	child_mnt = __lookup_mnt(path->mnt, path->dentry);
				652	m = child_mnt ? &child_mnt->mnt : NULL;
				653	} while (!legitimize_mnt(m, seq));
				654	rcu_read_unlock();
				655	return m;
				656	}
				657
				658	/*
				659	* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
				660	* current mount namespace.
				661	*
				662	* The common case is dentries are not mountpoints at all and that
				663	* test is handled inline. For the slow case when we are actually
				664	* dealing with a mountpoint of some kind, walk through all of the
				665	* mounts in the current mount namespace and test to see if the dentry
				666	* is a mountpoint.
				667	*
				668	* The mount_hashtable is not usable in the context because we
				669	* need to identify all mounts that may be in the current mount
				670	* namespace not just a mount that happens to have some specified
				671	* parent mount.
				672	*/
				673	bool __is_local_mountpoint(struct dentry *dentry)
				674	{
				675	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
				676	struct mount *mnt;
				677	bool is_covered = false;
				678
				679	if (!d_mountpoint(dentry))
				680	goto out;
				681
				682	down_read(&namespace_sem);
				683	list_for_each_entry(mnt, &ns->list, mnt_list) {
				684	is_covered = (mnt->mnt_mountpoint == dentry);
				685	if (is_covered)
				686	break;
				687	}
				688	up_read(&namespace_sem);
				689	out:
				690	return is_covered;
				691	}
				692
				693	static struct mountpoint lookup_mountpoint(struct dentry dentry)
				694	{
				695	struct hlist_head *chain = mp_hash(dentry);
				696	struct mountpoint *mp;
				697
				698	hlist_for_each_entry(mp, chain, m_hash) {
				699	if (mp->m_dentry == dentry) {
				700	mp->m_count++;
				701	return mp;
				702	}
				703	}
				704	return NULL;
				705	}
				706
				707	static struct mountpoint get_mountpoint(struct dentry dentry)
				708	{
				709	struct mountpoint mp, new = NULL;
				710	int ret;
				711
				712	if (d_mountpoint(dentry)) {
				713	/* might be worth a WARN_ON() */
				714	if (d_unlinked(dentry))
				715	return ERR_PTR(-ENOENT);
				716	mountpoint:
				717	read_seqlock_excl(&mount_lock);
				718	mp = lookup_mountpoint(dentry);
				719	read_sequnlock_excl(&mount_lock);
				720	if (mp)
				721	goto done;
				722	}
				723
				724	if (!new)
				725	new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
				726	if (!new)
				727	return ERR_PTR(-ENOMEM);
				728
				729
				730	/* Exactly one processes may set d_mounted */
				731	ret = d_set_mounted(dentry);
				732
				733	/* Someone else set d_mounted? */
				734	if (ret == -EBUSY)
				735	goto mountpoint;
				736
				737	/* The dentry is not available as a mountpoint? */
				738	mp = ERR_PTR(ret);
				739	if (ret)
				740	goto done;
				741
				742	/* Add the new mountpoint to the hash table */
				743	read_seqlock_excl(&mount_lock);
				744	new->m_dentry = dentry;
				745	new->m_count = 1;
				746	hlist_add_head(&new->m_hash, mp_hash(dentry));
				747	INIT_HLIST_HEAD(&new->m_list);
				748	read_sequnlock_excl(&mount_lock);
				749
				750	mp = new;
				751	new = NULL;
				752	done:
				753	kfree(new);
				754	return mp;
				755	}
				756
				757	static void put_mountpoint(struct mountpoint *mp)
				758	{
				759	if (!--mp->m_count) {
				760	struct dentry *dentry = mp->m_dentry;
				761	BUG_ON(!hlist_empty(&mp->m_list));
				762	spin_lock(&dentry->d_lock);
				763	dentry->d_flags &= ~DCACHE_MOUNTED;
				764	spin_unlock(&dentry->d_lock);
				765	hlist_del(&mp->m_hash);
				766	kfree(mp);
				767	}
				768	}
				769
				770	static inline int check_mnt(struct mount *mnt)
				771	{
				772	return mnt->mnt_ns == current->nsproxy->mnt_ns;
				773	}
				774
				775	/*
				776	* vfsmount lock must be held for write
				777	*/
				778	static void touch_mnt_namespace(struct mnt_namespace *ns)
				779	{
				780	if (ns) {
				781	ns->event = ++event;
				782	wake_up_interruptible(&ns->poll);
				783	}
				784	}
				785
				786	/*
				787	* vfsmount lock must be held for write
				788	*/
				789	static void __touch_mnt_namespace(struct mnt_namespace *ns)
				790	{
				791	if (ns && ns->event != event) {
				792	ns->event = event;
				793	wake_up_interruptible(&ns->poll);
				794	}
				795	}
				796
				797	/*
				798	* vfsmount lock must be held for write
				799	*/
				800	static void unhash_mnt(struct mount *mnt)
				801	{
				802	mnt->mnt_parent = mnt;
				803	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
				804	list_del_init(&mnt->mnt_child);
				805	hlist_del_init_rcu(&mnt->mnt_hash);
				806	hlist_del_init(&mnt->mnt_mp_list);
				807	put_mountpoint(mnt->mnt_mp);
				808	mnt->mnt_mp = NULL;
				809	}
				810
				811	/*
				812	* vfsmount lock must be held for write
				813	*/
				814	static void detach_mnt(struct mount mnt, struct path old_path)
				815	{
				816	old_path->dentry = mnt->mnt_mountpoint;
				817	old_path->mnt = &mnt->mnt_parent->mnt;
				818	unhash_mnt(mnt);
				819	}
				820
				821	/*
				822	* vfsmount lock must be held for write
				823	*/
				824	static void umount_mnt(struct mount *mnt)
				825	{
				826	/* old mountpoint will be dropped when we can do that */
				827	mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
				828	unhash_mnt(mnt);
				829	}
				830
				831	/*
				832	* vfsmount lock must be held for write
				833	*/
				834	void mnt_set_mountpoint(struct mount *mnt,
				835	struct mountpoint *mp,
				836	struct mount *child_mnt)
				837	{
				838	mp->m_count++;
				839	mnt_add_count(mnt, 1); /* essentially, that's mntget */
				840	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
				841	child_mnt->mnt_parent = mnt;
				842	child_mnt->mnt_mp = mp;
				843	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
				844	}
				845
				846	static void __attach_mnt(struct mount mnt, struct mount parent)
				847	{
				848	hlist_add_head_rcu(&mnt->mnt_hash,
				849	m_hash(&parent->mnt, mnt->mnt_mountpoint));
				850	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
				851	}
				852
				853	/*
				854	* vfsmount lock must be held for write
				855	*/
				856	static void attach_mnt(struct mount *mnt,
				857	struct mount *parent,
				858	struct mountpoint *mp)
				859	{
				860	mnt_set_mountpoint(parent, mp, mnt);
				861	__attach_mnt(mnt, parent);
				862	}
				863
				864	void mnt_change_mountpoint(struct mount parent, struct mountpoint mp, struct mount *mnt)
				865	{
				866	struct mountpoint *old_mp = mnt->mnt_mp;
				867	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
				868	struct mount *old_parent = mnt->mnt_parent;
				869
				870	list_del_init(&mnt->mnt_child);
				871	hlist_del_init(&mnt->mnt_mp_list);
				872	hlist_del_init_rcu(&mnt->mnt_hash);
				873
				874	attach_mnt(mnt, parent, mp);
				875
				876	put_mountpoint(old_mp);
				877
				878	/*
				879	* Safely avoid even the suggestion this code might sleep or
				880	* lock the mount hash by taking advantage of the knowledge that
				881	* mnt_change_mountpoint will not release the final reference
				882	* to a mountpoint.
				883	*
				884	* During mounting, the mount passed in as the parent mount will
				885	* continue to use the old mountpoint and during unmounting, the
				886	* old mountpoint will continue to exist until namespace_unlock,
				887	* which happens well after mnt_change_mountpoint.
				888	*/
				889	spin_lock(&old_mountpoint->d_lock);
				890	old_mountpoint->d_lockref.count--;
				891	spin_unlock(&old_mountpoint->d_lock);
				892
				893	mnt_add_count(old_parent, -1);
				894	}
				895
				896	/*
				897	* vfsmount lock must be held for write
				898	*/
				899	static void commit_tree(struct mount *mnt)
				900	{
				901	struct mount *parent = mnt->mnt_parent;
				902	struct mount *m;
				903	LIST_HEAD(head);
				904	struct mnt_namespace *n = parent->mnt_ns;
				905
				906	BUG_ON(parent == mnt);
				907
				908	list_add_tail(&head, &mnt->mnt_list);
				909	list_for_each_entry(m, &head, mnt_list)
				910	m->mnt_ns = n;
				911
				912	list_splice(&head, n->list.prev);
				913
				914	n->mounts += n->pending_mounts;
				915	n->pending_mounts = 0;
				916
				917	__attach_mnt(mnt, parent);
				918	touch_mnt_namespace(n);
				919	}
				920
				921	static struct mount next_mnt(struct mount p, struct mount *root)
				922	{
				923	struct list_head *next = p->mnt_mounts.next;
				924	if (next == &p->mnt_mounts) {
				925	while (1) {
				926	if (p == root)
				927	return NULL;
				928	next = p->mnt_child.next;
				929	if (next != &p->mnt_parent->mnt_mounts)
				930	break;
				931	p = p->mnt_parent;
				932	}
				933	}
				934	return list_entry(next, struct mount, mnt_child);
				935	}
				936
				937	static struct mount skip_mnt_tree(struct mount p)
				938	{
				939	struct list_head *prev = p->mnt_mounts.prev;
				940	while (prev != &p->mnt_mounts) {
				941	p = list_entry(prev, struct mount, mnt_child);
				942	prev = p->mnt_mounts.prev;
				943	}
				944	return p;
				945	}
				946
				947	struct vfsmount *
				948	vfs_kern_mount(struct file_system_type type, int flags, const char name, void *data)
				949	{
				950	struct mount *mnt;
				951	struct dentry *root;
				952
				953	if (!type)
				954	return ERR_PTR(-ENODEV);
				955
				956	mnt = alloc_vfsmnt(name);
				957	if (!mnt)
				958	return ERR_PTR(-ENOMEM);
				959
				960	if (type->alloc_mnt_data) {
				961	mnt->mnt.data = type->alloc_mnt_data();
				962	if (!mnt->mnt.data) {
				963	mnt_free_id(mnt);
				964	free_vfsmnt(mnt);
				965	return ERR_PTR(-ENOMEM);
				966	}
				967	}
				968	if (flags & SB_KERNMOUNT)
				969	mnt->mnt.mnt_flags = MNT_INTERNAL;
				970
				971	root = mount_fs(type, flags, name, &mnt->mnt, data);
				972	if (IS_ERR(root)) {
				973	mnt_free_id(mnt);
				974	free_vfsmnt(mnt);
				975	return ERR_CAST(root);
				976	}
				977
				978	mnt->mnt.mnt_root = root;
				979	mnt->mnt.mnt_sb = root->d_sb;
				980	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
				981	mnt->mnt_parent = mnt;
				982	lock_mount_hash();
				983	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
				984	unlock_mount_hash();
				985	return &mnt->mnt;
				986	}
				987	EXPORT_SYMBOL_GPL(vfs_kern_mount);
				988
				989	struct vfsmount *
				990	vfs_submount(const struct dentry mountpoint, struct file_system_type type,
				991	const char name, void data)
				992	{
				993	/* Until it is worked out how to pass the user namespace
				994	* through from the parent mount to the submount don't support
				995	* unprivileged mounts with submounts.
				996	*/
				997	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
				998	return ERR_PTR(-EPERM);
				999
				1000	return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
				1001	}
				1002	EXPORT_SYMBOL_GPL(vfs_submount);
				1003
				1004	static struct mount clone_mnt(struct mount old, struct dentry *root,
				1005	int flag)
				1006	{
				1007	struct super_block *sb = old->mnt.mnt_sb;
				1008	struct mount *mnt;
				1009	int err;
				1010
				1011	mnt = alloc_vfsmnt(old->mnt_devname);
				1012	if (!mnt)
				1013	return ERR_PTR(-ENOMEM);
				1014
				1015	if (sb->s_op->clone_mnt_data) {
				1016	mnt->mnt.data = sb->s_op->clone_mnt_data(old->mnt.data);
				1017	if (!mnt->mnt.data) {
				1018	err = -ENOMEM;
				1019	goto out_free;
				1020	}
				1021	}
				1022
				1023	if (flag & (CL_SLAVE \| CL_PRIVATE \| CL_SHARED_TO_SLAVE))
				1024	mnt->mnt_group_id = 0; /* not a peer of original */
				1025	else
				1026	mnt->mnt_group_id = old->mnt_group_id;
				1027
				1028	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
				1029	err = mnt_alloc_group_id(mnt);
				1030	if (err)
				1031	goto out_free;
				1032	}
				1033
				1034	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
				1035	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD\|MNT_MARKED\|MNT_INTERNAL);
				1036	/* Don't allow unprivileged users to change mount flags */
				1037	if (flag & CL_UNPRIVILEGED) {
				1038	mnt->mnt.mnt_flags \|= MNT_LOCK_ATIME;
				1039
				1040	if (mnt->mnt.mnt_flags & MNT_READONLY)
				1041	mnt->mnt.mnt_flags \|= MNT_LOCK_READONLY;
				1042
				1043	if (mnt->mnt.mnt_flags & MNT_NODEV)
				1044	mnt->mnt.mnt_flags \|= MNT_LOCK_NODEV;
				1045
				1046	if (mnt->mnt.mnt_flags & MNT_NOSUID)
				1047	mnt->mnt.mnt_flags \|= MNT_LOCK_NOSUID;
				1048
				1049	if (mnt->mnt.mnt_flags & MNT_NOEXEC)
				1050	mnt->mnt.mnt_flags \|= MNT_LOCK_NOEXEC;
				1051	}
				1052
				1053	/* Don't allow unprivileged users to reveal what is under a mount */
				1054	if ((flag & CL_UNPRIVILEGED) &&
				1055	(!(flag & CL_EXPIRE) \|\| list_empty(&old->mnt_expire)))
				1056	mnt->mnt.mnt_flags \|= MNT_LOCKED;
				1057
				1058	atomic_inc(&sb->s_active);
				1059	mnt->mnt.mnt_sb = sb;
				1060	mnt->mnt.mnt_root = dget(root);
				1061	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
				1062	mnt->mnt_parent = mnt;
				1063	lock_mount_hash();
				1064	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
				1065	unlock_mount_hash();
				1066
				1067	if ((flag & CL_SLAVE) \|\|
				1068	((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
				1069	list_add(&mnt->mnt_slave, &old->mnt_slave_list);
				1070	mnt->mnt_master = old;
				1071	CLEAR_MNT_SHARED(mnt);
				1072	} else if (!(flag & CL_PRIVATE)) {
				1073	if ((flag & CL_MAKE_SHARED) \|\| IS_MNT_SHARED(old))
				1074	list_add(&mnt->mnt_share, &old->mnt_share);
				1075	if (IS_MNT_SLAVE(old))
				1076	list_add(&mnt->mnt_slave, &old->mnt_slave);
				1077	mnt->mnt_master = old->mnt_master;
				1078	} else {
				1079	CLEAR_MNT_SHARED(mnt);
				1080	}
				1081	if (flag & CL_MAKE_SHARED)
				1082	set_mnt_shared(mnt);
				1083
				1084	/* stick the duplicate mount on the same expiry list
				1085	* as the original if that was on one */
				1086	if (flag & CL_EXPIRE) {
				1087	if (!list_empty(&old->mnt_expire))
				1088	list_add(&mnt->mnt_expire, &old->mnt_expire);
				1089	}
				1090
				1091	return mnt;
				1092
				1093	out_free:
				1094	mnt_free_id(mnt);
				1095	free_vfsmnt(mnt);
				1096	return ERR_PTR(err);
				1097	}
				1098
				1099	static void cleanup_mnt(struct mount *mnt)
				1100	{
				1101	/*
				1102	* This probably indicates that somebody messed
				1103	* up a mnt_want/drop_write() pair. If this
				1104	* happens, the filesystem was probably unable
				1105	* to make r/w->r/o transitions.
				1106	*/
				1107	/*
				1108	* The locking used to deal with mnt_count decrement provides barriers,
				1109	* so mnt_get_writers() below is safe.
				1110	*/
				1111	WARN_ON(mnt_get_writers(mnt));
				1112	if (unlikely(mnt->mnt_pins.first))
				1113	mnt_pin_kill(mnt);
				1114	fsnotify_vfsmount_delete(&mnt->mnt);
				1115	dput(mnt->mnt.mnt_root);
				1116	deactivate_super(mnt->mnt.mnt_sb);
				1117	mnt_free_id(mnt);
				1118	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
				1119	}
				1120
				1121	static void __cleanup_mnt(struct rcu_head *head)
				1122	{
				1123	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
				1124	}
				1125
				1126	static LLIST_HEAD(delayed_mntput_list);
				1127	static void delayed_mntput(struct work_struct *unused)
				1128	{
				1129	struct llist_node *node = llist_del_all(&delayed_mntput_list);
				1130	struct mount m, t;
				1131
				1132	llist_for_each_entry_safe(m, t, node, mnt_llist)
				1133	cleanup_mnt(m);
				1134	}
				1135	static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
				1136
				1137	static void mntput_no_expire(struct mount *mnt)
				1138	{
				1139	rcu_read_lock();
				1140	if (likely(READ_ONCE(mnt->mnt_ns))) {
				1141	/*
				1142	* Since we don't do lock_mount_hash() here,
				1143	* ->mnt_ns can change under us. However, if it's
				1144	* non-NULL, then there's a reference that won't
				1145	* be dropped until after an RCU delay done after
				1146	* turning ->mnt_ns NULL. So if we observe it
				1147	* non-NULL under rcu_read_lock(), the reference
				1148	* we are dropping is not the final one.
				1149	*/
				1150	mnt_add_count(mnt, -1);
				1151	rcu_read_unlock();
				1152	return;
				1153	}
				1154	lock_mount_hash();
				1155	/*
				1156	* make sure that if __legitimize_mnt() has not seen us grab
				1157	* mount_lock, we'll see their refcount increment here.
				1158	*/
				1159	smp_mb();
				1160	mnt_add_count(mnt, -1);
				1161	if (mnt_get_count(mnt)) {
				1162	rcu_read_unlock();
				1163	unlock_mount_hash();
				1164	return;
				1165	}
				1166	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
				1167	rcu_read_unlock();
				1168	unlock_mount_hash();
				1169	return;
				1170	}
				1171	mnt->mnt.mnt_flags \|= MNT_DOOMED;
				1172	rcu_read_unlock();
				1173
				1174	list_del(&mnt->mnt_instance);
				1175
				1176	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
				1177	struct mount p, tmp;
				1178	list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
				1179	umount_mnt(p);
				1180	}
				1181	}
				1182	unlock_mount_hash();
				1183
				1184	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
				1185	struct task_struct *task = current;
				1186	if (likely(!(task->flags & PF_KTHREAD))) {
				1187	init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
				1188	if (!task_work_add(task, &mnt->mnt_rcu, true))
				1189	return;
				1190	}
				1191	if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
				1192	schedule_delayed_work(&delayed_mntput_work, 1);
				1193	return;
				1194	}
				1195	cleanup_mnt(mnt);
				1196	}
				1197
				1198	void mntput(struct vfsmount *mnt)
				1199	{
				1200	if (mnt) {
				1201	struct mount *m = real_mount(mnt);
				1202	/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
				1203	if (unlikely(m->mnt_expiry_mark))
				1204	m->mnt_expiry_mark = 0;
				1205	mntput_no_expire(m);
				1206	}
				1207	}
				1208	EXPORT_SYMBOL(mntput);
				1209
				/**
				 * mntget - take an extra reference to a vfsmount
				 * @mnt: mount to pin, may be NULL
				 *
				 * Returns @mnt unchanged so the call can be used inline; a NULL @mnt is
				 * passed through without touching any counter.
				 */
				struct vfsmount *mntget(struct vfsmount *mnt)
				{
					if (mnt)
						mnt_add_count(real_mount(mnt), 1);
					return mnt;
				}
				EXPORT_SYMBOL(mntget);
				1217
				1218	/* path_is_mountpoint() - Check if path is a mount in the current
				1219	* namespace.
				1220	*
				1221	* d_mountpoint() can only be used reliably to establish if a dentry is
				1222	* not mounted in any namespace and that common case is handled inline.
				1223	* d_mountpoint() isn't aware of the possibility there may be multiple
				1224	* mounts using a given dentry in a different namespace. This function
				1225	* checks if the passed in path is a mountpoint rather than the dentry
				1226	* alone.
				1227	*/
				1228	bool path_is_mountpoint(const struct path *path)
				1229	{
				1230	unsigned seq;
				1231	bool res;
				1232
				1233	if (!d_mountpoint(path->dentry))
				1234	return false;
				1235
				1236	rcu_read_lock();
				1237	do {
				1238	seq = read_seqbegin(&mount_lock);
				1239	res = __path_is_mountpoint(path);
				1240	} while (read_seqretry(&mount_lock, seq));
				1241	rcu_read_unlock();
				1242
				1243	return res;
				1244	}
				1245	EXPORT_SYMBOL(path_is_mountpoint);
				1246
				1247	struct vfsmount mnt_clone_internal(const struct path path)
				1248	{
				1249	struct mount *p;
				1250	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
				1251	if (IS_ERR(p))
				1252	return ERR_CAST(p);
				1253	p->mnt.mnt_flags \|= MNT_INTERNAL;
				1254	return &p->mnt;
				1255	}
				1256
				1257	#ifdef CONFIG_PROC_FS
				1258	/* iterator; we want it to have access to namespace_sem, thus here... */
				1259	static void m_start(struct seq_file m, loff_t *pos)
				1260	{
				1261	struct proc_mounts *p = m->private;
				1262
				1263	down_read(&namespace_sem);
				1264	if (p->cached_event == p->ns->event) {
				1265	void *v = p->cached_mount;
				1266	if (*pos == p->cached_index)
				1267	return v;
				1268	if (*pos == p->cached_index + 1) {
				1269	v = seq_list_next(v, &p->ns->list, &p->cached_index);
				1270	return p->cached_mount = v;
				1271	}
				1272	}
				1273
				1274	p->cached_event = p->ns->event;
				1275	p->cached_mount = seq_list_start(&p->ns->list, *pos);
				1276	p->cached_index = *pos;
				1277	return p->cached_mount;
				1278	}
				1279
				1280	static void m_next(struct seq_file m, void v, loff_t pos)
				1281	{
				1282	struct proc_mounts *p = m->private;
				1283
				1284	p->cached_mount = seq_list_next(v, &p->ns->list, pos);
				1285	p->cached_index = *pos;
				1286	return p->cached_mount;
				1287	}
				1288
				1289	static void m_stop(struct seq_file m, void v)
				1290	{
				1291	up_read(&namespace_sem);
				1292	}
				1293
				1294	static int m_show(struct seq_file m, void v)
				1295	{
				1296	struct proc_mounts *p = m->private;
				1297	struct mount *r = list_entry(v, struct mount, mnt_list);
				1298	return p->show(m, &r->mnt);
				1299	}
				1300
				1301	const struct seq_operations mounts_op = {
				1302	.start = m_start,
				1303	.next = m_next,
				1304	.stop = m_stop,
				1305	.show = m_show,
				1306	};
				1307	#endif /* CONFIG_PROC_FS */
				1308
				/**
				 * may_umount_tree - check if a mount tree is busy
				 * @m: root of mount tree
				 *
				 * This is called to check if a tree of mounts has any
				 * open files, pwds, chroots or sub mounts that are
				 * busy.
				 *
				 * Returns 0 when the summed reference count of the tree exceeds two
				 * references per mount, 1 otherwise.
				 */
				int may_umount_tree(struct vfsmount *m)
				{
					struct mount *mnt = real_mount(m);
					struct mount *cur;
					int held = 0;
					int baseline = 0;

					BUG_ON(!m);

					/* write lock needed for mnt_get_count */
					lock_mount_hash();
					cur = mnt;
					while (cur) {
						held += mnt_get_count(cur);
						baseline += 2;
						cur = next_mnt(cur, mnt);
					}
					unlock_mount_hash();

					return held > baseline ? 0 : 1;
				}

				EXPORT_SYMBOL(may_umount_tree);
				1340
				1341	/**
				1342	* may_umount - check if a mount point is busy
				1343	* @mnt: root of mount
				1344	*
				1345	* This is called to check if a mount point has any
				1346	* open files, pwds, chroots or sub mounts. If the
				1347	* mount has sub mounts this will return busy
				1348	* regardless of whether the sub mounts are busy.
				1349	*
				1350	* Doesn't take quota and stuff into account. IOW, in some cases it will
				1351	* give false negatives. The main reason why it's here is that we need
				1352	* a non-destructive way to look for easily umountable filesystems.
				1353	*/
				1354	int may_umount(struct vfsmount *mnt)
				1355	{
				1356	int ret = 1;
				1357	down_read(&namespace_sem);
				1358	lock_mount_hash();
				1359	if (propagate_mount_busy(real_mount(mnt), 2))
				1360	ret = 0;
				1361	unlock_mount_hash();
				1362	up_read(&namespace_sem);
				1363	return ret;
				1364	}
				1365
				1366	EXPORT_SYMBOL(may_umount);
				1367
				1368	static HLIST_HEAD(unmounted); /* protected by namespace_sem */
				1369
				1370	static void namespace_unlock(void)
				1371	{
				1372	struct hlist_head head;
				1373
				1374	hlist_move_list(&unmounted, &head);
				1375
				1376	up_write(&namespace_sem);
				1377
				1378	if (likely(hlist_empty(&head)))
				1379	return;
				1380
				1381	synchronize_rcu();
				1382
				1383	group_pin_kill(&head);
				1384	}
				1385
				1386	static inline void namespace_lock(void)
				1387	{
				1388	down_write(&namespace_sem);
				1389	}
				1390
				/* Behaviour switches understood by umount_tree() and disconnect_mount(). */
				enum umount_tree_flags {
					/* tag every mount MNT_SYNC_UMOUNT and always disconnect it */
					UMOUNT_SYNC		= 1 << 0,
					/* run propagate_mount_unlock() and propagate_umount() as well */
					UMOUNT_PROPAGATE	= 1 << 1,
					/* keep mounts attached to an already unmounted parent */
					UMOUNT_CONNECTED	= 1 << 2,
				};
				1396
				1397	static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
				1398	{
				1399	/* Leaving mounts connected is only valid for lazy umounts */
				1400	if (how & UMOUNT_SYNC)
				1401	return true;
				1402
				1403	/* A mount without a parent has nothing to be connected to */
				1404	if (!mnt_has_parent(mnt))
				1405	return true;
				1406
				1407	/* Because the reference counting rules change when mounts are
				1408	* unmounted and connected, umounted mounts may not be
				1409	* connected to mounted mounts.
				1410	*/
				1411	if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
				1412	return true;
				1413
				1414	/* Has it been requested that the mount remain connected? */
				1415	if (how & UMOUNT_CONNECTED)
				1416	return false;
				1417
				1418	/* Is the mount locked such that it needs to remain connected? */
				1419	if (IS_MNT_LOCKED(mnt))
				1420	return false;
				1421
				1422	/* By default disconnect the mount */
				1423	return true;
				1424	}
				1425
				1426	/*
				1427	* mount_lock must be held
				1428	* namespace_sem must be held for write
				1429	*/
				1430	static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
				1431	{
				1432	LIST_HEAD(tmp_list);
				1433	struct mount *p;
				1434
				1435	if (how & UMOUNT_PROPAGATE)
				1436	propagate_mount_unlock(mnt);
				1437
				1438	/* Gather the mounts to umount */
				1439	for (p = mnt; p; p = next_mnt(p, mnt)) {
				1440	p->mnt.mnt_flags \|= MNT_UMOUNT;
				1441	list_move(&p->mnt_list, &tmp_list);
				1442	}
				1443
				1444	/* Hide the mounts from mnt_mounts */
				1445	list_for_each_entry(p, &tmp_list, mnt_list) {
				1446	list_del_init(&p->mnt_child);
				1447	}
				1448
				1449	/* Add propogated mounts to the tmp_list */
				1450	if (how & UMOUNT_PROPAGATE)
				1451	propagate_umount(&tmp_list);
				1452
				1453	while (!list_empty(&tmp_list)) {
				1454	struct mnt_namespace *ns;
				1455	bool disconnect;
				1456	p = list_first_entry(&tmp_list, struct mount, mnt_list);
				1457	list_del_init(&p->mnt_expire);
				1458	list_del_init(&p->mnt_list);
				1459	ns = p->mnt_ns;
				1460	if (ns) {
				1461	ns->mounts--;
				1462	__touch_mnt_namespace(ns);
				1463	}
				1464	p->mnt_ns = NULL;
				1465	if (how & UMOUNT_SYNC)
				1466	p->mnt.mnt_flags \|= MNT_SYNC_UMOUNT;
				1467
				1468	disconnect = disconnect_mount(p, how);
				1469
				1470	pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
				1471	disconnect ? &unmounted : NULL);
				1472	if (mnt_has_parent(p)) {
				1473	mnt_add_count(p->mnt_parent, -1);
				1474	if (!disconnect) {
				1475	/* Don't forget about p */
				1476	list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
				1477	} else {
				1478	umount_mnt(p);
				1479	}
				1480	}
				1481	change_mnt_propagation(p, MS_PRIVATE);
				1482	}
				1483	}
				1484
				1485	static void shrink_submounts(struct mount *mnt);
				1486
				1487	static int do_umount(struct mount *mnt, int flags)
				1488	{
				1489	struct super_block *sb = mnt->mnt.mnt_sb;
				1490	int retval;
				1491
				1492	retval = security_sb_umount(&mnt->mnt, flags);
				1493	if (retval)
				1494	return retval;
				1495
				1496	/*
				1497	* Allow userspace to request a mountpoint be expired rather than
				1498	* unmounting unconditionally. Unmount only happens if:
				1499	* (1) the mark is already set (the mark is cleared by mntput())
				1500	* (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
				1501	*/
				1502	if (flags & MNT_EXPIRE) {
				1503	if (&mnt->mnt == current->fs->root.mnt \|\|
				1504	flags & (MNT_FORCE \| MNT_DETACH))
				1505	return -EINVAL;
				1506
				1507	/*
				1508	* probably don't strictly need the lock here if we examined
				1509	* all race cases, but it's a slowpath.
				1510	*/
				1511	lock_mount_hash();
				1512	if (mnt_get_count(mnt) != 2) {
				1513	unlock_mount_hash();
				1514	return -EBUSY;
				1515	}
				1516	unlock_mount_hash();
				1517
				1518	if (!xchg(&mnt->mnt_expiry_mark, 1))
				1519	return -EAGAIN;
				1520	}
				1521
				1522	/*
				1523	* If we may have to abort operations to get out of this
				1524	* mount, and they will themselves hold resources we must
				1525	* allow the fs to do things. In the Unix tradition of
				1526	* 'Gee thats tricky lets do it in userspace' the umount_begin
				1527	* might fail to complete on the first run through as other tasks
				1528	* must return, and the like. Thats for the mount program to worry
				1529	* about for the moment.
				1530	*/
				1531
				1532	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
				1533	sb->s_op->umount_begin(sb);
				1534	}
				1535
				1536	/*
				1537	* No sense to grab the lock for this test, but test itself looks
				1538	* somewhat bogus. Suggestions for better replacement?
				1539	* Ho-hum... In principle, we might treat that as umount + switch
				1540	* to rootfs. GC would eventually take care of the old vfsmount.
				1541	* Actually it makes sense, especially if rootfs would contain a
				1542	* /reboot - static binary that would close all descriptors and
				1543	* call reboot(9). Then init(8) could umount root and exec /reboot.
				1544	*/
				1545	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
				1546	/*
				1547	* Special case for "unmounting" root ...
				1548	* we just try to remount it readonly.
				1549	*/
				1550	if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
				1551	return -EPERM;
				1552	down_write(&sb->s_umount);
				1553	if (!sb_rdonly(sb))
				1554	retval = do_remount_sb(sb, SB_RDONLY, NULL, 0);
				1555	up_write(&sb->s_umount);
				1556	return retval;
				1557	}
				1558
				1559	namespace_lock();
				1560	lock_mount_hash();
				1561
				1562	/* Recheck MNT_LOCKED with the locks held */
				1563	retval = -EINVAL;
				1564	if (mnt->mnt.mnt_flags & MNT_LOCKED)
				1565	goto out;
				1566
				1567	event++;
				1568	if (flags & MNT_DETACH) {
				1569	if (!list_empty(&mnt->mnt_list))
				1570	umount_tree(mnt, UMOUNT_PROPAGATE);
				1571	retval = 0;
				1572	} else {
				1573	shrink_submounts(mnt);
				1574	retval = -EBUSY;
				1575	if (!propagate_mount_busy(mnt, 2)) {
				1576	if (!list_empty(&mnt->mnt_list))
				1577	umount_tree(mnt, UMOUNT_PROPAGATE\|UMOUNT_SYNC);
				1578	retval = 0;
				1579	}
				1580	}
				1581	out:
				1582	unlock_mount_hash();
				1583	namespace_unlock();
				1584	return retval;
				1585	}
				1586
				1587	/*
				1588	* __detach_mounts - lazily unmount all mounts on the specified dentry
				1589	*
				1590	* During unlink, rmdir, and d_drop it is possible to loose the path
				1591	* to an existing mountpoint, and wind up leaking the mount.
				1592	* detach_mounts allows lazily unmounting those mounts instead of
				1593	* leaking them.
				1594	*
				1595	* The caller may hold dentry->d_inode->i_mutex.
				1596	*/
				1597	void __detach_mounts(struct dentry *dentry)
				1598	{
				1599	struct mountpoint *mp;
				1600	struct mount *mnt;
				1601
				1602	namespace_lock();
				1603	lock_mount_hash();
				1604	mp = lookup_mountpoint(dentry);
				1605	if (IS_ERR_OR_NULL(mp))
				1606	goto out_unlock;
				1607
				1608	event++;
				1609	while (!hlist_empty(&mp->m_list)) {
				1610	mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
				1611	if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
				1612	hlist_add_head(&mnt->mnt_umount.s_list, &unmounted);
				1613	umount_mnt(mnt);
				1614	}
				1615	else umount_tree(mnt, UMOUNT_CONNECTED);
				1616	}
				1617	put_mountpoint(mp);
				1618	out_unlock:
				1619	unlock_mount_hash();
				1620	namespace_unlock();
				1621	}
				1622
				1623	/*
				1624	* Is the caller allowed to modify his namespace?
				1625	*/
				1626	static inline bool may_mount(void)
				1627	{
				1628	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
				1629	}
				1630
				1631	static inline bool may_mandlock(void)
				1632	{
				1633	#ifndef CONFIG_MANDATORY_FILE_LOCKING
				1634	return false;
				1635	#endif
				1636	return capable(CAP_SYS_ADMIN);
				1637	}
				1638
				1639	/*
				1640	* Now umount can handle mount points as well as block devices.
				1641	* This is important for filesystems which use unnamed block devices.
				1642	*
				1643	* We now support a flag for forced unmount like the other 'big iron'
				1644	* unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
				1645	*/
				1646
				1647	int ksys_umount(char __user *name, int flags)
				1648	{
				1649	struct path path;
				1650	struct mount *mnt;
				1651	int retval;
				1652	int lookup_flags = 0;
				1653
				1654	if (flags & ~(MNT_FORCE \| MNT_DETACH \| MNT_EXPIRE \| UMOUNT_NOFOLLOW))
				1655	return -EINVAL;
				1656
				1657	if (!may_mount())
				1658	return -EPERM;
				1659
				1660	if (!(flags & UMOUNT_NOFOLLOW))
				1661	lookup_flags \|= LOOKUP_FOLLOW;
				1662
				1663	retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
				1664	if (retval)
				1665	goto out;
				1666	mnt = real_mount(path.mnt);
				1667	retval = -EINVAL;
				1668	if (path.dentry != path.mnt->mnt_root)
				1669	goto dput_and_out;
				1670	if (!check_mnt(mnt))
				1671	goto dput_and_out;
				1672	if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
				1673	goto dput_and_out;
				1674	retval = -EPERM;
				1675	if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
				1676	goto dput_and_out;
				1677
				1678	retval = do_umount(mnt, flags);
				1679	dput_and_out:
				1680	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
				1681	dput(path.dentry);
				1682	mntput_no_expire(mnt);
				1683	out:
				1684	return retval;
				1685	}
				1686
				1687	SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
				1688	{
				1689	return ksys_umount(name, flags);
				1690	}
				1691
				#ifdef __ARCH_WANT_SYS_OLDUMOUNT

				/*
				 *	The 2.0 compatible umount. No flags.
				 *	Equivalent to ksys_umount() called with flags == 0.
				 */
				SYSCALL_DEFINE1(oldumount, char __user *, name)
				{
					return ksys_umount(name, 0);
				}

				#endif
				1703
				1704	static bool is_mnt_ns_file(struct dentry *dentry)
				1705	{
				1706	/* Is this a proxy for a mount namespace? */
				1707	return dentry->d_op == &ns_dentry_operations &&
				1708	dentry->d_fsdata == &mntns_operations;
				1709	}
				1710
				1711	struct mnt_namespace to_mnt_ns(struct ns_common ns)
				1712	{
				1713	return container_of(ns, struct mnt_namespace, ns);
				1714	}
				1715
				1716	static bool mnt_ns_loop(struct dentry *dentry)
				1717	{
				1718	/* Could bind mounting the mount namespace inode cause a
				1719	* mount namespace loop?
				1720	*/
				1721	struct mnt_namespace *mnt_ns;
				1722	if (!is_mnt_ns_file(dentry))
				1723	return false;
				1724
				1725	mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
				1726	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
				1727	}
				1728
				1729	struct mount copy_tree(struct mount mnt, struct dentry *dentry,
				1730	int flag)
				1731	{
				1732	struct mount res, p, q, r, *parent;
				1733
				1734	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
				1735	return ERR_PTR(-EINVAL);
				1736
				1737	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
				1738	return ERR_PTR(-EINVAL);
				1739
				1740	res = q = clone_mnt(mnt, dentry, flag);
				1741	if (IS_ERR(q))
				1742	return q;
				1743
				1744	q->mnt_mountpoint = mnt->mnt_mountpoint;
				1745
				1746	p = mnt;
				1747	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
				1748	struct mount *s;
				1749	if (!is_subdir(r->mnt_mountpoint, dentry))
				1750	continue;
				1751
				1752	for (s = r; s; s = next_mnt(s, r)) {
				1753	if (!(flag & CL_COPY_UNBINDABLE) &&
				1754	IS_MNT_UNBINDABLE(s)) {
				1755	if (s->mnt.mnt_flags & MNT_LOCKED) {
				1756	/* Both unbindable and locked. */
				1757	q = ERR_PTR(-EPERM);
				1758	goto out;
				1759	} else {
				1760	s = skip_mnt_tree(s);
				1761	continue;
				1762	}
				1763	}
				1764	if (!(flag & CL_COPY_MNT_NS_FILE) &&
				1765	is_mnt_ns_file(s->mnt.mnt_root)) {
				1766	s = skip_mnt_tree(s);
				1767	continue;
				1768	}
				1769	while (p != s->mnt_parent) {
				1770	p = p->mnt_parent;
				1771	q = q->mnt_parent;
				1772	}
				1773	p = s;
				1774	parent = q;
				1775	q = clone_mnt(p, p->mnt.mnt_root, flag);
				1776	if (IS_ERR(q))
				1777	goto out;
				1778	lock_mount_hash();
				1779	list_add_tail(&q->mnt_list, &res->mnt_list);
				1780	attach_mnt(q, parent, p->mnt_mp);
				1781	unlock_mount_hash();
				1782	}
				1783	}
				1784	return res;
				1785	out:
				1786	if (res) {
				1787	lock_mount_hash();
				1788	umount_tree(res, UMOUNT_SYNC);
				1789	unlock_mount_hash();
				1790	}
				1791	return q;
				1792	}
				1793
				1794	/* Caller should check returned pointer for errors */
				1795
				1796	struct vfsmount collect_mounts(const struct path path)
				1797	{
				1798	struct mount *tree;
				1799	namespace_lock();
				1800	if (!check_mnt(real_mount(path->mnt)))
				1801	tree = ERR_PTR(-EINVAL);
				1802	else
				1803	tree = copy_tree(real_mount(path->mnt), path->dentry,
				1804	CL_COPY_ALL \| CL_PRIVATE);
				1805	namespace_unlock();
				1806	if (IS_ERR(tree))
				1807	return ERR_CAST(tree);
				1808	return &tree->mnt;
				1809	}
				1810
				/*
				 * drop_collected_mounts - tear down a mount tree rooted at @mnt
				 * @mnt: root of the tree
				 *
				 * Runs umount_tree() with no flags under namespace_sem and mount_lock.
				 */
				void drop_collected_mounts(struct vfsmount *mnt)
				{
					struct mount *tree = real_mount(mnt);

					namespace_lock();
					lock_mount_hash();

					umount_tree(tree, 0);

					unlock_mount_hash();
					namespace_unlock();
				}
				1819
				1820	/**
				1821	* clone_private_mount - create a private clone of a path
				1822	*
				1823	* This creates a new vfsmount, which will be the clone of @path. The new will
				1824	* not be attached anywhere in the namespace and will be private (i.e. changes
				1825	* to the originating mount won't be propagated into this).
				1826	*
				1827	* Release with mntput().
				1828	*/
				1829	struct vfsmount clone_private_mount(const struct path path)
				1830	{
				1831	struct mount *old_mnt = real_mount(path->mnt);
				1832	struct mount *new_mnt;
				1833
				1834	if (IS_MNT_UNBINDABLE(old_mnt))
				1835	return ERR_PTR(-EINVAL);
				1836
				1837	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
				1838	if (IS_ERR(new_mnt))
				1839	return ERR_CAST(new_mnt);
				1840
				1841	return &new_mnt->mnt;
				1842	}
				1843	EXPORT_SYMBOL_GPL(clone_private_mount);
				1844
				1845	int iterate_mounts(int (f)(struct vfsmount , void ), void arg,
				1846	struct vfsmount *root)
				1847	{
				1848	struct mount *mnt;
				1849	int res = f(root, arg);
				1850	if (res)
				1851	return res;
				1852	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
				1853	res = f(&mnt->mnt, arg);
				1854	if (res)
				1855	return res;
				1856	}
				1857	return 0;
				1858	}
				1859
				1860	static void cleanup_group_ids(struct mount mnt, struct mount end)
				1861	{
				1862	struct mount *p;
				1863
				1864	for (p = mnt; p != end; p = next_mnt(p, mnt)) {
				1865	if (p->mnt_group_id && !IS_MNT_SHARED(p))
				1866	mnt_release_group_id(p);
				1867	}
				1868	}
				1869
				1870	static int invent_group_ids(struct mount *mnt, bool recurse)
				1871	{
				1872	struct mount *p;
				1873
				1874	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
				1875	if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
				1876	int err = mnt_alloc_group_id(p);
				1877	if (err) {
				1878	cleanup_group_ids(mnt, p);
				1879	return err;
				1880	}
				1881	}
				1882	}
				1883
				1884	return 0;
				1885	}
				1886
				1887	int count_mounts(struct mnt_namespace ns, struct mount mnt)
				1888	{
				1889	unsigned int max = READ_ONCE(sysctl_mount_max);
				1890	unsigned int mounts = 0, old, pending, sum;
				1891	struct mount *p;
				1892
				1893	for (p = mnt; p; p = next_mnt(p, mnt))
				1894	mounts++;
				1895
				1896	old = ns->mounts;
				1897	pending = ns->pending_mounts;
				1898	sum = old + pending;
				1899	if ((old > sum) \|\|
				1900	(pending > sum) \|\|
				1901	(max < sum) \|\|
				1902	(mounts > (max - sum)))
				1903	return -ENOSPC;
				1904
				1905	ns->pending_mounts = pending + mounts;
				1906	return 0;
				1907	}
				1908
				1909	/*
				1910	* @source_mnt : mount tree to be attached
				1911	* @nd : place the mount tree @source_mnt is attached
				1912	* @parent_nd : if non-null, detach the source_mnt from its parent and
				1913	* store the parent mount and mountpoint dentry.
				1914	* (done when source_mnt is moved)
				1915	*
				1916	* NOTE: in the table below explains the semantics when a source mount
				1917	* of a given type is attached to a destination mount of a given type.
				1918	* ---------------------------------------------------------------------------
				1919	* \| BIND MOUNT OPERATION \|
				1920	* \|**************************************************************************
				1921	* \| source-->\| shared \| private \| slave \| unbindable \|
				1922	* \| dest \| \| \| \| \|
				1923	* \| \| \| \| \| \| \|
				1924	* \| v \| \| \| \| \|
				1925	* \|**************************************************************************
				1926	* \| shared \| shared (++) \| shared (+) \| shared(+++)\| invalid \|
				1927	* \| \| \| \| \| \|
				1928	* \|non-shared\| shared (+) \| private \| slave (*) \| invalid \|
				1929	* ***************************************************************************
				1930	* A bind operation clones the source mount and mounts the clone on the
				1931	* destination mount.
				1932	*
				1933	* (++) the cloned mount is propagated to all the mounts in the propagation
				1934	* tree of the destination mount and the cloned mount is added to
				1935	* the peer group of the source mount.
				1936	* (+) the cloned mount is created under the destination mount and is marked
				1937	* as shared. The cloned mount is added to the peer group of the source
				1938	* mount.
				1939	* (+++) the mount is propagated to all the mounts in the propagation tree
				1940	* of the destination mount and the cloned mount is made slave
				1941	* of the same master as that of the source mount. The cloned mount
				1942	* is marked as 'shared and slave'.
				1943	* (*) the cloned mount is made a slave of the same master as that of the
				1944	* source mount.
				1945	*
				1946	* ---------------------------------------------------------------------------
				1947	* \| MOVE MOUNT OPERATION \|
				1948	* \|**************************************************************************
				1949	* \| source-->\| shared \| private \| slave \| unbindable \|
				1950	* \| dest \| \| \| \| \|
				1951	* \| \| \| \| \| \| \|
				1952	* \| v \| \| \| \| \|
				1953	* \|**************************************************************************
				1954	* \| shared \| shared (+) \| shared (+) \| shared(+++) \| invalid \|
				1955	* \| \| \| \| \| \|
				1956	* \|non-shared\| shared (+*) \| private \| slave (*) \| unbindable \|
				1957	* ***************************************************************************
				1958	*
				1959	* (+) the mount is moved to the destination. And is then propagated to
				1960	* all the mounts in the propagation tree of the destination mount.
				1961	* (+*) the mount is moved to the destination.
				1962	* (+++) the mount is moved to the destination and is then propagated to
				1963	* all the mounts belonging to the destination mount's propagation tree.
				1964	* the mount is marked as 'shared and slave'.
				1965	* (*) the mount continues to be a slave at the new location.
				1966	*
				1967	* if the source mount is a tree, the operations explained above is
				1968	* applied to each mount in the tree.
				1969	* Must be called without spinlocks held, since this function can sleep
				1970	* in allocations.
				1971	*/
				1972	static int attach_recursive_mnt(struct mount *source_mnt,
				1973	struct mount *dest_mnt,
				1974	struct mountpoint *dest_mp,
				1975	struct path *parent_path)
				1976	{
				1977	HLIST_HEAD(tree_list);
				1978	struct mnt_namespace *ns = dest_mnt->mnt_ns;
				1979	struct mountpoint *smp;
				1980	struct mount child, p;
				1981	struct hlist_node *n;
				1982	int err;
				1983
				1984	/* Preallocate a mountpoint in case the new mounts need
				1985	* to be tucked under other mounts.
				1986	*/
				1987	smp = get_mountpoint(source_mnt->mnt.mnt_root);
				1988	if (IS_ERR(smp))
				1989	return PTR_ERR(smp);
				1990
				1991	/* Is there space to add these mounts to the mount namespace? */
				1992	if (!parent_path) {
				1993	err = count_mounts(ns, source_mnt);
				1994	if (err)
				1995	goto out;
				1996	}
				1997
				1998	if (IS_MNT_SHARED(dest_mnt)) {
				1999	err = invent_group_ids(source_mnt, true);
				2000	if (err)
				2001	goto out;
				2002	err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
				2003	lock_mount_hash();
				2004	if (err)
				2005	goto out_cleanup_ids;
				2006	for (p = source_mnt; p; p = next_mnt(p, source_mnt))
				2007	set_mnt_shared(p);
				2008	} else {
				2009	lock_mount_hash();
				2010	}
				2011	if (parent_path) {
				2012	detach_mnt(source_mnt, parent_path);
				2013	attach_mnt(source_mnt, dest_mnt, dest_mp);
				2014	touch_mnt_namespace(source_mnt->mnt_ns);
				2015	} else {
				2016	mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
				2017	commit_tree(source_mnt);
				2018	}
				2019
				2020	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
				2021	struct mount *q;
				2022	hlist_del_init(&child->mnt_hash);
				2023	q = __lookup_mnt(&child->mnt_parent->mnt,
				2024	child->mnt_mountpoint);
				2025	if (q)
				2026	mnt_change_mountpoint(child, smp, q);
				2027	commit_tree(child);
				2028	}
				2029	put_mountpoint(smp);
				2030	unlock_mount_hash();
				2031
				2032	return 0;
				2033
				2034	out_cleanup_ids:
				2035	while (!hlist_empty(&tree_list)) {
				2036	child = hlist_entry(tree_list.first, struct mount, mnt_hash);
				2037	child->mnt_parent->mnt_ns->pending_mounts = 0;
				2038	umount_tree(child, UMOUNT_SYNC);
				2039	}
				2040	unlock_mount_hash();
				2041	cleanup_group_ids(source_mnt, NULL);
				2042	out:
				2043	ns->pending_mounts = 0;
				2044
				2045	read_seqlock_excl(&mount_lock);
				2046	put_mountpoint(smp);
				2047	read_sequnlock_excl(&mount_lock);
				2048
				2049	return err;
				2050	}
				2051
				2052	static struct mountpoint lock_mount(struct path path)
				2053	{
				2054	struct vfsmount *mnt;
				2055	struct dentry *dentry = path->dentry;
				2056	retry:
				2057	inode_lock(dentry->d_inode);
				2058	if (unlikely(cant_mount(dentry))) {
				2059	inode_unlock(dentry->d_inode);
				2060	return ERR_PTR(-ENOENT);
				2061	}
				2062	namespace_lock();
				2063	mnt = lookup_mnt(path);
				2064	if (likely(!mnt)) {
				2065	struct mountpoint *mp = get_mountpoint(dentry);
				2066	if (IS_ERR(mp)) {
				2067	namespace_unlock();
				2068	inode_unlock(dentry->d_inode);
				2069	return mp;
				2070	}
				2071	return mp;
				2072	}
				2073	namespace_unlock();
				2074	inode_unlock(path->dentry->d_inode);
				2075	path_put(path);
				2076	path->mnt = mnt;
				2077	dentry = path->dentry = dget(mnt->mnt_root);
				2078	goto retry;
				2079	}
				2080
				2081	static void unlock_mount(struct mountpoint *where)
				2082	{
				2083	struct dentry *dentry = where->m_dentry;
				2084
				2085	read_seqlock_excl(&mount_lock);
				2086	put_mountpoint(where);
				2087	read_sequnlock_excl(&mount_lock);
				2088
				2089	namespace_unlock();
				2090	inode_unlock(dentry->d_inode);
				2091	}
				2092
				2093	static int graft_tree(struct mount mnt, struct mount p, struct mountpoint *mp)
				2094	{
				2095	if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
				2096	return -EINVAL;
				2097
				2098	if (d_is_dir(mp->m_dentry) !=
				2099	d_is_dir(mnt->mnt.mnt_root))
				2100	return -ENOTDIR;
				2101
				2102	return attach_recursive_mnt(mnt, p, mp, NULL);
				2103	}
				2104
				2105	/*
				2106	* Sanity check the flags to change_mnt_propagation.
				2107	*/
				2108
				2109	static int flags_to_propagation_type(int ms_flags)
				2110	{
				2111	int type = ms_flags & ~(MS_REC \| MS_SILENT);
				2112
				2113	/* Fail if any non-propagation flags are set */
				2114	if (type & ~(MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
				2115	return 0;
				2116	/* Only one propagation flag should be set */
				2117	if (!is_power_of_2(type))
				2118	return 0;
				2119	return type;
				2120	}
				2121
				2122	/*
				2123	* recursively change the type of the mountpoint.
				2124	*/
				2125	static int do_change_type(struct path *path, int ms_flags)
				2126	{
				2127	struct mount *m;
				2128	struct mount *mnt = real_mount(path->mnt);
				2129	int recurse = ms_flags & MS_REC;
				2130	int type;
				2131	int err = 0;
				2132
				2133	if (path->dentry != path->mnt->mnt_root)
				2134	return -EINVAL;
				2135
				2136	type = flags_to_propagation_type(ms_flags);
				2137	if (!type)
				2138	return -EINVAL;
				2139
				2140	namespace_lock();
				2141	if (type == MS_SHARED) {
				2142	err = invent_group_ids(mnt, recurse);
				2143	if (err)
				2144	goto out_unlock;
				2145	}
				2146
				2147	lock_mount_hash();
				2148	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
				2149	change_mnt_propagation(m, type);
				2150	unlock_mount_hash();
				2151
				2152	out_unlock:
				2153	namespace_unlock();
				2154	return err;
				2155	}
				2156
				2157	static bool has_locked_children(struct mount mnt, struct dentry dentry)
				2158	{
				2159	struct mount *child;
				2160	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
				2161	if (!is_subdir(child->mnt_mountpoint, dentry))
				2162	continue;
				2163
				2164	if (child->mnt.mnt_flags & MNT_LOCKED)
				2165	return true;
				2166	}
				2167	return false;
				2168	}
				2169
				2170	/*
				2171	* do loopback mount.
				2172	*/
				2173	static int do_loopback(struct path path, const char old_name,
				2174	int recurse)
				2175	{
				2176	struct path old_path;
				2177	struct mount mnt = NULL, old, *parent;
				2178	struct mountpoint *mp;
				2179	int err;
				2180	if (!old_name \|\| !*old_name)
				2181	return -EINVAL;
				2182	err = kern_path(old_name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &old_path);
				2183	if (err)
				2184	return err;
				2185
				2186	err = -EINVAL;
				2187	if (mnt_ns_loop(old_path.dentry))
				2188	goto out;
				2189
				2190	mp = lock_mount(path);
				2191	err = PTR_ERR(mp);
				2192	if (IS_ERR(mp))
				2193	goto out;
				2194
				2195	old = real_mount(old_path.mnt);
				2196	parent = real_mount(path->mnt);
				2197
				2198	err = -EINVAL;
				2199	if (IS_MNT_UNBINDABLE(old))
				2200	goto out2;
				2201
				2202	if (!check_mnt(parent))
				2203	goto out2;
				2204
				2205	if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
				2206	goto out2;
				2207
				2208	if (!recurse && has_locked_children(old, old_path.dentry))
				2209	goto out2;
				2210
				2211	if (recurse)
				2212	mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
				2213	else
				2214	mnt = clone_mnt(old, old_path.dentry, 0);
				2215
				2216	if (IS_ERR(mnt)) {
				2217	err = PTR_ERR(mnt);
				2218	goto out2;
				2219	}
				2220
				2221	mnt->mnt.mnt_flags &= ~MNT_LOCKED;
				2222
				2223	err = graft_tree(mnt, parent, mp);
				2224	if (err) {
				2225	lock_mount_hash();
				2226	umount_tree(mnt, UMOUNT_SYNC);
				2227	unlock_mount_hash();
				2228	}
				2229	out2:
				2230	unlock_mount(mp);
				2231	out:
				2232	path_put(&old_path);
				2233	return err;
				2234	}
				2235
				2236	static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
				2237	{
				2238	int error = 0;
				2239	int readonly_request = 0;
				2240
				2241	if (ms_flags & MS_RDONLY)
				2242	readonly_request = 1;
				2243	if (readonly_request == __mnt_is_readonly(mnt))
				2244	return 0;
				2245
				2246	if (readonly_request)
				2247	error = mnt_make_readonly(real_mount(mnt));
				2248	else
				2249	__mnt_unmake_readonly(real_mount(mnt));
				2250	return error;
				2251	}
				2252
				2253	/*
				2254	* change filesystem flags. dir should be a physical root of filesystem.
				2255	* If you've mounted a non-root directory somewhere and want to do remount
				2256	* on it - tough luck.
				2257	*/
				2258	static int do_remount(struct path *path, int ms_flags, int sb_flags,
				2259	int mnt_flags, void *data)
				2260	{
				2261	int err;
				2262	struct super_block *sb = path->mnt->mnt_sb;
				2263	struct mount *mnt = real_mount(path->mnt);
				2264
				2265	if (!check_mnt(mnt))
				2266	return -EINVAL;
				2267
				2268	if (path->dentry != path->mnt->mnt_root)
				2269	return -EINVAL;
				2270
				2271	/* Don't allow changing of locked mnt flags.
				2272	*
				2273	* No locks need to be held here while testing the various
				2274	* MNT_LOCK flags because those flags can never be cleared
				2275	* once they are set.
				2276	*/
				2277	if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
				2278	!(mnt_flags & MNT_READONLY)) {
				2279	return -EPERM;
				2280	}
				2281	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
				2282	!(mnt_flags & MNT_NODEV)) {
				2283	return -EPERM;
				2284	}
				2285	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
				2286	!(mnt_flags & MNT_NOSUID)) {
				2287	return -EPERM;
				2288	}
				2289	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
				2290	!(mnt_flags & MNT_NOEXEC)) {
				2291	return -EPERM;
				2292	}
				2293	if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
				2294	((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
				2295	return -EPERM;
				2296	}
				2297
				2298	err = security_sb_remount(sb, data);
				2299	if (err)
				2300	return err;
				2301
				2302	down_write(&sb->s_umount);
				2303	if (ms_flags & MS_BIND)
				2304	err = change_mount_flags(path->mnt, ms_flags);
				2305	else if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
				2306	err = -EPERM;
				2307	else {
				2308	err = do_remount_sb2(path->mnt, sb, sb_flags, data, 0);
				2309	namespace_lock();
				2310	lock_mount_hash();
				2311	propagate_remount(mnt);
				2312	unlock_mount_hash();
				2313	namespace_unlock();
				2314	}
				2315	if (!err) {
				2316	lock_mount_hash();
				2317	mnt_flags \|= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
				2318	mnt->mnt.mnt_flags = mnt_flags;
				2319	touch_mnt_namespace(mnt->mnt_ns);
				2320	unlock_mount_hash();
				2321	}
				2322	up_write(&sb->s_umount);
				2323	return err;
				2324	}
				2325
				2326	static inline int tree_contains_unbindable(struct mount *mnt)
				2327	{
				2328	struct mount *p;
				2329	for (p = mnt; p; p = next_mnt(p, mnt)) {
				2330	if (IS_MNT_UNBINDABLE(p))
				2331	return 1;
				2332	}
				2333	return 0;
				2334	}
				2335
				2336	static int do_move_mount(struct path path, const char old_name)
				2337	{
				2338	struct path old_path, parent_path;
				2339	struct mount *p;
				2340	struct mount *old;
				2341	struct mountpoint *mp;
				2342	int err;
				2343	if (!old_name \|\| !*old_name)
				2344	return -EINVAL;
				2345	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
				2346	if (err)
				2347	return err;
				2348
				2349	mp = lock_mount(path);
				2350	err = PTR_ERR(mp);
				2351	if (IS_ERR(mp))
				2352	goto out;
				2353
				2354	old = real_mount(old_path.mnt);
				2355	p = real_mount(path->mnt);
				2356
				2357	err = -EINVAL;
				2358	if (!check_mnt(p) \|\| !check_mnt(old))
				2359	goto out1;
				2360
				2361	if (old->mnt.mnt_flags & MNT_LOCKED)
				2362	goto out1;
				2363
				2364	err = -EINVAL;
				2365	if (old_path.dentry != old_path.mnt->mnt_root)
				2366	goto out1;
				2367
				2368	if (!mnt_has_parent(old))
				2369	goto out1;
				2370
				2371	if (d_is_dir(path->dentry) !=
				2372	d_is_dir(old_path.dentry))
				2373	goto out1;
				2374	/*
				2375	* Don't move a mount residing in a shared parent.
				2376	*/
				2377	if (IS_MNT_SHARED(old->mnt_parent))
				2378	goto out1;
				2379	/*
				2380	* Don't move a mount tree containing unbindable mounts to a destination
				2381	* mount which is shared.
				2382	*/
				2383	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
				2384	goto out1;
				2385	err = -ELOOP;
				2386	for (; mnt_has_parent(p); p = p->mnt_parent)
				2387	if (p == old)
				2388	goto out1;
				2389
				2390	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
				2391	if (err)
				2392	goto out1;
				2393
				2394	/* if the mount is moved, it should no longer be expire
				2395	* automatically */
				2396	list_del_init(&old->mnt_expire);
				2397	out1:
				2398	unlock_mount(mp);
				2399	out:
				2400	if (!err)
				2401	path_put(&parent_path);
				2402	path_put(&old_path);
				2403	return err;
				2404	}
				2405
				2406	static struct vfsmount fs_set_subtype(struct vfsmount mnt, const char *fstype)
				2407	{
				2408	int err;
				2409	const char *subtype = strchr(fstype, '.');
				2410	if (subtype) {
				2411	subtype++;
				2412	err = -EINVAL;
				2413	if (!subtype[0])
				2414	goto err;
				2415	} else
				2416	subtype = "";
				2417
				2418	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
				2419	err = -ENOMEM;
				2420	if (!mnt->mnt_sb->s_subtype)
				2421	goto err;
				2422	return mnt;
				2423
				2424	err:
				2425	mntput(mnt);
				2426	return ERR_PTR(err);
				2427	}
				2428
				2429	/*
				2430	* add a mount into a namespace's mount tree
				2431	*/
				2432	static int do_add_mount(struct mount newmnt, struct path path, int mnt_flags)
				2433	{
				2434	struct mountpoint *mp;
				2435	struct mount *parent;
				2436	int err;
				2437
				2438	mnt_flags &= ~MNT_INTERNAL_FLAGS;
				2439
				2440	mp = lock_mount(path);
				2441	if (IS_ERR(mp))
				2442	return PTR_ERR(mp);
				2443
				2444	parent = real_mount(path->mnt);
				2445	err = -EINVAL;
				2446	if (unlikely(!check_mnt(parent))) {
				2447	/* that's acceptable only for automounts done in private ns */
				2448	if (!(mnt_flags & MNT_SHRINKABLE))
				2449	goto unlock;
				2450	/* ... and for those we'd better have mountpoint still alive */
				2451	if (!parent->mnt_ns)
				2452	goto unlock;
				2453	}
				2454
				2455	/* Refuse the same filesystem on the same mount point */
				2456	err = -EBUSY;
				2457	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
				2458	path->mnt->mnt_root == path->dentry)
				2459	goto unlock;
				2460
				2461	err = -EINVAL;
				2462	if (d_is_symlink(newmnt->mnt.mnt_root))
				2463	goto unlock;
				2464
				2465	newmnt->mnt.mnt_flags = mnt_flags;
				2466	err = graft_tree(newmnt, parent, mp);
				2467
				2468	unlock:
				2469	unlock_mount(mp);
				2470	return err;
				2471	}
				2472
				2473	static bool mount_too_revealing(struct vfsmount mnt, int new_mnt_flags);
				2474
				2475	/*
				2476	* create a new mount for userspace and request it to be added into the
				2477	* namespace's tree
				2478	*/
				2479	static int do_new_mount(struct path path, const char fstype, int sb_flags,
				2480	int mnt_flags, const char name, void data)
				2481	{
				2482	struct file_system_type *type;
				2483	struct vfsmount *mnt;
				2484	int err;
				2485
				2486	if (!fstype)
				2487	return -EINVAL;
				2488
				2489	type = get_fs_type(fstype);
				2490	if (!type)
				2491	return -ENODEV;
				2492
				2493	mnt = vfs_kern_mount(type, sb_flags, name, data);
				2494	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
				2495	!mnt->mnt_sb->s_subtype)
				2496	mnt = fs_set_subtype(mnt, fstype);
				2497
				2498	put_filesystem(type);
				2499	if (IS_ERR(mnt))
				2500	return PTR_ERR(mnt);
				2501
				2502	if (mount_too_revealing(mnt, &mnt_flags)) {
				2503	mntput(mnt);
				2504	return -EPERM;
				2505	}
				2506
				2507	err = do_add_mount(real_mount(mnt), path, mnt_flags);
				2508	if (err)
				2509	mntput(mnt);
				2510	return err;
				2511	}
				2512
				2513	int finish_automount(struct vfsmount m, struct path path)
				2514	{
				2515	struct mount *mnt = real_mount(m);
				2516	int err;
				2517	/* The new mount record should have at least 2 refs to prevent it being
				2518	* expired before we get a chance to add it
				2519	*/
				2520	BUG_ON(mnt_get_count(mnt) < 2);
				2521
				2522	if (m->mnt_sb == path->mnt->mnt_sb &&
				2523	m->mnt_root == path->dentry) {
				2524	err = -ELOOP;
				2525	goto fail;
				2526	}
				2527
				2528	err = do_add_mount(mnt, path, path->mnt->mnt_flags \| MNT_SHRINKABLE);
				2529	if (!err)
				2530	return 0;
				2531	fail:
				2532	/* remove m from any expiration list it may be on */
				2533	if (!list_empty(&mnt->mnt_expire)) {
				2534	namespace_lock();
				2535	list_del_init(&mnt->mnt_expire);
				2536	namespace_unlock();
				2537	}
				2538	mntput(m);
				2539	mntput(m);
				2540	return err;
				2541	}
				2542
				2543	/**
				2544	* mnt_set_expiry - Put a mount on an expiration list
				2545	* @mnt: The mount to list.
				2546	* @expiry_list: The list to add the mount to.
				2547	*/
				2548	void mnt_set_expiry(struct vfsmount mnt, struct list_head expiry_list)
				2549	{
				2550	namespace_lock();
				2551
				2552	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
				2553
				2554	namespace_unlock();
				2555	}
				2556	EXPORT_SYMBOL(mnt_set_expiry);
				2557
				2558	/*
				2559	* process a list of expirable mountpoints with the intent of discarding any
				2560	* mountpoints that aren't in use and haven't been touched since last we came
				2561	* here
				2562	*/
				2563	void mark_mounts_for_expiry(struct list_head *mounts)
				2564	{
				2565	struct mount mnt, next;
				2566	LIST_HEAD(graveyard);
				2567
				2568	if (list_empty(mounts))
				2569	return;
				2570
				2571	namespace_lock();
				2572	lock_mount_hash();
				2573
				2574	/* extract from the expiration list every vfsmount that matches the
				2575	* following criteria:
				2576	* - only referenced by its parent vfsmount
				2577	* - still marked for expiry (marked on the last call here; marks are
				2578	* cleared by mntput())
				2579	*/
				2580	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
				2581	if (!xchg(&mnt->mnt_expiry_mark, 1) \|\|
				2582	propagate_mount_busy(mnt, 1))
				2583	continue;
				2584	list_move(&mnt->mnt_expire, &graveyard);
				2585	}
				2586	while (!list_empty(&graveyard)) {
				2587	mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
				2588	touch_mnt_namespace(mnt->mnt_ns);
				2589	umount_tree(mnt, UMOUNT_PROPAGATE\|UMOUNT_SYNC);
				2590	}
				2591	unlock_mount_hash();
				2592	namespace_unlock();
				2593	}
				2594
				2595	EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
				2596
				2597	/*
				2598	* Ripoff of 'select_parent()'
				2599	*
				2600	* search the list of submounts for a given mountpoint, and move any
				2601	* shrinkable submounts to the 'graveyard' list.
				2602	*/
				2603	static int select_submounts(struct mount parent, struct list_head graveyard)
				2604	{
				2605	struct mount *this_parent = parent;
				2606	struct list_head *next;
				2607	int found = 0;
				2608
				2609	repeat:
				2610	next = this_parent->mnt_mounts.next;
				2611	resume:
				2612	while (next != &this_parent->mnt_mounts) {
				2613	struct list_head *tmp = next;
				2614	struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
				2615
				2616	next = tmp->next;
				2617	if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
				2618	continue;
				2619	/*
				2620	* Descend a level if the d_mounts list is non-empty.
				2621	*/
				2622	if (!list_empty(&mnt->mnt_mounts)) {
				2623	this_parent = mnt;
				2624	goto repeat;
				2625	}
				2626
				2627	if (!propagate_mount_busy(mnt, 1)) {
				2628	list_move_tail(&mnt->mnt_expire, graveyard);
				2629	found++;
				2630	}
				2631	}
				2632	/*
				2633	* All done at this level ... ascend and resume the search
				2634	*/
				2635	if (this_parent != parent) {
				2636	next = this_parent->mnt_child.next;
				2637	this_parent = this_parent->mnt_parent;
				2638	goto resume;
				2639	}
				2640	return found;
				2641	}
				2642
				2643	/*
				2644	* process a list of expirable mountpoints with the intent of discarding any
				2645	* submounts of a specific parent mountpoint
				2646	*
				2647	* mount_lock must be held for write
				2648	*/
				2649	static void shrink_submounts(struct mount *mnt)
				2650	{
				2651	LIST_HEAD(graveyard);
				2652	struct mount *m;
				2653
				2654	/* extract submounts of 'mountpoint' from the expiration list */
				2655	while (select_submounts(mnt, &graveyard)) {
				2656	while (!list_empty(&graveyard)) {
				2657	m = list_first_entry(&graveyard, struct mount,
				2658	mnt_expire);
				2659	touch_mnt_namespace(m->mnt_ns);
				2660	umount_tree(m, UMOUNT_PROPAGATE\|UMOUNT_SYNC);
				2661	}
				2662	}
				2663	}
				2664
				2665	/*
				2666	* Some copy_from_user() implementations do not return the exact number of
				2667	* bytes remaining to copy on a fault. But copy_mount_options() requires that.
				2668	* Note that this function differs from copy_from_user() in that it will oops
				2669	* on bad values of `to', rather than returning a short copy.
				2670	*/
				2671	static long exact_copy_from_user(void to, const void __user from,
				2672	unsigned long n)
				2673	{
				2674	char *t = to;
				2675	const char __user *f = from;
				2676	char c;
				2677
				2678	if (!access_ok(VERIFY_READ, from, n))
				2679	return n;
				2680
				2681	while (n) {
				2682	if (__get_user(c, f)) {
				2683	memset(t, 0, n);
				2684	break;
				2685	}
				2686	*t++ = c;
				2687	f++;
				2688	n--;
				2689	}
				2690	return n;
				2691	}
				2692
				2693	void copy_mount_options(const void __user data)
				2694	{
				2695	int i;
				2696	unsigned long size;
				2697	char *copy;
				2698
				2699	if (!data)
				2700	return NULL;
				2701
				2702	copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
				2703	if (!copy)
				2704	return ERR_PTR(-ENOMEM);
				2705
				2706	/* We only care that some data at the address the user
				2707	* gave us is valid. Just in case, we'll zero
				2708	* the remainder of the page.
				2709	*/
				2710	/* copy_from_user cannot cross TASK_SIZE ! */
				2711	size = TASK_SIZE - (unsigned long)untagged_addr(data);
				2712	if (size > PAGE_SIZE)
				2713	size = PAGE_SIZE;
				2714
				2715	i = size - exact_copy_from_user(copy, data, size);
				2716	if (!i) {
				2717	kfree(copy);
				2718	return ERR_PTR(-EFAULT);
				2719	}
				2720	if (i != PAGE_SIZE)
				2721	memset(copy + i, 0, PAGE_SIZE - i);
				2722	return copy;
				2723	}
				2724
				2725	char copy_mount_string(const void __user data)
				2726	{
				2727	return data ? strndup_user(data, PAGE_SIZE) : NULL;
				2728	}
				2729
				2730	/*
				2731	* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
				2732	* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
				2733	*
				2734	* data is a (void *) that can point to any structure up to
				2735	* PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
				2736	* information (or be NULL).
				2737	*
				2738	* Pre-0.97 versions of mount() didn't have a flags word.
				2739	* When the flags word was introduced its top half was required
				2740	* to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
				2741	* Therefore, if this magic number is present, it carries no information
				2742	* and must be discarded.
				2743	*/
				2744	long do_mount(const char dev_name, const char __user dir_name,
				2745	const char type_page, unsigned long flags, void data_page)
				2746	{
				2747	struct path path;
				2748	unsigned int mnt_flags = 0, sb_flags;
				2749	int retval = 0;
				2750
				2751	/* Discard magic */
				2752	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
				2753	flags &= ~MS_MGC_MSK;
				2754
				2755	/* Basic sanity checks */
				2756	if (data_page)
				2757	((char *)data_page)[PAGE_SIZE - 1] = 0;
				2758
				2759	if (flags & MS_NOUSER)
				2760	return -EINVAL;
				2761
				2762	/* ... and get the mountpoint */
				2763	retval = user_path(dir_name, &path);
				2764	if (retval)
				2765	return retval;
				2766
				2767	retval = security_sb_mount(dev_name, &path,
				2768	type_page, flags, data_page);
				2769	if (!retval && !may_mount())
				2770	retval = -EPERM;
				2771	if (!retval && (flags & SB_MANDLOCK) && !may_mandlock())
				2772	retval = -EPERM;
				2773	if (retval)
				2774	goto dput_out;
				2775
				2776	/* Default to relatime unless overriden */
				2777	if (!(flags & MS_NOATIME))
				2778	mnt_flags \|= MNT_RELATIME;
				2779
				2780	/* Separate the per-mountpoint flags */
				2781	if (flags & MS_NOSUID)
				2782	mnt_flags \|= MNT_NOSUID;
				2783	if (flags & MS_NODEV)
				2784	mnt_flags \|= MNT_NODEV;
				2785	if (flags & MS_NOEXEC)
				2786	mnt_flags \|= MNT_NOEXEC;
				2787	if (flags & MS_NOATIME)
				2788	mnt_flags \|= MNT_NOATIME;
				2789	if (flags & MS_NODIRATIME)
				2790	mnt_flags \|= MNT_NODIRATIME;
				2791	if (flags & MS_STRICTATIME)
				2792	mnt_flags &= ~(MNT_RELATIME \| MNT_NOATIME);
				2793	if (flags & MS_RDONLY)
				2794	mnt_flags \|= MNT_READONLY;
				2795
				2796	/* The default atime for remount is preservation */
				2797	if ((flags & MS_REMOUNT) &&
				2798	((flags & (MS_NOATIME \| MS_NODIRATIME \| MS_RELATIME \|
				2799	MS_STRICTATIME)) == 0)) {
				2800	mnt_flags &= ~MNT_ATIME_MASK;
				2801	mnt_flags \|= path.mnt->mnt_flags & MNT_ATIME_MASK;
				2802	}
				2803
				2804	sb_flags = flags & (SB_RDONLY \|
				2805	SB_SYNCHRONOUS \|
				2806	SB_MANDLOCK \|
				2807	SB_DIRSYNC \|
				2808	SB_SILENT \|
				2809	SB_POSIXACL \|
				2810	SB_LAZYTIME \|
				2811	SB_I_VERSION);
				2812
				2813	if (flags & MS_REMOUNT)
				2814	retval = do_remount(&path, flags, sb_flags, mnt_flags,
				2815	data_page);
				2816	else if (flags & MS_BIND)
				2817	retval = do_loopback(&path, dev_name, flags & MS_REC);
				2818	else if (flags & (MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
				2819	retval = do_change_type(&path, flags);
				2820	else if (flags & MS_MOVE)
				2821	retval = do_move_mount(&path, dev_name);
				2822	else
				2823	retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
				2824	dev_name, data_page);
				2825	dput_out:
				2826	path_put(&path);
				2827	return retval;
				2828	}
				2829
				2830	static struct ucounts inc_mnt_namespaces(struct user_namespace ns)
				2831	{
				2832	return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
				2833	}
				2834
				2835	static void dec_mnt_namespaces(struct ucounts *ucounts)
				2836	{
				2837	dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
				2838	}
				2839
				2840	static void free_mnt_ns(struct mnt_namespace *ns)
				2841	{
				2842	ns_free_inum(&ns->ns);
				2843	dec_mnt_namespaces(ns->ucounts);
				2844	put_user_ns(ns->user_ns);
				2845	kfree(ns);
				2846	}
				2847
				2848	/*
				2849	* Assign a sequence number so we can detect when we attempt to bind
				2850	* mount a reference to an older mount namespace into the current
				2851	* mount namespace, preventing reference counting loops. A 64bit
				2852	* number incrementing at 10Ghz will take 12,427 years to wrap which
				2853	* is effectively never, so we can ignore the possibility.
				2854	*/
				2855	static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
				2856
				2857	static struct mnt_namespace alloc_mnt_ns(struct user_namespace user_ns)
				2858	{
				2859	struct mnt_namespace *new_ns;
				2860	struct ucounts *ucounts;
				2861	int ret;
				2862
				2863	ucounts = inc_mnt_namespaces(user_ns);
				2864	if (!ucounts)
				2865	return ERR_PTR(-ENOSPC);
				2866
				2867	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
				2868	if (!new_ns) {
				2869	dec_mnt_namespaces(ucounts);
				2870	return ERR_PTR(-ENOMEM);
				2871	}
				2872	ret = ns_alloc_inum(&new_ns->ns);
				2873	if (ret) {
				2874	kfree(new_ns);
				2875	dec_mnt_namespaces(ucounts);
				2876	return ERR_PTR(ret);
				2877	}
				2878	new_ns->ns.ops = &mntns_operations;
				2879	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
				2880	atomic_set(&new_ns->count, 1);
				2881	new_ns->root = NULL;
				2882	INIT_LIST_HEAD(&new_ns->list);
				2883	init_waitqueue_head(&new_ns->poll);
				2884	new_ns->event = 0;
				2885	new_ns->user_ns = get_user_ns(user_ns);
				2886	new_ns->ucounts = ucounts;
				2887	new_ns->mounts = 0;
				2888	new_ns->pending_mounts = 0;
				2889	return new_ns;
				2890	}
				2891
				2892	__latent_entropy
				2893	struct mnt_namespace copy_mnt_ns(unsigned long flags, struct mnt_namespace ns,
				2894	struct user_namespace user_ns, struct fs_struct new_fs)
				2895	{
				2896	struct mnt_namespace *new_ns;
				2897	struct vfsmount rootmnt = NULL, pwdmnt = NULL;
				2898	struct mount p, q;
				2899	struct mount *old;
				2900	struct mount *new;
				2901	int copy_flags;
				2902
				2903	BUG_ON(!ns);
				2904
				2905	if (likely(!(flags & CLONE_NEWNS))) {
				2906	get_mnt_ns(ns);
				2907	return ns;
				2908	}
				2909
				2910	old = ns->root;
				2911
				2912	new_ns = alloc_mnt_ns(user_ns);
				2913	if (IS_ERR(new_ns))
				2914	return new_ns;
				2915
				2916	namespace_lock();
				2917	/* First pass: copy the tree topology */
				2918	copy_flags = CL_COPY_UNBINDABLE \| CL_EXPIRE;
				2919	if (user_ns != ns->user_ns)
				2920	copy_flags \|= CL_SHARED_TO_SLAVE \| CL_UNPRIVILEGED;
				2921	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
				2922	if (IS_ERR(new)) {
				2923	namespace_unlock();
				2924	free_mnt_ns(new_ns);
				2925	return ERR_CAST(new);
				2926	}
				2927	new_ns->root = new;
				2928	list_add_tail(&new_ns->list, &new->mnt_list);
				2929
				2930	/*
				2931	* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
				2932	* as belonging to new namespace. We have already acquired a private
				2933	* fs_struct, so tsk->fs->lock is not needed.
				2934	*/
				2935	p = old;
				2936	q = new;
				2937	while (p) {
				2938	q->mnt_ns = new_ns;
				2939	new_ns->mounts++;
				2940	if (new_fs) {
				2941	if (&p->mnt == new_fs->root.mnt) {
				2942	new_fs->root.mnt = mntget(&q->mnt);
				2943	rootmnt = &p->mnt;
				2944	}
				2945	if (&p->mnt == new_fs->pwd.mnt) {
				2946	new_fs->pwd.mnt = mntget(&q->mnt);
				2947	pwdmnt = &p->mnt;
				2948	}
				2949	}
				2950	p = next_mnt(p, old);
				2951	q = next_mnt(q, new);
				2952	if (!q)
				2953	break;
				2954	while (p->mnt.mnt_root != q->mnt.mnt_root)
				2955	p = next_mnt(p, old);
				2956	}
				2957	namespace_unlock();
				2958
				2959	if (rootmnt)
				2960	mntput(rootmnt);
				2961	if (pwdmnt)
				2962	mntput(pwdmnt);
				2963
				2964	return new_ns;
				2965	}
				2966
				2967	/**
				2968	* create_mnt_ns - creates a private namespace and adds a root filesystem
				2969	* @mnt: pointer to the new root filesystem mountpoint
				2970	*/
				2971	static struct mnt_namespace create_mnt_ns(struct vfsmount m)
				2972	{
				2973	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
				2974	if (!IS_ERR(new_ns)) {
				2975	struct mount *mnt = real_mount(m);
				2976	mnt->mnt_ns = new_ns;
				2977	new_ns->root = mnt;
				2978	new_ns->mounts++;
				2979	list_add(&mnt->mnt_list, &new_ns->list);
				2980	} else {
				2981	mntput(m);
				2982	}
				2983	return new_ns;
				2984	}
				2985
				2986	struct dentry mount_subtree(struct vfsmount mnt, const char *name)
				2987	{
				2988	struct mnt_namespace *ns;
				2989	struct super_block *s;
				2990	struct path path;
				2991	int err;
				2992
				2993	ns = create_mnt_ns(mnt);
				2994	if (IS_ERR(ns))
				2995	return ERR_CAST(ns);
				2996
				2997	err = vfs_path_lookup(mnt->mnt_root, mnt,
				2998	name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &path);
				2999
				3000	put_mnt_ns(ns);
				3001
				3002	if (err)
				3003	return ERR_PTR(err);
				3004
				3005	/* trade a vfsmount reference for active sb one */
				3006	s = path.mnt->mnt_sb;
				3007	atomic_inc(&s->s_active);
				3008	mntput(path.mnt);
				3009	/* lock the sucker */
				3010	down_write(&s->s_umount);
				3011	/* ... and return the root of (sub)tree on it */
				3012	return path.dentry;
				3013	}
				3014	EXPORT_SYMBOL(mount_subtree);
				3015
				3016	int ksys_mount(char __user dev_name, char __user dir_name, char __user *type,
				3017	unsigned long flags, void __user *data)
				3018	{
				3019	int ret;
				3020	char *kernel_type;
				3021	char *kernel_dev;
				3022	void *options;
				3023
				3024	kernel_type = copy_mount_string(type);
				3025	ret = PTR_ERR(kernel_type);
				3026	if (IS_ERR(kernel_type))
				3027	goto out_type;
				3028
				3029	kernel_dev = copy_mount_string(dev_name);
				3030	ret = PTR_ERR(kernel_dev);
				3031	if (IS_ERR(kernel_dev))
				3032	goto out_dev;
				3033
				3034	options = copy_mount_options(data);
				3035	ret = PTR_ERR(options);
				3036	if (IS_ERR(options))
				3037	goto out_data;
				3038
				3039	ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
				3040
				3041	kfree(options);
				3042	out_data:
				3043	kfree(kernel_dev);
				3044	out_dev:
				3045	kfree(kernel_type);
				3046	out_type:
				3047	return ret;
				3048	}
				3049
				3050	SYSCALL_DEFINE5(mount, char __user , dev_name, char __user , dir_name,
				3051	char __user , type, unsigned long, flags, void __user , data)
				3052	{
				3053	return ksys_mount(dev_name, dir_name, type, flags, data);
				3054	}
				3055
				3056	/*
				3057	* Return true if path is reachable from root
				3058	*
				3059	* namespace_sem or mount_lock is held
				3060	*/
				3061	bool is_path_reachable(struct mount mnt, struct dentry dentry,
				3062	const struct path *root)
				3063	{
				3064	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
				3065	dentry = mnt->mnt_mountpoint;
				3066	mnt = mnt->mnt_parent;
				3067	}
				3068	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
				3069	}
				3070
				3071	bool path_is_under(const struct path path1, const struct path path2)
				3072	{
				3073	bool res;
				3074	read_seqlock_excl(&mount_lock);
				3075	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
				3076	read_sequnlock_excl(&mount_lock);
				3077	return res;
				3078	}
				3079	EXPORT_SYMBOL(path_is_under);
				3080
				3081	/*
				3082	* pivot_root Semantics:
				3083	* Moves the root file system of the current process to the directory put_old,
				3084	* makes new_root as the new root file system of the current process, and sets
				3085	* root/cwd of all processes which had them on the current root to new_root.
				3086	*
				3087	* Restrictions:
				3088	* The new_root and put_old must be directories, and must not be on the
				3089	* same file system as the current process root. The put_old must be
				3090	* underneath new_root, i.e. adding a non-zero number of /.. to the string
				3091	* pointed to by put_old must yield the same directory as new_root. No other
				3092	* file system may be mounted on put_old. After all, new_root is a mountpoint.
				3093	*
				3094	* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
				3095	* See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
				3096	* in this situation.
				3097	*
				3098	* Notes:
				3099	* - we don't move root/cwd if they are not at the root (reason: if something
				3100	* cared enough to change them, it's probably wrong to force them elsewhere)
				3101	* - it's okay to pick a root that isn't the root of a file system, e.g.
				3102	* /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
				3103	* though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
				3104	* first.
				3105	*/
				3106	SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
				3107	const char __user *, put_old)
				3108	{
				3109	struct path new, old, parent_path, root_parent, root;
				3110	struct mount new_mnt, root_mnt, *old_mnt;
				3111	struct mountpoint old_mp, root_mp;
				3112	int error;
				3113
				3114	if (!may_mount())
				3115	return -EPERM;
				3116
				3117	error = user_path_dir(new_root, &new);
				3118	if (error)
				3119	goto out0;
				3120
				3121	error = user_path_dir(put_old, &old);
				3122	if (error)
				3123	goto out1;
				3124
				3125	error = security_sb_pivotroot(&old, &new);
				3126	if (error)
				3127	goto out2;
				3128
				3129	get_fs_root(current->fs, &root);
				3130	old_mp = lock_mount(&old);
				3131	error = PTR_ERR(old_mp);
				3132	if (IS_ERR(old_mp))
				3133	goto out3;
				3134
				3135	error = -EINVAL;
				3136	new_mnt = real_mount(new.mnt);
				3137	root_mnt = real_mount(root.mnt);
				3138	old_mnt = real_mount(old.mnt);
				3139	if (IS_MNT_SHARED(old_mnt) \|\|
				3140	IS_MNT_SHARED(new_mnt->mnt_parent) \|\|
				3141	IS_MNT_SHARED(root_mnt->mnt_parent))
				3142	goto out4;
				3143	if (!check_mnt(root_mnt) \|\| !check_mnt(new_mnt))
				3144	goto out4;
				3145	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
				3146	goto out4;
				3147	error = -ENOENT;
				3148	if (d_unlinked(new.dentry))
				3149	goto out4;
				3150	error = -EBUSY;
				3151	if (new_mnt == root_mnt \|\| old_mnt == root_mnt)
				3152	goto out4; /* loop, on the same file system */
				3153	error = -EINVAL;
				3154	if (root.mnt->mnt_root != root.dentry)
				3155	goto out4; /* not a mountpoint */
				3156	if (!mnt_has_parent(root_mnt))
				3157	goto out4; /* not attached */
				3158	root_mp = root_mnt->mnt_mp;
				3159	if (new.mnt->mnt_root != new.dentry)
				3160	goto out4; /* not a mountpoint */
				3161	if (!mnt_has_parent(new_mnt))
				3162	goto out4; /* not attached */
				3163	/* make sure we can reach put_old from new_root */
				3164	if (!is_path_reachable(old_mnt, old.dentry, &new))
				3165	goto out4;
				3166	/* make certain new is below the root */
				3167	if (!is_path_reachable(new_mnt, new.dentry, &root))
				3168	goto out4;
				3169	root_mp->m_count++; /* pin it so it won't go away */
				3170	lock_mount_hash();
				3171	detach_mnt(new_mnt, &parent_path);
				3172	detach_mnt(root_mnt, &root_parent);
				3173	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
				3174	new_mnt->mnt.mnt_flags \|= MNT_LOCKED;
				3175	root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
				3176	}
				3177	/* mount old root on put_old */
				3178	attach_mnt(root_mnt, old_mnt, old_mp);
				3179	/* mount new_root on / */
				3180	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
				3181	touch_mnt_namespace(current->nsproxy->mnt_ns);
				3182	/* A moved mount should not expire automatically */
				3183	list_del_init(&new_mnt->mnt_expire);
				3184	put_mountpoint(root_mp);
				3185	unlock_mount_hash();
				3186	chroot_fs_refs(&root, &new);
				3187	error = 0;
				3188	out4:
				3189	unlock_mount(old_mp);
				3190	if (!error) {
				3191	path_put(&root_parent);
				3192	path_put(&parent_path);
				3193	}
				3194	out3:
				3195	path_put(&root);
				3196	out2:
				3197	path_put(&old);
				3198	out1:
				3199	path_put(&new);
				3200	out0:
				3201	return error;
				3202	}
				3203
				3204	static void __init init_mount_tree(void)
				3205	{
				3206	struct vfsmount *mnt;
				3207	struct mnt_namespace *ns;
				3208	struct path root;
				3209	struct file_system_type *type;
				3210
				3211	type = get_fs_type("rootfs");
				3212	if (!type)
				3213	panic("Can't find rootfs type");
				3214	mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
				3215	put_filesystem(type);
				3216	if (IS_ERR(mnt))
				3217	panic("Can't create rootfs");
				3218
				3219	ns = create_mnt_ns(mnt);
				3220	if (IS_ERR(ns))
				3221	panic("Can't allocate initial namespace");
				3222
				3223	init_task.nsproxy->mnt_ns = ns;
				3224	get_mnt_ns(ns);
				3225
				3226	root.mnt = mnt;
				3227	root.dentry = mnt->mnt_root;
				3228	mnt->mnt_flags \|= MNT_LOCKED;
				3229
				3230	set_fs_pwd(current->fs, &root);
				3231	set_fs_root(current->fs, &root);
				3232	}
				3233
				3234	void __init mnt_init(void)
				3235	{
				3236	int err;
				3237
				3238	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
				3239	0, SLAB_HWCACHE_ALIGN \| SLAB_PANIC, NULL);
				3240
				3241	mount_hashtable = alloc_large_system_hash("Mount-cache",
				3242	sizeof(struct hlist_head),
				3243	mhash_entries, 19,
				3244	HASH_ZERO,
				3245	&m_hash_shift, &m_hash_mask, 0, 0);
				3246	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
				3247	sizeof(struct hlist_head),
				3248	mphash_entries, 19,
				3249	HASH_ZERO,
				3250	&mp_hash_shift, &mp_hash_mask, 0, 0);
				3251
				3252	if (!mount_hashtable \|\| !mountpoint_hashtable)
				3253	panic("Failed to allocate mount hash table\n");
				3254
				3255	kernfs_init();
				3256
				3257	err = sysfs_init();
				3258	if (err)
				3259	printk(KERN_WARNING "%s: sysfs_init error: %d\n",
				3260	__func__, err);
				3261	fs_kobj = kobject_create_and_add("fs", NULL);
				3262	if (!fs_kobj)
				3263	printk(KERN_WARNING "%s: kobj create error\n", __func__);
				3264	init_rootfs();
				3265	init_mount_tree();
				3266	}
				3267
				3268	void put_mnt_ns(struct mnt_namespace *ns)
				3269	{
				3270	if (!atomic_dec_and_test(&ns->count))
				3271	return;
				3272	drop_collected_mounts(&ns->root->mnt);
				3273	free_mnt_ns(ns);
				3274	}
				3275
				3276	struct vfsmount kern_mount_data(struct file_system_type type, void *data)
				3277	{
				3278	struct vfsmount *mnt;
				3279	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data);
				3280	if (!IS_ERR(mnt)) {
				3281	/*
				3282	* it is a longterm mount, don't release mnt until
				3283	* we unmount before file sys is unregistered
				3284	*/
				3285	real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
				3286	}
				3287	return mnt;
				3288	}
				3289	EXPORT_SYMBOL_GPL(kern_mount_data);
				3290
				3291	void kern_unmount(struct vfsmount *mnt)
				3292	{
				3293	/* release long term mount so mount point can be released */
				3294	if (!IS_ERR_OR_NULL(mnt)) {
				3295	real_mount(mnt)->mnt_ns = NULL;
				3296	synchronize_rcu(); /* yecchhh... */
				3297	mntput(mnt);
				3298	}
				3299	}
				3300	EXPORT_SYMBOL(kern_unmount);
				3301
				3302	bool our_mnt(struct vfsmount *mnt)
				3303	{
				3304	return check_mnt(real_mount(mnt));
				3305	}
				3306
				3307	bool current_chrooted(void)
				3308	{
				3309	/* Does the current process have a non-standard root */
				3310	struct path ns_root;
				3311	struct path fs_root;
				3312	bool chrooted;
				3313
				3314	/* Find the namespace root */
				3315	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
				3316	ns_root.dentry = ns_root.mnt->mnt_root;
				3317	path_get(&ns_root);
				3318	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
				3319	;
				3320
				3321	get_fs_root(current->fs, &fs_root);
				3322
				3323	chrooted = !path_equal(&fs_root, &ns_root);
				3324
				3325	path_put(&fs_root);
				3326	path_put(&ns_root);
				3327
				3328	return chrooted;
				3329	}
				3330
				3331	static bool mnt_already_visible(struct mnt_namespace ns, struct vfsmount new,
				3332	int *new_mnt_flags)
				3333	{
				3334	int new_flags = *new_mnt_flags;
				3335	struct mount *mnt;
				3336	bool visible = false;
				3337
				3338	down_read(&namespace_sem);
				3339	list_for_each_entry(mnt, &ns->list, mnt_list) {
				3340	struct mount *child;
				3341	int mnt_flags;
				3342
				3343	if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
				3344	continue;
				3345
				3346	/* This mount is not fully visible if it's root directory
				3347	* is not the root directory of the filesystem.
				3348	*/
				3349	if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
				3350	continue;
				3351
				3352	/* A local view of the mount flags */
				3353	mnt_flags = mnt->mnt.mnt_flags;
				3354
				3355	/* Don't miss readonly hidden in the superblock flags */
				3356	if (sb_rdonly(mnt->mnt.mnt_sb))
				3357	mnt_flags \|= MNT_LOCK_READONLY;
				3358
				3359	/* Verify the mount flags are equal to or more permissive
				3360	* than the proposed new mount.
				3361	*/
				3362	if ((mnt_flags & MNT_LOCK_READONLY) &&
				3363	!(new_flags & MNT_READONLY))
				3364	continue;
				3365	if ((mnt_flags & MNT_LOCK_ATIME) &&
				3366	((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
				3367	continue;
				3368
				3369	/* This mount is not fully visible if there are any
				3370	* locked child mounts that cover anything except for
				3371	* empty directories.
				3372	*/
				3373	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
				3374	struct inode *inode = child->mnt_mountpoint->d_inode;
				3375	/* Only worry about locked mounts */
				3376	if (!(child->mnt.mnt_flags & MNT_LOCKED))
				3377	continue;
				3378	/* Is the directory permanetly empty? */
				3379	if (!is_empty_dir_inode(inode))
				3380	goto next;
				3381	}
				3382	/* Preserve the locked attributes */
				3383	*new_mnt_flags \|= mnt_flags & (MNT_LOCK_READONLY \| \
				3384	MNT_LOCK_ATIME);
				3385	visible = true;
				3386	goto found;
				3387	next: ;
				3388	}
				3389	found:
				3390	up_read(&namespace_sem);
				3391	return visible;
				3392	}
				3393
				3394	static bool mount_too_revealing(struct vfsmount mnt, int new_mnt_flags)
				3395	{
				3396	const unsigned long required_iflags = SB_I_NOEXEC \| SB_I_NODEV;
				3397	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
				3398	unsigned long s_iflags;
				3399
				3400	if (ns->user_ns == &init_user_ns)
				3401	return false;
				3402
				3403	/* Can this filesystem be too revealing? */
				3404	s_iflags = mnt->mnt_sb->s_iflags;
				3405	if (!(s_iflags & SB_I_USERNS_VISIBLE))
				3406	return false;
				3407
				3408	if ((s_iflags & required_iflags) != required_iflags) {
				3409	WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
				3410	required_iflags);
				3411	return true;
				3412	}
				3413
				3414	return !mnt_already_visible(ns, mnt, new_mnt_flags);
				3415	}
				3416
				3417	bool mnt_may_suid(struct vfsmount *mnt)
				3418	{
				3419	/*
				3420	* Foreign mounts (accessed via fchdir or through /proc
				3421	* symlinks) are always treated as if they are nosuid. This
				3422	* prevents namespaces from trusting potentially unsafe
				3423	* suid/sgid bits, file caps, or security labels that originate
				3424	* in other namespaces.
				3425	*/
				3426	return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
				3427	current_in_userns(mnt->mnt_sb->s_user_ns);
				3428	}
				3429
				3430	static struct ns_common mntns_get(struct task_struct task)
				3431	{
				3432	struct ns_common *ns = NULL;
				3433	struct nsproxy *nsproxy;
				3434
				3435	task_lock(task);
				3436	nsproxy = task->nsproxy;
				3437	if (nsproxy) {
				3438	ns = &nsproxy->mnt_ns->ns;
				3439	get_mnt_ns(to_mnt_ns(ns));
				3440	}
				3441	task_unlock(task);
				3442
				3443	return ns;
				3444	}
				3445
				/* mntns_put - drop a reference on the mount namespace that embeds @ns. */
				static void mntns_put(struct ns_common *ns)
				{
					struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

					put_mnt_ns(mnt_ns);
				}
				3450
				3451	static int mntns_install(struct nsproxy nsproxy, struct ns_common ns)
				3452	{
				3453	struct fs_struct *fs = current->fs;
				3454	struct mnt_namespace mnt_ns = to_mnt_ns(ns), old_mnt_ns;
				3455	struct path root;
				3456	int err;
				3457
				3458	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) \|\|
				3459	!ns_capable(current_user_ns(), CAP_SYS_CHROOT) \|\|
				3460	!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
				3461	return -EPERM;
				3462
				3463	if (fs->users != 1)
				3464	return -EINVAL;
				3465
				3466	get_mnt_ns(mnt_ns);
				3467	old_mnt_ns = nsproxy->mnt_ns;
				3468	nsproxy->mnt_ns = mnt_ns;
				3469
				3470	/* Find the root */
				3471	err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
				3472	"/", LOOKUP_DOWN, &root);
				3473	if (err) {
				3474	/* revert to old namespace */
				3475	nsproxy->mnt_ns = old_mnt_ns;
				3476	put_mnt_ns(mnt_ns);
				3477	return err;
				3478	}
				3479
				3480	put_mnt_ns(old_mnt_ns);
				3481
				3482	/* Update the pwd and root */
				3483	set_fs_pwd(fs, &root);
				3484	set_fs_root(fs, &root);
				3485
				3486	path_put(&root);
				3487	return 0;
				3488	}
				3489
				3490	static struct user_namespace mntns_owner(struct ns_common ns)
				3491	{
				3492	return to_mnt_ns(ns)->user_ns;
				3493	}
				3494
				3495	const struct proc_ns_operations mntns_operations = {
				3496	.name = "mnt",
				3497	.type = CLONE_NEWNS,
				3498	.get = mntns_get,
				3499	.put = mntns_put,
				3500	.install = mntns_install,
				3501	.owner = mntns_owner,
				3502	};