Blame - src/kernel/linux/v4.19/ipc/mqueue.c - T800

blob: 8763ee733e3c8c3379087545c8a80349dda7ee5e [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* POSIX message queues filesystem for Linux.
				3	*
				4	* Copyright (C) 2003,2004 Krzysztof Benedyczak (golbi@mat.uni.torun.pl)
				5	* Michal Wronski (michal.wronski@gmail.com)
				6	*
				7	* Spinlocks: Mohamed Abbas (abbas.mohamed@intel.com)
				8	* Lockless receive & send, fd based notify:
				9	* Manfred Spraul (manfred@colorfullife.com)
				10	*
				11	* Audit: George Wilson (ltcgcw@us.ibm.com)
				12	*
				13	* This file is released under the GPL.
				14	*/
				15
				16	#include <linux/capability.h>
				17	#include <linux/init.h>
				18	#include <linux/pagemap.h>
				19	#include <linux/file.h>
				20	#include <linux/mount.h>
				21	#include <linux/namei.h>
				22	#include <linux/sysctl.h>
				23	#include <linux/poll.h>
				24	#include <linux/mqueue.h>
				25	#include <linux/msg.h>
				26	#include <linux/skbuff.h>
				27	#include <linux/vmalloc.h>
				28	#include <linux/netlink.h>
				29	#include <linux/syscalls.h>
				30	#include <linux/audit.h>
				31	#include <linux/signal.h>
				32	#include <linux/mutex.h>
				33	#include <linux/nsproxy.h>
				34	#include <linux/pid.h>
				35	#include <linux/ipc_namespace.h>
				36	#include <linux/user_namespace.h>
				37	#include <linux/slab.h>
				38	#include <linux/sched/wake_q.h>
				39	#include <linux/sched/signal.h>
				40	#include <linux/sched/user.h>
				41
				42	#include <net/sock.h>
				43	#include "util.h"
				44
				45	#define MQUEUE_MAGIC 0x19800202
				46	#define DIRENT_SIZE 20
				47	#define FILENT_SIZE 80
				48
				49	#define SEND 0
				50	#define RECV 1
				51
				52	#define STATE_NONE 0
				53	#define STATE_READY 1
				54
				55	struct posix_msg_tree_node {
				56	struct rb_node rb_node;
				57	struct list_head msg_list;
				58	int priority;
				59	};
				60
				61	struct ext_wait_queue { /* queue of sleeping tasks */
				62	struct task_struct *task;
				63	struct list_head list;
				64	struct msg_msg msg; / ptr of loaded message */
				65	int state; /* one of STATE_* values */
				66	};
				67
				68	struct mqueue_inode_info {
				69	spinlock_t lock;
				70	struct inode vfs_inode;
				71	wait_queue_head_t wait_q;
				72
				73	struct rb_root msg_tree;
				74	struct posix_msg_tree_node *node_cache;
				75	struct mq_attr attr;
				76
				77	struct sigevent notify;
				78	struct pid *notify_owner;
				79	struct user_namespace *notify_user_ns;
				80	struct user_struct user; / user who created, for accounting */
				81	struct sock *notify_sock;
				82	struct sk_buff *notify_cookie;
				83
				84	/* for tasks waiting for free space and messages, respectively */
				85	struct ext_wait_queue e_wait_q[2];
				86
				87	unsigned long qsize; /* size of queue in memory (sum of all msgs) */
				88	};
				89
				90	static const struct inode_operations mqueue_dir_inode_operations;
				91	static const struct file_operations mqueue_file_operations;
				92	static const struct super_operations mqueue_super_ops;
				93	static void remove_notification(struct mqueue_inode_info *info);
				94
				95	static struct kmem_cache *mqueue_inode_cachep;
				96
				97	static struct ctl_table_header *mq_sysctl_table;
				98
				99	static inline struct mqueue_inode_info MQUEUE_I(struct inode inode)
				100	{
				101	return container_of(inode, struct mqueue_inode_info, vfs_inode);
				102	}
				103
				104	/*
				105	* This routine should be called with the mq_lock held.
				106	*/
				107	static inline struct ipc_namespace __get_ns_from_inode(struct inode inode)
				108	{
				109	return get_ipc_ns(inode->i_sb->s_fs_info);
				110	}
				111
				112	static struct ipc_namespace get_ns_from_inode(struct inode inode)
				113	{
				114	struct ipc_namespace *ns;
				115
				116	spin_lock(&mq_lock);
				117	ns = __get_ns_from_inode(inode);
				118	spin_unlock(&mq_lock);
				119	return ns;
				120	}
				121
				122	/* Auxiliary functions to manipulate messages' list */
				123	static int msg_insert(struct msg_msg msg, struct mqueue_inode_info info)
				124	{
				125	struct rb_node *p, parent = NULL;
				126	struct posix_msg_tree_node *leaf;
				127
				128	p = &info->msg_tree.rb_node;
				129	while (*p) {
				130	parent = *p;
				131	leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
				132
				133	if (likely(leaf->priority == msg->m_type))
				134	goto insert_msg;
				135	else if (msg->m_type < leaf->priority)
				136	p = &(*p)->rb_left;
				137	else
				138	p = &(*p)->rb_right;
				139	}
				140	if (info->node_cache) {
				141	leaf = info->node_cache;
				142	info->node_cache = NULL;
				143	} else {
				144	leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC);
				145	if (!leaf)
				146	return -ENOMEM;
				147	INIT_LIST_HEAD(&leaf->msg_list);
				148	}
				149	leaf->priority = msg->m_type;
				150	rb_link_node(&leaf->rb_node, parent, p);
				151	rb_insert_color(&leaf->rb_node, &info->msg_tree);
				152	insert_msg:
				153	info->attr.mq_curmsgs++;
				154	info->qsize += msg->m_ts;
				155	list_add_tail(&msg->m_list, &leaf->msg_list);
				156	return 0;
				157	}
				158
				159	static inline struct msg_msg msg_get(struct mqueue_inode_info info)
				160	{
				161	struct rb_node *p, parent = NULL;
				162	struct posix_msg_tree_node *leaf;
				163	struct msg_msg *msg;
				164
				165	try_again:
				166	p = &info->msg_tree.rb_node;
				167	while (*p) {
				168	parent = *p;
				169	/*
				170	* During insert, low priorities go to the left and high to the
				171	* right. On receive, we want the highest priorities first, so
				172	* walk all the way to the right.
				173	*/
				174	p = &(*p)->rb_right;
				175	}
				176	if (!parent) {
				177	if (info->attr.mq_curmsgs) {
				178	pr_warn_once("Inconsistency in POSIX message queue, "
				179	"no tree element, but supposedly messages "
				180	"should exist!\n");
				181	info->attr.mq_curmsgs = 0;
				182	}
				183	return NULL;
				184	}
				185	leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
				186	if (unlikely(list_empty(&leaf->msg_list))) {
				187	pr_warn_once("Inconsistency in POSIX message queue, "
				188	"empty leaf node but we haven't implemented "
				189	"lazy leaf delete!\n");
				190	rb_erase(&leaf->rb_node, &info->msg_tree);
				191	if (info->node_cache) {
				192	kfree(leaf);
				193	} else {
				194	info->node_cache = leaf;
				195	}
				196	goto try_again;
				197	} else {
				198	msg = list_first_entry(&leaf->msg_list,
				199	struct msg_msg, m_list);
				200	list_del(&msg->m_list);
				201	if (list_empty(&leaf->msg_list)) {
				202	rb_erase(&leaf->rb_node, &info->msg_tree);
				203	if (info->node_cache) {
				204	kfree(leaf);
				205	} else {
				206	info->node_cache = leaf;
				207	}
				208	}
				209	}
				210	info->attr.mq_curmsgs--;
				211	info->qsize -= msg->m_ts;
				212	return msg;
				213	}
				214
				215	static struct inode mqueue_get_inode(struct super_block sb,
				216	struct ipc_namespace *ipc_ns, umode_t mode,
				217	struct mq_attr *attr)
				218	{
				219	struct user_struct *u = current_user();
				220	struct inode *inode;
				221	int ret = -ENOMEM;
				222
				223	inode = new_inode(sb);
				224	if (!inode)
				225	goto err;
				226
				227	inode->i_ino = get_next_ino();
				228	inode->i_mode = mode;
				229	inode->i_uid = current_fsuid();
				230	inode->i_gid = current_fsgid();
				231	inode->i_mtime = inode->i_ctime = inode->i_atime = current_time(inode);
				232
				233	if (S_ISREG(mode)) {
				234	struct mqueue_inode_info *info;
				235	unsigned long mq_bytes, mq_treesize;
				236
				237	inode->i_fop = &mqueue_file_operations;
				238	inode->i_size = FILENT_SIZE;
				239	/* mqueue specific info */
				240	info = MQUEUE_I(inode);
				241	spin_lock_init(&info->lock);
				242	init_waitqueue_head(&info->wait_q);
				243	INIT_LIST_HEAD(&info->e_wait_q[0].list);
				244	INIT_LIST_HEAD(&info->e_wait_q[1].list);
				245	info->notify_owner = NULL;
				246	info->notify_user_ns = NULL;
				247	info->qsize = 0;
				248	info->user = NULL; /* set when all is ok */
				249	info->msg_tree = RB_ROOT;
				250	info->node_cache = NULL;
				251	memset(&info->attr, 0, sizeof(info->attr));
				252	info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
				253	ipc_ns->mq_msg_default);
				254	info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
				255	ipc_ns->mq_msgsize_default);
				256	if (attr) {
				257	info->attr.mq_maxmsg = attr->mq_maxmsg;
				258	info->attr.mq_msgsize = attr->mq_msgsize;
				259	}
				260	/*
				261	* We used to allocate a static array of pointers and account
				262	* the size of that array as well as one msg_msg struct per
				263	* possible message into the queue size. That's no longer
				264	* accurate as the queue is now an rbtree and will grow and
				265	* shrink depending on usage patterns. We can, however, still
				266	* account one msg_msg struct per message, but the nodes are
				267	* allocated depending on priority usage, and most programs
				268	* only use one, or a handful, of priorities. However, since
				269	* this is pinned memory, we need to assume worst case, so
				270	* that means the min(mq_maxmsg, max_priorities) * struct
				271	* posix_msg_tree_node.
				272	*/
				273
				274	ret = -EINVAL;
				275	if (info->attr.mq_maxmsg <= 0 \|\| info->attr.mq_msgsize <= 0)
				276	goto out_inode;
				277	if (capable(CAP_SYS_RESOURCE)) {
				278	if (info->attr.mq_maxmsg > HARD_MSGMAX \|\|
				279	info->attr.mq_msgsize > HARD_MSGSIZEMAX)
				280	goto out_inode;
				281	} else {
				282	if (info->attr.mq_maxmsg > ipc_ns->mq_msg_max \|\|
				283	info->attr.mq_msgsize > ipc_ns->mq_msgsize_max)
				284	goto out_inode;
				285	}
				286	ret = -EOVERFLOW;
				287	/* check for overflow */
				288	if (info->attr.mq_msgsize > ULONG_MAX/info->attr.mq_maxmsg)
				289	goto out_inode;
				290	mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
				291	min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
				292	sizeof(struct posix_msg_tree_node);
				293	mq_bytes = info->attr.mq_maxmsg * info->attr.mq_msgsize;
				294	if (mq_bytes + mq_treesize < mq_bytes)
				295	goto out_inode;
				296	mq_bytes += mq_treesize;
				297	spin_lock(&mq_lock);
				298	if (u->mq_bytes + mq_bytes < u->mq_bytes \|\|
				299	u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) {
				300	spin_unlock(&mq_lock);
				301	/* mqueue_evict_inode() releases info->messages */
				302	ret = -EMFILE;
				303	goto out_inode;
				304	}
				305	u->mq_bytes += mq_bytes;
				306	spin_unlock(&mq_lock);
				307
				308	/* all is ok */
				309	info->user = get_uid(u);
				310	} else if (S_ISDIR(mode)) {
				311	inc_nlink(inode);
				312	/* Some things misbehave if size == 0 on a directory */
				313	inode->i_size = 2 * DIRENT_SIZE;
				314	inode->i_op = &mqueue_dir_inode_operations;
				315	inode->i_fop = &simple_dir_operations;
				316	}
				317
				318	return inode;
				319	out_inode:
				320	iput(inode);
				321	err:
				322	return ERR_PTR(ret);
				323	}
				324
				325	static int mqueue_fill_super(struct super_block sb, void data, int silent)
				326	{
				327	struct inode *inode;
				328	struct ipc_namespace *ns = sb->s_fs_info;
				329
				330	sb->s_iflags \|= SB_I_NOEXEC \| SB_I_NODEV;
				331	sb->s_blocksize = PAGE_SIZE;
				332	sb->s_blocksize_bits = PAGE_SHIFT;
				333	sb->s_magic = MQUEUE_MAGIC;
				334	sb->s_op = &mqueue_super_ops;
				335
				336	inode = mqueue_get_inode(sb, ns, S_IFDIR \| S_ISVTX \| S_IRWXUGO, NULL);
				337	if (IS_ERR(inode))
				338	return PTR_ERR(inode);
				339
				340	sb->s_root = d_make_root(inode);
				341	if (!sb->s_root)
				342	return -ENOMEM;
				343	return 0;
				344	}
				345
				346	static struct dentry mqueue_mount(struct file_system_type fs_type,
				347	int flags, const char *dev_name,
				348	void *data)
				349	{
				350	struct ipc_namespace *ns;
				351	if (flags & SB_KERNMOUNT) {
				352	ns = data;
				353	data = NULL;
				354	} else {
				355	ns = current->nsproxy->ipc_ns;
				356	}
				357	return mount_ns(fs_type, flags, data, ns, ns->user_ns, mqueue_fill_super);
				358	}
				359
				360	static void init_once(void *foo)
				361	{
				362	struct mqueue_inode_info p = (struct mqueue_inode_info ) foo;
				363
				364	inode_init_once(&p->vfs_inode);
				365	}
				366
				367	static struct inode mqueue_alloc_inode(struct super_block sb)
				368	{
				369	struct mqueue_inode_info *ei;
				370
				371	ei = kmem_cache_alloc(mqueue_inode_cachep, GFP_KERNEL);
				372	if (!ei)
				373	return NULL;
				374	return &ei->vfs_inode;
				375	}
				376
				377	static void mqueue_i_callback(struct rcu_head *head)
				378	{
				379	struct inode *inode = container_of(head, struct inode, i_rcu);
				380	kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
				381	}
				382
				383	static void mqueue_destroy_inode(struct inode *inode)
				384	{
				385	call_rcu(&inode->i_rcu, mqueue_i_callback);
				386	}
				387
				388	static void mqueue_evict_inode(struct inode *inode)
				389	{
				390	struct mqueue_inode_info *info;
				391	struct user_struct *user;
				392	struct ipc_namespace *ipc_ns;
				393	struct msg_msg msg, nmsg;
				394	LIST_HEAD(tmp_msg);
				395
				396	clear_inode(inode);
				397
				398	if (S_ISDIR(inode->i_mode))
				399	return;
				400
				401	ipc_ns = get_ns_from_inode(inode);
				402	info = MQUEUE_I(inode);
				403	spin_lock(&info->lock);
				404	while ((msg = msg_get(info)) != NULL)
				405	list_add_tail(&msg->m_list, &tmp_msg);
				406	kfree(info->node_cache);
				407	spin_unlock(&info->lock);
				408
				409	list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
				410	list_del(&msg->m_list);
				411	free_msg(msg);
				412	}
				413
				414	user = info->user;
				415	if (user) {
				416	unsigned long mq_bytes, mq_treesize;
				417
				418	/* Total amount of bytes accounted for the mqueue */
				419	mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
				420	min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
				421	sizeof(struct posix_msg_tree_node);
				422
				423	mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
				424	info->attr.mq_msgsize);
				425
				426	spin_lock(&mq_lock);
				427	user->mq_bytes -= mq_bytes;
				428	/*
				429	* get_ns_from_inode() ensures that the
				430	* (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
				431	* to which we now hold a reference, or it is NULL.
				432	* We can't put it here under mq_lock, though.
				433	*/
				434	if (ipc_ns)
				435	ipc_ns->mq_queues_count--;
				436	spin_unlock(&mq_lock);
				437	free_uid(user);
				438	}
				439	if (ipc_ns)
				440	put_ipc_ns(ipc_ns);
				441	}
				442
				443	static int mqueue_create_attr(struct dentry dentry, umode_t mode, void arg)
				444	{
				445	struct inode *dir = dentry->d_parent->d_inode;
				446	struct inode *inode;
				447	struct mq_attr *attr = arg;
				448	int error;
				449	struct ipc_namespace *ipc_ns;
				450
				451	spin_lock(&mq_lock);
				452	ipc_ns = __get_ns_from_inode(dir);
				453	if (!ipc_ns) {
				454	error = -EACCES;
				455	goto out_unlock;
				456	}
				457
				458	if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
				459	!capable(CAP_SYS_RESOURCE)) {
				460	error = -ENOSPC;
				461	goto out_unlock;
				462	}
				463	ipc_ns->mq_queues_count++;
				464	spin_unlock(&mq_lock);
				465
				466	inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
				467	if (IS_ERR(inode)) {
				468	error = PTR_ERR(inode);
				469	spin_lock(&mq_lock);
				470	ipc_ns->mq_queues_count--;
				471	goto out_unlock;
				472	}
				473
				474	put_ipc_ns(ipc_ns);
				475	dir->i_size += DIRENT_SIZE;
				476	dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir);
				477
				478	d_instantiate(dentry, inode);
				479	dget(dentry);
				480	return 0;
				481	out_unlock:
				482	spin_unlock(&mq_lock);
				483	if (ipc_ns)
				484	put_ipc_ns(ipc_ns);
				485	return error;
				486	}
				487
				488	static int mqueue_create(struct inode dir, struct dentry dentry,
				489	umode_t mode, bool excl)
				490	{
				491	return mqueue_create_attr(dentry, mode, NULL);
				492	}
				493
				494	static int mqueue_unlink(struct inode dir, struct dentry dentry)
				495	{
				496	struct inode *inode = d_inode(dentry);
				497
				498	dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir);
				499	dir->i_size -= DIRENT_SIZE;
				500	drop_nlink(inode);
				501	dput(dentry);
				502	return 0;
				503	}
				504
				505	/*
				506	* This is routine for system read from queue file.
				507	* To avoid mess with doing here some sort of mq_receive we allow
				508	* to read only queue size & notification info (the only values
				509	* that are interesting from user point of view and aren't accessible
				510	* through std routines)
				511	*/
				512	static ssize_t mqueue_read_file(struct file filp, char __user u_data,
				513	size_t count, loff_t *off)
				514	{
				515	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
				516	char buffer[FILENT_SIZE];
				517	ssize_t ret;
				518
				519	spin_lock(&info->lock);
				520	snprintf(buffer, sizeof(buffer),
				521	"QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
				522	info->qsize,
				523	info->notify_owner ? info->notify.sigev_notify : 0,
				524	(info->notify_owner &&
				525	info->notify.sigev_notify == SIGEV_SIGNAL) ?
				526	info->notify.sigev_signo : 0,
				527	pid_vnr(info->notify_owner));
				528	spin_unlock(&info->lock);
				529	buffer[sizeof(buffer)-1] = '\0';
				530
				531	ret = simple_read_from_buffer(u_data, count, off, buffer,
				532	strlen(buffer));
				533	if (ret <= 0)
				534	return ret;
				535
				536	file_inode(filp)->i_atime = file_inode(filp)->i_ctime = current_time(file_inode(filp));
				537	return ret;
				538	}
				539
				540	static int mqueue_flush_file(struct file *filp, fl_owner_t id)
				541	{
				542	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
				543
				544	spin_lock(&info->lock);
				545	if (task_tgid(current) == info->notify_owner)
				546	remove_notification(info);
				547
				548	spin_unlock(&info->lock);
				549	return 0;
				550	}
				551
				552	static __poll_t mqueue_poll_file(struct file filp, struct poll_table_struct poll_tab)
				553	{
				554	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
				555	__poll_t retval = 0;
				556
				557	poll_wait(filp, &info->wait_q, poll_tab);
				558
				559	spin_lock(&info->lock);
				560	if (info->attr.mq_curmsgs)
				561	retval = EPOLLIN \| EPOLLRDNORM;
				562
				563	if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
				564	retval \|= EPOLLOUT \| EPOLLWRNORM;
				565	spin_unlock(&info->lock);
				566
				567	return retval;
				568	}
				569
				570	/* Adds current to info->e_wait_q[sr] before element with smaller prio */
				571	static void wq_add(struct mqueue_inode_info *info, int sr,
				572	struct ext_wait_queue *ewp)
				573	{
				574	struct ext_wait_queue *walk;
				575
				576	ewp->task = current;
				577
				578	list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
				579	if (walk->task->prio <= current->prio) {
				580	list_add_tail(&ewp->list, &walk->list);
				581	return;
				582	}
				583	}
				584	list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
				585	}
				586
				587	/*
				588	* Puts current task to sleep. Caller must hold queue lock. After return
				589	* lock isn't held.
				590	* sr: SEND or RECV
				591	*/
				592	static int wq_sleep(struct mqueue_inode_info *info, int sr,
				593	ktime_t timeout, struct ext_wait_queue ewp)
				594	__releases(&info->lock)
				595	{
				596	int retval;
				597	signed long time;
				598
				599	wq_add(info, sr, ewp);
				600
				601	for (;;) {
				602	__set_current_state(TASK_INTERRUPTIBLE);
				603
				604	spin_unlock(&info->lock);
				605	time = schedule_hrtimeout_range_clock(timeout, 0,
				606	HRTIMER_MODE_ABS, CLOCK_REALTIME);
				607
				608	if (ewp->state == STATE_READY) {
				609	retval = 0;
				610	goto out;
				611	}
				612	spin_lock(&info->lock);
				613	if (ewp->state == STATE_READY) {
				614	retval = 0;
				615	goto out_unlock;
				616	}
				617	if (signal_pending(current)) {
				618	retval = -ERESTARTSYS;
				619	break;
				620	}
				621	if (time == 0) {
				622	retval = -ETIMEDOUT;
				623	break;
				624	}
				625	}
				626	list_del(&ewp->list);
				627	out_unlock:
				628	spin_unlock(&info->lock);
				629	out:
				630	return retval;
				631	}
				632
				633	/*
				634	* Returns waiting task that should be serviced first or NULL if none exists
				635	*/
				636	static struct ext_wait_queue *wq_get_first_waiter(
				637	struct mqueue_inode_info *info, int sr)
				638	{
				639	struct list_head *ptr;
				640
				641	ptr = info->e_wait_q[sr].list.prev;
				642	if (ptr == &info->e_wait_q[sr].list)
				643	return NULL;
				644	return list_entry(ptr, struct ext_wait_queue, list);
				645	}
				646
				647
				648	static inline void set_cookie(struct sk_buff *skb, char code)
				649	{
				650	((char *)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
				651	}
				652
				653	/*
				654	* The next function is only to split too long sys_mq_timedsend
				655	*/
				656	static void __do_notify(struct mqueue_inode_info *info)
				657	{
				658	/* notification
				659	* invoked when there is registered process and there isn't process
				660	* waiting synchronously for message AND state of queue changed from
				661	* empty to not empty. Here we are sure that no one is waiting
				662	* synchronously. */
				663	if (info->notify_owner &&
				664	info->attr.mq_curmsgs == 1) {
				665	struct siginfo sig_i;
				666	switch (info->notify.sigev_notify) {
				667	case SIGEV_NONE:
				668	break;
				669	case SIGEV_SIGNAL:
				670	/* sends signal */
				671
				672	clear_siginfo(&sig_i);
				673	sig_i.si_signo = info->notify.sigev_signo;
				674	sig_i.si_errno = 0;
				675	sig_i.si_code = SI_MESGQ;
				676	sig_i.si_value = info->notify.sigev_value;
				677	/* map current pid/uid into info->owner's namespaces */
				678	rcu_read_lock();
				679	sig_i.si_pid = task_tgid_nr_ns(current,
				680	ns_of_pid(info->notify_owner));
				681	sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid());
				682	rcu_read_unlock();
				683
				684	kill_pid_info(info->notify.sigev_signo,
				685	&sig_i, info->notify_owner);
				686	break;
				687	case SIGEV_THREAD:
				688	set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
				689	netlink_sendskb(info->notify_sock, info->notify_cookie);
				690	break;
				691	}
				692	/* after notification unregisters process */
				693	put_pid(info->notify_owner);
				694	put_user_ns(info->notify_user_ns);
				695	info->notify_owner = NULL;
				696	info->notify_user_ns = NULL;
				697	}
				698	wake_up(&info->wait_q);
				699	}
				700
				701	static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout,
				702	struct timespec64 *ts)
				703	{
				704	if (get_timespec64(ts, u_abs_timeout))
				705	return -EFAULT;
				706	if (!timespec64_valid(ts))
				707	return -EINVAL;
				708	return 0;
				709	}
				710
				711	static void remove_notification(struct mqueue_inode_info *info)
				712	{
				713	if (info->notify_owner != NULL &&
				714	info->notify.sigev_notify == SIGEV_THREAD) {
				715	set_cookie(info->notify_cookie, NOTIFY_REMOVED);
				716	netlink_sendskb(info->notify_sock, info->notify_cookie);
				717	}
				718	put_pid(info->notify_owner);
				719	put_user_ns(info->notify_user_ns);
				720	info->notify_owner = NULL;
				721	info->notify_user_ns = NULL;
				722	}
				723
				724	static int prepare_open(struct vfsmount mnt, struct dentry dentry, int oflag, int ro,
				725	umode_t mode, struct filename *name,
				726	struct mq_attr *attr)
				727	{
				728	static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
				729	MAY_READ \| MAY_WRITE };
				730	int acc;
				731
				732	if (d_really_is_negative(dentry)) {
				733	if (!(oflag & O_CREAT))
				734	return -ENOENT;
				735	if (ro)
				736	return ro;
				737	audit_inode_parent_hidden(name, dentry->d_parent);
				738	return vfs_mkobj2(mnt, dentry, mode & ~current_umask(),
				739	mqueue_create_attr, attr);
				740	}
				741	/* it already existed */
				742	audit_inode(name, dentry, 0);
				743	if ((oflag & (O_CREAT\|O_EXCL)) == (O_CREAT\|O_EXCL))
				744	return -EEXIST;
				745	if ((oflag & O_ACCMODE) == (O_RDWR \| O_WRONLY))
				746	return -EINVAL;
				747	acc = oflag2acc[oflag & O_ACCMODE];
				748	return inode_permission2(mnt, d_inode(dentry), acc);
				749	}
				750
				751	static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
				752	struct mq_attr *attr)
				753	{
				754	struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt;
				755	struct dentry *root = mnt->mnt_root;
				756	struct filename *name;
				757	struct path path;
				758	int fd, error;
				759	int ro;
				760
				761	audit_mq_open(oflag, mode, attr);
				762
				763	if (IS_ERR(name = getname(u_name)))
				764	return PTR_ERR(name);
				765
				766	fd = get_unused_fd_flags(O_CLOEXEC);
				767	if (fd < 0)
				768	goto out_putname;
				769
				770	ro = mnt_want_write(mnt); /* we'll drop it in any case */
				771	inode_lock(d_inode(root));
				772	path.dentry = lookup_one_len2(name->name, mnt, root, strlen(name->name));
				773	if (IS_ERR(path.dentry)) {
				774	error = PTR_ERR(path.dentry);
				775	goto out_putfd;
				776	}
				777	path.mnt = mntget(mnt);
				778	error = prepare_open(path.mnt, path.dentry, oflag, ro, mode, name, attr);
				779	if (!error) {
				780	struct file *file = dentry_open(&path, oflag, current_cred());
				781	if (!IS_ERR(file))
				782	fd_install(fd, file);
				783	else
				784	error = PTR_ERR(file);
				785	}
				786	path_put(&path);
				787	out_putfd:
				788	if (error) {
				789	put_unused_fd(fd);
				790	fd = error;
				791	}
				792	inode_unlock(d_inode(root));
				793	if (!ro)
				794	mnt_drop_write(mnt);
				795	out_putname:
				796	putname(name);
				797	return fd;
				798	}
				799
				800	SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
				801	struct mq_attr __user *, u_attr)
				802	{
				803	struct mq_attr attr;
				804	if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
				805	return -EFAULT;
				806
				807	return do_mq_open(u_name, oflag, mode, u_attr ? &attr : NULL);
				808	}
				809
				810	SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
				811	{
				812	int err;
				813	struct filename *name;
				814	struct dentry *dentry;
				815	struct inode *inode = NULL;
				816	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
				817	struct vfsmount *mnt = ipc_ns->mq_mnt;
				818
				819	name = getname(u_name);
				820	if (IS_ERR(name))
				821	return PTR_ERR(name);
				822
				823	audit_inode_parent_hidden(name, mnt->mnt_root);
				824	err = mnt_want_write(mnt);
				825	if (err)
				826	goto out_name;
				827	inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);
				828	dentry = lookup_one_len2(name->name, mnt, mnt->mnt_root,
				829	strlen(name->name));
				830	if (IS_ERR(dentry)) {
				831	err = PTR_ERR(dentry);
				832	goto out_unlock;
				833	}
				834
				835	inode = d_inode(dentry);
				836	if (!inode) {
				837	err = -ENOENT;
				838	} else {
				839	ihold(inode);
				840	err = vfs_unlink2(mnt, d_inode(dentry->d_parent), dentry, NULL);
				841	}
				842	dput(dentry);
				843
				844	out_unlock:
				845	inode_unlock(d_inode(mnt->mnt_root));
				846	if (inode)
				847	iput(inode);
				848	mnt_drop_write(mnt);
				849	out_name:
				850	putname(name);
				851
				852	return err;
				853	}
				854
				855	/* Pipelined send and receive functions.
				856	*
				857	* If a receiver finds no waiting message, then it registers itself in the
				858	* list of waiting receivers. A sender checks that list before adding the new
				859	* message into the message array. If there is a waiting receiver, then it
				860	* bypasses the message array and directly hands the message over to the
				861	* receiver. The receiver accepts the message and returns without grabbing the
				862	* queue spinlock:
				863	*
				864	* - Set pointer to message.
				865	* - Queue the receiver task for later wakeup (without the info->lock).
				866	* - Update its state to STATE_READY. Now the receiver can continue.
				867	* - Wake up the process after the lock is dropped. Should the process wake up
				868	* before this wakeup (due to a timeout or a signal) it will either see
				869	* STATE_READY and continue or acquire the lock to check the state again.
				870	*
				871	* The same algorithm is used for senders.
				872	*/
				873
				874	/* pipelined_send() - send a message directly to the task waiting in
				875	* sys_mq_timedreceive() (without inserting message into a queue).
				876	*/
				877	static inline void pipelined_send(struct wake_q_head *wake_q,
				878	struct mqueue_inode_info *info,
				879	struct msg_msg *message,
				880	struct ext_wait_queue *receiver)
				881	{
				882	receiver->msg = message;
				883	list_del(&receiver->list);
				884	wake_q_add(wake_q, receiver->task);
				885	/*
				886	* Rely on the implicit cmpxchg barrier from wake_q_add such
				887	* that we can ensure that updating receiver->state is the last
				888	* write operation: As once set, the receiver can continue,
				889	* and if we don't have the reference count from the wake_q,
				890	* yet, at that point we can later have a use-after-free
				891	* condition and bogus wakeup.
				892	*/
				893	receiver->state = STATE_READY;
				894	}
				895
				896	/* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
				897	* gets its message and put to the queue (we have one free place for sure). */
				898	static inline void pipelined_receive(struct wake_q_head *wake_q,
				899	struct mqueue_inode_info *info)
				900	{
				901	struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
				902
				903	if (!sender) {
				904	/* for poll */
				905	wake_up_interruptible(&info->wait_q);
				906	return;
				907	}
				908	if (msg_insert(sender->msg, info))
				909	return;
				910
				911	list_del(&sender->list);
				912	wake_q_add(wake_q, sender->task);
				913	sender->state = STATE_READY;
				914	}
				915
				916	static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
				917	size_t msg_len, unsigned int msg_prio,
				918	struct timespec64 *ts)
				919	{
				920	struct fd f;
				921	struct inode *inode;
				922	struct ext_wait_queue wait;
				923	struct ext_wait_queue *receiver;
				924	struct msg_msg *msg_ptr;
				925	struct mqueue_inode_info *info;
				926	ktime_t expires, *timeout = NULL;
				927	struct posix_msg_tree_node *new_leaf = NULL;
				928	int ret = 0;
				929	DEFINE_WAKE_Q(wake_q);
				930
				931	if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
				932	return -EINVAL;
				933
				934	if (ts) {
				935	expires = timespec64_to_ktime(*ts);
				936	timeout = &expires;
				937	}
				938
				939	audit_mq_sendrecv(mqdes, msg_len, msg_prio, ts);
				940
				941	f = fdget(mqdes);
				942	if (unlikely(!f.file)) {
				943	ret = -EBADF;
				944	goto out;
				945	}
				946
				947	inode = file_inode(f.file);
				948	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
				949	ret = -EBADF;
				950	goto out_fput;
				951	}
				952	info = MQUEUE_I(inode);
				953	audit_file(f.file);
				954
				955	if (unlikely(!(f.file->f_mode & FMODE_WRITE))) {
				956	ret = -EBADF;
				957	goto out_fput;
				958	}
				959
				960	if (unlikely(msg_len > info->attr.mq_msgsize)) {
				961	ret = -EMSGSIZE;
				962	goto out_fput;
				963	}
				964
				965	/* First try to allocate memory, before doing anything with
				966	* existing queues. */
				967	msg_ptr = load_msg(u_msg_ptr, msg_len);
				968	if (IS_ERR(msg_ptr)) {
				969	ret = PTR_ERR(msg_ptr);
				970	goto out_fput;
				971	}
				972	msg_ptr->m_ts = msg_len;
				973	msg_ptr->m_type = msg_prio;
				974
				975	/*
				976	* msg_insert really wants us to have a valid, spare node struct so
				977	* it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
				978	* fall back to that if necessary.
				979	*/
				980	if (!info->node_cache)
				981	new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
				982
				983	spin_lock(&info->lock);
				984
				985	if (!info->node_cache && new_leaf) {
				986	/* Save our speculative allocation into the cache */
				987	INIT_LIST_HEAD(&new_leaf->msg_list);
				988	info->node_cache = new_leaf;
				989	new_leaf = NULL;
				990	} else {
				991	kfree(new_leaf);
				992	}
				993
				994	if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
				995	if (f.file->f_flags & O_NONBLOCK) {
				996	ret = -EAGAIN;
				997	} else {
				998	wait.task = current;
				999	wait.msg = (void *) msg_ptr;
				1000	wait.state = STATE_NONE;
				1001	ret = wq_sleep(info, SEND, timeout, &wait);
				1002	/*
				1003	* wq_sleep must be called with info->lock held, and
				1004	* returns with the lock released
				1005	*/
				1006	goto out_free;
				1007	}
				1008	} else {
				1009	receiver = wq_get_first_waiter(info, RECV);
				1010	if (receiver) {
				1011	pipelined_send(&wake_q, info, msg_ptr, receiver);
				1012	} else {
				1013	/* adds message to the queue */
				1014	ret = msg_insert(msg_ptr, info);
				1015	if (ret)
				1016	goto out_unlock;
				1017	__do_notify(info);
				1018	}
				1019	inode->i_atime = inode->i_mtime = inode->i_ctime =
				1020	current_time(inode);
				1021	}
				1022	out_unlock:
				1023	spin_unlock(&info->lock);
				1024	wake_up_q(&wake_q);
				1025	out_free:
				1026	if (ret)
				1027	free_msg(msg_ptr);
				1028	out_fput:
				1029	fdput(f);
				1030	out:
				1031	return ret;
				1032	}
				1033
				1034	static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
				1035	size_t msg_len, unsigned int __user *u_msg_prio,
				1036	struct timespec64 *ts)
				1037	{
				1038	ssize_t ret;
				1039	struct msg_msg *msg_ptr;
				1040	struct fd f;
				1041	struct inode *inode;
				1042	struct mqueue_inode_info *info;
				1043	struct ext_wait_queue wait;
				1044	ktime_t expires, *timeout = NULL;
				1045	struct posix_msg_tree_node *new_leaf = NULL;
				1046
				1047	if (ts) {
				1048	expires = timespec64_to_ktime(*ts);
				1049	timeout = &expires;
				1050	}
				1051
				1052	audit_mq_sendrecv(mqdes, msg_len, 0, ts);
				1053
				1054	f = fdget(mqdes);
				1055	if (unlikely(!f.file)) {
				1056	ret = -EBADF;
				1057	goto out;
				1058	}
				1059
				1060	inode = file_inode(f.file);
				1061	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
				1062	ret = -EBADF;
				1063	goto out_fput;
				1064	}
				1065	info = MQUEUE_I(inode);
				1066	audit_file(f.file);
				1067
				1068	if (unlikely(!(f.file->f_mode & FMODE_READ))) {
				1069	ret = -EBADF;
				1070	goto out_fput;
				1071	}
				1072
				1073	/* checks if buffer is big enough */
				1074	if (unlikely(msg_len < info->attr.mq_msgsize)) {
				1075	ret = -EMSGSIZE;
				1076	goto out_fput;
				1077	}
				1078
				1079	/*
				1080	* msg_insert really wants us to have a valid, spare node struct so
				1081	* it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
				1082	* fall back to that if necessary.
				1083	*/
				1084	if (!info->node_cache)
				1085	new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
				1086
				1087	spin_lock(&info->lock);
				1088
				1089	if (!info->node_cache && new_leaf) {
				1090	/* Save our speculative allocation into the cache */
				1091	INIT_LIST_HEAD(&new_leaf->msg_list);
				1092	info->node_cache = new_leaf;
				1093	} else {
				1094	kfree(new_leaf);
				1095	}
				1096
				1097	if (info->attr.mq_curmsgs == 0) {
				1098	if (f.file->f_flags & O_NONBLOCK) {
				1099	spin_unlock(&info->lock);
				1100	ret = -EAGAIN;
				1101	} else {
				1102	wait.task = current;
				1103	wait.state = STATE_NONE;
				1104	ret = wq_sleep(info, RECV, timeout, &wait);
				1105	msg_ptr = wait.msg;
				1106	}
				1107	} else {
				1108	DEFINE_WAKE_Q(wake_q);
				1109
				1110	msg_ptr = msg_get(info);
				1111
				1112	inode->i_atime = inode->i_mtime = inode->i_ctime =
				1113	current_time(inode);
				1114
				1115	/* There is now free space in queue. */
				1116	pipelined_receive(&wake_q, info);
				1117	spin_unlock(&info->lock);
				1118	wake_up_q(&wake_q);
				1119	ret = 0;
				1120	}
				1121	if (ret == 0) {
				1122	ret = msg_ptr->m_ts;
				1123
				1124	if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) \|\|
				1125	store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
				1126	ret = -EFAULT;
				1127	}
				1128	free_msg(msg_ptr);
				1129	}
				1130	out_fput:
				1131	fdput(f);
				1132	out:
				1133	return ret;
				1134	}
				1135
				1136	SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
				1137	size_t, msg_len, unsigned int, msg_prio,
				1138	const struct __kernel_timespec __user *, u_abs_timeout)
				1139	{
				1140	struct timespec64 ts, *p = NULL;
				1141	if (u_abs_timeout) {
				1142	int res = prepare_timeout(u_abs_timeout, &ts);
				1143	if (res)
				1144	return res;
				1145	p = &ts;
				1146	}
				1147	return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p);
				1148	}
				1149
				1150	SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
				1151	size_t, msg_len, unsigned int __user *, u_msg_prio,
				1152	const struct __kernel_timespec __user *, u_abs_timeout)
				1153	{
				1154	struct timespec64 ts, *p = NULL;
				1155	if (u_abs_timeout) {
				1156	int res = prepare_timeout(u_abs_timeout, &ts);
				1157	if (res)
				1158	return res;
				1159	p = &ts;
				1160	}
				1161	return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p);
				1162	}
				1163
				1164	/*
				1165	* Notes: the case when user wants us to deregister (with NULL as pointer)
				1166	* and he isn't currently owner of notification, will be silently discarded.
				1167	* It isn't explicitly defined in the POSIX.
				1168	*/
				1169	static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
				1170	{
				1171	int ret;
				1172	struct fd f;
				1173	struct sock *sock;
				1174	struct inode *inode;
				1175	struct mqueue_inode_info *info;
				1176	struct sk_buff *nc;
				1177
				1178	audit_mq_notify(mqdes, notification);
				1179
				1180	nc = NULL;
				1181	sock = NULL;
				1182	if (notification != NULL) {
				1183	if (unlikely(notification->sigev_notify != SIGEV_NONE &&
				1184	notification->sigev_notify != SIGEV_SIGNAL &&
				1185	notification->sigev_notify != SIGEV_THREAD))
				1186	return -EINVAL;
				1187	if (notification->sigev_notify == SIGEV_SIGNAL &&
				1188	!valid_signal(notification->sigev_signo)) {
				1189	return -EINVAL;
				1190	}
				1191	if (notification->sigev_notify == SIGEV_THREAD) {
				1192	long timeo;
				1193
				1194	/* create the notify skb */
				1195	nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
				1196	if (!nc) {
				1197	ret = -ENOMEM;
				1198	goto out;
				1199	}
				1200	if (copy_from_user(nc->data,
				1201	notification->sigev_value.sival_ptr,
				1202	NOTIFY_COOKIE_LEN)) {
				1203	ret = -EFAULT;
				1204	goto out;
				1205	}
				1206
				1207	/* TODO: add a header? */
				1208	skb_put(nc, NOTIFY_COOKIE_LEN);
				1209	/* and attach it to the socket */
				1210	retry:
				1211	f = fdget(notification->sigev_signo);
				1212	if (!f.file) {
				1213	ret = -EBADF;
				1214	goto out;
				1215	}
				1216	sock = netlink_getsockbyfilp(f.file);
				1217	fdput(f);
				1218	if (IS_ERR(sock)) {
				1219	ret = PTR_ERR(sock);
				1220	sock = NULL;
				1221	goto out;
				1222	}
				1223
				1224	timeo = MAX_SCHEDULE_TIMEOUT;
				1225	ret = netlink_attachskb(sock, nc, &timeo, NULL);
				1226	if (ret == 1) {
				1227	sock = NULL;
				1228	goto retry;
				1229	}
				1230	if (ret) {
				1231	sock = NULL;
				1232	nc = NULL;
				1233	goto out;
				1234	}
				1235	}
				1236	}
				1237
				1238	f = fdget(mqdes);
				1239	if (!f.file) {
				1240	ret = -EBADF;
				1241	goto out;
				1242	}
				1243
				1244	inode = file_inode(f.file);
				1245	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
				1246	ret = -EBADF;
				1247	goto out_fput;
				1248	}
				1249	info = MQUEUE_I(inode);
				1250
				1251	ret = 0;
				1252	spin_lock(&info->lock);
				1253	if (notification == NULL) {
				1254	if (info->notify_owner == task_tgid(current)) {
				1255	remove_notification(info);
				1256	inode->i_atime = inode->i_ctime = current_time(inode);
				1257	}
				1258	} else if (info->notify_owner != NULL) {
				1259	ret = -EBUSY;
				1260	} else {
				1261	switch (notification->sigev_notify) {
				1262	case SIGEV_NONE:
				1263	info->notify.sigev_notify = SIGEV_NONE;
				1264	break;
				1265	case SIGEV_THREAD:
				1266	info->notify_sock = sock;
				1267	info->notify_cookie = nc;
				1268	sock = NULL;
				1269	nc = NULL;
				1270	info->notify.sigev_notify = SIGEV_THREAD;
				1271	break;
				1272	case SIGEV_SIGNAL:
				1273	info->notify.sigev_signo = notification->sigev_signo;
				1274	info->notify.sigev_value = notification->sigev_value;
				1275	info->notify.sigev_notify = SIGEV_SIGNAL;
				1276	break;
				1277	}
				1278
				1279	info->notify_owner = get_pid(task_tgid(current));
				1280	info->notify_user_ns = get_user_ns(current_user_ns());
				1281	inode->i_atime = inode->i_ctime = current_time(inode);
				1282	}
				1283	spin_unlock(&info->lock);
				1284	out_fput:
				1285	fdput(f);
				1286	out:
				1287	if (sock)
				1288	netlink_detachskb(sock, nc);
				1289	else if (nc)
				1290	dev_kfree_skb(nc);
				1291
				1292	return ret;
				1293	}
				1294
				1295	SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
				1296	const struct sigevent __user *, u_notification)
				1297	{
				1298	struct sigevent n, *p = NULL;
				1299	if (u_notification) {
				1300	if (copy_from_user(&n, u_notification, sizeof(struct sigevent)))
				1301	return -EFAULT;
				1302	p = &n;
				1303	}
				1304	return do_mq_notify(mqdes, p);
				1305	}
				1306
				1307	static int do_mq_getsetattr(int mqdes, struct mq_attr new, struct mq_attr old)
				1308	{
				1309	struct fd f;
				1310	struct inode *inode;
				1311	struct mqueue_inode_info *info;
				1312
				1313	if (new && (new->mq_flags & (~O_NONBLOCK)))
				1314	return -EINVAL;
				1315
				1316	f = fdget(mqdes);
				1317	if (!f.file)
				1318	return -EBADF;
				1319
				1320	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
				1321	fdput(f);
				1322	return -EBADF;
				1323	}
				1324
				1325	inode = file_inode(f.file);
				1326	info = MQUEUE_I(inode);
				1327
				1328	spin_lock(&info->lock);
				1329
				1330	if (old) {
				1331	*old = info->attr;
				1332	old->mq_flags = f.file->f_flags & O_NONBLOCK;
				1333	}
				1334	if (new) {
				1335	audit_mq_getsetattr(mqdes, new);
				1336	spin_lock(&f.file->f_lock);
				1337	if (new->mq_flags & O_NONBLOCK)
				1338	f.file->f_flags \|= O_NONBLOCK;
				1339	else
				1340	f.file->f_flags &= ~O_NONBLOCK;
				1341	spin_unlock(&f.file->f_lock);
				1342
				1343	inode->i_atime = inode->i_ctime = current_time(inode);
				1344	}
				1345
				1346	spin_unlock(&info->lock);
				1347	fdput(f);
				1348	return 0;
				1349	}
				1350
				1351	SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
				1352	const struct mq_attr __user *, u_mqstat,
				1353	struct mq_attr __user *, u_omqstat)
				1354	{
				1355	int ret;
				1356	struct mq_attr mqstat, omqstat;
				1357	struct mq_attr new = NULL, old = NULL;
				1358
				1359	if (u_mqstat) {
				1360	new = &mqstat;
				1361	if (copy_from_user(new, u_mqstat, sizeof(struct mq_attr)))
				1362	return -EFAULT;
				1363	}
				1364	if (u_omqstat)
				1365	old = &omqstat;
				1366
				1367	ret = do_mq_getsetattr(mqdes, new, old);
				1368	if (ret \|\| !old)
				1369	return ret;
				1370
				1371	if (copy_to_user(u_omqstat, old, sizeof(struct mq_attr)))
				1372	return -EFAULT;
				1373	return 0;
				1374	}
				1375
				1376	#ifdef CONFIG_COMPAT
				1377
				1378	struct compat_mq_attr {
				1379	compat_long_t mq_flags; /* message queue flags */
				1380	compat_long_t mq_maxmsg; /* maximum number of messages */
				1381	compat_long_t mq_msgsize; /* maximum message size */
				1382	compat_long_t mq_curmsgs; /* number of messages currently queued */
				1383	compat_long_t __reserved[4]; /* ignored for input, zeroed for output */
				1384	};
				1385
				1386	static inline int get_compat_mq_attr(struct mq_attr *attr,
				1387	const struct compat_mq_attr __user *uattr)
				1388	{
				1389	struct compat_mq_attr v;
				1390
				1391	if (copy_from_user(&v, uattr, sizeof(*uattr)))
				1392	return -EFAULT;
				1393
				1394	memset(attr, 0, sizeof(*attr));
				1395	attr->mq_flags = v.mq_flags;
				1396	attr->mq_maxmsg = v.mq_maxmsg;
				1397	attr->mq_msgsize = v.mq_msgsize;
				1398	attr->mq_curmsgs = v.mq_curmsgs;
				1399	return 0;
				1400	}
				1401
				1402	static inline int put_compat_mq_attr(const struct mq_attr *attr,
				1403	struct compat_mq_attr __user *uattr)
				1404	{
				1405	struct compat_mq_attr v;
				1406
				1407	memset(&v, 0, sizeof(v));
				1408	v.mq_flags = attr->mq_flags;
				1409	v.mq_maxmsg = attr->mq_maxmsg;
				1410	v.mq_msgsize = attr->mq_msgsize;
				1411	v.mq_curmsgs = attr->mq_curmsgs;
				1412	if (copy_to_user(uattr, &v, sizeof(*uattr)))
				1413	return -EFAULT;
				1414	return 0;
				1415	}
				1416
				1417	COMPAT_SYSCALL_DEFINE4(mq_open, const char __user *, u_name,
				1418	int, oflag, compat_mode_t, mode,
				1419	struct compat_mq_attr __user *, u_attr)
				1420	{
				1421	struct mq_attr attr, *p = NULL;
				1422	if (u_attr && oflag & O_CREAT) {
				1423	p = &attr;
				1424	if (get_compat_mq_attr(&attr, u_attr))
				1425	return -EFAULT;
				1426	}
				1427	return do_mq_open(u_name, oflag, mode, p);
				1428	}
				1429
				1430	COMPAT_SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
				1431	const struct compat_sigevent __user *, u_notification)
				1432	{
				1433	struct sigevent n, *p = NULL;
				1434	if (u_notification) {
				1435	if (get_compat_sigevent(&n, u_notification))
				1436	return -EFAULT;
				1437	if (n.sigev_notify == SIGEV_THREAD)
				1438	n.sigev_value.sival_ptr = compat_ptr(n.sigev_value.sival_int);
				1439	p = &n;
				1440	}
				1441	return do_mq_notify(mqdes, p);
				1442	}
				1443
				1444	COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
				1445	const struct compat_mq_attr __user *, u_mqstat,
				1446	struct compat_mq_attr __user *, u_omqstat)
				1447	{
				1448	int ret;
				1449	struct mq_attr mqstat, omqstat;
				1450	struct mq_attr new = NULL, old = NULL;
				1451
				1452	if (u_mqstat) {
				1453	new = &mqstat;
				1454	if (get_compat_mq_attr(new, u_mqstat))
				1455	return -EFAULT;
				1456	}
				1457	if (u_omqstat)
				1458	old = &omqstat;
				1459
				1460	ret = do_mq_getsetattr(mqdes, new, old);
				1461	if (ret \|\| !old)
				1462	return ret;
				1463
				1464	if (put_compat_mq_attr(old, u_omqstat))
				1465	return -EFAULT;
				1466	return 0;
				1467	}
				1468	#endif
				1469
				1470	#ifdef CONFIG_COMPAT_32BIT_TIME
				1471	static int compat_prepare_timeout(const struct compat_timespec __user *p,
				1472	struct timespec64 *ts)
				1473	{
				1474	if (compat_get_timespec64(ts, p))
				1475	return -EFAULT;
				1476	if (!timespec64_valid(ts))
				1477	return -EINVAL;
				1478	return 0;
				1479	}
				1480
				1481	COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes,
				1482	const char __user *, u_msg_ptr,
				1483	compat_size_t, msg_len, unsigned int, msg_prio,
				1484	const struct compat_timespec __user *, u_abs_timeout)
				1485	{
				1486	struct timespec64 ts, *p = NULL;
				1487	if (u_abs_timeout) {
				1488	int res = compat_prepare_timeout(u_abs_timeout, &ts);
				1489	if (res)
				1490	return res;
				1491	p = &ts;
				1492	}
				1493	return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p);
				1494	}
				1495
				1496	COMPAT_SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes,
				1497	char __user *, u_msg_ptr,
				1498	compat_size_t, msg_len, unsigned int __user *, u_msg_prio,
				1499	const struct compat_timespec __user *, u_abs_timeout)
				1500	{
				1501	struct timespec64 ts, *p = NULL;
				1502	if (u_abs_timeout) {
				1503	int res = compat_prepare_timeout(u_abs_timeout, &ts);
				1504	if (res)
				1505	return res;
				1506	p = &ts;
				1507	}
				1508	return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p);
				1509	}
				1510	#endif
				1511
				1512	static const struct inode_operations mqueue_dir_inode_operations = {
				1513	.lookup = simple_lookup,
				1514	.create = mqueue_create,
				1515	.unlink = mqueue_unlink,
				1516	};
				1517
				1518	static const struct file_operations mqueue_file_operations = {
				1519	.flush = mqueue_flush_file,
				1520	.poll = mqueue_poll_file,
				1521	.read = mqueue_read_file,
				1522	.llseek = default_llseek,
				1523	};
				1524
				1525	static const struct super_operations mqueue_super_ops = {
				1526	.alloc_inode = mqueue_alloc_inode,
				1527	.destroy_inode = mqueue_destroy_inode,
				1528	.evict_inode = mqueue_evict_inode,
				1529	.statfs = simple_statfs,
				1530	};
				1531
				1532	static struct file_system_type mqueue_fs_type = {
				1533	.name = "mqueue",
				1534	.mount = mqueue_mount,
				1535	.kill_sb = kill_litter_super,
				1536	.fs_flags = FS_USERNS_MOUNT,
				1537	};
				1538
				1539	int mq_init_ns(struct ipc_namespace *ns)
				1540	{
				1541	ns->mq_queues_count = 0;
				1542	ns->mq_queues_max = DFLT_QUEUESMAX;
				1543	ns->mq_msg_max = DFLT_MSGMAX;
				1544	ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
				1545	ns->mq_msg_default = DFLT_MSG;
				1546	ns->mq_msgsize_default = DFLT_MSGSIZE;
				1547
				1548	ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
				1549	if (IS_ERR(ns->mq_mnt)) {
				1550	int err = PTR_ERR(ns->mq_mnt);
				1551	ns->mq_mnt = NULL;
				1552	return err;
				1553	}
				1554	return 0;
				1555	}
				1556
				1557	void mq_clear_sbinfo(struct ipc_namespace *ns)
				1558	{
				1559	ns->mq_mnt->mnt_sb->s_fs_info = NULL;
				1560	}
				1561
				1562	void mq_put_mnt(struct ipc_namespace *ns)
				1563	{
				1564	kern_unmount(ns->mq_mnt);
				1565	}
				1566
				1567	static int __init init_mqueue_fs(void)
				1568	{
				1569	int error;
				1570
				1571	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
				1572	sizeof(struct mqueue_inode_info), 0,
				1573	SLAB_HWCACHE_ALIGN\|SLAB_ACCOUNT, init_once);
				1574	if (mqueue_inode_cachep == NULL)
				1575	return -ENOMEM;
				1576
				1577	/* ignore failures - they are not fatal */
				1578	mq_sysctl_table = mq_register_sysctl_table();
				1579
				1580	error = register_filesystem(&mqueue_fs_type);
				1581	if (error)
				1582	goto out_sysctl;
				1583
				1584	spin_lock_init(&mq_lock);
				1585
				1586	error = mq_init_ns(&init_ipc_ns);
				1587	if (error)
				1588	goto out_filesystem;
				1589
				1590	return 0;
				1591
				1592	out_filesystem:
				1593	unregister_filesystem(&mqueue_fs_type);
				1594	out_sysctl:
				1595	if (mq_sysctl_table)
				1596	unregister_sysctl_table(mq_sysctl_table);
				1597	kmem_cache_destroy(mqueue_inode_cachep);
				1598	return error;
				1599	}
				1600
				1601	device_initcall(init_mqueue_fs);