/*
 * linux/fs/pipe.c
 *
 * Copyright (C) 1991, 1992, 1999 Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>
27
/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size (see pipe_proc_fn(),
 * which re-rounds the written value to a power-of-two page count).
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;
38
/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */
53
54static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
55{
56 if (pipe->inode)
57 mutex_lock_nested(&pipe->inode->i_mutex, subclass);
58}
59
/*
 * pipe_lock - lock a pipe for exclusive access.
 *
 * Uses the I_MUTEX_PARENT subclass because pipe_lock() nests
 * non-pipe inode locks (for writing to a file).
 */
void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);
68
69void pipe_unlock(struct pipe_inode_info *pipe)
70{
71 if (pipe->inode)
72 mutex_unlock(&pipe->inode->i_mutex);
73}
74EXPORT_SYMBOL(pipe_unlock);
75
76void pipe_double_lock(struct pipe_inode_info *pipe1,
77 struct pipe_inode_info *pipe2)
78{
79 BUG_ON(pipe1 == pipe2);
80
81 if (pipe1 < pipe2) {
82 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
83 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
84 } else {
85 pipe_lock_nested(pipe2, I_MUTEX_PARENT);
86 pipe_lock_nested(pipe1, I_MUTEX_CHILD);
87 }
88}
89
/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	/*
	 * Queue ourselves on the waitqueue BEFORE dropping the lock, so a
	 * wakeup issued between pipe_unlock() and schedule() is not lost.
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	/* Re-acquire the lock the caller expects to still hold on return. */
	pipe_lock(pipe);
}
105
/*
 * Copy *remaining bytes of user data described by @iov into the kernel
 * buffer at @addr + *offset.
 *
 * @offset and @remaining live in the caller and are advanced in place,
 * and the iovec base/len are consumed as we go, so if an atomic copy
 * faults partway through, the caller can retry with @atomic == 0 and
 * the copy resumes exactly where it stopped (see the redo1/redo2 paths
 * in pipe_write()).
 *
 * Returns 0 on success or -EFAULT if a user access faults.
 */
static int
pipe_iov_copy_from_user(void *addr, int *offset, struct iovec *iov,
			size_t *remaining, int atomic)
{
	unsigned long copy;

	while (*remaining > 0) {
		/* Skip zero-length segments. */
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, *remaining, iov->iov_len);

		if (atomic) {
			/* May fail on an unfaulted page; caller retries. */
			if (__copy_from_user_inatomic(addr + *offset,
						      iov->iov_base, copy))
				return -EFAULT;
		} else {
			if (copy_from_user(addr + *offset,
					   iov->iov_base, copy))
				return -EFAULT;
		}
		*offset += copy;
		*remaining -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}
133
/*
 * Copy *remaining bytes from the kernel buffer at @addr + *offset out
 * to the user memory described by @iov.
 *
 * Mirror of pipe_iov_copy_from_user(): @offset, @remaining and the
 * iovec are all advanced in place, so a failed atomic copy can be
 * resumed with @atomic == 0 without re-copying data (see the redo
 * path in pipe_read()).
 *
 * Returns 0 on success or -EFAULT if a user access faults.
 */
static int
pipe_iov_copy_to_user(struct iovec *iov, void *addr, int *offset,
		      size_t *remaining, int atomic)
{
	unsigned long copy;

	while (*remaining > 0) {
		/* Skip zero-length segments. */
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, *remaining, iov->iov_len);

		if (atomic) {
			/* May fail on an unfaulted page; caller retries. */
			if (__copy_to_user_inatomic(iov->iov_base,
						    addr + *offset, copy))
				return -EFAULT;
		} else {
			if (copy_to_user(iov->iov_base,
					 addr + *offset, copy))
				return -EFAULT;
		}
		*offset += copy;
		*remaining -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}
161
/*
 * Attempt to pre-fault in the user memory, so we can use atomic copies.
 * Returns the number of bytes not faulted in.
 */
static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
{
	/* Skip leading zero-length segments. */
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		/* Stop at the first segment we cannot fault in. */
		if (fault_in_pages_writeable(iov->iov_base, this_len))
			break;

		len -= this_len;
		iov++;
	}

	/* 0 means everything is resident and an atomic copy may be tried. */
	return len;
}
184
/*
 * Pre-fault in the user memory, so we can use atomic copies.
 *
 * Unlike the write-side variant this is best-effort only: failures are
 * ignored here, and the copy path falls back to a non-atomic copy when
 * the atomic attempt faults.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
	/* Skip leading zero-length segments. */
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		fault_in_pages_readable(iov->iov_base, this_len);
		len -= this_len;
		iov++;
	}
}
202
203static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
204 struct pipe_buffer *buf)
205{
206 struct page *page = buf->page;
207
208 /*
209 * If nobody else uses this page, and we don't already have a
210 * temporary page, let's keep track of it as a one-deep
211 * allocation cache. (Otherwise just release our reference to it)
212 */
213 if (page_count(page) == 1 && !pipe->tmp_page)
214 pipe->tmp_page = page;
215 else
216 page_cache_release(page);
217}
218
219/**
220 * generic_pipe_buf_map - virtually map a pipe buffer
221 * @pipe: the pipe that the buffer belongs to
222 * @buf: the buffer that should be mapped
223 * @atomic: whether to use an atomic map
224 *
225 * Description:
226 * This function returns a kernel virtual address mapping for the
227 * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
228 * and the caller has to be careful not to fault before calling
229 * the unmap function.
230 *
231 * Note that this function occupies KM_USER0 if @atomic != 0.
232 */
233void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
234 struct pipe_buffer *buf, int atomic)
235{
236 if (atomic) {
237 buf->flags |= PIPE_BUF_FLAG_ATOMIC;
238 return kmap_atomic(buf->page);
239 }
240
241 return kmap(buf->page);
242}
243EXPORT_SYMBOL(generic_pipe_buf_map);
244
245/**
246 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
247 * @pipe: the pipe that the buffer belongs to
248 * @buf: the buffer that should be unmapped
249 * @map_data: the data that the mapping function returned
250 *
251 * Description:
252 * This function undoes the mapping that ->map() provided.
253 */
254void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
255 struct pipe_buffer *buf, void *map_data)
256{
257 if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
258 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
259 kunmap_atomic(map_data);
260 } else
261 kunmap(buf->page);
262}
263EXPORT_SYMBOL(generic_pipe_buf_unmap);
264
265/**
266 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
267 * @pipe: the pipe that the buffer belongs to
268 * @buf: the buffer to attempt to steal
269 *
270 * Description:
271 * This function attempts to steal the &struct page attached to
272 * @buf. If successful, this function returns 0 and returns with
273 * the page locked. The caller may then reuse the page for whatever
274 * he wishes; the typical use is insertion into a different file
275 * page cache.
276 */
277int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
278 struct pipe_buffer *buf)
279{
280 struct page *page = buf->page;
281
282 /*
283 * A reference of one is golden, that means that the owner of this
284 * page is the only one holding a reference to it. lock the page
285 * and return OK.
286 */
287 if (page_count(page) == 1) {
288 lock_page(page);
289 return 0;
290 }
291
292 return 1;
293}
294EXPORT_SYMBOL(generic_pipe_buf_steal);
295
/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	in the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	page_cache_get(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);
311
/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info: the pipe that the buffer belongs to
 * @buf: the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	/* Always valid: anonymous pipe pages never need re-validation. */
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);
327
/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);
342
/*
 * Buffer operations for ordinary anonymous pipe pages. can_merge = 1
 * lets pipe_write() append small writes to the last partially-filled
 * buffer instead of consuming a new slot.
 */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
352
/*
 * Buffer operations for O_DIRECT ("packet mode") pipes. Identical to
 * anon_pipe_buf_ops except can_merge = 0, so every write gets its own
 * buffer and packet boundaries are preserved.
 */
static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
362
/*
 * Read up to iov_length(_iov) bytes from the pipe into the user iovec.
 *
 * Runs under the pipe inode's i_mutex. Consumes buffers from the ring
 * starting at pipe->curbuf; sleeps via pipe_wait() when the pipe is
 * empty (unless O_NONBLOCK, a pending signal, data already copied, or
 * no writers remain). Returns bytes read, 0 on EOF/null read, or a
 * negative errno.
 */
static ssize_t
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
	   unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int do_wakeup;
	ssize_t ret;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;

	total_len = iov_length(iov, nr_segs);
	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			void *addr;
			size_t chars = buf->len, remaining;
			int error, atomic;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			/* Atomic copy only if the user pages are faulted in. */
			atomic = !iov_fault_in_pages_write(iov, chars);
			remaining = chars;
redo:
			addr = ops->map(pipe, buf, atomic);
			/*
			 * buf->offset and remaining are updated in place, so
			 * the non-atomic retry below resumes mid-copy rather
			 * than duplicating data.
			 */
			error = pipe_iov_copy_to_user(iov, addr, &buf->offset,
						      &remaining, atomic);
			ops->unmap(pipe, buf, addr);
			if (unlikely(error)) {
				/*
				 * Just retry with the slow path if we failed.
				 */
				if (atomic) {
					atomic = 0;
					goto redo;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				/* Buffer fully consumed: release its page and
				 * advance the ring head. */
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			/* Tell writers we freed a slot before sleeping. */
			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	mutex_unlock(&inode->i_mutex);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}
483
484static inline int is_packetized(struct file *file)
485{
486 return (file->f_flags & O_DIRECT) != 0;
487}
488
/*
 * Write iov_length(_iov) bytes from the user iovec into the pipe.
 *
 * Runs under the pipe inode's i_mutex. First tries to append a small
 * trailing chunk to the last buffer (if its ops allow merging), then
 * fills fresh pages slot by slot, sleeping via pipe_wait() when the
 * ring is full. Raises SIGPIPE / returns -EPIPE when no readers
 * remain. Returns bytes written or a negative errno.
 */
static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
	    unsigned long nr_segs, loff_t ppos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	ssize_t ret;
	int do_wakeup;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;
	ssize_t chars;

	total_len = iov_length(iov, nr_segs);
	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		/* Only merge if the tail chunk fits in the last page. */
		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			int error, atomic = 1;
			void *addr;
			size_t remaining = chars;

			error = ops->confirm(pipe, buf);
			if (error)
				goto out;

			iov_fault_in_pages_read(iov, chars);
redo1:
			addr = ops->map(pipe, buf, atomic);
			/*
			 * offset/remaining advance in place, so the
			 * non-atomic retry resumes where the atomic
			 * copy faulted.
			 */
			error = pipe_iov_copy_from_user(addr, &offset, iov,
							&remaining, atomic);
			ops->unmap(pipe, buf, addr);
			ret = error;
			do_wakeup = 1;
			if (error) {
				if (atomic) {
					atomic = 0;
					goto redo1;
				}
				goto out;
			}
			buf->len += chars;
			total_len -= chars;
			ret = chars;
			if (!total_len)
				goto out;
		}
	}

	for (;;) {
		int bufs;

		/* Readers may disappear while we slept. */
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			char *src;
			int error, atomic = 1;
			int offset = 0;
			size_t remaining;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			chars = PAGE_SIZE;
			if (chars > total_len)
				chars = total_len;

			iov_fault_in_pages_read(iov, chars);
			remaining = chars;
redo2:
			if (atomic)
				src = kmap_atomic(page);
			else
				src = kmap(page);

			error = pipe_iov_copy_from_user(src, &offset, iov,
							&remaining, atomic);
			if (atomic)
				kunmap_atomic(src);
			else
				kunmap(page);

			if (unlikely(error)) {
				if (atomic) {
					atomic = 0;
					goto redo2;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = chars;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->nrbufs = ++bufs;
			/* The cached page is now owned by the ring. */
			pipe->tmp_page = NULL;

			total_len -= chars;
			if (!total_len)
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			/* Let readers drain the full ring before we sleep. */
			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		/* waiting_writers lets pipe_read() honor syscall merging. */
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0)
		file_update_time(filp);
	return ret;
}
670
/* ->read for the write-only end of a pipe: always rejected. */
static ssize_t
bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	return -EBADF;
}
676
/* ->write for the read-only end of a pipe: always rejected. */
static ssize_t
bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
	   loff_t *ppos)
{
	return -EBADF;
}
683
/*
 * Pipe ioctl handler. Only FIONREAD is supported: it sums the lengths
 * of all occupied ring buffers under i_mutex and copies the byte count
 * to the user-supplied int.
 */
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int count, buf, nrbufs;

	switch (cmd) {
		case FIONREAD:
			mutex_lock(&inode->i_mutex);
			pipe = inode->i_pipe;
			count = 0;
			buf = pipe->curbuf;
			nrbufs = pipe->nrbufs;
			/* Walk the occupied slots, wrapping with the ring mask. */
			while (--nrbufs >= 0) {
				count += pipe->bufs[buf].len;
				buf = (buf+1) & (pipe->buffers - 1);
			}
			mutex_unlock(&inode->i_mutex);

			return put_user(count, (int __user *)arg);
		default:
			return -EINVAL;
	}
}
708
/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		/*
		 * f_version is snapshotted against w_counter at open time
		 * (see fifo code); a mismatch means a writer has come and
		 * gone since, i.e. hangup.
		 */
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}
741
/*
 * Common release path for all pipe file flavors: drop @decr reader and
 * @decw writer references under i_mutex. The last closer frees the
 * pipe; otherwise the remaining side is woken and notified so blocked
 * peers see the hangup.
 */
static int
pipe_release(struct inode *inode, int decr, int decw)
{
	struct pipe_inode_info *pipe;

	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	pipe->readers -= decr;
	pipe->writers -= decw;

	if (!pipe->readers && !pipe->writers) {
		free_pipe_info(inode);
	} else {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	mutex_unlock(&inode->i_mutex);

	return 0;
}
763
764static int
765pipe_read_fasync(int fd, struct file *filp, int on)
766{
767 struct inode *inode = filp->f_path.dentry->d_inode;
768 int retval;
769
770 mutex_lock(&inode->i_mutex);
771 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
772 mutex_unlock(&inode->i_mutex);
773
774 return retval;
775}
776
777
778static int
779pipe_write_fasync(int fd, struct file *filp, int on)
780{
781 struct inode *inode = filp->f_path.dentry->d_inode;
782 int retval;
783
784 mutex_lock(&inode->i_mutex);
785 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
786 mutex_unlock(&inode->i_mutex);
787
788 return retval;
789}
790
791
/*
 * ->fasync for a read/write pipe file: register on both the readers'
 * and writers' SIGIO lists. If the second registration fails, the
 * first is rolled back so the state stays consistent.
 */
static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if (retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0) /* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	mutex_unlock(&inode->i_mutex);
	return retval;
}
809
810
/* ->release for the read end: drop one reader reference. */
static int
pipe_read_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 1, 0);
}
816
/* ->release for the write end: drop one writer reference. */
static int
pipe_write_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 0, 1);
}
822
823static int
824pipe_rdwr_release(struct inode *inode, struct file *filp)
825{
826 int decr, decw;
827
828 decr = (filp->f_mode & FMODE_READ) != 0;
829 decw = (filp->f_mode & FMODE_WRITE) != 0;
830 return pipe_release(inode, decr, decw);
831}
832
833static int
834pipe_read_open(struct inode *inode, struct file *filp)
835{
836 int ret = -ENOENT;
837
838 mutex_lock(&inode->i_mutex);
839
840 if (inode->i_pipe) {
841 ret = 0;
842 inode->i_pipe->readers++;
843 }
844
845 mutex_unlock(&inode->i_mutex);
846
847 return ret;
848}
849
850static int
851pipe_write_open(struct inode *inode, struct file *filp)
852{
853 int ret = -ENOENT;
854
855 mutex_lock(&inode->i_mutex);
856
857 if (inode->i_pipe) {
858 ret = 0;
859 inode->i_pipe->writers++;
860 }
861
862 mutex_unlock(&inode->i_mutex);
863
864 return ret;
865}
866
867static int
868pipe_rdwr_open(struct inode *inode, struct file *filp)
869{
870 int ret = -ENOENT;
871
872 if (!(filp->f_mode & (FMODE_READ|FMODE_WRITE)))
873 return -EINVAL;
874
875 mutex_lock(&inode->i_mutex);
876
877 if (inode->i_pipe) {
878 ret = 0;
879 if (filp->f_mode & FMODE_READ)
880 inode->i_pipe->readers++;
881 if (filp->f_mode & FMODE_WRITE)
882 inode->i_pipe->writers++;
883 }
884
885 mutex_unlock(&inode->i_mutex);
886
887 return ret;
888}
889
/*
 * The file_operations structs are not static because they
 * are also used in linux/fs/fifo.c to do operations on FIFOs.
 *
 * Pipes reuse fifos' file_operations structs.
 */
/* Operations for the read-only end; writes bounce with -EBADF. */
const struct file_operations read_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= bad_pipe_w,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_read_open,
	.release	= pipe_read_release,
	.fasync		= pipe_read_fasync,
};
907
/* Operations for the write-only end; reads bounce with -EBADF. */
const struct file_operations write_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= bad_pipe_r,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_write_open,
	.release	= pipe_write_release,
	.fasync		= pipe_write_fasync,
};
919
/* Operations for a FIFO opened read/write. */
const struct file_operations rdwr_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_rdwr_open,
	.release	= pipe_rdwr_release,
	.fasync		= pipe_rdwr_fasync,
};
932
933struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
934{
935 struct pipe_inode_info *pipe;
936
937 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
938 if (pipe) {
939 pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
940 if (pipe->bufs) {
941 init_waitqueue_head(&pipe->wait);
942 pipe->r_counter = pipe->w_counter = 1;
943 pipe->inode = inode;
944 pipe->buffers = PIPE_DEF_BUFFERS;
945 return pipe;
946 }
947 kfree(pipe);
948 }
949
950 return NULL;
951}
952
953void __free_pipe_info(struct pipe_inode_info *pipe)
954{
955 int i;
956
957 for (i = 0; i < pipe->buffers; i++) {
958 struct pipe_buffer *buf = pipe->bufs + i;
959 if (buf->ops)
960 buf->ops->release(pipe, buf);
961 }
962 if (pipe->tmp_page)
963 __free_page(pipe->tmp_page);
964 kfree(pipe->bufs);
965 kfree(pipe);
966}
967
/* Free the inode's pipe and clear i_pipe so later opens see it is gone. */
void free_pipe_info(struct inode *inode)
{
	__free_pipe_info(inode->i_pipe);
	inode->i_pipe = NULL;
}
973
974static struct vfsmount *pipe_mnt __read_mostly;
975
/*
 * pipefs_dname() is called from d_path().
 */
/* Renders pipe dentries as "pipe:[<inode number>]" (e.g. in /proc/<pid>/fd). */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				dentry->d_inode->i_ino);
}
984
/* pipefs dentries only need the custom name formatter above. */
static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};
988
/*
 * Allocate a fresh pipefs inode with an attached pipe_inode_info,
 * pre-counted as one reader plus one writer (for the fd pair created
 * by pipe(2)). Returns NULL on allocation failure.
 */
static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info(inode);
	if (!pipe)
		goto fail_iput;
	inode->i_pipe = pipe;

	pipe->readers = pipe->writers = 1;
	inode->i_fop = &rdwr_pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}
1027
/*
 * Build the write-side struct file of a new pipe: pipefs inode, an
 * anonymous dentry, and a file opened O_WRONLY (plus any of O_NONBLOCK
 * and O_DIRECT from @flags). Returns the file or an ERR_PTR; on error
 * every partially-built piece is unwound in reverse order.
 */
struct file *create_write_pipe(int flags)
{
	int err;
	struct inode *inode;
	struct file *f;
	struct path path;
	struct qstr name = { .name = "" };

	err = -ENFILE;
	inode = get_pipe_inode();
	if (!inode)
		goto err;

	err = -ENOMEM;
	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	d_instantiate(path.dentry, inode);

	err = -ENFILE;
	f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
	if (!f)
		goto err_dentry;
	f->f_mapping = inode->i_mapping;

	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
	f->f_version = 0;

	return f;

 err_dentry:
	/* path_put() drops the dentry, and with it the inode reference. */
	free_pipe_info(inode);
	path_put(&path);
	return ERR_PTR(err);

 err_inode:
	/* No dentry yet: drop the pipe and the bare inode reference. */
	free_pipe_info(inode);
	iput(inode);
 err:
	return ERR_PTR(err);
}
1071
/*
 * Undo create_write_pipe() for a file that was never installed into an
 * fd table: free the pipe, drop the path, and release the struct file.
 */
void free_write_pipe(struct file *f)
{
	free_pipe_info(f->f_dentry->d_inode);
	path_put(&f->f_path);
	put_filp(f);
}
1078
/*
 * Build the read-side struct file of a pipe by sharing the write
 * side's path (dentry + mount), taking an extra path reference for the
 * new file. Only O_NONBLOCK is honored from @flags here. Returns the
 * file or ERR_PTR(-ENFILE).
 */
struct file *create_read_pipe(struct file *wrf, int flags)
{
	/* Grab pipe from the writer */
	struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
				    &read_pipefifo_fops);
	if (!f)
		return ERR_PTR(-ENFILE);

	/* Grab and keep a reference to the same dentry/mnt as the writer. */
	path_get(&wrf->f_path);
	f->f_flags = O_RDONLY | (flags & O_NONBLOCK);

	return f;
}
1092
/*
 * Core of pipe2(): create both pipe files, allocate two fds, and
 * install them as fd[0] = read end, fd[1] = write end.
 *
 * Accepts only O_CLOEXEC | O_NONBLOCK | O_DIRECT in @flags. Returns 0
 * on success or a negative errno, with all intermediate resources
 * released in reverse order on failure.
 */
int do_pipe_flags(int *fd, int flags)
{
	struct file *fw, *fr;
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
		return -EINVAL;

	fw = create_write_pipe(flags);
	if (IS_ERR(fw))
		return PTR_ERR(fw);
	fr = create_read_pipe(fw, flags);
	error = PTR_ERR(fr);
	if (IS_ERR(fr))
		goto err_write_pipe;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	/* After fd_install() the fds own the files; no unwinding past here. */
	fd_install(fdr, fr);
	fd_install(fdw, fw);
	fd[0] = fdr;
	fd[1] = fdw;

	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	path_put(&fr->f_path);
	put_filp(fr);
 err_write_pipe:
	free_write_pipe(fw);
	return error;
}
1137
/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	int fd[2];
	int error;

	error = do_pipe_flags(fd, flags);
	if (!error) {
		if (copy_to_user(fildes, fd, sizeof(fd))) {
			/*
			 * Can't report the fds to userspace: close them
			 * again so nothing leaks, and fail with -EFAULT.
			 */
			sys_close(fd[0]);
			sys_close(fd[1]);
			error = -EFAULT;
		}
	}
	return error;
}
1157
/* Classic pipe(2): pipe2(2) with no flags. */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}
1162
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or return -ERROR on error.
 *
 * Called under the pipe's i_mutex (see pipe_fcntl()). @nr_pages must be
 * a power of two (callers round with round_pipe_size()) because the
 * ring index math relies on "& (buffers - 1)" masking.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		/*
		 * tail = number of occupied slots that wrapped past the end
		 * of the old ring; head = the slots before the wrap point.
		 */
		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	/* The copy rebased the ring so the oldest buffer is at slot 0. */
	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}
1211
1212/*
1213 * Currently we rely on the pipe array holding a power-of-2 number
1214 * of pages.
1215 */
1216static inline unsigned int round_pipe_size(unsigned int size)
1217{
1218 unsigned long nr_pages;
1219
1220 nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1221 return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
1222}
1223
/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
/*
 * sysctl handler for /proc/sys/fs/pipe-max-size: let the generic
 * handler store the value, then re-round it to a power-of-two page
 * count so F_SETPIPE_SZ comparisons stay consistent.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	pipe_max_size = round_pipe_size(pipe_max_size);
	return ret;
}
1240
/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	struct inode *i = file->f_path.dentry->d_inode;

	/* Only a FIFO inode's union member is really a pipe. */
	return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
}
1252
/*
 * fcntl(2) operations specific to pipes, under the pipe's i_mutex:
 *
 *   F_SETPIPE_SZ - resize the ring to @arg bytes, rounded up to a
 *                  power-of-two page count; capped at pipe_max_size
 *                  for callers without CAP_SYS_RESOURCE.
 *   F_GETPIPE_SZ - report the current capacity in bytes.
 *
 * Returns the (new) size in bytes, or a negative errno (-EBADF if the
 * file is not a pipe).
 */
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	mutex_lock(&pipe->inode->i_mutex);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
	}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}
1294
/* Minimal superblock ops for pipefs; inodes need no RCU-delayed free here. */
static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};
1299
/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
			&pipefs_dentry_operations, PIPEFS_MAGIC);
}
1312
/* Kernel-internal pseudo filesystem backing all pipe inodes. */
static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.mount		= pipefs_mount,
	.kill_sb	= kill_anon_super,
};
1318
1319static int __init init_pipe_fs(void)
1320{
1321 int err = register_filesystem(&pipe_fs_type);
1322
1323 if (!err) {
1324 pipe_mnt = kern_mount(&pipe_fs_type);
1325 if (IS_ERR(pipe_mnt)) {
1326 err = PTR_ERR(pipe_mnt);
1327 unregister_filesystem(&pipe_fs_type);
1328 }
1329 }
1330 return err;
1331}
1332
1333fs_initcall(init_pipe_fs);