/*
 * linux/fs/pipe.c
 *
 * Copyright (C) 1991, 1992, 1999 Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>
27
/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size (see pipe_proc_fn(),
 * which re-rounds the written value to a power-of-two page count).
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;
38
/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */
53
54static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
55{
56 if (pipe->inode)
57 mutex_lock_nested(&pipe->inode->i_mutex, subclass);
58}
59
/*
 * pipe_lock - lock a pipe for exclusive access.
 *
 * Uses the I_MUTEX_PARENT subclass because pipe_lock() nests
 * non-pipe inode locks (for writing to a file).
 */
void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);
68
69void pipe_unlock(struct pipe_inode_info *pipe)
70{
71 if (pipe->inode)
72 mutex_unlock(&pipe->inode->i_mutex);
73}
74EXPORT_SYMBOL(pipe_unlock);
75
76void pipe_double_lock(struct pipe_inode_info *pipe1,
77 struct pipe_inode_info *pipe2)
78{
79 BUG_ON(pipe1 == pipe2);
80
81 if (pipe1 < pipe2) {
82 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
83 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
84 } else {
85 pipe_lock_nested(pipe2, I_MUTEX_PARENT);
86 pipe_lock_nested(pipe1, I_MUTEX_CHILD);
87 }
88}
89
/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	/*
	 * Queue ourselves on the waitqueue BEFORE dropping the lock, so a
	 * wakeup issued between pipe_unlock() and schedule() is not lost.
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	/* Re-acquire the lock the caller expects to still hold on return. */
	pipe_lock(pipe);
}
105
/*
 * Copy *remaining bytes of user data described by @iov into the kernel
 * buffer at @addr + *offset.
 *
 * @offset and @remaining live in the caller and are advanced in place,
 * and the iovec base/len are consumed as we go, so if an atomic copy
 * faults partway through, the caller can retry with @atomic == 0 and
 * the copy resumes exactly where it stopped (see the redo1/redo2 paths
 * in pipe_write()).
 *
 * Returns 0 on success or -EFAULT if a user access faults.
 */
static int
pipe_iov_copy_from_user(void *addr, int *offset, struct iovec *iov,
			size_t *remaining, int atomic)
{
	unsigned long copy;

	while (*remaining > 0) {
		/* Skip zero-length segments. */
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, *remaining, iov->iov_len);

		if (atomic) {
			/* May fail on an unfaulted page; caller retries. */
			if (__copy_from_user_inatomic(addr + *offset,
						      iov->iov_base, copy))
				return -EFAULT;
		} else {
			if (copy_from_user(addr + *offset,
					   iov->iov_base, copy))
				return -EFAULT;
		}
		*offset += copy;
		*remaining -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}
133
/*
 * Copy *remaining bytes from the kernel buffer at @addr + *offset out
 * to the user memory described by @iov.
 *
 * Mirror of pipe_iov_copy_from_user(): @offset, @remaining and the
 * iovec are all advanced in place, so a failed atomic copy can be
 * resumed with @atomic == 0 without re-copying data (see the redo
 * path in pipe_read()).
 *
 * Returns 0 on success or -EFAULT if a user access faults.
 */
static int
pipe_iov_copy_to_user(struct iovec *iov, void *addr, int *offset,
		      size_t *remaining, int atomic)
{
	unsigned long copy;

	while (*remaining > 0) {
		/* Skip zero-length segments. */
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, *remaining, iov->iov_len);

		if (atomic) {
			/* May fail on an unfaulted page; caller retries. */
			if (__copy_to_user_inatomic(iov->iov_base,
						    addr + *offset, copy))
				return -EFAULT;
		} else {
			if (copy_to_user(iov->iov_base,
					 addr + *offset, copy))
				return -EFAULT;
		}
		*offset += copy;
		*remaining -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}
161
/*
 * Attempt to pre-fault in the user memory, so we can use atomic copies.
 * Returns the number of bytes not faulted in.
 */
static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
{
	/* Skip leading zero-length segments. */
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		/* Stop at the first segment we cannot fault in. */
		if (fault_in_pages_writeable(iov->iov_base, this_len))
			break;

		len -= this_len;
		iov++;
	}

	/* 0 means everything is resident and an atomic copy may be tried. */
	return len;
}
184
/*
 * Pre-fault in the user memory, so we can use atomic copies.
 *
 * Unlike the write-side variant this is best-effort only: failures are
 * ignored here, and the copy path falls back to a non-atomic copy when
 * the atomic attempt faults.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
	/* Skip leading zero-length segments. */
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		fault_in_pages_readable(iov->iov_base, this_len);
		len -= this_len;
		iov++;
	}
}
202
203static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
204 struct pipe_buffer *buf)
205{
206 struct page *page = buf->page;
207
208 /*
209 * If nobody else uses this page, and we don't already have a
210 * temporary page, let's keep track of it as a one-deep
211 * allocation cache. (Otherwise just release our reference to it)
212 */
213 if (page_count(page) == 1 && !pipe->tmp_page)
214 pipe->tmp_page = page;
215 else
216 page_cache_release(page);
217}
218
219/**
220 * generic_pipe_buf_map - virtually map a pipe buffer
221 * @pipe: the pipe that the buffer belongs to
222 * @buf: the buffer that should be mapped
223 * @atomic: whether to use an atomic map
224 *
225 * Description:
226 * This function returns a kernel virtual address mapping for the
227 * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
228 * and the caller has to be careful not to fault before calling
229 * the unmap function.
230 *
231 * Note that this function occupies KM_USER0 if @atomic != 0.
232 */
233void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
234 struct pipe_buffer *buf, int atomic)
235{
236 if (atomic) {
237 buf->flags |= PIPE_BUF_FLAG_ATOMIC;
238 return kmap_atomic(buf->page);
239 }
240
241 return kmap(buf->page);
242}
243EXPORT_SYMBOL(generic_pipe_buf_map);
244
245/**
246 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
247 * @pipe: the pipe that the buffer belongs to
248 * @buf: the buffer that should be unmapped
249 * @map_data: the data that the mapping function returned
250 *
251 * Description:
252 * This function undoes the mapping that ->map() provided.
253 */
254void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
255 struct pipe_buffer *buf, void *map_data)
256{
257 if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
258 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
259 kunmap_atomic(map_data);
260 } else
261 kunmap(buf->page);
262}
263EXPORT_SYMBOL(generic_pipe_buf_unmap);
264
265/**
266 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
267 * @pipe: the pipe that the buffer belongs to
268 * @buf: the buffer to attempt to steal
269 *
270 * Description:
271 * This function attempts to steal the &struct page attached to
272 * @buf. If successful, this function returns 0 and returns with
273 * the page locked. The caller may then reuse the page for whatever
274 * he wishes; the typical use is insertion into a different file
275 * page cache.
276 */
277int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
278 struct pipe_buffer *buf)
279{
280 struct page *page = buf->page;
281
282 /*
283 * A reference of one is golden, that means that the owner of this
284 * page is the only one holding a reference to it. lock the page
285 * and return OK.
286 */
287 if (page_count(page) == 1) {
288 lock_page(page);
289 return 0;
290 }
291
292 return 1;
293}
294EXPORT_SYMBOL(generic_pipe_buf_steal);
295
/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	in the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	page_cache_get(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);
311
/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info: the pipe that the buffer belongs to
 * @buf: the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	/* Always valid: anonymous pipe pages never need re-validation. */
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);
327
/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);
342
/*
 * Buffer operations for ordinary anonymous pipe pages. can_merge = 1
 * lets pipe_write() append small writes to the last partially-filled
 * buffer instead of consuming a new slot.
 */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
352
/*
 * Buffer operations for O_DIRECT ("packet mode") pipes. Identical to
 * anon_pipe_buf_ops except can_merge = 0, so every write gets its own
 * buffer and packet boundaries are preserved.
 */
static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
362
/*
 * Read up to iov_length(_iov) bytes from the pipe into the user iovec.
 *
 * Runs under the pipe inode's i_mutex. Consumes buffers from the ring
 * starting at pipe->curbuf; sleeps via pipe_wait() when the pipe is
 * empty (unless O_NONBLOCK, a pending signal, data already copied, or
 * no writers remain). Returns bytes read, 0 on EOF/null read, or a
 * negative errno.
 */
static ssize_t
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
	   unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int do_wakeup;
	ssize_t ret;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;

	total_len = iov_length(iov, nr_segs);
	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			void *addr;
			size_t chars = buf->len, remaining;
			int error, atomic;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			/* Atomic copy only if the user pages are faulted in. */
			atomic = !iov_fault_in_pages_write(iov, chars);
			remaining = chars;
redo:
			addr = ops->map(pipe, buf, atomic);
			/*
			 * buf->offset and remaining are updated in place, so
			 * the non-atomic retry below resumes mid-copy rather
			 * than duplicating data.
			 */
			error = pipe_iov_copy_to_user(iov, addr, &buf->offset,
						      &remaining, atomic);
			ops->unmap(pipe, buf, addr);
			if (unlikely(error)) {
				/*
				 * Just retry with the slow path if we failed.
				 */
				if (atomic) {
					atomic = 0;
					goto redo;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				/* Buffer fully consumed: release its page and
				 * advance the ring head. */
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			/* Tell writers we freed a slot before sleeping. */
			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	mutex_unlock(&inode->i_mutex);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}
483
484static inline int is_packetized(struct file *file)
485{
486 return (file->f_flags & O_DIRECT) != 0;
487}
488
/*
 * Write iov_length(_iov) bytes from the user iovec into the pipe.
 *
 * Runs under the pipe inode's i_mutex. First tries to append a small
 * trailing chunk to the last buffer (if its ops allow merging), then
 * fills fresh pages slot by slot, sleeping via pipe_wait() when the
 * ring is full. Raises SIGPIPE / returns -EPIPE when no readers
 * remain. Returns bytes written or a negative errno.
 */
static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
	    unsigned long nr_segs, loff_t ppos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	ssize_t ret;
	int do_wakeup;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;
	ssize_t chars;

	total_len = iov_length(iov, nr_segs);
	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		/* Only merge if the tail chunk fits in the last page. */
		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			int error, atomic = 1;
			void *addr;
			size_t remaining = chars;

			error = ops->confirm(pipe, buf);
			if (error)
				goto out;

			iov_fault_in_pages_read(iov, chars);
redo1:
			addr = ops->map(pipe, buf, atomic);
			/*
			 * offset/remaining advance in place, so the
			 * non-atomic retry resumes where the atomic
			 * copy faulted.
			 */
			error = pipe_iov_copy_from_user(addr, &offset, iov,
							&remaining, atomic);
			ops->unmap(pipe, buf, addr);
			ret = error;
			do_wakeup = 1;
			if (error) {
				if (atomic) {
					atomic = 0;
					goto redo1;
				}
				goto out;
			}
			buf->len += chars;
			total_len -= chars;
			ret = chars;
			if (!total_len)
				goto out;
		}
	}

	for (;;) {
		int bufs;

		/* Readers may disappear while we slept. */
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			char *src;
			int error, atomic = 1;
			int offset = 0;
			size_t remaining;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			chars = PAGE_SIZE;
			if (chars > total_len)
				chars = total_len;

			iov_fault_in_pages_read(iov, chars);
			remaining = chars;
redo2:
			if (atomic)
				src = kmap_atomic(page);
			else
				src = kmap(page);

			error = pipe_iov_copy_from_user(src, &offset, iov,
							&remaining, atomic);
			if (atomic)
				kunmap_atomic(src);
			else
				kunmap(page);

			if (unlikely(error)) {
				if (atomic) {
					atomic = 0;
					goto redo2;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = chars;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->nrbufs = ++bufs;
			/* The cached page is now owned by the ring. */
			pipe->tmp_page = NULL;

			total_len -= chars;
			if (!total_len)
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			/* Let readers drain the full ring before we sleep. */
			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		/* waiting_writers lets pipe_read() honor syscall merging. */
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0)
		file_update_time(filp);
	return ret;
}
670
/* ->read for the write-only end of a pipe: always rejected. */
static ssize_t
bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	return -EBADF;
}
676
/* ->write for the read-only end of a pipe: always rejected. */
static ssize_t
bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
	   loff_t *ppos)
{
	return -EBADF;
}
683
/*
 * Pipe ioctl handler. Only FIONREAD is supported: it sums the lengths
 * of all occupied ring buffers under i_mutex and copies the byte count
 * to the user-supplied int.
 */
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int count, buf, nrbufs;

	switch (cmd) {
		case FIONREAD:
			mutex_lock(&inode->i_mutex);
			pipe = inode->i_pipe;
			count = 0;
			buf = pipe->curbuf;
			nrbufs = pipe->nrbufs;
			/* Walk the occupied slots, wrapping with the ring mask. */
			while (--nrbufs >= 0) {
				count += pipe->bufs[buf].len;
				buf = (buf+1) & (pipe->buffers - 1);
			}
			mutex_unlock(&inode->i_mutex);

			return put_user(count, (int __user *)arg);
		default:
			return -EINVAL;
	}
}
708
/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		/*
		 * f_version is snapshotted against w_counter at open time
		 * (see fifo code); a mismatch means a writer has come and
		 * gone since, i.e. hangup.
		 */
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}
741
/*
 * Common release path for all pipe file flavors: drop @decr reader and
 * @decw writer references under i_mutex. The last closer frees the
 * pipe; otherwise the remaining side is woken and notified so blocked
 * peers see the hangup.
 */
static int
pipe_release(struct inode *inode, int decr, int decw)
{
	struct pipe_inode_info *pipe;

	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	pipe->readers -= decr;
	pipe->writers -= decw;

	if (!pipe->readers && !pipe->writers) {
		free_pipe_info(inode);
	} else {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	mutex_unlock(&inode->i_mutex);

	return 0;
}
763
764static int
765pipe_read_fasync(int fd, struct file *filp, int on)
766{
767 struct inode *inode = filp->f_path.dentry->d_inode;
768 int retval;
769
770 mutex_lock(&inode->i_mutex);
771 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
772 mutex_unlock(&inode->i_mutex);
773
774 return retval;
775}
776
777
778static int
779pipe_write_fasync(int fd, struct file *filp, int on)
780{
781 struct inode *inode = filp->f_path.dentry->d_inode;
782 int retval;
783
784 mutex_lock(&inode->i_mutex);
785 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
786 mutex_unlock(&inode->i_mutex);
787
788 return retval;
789}
790
791
/*
 * ->fasync for a read/write pipe file: register on both the readers'
 * and writers' SIGIO lists. If the second registration fails, the
 * first is rolled back so the state stays consistent.
 */
static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if (retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0) /* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	mutex_unlock(&inode->i_mutex);
	return retval;
}
809
810
/* ->release for the read end: drop one reader reference. */
static int
pipe_read_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 1, 0);
}
816
/* ->release for the write end: drop one writer reference. */
static int
pipe_write_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 0, 1);
}
822
823static int
824pipe_rdwr_release(struct inode *inode, struct file *filp)
825{
826 int decr, decw;
827
828 decr = (filp->f_mode & FMODE_READ) != 0;
829 decw = (filp->f_mode & FMODE_WRITE) != 0;
830 return pipe_release(inode, decr, decw);
831}
832
833static int
834pipe_read_open(struct inode *inode, struct file *filp)
835{
836 int ret = -ENOENT;
837
838 mutex_lock(&inode->i_mutex);
839
840 if (inode->i_pipe) {
841 ret = 0;
842 inode->i_pipe->readers++;
843 }
844
845 mutex_unlock(&inode->i_mutex);
846
847 return ret;
848}
849
850static int
851pipe_write_open(struct inode *inode, struct file *filp)
852{
853 int ret = -ENOENT;
854
855 mutex_lock(&inode->i_mutex);
856
857 if (inode->i_pipe) {
858 ret = 0;
859 inode->i_pipe->writers++;
860 }
861
862 mutex_unlock(&inode->i_mutex);
863
864 return ret;
865}
866
867static int
868pipe_rdwr_open(struct inode *inode, struct file *filp)
869{
870 int ret = -ENOENT;
871
872 if (!(filp->f_mode & (FMODE_READ|FMODE_WRITE)))
873 return -EINVAL;
874
875 mutex_lock(&inode->i_mutex);
876
877 if (inode->i_pipe) {
878 ret = 0;
879 if (filp->f_mode & FMODE_READ)
880 inode->i_pipe->readers++;
881 if (filp->f_mode & FMODE_WRITE)
882 inode->i_pipe->writers++;
883 }
884
885 mutex_unlock(&inode->i_mutex);
886
887 return ret;
888}
889
/*
 * The file_operations structs are not static because they
 * are also used in linux/fs/fifo.c to do operations on FIFOs.
 *
 * Pipes reuse fifos' file_operations structs.
 */
/* Operations for the read-only end; writes bounce with -EBADF. */
const struct file_operations read_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= bad_pipe_w,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_read_open,
	.release	= pipe_read_release,
	.fasync		= pipe_read_fasync,
};
907
/* Operations for the write-only end; reads bounce with -EBADF. */
const struct file_operations write_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= bad_pipe_r,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_write_open,
	.release	= pipe_write_release,
	.fasync		= pipe_write_fasync,
};
919
/* Operations for a FIFO opened read/write. */
const struct file_operations rdwr_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_rdwr_open,
	.release	= pipe_rdwr_release,
	.fasync		= pipe_rdwr_fasync,
};
932
933struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
934{
935 struct pipe_inode_info *pipe;
936
937 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
938 if (pipe) {
939 pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
940 if (pipe->bufs) {
941 init_waitqueue_head(&pipe->wait);
942 pipe->r_counter = pipe->w_counter = 1;
943 pipe->inode = inode;
944 pipe->buffers = PIPE_DEF_BUFFERS;
945 return pipe;
946 }
947 kfree(pipe);
948 }
949
950 return NULL;
951}
952
953void __free_pipe_info(struct pipe_inode_info *pipe)
954{
955 int i;
956
957 for (i = 0; i < pipe->buffers; i++) {
958 struct pipe_buffer *buf = pipe->bufs + i;
959 if (buf->ops)
960 buf->ops->release(pipe, buf);
961 }
962 if (pipe->tmp_page)
963 __free_page(pipe->tmp_page);
964 kfree(pipe->bufs);
965 kfree(pipe);
966}
967
/* Free the inode's pipe and clear i_pipe so later opens see it is gone. */
void free_pipe_info(struct inode *inode)
{
	__free_pipe_info(inode->i_pipe);
	inode->i_pipe = NULL;
}
973
974static struct vfsmount *pipe_mnt __read_mostly;
975
/*
 * pipefs_dname() is called from d_path().
 */
/* Renders pipe dentries as "pipe:[<inode number>]" (e.g. in /proc/<pid>/fd). */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				dentry->d_inode->i_ino);
}
984
/* pipefs dentries only need the custom name formatter above. */
static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};
988
/*
 * Allocate a fresh pipefs inode with an attached pipe_inode_info,
 * pre-counted as one reader plus one writer (for the fd pair created
 * by pipe(2)). Returns NULL on allocation failure.
 */
static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info(inode);
	if (!pipe)
		goto fail_iput;
	inode->i_pipe = pipe;

	pipe->readers = pipe->writers = 1;
	inode->i_fop = &rdwr_pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}
1027
/*
 * Build the write-side struct file of a new pipe: pipefs inode, an
 * anonymous dentry, and a file opened O_WRONLY (plus any of O_NONBLOCK
 * and O_DIRECT from @flags). Returns the file or an ERR_PTR; on error
 * every partially-built piece is unwound in reverse order.
 */
struct file *create_write_pipe(int flags)
{
	int err;
	struct inode *inode;
	struct file *f;
	struct path path;
	struct qstr name = { .name = "" };

	err = -ENFILE;
	inode = get_pipe_inode();
	if (!inode)
		goto err;

	err = -ENOMEM;
	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	d_instantiate(path.dentry, inode);

	err = -ENFILE;
	f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
	if (!f)
		goto err_dentry;
	f->f_mapping = inode->i_mapping;

	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
	f->f_version = 0;

	return f;

 err_dentry:
	/* path_put() drops the dentry, and with it the inode reference. */
	free_pipe_info(inode);
	path_put(&path);
	return ERR_PTR(err);

 err_inode:
	/* No dentry yet: drop the pipe and the bare inode reference. */
	free_pipe_info(inode);
	iput(inode);
 err:
	return ERR_PTR(err);
}
1071
/*
 * Undo create_write_pipe() for a file that was never installed into an
 * fd table: free the pipe, drop the path, and release the struct file.
 */
void free_write_pipe(struct file *f)
{
	free_pipe_info(f->f_dentry->d_inode);
	path_put(&f->f_path);
	put_filp(f);
}
1078
/*
 * Build the read-side struct file of a pipe by sharing the write
 * side's path (dentry + mount), taking an extra path reference for the
 * new file. Only O_NONBLOCK is honored from @flags here. Returns the
 * file or ERR_PTR(-ENFILE).
 */
struct file *create_read_pipe(struct file *wrf, int flags)
{
	/* Grab pipe from the writer */
	struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
				    &read_pipefifo_fops);
	if (!f)
		return ERR_PTR(-ENFILE);

	/* Grab and keep a reference to the same dentry/mnt as the writer. */
	path_get(&wrf->f_path);
	f->f_flags = O_RDONLY | (flags & O_NONBLOCK);

	return f;
}
1092
/*
 * Core of pipe2(): create both pipe files, allocate two fds, and
 * install them as fd[0] = read end, fd[1] = write end.
 *
 * Accepts only O_CLOEXEC | O_NONBLOCK | O_DIRECT in @flags. Returns 0
 * on success or a negative errno, with all intermediate resources
 * released in reverse order on failure.
 */
int do_pipe_flags(int *fd, int flags)
{
	struct file *fw, *fr;
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
		return -EINVAL;

	fw = create_write_pipe(flags);
	if (IS_ERR(fw))
		return PTR_ERR(fw);
	fr = create_read_pipe(fw, flags);
	error = PTR_ERR(fr);
	if (IS_ERR(fr))
		goto err_write_pipe;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	/* After fd_install() the fds own the files; no unwinding past here. */
	fd_install(fdr, fr);
	fd_install(fdw, fw);
	fd[0] = fdr;
	fd[1] = fdw;

	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	path_put(&fr->f_path);
	put_filp(fr);
 err_write_pipe:
	free_write_pipe(fw);
	return error;
}
1137
/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	int fd[2];
	int error;

	error = do_pipe_flags(fd, flags);
	if (!error) {
		if (copy_to_user(fildes, fd, sizeof(fd))) {
			/*
			 * Can't report the fds to userspace: close them
			 * again so nothing leaks, and fail with -EFAULT.
			 */
			sys_close(fd[0]);
			sys_close(fd[1]);
			error = -EFAULT;
		}
	}
	return error;
}
1157
/* Classic pipe(2): pipe2(2) with no flags. */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}
1162
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or return -ERROR on error.
 *
 * Called under the pipe's i_mutex (see pipe_fcntl()). @nr_pages must be
 * a power of two (callers round with round_pipe_size()) because the
 * ring index math relies on "& (buffers - 1)" masking.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		/*
		 * tail = number of occupied slots that wrapped past the end
		 * of the old ring; head = the slots before the wrap point.
		 */
		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	/* The copy rebased the ring so the oldest buffer is at slot 0. */
	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}
1211
1212/*
1213 * Currently we rely on the pipe array holding a power-of-2 number
1214 * of pages.
1215 */
1216static inline unsigned int round_pipe_size(unsigned int size)
1217{
1218 unsigned long nr_pages;
1219
1220 nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1221 return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
1222}
1223
/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
/*
 * sysctl handler for /proc/sys/fs/pipe-max-size: let the generic
 * handler store the value, then re-round it to a power-of-two page
 * count so F_SETPIPE_SZ comparisons stay consistent.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	pipe_max_size = round_pipe_size(pipe_max_size);
	return ret;
}
1240
/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	struct inode *i = file->f_path.dentry->d_inode;

	/* Only a FIFO inode's union member is really a pipe. */
	return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
}
1252
/*
 * fcntl(2) operations specific to pipes, under the pipe's i_mutex:
 *
 *   F_SETPIPE_SZ - resize the ring to @arg bytes, rounded up to a
 *                  power-of-two page count; capped at pipe_max_size
 *                  for callers without CAP_SYS_RESOURCE.
 *   F_GETPIPE_SZ - report the current capacity in bytes.
 *
 * Returns the (new) size in bytes, or a negative errno (-EBADF if the
 * file is not a pipe).
 */
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	mutex_lock(&pipe->inode->i_mutex);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
	}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}
1294
/* Minimal superblock ops for pipefs; inodes need no RCU-delayed free here. */
static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};
1299
/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
			&pipefs_dentry_operations, PIPEFS_MAGIC);
}
1312
/* Kernel-internal pseudo filesystem backing all pipe inodes. */
static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.mount		= pipefs_mount,
	.kill_sb	= kill_anon_super,
};
1318
1319static int __init init_pipe_fs(void)
1320{
1321 int err = register_filesystem(&pipe_fs_type);
1322
1323 if (!err) {
1324 pipe_mnt = kern_mount(&pipe_fs_type);
1325 if (IS_ERR(pipe_mnt)) {
1326 err = PTR_ERR(pipe_mnt);
1327 unregister_filesystem(&pipe_fs_type);
1328 }
1329 }
1330 return err;
1331}
1332
1333fs_initcall(init_pipe_fs);