blob: 4954fae7f73636c27b8a48d1e6d59cceaaeed3dd [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/read_write.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */
7
8#include <linux/slab.h>
9#include <linux/stat.h>
10#include <linux/sched/xacct.h>
11#include <linux/fcntl.h>
12#include <linux/file.h>
13#include <linux/uio.h>
14#include <linux/fsnotify.h>
15#include <linux/security.h>
16#include <linux/export.h>
17#include <linux/syscalls.h>
18#include <linux/pagemap.h>
19#include <linux/splice.h>
20#include <linux/compat.h>
21#include <linux/mount.h>
22#include <linux/fs.h>
23#include "internal.h"
24
25#include <linux/uaccess.h>
26#include <asm/unistd.h>
27
/*
 * Default file operations for read-only regular files: positional
 * seeking, iterator-based reads, read-only mmap and splice-read.
 * Usable as ->f_op for any file that must never be written through
 * this interface (no ->write/->write_iter is provided).
 */
const struct file_operations generic_ro_fops = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.mmap		= generic_file_readonly_mmap,
	.splice_read	= generic_file_splice_read,
};

EXPORT_SYMBOL(generic_ro_fops);
36
37static inline bool unsigned_offsets(struct file *file)
38{
39 return file->f_mode & FMODE_UNSIGNED_OFFSET;
40}
41
42/**
43 * vfs_setpos - update the file offset for lseek
44 * @file: file structure in question
45 * @offset: file offset to seek to
46 * @maxsize: maximum file size
47 *
48 * This is a low-level filesystem helper for updating the file offset to
49 * the value specified by @offset if the given offset is valid and it is
50 * not equal to the current file offset.
51 *
52 * Return the specified offset on success and -EINVAL on invalid offset.
53 */
54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
55{
56 if (offset < 0 && !unsigned_offsets(file))
57 return -EINVAL;
58 if (offset > maxsize)
59 return -EINVAL;
60
61 if (offset != file->f_pos) {
62 file->f_pos = offset;
63 file->f_version = 0;
64 }
65 return offset;
66}
67EXPORT_SYMBOL(vfs_setpos);
68
/**
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 * @maxsize:	max size of this file in file system
 * @eof:	offset used for SEEK_END position
 *
 * This is a variant of generic_file_llseek that allows passing in a custom
 * maximum file size and a custom EOF position, for e.g. hashed directories
 *
 * Synchronization:
 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 * read/writes behave like SEEK_SET against seeks.
 */
loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
		loff_t maxsize, loff_t eof)
{
	switch (whence) {
	case SEEK_END:
		offset += eof;
		break;
	case SEEK_CUR:
		/*
		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
		 * position-querying operation.  Avoid rewriting the "same"
		 * f_pos value back to the file because a concurrent read(),
		 * write() or lseek() might have altered it
		 */
		if (offset == 0)
			return file->f_pos;
		/*
		 * f_lock protects against read/modify/write race with other
		 * SEEK_CURs. Note that parallel writes and reads behave
		 * like SEEK_SET.
		 */
		spin_lock(&file->f_lock);
		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
		spin_unlock(&file->f_lock);
		return offset;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as long as
		 * offset isn't at the end of the file then the offset is data.
		 * The unsigned cast makes a negative offset compare as huge,
		 * so it fails the check as well.
		 */
		if ((unsigned long long)offset >= eof)
			return -ENXIO;
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so as long as
		 * offset isn't i_size or larger, return i_size.
		 */
		if ((unsigned long long)offset >= eof)
			return -ENXIO;
		offset = eof;
		break;
	}

	/* SEEK_SET (and the break cases above) end up here. */
	return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);
133
134/**
135 * generic_file_llseek - generic llseek implementation for regular files
136 * @file: file structure to seek on
137 * @offset: file offset to seek to
138 * @whence: type of seek
139 *
140 * This is a generic implemenation of ->llseek useable for all normal local
141 * filesystems. It just updates the file offset to the value specified by
142 * @offset and @whence.
143 */
144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
145{
146 struct inode *inode = file->f_mapping->host;
147
148 return generic_file_llseek_size(file, offset, whence,
149 inode->i_sb->s_maxbytes,
150 i_size_read(inode));
151}
152EXPORT_SYMBOL(generic_file_llseek);
153
154/**
155 * fixed_size_llseek - llseek implementation for fixed-sized devices
156 * @file: file structure to seek on
157 * @offset: file offset to seek to
158 * @whence: type of seek
159 * @size: size of the file
160 *
161 */
162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
163{
164 switch (whence) {
165 case SEEK_SET: case SEEK_CUR: case SEEK_END:
166 return generic_file_llseek_size(file, offset, whence,
167 size, size);
168 default:
169 return -EINVAL;
170 }
171}
172EXPORT_SYMBOL(fixed_size_llseek);
173
174/**
175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
176 * @file: file structure to seek on
177 * @offset: file offset to seek to
178 * @whence: type of seek
179 *
180 */
181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
182{
183 switch (whence) {
184 case SEEK_SET: case SEEK_CUR:
185 return generic_file_llseek_size(file, offset, whence,
186 OFFSET_MAX, 0);
187 default:
188 return -EINVAL;
189 }
190}
191EXPORT_SYMBOL(no_seek_end_llseek);
192
193/**
194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
195 * @file: file structure to seek on
196 * @offset: file offset to seek to
197 * @whence: type of seek
198 * @size: maximal offset allowed
199 *
200 */
201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
202{
203 switch (whence) {
204 case SEEK_SET: case SEEK_CUR:
205 return generic_file_llseek_size(file, offset, whence,
206 size, 0);
207 default:
208 return -EINVAL;
209 }
210}
211EXPORT_SYMBOL(no_seek_end_llseek_size);
212
213/**
214 * noop_llseek - No Operation Performed llseek implementation
215 * @file: file structure to seek on
216 * @offset: file offset to seek to
217 * @whence: type of seek
218 *
219 * This is an implementation of ->llseek useable for the rare special case when
220 * userspace expects the seek to succeed but the (device) file is actually not
221 * able to perform the seek. In this case you use noop_llseek() instead of
222 * falling back to the default implementation of ->llseek.
223 */
224loff_t noop_llseek(struct file *file, loff_t offset, int whence)
225{
226 return file->f_pos;
227}
228EXPORT_SYMBOL(noop_llseek);
229
230loff_t no_llseek(struct file *file, loff_t offset, int whence)
231{
232 return -ESPIPE;
233}
234EXPORT_SYMBOL(no_llseek);
235
/*
 * default_llseek - fallback ->llseek used when a filesystem supplies none.
 *
 * Unlike generic_file_llseek(), which relies on f_lock/atomicity, this
 * serializes the whole seek under the inode lock.  Supports SEEK_SET,
 * SEEK_CUR, SEEK_END, and trivial SEEK_DATA/SEEK_HOLE (whole file is
 * data with a virtual hole at EOF).
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t retval;

	inode_lock(inode);
	switch (whence) {
	case SEEK_END:
		offset += i_size_read(inode);
		break;
	case SEEK_CUR:
		/* lseek(fd, 0, SEEK_CUR) is a pure position query. */
		if (offset == 0) {
			retval = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
		break;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as
		 * long as offset isn't at the end of the file then the
		 * offset is data.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so
		 * as long as offset isn't i_size or larger, return
		 * i_size.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		offset = inode->i_size;
		break;
	}
	retval = -EINVAL;
	/* Negative results only valid for FMODE_UNSIGNED_OFFSET files. */
	if (offset >= 0 || unsigned_offsets(file)) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
		}
		retval = offset;
	}
out:
	inode_unlock(inode);
	return retval;
}
EXPORT_SYMBOL(default_llseek);
290
291loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
292{
293 loff_t (*fn)(struct file *, loff_t, int);
294
295 fn = no_llseek;
296 if (file->f_mode & FMODE_LSEEK) {
297 if (file->f_op->llseek)
298 fn = file->f_op->llseek;
299 }
300 return fn(file, offset, whence);
301}
302EXPORT_SYMBOL(vfs_llseek);
303
/*
 * ksys_lseek - lseek syscall backend.
 *
 * vfs_llseek() returns a 64-bit loff_t; the syscall returns off_t,
 * which is narrower on 32-bit ABIs.  The round-trip through retval
 * detects results that do not fit and reports -EOVERFLOW instead of
 * silently truncating.
 */
off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
	off_t retval;
	struct fd f = fdget_pos(fd);
	if (!f.file)
		return -EBADF;

	retval = -EINVAL;
	if (whence <= SEEK_MAX) {
		loff_t res = vfs_llseek(f.file, offset, whence);
		retval = res;
		if (res != (loff_t)retval)
			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
	}
	fdput_pos(f);
	return retval;
}
321
322SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
323{
324 return ksys_lseek(fd, offset, whence);
325}
326
#ifdef CONFIG_COMPAT
/* 32-bit lseek(2): compat_off_t widens to off_t before ksys_lseek(). */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
#endif
333
#ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * llseek(2) for 32-bit ABIs: the 64-bit target offset arrives split
 * into two 32-bit halves, and the 64-bit result is written to @result
 * because the syscall return value cannot carry it.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, whence)
{
	int retval;
	struct fd f = fdget_pos(fd);
	loff_t offset;

	if (!f.file)
		return -EBADF;

	retval = -EINVAL;
	if (whence > SEEK_MAX)
		goto out_putf;

	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
			whence);

	/* A negative loff_t is an errno, which fits in the int return. */
	retval = (int)offset;
	if (offset >= 0) {
		retval = -EFAULT;
		if (!copy_to_user(result, &offset, sizeof(offset)))
			retval = 0;
	}
out_putf:
	fdput_pos(f);
	return retval;
}
#endif
364
/*
 * rw_verify_area - common pre-I/O validation for the read/write paths.
 * @read_write: READ or WRITE
 * @file: file being accessed
 * @ppos: starting position of the I/O
 * @count: requested byte count
 *
 * Validates the (*ppos, count) range, applies mandatory locking when the
 * inode uses it, and asks the security layer for permission.  Returns 0
 * on success or a negative errno.
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	struct inode *inode;
	loff_t pos;
	int retval = -EINVAL;

	inode = file_inode(file);
	/* Counts above SSIZE_MAX look negative when cast; reject them. */
	if (unlikely((ssize_t) count < 0))
		return retval;
	pos = *ppos;
	if (unlikely(pos < 0)) {
		/* Negative positions need FMODE_UNSIGNED_OFFSET ... */
		if (!unsigned_offsets(file))
			return retval;
		/* ... and even then must not wrap past "offset 0". */
		if (count >= -pos) /* both values are in 0..LLONG_MAX */
			return -EOVERFLOW;
	} else if (unlikely((loff_t) (pos + count) < 0)) {
		/* pos + count overflowed LLONG_MAX. */
		if (!unsigned_offsets(file))
			return retval;
	}

	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
		retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
				read_write == READ ? F_RDLCK : F_WRLCK);
		if (retval < 0)
			return retval;
	}
	return security_file_permission(file,
				read_write == READ ? MAY_READ : MAY_WRITE);
}
394
395static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
396{
397 struct iovec iov = { .iov_base = buf, .iov_len = len };
398 struct kiocb kiocb;
399 struct iov_iter iter;
400 ssize_t ret;
401
402 init_sync_kiocb(&kiocb, filp);
403 kiocb.ki_pos = *ppos;
404 iov_iter_init(&iter, READ, &iov, 1, len);
405
406 ret = call_read_iter(filp, &kiocb, &iter);
407 BUG_ON(ret == -EIOCBQUEUED);
408 *ppos = kiocb.ki_pos;
409 return ret;
410}
411
412ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
413 loff_t *pos)
414{
415 if (file->f_op->read)
416 return file->f_op->read(file, buf, count, pos);
417 else if (file->f_op->read_iter)
418 return new_sync_read(file, buf, count, pos);
419 else
420 return -EINVAL;
421}
422
423ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
424{
425 mm_segment_t old_fs;
426 ssize_t result;
427
428 old_fs = get_fs();
429 set_fs(get_ds());
430 /* The cast to a user pointer is valid due to the set_fs() */
431 result = vfs_read(file, (void __user *)buf, count, pos);
432 set_fs(old_fs);
433 return result;
434}
435EXPORT_SYMBOL(kernel_read);
436
/*
 * vfs_read - read up to @count bytes from @file at *@pos into @buf.
 *
 * Verifies read permission on the file and writability of the user
 * buffer, runs rw_verify_area(), clamps the request to MAX_RW_COUNT,
 * and dispatches via __vfs_read().  On success fires fsnotify and
 * updates per-task I/O accounting; *@pos is advanced by the read.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);
	if (!ret) {
		if (count > MAX_RW_COUNT)
			count =  MAX_RW_COUNT;
		ret = __vfs_read(file, buf, count, pos);
		if (ret > 0) {
			fsnotify_access(file);
			add_rchar(current, ret);
		}
		/* Counted as a read syscall even when no bytes moved. */
		inc_syscr(current);
	}

	return ret;
}

EXPORT_SYMBOL_GPL(vfs_read);
464
465static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
466{
467 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
468 struct kiocb kiocb;
469 struct iov_iter iter;
470 ssize_t ret;
471
472 init_sync_kiocb(&kiocb, filp);
473 kiocb.ki_pos = *ppos;
474 iov_iter_init(&iter, WRITE, &iov, 1, len);
475
476 ret = call_write_iter(filp, &kiocb, &iter);
477 BUG_ON(ret == -EIOCBQUEUED);
478 if (ret > 0)
479 *ppos = kiocb.ki_pos;
480 return ret;
481}
482
483ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
484 loff_t *pos)
485{
486 if (file->f_op->write)
487 return file->f_op->write(file, p, count, pos);
488 else if (file->f_op->write_iter)
489 return new_sync_write(file, p, count, pos);
490 else
491 return -EINVAL;
492}
493
/*
 * __kernel_write - write from a kernel buffer.
 *
 * Only FMODE_CAN_WRITE is checked (no FMODE_WRITE test, unlike
 * vfs_write()), and no file_start_write() bracketing is done here —
 * the caller is responsible for freeze protection.  The address limit
 * is lifted so the kernel pointer passes ->write()'s user-access checks.
 */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
	mm_segment_t old_fs;
	const char __user *p;
	ssize_t ret;

	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;

	old_fs = get_fs();
	set_fs(get_ds());
	/* Valid as a "user" pointer only because of the set_fs() above. */
	p = (__force const char __user *)buf;
	if (count > MAX_RW_COUNT)
		count =  MAX_RW_COUNT;
	ret = __vfs_write(file, p, count, pos);
	set_fs(old_fs);
	if (ret > 0) {
		fsnotify_modify(file);
		add_wchar(current, ret);
	}
	inc_syscw(current);
	return ret;
}
EXPORT_SYMBOL(__kernel_write);
518
519ssize_t kernel_write(struct file *file, const void *buf, size_t count,
520 loff_t *pos)
521{
522 mm_segment_t old_fs;
523 ssize_t res;
524
525 old_fs = get_fs();
526 set_fs(get_ds());
527 /* The cast to a user pointer is valid due to the set_fs() */
528 res = vfs_write(file, (__force const char __user *)buf, count, pos);
529 set_fs(old_fs);
530
531 return res;
532}
533EXPORT_SYMBOL(kernel_write);
534
/*
 * vfs_write - write up to @count bytes from @buf to @file at *@pos.
 *
 * Verifies write permission on the file and readability of the user
 * buffer, runs rw_verify_area(), clamps the request to MAX_RW_COUNT and
 * dispatches via __vfs_write(), bracketed by file_start_write()/
 * file_end_write().  On success fires fsnotify and updates per-task
 * I/O accounting; *@pos is advanced by the write.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (!ret) {
		if (count > MAX_RW_COUNT)
			count =  MAX_RW_COUNT;
		file_start_write(file);
		ret = __vfs_write(file, buf, count, pos);
		if (ret > 0) {
			fsnotify_modify(file);
			add_wchar(current, ret);
		}
		/* Counted as a write syscall even when no bytes moved. */
		inc_syscw(current);
		file_end_write(file);
	}

	return ret;
}

EXPORT_SYMBOL_GPL(vfs_write);
564
565static inline loff_t file_pos_read(struct file *file)
566{
567 return file->f_mode & FMODE_STREAM ? 0 : file->f_pos;
568}
569
570static inline void file_pos_write(struct file *file, loff_t pos)
571{
572 if ((file->f_mode & FMODE_STREAM) == 0)
573 file->f_pos = pos;
574}
575
576ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
577{
578 struct fd f = fdget_pos(fd);
579 ssize_t ret = -EBADF;
580
581 if (f.file) {
582 loff_t pos = file_pos_read(f.file);
583 ret = vfs_read(f.file, buf, count, &pos);
584 if (ret >= 0)
585 file_pos_write(f.file, pos);
586 fdput_pos(f);
587 }
588 return ret;
589}
590
591SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
592{
593 return ksys_read(fd, buf, count);
594}
595
596ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
597{
598 struct fd f = fdget_pos(fd);
599 ssize_t ret = -EBADF;
600
601 if (f.file) {
602 loff_t pos = file_pos_read(f.file);
603 ret = vfs_write(f.file, buf, count, &pos);
604 if (ret >= 0)
605 file_pos_write(f.file, pos);
606 fdput_pos(f);
607 }
608
609 return ret;
610}
611
612SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
613 size_t, count)
614{
615 return ksys_write(fd, buf, count);
616}
617
618ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
619 loff_t pos)
620{
621 struct fd f;
622 ssize_t ret = -EBADF;
623
624 if (pos < 0)
625 return -EINVAL;
626
627 f = fdget(fd);
628 if (f.file) {
629 ret = -ESPIPE;
630 if (f.file->f_mode & FMODE_PREAD)
631 ret = vfs_read(f.file, buf, count, &pos);
632 fdput(f);
633 }
634
635 return ret;
636}
637
638SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
639 size_t, count, loff_t, pos)
640{
641 return ksys_pread64(fd, buf, count, pos);
642}
643
644ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
645 size_t count, loff_t pos)
646{
647 struct fd f;
648 ssize_t ret = -EBADF;
649
650 if (pos < 0)
651 return -EINVAL;
652
653 f = fdget(fd);
654 if (f.file) {
655 ret = -ESPIPE;
656 if (f.file->f_mode & FMODE_PWRITE)
657 ret = vfs_write(f.file, buf, count, &pos);
658 fdput(f);
659 }
660
661 return ret;
662}
663
664SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
665 size_t, count, loff_t, pos)
666{
667 return ksys_pwrite64(fd, buf, count, pos);
668}
669
/*
 * Run a synchronous ->read_iter()/->write_iter() for @iter at *@ppos.
 * @type is READ or WRITE; @flags are the per-call RWF_* flags, applied
 * to the kiocb (unsupported flags fail here).  *@ppos is updated from
 * the kiocb on return.
 */
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
		loff_t *ppos, int type, rwf_t flags)
{
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	ret = kiocb_set_rw_flags(&kiocb, flags);
	if (ret)
		return ret;
	kiocb.ki_pos = *ppos;

	if (type == READ)
		ret = call_read_iter(filp, &kiocb, iter);
	else
		ret = call_write_iter(filp, &kiocb, iter);
	/* A synchronous kiocb must never be queued for later completion. */
	BUG_ON(ret == -EIOCBQUEUED);
	*ppos = kiocb.ki_pos;
	return ret;
}
690
/*
 * Do it by hand, with file-ops: emulate readv/writev by calling the
 * file's plain ->read()/->write() once per iovec segment.  Returns the
 * total byte count, or the first error if nothing was transferred.
 */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
		loff_t *ppos, int type, rwf_t flags)
{
	ssize_t ret = 0;

	/*
	 * Per-call flags require the iterator interface; RWF_HIPRI is the
	 * only one that can be safely ignored on this fallback path.
	 */
	if (flags & ~RWF_HIPRI)
		return -EOPNOTSUPP;

	while (iov_iter_count(iter)) {
		struct iovec iovec = iov_iter_iovec(iter);
		ssize_t nr;

		if (type == READ) {
			nr = filp->f_op->read(filp, iovec.iov_base,
					      iovec.iov_len, ppos);
		} else {
			nr = filp->f_op->write(filp, iovec.iov_base,
					       iovec.iov_len, ppos);
		}

		if (nr < 0) {
			/* Only report the error if nothing was read yet. */
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		/* A short transfer terminates the whole vector. */
		if (nr != iovec.iov_len)
			break;
		iov_iter_advance(iter, nr);
	}

	return ret;
}
725
726/* A write operation does a read from user space and vice versa */
727#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
728
/**
 * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
 *     into the kernel and check that it is valid.
 *
 * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
 * @uvector: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in @fast_pointer.
 * @fast_pointer: Pointer to (usually small on-stack) kernel array.
 * @ret_pointer: (output parameter) Pointer to a variable that will point to
 *     either @fast_pointer, a newly allocated kernel array, or NULL,
 *     depending on which array was used.
 *
 * This function copies an array of &struct iovec of @nr_segs from
 * userspace into the kernel and checks that each element is valid (e.g.
 * it does not point to a kernel address or cause overflow by being too
 * large, etc.).
 *
 * As an optimization, the caller may provide a pointer to a small
 * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
 * (the size of this array, or 0 if unused, should be given in @fast_segs).
 *
 * @ret_pointer will always point to the array that was used, so the
 * caller must take care not to call kfree() on it e.g. in case the
 * @fast_pointer array was used and it was allocated on the stack.
 *
 * Return: The total number of bytes covered by the iovec array on success
 *   or a negative error code on error.
 */
ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
			      unsigned long nr_segs, unsigned long fast_segs,
			      struct iovec *fast_pointer,
			      struct iovec **ret_pointer)
{
	unsigned long seg;
	ssize_t ret;
	struct iovec *iov = fast_pointer;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument
	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0) {
		ret = 0;
		goto out;
	}

	/*
	 * First get the "struct iovec" from user memory and
	 * verify all the pointers
	 */
	if (nr_segs > UIO_MAXIOV) {
		ret = -EINVAL;
		goto out;
	}
	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (iov == NULL) {
			ret = -ENOMEM;
			goto out;
		}
	}
	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
		ret = -EFAULT;
		goto out;
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL
	 * if an element length is < 0 when cast to ssize_t or if the
	 * total length would overflow the ssize_t return value of the
	 * system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	ret = 0;
	for (seg = 0; seg < nr_segs; seg++) {
		void __user *buf = iov[seg].iov_base;
		ssize_t len = (ssize_t)iov[seg].iov_len;

		/* see if we're about to use an invalid len or if
		 * it's about to overflow ssize_t */
		if (len < 0) {
			ret = -EINVAL;
			goto out;
		}
		if (type >= 0
		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
			ret = -EFAULT;
			goto out;
		}
		/* Clamp the segment so the grand total stays <= MAX_RW_COUNT. */
		if (len > MAX_RW_COUNT - ret) {
			len = MAX_RW_COUNT - ret;
			iov[seg].iov_len = len;
		}
		ret += len;
	}
out:
	*ret_pointer = iov;
	return ret;
}
832
#ifdef CONFIG_COMPAT
/*
 * 32-bit counterpart of rw_copy_check_uvector(): copies an array of
 * struct compat_iovec from userspace, validates every element, and
 * widens the entries into native struct iovec in *@ret_pointer.
 * Returns the total byte count (capped at MAX_RW_COUNT) or a negative
 * errno; *@ret_pointer names the array actually used (fast_pointer or
 * a kmalloc'd one — same kfree caveat as the native variant).
 */
ssize_t compat_rw_copy_check_uvector(int type,
		const struct compat_iovec __user *uvector, unsigned long nr_segs,
		unsigned long fast_segs, struct iovec *fast_pointer,
		struct iovec **ret_pointer)
{
	compat_ssize_t tot_len;
	struct iovec *iov = *ret_pointer = fast_pointer;
	ssize_t ret = 0;
	int seg;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument
	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		goto out;

	ret = -EINVAL;
	if (nr_segs > UIO_MAXIOV)
		goto out;
	if (nr_segs > fast_segs) {
		ret = -ENOMEM;
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (iov == NULL)
			goto out;
	}
	*ret_pointer = iov;

	ret = -EFAULT;
	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
		goto out;

	/*
	 * Single unix specification:
	 * We should -EINVAL if an element length is not >= 0 and fitting an
	 * ssize_t.
	 *
	 * In Linux, the total length is limited to MAX_RW_COUNT, there is
	 * no overflow possibility.
	 */
	tot_len = 0;
	ret = -EINVAL;
	for (seg = 0; seg < nr_segs; seg++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		if (__get_user(len, &uvector->iov_len) ||
		   __get_user(buf, &uvector->iov_base)) {
			ret = -EFAULT;
			goto out;
		}
		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
			goto out;
		if (type >= 0 &&
		    !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
			ret = -EFAULT;
			goto out;
		}
		/* Clamp so the running total never exceeds MAX_RW_COUNT. */
		if (len > MAX_RW_COUNT - tot_len)
			len = MAX_RW_COUNT - tot_len;
		tot_len += len;
		iov->iov_base = compat_ptr(buf);
		iov->iov_len = (compat_size_t) len;
		uvector++;
		iov++;
	}
	ret = tot_len;

out:
	return ret;
}
#endif
907
/*
 * Common vectored-read path: checks file modes, validates the range,
 * then uses ->read_iter() when available or falls back to looping
 * ->read().  fsnotify_access() fires on any non-negative result,
 * including the zero-length short-circuit.
 */
static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
		loff_t *pos, rwf_t flags)
{
	size_t tot_len;
	ssize_t ret = 0;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		goto out;	/* empty vector: report 0, skip rw_verify_area() */
	ret = rw_verify_area(READ, file, pos, tot_len);
	if (ret < 0)
		return ret;

	if (file->f_op->read_iter)
		ret = do_iter_readv_writev(file, iter, pos, READ, flags);
	else
		ret = do_loop_readv_writev(file, iter, pos, READ, flags);
out:
	if (ret >= 0)
		fsnotify_access(file);
	return ret;
}
935
936ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
937 rwf_t flags)
938{
939 if (!file->f_op->read_iter)
940 return -EINVAL;
941 return do_iter_read(file, iter, ppos, flags);
942}
943EXPORT_SYMBOL(vfs_iter_read);
944
/*
 * Common vectored-write path: checks file modes, validates the range,
 * then uses ->write_iter() when available or falls back to looping
 * ->write().  fsnotify_modify() fires only when bytes were written.
 * Freeze protection is the caller's job (see vfs_writev()).
 */
static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
		loff_t *pos, rwf_t flags)
{
	size_t tot_len;
	ssize_t ret = 0;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		return 0;	/* empty vector: nothing to write or notify */
	ret = rw_verify_area(WRITE, file, pos, tot_len);
	if (ret < 0)
		return ret;

	if (file->f_op->write_iter)
		ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
	else
		ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
	if (ret > 0)
		fsnotify_modify(file);
	return ret;
}
971
972ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
973 rwf_t flags)
974{
975 if (!file->f_op->write_iter)
976 return -EINVAL;
977 return do_iter_write(file, iter, ppos, flags);
978}
979EXPORT_SYMBOL(vfs_iter_write);
980
981ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
982 unsigned long vlen, loff_t *pos, rwf_t flags)
983{
984 struct iovec iovstack[UIO_FASTIOV];
985 struct iovec *iov = iovstack;
986 struct iov_iter iter;
987 ssize_t ret;
988
989 ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
990 if (ret >= 0) {
991 ret = do_iter_read(file, &iter, pos, flags);
992 kfree(iov);
993 }
994
995 return ret;
996}
997
998static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
999 unsigned long vlen, loff_t *pos, rwf_t flags)
1000{
1001 struct iovec iovstack[UIO_FASTIOV];
1002 struct iovec *iov = iovstack;
1003 struct iov_iter iter;
1004 ssize_t ret;
1005
1006 ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1007 if (ret >= 0) {
1008 file_start_write(file);
1009 ret = do_iter_write(file, &iter, pos, flags);
1010 file_end_write(file);
1011 kfree(iov);
1012 }
1013 return ret;
1014}
1015
1016static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
1017 unsigned long vlen, rwf_t flags)
1018{
1019 struct fd f = fdget_pos(fd);
1020 ssize_t ret = -EBADF;
1021
1022 if (f.file) {
1023 loff_t pos = file_pos_read(f.file);
1024 ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1025 if (ret >= 0)
1026 file_pos_write(f.file, pos);
1027 fdput_pos(f);
1028 }
1029
1030 if (ret > 0)
1031 add_rchar(current, ret);
1032 inc_syscr(current);
1033 return ret;
1034}
1035
1036static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1037 unsigned long vlen, rwf_t flags)
1038{
1039 struct fd f = fdget_pos(fd);
1040 ssize_t ret = -EBADF;
1041
1042 if (f.file) {
1043 loff_t pos = file_pos_read(f.file);
1044 ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1045 if (ret >= 0)
1046 file_pos_write(f.file, pos);
1047 fdput_pos(f);
1048 }
1049
1050 if (ret > 0)
1051 add_wchar(current, ret);
1052 inc_syscw(current);
1053 return ret;
1054}
1055
/*
 * Reassemble a 64-bit position from the high/low halves passed by the
 * p{read,write}v* syscalls.
 */
static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
	/*
	 * Shift in two half-width steps: on 64-bit, a single shift by
	 * BITS_PER_LONG (== width of loff_t) would be undefined behavior,
	 * while two shifts of 32 correctly discard @high.
	 */
	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}
1061
1062static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1063 unsigned long vlen, loff_t pos, rwf_t flags)
1064{
1065 struct fd f;
1066 ssize_t ret = -EBADF;
1067
1068 if (pos < 0)
1069 return -EINVAL;
1070
1071 f = fdget(fd);
1072 if (f.file) {
1073 ret = -ESPIPE;
1074 if (f.file->f_mode & FMODE_PREAD)
1075 ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1076 fdput(f);
1077 }
1078
1079 if (ret > 0)
1080 add_rchar(current, ret);
1081 inc_syscr(current);
1082 return ret;
1083}
1084
1085static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1086 unsigned long vlen, loff_t pos, rwf_t flags)
1087{
1088 struct fd f;
1089 ssize_t ret = -EBADF;
1090
1091 if (pos < 0)
1092 return -EINVAL;
1093
1094 f = fdget(fd);
1095 if (f.file) {
1096 ret = -ESPIPE;
1097 if (f.file->f_mode & FMODE_PWRITE)
1098 ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1099 fdput(f);
1100 }
1101
1102 if (ret > 0)
1103 add_wchar(current, ret);
1104 inc_syscw(current);
1105 return ret;
1106}
1107
1108SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1109 unsigned long, vlen)
1110{
1111 return do_readv(fd, vec, vlen, 0);
1112}
1113
1114SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1115 unsigned long, vlen)
1116{
1117 return do_writev(fd, vec, vlen, 0);
1118}
1119
1120SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1121 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1122{
1123 loff_t pos = pos_from_hilo(pos_h, pos_l);
1124
1125 return do_preadv(fd, vec, vlen, pos, 0);
1126}
1127
1128SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1129 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1130 rwf_t, flags)
1131{
1132 loff_t pos = pos_from_hilo(pos_h, pos_l);
1133
1134 if (pos == -1)
1135 return do_readv(fd, vec, vlen, flags);
1136
1137 return do_preadv(fd, vec, vlen, pos, flags);
1138}
1139
1140SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1141 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1142{
1143 loff_t pos = pos_from_hilo(pos_h, pos_l);
1144
1145 return do_pwritev(fd, vec, vlen, pos, 0);
1146}
1147
1148SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1149 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1150 rwf_t, flags)
1151{
1152 loff_t pos = pos_from_hilo(pos_h, pos_l);
1153
1154 if (pos == -1)
1155 return do_writev(fd, vec, vlen, flags);
1156
1157 return do_pwritev(fd, vec, vlen, pos, flags);
1158}
1159
1160#ifdef CONFIG_COMPAT
1161static size_t compat_readv(struct file *file,
1162 const struct compat_iovec __user *vec,
1163 unsigned long vlen, loff_t *pos, rwf_t flags)
1164{
1165 struct iovec iovstack[UIO_FASTIOV];
1166 struct iovec *iov = iovstack;
1167 struct iov_iter iter;
1168 ssize_t ret;
1169
1170 ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
1171 if (ret >= 0) {
1172 ret = do_iter_read(file, &iter, pos, flags);
1173 kfree(iov);
1174 }
1175 if (ret > 0)
1176 add_rchar(current, ret);
1177 inc_syscr(current);
1178 return ret;
1179}
1180
1181static size_t do_compat_readv(compat_ulong_t fd,
1182 const struct compat_iovec __user *vec,
1183 compat_ulong_t vlen, rwf_t flags)
1184{
1185 struct fd f = fdget_pos(fd);
1186 ssize_t ret;
1187 loff_t pos;
1188
1189 if (!f.file)
1190 return -EBADF;
1191 pos = f.file->f_pos;
1192 ret = compat_readv(f.file, vec, vlen, &pos, flags);
1193 if (ret >= 0)
1194 f.file->f_pos = pos;
1195 fdput_pos(f);
1196 return ret;
1197
1198}
1199
1200COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1201 const struct compat_iovec __user *,vec,
1202 compat_ulong_t, vlen)
1203{
1204 return do_compat_readv(fd, vec, vlen, 0);
1205}
1206
1207static long do_compat_preadv64(unsigned long fd,
1208 const struct compat_iovec __user *vec,
1209 unsigned long vlen, loff_t pos, rwf_t flags)
1210{
1211 struct fd f;
1212 ssize_t ret;
1213
1214 if (pos < 0)
1215 return -EINVAL;
1216 f = fdget(fd);
1217 if (!f.file)
1218 return -EBADF;
1219 ret = -ESPIPE;
1220 if (f.file->f_mode & FMODE_PREAD)
1221 ret = compat_readv(f.file, vec, vlen, &pos, flags);
1222 fdput(f);
1223 return ret;
1224}
1225
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/* Arch-specific compat preadv64(2): offset passed as one 64-bit value. */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	return do_compat_preadv64(fd, vec, vlen, pos, 0);
}
#endif
1234
1235COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1236 const struct compat_iovec __user *,vec,
1237 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1238{
1239 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1240
1241 return do_compat_preadv64(fd, vec, vlen, pos, 0);
1242}
1243
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv2(2) taking a single 64-bit offset.  pos == -1 selects
 * current-position semantics (plain readv with flags).
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos != -1)
		return do_compat_preadv64(fd, vec, vlen, pos, flags);

	return do_compat_readv(fd, vec, vlen, flags);
}
#endif
1255
1256COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1257 const struct compat_iovec __user *,vec,
1258 compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1259 rwf_t, flags)
1260{
1261 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1262
1263 if (pos == -1)
1264 return do_compat_readv(fd, vec, vlen, flags);
1265
1266 return do_compat_preadv64(fd, vec, vlen, pos, flags);
1267}
1268
1269static size_t compat_writev(struct file *file,
1270 const struct compat_iovec __user *vec,
1271 unsigned long vlen, loff_t *pos, rwf_t flags)
1272{
1273 struct iovec iovstack[UIO_FASTIOV];
1274 struct iovec *iov = iovstack;
1275 struct iov_iter iter;
1276 ssize_t ret;
1277
1278 ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
1279 if (ret >= 0) {
1280 file_start_write(file);
1281 ret = do_iter_write(file, &iter, pos, flags);
1282 file_end_write(file);
1283 kfree(iov);
1284 }
1285 if (ret > 0)
1286 add_wchar(current, ret);
1287 inc_syscw(current);
1288 return ret;
1289}
1290
1291static size_t do_compat_writev(compat_ulong_t fd,
1292 const struct compat_iovec __user* vec,
1293 compat_ulong_t vlen, rwf_t flags)
1294{
1295 struct fd f = fdget_pos(fd);
1296 ssize_t ret;
1297 loff_t pos;
1298
1299 if (!f.file)
1300 return -EBADF;
1301 pos = f.file->f_pos;
1302 ret = compat_writev(f.file, vec, vlen, &pos, flags);
1303 if (ret >= 0)
1304 f.file->f_pos = pos;
1305 fdput_pos(f);
1306 return ret;
1307}
1308
/* 32-bit writev(2): gather write from a compat iovec at the current offset. */
COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen)
{
	return do_compat_writev(fd, vec, vlen, 0);
}
1315
1316static long do_compat_pwritev64(unsigned long fd,
1317 const struct compat_iovec __user *vec,
1318 unsigned long vlen, loff_t pos, rwf_t flags)
1319{
1320 struct fd f;
1321 ssize_t ret;
1322
1323 if (pos < 0)
1324 return -EINVAL;
1325 f = fdget(fd);
1326 if (!f.file)
1327 return -EBADF;
1328 ret = -ESPIPE;
1329 if (f.file->f_mode & FMODE_PWRITE)
1330 ret = compat_writev(f.file, vec, vlen, &pos, flags);
1331 fdput(f);
1332 return ret;
1333}
1334
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/* Arch-specific compat pwritev64(2): offset passed as one 64-bit value. */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
}
#endif
1343
1344COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1345 const struct compat_iovec __user *,vec,
1346 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1347{
1348 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1349
1350 return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1351}
1352
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev2(2) taking a single 64-bit offset.  pos == -1 selects
 * current-position semantics (plain writev with flags).
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos != -1)
		return do_compat_pwritev64(fd, vec, vlen, pos, flags);

	return do_compat_writev(fd, vec, vlen, flags);
}
#endif
1364
1365COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1366 const struct compat_iovec __user *,vec,
1367 compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
1368{
1369 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1370
1371 if (pos == -1)
1372 return do_compat_writev(fd, vec, vlen, flags);
1373
1374 return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1375}
1376
1377#endif
1378
/*
 * do_sendfile - common implementation of sendfile(2)/sendfile64(2)
 * @out_fd:	destination descriptor (must be open for write)
 * @in_fd:	source descriptor (must be open for read)
 * @ppos:	explicit source offset, or NULL to use and update in_fd's f_pos
 * @count:	bytes to transfer (clamped to MAX_RW_COUNT)
 * @max:	offset limit; 0 means min() of both superblocks' s_maxbytes
 *
 * Splices data from @in_fd to @out_fd via do_splice_direct() and returns
 * the number of bytes transferred or a negative errno.
 */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
			   size_t count, loff_t max)
{
	struct fd in, out;
	struct inode *in_inode, *out_inode;
	loff_t pos;
	loff_t out_pos;
	ssize_t retval;
	int fl;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in = fdget(in_fd);
	if (!in.file)
		goto out;
	if (!(in.file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -ESPIPE;
	if (!ppos) {
		pos = in.file->f_pos;
	} else {
		pos = *ppos;
		/* an explicit offset requires a pread-capable input */
		if (!(in.file->f_mode & FMODE_PREAD))
			goto fput_in;
	}
	retval = rw_verify_area(READ, in.file, &pos, count);
	if (retval < 0)
		goto fput_in;
	/* like read(2)/write(2), never transfer more than MAX_RW_COUNT */
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out = fdget(out_fd);
	if (!out.file)
		goto fput_in;
	if (!(out.file->f_mode & FMODE_WRITE))
		goto fput_out;
	retval = -EINVAL;
	in_inode = file_inode(in.file);
	out_inode = file_inode(out.file);
	out_pos = out.file->f_pos;
	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
	if (retval < 0)
		goto fput_out;

	/* no explicit limit: bounded by what both filesystems support */
	if (!max)
		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

	/* trim the request so the transfer never crosses the limit */
	if (unlikely(pos + count > max)) {
		retval = -EOVERFLOW;
		if (pos >= max)
			goto fput_out;
		count = max - pos;
	}

	fl = 0;
#if 0
	/*
	 * We need to debate whether we can enable this or not. The
	 * man page documents EAGAIN return for the output at least,
	 * and the application is arguably buggy if it doesn't expect
	 * EAGAIN on a non-blocking file descriptor.
	 */
	if (in.file->f_flags & O_NONBLOCK)
		fl = SPLICE_F_NONBLOCK;
#endif
	file_start_write(out.file);
	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
	file_end_write(out.file);

	if (retval > 0) {
		/* account the transfer and update both file positions */
		add_rchar(current, retval);
		add_wchar(current, retval);
		fsnotify_access(in.file);
		fsnotify_modify(out.file);
		out.file->f_pos = out_pos;
		if (ppos)
			*ppos = pos;
		else
			in.file->f_pos = pos;
	}

	inc_syscr(current);
	inc_syscw(current);
	/* the splice may have advanced pos beyond the limit: flag overflow */
	if (pos > max)
		retval = -EOVERFLOW;

fput_out:
	fdput(out);
fput_in:
	fdput(in);
out:
	return retval;
}
1478
1479SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1480{
1481 loff_t pos;
1482 off_t off;
1483 ssize_t ret;
1484
1485 if (offset) {
1486 if (unlikely(get_user(off, offset)))
1487 return -EFAULT;
1488 pos = off;
1489 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1490 if (unlikely(put_user(pos, offset)))
1491 return -EFAULT;
1492 return ret;
1493 }
1494
1495 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1496}
1497
1498SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1499{
1500 loff_t pos;
1501 ssize_t ret;
1502
1503 if (offset) {
1504 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1505 return -EFAULT;
1506 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1507 if (unlikely(put_user(pos, offset)))
1508 return -EFAULT;
1509 return ret;
1510 }
1511
1512 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1513}
1514
1515#ifdef CONFIG_COMPAT
1516COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1517 compat_off_t __user *, offset, compat_size_t, count)
1518{
1519 loff_t pos;
1520 off_t off;
1521 ssize_t ret;
1522
1523 if (offset) {
1524 if (unlikely(get_user(off, offset)))
1525 return -EFAULT;
1526 pos = off;
1527 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1528 if (unlikely(put_user(pos, offset)))
1529 return -EFAULT;
1530 return ret;
1531 }
1532
1533 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1534}
1535
1536COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1537 compat_loff_t __user *, offset, compat_size_t, count)
1538{
1539 loff_t pos;
1540 ssize_t ret;
1541
1542 if (offset) {
1543 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1544 return -EFAULT;
1545 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1546 if (unlikely(put_user(pos, offset)))
1547 return -EFAULT;
1548 return ret;
1549 }
1550
1551 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1552}
1553#endif
1554
1555/*
1556 * copy_file_range() differs from regular file read and write in that it
1557 * specifically allows return partial success. When it does so is up to
1558 * the copy_file_range method.
1559 */
1560ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1561 struct file *file_out, loff_t pos_out,
1562 size_t len, unsigned int flags)
1563{
1564 struct inode *inode_in = file_inode(file_in);
1565 struct inode *inode_out = file_inode(file_out);
1566 ssize_t ret;
1567
1568 if (flags != 0)
1569 return -EINVAL;
1570
1571 if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1572 return -EISDIR;
1573 if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1574 return -EINVAL;
1575
1576 ret = rw_verify_area(READ, file_in, &pos_in, len);
1577 if (unlikely(ret))
1578 return ret;
1579
1580 ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1581 if (unlikely(ret))
1582 return ret;
1583
1584 if (!(file_in->f_mode & FMODE_READ) ||
1585 !(file_out->f_mode & FMODE_WRITE) ||
1586 (file_out->f_flags & O_APPEND))
1587 return -EBADF;
1588
1589 /* this could be relaxed once a method supports cross-fs copies */
1590 if (inode_in->i_sb != inode_out->i_sb)
1591 return -EXDEV;
1592
1593 if (len == 0)
1594 return 0;
1595
1596 file_start_write(file_out);
1597
1598 /*
1599 * Try cloning first, this is supported by more file systems, and
1600 * more efficient if both clone and copy are supported (e.g. NFS).
1601 */
1602 if (file_in->f_op->clone_file_range) {
1603 ret = file_in->f_op->clone_file_range(file_in, pos_in,
1604 file_out, pos_out, len);
1605 if (ret == 0) {
1606 ret = len;
1607 goto done;
1608 }
1609 }
1610
1611 if (file_out->f_op->copy_file_range) {
1612 ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
1613 pos_out, len, flags);
1614 if (ret != -EOPNOTSUPP)
1615 goto done;
1616 }
1617
1618 ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1619 len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1620
1621done:
1622 if (ret > 0) {
1623 fsnotify_access(file_in);
1624 add_rchar(current, ret);
1625 fsnotify_modify(file_out);
1626 add_wchar(current, ret);
1627 }
1628
1629 inc_syscr(current);
1630 inc_syscw(current);
1631
1632 file_end_write(file_out);
1633
1634 return ret;
1635}
1636EXPORT_SYMBOL(vfs_copy_file_range);
1637
/*
 * copy_file_range(2): copy @len bytes between two descriptors.  For
 * each side, a non-NULL user offset pointer supplies the position and
 * receives the advanced position back; a NULL pointer means use and
 * update that file's own f_pos.
 */
SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	loff_t pos_in;
	loff_t pos_out;
	struct fd f_in;
	struct fd f_out;
	ssize_t ret = -EBADF;

	f_in = fdget(fd_in);
	if (!f_in.file)
		goto out2;

	f_out = fdget(fd_out);
	if (!f_out.file)
		goto out1;

	ret = -EFAULT;
	if (off_in) {
		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
			goto out;
	} else {
		pos_in = f_in.file->f_pos;
	}

	if (off_out) {
		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
			goto out;
	} else {
		pos_out = f_out.file->f_pos;
	}

	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
				  flags);
	if (ret > 0) {
		/* advance and publish whichever offsets the caller used */
		pos_in += ret;
		pos_out += ret;

		if (off_in) {
			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
				ret = -EFAULT;
		} else {
			f_in.file->f_pos = pos_in;
		}

		if (off_out) {
			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
				ret = -EFAULT;
		} else {
			f_out.file->f_pos = pos_out;
		}
	}

out:
	fdput(f_out);
out1:
	fdput(f_in);
out2:
	return ret;
}
1699
/*
 * Validate one side of a clone/dedupe request: the range must be
 * non-negative and must not wrap past the maximum file offset, any
 * mandatory locks covering the range must permit the access, and the
 * LSM must allow it.  @write selects the write-side checks.
 *
 * Returns 0 if the range may be used, otherwise a negative errno.
 */
static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
{
	struct inode *inode = file_inode(file);

	if (unlikely(pos < 0))
		return -EINVAL;

	/* reject ranges whose end wraps past LLONG_MAX */
	if (unlikely((loff_t) (pos + len) < 0))
		return -EINVAL;

	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
		/* len == 0 means "to end of file" */
		loff_t end = len ? pos + len - 1 : OFFSET_MAX;
		int retval;

		retval = locks_mandatory_area(inode, file, pos, end,
				write ? F_WRLCK : F_RDLCK);
		if (retval < 0)
			return retval;
	}

	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
}
1722/*
1723 * Ensure that we don't remap a partial EOF block in the middle of something
1724 * else. Assume that the offsets have already been checked for block
1725 * alignment.
1726 *
1727 * For deduplication we always scale down to the previous block because we
1728 * can't meaningfully compare post-EOF contents.
1729 *
1730 * For clone we only link a partial EOF block above the destination file's EOF.
1731 */
1732static int generic_remap_check_len(struct inode *inode_in,
1733 struct inode *inode_out,
1734 loff_t pos_out,
1735 u64 *len,
1736 bool is_dedupe)
1737{
1738 u64 blkmask = i_blocksize(inode_in) - 1;
1739
1740 if ((*len & blkmask) == 0)
1741 return 0;
1742
1743 if (is_dedupe)
1744 *len &= ~blkmask;
1745 else if (pos_out + *len < i_size_read(inode_out))
1746 return -EINVAL;
1747
1748 return 0;
1749}
1750
1751/*
1752 * Check that the two inodes are eligible for cloning, the ranges make
1753 * sense, and then flush all dirty data. Caller must ensure that the
1754 * inodes have been locked against any other modifications.
1755 *
1756 * Returns: 0 for "nothing to clone", 1 for "something to clone", or
1757 * the usual negative error code.
1758 */
1759int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
1760 struct inode *inode_out, loff_t pos_out,
1761 u64 *len, bool is_dedupe)
1762{
1763 loff_t bs = inode_out->i_sb->s_blocksize;
1764 loff_t blen;
1765 loff_t isize;
1766 bool same_inode = (inode_in == inode_out);
1767 int ret;
1768
1769 /* Don't touch certain kinds of inodes */
1770 if (IS_IMMUTABLE(inode_out))
1771 return -EPERM;
1772
1773 if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1774 return -ETXTBSY;
1775
1776 /* Don't reflink dirs, pipes, sockets... */
1777 if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1778 return -EISDIR;
1779 if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1780 return -EINVAL;
1781
1782 /* Are we going all the way to the end? */
1783 isize = i_size_read(inode_in);
1784 if (isize == 0)
1785 return 0;
1786
1787 /* Zero length dedupe exits immediately; reflink goes to EOF. */
1788 if (*len == 0) {
1789 if (is_dedupe || pos_in == isize)
1790 return 0;
1791 if (pos_in > isize)
1792 return -EINVAL;
1793 *len = isize - pos_in;
1794 }
1795
1796 /* Ensure offsets don't wrap and the input is inside i_size */
1797 if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
1798 pos_in + *len > isize)
1799 return -EINVAL;
1800
1801 /* Don't allow dedupe past EOF in the dest file */
1802 if (is_dedupe) {
1803 loff_t disize;
1804
1805 disize = i_size_read(inode_out);
1806 if (pos_out >= disize || pos_out + *len > disize)
1807 return -EINVAL;
1808 }
1809
1810 /* If we're linking to EOF, continue to the block boundary. */
1811 if (pos_in + *len == isize)
1812 blen = ALIGN(isize, bs) - pos_in;
1813 else
1814 blen = *len;
1815
1816 /* Only reflink if we're aligned to block boundaries */
1817 if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
1818 !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
1819 return -EINVAL;
1820
1821 /* Don't allow overlapped reflink within the same file */
1822 if (same_inode) {
1823 if (pos_out + blen > pos_in && pos_out < pos_in + blen)
1824 return -EINVAL;
1825 }
1826
1827 /* Wait for the completion of any pending IOs on both files */
1828 inode_dio_wait(inode_in);
1829 if (!same_inode)
1830 inode_dio_wait(inode_out);
1831
1832 ret = filemap_write_and_wait_range(inode_in->i_mapping,
1833 pos_in, pos_in + *len - 1);
1834 if (ret)
1835 return ret;
1836
1837 ret = filemap_write_and_wait_range(inode_out->i_mapping,
1838 pos_out, pos_out + *len - 1);
1839 if (ret)
1840 return ret;
1841
1842 /*
1843 * Check that the extents are the same.
1844 */
1845 if (is_dedupe) {
1846 bool is_same = false;
1847
1848 ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
1849 inode_out, pos_out, *len, &is_same);
1850 if (ret)
1851 return ret;
1852 if (!is_same)
1853 return -EBADE;
1854 }
1855
1856 ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
1857 is_dedupe);
1858 if (ret)
1859 return ret;
1860
1861 return 1;
1862}
1863EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
1864
/*
 * do_clone_file_range - reflink a byte range between two files
 *
 * Validates both files (regular files on the same superblock, readable
 * source, writable non-append destination, range within the source's
 * i_size) and then calls the source file's ->clone_file_range method.
 *
 * Returns 0 on success or a negative errno.
 */
int do_clone_file_range(struct file *file_in, loff_t pos_in,
			struct file *file_out, loff_t pos_out, u64 len)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	int ret;

	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
		return -EISDIR;
	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
		return -EINVAL;

	/*
	 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
	 * the same mount. Practically, they only need to be on the same file
	 * system.
	 */
	if (inode_in->i_sb != inode_out->i_sb)
		return -EXDEV;

	if (!(file_in->f_mode & FMODE_READ) ||
	    !(file_out->f_mode & FMODE_WRITE) ||
	    (file_out->f_flags & O_APPEND))
		return -EBADF;

	if (!file_in->f_op->clone_file_range)
		return -EOPNOTSUPP;

	ret = clone_verify_area(file_in, pos_in, len, false);
	if (ret)
		return ret;

	ret = clone_verify_area(file_out, pos_out, len, true);
	if (ret)
		return ret;

	/* cloning past the source's EOF is not allowed */
	if (pos_in + len > i_size_read(inode_in))
		return -EINVAL;

	ret = file_in->f_op->clone_file_range(file_in, pos_in,
			file_out, pos_out, len);
	if (!ret) {
		/* notify watchers of the logical read and write */
		fsnotify_access(file_in);
		fsnotify_modify(file_out);
	}

	return ret;
}
EXPORT_SYMBOL(do_clone_file_range);
1914
/*
 * vfs_clone_file_range - do_clone_file_range() with write protection
 *
 * Wraps the clone in file_start_write()/file_end_write() so the
 * destination filesystem cannot be frozen while the clone is running.
 */
int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
			 struct file *file_out, loff_t pos_out, u64 len)
{
	int ret;

	file_start_write(file_out);
	ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
	file_end_write(file_out);

	return ret;
}
EXPORT_SYMBOL(vfs_clone_file_range);
1927
1928/* Read a page's worth of file data into the page cache. */
1929static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1930{
1931 struct address_space *mapping;
1932 struct page *page;
1933 pgoff_t n;
1934
1935 n = offset >> PAGE_SHIFT;
1936 mapping = inode->i_mapping;
1937 page = read_mapping_page(mapping, n, NULL);
1938 if (IS_ERR(page))
1939 return page;
1940 if (!PageUptodate(page)) {
1941 put_page(page);
1942 return ERR_PTR(-EIO);
1943 }
1944 return page;
1945}
1946
1947/*
1948 * Lock two pages, ensuring that we lock in offset order if the pages are from
1949 * the same file.
1950 */
1951static void vfs_lock_two_pages(struct page *page1, struct page *page2)
1952{
1953 /* Always lock in order of increasing index. */
1954 if (page1->index > page2->index)
1955 swap(page1, page2);
1956
1957 lock_page(page1);
1958 if (page1 != page2)
1959 lock_page(page2);
1960}
1961
/* Release the locks taken by vfs_lock_two_pages(), once per distinct page. */
static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
{
	unlock_page(page1);
	if (page2 != page1)
		unlock_page(page2);
}
1969
1970/*
1971 * Compare extents of two files to see if they are the same.
1972 * Caller must have locked both inodes to prevent write races.
1973 */
1974int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1975 struct inode *dest, loff_t destoff,
1976 loff_t len, bool *is_same)
1977{
1978 loff_t src_poff;
1979 loff_t dest_poff;
1980 void *src_addr;
1981 void *dest_addr;
1982 struct page *src_page;
1983 struct page *dest_page;
1984 loff_t cmp_len;
1985 bool same;
1986 int error;
1987
1988 error = -EINVAL;
1989 same = true;
1990 while (len) {
1991 src_poff = srcoff & (PAGE_SIZE - 1);
1992 dest_poff = destoff & (PAGE_SIZE - 1);
1993 cmp_len = min(PAGE_SIZE - src_poff,
1994 PAGE_SIZE - dest_poff);
1995 cmp_len = min(cmp_len, len);
1996 if (cmp_len <= 0)
1997 goto out_error;
1998
1999 src_page = vfs_dedupe_get_page(src, srcoff);
2000 if (IS_ERR(src_page)) {
2001 error = PTR_ERR(src_page);
2002 goto out_error;
2003 }
2004 dest_page = vfs_dedupe_get_page(dest, destoff);
2005 if (IS_ERR(dest_page)) {
2006 error = PTR_ERR(dest_page);
2007 put_page(src_page);
2008 goto out_error;
2009 }
2010
2011 vfs_lock_two_pages(src_page, dest_page);
2012
2013 /*
2014 * Now that we've locked both pages, make sure they're still
2015 * mapped to the file data we're interested in. If not,
2016 * someone is invalidating pages on us and we lose.
2017 */
2018 if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
2019 src_page->mapping != src->i_mapping ||
2020 dest_page->mapping != dest->i_mapping) {
2021 same = false;
2022 goto unlock;
2023 }
2024
2025 src_addr = kmap_atomic(src_page);
2026 dest_addr = kmap_atomic(dest_page);
2027
2028 flush_dcache_page(src_page);
2029 flush_dcache_page(dest_page);
2030
2031 if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
2032 same = false;
2033
2034 kunmap_atomic(dest_addr);
2035 kunmap_atomic(src_addr);
2036unlock:
2037 vfs_unlock_two_pages(src_page, dest_page);
2038 put_page(dest_page);
2039 put_page(src_page);
2040
2041 if (!same)
2042 break;
2043
2044 srcoff += cmp_len;
2045 destoff += cmp_len;
2046 len -= cmp_len;
2047 }
2048
2049 *is_same = same;
2050 return 0;
2051
2052out_error:
2053 return error;
2054}
2055EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
2056
/*
 * Deduplicate a single destination range against the source range.
 *
 * Takes write access on the destination mount for the duration,
 * re-validates the destination range, and requires either
 * CAP_SYS_ADMIN or a writable destination before handing off to the
 * filesystem's ->dedupe_file_range method.
 */
int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
			      struct file *dst_file, loff_t dst_pos, u64 len)
{
	s64 ret;

	ret = mnt_want_write_file(dst_file);
	if (ret)
		return ret;

	ret = clone_verify_area(dst_file, dst_pos, len, true);
	if (ret < 0)
		goto out_drop_write;

	/* dedupe on a read-only fd is allowed only for CAP_SYS_ADMIN */
	ret = -EINVAL;
	if (!(capable(CAP_SYS_ADMIN) || (dst_file->f_mode & FMODE_WRITE)))
		goto out_drop_write;

	/* source and destination must be on the same mount */
	ret = -EXDEV;
	if (src_file->f_path.mnt != dst_file->f_path.mnt)
		goto out_drop_write;

	ret = -EISDIR;
	if (S_ISDIR(file_inode(dst_file)->i_mode))
		goto out_drop_write;

	ret = -EINVAL;
	if (!dst_file->f_op->dedupe_file_range)
		goto out_drop_write;

	ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
						dst_file, dst_pos, len);
out_drop_write:
	mnt_drop_write_file(dst_file);

	return ret;
}
EXPORT_SYMBOL(vfs_dedupe_file_range_one);
2094
2095int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
2096{
2097 struct file_dedupe_range_info *info;
2098 struct inode *src = file_inode(file);
2099 u64 off;
2100 u64 len;
2101 int i;
2102 int ret;
2103 u16 count = same->dest_count;
2104 int deduped;
2105
2106 if (!(file->f_mode & FMODE_READ))
2107 return -EINVAL;
2108
2109 if (same->reserved1 || same->reserved2)
2110 return -EINVAL;
2111
2112 off = same->src_offset;
2113 len = same->src_length;
2114
2115 ret = -EISDIR;
2116 if (S_ISDIR(src->i_mode))
2117 goto out;
2118
2119 ret = -EINVAL;
2120 if (!S_ISREG(src->i_mode))
2121 goto out;
2122
2123 ret = clone_verify_area(file, off, len, false);
2124 if (ret < 0)
2125 goto out;
2126 ret = 0;
2127
2128 if (off + len > i_size_read(src))
2129 return -EINVAL;
2130
2131 /* Arbitrary 1G limit on a single dedupe request, can be raised. */
2132 len = min_t(u64, len, 1 << 30);
2133
2134 /* pre-format output fields to sane values */
2135 for (i = 0; i < count; i++) {
2136 same->info[i].bytes_deduped = 0ULL;
2137 same->info[i].status = FILE_DEDUPE_RANGE_SAME;
2138 }
2139
2140 for (i = 0, info = same->info; i < count; i++, info++) {
2141 struct fd dst_fd = fdget(info->dest_fd);
2142 struct file *dst_file = dst_fd.file;
2143
2144 if (!dst_file) {
2145 info->status = -EBADF;
2146 goto next_loop;
2147 }
2148
2149 if (info->reserved) {
2150 info->status = -EINVAL;
2151 goto next_fdput;
2152 }
2153
2154 deduped = vfs_dedupe_file_range_one(file, off, dst_file,
2155 info->dest_offset, len);
2156 if (deduped == -EBADE)
2157 info->status = FILE_DEDUPE_RANGE_DIFFERS;
2158 else if (deduped < 0)
2159 info->status = deduped;
2160 else
2161 info->bytes_deduped = len;
2162
2163next_fdput:
2164 fdput(dst_fd);
2165next_loop:
2166 if (fatal_signal_pending(current))
2167 goto out;
2168 }
2169
2170out:
2171 return ret;
2172}
2173EXPORT_SYMBOL(vfs_dedupe_file_range);