Blame - src/kernel/linux/v4.19/fs/sync.c - T800

blob: 055daab8652a58cafd652fb91c9e98db0d3a9bac [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* High-level sync()-related operations
				4	*/
				5
				6	#include <linux/kernel.h>
				7	#include <linux/file.h>
				8	#include <linux/fs.h>
				9	#include <linux/slab.h>
				10	#include <linux/export.h>
				11	#include <linux/namei.h>
				12	#include <linux/sched/xacct.h>
				13	#include <linux/writeback.h>
				14	#include <linux/syscalls.h>
				15	#include <linux/linkage.h>
				16	#include <linux/pagemap.h>
				17	#include <linux/quotaops.h>
				18	#include <linux/backing-dev.h>
				19	#include "internal.h"
				20
				21	#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE\| \
				22	SYNC_FILE_RANGE_WAIT_AFTER)
				23
				24	/*
				25	* Do the filesystem syncing work. For simple filesystems
				26	* writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
				27	* submit IO for these buffers via __sync_blockdev(). This also speeds up the
				28	* wait == 1 case since in that case write_inode() functions do
				29	* sync_dirty_buffer() and thus effectively write one block at a time.
				30	*/
				31	static int __sync_filesystem(struct super_block *sb, int wait)
				32	{
				33	if (wait)
				34	sync_inodes_sb(sb);
				35	else
				36	writeback_inodes_sb(sb, WB_REASON_SYNC);
				37
				38	if (sb->s_op->sync_fs)
				39	sb->s_op->sync_fs(sb, wait);
				40	return __sync_blockdev(sb->s_bdev, wait);
				41	}
				42
				43	/*
				44	* Write out and wait upon all dirty data associated with this
				45	* superblock. Filesystem data as well as the underlying block
				46	* device. Takes the superblock lock.
				47	*/
				48	int sync_filesystem(struct super_block *sb)
				49	{
				50	int ret;
				51
				52	/*
				53	* We need to be protected against the filesystem going from
				54	* r/o to r/w or vice versa.
				55	*/
				56	WARN_ON(!rwsem_is_locked(&sb->s_umount));
				57
				58	/*
				59	* No point in syncing out anything if the filesystem is read-only.
				60	*/
				61	if (sb_rdonly(sb))
				62	return 0;
				63
				64	ret = __sync_filesystem(sb, 0);
				65	if (ret < 0)
				66	return ret;
				67	return __sync_filesystem(sb, 1);
				68	}
				69	EXPORT_SYMBOL(sync_filesystem);
				70
				71	static void sync_inodes_one_sb(struct super_block sb, void arg)
				72	{
				73	if (!sb_rdonly(sb))
				74	sync_inodes_sb(sb);
				75	}
				76
				77	static void sync_fs_one_sb(struct super_block sb, void arg)
				78	{
				79	if (!sb_rdonly(sb) && sb->s_op->sync_fs)
				80	sb->s_op->sync_fs(sb, (int )arg);
				81	}
				82
				83	static void fdatawrite_one_bdev(struct block_device bdev, void arg)
				84	{
				85	filemap_fdatawrite(bdev->bd_inode->i_mapping);
				86	}
				87
				88	static void fdatawait_one_bdev(struct block_device bdev, void arg)
				89	{
				90	/*
				91	* We keep the error status of individual mapping so that
				92	* applications can catch the writeback error using fsync(2).
				93	* See filemap_fdatawait_keep_errors() for details.
				94	*/
				95	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
				96	}
				97
				98	/*
				99	* Sync everything. We start by waking flusher threads so that most of
				100	* writeback runs on all devices in parallel. Then we sync all inodes reliably
				101	* which effectively also waits for all flusher threads to finish doing
				102	* writeback. At this point all data is on disk so metadata should be stable
				103	* and we tell filesystems to sync their metadata via ->sync_fs() calls.
				104	* Finally, we writeout all block devices because some filesystems (e.g. ext2)
				105	* just write metadata (such as inodes or bitmaps) to block device page cache
				106	* and do not sync it on their own in ->sync_fs().
				107	*/
				108	void ksys_sync(void)
				109	{
				110	int nowait = 0, wait = 1;
				111
				112	wakeup_flusher_threads(WB_REASON_SYNC);
				113	iterate_supers(sync_inodes_one_sb, NULL);
				114	iterate_supers(sync_fs_one_sb, &nowait);
				115	iterate_supers(sync_fs_one_sb, &wait);
				116	iterate_bdevs(fdatawrite_one_bdev, NULL);
				117	iterate_bdevs(fdatawait_one_bdev, NULL);
				118	if (unlikely(laptop_mode))
				119	laptop_sync_completion();
				120	}
				121
				122	SYSCALL_DEFINE0(sync)
				123	{
				124	ksys_sync();
				125	return 0;
				126	}
				127
				128	static void do_sync_work(struct work_struct *work)
				129	{
				130	int nowait = 0;
				131
				132	/*
				133	* Sync twice to reduce the possibility we skipped some inodes / pages
				134	* because they were temporarily locked
				135	*/
				136	iterate_supers(sync_inodes_one_sb, &nowait);
				137	iterate_supers(sync_fs_one_sb, &nowait);
				138	iterate_bdevs(fdatawrite_one_bdev, NULL);
				139	iterate_supers(sync_inodes_one_sb, &nowait);
				140	iterate_supers(sync_fs_one_sb, &nowait);
				141	iterate_bdevs(fdatawrite_one_bdev, NULL);
				142	printk("Emergency Sync complete\n");
				143	kfree(work);
				144	}
				145
				146	void emergency_sync(void)
				147	{
				148	struct work_struct *work;
				149
				150	work = kmalloc(sizeof(*work), GFP_ATOMIC);
				151	if (work) {
				152	INIT_WORK(work, do_sync_work);
				153	schedule_work(work);
				154	}
				155	}
				156
				157	/*
				158	* sync a single super
				159	*/
				160	SYSCALL_DEFINE1(syncfs, int, fd)
				161	{
				162	struct fd f = fdget(fd);
				163	struct super_block *sb;
				164	int ret;
				165
				166	if (!f.file)
				167	return -EBADF;
				168	sb = f.file->f_path.dentry->d_sb;
				169
				170	down_read(&sb->s_umount);
				171	ret = sync_filesystem(sb);
				172	up_read(&sb->s_umount);
				173
				174	fdput(f);
				175	return ret;
				176	}
				177
				178	/**
				179	* vfs_fsync_range - helper to sync a range of data & metadata to disk
				180	* @file: file to sync
				181	* @start: offset in bytes of the beginning of data range to sync
				182	* @end: offset in bytes of the end of data range (inclusive)
				183	* @datasync: perform only datasync
				184	*
				185	* Write back data in range @start..@end and metadata for @file to disk. If
				186	* @datasync is set only metadata needed to access modified file data is
				187	* written.
				188	*/
				189	int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
				190	{
				191	struct inode *inode = file->f_mapping->host;
				192
				193	if (!file->f_op->fsync)
				194	return -EINVAL;
				195	if (!datasync && (inode->i_state & I_DIRTY_TIME))
				196	mark_inode_dirty_sync(inode);
				197	return file->f_op->fsync(file, start, end, datasync);
				198	}
				199	EXPORT_SYMBOL(vfs_fsync_range);
				200
				201	/**
				202	* vfs_fsync - perform a fsync or fdatasync on a file
				203	* @file: file to sync
				204	* @datasync: only perform a fdatasync operation
				205	*
				206	* Write back data and metadata for @file to disk. If @datasync is
				207	* set only metadata needed to access modified file data is written.
				208	*/
				209	int vfs_fsync(struct file *file, int datasync)
				210	{
				211	return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
				212	}
				213	EXPORT_SYMBOL(vfs_fsync);
				214
				215	static int do_fsync(unsigned int fd, int datasync)
				216	{
				217	struct fd f = fdget(fd);
				218	int ret = -EBADF;
				219
				220	if (f.file) {
				221	ret = vfs_fsync(f.file, datasync);
				222	fdput(f);
				223	inc_syscfs(current);
				224	}
				225	return ret;
				226	}
				227
				228	SYSCALL_DEFINE1(fsync, unsigned int, fd)
				229	{
				230	return do_fsync(fd, 0);
				231	}
				232
				233	SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
				234	{
				235	return do_fsync(fd, 1);
				236	}
				237
				238	/*
				239	* sys_sync_file_range() permits finely controlled syncing over a segment of
				240	* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
				241	* zero then sys_sync_file_range() will operate from offset out to EOF.
				242	*
				243	* The flag bits are:
				244	*
				245	* SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
				246	* before performing the write.
				247	*
				248	* SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
				249	* range which are not presently under writeback. Note that this may block for
				250	* significant periods due to exhaustion of disk request structures.
				251	*
				252	* SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
				253	* after performing the write.
				254	*
				255	* Useful combinations of the flag bits are:
				256	*
				257	* SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE: ensures that all pages
				258	* in the range which were dirty on entry to sys_sync_file_range() are placed
				259	* under writeout. This is a start-write-for-data-integrity operation.
				260	*
				261	* SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
				262	* are not presently under writeout. This is an asynchronous flush-to-disk
				263	* operation. Not suitable for data integrity operations.
				264	*
				265	* SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
				266	* completion of writeout of all pages in the range. This will be used after an
				267	* earlier SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE operation to wait
				268	* for that operation to complete and to return the result.
				269	*
				270	* SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE\|SYNC_FILE_RANGE_WAIT_AFTER:
				271	* a traditional sync() operation. This is a write-for-data-integrity operation
				272	* which will ensure that all pages in the range which were dirty on entry to
				273	* sys_sync_file_range() are committed to disk.
				274	*
				275	*
				276	* SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
				277	* I/O errors or ENOSPC conditions and will return those to the caller, after
				278	* clearing the EIO and ENOSPC flags in the address_space.
				279	*
				280	* It should be noted that none of these operations write out the file's
				281	* metadata. So unless the application is strictly performing overwrites of
				282	* already-instantiated disk blocks, there are no guarantees here that the data
				283	* will be available after a crash.
				284	*/
				285	int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
				286	unsigned int flags)
				287	{
				288	int ret;
				289	struct fd f;
				290	struct address_space *mapping;
				291	loff_t endbyte; /* inclusive */
				292	umode_t i_mode;
				293
				294	ret = -EINVAL;
				295	if (flags & ~VALID_FLAGS)
				296	goto out;
				297
				298	endbyte = offset + nbytes;
				299
				300	if ((s64)offset < 0)
				301	goto out;
				302	if ((s64)endbyte < 0)
				303	goto out;
				304	if (endbyte < offset)
				305	goto out;
				306
				307	if (sizeof(pgoff_t) == 4) {
				308	if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
				309	/*
				310	* The range starts outside a 32 bit machine's
				311	* pagecache addressing capabilities. Let it "succeed"
				312	*/
				313	ret = 0;
				314	goto out;
				315	}
				316	if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
				317	/*
				318	* Out to EOF
				319	*/
				320	nbytes = 0;
				321	}
				322	}
				323
				324	if (nbytes == 0)
				325	endbyte = LLONG_MAX;
				326	else
				327	endbyte--; /* inclusive */
				328
				329	ret = -EBADF;
				330	f = fdget(fd);
				331	if (!f.file)
				332	goto out;
				333
				334	i_mode = file_inode(f.file)->i_mode;
				335	ret = -ESPIPE;
				336	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
				337	!S_ISLNK(i_mode))
				338	goto out_put;
				339
				340	mapping = f.file->f_mapping;
				341	ret = 0;
				342	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
				343	ret = file_fdatawait_range(f.file, offset, endbyte);
				344	if (ret < 0)
				345	goto out_put;
				346	}
				347
				348	if (flags & SYNC_FILE_RANGE_WRITE) {
				349	ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
				350	WB_SYNC_NONE);
				351	if (ret < 0)
				352	goto out_put;
				353	}
				354
				355	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
				356	ret = file_fdatawait_range(f.file, offset, endbyte);
				357
				358	out_put:
				359	fdput(f);
				360	out:
				361	return ret;
				362	}
				363
				364	SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
				365	unsigned int, flags)
				366	{
				367	return ksys_sync_file_range(fd, offset, nbytes, flags);
				368	}
				369
				370	/* It would be nice if people remember that not all the world's an i386
				371	when they introduce new system calls */
				372	SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
				373	loff_t, offset, loff_t, nbytes)
				374	{
				375	return ksys_sync_file_range(fd, offset, nbytes, flags);
				376	}