Blame - ap/os/linux/linux-3.4.x/fs/fs-writeback.c - R306

blob: 6ba6b1f6b759f810813d90c9daca4f30a7aaf80f [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame]	1	/*
				2	* fs/fs-writeback.c
				3	*
				4	* Copyright (C) 2002, Linus Torvalds.
				5	*
				6	* Contains all the functions related to writing back and waiting
				7	* upon dirty inodes against superblocks, and writing back dirty
				8	* pages against inodes. ie: data writeback. Writeout of the
				9	* inode itself is not handled here.
				10	*
				11	* 10Apr2002 Andrew Morton
				12	* Split out of fs/inode.c
				13	* Additions for address_space-based writeback
				14	*/
				15
				16	#include <linux/kernel.h>
				17	#include <linux/export.h>
				18	#include <linux/spinlock.h>
				19	#include <linux/slab.h>
				20	#include <linux/sched.h>
				21	#include <linux/fs.h>
				22	#include <linux/mm.h>
				23	#include <linux/pagemap.h>
				24	#include <linux/kthread.h>
				25	#include <linux/freezer.h>
				26	#include <linux/writeback.h>
				27	#include <linux/blkdev.h>
				28	#include <linux/backing-dev.h>
				29	#include <linux/tracepoint.h>
				30	#include "internal.h"
				31
				32	/*
				33	* 4MB minimal write chunk size
				34	*/
				35	#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) /* 4096 KiB expressed as a page count: KiB >> (page shift - 10) */
				36
				37	/*
				38	* Passed into wb_writeback(), essentially a subset of writeback_control
				39	*/
				40	struct wb_writeback_work {
				41	long nr_pages;
				42	struct super_block *sb;
				43	unsigned long *older_than_this;
				44	enum writeback_sync_modes sync_mode;
				45	unsigned int tagged_writepages:1;
				46	unsigned int for_kupdate:1;
				47	unsigned int range_cyclic:1;
				48	unsigned int for_background:1;
				49	enum wb_reason reason; /* why was writeback initiated? */
				50
				51	struct list_head list; /* pending work list */
				52	struct completion done; / set if the caller waits */
				53	unsigned int nr_writeback;
				54	unsigned int nr_free;
				55	unsigned int nr_pagecache;
				56	};
				57
				58	/*
				59	* We don't actually have pdflush, but this one is exported through /proc...
				60	*/
				61	int nr_pdflush_threads; /* no code visible in this part of the file assigns it; presumably stays 0 - verify */
				62
				63	/**
				64	* writeback_in_progress - determine whether there is writeback in progress
				65	* @bdi: the device's backing_dev_info structure.
				66	*
				67	* Determine whether there is writeback waiting to be handled against a
				68	* backing device.
				69	*/
				70	int writeback_in_progress(struct backing_dev_info *bdi)
				71	{
				72	return test_bit(BDI_writeback_running, &bdi->state);
				73	}
				74	EXPORT_SYMBOL(writeback_in_progress);
				75
				76	static inline struct backing_dev_info inode_to_bdi(struct inode inode)
				77	{
				78	struct super_block *sb = inode->i_sb;
				79
				80	if (strcmp(sb->s_type->name, "bdev") == 0)
				81	return inode->i_mapping->backing_dev_info;
				82
				83	return sb->s_bdi;
				84	}
				85
				86	static inline struct inode wb_inode(struct list_head head)
				87	{
				88	return list_entry(head, struct inode, i_wb_list);
				89	}
				90
				91	/*
				92	* Include the creation of the trace points after defining the
				93	* wb_writeback_work structure and inline functions so that the definition
				94	* remains local to this file.
				95	*/
				96	#define CREATE_TRACE_POINTS
				97	#include <trace/events/writeback.h>
				98
				99	/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
				100	static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
				101	{
				102	if (bdi->wb.task) {
				103	wake_up_process(bdi->wb.task);
				104	} else {
				105	/*
				106	* The bdi thread isn't there, wake up the forker thread which
				107	* will create and run it.
				108	*/
				109	wake_up_process(default_backing_dev_info.wb.task);
				110	}
				111	}
				112
				113	static void bdi_queue_work(struct backing_dev_info *bdi,
				114	struct wb_writeback_work *work)
				115	{
				116	trace_writeback_queue(bdi, work);
				117
				118	spin_lock_bh(&bdi->wb_lock);
				119	list_add_tail(&work->list, &bdi->work_list);
				120	if (!bdi->wb.task)
				121	trace_writeback_nothread(bdi, work);
				122	bdi_wakeup_flusher(bdi);
				123	spin_unlock_bh(&bdi->wb_lock);
				124	}
				125
				126	static void
				127	__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
				128	bool range_cyclic, enum wb_reason reason)
				129	{
				130	struct wb_writeback_work *work;
				131
				132	/*
				133	* This is WB_SYNC_NONE writeback, so if allocation fails just
				134	* wakeup the thread for old dirty data writeback
				135	*/
				136	work = kzalloc(sizeof(*work), GFP_ATOMIC);
				137	if (!work) {
				138	if (bdi->wb.task) {
				139	trace_writeback_nowork(bdi);
				140	wake_up_process(bdi->wb.task);
				141	}
				142	return;
				143	}
				144
				145	work->sync_mode = WB_SYNC_NONE;
				146	work->nr_pages = nr_pages;
				147	work->range_cyclic = range_cyclic;
				148	work->reason = reason;
				149	work->nr_writeback = global_page_state(NR_WRITEBACK);
				150	work->nr_free = global_page_state(NR_FREE_PAGES);
				151	work->nr_pagecache = global_page_state(NR_FILE_PAGES);
				152
				153	bdi_queue_work(bdi, work);
				154	}
				155
				156	/**
				157	* bdi_start_writeback - start writeback
				158	* @bdi: the backing device to write from
				159	* @nr_pages: the number of pages to write
				160	* @reason: reason why some writeback work was initiated
				161	*
				162	* Description:
				163	* This does WB_SYNC_NONE opportunistic writeback. The IO is only
				164	* started when this function returns, we make no guarantees on
				165	* completion. Caller need not hold sb s_umount semaphore.
				166	*
				167	*/
				168	void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
				169	enum wb_reason reason)
				170	{
				171	__bdi_start_writeback(bdi, nr_pages, true, reason);
				172	}
				173
				174	/**
				175	* bdi_start_background_writeback - start background writeback
				176	* @bdi: the backing device to write from
				177	*
				178	* Description:
				179	* This makes sure WB_SYNC_NONE background writeback happens. When
				180	* this function returns, it is only guaranteed that for given BDI
				181	* some IO is happening if we are over background dirty threshold.
				182	* Caller need not hold sb s_umount semaphore.
				183	*/
				184	void bdi_start_background_writeback(struct backing_dev_info *bdi)
				185	{
				186	/*
				187	* We just wake up the flusher thread. It will perform background
				188	* writeback as soon as there is no other work to do.
				189	*/
				190	trace_writeback_wake_background(bdi);
				191	spin_lock_bh(&bdi->wb_lock);
				192	bdi_wakeup_flusher(bdi);
				193	spin_unlock_bh(&bdi->wb_lock);
				194	}
				195
				196	/*
				197	* Remove the inode from the writeback list it is on.
				198	*/
				199	void inode_wb_list_del(struct inode *inode)
				200	{
				201	struct backing_dev_info *bdi = inode_to_bdi(inode);
				202
				203	spin_lock(&bdi->wb.list_lock);
				204	list_del_init(&inode->i_wb_list);
				205	spin_unlock(&bdi->wb.list_lock);
				206	}
				207
				208	/*
				209	* Redirty an inode: set its when-it-was dirtied timestamp and move it to the
				210	* furthest end of its superblock's dirty-inode list.
				211	*
				212	* Before stamping the inode's ->dirtied_when, we check to see whether it is
				213	* already the most-recently-dirtied inode on the b_dirty list. If that is
				214	* the case then the inode must have been redirtied while it was being written
				215	* out and we don't reset its dirtied_when.
				216	*/
				217	static void redirty_tail(struct inode inode, struct bdi_writeback wb)
				218	{
				219	assert_spin_locked(&wb->list_lock);
				220	if (!list_empty(&wb->b_dirty)) {
				221	struct inode *tail;
				222
				223	tail = wb_inode(wb->b_dirty.next);
				224	if (time_before(inode->dirtied_when, tail->dirtied_when))
				225	inode->dirtied_when = jiffies;
				226	}
				227	list_move(&inode->i_wb_list, &wb->b_dirty);
				228	}
				229
				230	/*
				231	* requeue inode for re-scanning after bdi->b_io list is exhausted.
				232	*/
				233	static void requeue_io(struct inode inode, struct bdi_writeback wb)
				234	{
				235	assert_spin_locked(&wb->list_lock);
				236	list_move(&inode->i_wb_list, &wb->b_more_io);
				237	}
				238
				239	static void inode_sync_complete(struct inode *inode)
				240	{
				241	/*
				242	* Prevent speculative execution through
				243	* spin_unlock(&wb->list_lock);
				244	*/
				245
				246	smp_mb();
				247	wake_up_bit(&inode->i_state, __I_SYNC);
				248	}
				249
				250	static bool inode_dirtied_after(struct inode *inode, unsigned long t)
				251	{
				252	bool ret = time_after(inode->dirtied_when, t);
				253	#ifndef CONFIG_64BIT
				254	/*
				255	* For inodes being constantly redirtied, dirtied_when can get stuck.
				256	* It _appears_ to be in the future, but is actually in distant past.
				257	* This test is necessary to prevent such wrapped-around relative times
				258	* from permanently stopping the whole bdi writeback.
				259	*/
				260	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
				261	#endif
				262	return ret;
				263	}
				264
				265	/*
				266	* Move expired (dirtied before work->older_than_this) dirty inodes from
				267	* @delaying_queue to @dispatch_queue. Returns the number of inodes moved.
				268	*/
				269	static int move_expired_inodes(struct list_head *delaying_queue,
				270	struct list_head *dispatch_queue,
				271	struct wb_writeback_work *work)
				272	{
				273	LIST_HEAD(tmp);
				274	struct list_head pos, node;
				275	struct super_block *sb = NULL;
				276	struct inode *inode;
				277	int do_sb_sort = 0;
				278	int moved = 0;
				279
				280	while (!list_empty(delaying_queue)) {
				281	inode = wb_inode(delaying_queue->prev);
				282	if (work->older_than_this &&
				283	inode_dirtied_after(inode, *work->older_than_this))
				284	break;
				285	if (sb && sb != inode->i_sb)
				286	do_sb_sort = 1;
				287	sb = inode->i_sb;
				288	list_move(&inode->i_wb_list, &tmp);
				289	moved++;
				290	}
				291
				292	/* just one sb in list, splice to dispatch_queue and we're done */
				293	if (!do_sb_sort) {
				294	list_splice(&tmp, dispatch_queue);
				295	goto out;
				296	}
				297
				298	/* Move inodes from one superblock together */
				299	while (!list_empty(&tmp)) {
				300	sb = wb_inode(tmp.prev)->i_sb;
				301	list_for_each_prev_safe(pos, node, &tmp) {
				302	inode = wb_inode(pos);
				303	if (inode->i_sb == sb)
				304	list_move(&inode->i_wb_list, dispatch_queue);
				305	}
				306	}
				307	out:
				308	return moved;
				309	}
				310
				311	/*
				312	* Queue all expired dirty inodes for io, eldest first.
				313	* Before
				314	* newly dirtied b_dirty b_io b_more_io
				315	* =============> gf edc BA
				316	* After
				317	* newly dirtied b_dirty b_io b_more_io
				318	* =============> g fBAedc
				319	* \|
				320	* +--> dequeue for IO
				321	*/
				322	static void queue_io(struct bdi_writeback wb, struct wb_writeback_work work)
				323	{
				324	int moved;
				325	assert_spin_locked(&wb->list_lock);
				326	list_splice_init(&wb->b_more_io, &wb->b_io);
				327	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
				328	trace_writeback_queue_io(wb, work, moved);
				329	}
				330
				331	static int write_inode(struct inode inode, struct writeback_control wbc)
				332	{
				333	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
				334	return inode->i_sb->s_op->write_inode(inode, wbc);
				335	return 0;
				336	}
				337
				338	/*
				339	* Wait for writeback on an inode to complete.
				340	*/
				341	static void inode_wait_for_writeback(struct inode *inode,
				342	struct bdi_writeback *wb)
				343	{
				344	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
				345	wait_queue_head_t *wqh;
				346
				347	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
				348	while (inode->i_state & I_SYNC) {
				349	spin_unlock(&inode->i_lock);
				350	spin_unlock(&wb->list_lock);
				351	__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
				352	spin_lock(&wb->list_lock);
				353	spin_lock(&inode->i_lock);
				354	}
				355	}
				356
				357	/*
				358	* Write out an inode's dirty pages. Called under wb->list_lock and
				359	* inode->i_lock. Either the caller has an active reference on the inode or
				360	* the inode has I_WILL_FREE set.
				361	*
				362	* If wbc->sync_mode is WB_SYNC_ALL, wait on the writeout.
				363	*
				364	* The whole writeout design is quite complex and fragile. We want to avoid
				365	* starvation of particular inodes when others are being redirtied, prevent
				366	* livelocks, etc.
				367	*/
				368	static int
				369	writeback_single_inode(struct inode inode, struct bdi_writeback wb,
				370	struct writeback_control *wbc)
				371	{
				372	struct address_space *mapping = inode->i_mapping;
				373	long nr_to_write = wbc->nr_to_write;
				374	unsigned dirty;
				375	int ret;
				376
				377	assert_spin_locked(&wb->list_lock);
				378	assert_spin_locked(&inode->i_lock);
				379
				380	if (!atomic_read(&inode->i_count))
				381	WARN_ON(!(inode->i_state & (I_WILL_FREE\|I_FREEING)));
				382	else
				383	WARN_ON(inode->i_state & I_WILL_FREE);
				384
				385	if (inode->i_state & I_SYNC) {
				386	/*
				387	* If this inode is locked for writeback and we are not doing
				388	* writeback-for-data-integrity, move it to b_more_io so that
				389	* writeback can proceed with the other inodes on s_io.
				390	*
				391	* We'll have another go at writing back this inode when we
				392	* completed a full scan of b_io.
				393	*/
				394	if (wbc->sync_mode != WB_SYNC_ALL) {
				395	requeue_io(inode, wb);
				396	trace_writeback_single_inode_requeue(inode, wbc,
				397	nr_to_write);
				398	return 0;
				399	}
				400
				401	/*
				402	* It's a data-integrity sync. We must wait.
				403	*/
				404	inode_wait_for_writeback(inode, wb);
				405	}
				406
				407	BUG_ON(inode->i_state & I_SYNC);
				408
				409	/* Set I_SYNC, reset I_DIRTY_PAGES */
				410	inode->i_state \|= I_SYNC;
				411	spin_unlock(&inode->i_lock);
				412	spin_unlock(&wb->list_lock);
				413
				414	ret = do_writepages(mapping, wbc);
				415
				416	/*
				417	* Make sure to wait on the data before writing out the metadata.
				418	* This is important for filesystems that modify metadata on data
				419	* I/O completion.
				420	*/
				421	if (wbc->sync_mode == WB_SYNC_ALL) {
				422	int err = filemap_fdatawait(mapping);
				423	if (ret == 0)
				424	ret = err;
				425	}
				426
				427	/*
				428	* Some filesystems may redirty the inode during the writeback
				429	* due to delalloc, clear dirty metadata flags right before
				430	* write_inode()
				431	*/
				432	spin_lock(&inode->i_lock);
				433
				434	dirty = inode->i_state & I_DIRTY;
				435	inode->i_state &= ~I_DIRTY;
				436
				437	/*
				438	* Paired with smp_mb() in __mark_inode_dirty(). This allows
				439	* __mark_inode_dirty() to test i_state without grabbing i_lock -
				440	* either they see the I_DIRTY bits cleared or we see the dirtied
				441	* inode.
				442	*
				443	* I_DIRTY_PAGES is always cleared together above even if @mapping
				444	* still has dirty pages. The flag is reinstated after smp_mb() if
				445	* necessary. This guarantees that either __mark_inode_dirty()
				446	* sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
				447	*/
				448	smp_mb();
				449
				450	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
				451	inode->i_state \|= I_DIRTY_PAGES;
				452
				453	spin_unlock(&inode->i_lock);
				454
				455	/* Don't write the inode if only I_DIRTY_PAGES was set */
				456	if (dirty & (I_DIRTY_SYNC \| I_DIRTY_DATASYNC)) {
				457	int err = write_inode(inode, wbc);
				458	if (ret == 0)
				459	ret = err;
				460	}
				461
				462	spin_lock(&wb->list_lock);
				463	spin_lock(&inode->i_lock);
				464	inode->i_state &= ~I_SYNC;
				465	if (!(inode->i_state & I_FREEING)) {
				466	/*
				467	* Sync livelock prevention. Each inode is tagged and synced in
				468	* one shot. If still dirty, it will be redirty_tail()'ed below.
				469	* Update the dirty time to prevent enqueue and sync it again.
				470	*/
				471	if ((inode->i_state & I_DIRTY) &&
				472	(wbc->sync_mode == WB_SYNC_ALL \|\| wbc->tagged_writepages))
				473	inode->dirtied_when = jiffies;
				474
				475	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
				476	/*
				477	* We didn't write back all the pages. nfs_writepages()
				478	* sometimes bales out without doing anything.
				479	*/
				480	if (wbc->nr_to_write <= 0) {
				481	/*
				482	* slice used up: queue for next turn
				483	*/
				484	requeue_io(inode, wb);
				485	} else {
				486	/*
				487	* Writeback blocked by something other than
				488	* congestion. Delay the inode for some time to
				489	* avoid spinning on the CPU (100% iowait)
				490	* retrying writeback of the dirty page/inode
				491	* that cannot be performed immediately.
				492	*/
				493	redirty_tail(inode, wb);
				494	}
				495	} else if (inode->i_state & I_DIRTY) {
				496	/*
				497	* Filesystems can dirty the inode during writeback
				498	* operations, such as delayed allocation during
				499	* submission or metadata updates after data IO
				500	* completion.
				501	*/
				502	redirty_tail(inode, wb);
				503	} else {
				504	/*
				505	* The inode is clean. At this point we either have
				506	* a reference to the inode or it's on it's way out.
				507	* No need to add it back to the LRU.
				508	*/
				509	list_del_init(&inode->i_wb_list);
				510	}
				511	}
				512	inode_sync_complete(inode);
				513	trace_writeback_single_inode(inode, wbc, nr_to_write);
				514	return ret;
				515	}
				516
				517	static long writeback_chunk_size(struct backing_dev_info *bdi,
				518	struct wb_writeback_work *work)
				519	{
				520	long pages;
				521
				522	/*
				523	* WB_SYNC_ALL mode does livelock avoidance by syncing dirty
				524	* inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
				525	* here avoids calling into writeback_inodes_wb() more than once.
				526	*
				527	* The intended call sequence for WB_SYNC_ALL writeback is:
				528	*
				529	* wb_writeback()
				530	* writeback_sb_inodes() <== called only once
				531	* write_cache_pages() <== called once for each inode
				532	* (quickly) tag currently dirty pages
				533	* (maybe slowly) sync all tagged pages
				534	*/
				535	if (work->sync_mode == WB_SYNC_ALL \|\| work->tagged_writepages)
				536	pages = LONG_MAX;
				537	else {
				538	pages = min(bdi->avg_write_bandwidth / 2,
				539	global_dirty_limit / DIRTY_SCOPE);
				540	pages = min(pages, work->nr_pages);
				541	pages = round_down(pages + MIN_WRITEBACK_PAGES,
				542	MIN_WRITEBACK_PAGES);
				543	}
				544
				545	return pages;
				546	}
				547
				548	/*
				549	* Write a portion of b_io inodes which belong to @sb.
				550	*
				551	* If @work->sb is set, skip over inodes of other superblocks
				552	* (they are moved back to b_dirty) and write all inodes of @sb.
				553	* Otherwise stop at the first inode belonging to a different superblock.
				554	*
				555	* Return the number of pages and/or inodes written.
				556	*/
				557	static long writeback_sb_inodes(struct super_block *sb,
				558	struct bdi_writeback *wb,
				559	struct wb_writeback_work *work)
				560	{
				561	struct writeback_control wbc = {
				562	.sync_mode = work->sync_mode,
				563	.tagged_writepages = work->tagged_writepages,
				564	.for_kupdate = work->for_kupdate,
				565	.for_background = work->for_background,
				566	.range_cyclic = work->range_cyclic,
				567	.range_start = 0,
				568	.range_end = LLONG_MAX,
				569	};
				570	unsigned long start_time = jiffies;
				571	long write_chunk;
				572	long wrote = 0; /* count both pages and inodes */
				573
				574	while (!list_empty(&wb->b_io)) {
				575	struct inode *inode = wb_inode(wb->b_io.prev);
				576
				577	if (inode->i_sb != sb) {
				578	if (work->sb) {
				579	/*
				580	* We only want to write back data for this
				581	* superblock, move all inodes not belonging
				582	* to it back onto the dirty list.
				583	*/
				584	redirty_tail(inode, wb);
				585	continue;
				586	}
				587
				588	/*
				589	* The inode belongs to a different superblock.
				590	* Bounce back to the caller to unpin this and
				591	* pin the next superblock.
				592	*/
				593	break;
				594	}
				595
				596	/*
				597	* Don't bother with new inodes or inodes beeing freed, first
				598	* kind does not need peridic writeout yet, and for the latter
				599	* kind writeout is handled by the freer.
				600	*/
				601	spin_lock(&inode->i_lock);
				602	if (inode->i_state & (I_NEW \| I_FREEING \| I_WILL_FREE)) {
				603	spin_unlock(&inode->i_lock);
				604	redirty_tail(inode, wb);
				605	continue;
				606	}
				607	__iget(inode);
				608	write_chunk = writeback_chunk_size(wb->bdi, work);
				609	wbc.nr_to_write = write_chunk;
				610	wbc.pages_skipped = 0;
				611
				612	writeback_single_inode(inode, wb, &wbc);
				613
				614	work->nr_pages -= write_chunk - wbc.nr_to_write;
				615	wrote += write_chunk - wbc.nr_to_write;
				616	if (!(inode->i_state & I_DIRTY))
				617	wrote++;
				618	if (wbc.pages_skipped) {
				619	/*
				620	* writeback is not making progress due to locked
				621	* buffers. Skip this inode for now.
				622	*/
				623	redirty_tail(inode, wb);
				624	}
				625	spin_unlock(&inode->i_lock);
				626	spin_unlock(&wb->list_lock);
				627	iput(inode);
				628	cond_resched();
				629	spin_lock(&wb->list_lock);
				630	/*
				631	* bail out to wb_writeback() often enough to check
				632	* background threshold and other termination conditions.
				633	*/
				634	if (wrote) {
				635	if (time_is_before_jiffies(start_time + HZ / 10UL))
				636	break;
				637	if (work->nr_pages <= 0)
				638	break;
				639	}
				640	}
				641	return wrote;
				642	}
				643
				644	static long __writeback_inodes_wb(struct bdi_writeback *wb,
				645	struct wb_writeback_work *work)
				646	{
				647	unsigned long start_time = jiffies;
				648	long wrote = 0;
				649
				650	while (!list_empty(&wb->b_io)) {
				651	struct inode *inode = wb_inode(wb->b_io.prev);
				652	struct super_block *sb = inode->i_sb;
				653
				654	if (!grab_super_passive(sb)) {
				655	/*
				656	* grab_super_passive() may fail consistently due to
				657	* s_umount being grabbed by someone else. Don't use
				658	* requeue_io() to avoid busy retrying the inode/sb.
				659	*/
				660	redirty_tail(inode, wb);
				661	continue;
				662	}
				663	wrote += writeback_sb_inodes(sb, wb, work);
				664	drop_super(sb);
				665
				666	/* refer to the same tests at the end of writeback_sb_inodes */
				667	if (wrote) {
				668	if (time_is_before_jiffies(start_time + HZ / 10UL))
				669	break;
				670	if (work->nr_pages <= 0)
				671	break;
				672	}
				673	}
				674	/* Leave any unwritten inodes on b_io */
				675	return wrote;
				676	}
				677
				678	long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
				679	enum wb_reason reason)
				680	{
				681	struct wb_writeback_work work = {
				682	.nr_pages = nr_pages,
				683	.sync_mode = WB_SYNC_NONE,
				684	.range_cyclic = 1,
				685	.reason = reason,
				686	};
				687
				688	spin_lock(&wb->list_lock);
				689	if (list_empty(&wb->b_io))
				690	queue_io(wb, &work);
				691	__writeback_inodes_wb(wb, &work);
				692	spin_unlock(&wb->list_lock);
				693
				694	return nr_pages - work.nr_pages;
				695	}
				696
				697	static bool over_bground_thresh(struct backing_dev_info *bdi)
				698	{
				699	unsigned long background_thresh, dirty_thresh;
				700
				701	global_dirty_limits(&background_thresh, &dirty_thresh);
				702
				703	if (global_page_state(NR_FILE_DIRTY) +
				704	global_page_state(NR_UNSTABLE_NFS) > background_thresh)
				705	return true;
				706
				707	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
				708	bdi_dirty_limit(bdi, background_thresh))
				709	return true;
				710
				711	return false;
				712	}
				713
				714	/*
				715	* Called under wb->list_lock. If there are multiple wb per bdi,
				716	* only the flusher working on the first wb should do it.
				717	*/
				718	static void wb_update_bandwidth(struct bdi_writeback *wb,
				719	unsigned long start_time)
				720	{
				721	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
				722	}
				723
				724	/*
				725	* Explicit flushing or periodic writeback of "old" data.
				726	*
				727	* Define "old": the first time one of an inode's pages is dirtied, we mark the
				728	* dirtying-time in the inode's address_space. So this periodic writeback code
				729	* just walks the superblock inode list, writing back any inodes which are
				730	* older than a specific point in time.
				731	*
				732	* Try to run once per dirty_writeback_interval. But if a writeback event
				733	* takes longer than a dirty_writeback_interval interval, then leave a
				734	* one-second gap.
				735	*
				736	* older_than_this takes precedence over nr_to_write. So we'll only write back
				737	* all dirty pages if they are all attached to "old" mappings.
				738	*/
				739	static long wb_writeback(struct bdi_writeback *wb,
				740	struct wb_writeback_work *work)
				741	{
				742	unsigned long wb_start = jiffies;
				743	long nr_pages = work->nr_pages;
				744	unsigned long oldest_jif;
				745	struct inode *inode;
				746	long progress;
				747
				748	oldest_jif = jiffies;
				749	work->older_than_this = &oldest_jif;
				750
				751	spin_lock(&wb->list_lock);
				752	for (;;) {
				753	/*
				754	* Stop writeback when nr_pages has been consumed
				755	*/
				756	if (work->nr_pages <= 0)
				757	break;
				758
				759	/*
				760	* Background writeout and kupdate-style writeback may
				761	* run forever. Stop them if there is other work to do
				762	* so that e.g. sync can proceed. They'll be restarted
				763	* after the other works are all done.
				764	*/
				765	if ((work->for_background \|\| work->for_kupdate) &&
				766	!list_empty(&wb->bdi->work_list))
				767	break;
				768
				769	/*
				770	* For background writeout, stop when we are below the
				771	* background dirty threshold
				772	*/
				773	if (work->for_background && !over_bground_thresh(wb->bdi))
				774	break;
				775
				776	/*
				777	* Kupdate and background works are special and we want to
				778	* include all inodes that need writing. Livelock avoidance is
				779	* handled by these works yielding to any other work so we are
				780	* safe.
				781	*/
				782	if (work->for_kupdate) {
				783	oldest_jif = jiffies -
				784	msecs_to_jiffies(dirty_expire_interval * 10);
				785	} else if (work->for_background)
				786	oldest_jif = jiffies;
				787
				788	trace_writeback_start(wb->bdi, work);
				789	if (list_empty(&wb->b_io))
				790	queue_io(wb, work);
				791	if (work->sb)
				792	progress = writeback_sb_inodes(work->sb, wb, work);
				793	else
				794	progress = __writeback_inodes_wb(wb, work);
				795	trace_writeback_written(wb->bdi, work);
				796
				797	wb_update_bandwidth(wb, wb_start);
				798
				799	/*
				800	* Did we write something? Try for more
				801	*
				802	* Dirty inodes are moved to b_io for writeback in batches.
				803	* The completion of the current batch does not necessarily
				804	* mean the overall work is done. So we keep looping as long
				805	* as made some progress on cleaning pages or inodes.
				806	*/
				807	if (progress)
				808	continue;
				809	/*
				810	* No more inodes for IO, bail
				811	*/
				812	if (list_empty(&wb->b_more_io))
				813	break;
				814	/*
				815	* Nothing written. Wait for some inode to
				816	* become available for writeback. Otherwise
				817	* we'll just busyloop.
				818	*/
				819	if (!list_empty(&wb->b_more_io)) {
				820	trace_writeback_wait(wb->bdi, work);
				821	inode = wb_inode(wb->b_more_io.prev);
				822	spin_lock(&inode->i_lock);
				823	inode_wait_for_writeback(inode, wb);
				824	spin_unlock(&inode->i_lock);
				825	}
				826	}
				827	spin_unlock(&wb->list_lock);
				828
				829	return nr_pages - work->nr_pages;
				830	}
				831
				832	/*
				833	* Return the next wb_writeback_work struct that hasn't been processed yet.
				834	*/
				835	static struct wb_writeback_work *
				836	get_next_work_item(struct backing_dev_info *bdi)
				837	{
				838	struct wb_writeback_work *work = NULL;
				839
				840	spin_lock_bh(&bdi->wb_lock);
				841	if (!list_empty(&bdi->work_list)) {
				842	work = list_entry(bdi->work_list.next,
				843	struct wb_writeback_work, list);
				844	list_del_init(&work->list);
				845	}
				846	spin_unlock_bh(&bdi->wb_lock);
				847	return work;
				848	}
				849
				850	/*
				851	* Add in the number of potentially dirty inodes, because each inode
				852	* write can dirty pagecache in the underlying blockdev.
				853	*/
				854	static unsigned long get_nr_dirty_pages(void)
				855	{
				856	return global_page_state(NR_FILE_DIRTY) +
				857	global_page_state(NR_UNSTABLE_NFS) +
				858	get_nr_dirty_inodes();
				859	}
				860
				861	static long wb_check_background_flush(struct bdi_writeback *wb)
				862	{
				863	if (over_bground_thresh(wb->bdi)) {
				864
				865	struct wb_writeback_work work = {
				866	.nr_pages = LONG_MAX,
				867	.sync_mode = WB_SYNC_NONE,
				868	.for_background = 1,
				869	.range_cyclic = 1,
				870	.reason = WB_REASON_BACKGROUND,
				871	};
				872
				873	return wb_writeback(wb, &work);
				874	}
				875
				876	return 0;
				877	}
				878
				879	static long wb_check_old_data_flush(struct bdi_writeback *wb)
				880	{
				881	unsigned long expired;
				882	long nr_pages;
				883
				884	/*
				885	* When set to zero, disable periodic writeback
				886	*/
				887	if (!dirty_writeback_interval)
				888	return 0;
				889
				890	expired = wb->last_old_flush +
				891	msecs_to_jiffies(dirty_writeback_interval * 10);
				892	if (time_before(jiffies, expired))
				893	return 0;
				894
				895	wb->last_old_flush = jiffies;
				896	nr_pages = get_nr_dirty_pages();
				897
				898	if (nr_pages) {
				899	struct wb_writeback_work work = {
				900	.nr_pages = nr_pages,
				901	.sync_mode = WB_SYNC_NONE,
				902	.for_kupdate = 1,
				903	.range_cyclic = 1,
				904	.reason = WB_REASON_PERIODIC,
				905	};
				906
				907	return wb_writeback(wb, &work);
				908	}
				909
				910	return 0;
				911	}
				912
				913	/*
				914	* Retrieve work items and do the writeback they describe
				915	*/
				916	long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
				917	{
				918	struct backing_dev_info *bdi = wb->bdi;
				919	struct wb_writeback_work *work;
				920	long wrote = 0;
				921
				922	set_bit(BDI_writeback_running, &wb->bdi->state);
				923	while ((work = get_next_work_item(bdi)) != NULL) {
				924	/*
				925	* Override sync mode, in case we must wait for completion
				926	* because this thread is exiting now.
				927	*/
				928	if (force_wait)
				929	work->sync_mode = WB_SYNC_ALL;
				930
				931	trace_writeback_exec(bdi, work);
				932
				933	wrote += wb_writeback(wb, work);
				934
				935	/*
				936	* Notify the caller of completion if this is a synchronous
				937	* work item, otherwise just free it.
				938	*/
				939	if (work->done)
				940	complete(work->done);
				941	else
				942	kfree(work);
				943	}
				944
				945	/*
				946	* Check for periodic writeback, kupdated() style
				947	*/
				948	wrote += wb_check_old_data_flush(wb);
				949	wrote += wb_check_background_flush(wb);
				950	clear_bit(BDI_writeback_running, &wb->bdi->state);
				951
				952	return wrote;
				953	}
				954
				955	/*
				956	* Handle writeback of dirty data for the device backed by this bdi. Also
				957	* wakes up periodically and does kupdated style flushing.
				958	*/
				959	int bdi_writeback_thread(void *data)
				960	{
				961	struct bdi_writeback *wb = data;
				962	struct backing_dev_info *bdi = wb->bdi;
				963	long pages_written;
				964
				965	current->flags \|= PF_SWAPWRITE;
				966	set_freezable();
				967	wb->last_active = jiffies;
				968
				969	/*
				970	* Our parent may run at a different priority, just set us to normal
				971	*/
				972	set_user_nice(current, 0);
				973
				974	trace_writeback_thread_start(bdi);
				975
				976	while (!kthread_freezable_should_stop(NULL)) {
				977	/*
				978	* Remove own delayed wake-up timer, since we are already awake
				979	* and we'll take care of the preriodic write-back.
				980	*/
				981	del_timer(&wb->wakeup_timer);
				982
				983	pages_written = wb_do_writeback(wb, 0);
				984
				985	trace_writeback_pages_written(pages_written);
				986
				987	if (pages_written)
				988	wb->last_active = jiffies;
				989
				990	set_current_state(TASK_INTERRUPTIBLE);
				991	if (!list_empty(&bdi->work_list) \|\| kthread_should_stop()) {
				992	__set_current_state(TASK_RUNNING);
				993	continue;
				994	}
				995
				996	if (wb_has_dirty_io(wb) && dirty_writeback_interval)
				997	schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
				998	else {
				999	/*
				1000	* We have nothing to do, so can go sleep without any
				1001	* timeout and save power. When a work is queued or
				1002	* something is made dirty - we will be woken up.
				1003	*/
				1004	schedule();
				1005	}
				1006	}
				1007
				1008	/* Flush any work that raced with us exiting */
				1009	if (!list_empty(&bdi->work_list))
				1010	wb_do_writeback(wb, 1);
				1011
				1012	trace_writeback_thread_stop(bdi);
				1013	return 0;
				1014	}
				1015
				1016
				1017	/*
				1018	* Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
				1019	* the whole world.
				1020	*/
				1021	void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
				1022	{
				1023	struct backing_dev_info *bdi;
				1024
				1025	if (!nr_pages) {
				1026	nr_pages = global_page_state(NR_FILE_DIRTY) +
				1027	global_page_state(NR_UNSTABLE_NFS);
				1028	}
				1029
				1030	rcu_read_lock();
				1031	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
				1032	if (!bdi_has_dirty_io(bdi))
				1033	continue;
				1034	__bdi_start_writeback(bdi, nr_pages, false, reason);
				1035	}
				1036	rcu_read_unlock();
				1037	}
				1038
				1039	static noinline void block_dump___mark_inode_dirty(struct inode *inode)
				1040	{
				1041	if (inode->i_ino \|\| strcmp(inode->i_sb->s_id, "bdev")) {
				1042	struct dentry *dentry;
				1043	const char *name = "?";
				1044
				1045	dentry = d_find_alias(inode);
				1046	if (dentry) {
				1047	spin_lock(&dentry->d_lock);
				1048	name = (const char *) dentry->d_name.name;
				1049	}
				1050	printk(KERN_DEBUG
				1051	"%s(%d): dirtied inode %lu (%s) on %s\n",
				1052	current->comm, task_pid_nr(current), inode->i_ino,
				1053	name, inode->i_sb->s_id);
				1054	if (dentry) {
				1055	spin_unlock(&dentry->d_lock);
				1056	dput(dentry);
				1057	}
				1058	}
				1059	}
				1060
				1061	/**
				1062	* __mark_inode_dirty - internal function
				1063	* @inode: inode to mark
				1064	* @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
				1065	* Mark an inode as dirty. Callers should use mark_inode_dirty or
				1066	* mark_inode_dirty_sync.
				1067	*
				1068	* Put the inode on the super block's dirty list.
				1069	*
				1070	* CAREFUL! We mark it dirty unconditionally, but move it onto the
				1071	* dirty list only if it is hashed or if it refers to a blockdev.
				1072	* If it was not hashed, it will never be added to the dirty list
				1073	* even if it is later hashed, as it will have been marked dirty already.
				1074	*
				1075	* In short, make sure you hash any inodes _before_ you start marking
				1076	* them dirty.
				1077	*
				1078	* Note that for blockdevs, inode->dirtied_when represents the dirtying time of
				1079	* the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
				1080	* the kernel-internal blockdev inode represents the dirtying time of the
				1081	* blockdev's pages. This is why for I_DIRTY_PAGES we always use
				1082	* page->mapping->host, so the page-dirtying time is recorded in the internal
				1083	* blockdev inode.
				1084	*/
				1085	void __mark_inode_dirty(struct inode *inode, int flags)
				1086	{
				1087	struct super_block *sb = inode->i_sb;
				1088	struct backing_dev_info *bdi = NULL;
				1089
				1090	/*
				1091	* Don't do this for I_DIRTY_PAGES - that doesn't actually
				1092	* dirty the inode itself
				1093	*/
				1094	if (flags & (I_DIRTY_SYNC \| I_DIRTY_DATASYNC)) {
				1095	if (sb->s_op->dirty_inode)
				1096	sb->s_op->dirty_inode(inode, flags);
				1097	}
				1098
				1099	/*
				1100	* Paired with smp_mb() in __writeback_single_inode() for the
				1101	* following lockless i_state test. See there for details.
				1102	*/
				1103	smp_mb();
				1104
				1105	if ((inode->i_state & flags) == flags)
				1106	return;
				1107
				1108	if (unlikely(block_dump > 1))
				1109	block_dump___mark_inode_dirty(inode);
				1110
				1111	spin_lock(&inode->i_lock);
				1112	if ((inode->i_state & flags) != flags) {
				1113	const int was_dirty = inode->i_state & I_DIRTY;
				1114
				1115	inode->i_state \|= flags;
				1116
				1117	/*
				1118	* If the inode is being synced, just update its dirty state.
				1119	* The unlocker will place the inode on the appropriate
				1120	* superblock list, based upon its state.
				1121	*/
				1122	if (inode->i_state & I_SYNC)
				1123	goto out_unlock_inode;
				1124
				1125	/*
				1126	* Only add valid (hashed) inodes to the superblock's
				1127	* dirty list. Add blockdev inodes as well.
				1128	*/
				1129	if (!S_ISBLK(inode->i_mode)) {
				1130	if (inode_unhashed(inode))
				1131	goto out_unlock_inode;
				1132	}
				1133	if (inode->i_state & I_FREEING)
				1134	goto out_unlock_inode;
				1135
				1136	/*
				1137	* If the inode was already on b_dirty/b_io/b_more_io, don't
				1138	* reposition it (that would break b_dirty time-ordering).
				1139	*/
				1140	if (!was_dirty) {
				1141	bool wakeup_bdi = false;
				1142	bdi = inode_to_bdi(inode);
				1143
				1144	if (bdi_cap_writeback_dirty(bdi)) {
				1145	WARN(!test_bit(BDI_registered, &bdi->state),
				1146	"bdi-%s not registered\n", bdi->name);
				1147
				1148	/*
				1149	* If this is the first dirty inode for this
				1150	* bdi, we have to wake-up the corresponding
				1151	* bdi thread to make sure background
				1152	* write-back happens later.
				1153	*/
				1154	if (!wb_has_dirty_io(&bdi->wb))
				1155	wakeup_bdi = true;
				1156	}
				1157
				1158	spin_unlock(&inode->i_lock);
				1159	spin_lock(&bdi->wb.list_lock);
				1160	inode->dirtied_when = jiffies;
				1161	list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
				1162	spin_unlock(&bdi->wb.list_lock);
				1163
				1164	if (wakeup_bdi)
				1165	bdi_wakeup_thread_delayed(bdi);
				1166	return;
				1167	}
				1168	}
				1169	out_unlock_inode:
				1170	spin_unlock(&inode->i_lock);
				1171
				1172	}
				1173	EXPORT_SYMBOL(__mark_inode_dirty);
				1174
				1175	static void wait_sb_inodes(struct super_block *sb)
				1176	{
				1177	struct inode inode, old_inode = NULL;
				1178
				1179	/*
				1180	* We need to be protected against the filesystem going from
				1181	* r/o to r/w or vice versa.
				1182	*/
				1183	WARN_ON(!rwsem_is_locked(&sb->s_umount));
				1184
				1185	spin_lock(&inode_sb_list_lock);
				1186
				1187	/*
				1188	* Data integrity sync. Must wait for all pages under writeback,
				1189	* because there may have been pages dirtied before our sync
				1190	* call, but which had writeout started before we write it out.
				1191	* In which case, the inode may not be on the dirty list, but
				1192	* we still have to wait for that writeout.
				1193	*/
				1194	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
				1195	struct address_space *mapping = inode->i_mapping;
				1196
				1197	spin_lock(&inode->i_lock);
				1198	if ((inode->i_state & (I_FREEING\|I_WILL_FREE\|I_NEW)) \|\|
				1199	(mapping->nrpages == 0)) {
				1200	spin_unlock(&inode->i_lock);
				1201	continue;
				1202	}
				1203	__iget(inode);
				1204	spin_unlock(&inode->i_lock);
				1205	spin_unlock(&inode_sb_list_lock);
				1206
				1207	/*
				1208	* We hold a reference to 'inode' so it couldn't have been
				1209	* removed from s_inodes list while we dropped the
				1210	* inode_sb_list_lock. We cannot iput the inode now as we can
				1211	* be holding the last reference and we cannot iput it under
				1212	* inode_sb_list_lock. So we keep the reference and iput it
				1213	* later.
				1214	*/
				1215	iput(old_inode);
				1216	old_inode = inode;
				1217
				1218	filemap_fdatawait(mapping);
				1219
				1220	cond_resched();
				1221
				1222	spin_lock(&inode_sb_list_lock);
				1223	}
				1224	spin_unlock(&inode_sb_list_lock);
				1225	iput(old_inode);
				1226	}
				1227
				1228	/**
				1229	* writeback_inodes_sb_nr - writeback dirty inodes from given super_block
				1230	* @sb: the superblock
				1231	* @nr: the number of pages to write
				1232	* @reason: reason why some writeback work initiated
				1233	*
				1234	* Start writeback on some inodes on this super_block. No guarantees are made
				1235	* on how many (if any) will be written, and this function does not wait
				1236	* for IO completion of submitted IO.
				1237	*/
				1238	void writeback_inodes_sb_nr(struct super_block *sb,
				1239	unsigned long nr,
				1240	enum wb_reason reason)
				1241	{
				1242	DECLARE_COMPLETION_ONSTACK(done);
				1243	struct wb_writeback_work work = {
				1244	.sb = sb,
				1245	.sync_mode = WB_SYNC_NONE,
				1246	.tagged_writepages = 1,
				1247	.done = &done,
				1248	.nr_pages = nr,
				1249	.reason = reason,
				1250	};
				1251
				1252	WARN_ON(!rwsem_is_locked(&sb->s_umount));
				1253	bdi_queue_work(sb->s_bdi, &work);
				1254	wait_for_completion(&done);
				1255	}
				1256	EXPORT_SYMBOL(writeback_inodes_sb_nr);
				1257
				1258	/**
				1259	* writeback_inodes_sb - writeback dirty inodes from given super_block
				1260	* @sb: the superblock
				1261	* @reason: reason why some writeback work was initiated
				1262	*
				1263	* Start writeback on some inodes on this super_block. No guarantees are made
				1264	* on how many (if any) will be written, and this function does not wait
				1265	* for IO completion of submitted IO.
				1266	*/
				1267	void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
				1268	{
				1269	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
				1270	}
				1271	EXPORT_SYMBOL(writeback_inodes_sb);
				1272
				1273	/**
				1274	* writeback_inodes_sb_if_idle - start writeback if none underway
				1275	* @sb: the superblock
				1276	* @reason: reason why some writeback work was initiated
				1277	*
				1278	* Invoke writeback_inodes_sb if no writeback is currently underway.
				1279	* Returns 1 if writeback was started, 0 if not.
				1280	*/
				1281	int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
				1282	{
				1283	if (!writeback_in_progress(sb->s_bdi)) {
				1284	down_read(&sb->s_umount);
				1285	writeback_inodes_sb(sb, reason);
				1286	up_read(&sb->s_umount);
				1287	return 1;
				1288	} else
				1289	return 0;
				1290	}
				1291	EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
				1292
				1293	/**
				1294	* writeback_inodes_sb_nr_if_idle - start writeback if none underway
				1295	* @sb: the superblock
				1296	* @nr: the number of pages to write
				1297	* @reason: reason why some writeback work was initiated
				1298	*
				1299	* Invoke writeback_inodes_sb if no writeback is currently underway.
				1300	* Returns 1 if writeback was started, 0 if not.
				1301	*/
				1302	int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
				1303	unsigned long nr,
				1304	enum wb_reason reason)
				1305	{
				1306	if (!writeback_in_progress(sb->s_bdi)) {
				1307	down_read(&sb->s_umount);
				1308	writeback_inodes_sb_nr(sb, nr, reason);
				1309	up_read(&sb->s_umount);
				1310	return 1;
				1311	} else
				1312	return 0;
				1313	}
				1314	EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
				1315
				1316	/**
				1317	* sync_inodes_sb - sync sb inode pages
				1318	* @sb: the superblock
				1319	*
				1320	* This function writes and waits on any dirty inode belonging to this
				1321	* super_block.
				1322	*/
				1323	void sync_inodes_sb(struct super_block *sb)
				1324	{
				1325	DECLARE_COMPLETION_ONSTACK(done);
				1326	struct wb_writeback_work work = {
				1327	.sb = sb,
				1328	.sync_mode = WB_SYNC_ALL,
				1329	.nr_pages = LONG_MAX,
				1330	.range_cyclic = 0,
				1331	.done = &done,
				1332	.reason = WB_REASON_SYNC,
				1333	};
				1334
				1335	WARN_ON(!rwsem_is_locked(&sb->s_umount));
				1336
				1337	bdi_queue_work(sb->s_bdi, &work);
				1338	wait_for_completion(&done);
				1339
				1340	wait_sb_inodes(sb);
				1341	}
				1342	EXPORT_SYMBOL(sync_inodes_sb);
				1343
				1344	/**
				1345	* write_inode_now - write an inode to disk
				1346	* @inode: inode to write to disk
				1347	* @sync: whether the write should be synchronous or not
				1348	*
				1349	* This function commits an inode to disk immediately if it is dirty. This is
				1350	* primarily needed by knfsd.
				1351	*
				1352	* The caller must either have a ref on the inode or must have set I_WILL_FREE.
				1353	*/
				1354	int write_inode_now(struct inode *inode, int sync)
				1355	{
				1356	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
				1357	int ret;
				1358	struct writeback_control wbc = {
				1359	.nr_to_write = LONG_MAX,
				1360	.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
				1361	.range_start = 0,
				1362	.range_end = LLONG_MAX,
				1363	};
				1364
				1365	if (!mapping_cap_writeback_dirty(inode->i_mapping))
				1366	wbc.nr_to_write = 0;
				1367
				1368	might_sleep();
				1369	spin_lock(&wb->list_lock);
				1370	spin_lock(&inode->i_lock);
				1371	ret = writeback_single_inode(inode, wb, &wbc);
				1372	spin_unlock(&inode->i_lock);
				1373	spin_unlock(&wb->list_lock);
				1374	return ret;
				1375	}
				1376	EXPORT_SYMBOL(write_inode_now);
				1377
				1378	/**
				1379	* sync_inode - write an inode and its pages to disk.
				1380	* @inode: the inode to sync
				1381	* @wbc: controls the writeback mode
				1382	*
				1383	* sync_inode() will write an inode and its pages to disk. It will also
				1384	* correctly update the inode on its superblock's dirty inode lists and will
				1385	* update inode->i_state.
				1386	*
				1387	* The caller must have a ref on the inode.
				1388	*/
				1389	int sync_inode(struct inode inode, struct writeback_control wbc)
				1390	{
				1391	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
				1392	int ret;
				1393
				1394	spin_lock(&wb->list_lock);
				1395	spin_lock(&inode->i_lock);
				1396	ret = writeback_single_inode(inode, wb, wbc);
				1397	spin_unlock(&inode->i_lock);
				1398	spin_unlock(&wb->list_lock);
				1399	return ret;
				1400	}
				1401	EXPORT_SYMBOL(sync_inode);
				1402
				1403	/**
				1404	* sync_inode_metadata - write an inode to disk
				1405	* @inode: the inode to sync
				1406	* @wait: wait for I/O to complete.
				1407	*
				1408	* Write an inode to disk and adjust its dirty state after completion.
				1409	*
				1410	* Note: only writes the actual inode, no associated data or other metadata.
				1411	*/
				1412	int sync_inode_metadata(struct inode *inode, int wait)
				1413	{
				1414	struct writeback_control wbc = {
				1415	.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
				1416	.nr_to_write = 0, /* metadata-only */
				1417	};
				1418
				1419	return sync_inode(inode, &wbc);
				1420	}
				1421	EXPORT_SYMBOL(sync_inode_metadata);