Blame - src/kernel/linux/v4.19/fs/xfs/xfs_buf.c - T800

blob: c1f7c0d5d608a3e80825893505820e28661b343c [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (c) 2000-2006 Silicon Graphics, Inc.
				4	* All Rights Reserved.
				5	*/
				6	#include "xfs.h"
				7	#include <linux/stddef.h>
				8	#include <linux/errno.h>
				9	#include <linux/gfp.h>
				10	#include <linux/pagemap.h>
				11	#include <linux/init.h>
				12	#include <linux/vmalloc.h>
				13	#include <linux/bio.h>
				14	#include <linux/sysctl.h>
				15	#include <linux/proc_fs.h>
				16	#include <linux/workqueue.h>
				17	#include <linux/percpu.h>
				18	#include <linux/blkdev.h>
				19	#include <linux/hash.h>
				20	#include <linux/kthread.h>
				21	#include <linux/migrate.h>
				22	#include <linux/backing-dev.h>
				23	#include <linux/freezer.h>
				24
				25	#include "xfs_format.h"
				26	#include "xfs_log_format.h"
				27	#include "xfs_trans_resv.h"
				28	#include "xfs_sb.h"
				29	#include "xfs_mount.h"
				30	#include "xfs_trace.h"
				31	#include "xfs_log.h"
				32	#include "xfs_errortag.h"
				33	#include "xfs_error.h"
				34
				35	static kmem_zone_t *xfs_buf_zone;
				36
				37	#define xb_to_gfp(flags) \
				38	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) \| __GFP_NOWARN)
				39
				40	/*
				41	* Locking orders
				42	*
				43	* xfs_buf_ioacct_inc:
				44	* xfs_buf_ioacct_dec:
				45	* b_sema (caller holds)
				46	* b_lock
				47	*
				48	* xfs_buf_stale:
				49	* b_sema (caller holds)
				50	* b_lock
				51	* lru_lock
				52	*
				53	* xfs_buf_rele:
				54	* b_lock
				55	* pag_buf_lock
				56	* lru_lock
				57	*
				58	* xfs_buftarg_wait_rele
				59	* lru_lock
				60	* b_lock (trylock due to inversion)
				61	*
				62	* xfs_buftarg_isolate
				63	* lru_lock
				64	* b_lock (trylock due to inversion)
				65	*/
				66
				67	static inline int
				68	xfs_buf_is_vmapped(
				69	struct xfs_buf *bp)
				70	{
				71	/*
				72	* Return true if the buffer is vmapped.
				73	*
				74	* b_addr is null if the buffer is not mapped, but the code is clever
				75	* enough to know it doesn't have to map a single page, so the check has
				76	* to be both for b_addr and bp->b_page_count > 1.
				77	*/
				78	return bp->b_addr && bp->b_page_count > 1;
				79	}
				80
				81	static inline int
				82	xfs_buf_vmap_len(
				83	struct xfs_buf *bp)
				84	{
				85	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
				86	}
				87
				88	/*
				89	* Bump the I/O in flight count on the buftarg if we haven't yet done so for
				90	* this buffer. The count is incremented once per buffer (per hold cycle)
				91	* because the corresponding decrement is deferred to buffer release. Buffers
				92	* can undergo I/O multiple times in a hold-release cycle and per buffer I/O
				93	* tracking adds unnecessary overhead. This is used for sychronization purposes
				94	* with unmount (see xfs_wait_buftarg()), so all we really need is a count of
				95	* in-flight buffers.
				96	*
				97	* Buffers that are never released (e.g., superblock, iclog buffers) must set
				98	* the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
				99	* never reaches zero and unmount hangs indefinitely.
				100	*/
				101	static inline void
				102	xfs_buf_ioacct_inc(
				103	struct xfs_buf *bp)
				104	{
				105	if (bp->b_flags & XBF_NO_IOACCT)
				106	return;
				107
				108	ASSERT(bp->b_flags & XBF_ASYNC);
				109	spin_lock(&bp->b_lock);
				110	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
				111	bp->b_state \|= XFS_BSTATE_IN_FLIGHT;
				112	percpu_counter_inc(&bp->b_target->bt_io_count);
				113	}
				114	spin_unlock(&bp->b_lock);
				115	}
				116
				117	/*
				118	* Clear the in-flight state on a buffer about to be released to the LRU or
				119	* freed and unaccount from the buftarg.
				120	*/
				121	static inline void
				122	__xfs_buf_ioacct_dec(
				123	struct xfs_buf *bp)
				124	{
				125	lockdep_assert_held(&bp->b_lock);
				126
				127	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
				128	bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
				129	percpu_counter_dec(&bp->b_target->bt_io_count);
				130	}
				131	}
				132
				133	static inline void
				134	xfs_buf_ioacct_dec(
				135	struct xfs_buf *bp)
				136	{
				137	spin_lock(&bp->b_lock);
				138	__xfs_buf_ioacct_dec(bp);
				139	spin_unlock(&bp->b_lock);
				140	}
				141
				142	/*
				143	* When we mark a buffer stale, we remove the buffer from the LRU and clear the
				144	* b_lru_ref count so that the buffer is freed immediately when the buffer
				145	* reference count falls to zero. If the buffer is already on the LRU, we need
				146	* to remove the reference that LRU holds on the buffer.
				147	*
				148	* This prevents build-up of stale buffers on the LRU.
				149	*/
				150	void
				151	xfs_buf_stale(
				152	struct xfs_buf *bp)
				153	{
				154	ASSERT(xfs_buf_islocked(bp));
				155
				156	bp->b_flags \|= XBF_STALE;
				157
				158	/*
				159	* Clear the delwri status so that a delwri queue walker will not
				160	* flush this buffer to disk now that it is stale. The delwri queue has
				161	* a reference to the buffer, so this is safe to do.
				162	*/
				163	bp->b_flags &= ~_XBF_DELWRI_Q;
				164
				165	/*
				166	* Once the buffer is marked stale and unlocked, a subsequent lookup
				167	* could reset b_flags. There is no guarantee that the buffer is
				168	* unaccounted (released to LRU) before that occurs. Drop in-flight
				169	* status now to preserve accounting consistency.
				170	*/
				171	spin_lock(&bp->b_lock);
				172	__xfs_buf_ioacct_dec(bp);
				173
				174	atomic_set(&bp->b_lru_ref, 0);
				175	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
				176	(list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
				177	atomic_dec(&bp->b_hold);
				178
				179	ASSERT(atomic_read(&bp->b_hold) >= 1);
				180	spin_unlock(&bp->b_lock);
				181	}
				182
				183	static int
				184	xfs_buf_get_maps(
				185	struct xfs_buf *bp,
				186	int map_count)
				187	{
				188	ASSERT(bp->b_maps == NULL);
				189	bp->b_map_count = map_count;
				190
				191	if (map_count == 1) {
				192	bp->b_maps = &bp->__b_map;
				193	return 0;
				194	}
				195
				196	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
				197	KM_NOFS);
				198	if (!bp->b_maps)
				199	return -ENOMEM;
				200	return 0;
				201	}
				202
				203	/*
				204	* Frees b_pages if it was allocated.
				205	*/
				206	static void
				207	xfs_buf_free_maps(
				208	struct xfs_buf *bp)
				209	{
				210	if (bp->b_maps != &bp->__b_map) {
				211	kmem_free(bp->b_maps);
				212	bp->b_maps = NULL;
				213	}
				214	}
				215
				216	struct xfs_buf *
				217	_xfs_buf_alloc(
				218	struct xfs_buftarg *target,
				219	struct xfs_buf_map *map,
				220	int nmaps,
				221	xfs_buf_flags_t flags)
				222	{
				223	struct xfs_buf *bp;
				224	int error;
				225	int i;
				226
				227	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
				228	if (unlikely(!bp))
				229	return NULL;
				230
				231	/*
				232	* We don't want certain flags to appear in b_flags unless they are
				233	* specifically set by later operations on the buffer.
				234	*/
				235	flags &= ~(XBF_UNMAPPED \| XBF_TRYLOCK \| XBF_ASYNC \| XBF_READ_AHEAD);
				236
				237	atomic_set(&bp->b_hold, 1);
				238	atomic_set(&bp->b_lru_ref, 1);
				239	init_completion(&bp->b_iowait);
				240	INIT_LIST_HEAD(&bp->b_lru);
				241	INIT_LIST_HEAD(&bp->b_list);
				242	INIT_LIST_HEAD(&bp->b_li_list);
				243	sema_init(&bp->b_sema, 0); /* held, no waiters */
				244	spin_lock_init(&bp->b_lock);
				245	bp->b_target = target;
				246	bp->b_flags = flags;
				247
				248	/*
				249	* Set length and io_length to the same value initially.
				250	* I/O routines should use io_length, which will be the same in
				251	* most cases but may be reset (e.g. XFS recovery).
				252	*/
				253	error = xfs_buf_get_maps(bp, nmaps);
				254	if (error) {
				255	kmem_zone_free(xfs_buf_zone, bp);
				256	return NULL;
				257	}
				258
				259	bp->b_bn = map[0].bm_bn;
				260	bp->b_length = 0;
				261	for (i = 0; i < nmaps; i++) {
				262	bp->b_maps[i].bm_bn = map[i].bm_bn;
				263	bp->b_maps[i].bm_len = map[i].bm_len;
				264	bp->b_length += map[i].bm_len;
				265	}
				266	bp->b_io_length = bp->b_length;
				267
				268	atomic_set(&bp->b_pin_count, 0);
				269	init_waitqueue_head(&bp->b_waiters);
				270
				271	XFS_STATS_INC(target->bt_mount, xb_create);
				272	trace_xfs_buf_init(bp, _RET_IP_);
				273
				274	return bp;
				275	}
				276
				277	/*
				278	* Allocate a page array capable of holding a specified number
				279	* of pages, and point the page buf at it.
				280	*/
				281	STATIC int
				282	_xfs_buf_get_pages(
				283	xfs_buf_t *bp,
				284	int page_count)
				285	{
				286	/* Make sure that we have a page list */
				287	if (bp->b_pages == NULL) {
				288	bp->b_page_count = page_count;
				289	if (page_count <= XB_PAGES) {
				290	bp->b_pages = bp->b_page_array;
				291	} else {
				292	bp->b_pages = kmem_alloc(sizeof(struct page )
				293	page_count, KM_NOFS);
				294	if (bp->b_pages == NULL)
				295	return -ENOMEM;
				296	}
				297	memset(bp->b_pages, 0, sizeof(struct page ) page_count);
				298	}
				299	return 0;
				300	}
				301
				302	/*
				303	* Frees b_pages if it was allocated.
				304	*/
				305	STATIC void
				306	_xfs_buf_free_pages(
				307	xfs_buf_t *bp)
				308	{
				309	if (bp->b_pages != bp->b_page_array) {
				310	kmem_free(bp->b_pages);
				311	bp->b_pages = NULL;
				312	}
				313	}
				314
				315	/*
				316	* Releases the specified buffer.
				317	*
				318	* The modification state of any associated pages is left unchanged.
				319	* The buffer must not be on any hash - use xfs_buf_rele instead for
				320	* hashed and refcounted buffers
				321	*/
				322	void
				323	xfs_buf_free(
				324	xfs_buf_t *bp)
				325	{
				326	trace_xfs_buf_free(bp, _RET_IP_);
				327
				328	ASSERT(list_empty(&bp->b_lru));
				329
				330	if (bp->b_flags & _XBF_PAGES) {
				331	uint i;
				332
				333	if (xfs_buf_is_vmapped(bp))
				334	vm_unmap_ram(bp->b_addr - bp->b_offset,
				335	bp->b_page_count);
				336
				337	for (i = 0; i < bp->b_page_count; i++) {
				338	struct page *page = bp->b_pages[i];
				339
				340	__free_page(page);
				341	}
				342	} else if (bp->b_flags & _XBF_KMEM)
				343	kmem_free(bp->b_addr);
				344	_xfs_buf_free_pages(bp);
				345	xfs_buf_free_maps(bp);
				346	kmem_zone_free(xfs_buf_zone, bp);
				347	}
				348
				349	/*
				350	* Allocates all the pages for buffer in question and builds it's page list.
				351	*/
				352	STATIC int
				353	xfs_buf_allocate_memory(
				354	xfs_buf_t *bp,
				355	uint flags)
				356	{
				357	size_t size;
				358	size_t nbytes, offset;
				359	gfp_t gfp_mask = xb_to_gfp(flags);
				360	unsigned short page_count, i;
				361	xfs_off_t start, end;
				362	int error;
				363
				364	/*
				365	* for buffers that are contained within a single page, just allocate
				366	* the memory from the heap - there's no need for the complexity of
				367	* page arrays to keep allocation down to order 0.
				368	*/
				369	size = BBTOB(bp->b_length);
				370	if (size < PAGE_SIZE) {
				371	bp->b_addr = kmem_alloc(size, KM_NOFS);
				372	if (!bp->b_addr) {
				373	/* low memory - use alloc_page loop instead */
				374	goto use_alloc_page;
				375	}
				376
				377	if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
				378	((unsigned long)bp->b_addr & PAGE_MASK)) {
				379	/* b_addr spans two pages - use alloc_page instead */
				380	kmem_free(bp->b_addr);
				381	bp->b_addr = NULL;
				382	goto use_alloc_page;
				383	}
				384	bp->b_offset = offset_in_page(bp->b_addr);
				385	bp->b_pages = bp->b_page_array;
				386	bp->b_pages[0] = virt_to_page(bp->b_addr);
				387	bp->b_page_count = 1;
				388	bp->b_flags \|= _XBF_KMEM;
				389	return 0;
				390	}
				391
				392	use_alloc_page:
				393	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
				394	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
				395	>> PAGE_SHIFT;
				396	page_count = end - start;
				397	error = _xfs_buf_get_pages(bp, page_count);
				398	if (unlikely(error))
				399	return error;
				400
				401	offset = bp->b_offset;
				402	bp->b_flags \|= _XBF_PAGES;
				403
				404	for (i = 0; i < bp->b_page_count; i++) {
				405	struct page *page;
				406	uint retries = 0;
				407	retry:
				408	page = alloc_page(gfp_mask);
				409	if (unlikely(page == NULL)) {
				410	if (flags & XBF_READ_AHEAD) {
				411	bp->b_page_count = i;
				412	error = -ENOMEM;
				413	goto out_free_pages;
				414	}
				415
				416	/*
				417	* This could deadlock.
				418	*
				419	* But until all the XFS lowlevel code is revamped to
				420	* handle buffer allocation failures we can't do much.
				421	*/
				422	if (!(++retries % 100))
				423	xfs_err(NULL,
				424	"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
				425	current->comm, current->pid,
				426	__func__, gfp_mask);
				427
				428	XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
				429	congestion_wait(BLK_RW_ASYNC, HZ/50);
				430	goto retry;
				431	}
				432
				433	XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);
				434
				435	nbytes = min_t(size_t, size, PAGE_SIZE - offset);
				436	size -= nbytes;
				437	bp->b_pages[i] = page;
				438	offset = 0;
				439	}
				440	return 0;
				441
				442	out_free_pages:
				443	for (i = 0; i < bp->b_page_count; i++)
				444	__free_page(bp->b_pages[i]);
				445	bp->b_flags &= ~_XBF_PAGES;
				446	return error;
				447	}
				448
				449	/*
				450	* Map buffer into kernel address-space if necessary.
				451	*/
				452	STATIC int
				453	_xfs_buf_map_pages(
				454	xfs_buf_t *bp,
				455	uint flags)
				456	{
				457	ASSERT(bp->b_flags & _XBF_PAGES);
				458	if (bp->b_page_count == 1) {
				459	/* A single page buffer is always mappable */
				460	bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
				461	} else if (flags & XBF_UNMAPPED) {
				462	bp->b_addr = NULL;
				463	} else {
				464	int retried = 0;
				465	unsigned nofs_flag;
				466
				467	/*
				468	* vm_map_ram() will allocate auxillary structures (e.g.
				469	* pagetables) with GFP_KERNEL, yet we are likely to be under
				470	* GFP_NOFS context here. Hence we need to tell memory reclaim
				471	* that we are in such a context via PF_MEMALLOC_NOFS to prevent
				472	* memory reclaim re-entering the filesystem here and
				473	* potentially deadlocking.
				474	*/
				475	nofs_flag = memalloc_nofs_save();
				476	do {
				477	bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
				478	-1, PAGE_KERNEL);
				479	if (bp->b_addr)
				480	break;
				481	vm_unmap_aliases();
				482	} while (retried++ <= 1);
				483	memalloc_nofs_restore(nofs_flag);
				484
				485	if (!bp->b_addr)
				486	return -ENOMEM;
				487	bp->b_addr += bp->b_offset;
				488	}
				489
				490	return 0;
				491	}
				492
				493	/*
				494	* Finding and Reading Buffers
				495	*/
				496	static int
				497	_xfs_buf_obj_cmp(
				498	struct rhashtable_compare_arg *arg,
				499	const void *obj)
				500	{
				501	const struct xfs_buf_map *map = arg->key;
				502	const struct xfs_buf *bp = obj;
				503
				504	/*
				505	* The key hashing in the lookup path depends on the key being the
				506	* first element of the compare_arg, make sure to assert this.
				507	*/
				508	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
				509
				510	if (bp->b_bn != map->bm_bn)
				511	return 1;
				512
				513	if (unlikely(bp->b_length != map->bm_len)) {
				514	/*
				515	* found a block number match. If the range doesn't
				516	* match, the only way this is allowed is if the buffer
				517	* in the cache is stale and the transaction that made
				518	* it stale has not yet committed. i.e. we are
				519	* reallocating a busy extent. Skip this buffer and
				520	* continue searching for an exact match.
				521	*/
				522	ASSERT(bp->b_flags & XBF_STALE);
				523	return 1;
				524	}
				525	return 0;
				526	}
				527
				528	static const struct rhashtable_params xfs_buf_hash_params = {
				529	.min_size = 32, /* empty AGs have minimal footprint */
				530	.nelem_hint = 16,
				531	.key_len = sizeof(xfs_daddr_t),
				532	.key_offset = offsetof(struct xfs_buf, b_bn),
				533	.head_offset = offsetof(struct xfs_buf, b_rhash_head),
				534	.automatic_shrinking = true,
				535	.obj_cmpfn = _xfs_buf_obj_cmp,
				536	};
				537
				538	int
				539	xfs_buf_hash_init(
				540	struct xfs_perag *pag)
				541	{
				542	spin_lock_init(&pag->pag_buf_lock);
				543	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
				544	}
				545
				546	void
				547	xfs_buf_hash_destroy(
				548	struct xfs_perag *pag)
				549	{
				550	rhashtable_destroy(&pag->pag_buf_hash);
				551	}
				552
				553	/*
				554	* Look up a buffer in the buffer cache and return it referenced and locked
				555	* in @found_bp.
				556	*
				557	* If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
				558	* cache.
				559	*
				560	* If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
				561	* -EAGAIN if we fail to lock it.
				562	*
				563	* Return values are:
				564	* -EFSCORRUPTED if have been supplied with an invalid address
				565	* -EAGAIN on trylock failure
				566	* -ENOENT if we fail to find a match and @new_bp was NULL
				567	* 0, with @found_bp:
				568	* - @new_bp if we inserted it into the cache
				569	* - the buffer we found and locked.
				570	*/
				571	static int
				572	xfs_buf_find(
				573	struct xfs_buftarg *btp,
				574	struct xfs_buf_map *map,
				575	int nmaps,
				576	xfs_buf_flags_t flags,
				577	struct xfs_buf *new_bp,
				578	struct xfs_buf **found_bp)
				579	{
				580	struct xfs_perag *pag;
				581	xfs_buf_t *bp;
				582	struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
				583	xfs_daddr_t eofs;
				584	int i;
				585
				586	*found_bp = NULL;
				587
				588	for (i = 0; i < nmaps; i++)
				589	cmap.bm_len += map[i].bm_len;
				590
				591	/* Check for IOs smaller than the sector size / not sector aligned */
				592	ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
				593	ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
				594
				595	/*
				596	* Corrupted block numbers can get through to here, unfortunately, so we
				597	* have to check that the buffer falls within the filesystem bounds.
				598	*/
				599	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
				600	if (cmap.bm_bn < 0 \|\| cmap.bm_bn >= eofs) {
				601	xfs_alert(btp->bt_mount,
				602	"%s: daddr 0x%llx out of range, EOFS 0x%llx",
				603	__func__, cmap.bm_bn, eofs);
				604	WARN_ON(1);
				605	return -EFSCORRUPTED;
				606	}
				607
				608	pag = xfs_perag_get(btp->bt_mount,
				609	xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
				610
				611	spin_lock(&pag->pag_buf_lock);
				612	bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
				613	xfs_buf_hash_params);
				614	if (bp) {
				615	atomic_inc(&bp->b_hold);
				616	goto found;
				617	}
				618
				619	/* No match found */
				620	if (!new_bp) {
				621	XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
				622	spin_unlock(&pag->pag_buf_lock);
				623	xfs_perag_put(pag);
				624	return -ENOENT;
				625	}
				626
				627	/* the buffer keeps the perag reference until it is freed */
				628	new_bp->b_pag = pag;
				629	rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
				630	xfs_buf_hash_params);
				631	spin_unlock(&pag->pag_buf_lock);
				632	*found_bp = new_bp;
				633	return 0;
				634
				635	found:
				636	spin_unlock(&pag->pag_buf_lock);
				637	xfs_perag_put(pag);
				638
				639	if (!xfs_buf_trylock(bp)) {
				640	if (flags & XBF_TRYLOCK) {
				641	xfs_buf_rele(bp);
				642	XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
				643	return -EAGAIN;
				644	}
				645	xfs_buf_lock(bp);
				646	XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
				647	}
				648
				649	/*
				650	* if the buffer is stale, clear all the external state associated with
				651	* it. We need to keep flags such as how we allocated the buffer memory
				652	* intact here.
				653	*/
				654	if (bp->b_flags & XBF_STALE) {
				655	ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
				656	ASSERT(bp->b_iodone == NULL);
				657	bp->b_flags &= _XBF_KMEM \| _XBF_PAGES;
				658	bp->b_ops = NULL;
				659	}
				660
				661	trace_xfs_buf_find(bp, flags, _RET_IP_);
				662	XFS_STATS_INC(btp->bt_mount, xb_get_locked);
				663	*found_bp = bp;
				664	return 0;
				665	}
				666
				667	struct xfs_buf *
				668	xfs_buf_incore(
				669	struct xfs_buftarg *target,
				670	xfs_daddr_t blkno,
				671	size_t numblks,
				672	xfs_buf_flags_t flags)
				673	{
				674	struct xfs_buf *bp;
				675	int error;
				676	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
				677
				678	error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
				679	if (error)
				680	return NULL;
				681	return bp;
				682	}
				683
				684	/*
				685	* Assembles a buffer covering the specified range. The code is optimised for
				686	* cache hits, as metadata intensive workloads will see 3 orders of magnitude
				687	* more hits than misses.
				688	*/
				689	struct xfs_buf *
				690	xfs_buf_get_map(
				691	struct xfs_buftarg *target,
				692	struct xfs_buf_map *map,
				693	int nmaps,
				694	xfs_buf_flags_t flags)
				695	{
				696	struct xfs_buf *bp;
				697	struct xfs_buf *new_bp;
				698	int error = 0;
				699
				700	error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
				701
				702	switch (error) {
				703	case 0:
				704	/* cache hit */
				705	goto found;
				706	case -EAGAIN:
				707	/* cache hit, trylock failure, caller handles failure */
				708	ASSERT(flags & XBF_TRYLOCK);
				709	return NULL;
				710	case -ENOENT:
				711	/* cache miss, go for insert */
				712	break;
				713	case -EFSCORRUPTED:
				714	default:
				715	/*
				716	* None of the higher layers understand failure types
				717	* yet, so return NULL to signal a fatal lookup error.
				718	*/
				719	return NULL;
				720	}
				721
				722	new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
				723	if (unlikely(!new_bp))
				724	return NULL;
				725
				726	error = xfs_buf_allocate_memory(new_bp, flags);
				727	if (error) {
				728	xfs_buf_free(new_bp);
				729	return NULL;
				730	}
				731
				732	error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
				733	if (error) {
				734	xfs_buf_free(new_bp);
				735	return NULL;
				736	}
				737
				738	if (bp != new_bp)
				739	xfs_buf_free(new_bp);
				740
				741	found:
				742	if (!bp->b_addr) {
				743	error = _xfs_buf_map_pages(bp, flags);
				744	if (unlikely(error)) {
				745	xfs_warn(target->bt_mount,
				746	"%s: failed to map pagesn", __func__);
				747	xfs_buf_relse(bp);
				748	return NULL;
				749	}
				750	}
				751
				752	/*
				753	* Clear b_error if this is a lookup from a caller that doesn't expect
				754	* valid data to be found in the buffer.
				755	*/
				756	if (!(flags & XBF_READ))
				757	xfs_buf_ioerror(bp, 0);
				758
				759	XFS_STATS_INC(target->bt_mount, xb_get);
				760	trace_xfs_buf_get(bp, flags, _RET_IP_);
				761	return bp;
				762	}
				763
				764	STATIC int
				765	_xfs_buf_read(
				766	xfs_buf_t *bp,
				767	xfs_buf_flags_t flags)
				768	{
				769	ASSERT(!(flags & XBF_WRITE));
				770	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
				771
				772	bp->b_flags &= ~(XBF_WRITE \| XBF_ASYNC \| XBF_READ_AHEAD);
				773	bp->b_flags \|= flags & (XBF_READ \| XBF_ASYNC \| XBF_READ_AHEAD);
				774
				775	return xfs_buf_submit(bp);
				776	}
				777
				778	xfs_buf_t *
				779	xfs_buf_read_map(
				780	struct xfs_buftarg *target,
				781	struct xfs_buf_map *map,
				782	int nmaps,
				783	xfs_buf_flags_t flags,
				784	const struct xfs_buf_ops *ops)
				785	{
				786	struct xfs_buf *bp;
				787
				788	flags \|= XBF_READ;
				789
				790	bp = xfs_buf_get_map(target, map, nmaps, flags);
				791	if (bp) {
				792	trace_xfs_buf_read(bp, flags, _RET_IP_);
				793
				794	if (!(bp->b_flags & XBF_DONE)) {
				795	XFS_STATS_INC(target->bt_mount, xb_get_read);
				796	bp->b_ops = ops;
				797	_xfs_buf_read(bp, flags);
				798	} else if (flags & XBF_ASYNC) {
				799	/*
				800	* Read ahead call which is already satisfied,
				801	* drop the buffer
				802	*/
				803	xfs_buf_relse(bp);
				804	return NULL;
				805	} else {
				806	/* We do not want read in the flags */
				807	bp->b_flags &= ~XBF_READ;
				808	}
				809	}
				810
				811	return bp;
				812	}
				813
				814	/*
				815	* If we are not low on memory then do the readahead in a deadlock
				816	* safe manner.
				817	*/
				818	void
				819	xfs_buf_readahead_map(
				820	struct xfs_buftarg *target,
				821	struct xfs_buf_map *map,
				822	int nmaps,
				823	const struct xfs_buf_ops *ops)
				824	{
				825	if (bdi_read_congested(target->bt_bdev->bd_bdi))
				826	return;
				827
				828	xfs_buf_read_map(target, map, nmaps,
				829	XBF_TRYLOCK\|XBF_ASYNC\|XBF_READ_AHEAD, ops);
				830	}
				831
				832	/*
				833	* Read an uncached buffer from disk. Allocates and returns a locked
				834	* buffer containing the disk contents or nothing.
				835	*/
				836	int
				837	xfs_buf_read_uncached(
				838	struct xfs_buftarg *target,
				839	xfs_daddr_t daddr,
				840	size_t numblks,
				841	int flags,
				842	struct xfs_buf **bpp,
				843	const struct xfs_buf_ops *ops)
				844	{
				845	struct xfs_buf *bp;
				846
				847	*bpp = NULL;
				848
				849	bp = xfs_buf_get_uncached(target, numblks, flags);
				850	if (!bp)
				851	return -ENOMEM;
				852
				853	/* set up the buffer for a read IO */
				854	ASSERT(bp->b_map_count == 1);
				855	bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */
				856	bp->b_maps[0].bm_bn = daddr;
				857	bp->b_flags \|= XBF_READ;
				858	bp->b_ops = ops;
				859
				860	xfs_buf_submit(bp);
				861	if (bp->b_error) {
				862	int error = bp->b_error;
				863	xfs_buf_relse(bp);
				864	return error;
				865	}
				866
				867	*bpp = bp;
				868	return 0;
				869	}
				870
				871	/*
				872	* Return a buffer allocated as an empty buffer and associated to external
				873	* memory via xfs_buf_associate_memory() back to it's empty state.
				874	*/
				875	void
				876	xfs_buf_set_empty(
				877	struct xfs_buf *bp,
				878	size_t numblks)
				879	{
				880	if (bp->b_pages)
				881	_xfs_buf_free_pages(bp);
				882
				883	bp->b_pages = NULL;
				884	bp->b_page_count = 0;
				885	bp->b_addr = NULL;
				886	bp->b_length = numblks;
				887	bp->b_io_length = numblks;
				888
				889	ASSERT(bp->b_map_count == 1);
				890	bp->b_bn = XFS_BUF_DADDR_NULL;
				891	bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
				892	bp->b_maps[0].bm_len = bp->b_length;
				893	}
				894
				895	static inline struct page *
				896	mem_to_page(
				897	void *addr)
				898	{
				899	if ((!is_vmalloc_addr(addr))) {
				900	return virt_to_page(addr);
				901	} else {
				902	return vmalloc_to_page(addr);
				903	}
				904	}
				905
				906	int
				907	xfs_buf_associate_memory(
				908	xfs_buf_t *bp,
				909	void *mem,
				910	size_t len)
				911	{
				912	int rval;
				913	int i = 0;
				914	unsigned long pageaddr;
				915	unsigned long offset;
				916	size_t buflen;
				917	int page_count;
				918
				919	pageaddr = (unsigned long)mem & PAGE_MASK;
				920	offset = (unsigned long)mem - pageaddr;
				921	buflen = PAGE_ALIGN(len + offset);
				922	page_count = buflen >> PAGE_SHIFT;
				923
				924	/* Free any previous set of page pointers */
				925	if (bp->b_pages)
				926	_xfs_buf_free_pages(bp);
				927
				928	bp->b_pages = NULL;
				929	bp->b_addr = mem;
				930
				931	rval = _xfs_buf_get_pages(bp, page_count);
				932	if (rval)
				933	return rval;
				934
				935	bp->b_offset = offset;
				936
				937	for (i = 0; i < bp->b_page_count; i++) {
				938	bp->b_pages[i] = mem_to_page((void *)pageaddr);
				939	pageaddr += PAGE_SIZE;
				940	}
				941
				942	bp->b_io_length = BTOBB(len);
				943	bp->b_length = BTOBB(buflen);
				944
				945	return 0;
				946	}
				947
				948	xfs_buf_t *
				949	xfs_buf_get_uncached(
				950	struct xfs_buftarg *target,
				951	size_t numblks,
				952	int flags)
				953	{
				954	unsigned long page_count;
				955	int error, i;
				956	struct xfs_buf *bp;
				957	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
				958
				959	/* flags might contain irrelevant bits, pass only what we care about */
				960	bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
				961	if (unlikely(bp == NULL))
				962	goto fail;
				963
				964	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
				965	error = _xfs_buf_get_pages(bp, page_count);
				966	if (error)
				967	goto fail_free_buf;
				968
				969	for (i = 0; i < page_count; i++) {
				970	bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
				971	if (!bp->b_pages[i])
				972	goto fail_free_mem;
				973	}
				974	bp->b_flags \|= _XBF_PAGES;
				975
				976	error = _xfs_buf_map_pages(bp, 0);
				977	if (unlikely(error)) {
				978	xfs_warn(target->bt_mount,
				979	"%s: failed to map pages", __func__);
				980	goto fail_free_mem;
				981	}
				982
				983	trace_xfs_buf_get_uncached(bp, _RET_IP_);
				984	return bp;
				985
				986	fail_free_mem:
				987	while (--i >= 0)
				988	__free_page(bp->b_pages[i]);
				989	_xfs_buf_free_pages(bp);
				990	fail_free_buf:
				991	xfs_buf_free_maps(bp);
				992	kmem_zone_free(xfs_buf_zone, bp);
				993	fail:
				994	return NULL;
				995	}
				996
				997	/*
				998	* Increment reference count on buffer, to hold the buffer concurrently
				999	* with another thread which may release (free) the buffer asynchronously.
				1000	* Must hold the buffer already to call this function.
				1001	*/
				1002	void
				1003	xfs_buf_hold(
				1004	xfs_buf_t *bp)
				1005	{
				1006	trace_xfs_buf_hold(bp, _RET_IP_);
				1007	atomic_inc(&bp->b_hold);
				1008	}
				1009
				1010	/*
				1011	* Release a hold on the specified buffer. If the hold count is 1, the buffer is
				1012	* placed on LRU or freed (depending on b_lru_ref).
				1013	*/
				1014	void
				1015	xfs_buf_rele(
				1016	xfs_buf_t *bp)
				1017	{
				1018	struct xfs_perag *pag = bp->b_pag;
				1019	bool release;
				1020	bool freebuf = false;
				1021
				1022	trace_xfs_buf_rele(bp, _RET_IP_);
				1023
				1024	if (!pag) {
				1025	ASSERT(list_empty(&bp->b_lru));
				1026	if (atomic_dec_and_test(&bp->b_hold)) {
				1027	xfs_buf_ioacct_dec(bp);
				1028	xfs_buf_free(bp);
				1029	}
				1030	return;
				1031	}
				1032
				1033	ASSERT(atomic_read(&bp->b_hold) > 0);
				1034
				1035	/*
				1036	* We grab the b_lock here first to serialise racing xfs_buf_rele()
				1037	* calls. The pag_buf_lock being taken on the last reference only
				1038	* serialises against racing lookups in xfs_buf_find(). IOWs, the second
				1039	* to last reference we drop here is not serialised against the last
				1040	* reference until we take bp->b_lock. Hence if we don't grab b_lock
				1041	* first, the last "release" reference can win the race to the lock and
				1042	* free the buffer before the second-to-last reference is processed,
				1043	* leading to a use-after-free scenario.
				1044	*/
				1045	spin_lock(&bp->b_lock);
				1046	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
				1047	if (!release) {
				1048	/*
				1049	* Drop the in-flight state if the buffer is already on the LRU
				1050	* and it holds the only reference. This is racy because we
				1051	* haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
				1052	* ensures the decrement occurs only once per-buf.
				1053	*/
				1054	if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
				1055	__xfs_buf_ioacct_dec(bp);
				1056	goto out_unlock;
				1057	}
				1058
				1059	/* the last reference has been dropped ... */
				1060	__xfs_buf_ioacct_dec(bp);
				1061	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
				1062	/*
				1063	* If the buffer is added to the LRU take a new reference to the
				1064	* buffer for the LRU and clear the (now stale) dispose list
				1065	* state flag
				1066	*/
				1067	if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
				1068	bp->b_state &= ~XFS_BSTATE_DISPOSE;
				1069	atomic_inc(&bp->b_hold);
				1070	}
				1071	spin_unlock(&pag->pag_buf_lock);
				1072	} else {
				1073	/*
				1074	* most of the time buffers will already be removed from the
				1075	* LRU, so optimise that case by checking for the
				1076	* XFS_BSTATE_DISPOSE flag indicating the last list the buffer
				1077	* was on was the disposal list
				1078	*/
				1079	if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
				1080	list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
				1081	} else {
				1082	ASSERT(list_empty(&bp->b_lru));
				1083	}
				1084
				1085	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
				1086	rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
				1087	xfs_buf_hash_params);
				1088	spin_unlock(&pag->pag_buf_lock);
				1089	xfs_perag_put(pag);
				1090	freebuf = true;
				1091	}
				1092
				1093	out_unlock:
				1094	spin_unlock(&bp->b_lock);
				1095
				1096	if (freebuf)
				1097	xfs_buf_free(bp);
				1098	}
				1099
				1100
				1101	/*
				1102	* Lock a buffer object, if it is not already locked.
				1103	*
				1104	* If we come across a stale, pinned, locked buffer, we know that we are
				1105	* being asked to lock a buffer that has been reallocated. Because it is
				1106	* pinned, we know that the log has not been pushed to disk and hence it
				1107	* will still be locked. Rather than continuing to have trylock attempts
				1108	* fail until someone else pushes the log, push it ourselves before
				1109	* returning. This means that the xfsaild will not get stuck trying
				1110	* to push on stale inode buffers.
				1111	*/
				1112	int
				1113	xfs_buf_trylock(
				1114	struct xfs_buf *bp)
				1115	{
				1116	int locked;
				1117
				1118	locked = down_trylock(&bp->b_sema) == 0;
				1119	if (locked)
				1120	trace_xfs_buf_trylock(bp, _RET_IP_);
				1121	else
				1122	trace_xfs_buf_trylock_fail(bp, _RET_IP_);
				1123	return locked;
				1124	}
				1125
				1126	/*
				1127	* Lock a buffer object.
				1128	*
				1129	* If we come across a stale, pinned, locked buffer, we know that we
				1130	* are being asked to lock a buffer that has been reallocated. Because
				1131	* it is pinned, we know that the log has not been pushed to disk and
				1132	* hence it will still be locked. Rather than sleeping until someone
				1133	* else pushes the log, push it ourselves before trying to get the lock.
				1134	*/
				1135	void
				1136	xfs_buf_lock(
				1137	struct xfs_buf *bp)
				1138	{
				1139	trace_xfs_buf_lock(bp, _RET_IP_);
				1140
				1141	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
				1142	xfs_log_force(bp->b_target->bt_mount, 0);
				1143	down(&bp->b_sema);
				1144
				1145	trace_xfs_buf_lock_done(bp, _RET_IP_);
				1146	}
				1147
				1148	void
				1149	xfs_buf_unlock(
				1150	struct xfs_buf *bp)
				1151	{
				1152	ASSERT(xfs_buf_islocked(bp));
				1153
				1154	up(&bp->b_sema);
				1155	trace_xfs_buf_unlock(bp, _RET_IP_);
				1156	}
				1157
				1158	STATIC void
				1159	xfs_buf_wait_unpin(
				1160	xfs_buf_t *bp)
				1161	{
				1162	DECLARE_WAITQUEUE (wait, current);
				1163
				1164	if (atomic_read(&bp->b_pin_count) == 0)
				1165	return;
				1166
				1167	add_wait_queue(&bp->b_waiters, &wait);
				1168	for (;;) {
				1169	set_current_state(TASK_UNINTERRUPTIBLE);
				1170	if (atomic_read(&bp->b_pin_count) == 0)
				1171	break;
				1172	io_schedule();
				1173	}
				1174	remove_wait_queue(&bp->b_waiters, &wait);
				1175	set_current_state(TASK_RUNNING);
				1176	}
				1177
				1178	/*
				1179	* Buffer Utility Routines
				1180	*/
				1181
				1182	void
				1183	xfs_buf_ioend(
				1184	struct xfs_buf *bp)
				1185	{
				1186	bool read = bp->b_flags & XBF_READ;
				1187
				1188	trace_xfs_buf_iodone(bp, _RET_IP_);
				1189
				1190	bp->b_flags &= ~(XBF_READ \| XBF_WRITE \| XBF_READ_AHEAD);
				1191
				1192	/*
				1193	* Pull in IO completion errors now. We are guaranteed to be running
				1194	* single threaded, so we don't need the lock to read b_io_error.
				1195	*/
				1196	if (!bp->b_error && bp->b_io_error)
				1197	xfs_buf_ioerror(bp, bp->b_io_error);
				1198
				1199	/* Only validate buffers that were read without errors */
				1200	if (read && !bp->b_error && bp->b_ops) {
				1201	ASSERT(!bp->b_iodone);
				1202	bp->b_ops->verify_read(bp);
				1203	}
				1204
				1205	if (!bp->b_error)
				1206	bp->b_flags \|= XBF_DONE;
				1207
				1208	if (bp->b_iodone)
				1209	(*(bp->b_iodone))(bp);
				1210	else if (bp->b_flags & XBF_ASYNC)
				1211	xfs_buf_relse(bp);
				1212	else
				1213	complete(&bp->b_iowait);
				1214	}
				1215
				1216	static void
				1217	xfs_buf_ioend_work(
				1218	struct work_struct *work)
				1219	{
				1220	struct xfs_buf *bp =
				1221	container_of(work, xfs_buf_t, b_ioend_work);
				1222
				1223	xfs_buf_ioend(bp);
				1224	}
				1225
				1226	static void
				1227	xfs_buf_ioend_async(
				1228	struct xfs_buf *bp)
				1229	{
				1230	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
				1231	queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
				1232	}
				1233
				1234	void
				1235	__xfs_buf_ioerror(
				1236	xfs_buf_t *bp,
				1237	int error,
				1238	xfs_failaddr_t failaddr)
				1239	{
				1240	ASSERT(error <= 0 && error >= -1000);
				1241	bp->b_error = error;
				1242	trace_xfs_buf_ioerror(bp, error, failaddr);
				1243	}
				1244
				1245	void
				1246	xfs_buf_ioerror_alert(
				1247	struct xfs_buf *bp,
				1248	const char *func)
				1249	{
				1250	xfs_alert(bp->b_target->bt_mount,
				1251	"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d",
				1252	func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
				1253	-bp->b_error);
				1254	}
				1255
				1256	int
				1257	xfs_bwrite(
				1258	struct xfs_buf *bp)
				1259	{
				1260	int error;
				1261
				1262	ASSERT(xfs_buf_islocked(bp));
				1263
				1264	bp->b_flags \|= XBF_WRITE;
				1265	bp->b_flags &= ~(XBF_ASYNC \| XBF_READ \| _XBF_DELWRI_Q \|
				1266	XBF_WRITE_FAIL \| XBF_DONE);
				1267
				1268	error = xfs_buf_submit(bp);
				1269	if (error) {
				1270	xfs_force_shutdown(bp->b_target->bt_mount,
				1271	SHUTDOWN_META_IO_ERROR);
				1272	}
				1273	return error;
				1274	}
				1275
				1276	static void
				1277	xfs_buf_bio_end_io(
				1278	struct bio *bio)
				1279	{
				1280	struct xfs_buf bp = (struct xfs_buf )bio->bi_private;
				1281
				1282	/*
				1283	* don't overwrite existing errors - otherwise we can lose errors on
				1284	* buffers that require multiple bios to complete.
				1285	*/
				1286	if (bio->bi_status) {
				1287	int error = blk_status_to_errno(bio->bi_status);
				1288
				1289	cmpxchg(&bp->b_io_error, 0, error);
				1290	}
				1291
				1292	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
				1293	invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
				1294
				1295	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
				1296	xfs_buf_ioend_async(bp);
				1297	bio_put(bio);
				1298	}
				1299
				1300	static void
				1301	xfs_buf_ioapply_map(
				1302	struct xfs_buf *bp,
				1303	int map,
				1304	int *buf_offset,
				1305	int *count,
				1306	int op,
				1307	int op_flags)
				1308	{
				1309	int page_index;
				1310	int total_nr_pages = bp->b_page_count;
				1311	int nr_pages;
				1312	struct bio *bio;
				1313	sector_t sector = bp->b_maps[map].bm_bn;
				1314	int size;
				1315	int offset;
				1316
				1317	/* skip the pages in the buffer before the start offset */
				1318	page_index = 0;
				1319	offset = *buf_offset;
				1320	while (offset >= PAGE_SIZE) {
				1321	page_index++;
				1322	offset -= PAGE_SIZE;
				1323	}
				1324
				1325	/*
				1326	* Limit the IO size to the length of the current vector, and update the
				1327	* remaining IO count for the next time around.
				1328	*/
				1329	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
				1330	*count -= size;
				1331	*buf_offset += size;
				1332
				1333	next_chunk:
				1334	atomic_inc(&bp->b_io_remaining);
				1335	nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
				1336
				1337	bio = bio_alloc(GFP_NOIO, nr_pages);
				1338	bio_set_dev(bio, bp->b_target->bt_bdev);
				1339	bio->bi_iter.bi_sector = sector;
				1340	bio->bi_end_io = xfs_buf_bio_end_io;
				1341	bio->bi_private = bp;
				1342	bio_set_op_attrs(bio, op, op_flags);
				1343
				1344	for (; size && nr_pages; nr_pages--, page_index++) {
				1345	int rbytes, nbytes = PAGE_SIZE - offset;
				1346
				1347	if (nbytes > size)
				1348	nbytes = size;
				1349
				1350	rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
				1351	offset);
				1352	if (rbytes < nbytes)
				1353	break;
				1354
				1355	offset = 0;
				1356	sector += BTOBB(nbytes);
				1357	size -= nbytes;
				1358	total_nr_pages--;
				1359	}
				1360
				1361	if (likely(bio->bi_iter.bi_size)) {
				1362	if (xfs_buf_is_vmapped(bp)) {
				1363	flush_kernel_vmap_range(bp->b_addr,
				1364	xfs_buf_vmap_len(bp));
				1365	}
				1366	submit_bio(bio);
				1367	if (size)
				1368	goto next_chunk;
				1369	} else {
				1370	/*
				1371	* This is guaranteed not to be the last io reference count
				1372	* because the caller (xfs_buf_submit) holds a count itself.
				1373	*/
				1374	atomic_dec(&bp->b_io_remaining);
				1375	xfs_buf_ioerror(bp, -EIO);
				1376	bio_put(bio);
				1377	}
				1378
				1379	}
				1380
				1381	STATIC void
				1382	_xfs_buf_ioapply(
				1383	struct xfs_buf *bp)
				1384	{
				1385	struct blk_plug plug;
				1386	int op;
				1387	int op_flags = 0;
				1388	int offset;
				1389	int size;
				1390	int i;
				1391
				1392	/*
				1393	* Make sure we capture only current IO errors rather than stale errors
				1394	* left over from previous use of the buffer (e.g. failed readahead).
				1395	*/
				1396	bp->b_error = 0;
				1397
				1398	/*
				1399	* Initialize the I/O completion workqueue if we haven't yet or the
				1400	* submitter has not opted to specify a custom one.
				1401	*/
				1402	if (!bp->b_ioend_wq)
				1403	bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;
				1404
				1405	if (bp->b_flags & XBF_WRITE) {
				1406	op = REQ_OP_WRITE;
				1407	if (bp->b_flags & XBF_SYNCIO)
				1408	op_flags = REQ_SYNC;
				1409	if (bp->b_flags & XBF_FUA)
				1410	op_flags \|= REQ_FUA;
				1411	if (bp->b_flags & XBF_FLUSH)
				1412	op_flags \|= REQ_PREFLUSH;
				1413
				1414	/*
				1415	* Run the write verifier callback function if it exists. If
				1416	* this function fails it will mark the buffer with an error and
				1417	* the IO should not be dispatched.
				1418	*/
				1419	if (bp->b_ops) {
				1420	bp->b_ops->verify_write(bp);
				1421	if (bp->b_error) {
				1422	xfs_force_shutdown(bp->b_target->bt_mount,
				1423	SHUTDOWN_CORRUPT_INCORE);
				1424	return;
				1425	}
				1426	} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
				1427	struct xfs_mount *mp = bp->b_target->bt_mount;
				1428
				1429	/*
				1430	* non-crc filesystems don't attach verifiers during
				1431	* log recovery, so don't warn for such filesystems.
				1432	*/
				1433	if (xfs_sb_version_hascrc(&mp->m_sb)) {
				1434	xfs_warn(mp,
				1435	"%s: no buf ops on daddr 0x%llx len %d",
				1436	__func__, bp->b_bn, bp->b_length);
				1437	xfs_hex_dump(bp->b_addr,
				1438	XFS_CORRUPTION_DUMP_LEN);
				1439	dump_stack();
				1440	}
				1441	}
				1442	} else if (bp->b_flags & XBF_READ_AHEAD) {
				1443	op = REQ_OP_READ;
				1444	op_flags = REQ_RAHEAD;
				1445	} else {
				1446	op = REQ_OP_READ;
				1447	}
				1448
				1449	/* we only use the buffer cache for meta-data */
				1450	op_flags \|= REQ_META;
				1451
				1452	/*
				1453	* Walk all the vectors issuing IO on them. Set up the initial offset
				1454	* into the buffer and the desired IO size before we start -
				1455	* _xfs_buf_ioapply_vec() will modify them appropriately for each
				1456	* subsequent call.
				1457	*/
				1458	offset = bp->b_offset;
				1459	size = BBTOB(bp->b_io_length);
				1460	blk_start_plug(&plug);
				1461	for (i = 0; i < bp->b_map_count; i++) {
				1462	xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
				1463	if (bp->b_error)
				1464	break;
				1465	if (size <= 0)
				1466	break; /* all done */
				1467	}
				1468	blk_finish_plug(&plug);
				1469	}
				1470
				1471	/*
				1472	* Wait for I/O completion of a sync buffer and return the I/O error code.
				1473	*/
				1474	static int
				1475	xfs_buf_iowait(
				1476	struct xfs_buf *bp)
				1477	{
				1478	ASSERT(!(bp->b_flags & XBF_ASYNC));
				1479
				1480	trace_xfs_buf_iowait(bp, _RET_IP_);
				1481	wait_for_completion(&bp->b_iowait);
				1482	trace_xfs_buf_iowait_done(bp, _RET_IP_);
				1483
				1484	return bp->b_error;
				1485	}
				1486
				1487	/*
				1488	* Buffer I/O submission path, read or write. Asynchronous submission transfers
				1489	* the buffer lock ownership and the current reference to the IO. It is not
				1490	* safe to reference the buffer after a call to this function unless the caller
				1491	* holds an additional reference itself.
				1492	*/
				1493	int
				1494	__xfs_buf_submit(
				1495	struct xfs_buf *bp,
				1496	bool wait)
				1497	{
				1498	int error = 0;
				1499
				1500	trace_xfs_buf_submit(bp, _RET_IP_);
				1501
				1502	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
				1503
				1504	/* on shutdown we stale and complete the buffer immediately */
				1505	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
				1506	xfs_buf_ioerror(bp, -EIO);
				1507	bp->b_flags &= ~XBF_DONE;
				1508	xfs_buf_stale(bp);
				1509	xfs_buf_ioend(bp);
				1510	return -EIO;
				1511	}
				1512
				1513	/*
				1514	* Grab a reference so the buffer does not go away underneath us. For
				1515	* async buffers, I/O completion drops the callers reference, which
				1516	* could occur before submission returns.
				1517	*/
				1518	xfs_buf_hold(bp);
				1519
				1520	if (bp->b_flags & XBF_WRITE)
				1521	xfs_buf_wait_unpin(bp);
				1522
				1523	/* clear the internal error state to avoid spurious errors */
				1524	bp->b_io_error = 0;
				1525
				1526	/*
				1527	* Set the count to 1 initially, this will stop an I/O completion
				1528	* callout which happens before we have started all the I/O from calling
				1529	* xfs_buf_ioend too early.
				1530	*/
				1531	atomic_set(&bp->b_io_remaining, 1);
				1532	if (bp->b_flags & XBF_ASYNC)
				1533	xfs_buf_ioacct_inc(bp);
				1534	_xfs_buf_ioapply(bp);
				1535
				1536	/*
				1537	* If _xfs_buf_ioapply failed, we can get back here with only the IO
				1538	* reference we took above. If we drop it to zero, run completion so
				1539	* that we don't return to the caller with completion still pending.
				1540	*/
				1541	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
				1542	if (bp->b_error \|\| !(bp->b_flags & XBF_ASYNC))
				1543	xfs_buf_ioend(bp);
				1544	else
				1545	xfs_buf_ioend_async(bp);
				1546	}
				1547
				1548	if (wait)
				1549	error = xfs_buf_iowait(bp);
				1550
				1551	/*
				1552	* Release the hold that keeps the buffer referenced for the entire
				1553	* I/O. Note that if the buffer is async, it is not safe to reference
				1554	* after this release.
				1555	*/
				1556	xfs_buf_rele(bp);
				1557	return error;
				1558	}
				1559
				1560	void *
				1561	xfs_buf_offset(
				1562	struct xfs_buf *bp,
				1563	size_t offset)
				1564	{
				1565	struct page *page;
				1566
				1567	if (bp->b_addr)
				1568	return bp->b_addr + offset;
				1569
				1570	offset += bp->b_offset;
				1571	page = bp->b_pages[offset >> PAGE_SHIFT];
				1572	return page_address(page) + (offset & (PAGE_SIZE-1));
				1573	}
				1574
				1575	/*
				1576	* Move data into or out of a buffer.
				1577	*/
				1578	void
				1579	xfs_buf_iomove(
				1580	xfs_buf_t bp, / buffer to process */
				1581	size_t boff, /* starting buffer offset */
				1582	size_t bsize, /* length to copy */
				1583	void data, / data address */
				1584	xfs_buf_rw_t mode) /* read/write/zero flag */
				1585	{
				1586	size_t bend;
				1587
				1588	bend = boff + bsize;
				1589	while (boff < bend) {
				1590	struct page *page;
				1591	int page_index, page_offset, csize;
				1592
				1593	page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
				1594	page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
				1595	page = bp->b_pages[page_index];
				1596	csize = min_t(size_t, PAGE_SIZE - page_offset,
				1597	BBTOB(bp->b_io_length) - boff);
				1598
				1599	ASSERT((csize + page_offset) <= PAGE_SIZE);
				1600
				1601	switch (mode) {
				1602	case XBRW_ZERO:
				1603	memset(page_address(page) + page_offset, 0, csize);
				1604	break;
				1605	case XBRW_READ:
				1606	memcpy(data, page_address(page) + page_offset, csize);
				1607	break;
				1608	case XBRW_WRITE:
				1609	memcpy(page_address(page) + page_offset, data, csize);
				1610	}
				1611
				1612	boff += csize;
				1613	data += csize;
				1614	}
				1615	}
				1616
				1617	/*
				1618	* Handling of buffer targets (buftargs).
				1619	*/
				1620
				1621	/*
				1622	* Wait for any bufs with callbacks that have been submitted but have not yet
				1623	* returned. These buffers will have an elevated hold count, so wait on those
				1624	* while freeing all the buffers only held by the LRU.
				1625	*/
				1626	static enum lru_status
				1627	xfs_buftarg_wait_rele(
				1628	struct list_head *item,
				1629	struct list_lru_one *lru,
				1630	spinlock_t *lru_lock,
				1631	void *arg)
				1632
				1633	{
				1634	struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
				1635	struct list_head *dispose = arg;
				1636
				1637	if (atomic_read(&bp->b_hold) > 1) {
				1638	/* need to wait, so skip it this pass */
				1639	trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
				1640	return LRU_SKIP;
				1641	}
				1642	if (!spin_trylock(&bp->b_lock))
				1643	return LRU_SKIP;
				1644
				1645	/*
				1646	* clear the LRU reference count so the buffer doesn't get
				1647	* ignored in xfs_buf_rele().
				1648	*/
				1649	atomic_set(&bp->b_lru_ref, 0);
				1650	bp->b_state \|= XFS_BSTATE_DISPOSE;
				1651	list_lru_isolate_move(lru, item, dispose);
				1652	spin_unlock(&bp->b_lock);
				1653	return LRU_REMOVED;
				1654	}
				1655
				1656	void
				1657	xfs_wait_buftarg(
				1658	struct xfs_buftarg *btp)
				1659	{
				1660	LIST_HEAD(dispose);
				1661	int loop = 0;
				1662
				1663	/*
				1664	* First wait on the buftarg I/O count for all in-flight buffers to be
				1665	* released. This is critical as new buffers do not make the LRU until
				1666	* they are released.
				1667	*
				1668	* Next, flush the buffer workqueue to ensure all completion processing
				1669	* has finished. Just waiting on buffer locks is not sufficient for
				1670	* async IO as the reference count held over IO is not released until
				1671	* after the buffer lock is dropped. Hence we need to ensure here that
				1672	* all reference counts have been dropped before we start walking the
				1673	* LRU list.
				1674	*/
				1675	while (percpu_counter_sum(&btp->bt_io_count))
				1676	delay(100);
				1677	flush_workqueue(btp->bt_mount->m_buf_workqueue);
				1678
				1679	/* loop until there is nothing left on the lru list. */
				1680	while (list_lru_count(&btp->bt_lru)) {
				1681	list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
				1682	&dispose, LONG_MAX);
				1683
				1684	while (!list_empty(&dispose)) {
				1685	struct xfs_buf *bp;
				1686	bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
				1687	list_del_init(&bp->b_lru);
				1688	if (bp->b_flags & XBF_WRITE_FAIL) {
				1689	xfs_alert(btp->bt_mount,
				1690	"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
				1691	(long long)bp->b_bn);
				1692	xfs_alert(btp->bt_mount,
				1693	"Please run xfs_repair to determine the extent of the problem.");
				1694	}
				1695	xfs_buf_rele(bp);
				1696	}
				1697	if (loop++ != 0)
				1698	delay(100);
				1699	}
				1700	}
				1701
				1702	static enum lru_status
				1703	xfs_buftarg_isolate(
				1704	struct list_head *item,
				1705	struct list_lru_one *lru,
				1706	spinlock_t *lru_lock,
				1707	void *arg)
				1708	{
				1709	struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
				1710	struct list_head *dispose = arg;
				1711
				1712	/*
				1713	* we are inverting the lru lock/bp->b_lock here, so use a trylock.
				1714	* If we fail to get the lock, just skip it.
				1715	*/
				1716	if (!spin_trylock(&bp->b_lock))
				1717	return LRU_SKIP;
				1718	/*
				1719	* Decrement the b_lru_ref count unless the value is already
				1720	* zero. If the value is already zero, we need to reclaim the
				1721	* buffer, otherwise it gets another trip through the LRU.
				1722	*/
				1723	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
				1724	spin_unlock(&bp->b_lock);
				1725	return LRU_ROTATE;
				1726	}
				1727
				1728	bp->b_state \|= XFS_BSTATE_DISPOSE;
				1729	list_lru_isolate_move(lru, item, dispose);
				1730	spin_unlock(&bp->b_lock);
				1731	return LRU_REMOVED;
				1732	}
				1733
				1734	static unsigned long
				1735	xfs_buftarg_shrink_scan(
				1736	struct shrinker *shrink,
				1737	struct shrink_control *sc)
				1738	{
				1739	struct xfs_buftarg *btp = container_of(shrink,
				1740	struct xfs_buftarg, bt_shrinker);
				1741	LIST_HEAD(dispose);
				1742	unsigned long freed;
				1743
				1744	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
				1745	xfs_buftarg_isolate, &dispose);
				1746
				1747	while (!list_empty(&dispose)) {
				1748	struct xfs_buf *bp;
				1749	bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
				1750	list_del_init(&bp->b_lru);
				1751	xfs_buf_rele(bp);
				1752	}
				1753
				1754	return freed;
				1755	}
				1756
				1757	static unsigned long
				1758	xfs_buftarg_shrink_count(
				1759	struct shrinker *shrink,
				1760	struct shrink_control *sc)
				1761	{
				1762	struct xfs_buftarg *btp = container_of(shrink,
				1763	struct xfs_buftarg, bt_shrinker);
				1764	return list_lru_shrink_count(&btp->bt_lru, sc);
				1765	}
				1766
				1767	void
				1768	xfs_free_buftarg(
				1769	struct xfs_buftarg *btp)
				1770	{
				1771	unregister_shrinker(&btp->bt_shrinker);
				1772	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
				1773	percpu_counter_destroy(&btp->bt_io_count);
				1774	list_lru_destroy(&btp->bt_lru);
				1775
				1776	xfs_blkdev_issue_flush(btp);
				1777
				1778	kmem_free(btp);
				1779	}
				1780
				1781	int
				1782	xfs_setsize_buftarg(
				1783	xfs_buftarg_t *btp,
				1784	unsigned int sectorsize)
				1785	{
				1786	/* Set up metadata sector size info */
				1787	btp->bt_meta_sectorsize = sectorsize;
				1788	btp->bt_meta_sectormask = sectorsize - 1;
				1789
				1790	if (set_blocksize(btp->bt_bdev, sectorsize)) {
				1791	xfs_warn(btp->bt_mount,
				1792	"Cannot set_blocksize to %u on device %pg",
				1793	sectorsize, btp->bt_bdev);
				1794	return -EINVAL;
				1795	}
				1796
				1797	/* Set up device logical sector size mask */
				1798	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
				1799	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
				1800
				1801	return 0;
				1802	}
				1803
				1804	/*
				1805	* When allocating the initial buffer target we have not yet
				1806	* read in the superblock, so don't know what sized sectors
				1807	* are being used at this early stage. Play safe.
				1808	*/
				1809	STATIC int
				1810	xfs_setsize_buftarg_early(
				1811	xfs_buftarg_t *btp,
				1812	struct block_device *bdev)
				1813	{
				1814	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
				1815	}
				1816
				1817	xfs_buftarg_t *
				1818	xfs_alloc_buftarg(
				1819	struct xfs_mount *mp,
				1820	struct block_device *bdev,
				1821	struct dax_device *dax_dev)
				1822	{
				1823	xfs_buftarg_t *btp;
				1824
				1825	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP \| KM_NOFS);
				1826
				1827	btp->bt_mount = mp;
				1828	btp->bt_dev = bdev->bd_dev;
				1829	btp->bt_bdev = bdev;
				1830	btp->bt_daxdev = dax_dev;
				1831
				1832	if (xfs_setsize_buftarg_early(btp, bdev))
				1833	goto error_free;
				1834
				1835	if (list_lru_init(&btp->bt_lru))
				1836	goto error_free;
				1837
				1838	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
				1839	goto error_lru;
				1840
				1841	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
				1842	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
				1843	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
				1844	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
				1845	if (register_shrinker(&btp->bt_shrinker))
				1846	goto error_pcpu;
				1847	return btp;
				1848
				1849	error_pcpu:
				1850	percpu_counter_destroy(&btp->bt_io_count);
				1851	error_lru:
				1852	list_lru_destroy(&btp->bt_lru);
				1853	error_free:
				1854	kmem_free(btp);
				1855	return NULL;
				1856	}
				1857
				1858	/*
				1859	* Cancel a delayed write list.
				1860	*
				1861	* Remove each buffer from the list, clear the delwri queue flag and drop the
				1862	* associated buffer reference.
				1863	*/
				1864	void
				1865	xfs_buf_delwri_cancel(
				1866	struct list_head *list)
				1867	{
				1868	struct xfs_buf *bp;
				1869
				1870	while (!list_empty(list)) {
				1871	bp = list_first_entry(list, struct xfs_buf, b_list);
				1872
				1873	xfs_buf_lock(bp);
				1874	bp->b_flags &= ~_XBF_DELWRI_Q;
				1875	list_del_init(&bp->b_list);
				1876	xfs_buf_relse(bp);
				1877	}
				1878	}
				1879
				1880	/*
				1881	* Add a buffer to the delayed write list.
				1882	*
				1883	* This queues a buffer for writeout if it hasn't already been. Note that
				1884	* neither this routine nor the buffer list submission functions perform
				1885	* any internal synchronization. It is expected that the lists are thread-local
				1886	* to the callers.
				1887	*
				1888	* Returns true if we queued up the buffer, or false if it already had
				1889	* been on the buffer list.
				1890	*/
				1891	bool
				1892	xfs_buf_delwri_queue(
				1893	struct xfs_buf *bp,
				1894	struct list_head *list)
				1895	{
				1896	ASSERT(xfs_buf_islocked(bp));
				1897	ASSERT(!(bp->b_flags & XBF_READ));
				1898
				1899	/*
				1900	* If the buffer is already marked delwri it already is queued up
				1901	* by someone else for imediate writeout. Just ignore it in that
				1902	* case.
				1903	*/
				1904	if (bp->b_flags & _XBF_DELWRI_Q) {
				1905	trace_xfs_buf_delwri_queued(bp, _RET_IP_);
				1906	return false;
				1907	}
				1908
				1909	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
				1910
				1911	/*
				1912	* If a buffer gets written out synchronously or marked stale while it
				1913	* is on a delwri list we lazily remove it. To do this, the other party
				1914	* clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
				1915	* It remains referenced and on the list. In a rare corner case it
				1916	* might get readded to a delwri list after the synchronous writeout, in
				1917	* which case we need just need to re-add the flag here.
				1918	*/
				1919	bp->b_flags \|= _XBF_DELWRI_Q;
				1920	if (list_empty(&bp->b_list)) {
				1921	atomic_inc(&bp->b_hold);
				1922	list_add_tail(&bp->b_list, list);
				1923	}
				1924
				1925	return true;
				1926	}
				1927
				1928	/*
				1929	* Compare function is more complex than it needs to be because
				1930	* the return value is only 32 bits and we are doing comparisons
				1931	* on 64 bit values
				1932	*/
				1933	static int
				1934	xfs_buf_cmp(
				1935	void *priv,
				1936	struct list_head *a,
				1937	struct list_head *b)
				1938	{
				1939	struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
				1940	struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
				1941	xfs_daddr_t diff;
				1942
				1943	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
				1944	if (diff < 0)
				1945	return -1;
				1946	if (diff > 0)
				1947	return 1;
				1948	return 0;
				1949	}
				1950
				1951	/*
				1952	* Submit buffers for write. If wait_list is specified, the buffers are
				1953	* submitted using sync I/O and placed on the wait list such that the caller can
				1954	* iowait each buffer. Otherwise async I/O is used and the buffers are released
				1955	* at I/O completion time. In either case, buffers remain locked until I/O
				1956	* completes and the buffer is released from the queue.
				1957	*/
				1958	static int
				1959	xfs_buf_delwri_submit_buffers(
				1960	struct list_head *buffer_list,
				1961	struct list_head *wait_list)
				1962	{
				1963	struct xfs_buf bp, n;
				1964	LIST_HEAD (submit_list);
				1965	int pinned = 0;
				1966	struct blk_plug plug;
				1967
				1968	list_sort(NULL, buffer_list, xfs_buf_cmp);
				1969
				1970	blk_start_plug(&plug);
				1971	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
				1972	if (!wait_list) {
				1973	if (xfs_buf_ispinned(bp)) {
				1974	pinned++;
				1975	continue;
				1976	}
				1977	if (!xfs_buf_trylock(bp))
				1978	continue;
				1979	} else {
				1980	xfs_buf_lock(bp);
				1981	}
				1982
				1983	/*
				1984	* Someone else might have written the buffer synchronously or
				1985	* marked it stale in the meantime. In that case only the
				1986	* _XBF_DELWRI_Q flag got cleared, and we have to drop the
				1987	* reference and remove it from the list here.
				1988	*/
				1989	if (!(bp->b_flags & _XBF_DELWRI_Q)) {
				1990	list_del_init(&bp->b_list);
				1991	xfs_buf_relse(bp);
				1992	continue;
				1993	}
				1994
				1995	trace_xfs_buf_delwri_split(bp, _RET_IP_);
				1996
				1997	/*
				1998	* If we have a wait list, each buffer (and associated delwri
				1999	* queue reference) transfers to it and is submitted
				2000	* synchronously. Otherwise, drop the buffer from the delwri
				2001	* queue and submit async.
				2002	*/
				2003	bp->b_flags &= ~(_XBF_DELWRI_Q \| XBF_WRITE_FAIL);
				2004	bp->b_flags \|= XBF_WRITE;
				2005	if (wait_list) {
				2006	bp->b_flags &= ~XBF_ASYNC;
				2007	list_move_tail(&bp->b_list, wait_list);
				2008	} else {
				2009	bp->b_flags \|= XBF_ASYNC;
				2010	list_del_init(&bp->b_list);
				2011	}
				2012	__xfs_buf_submit(bp, false);
				2013	}
				2014	blk_finish_plug(&plug);
				2015
				2016	return pinned;
				2017	}
				2018
				2019	/*
				2020	* Write out a buffer list asynchronously.
				2021	*
				2022	* This will take the @buffer_list, write all non-locked and non-pinned buffers
				2023	* out and not wait for I/O completion on any of the buffers. This interface
				2024	* is only safely useable for callers that can track I/O completion by higher
				2025	* level means, e.g. AIL pushing as the @buffer_list is consumed in this
				2026	* function.
				2027	*
				2028	* Note: this function will skip buffers it would block on, and in doing so
				2029	* leaves them on @buffer_list so they can be retried on a later pass. As such,
				2030	* it is up to the caller to ensure that the buffer list is fully submitted or
				2031	* cancelled appropriately when they are finished with the list. Failure to
				2032	* cancel or resubmit the list until it is empty will result in leaked buffers
				2033	* at unmount time.
				2034	*/
				2035	int
				2036	xfs_buf_delwri_submit_nowait(
				2037	struct list_head *buffer_list)
				2038	{
				2039	return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
				2040	}
				2041
				2042	/*
				2043	* Write out a buffer list synchronously.
				2044	*
				2045	* This will take the @buffer_list, write all buffers out and wait for I/O
				2046	* completion on all of the buffers. @buffer_list is consumed by the function,
				2047	* so callers must have some other way of tracking buffers if they require such
				2048	* functionality.
				2049	*/
				2050	int
				2051	xfs_buf_delwri_submit(
				2052	struct list_head *buffer_list)
				2053	{
				2054	LIST_HEAD (wait_list);
				2055	int error = 0, error2;
				2056	struct xfs_buf *bp;
				2057
				2058	xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
				2059
				2060	/* Wait for IO to complete. */
				2061	while (!list_empty(&wait_list)) {
				2062	bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
				2063
				2064	list_del_init(&bp->b_list);
				2065
				2066	/*
				2067	* Wait on the locked buffer, check for errors and unlock and
				2068	* release the delwri queue reference.
				2069	*/
				2070	error2 = xfs_buf_iowait(bp);
				2071	xfs_buf_relse(bp);
				2072	if (!error)
				2073	error = error2;
				2074	}
				2075
				2076	return error;
				2077	}
				2078
				2079	/*
				2080	* Push a single buffer on a delwri queue.
				2081	*
				2082	* The purpose of this function is to submit a single buffer of a delwri queue
				2083	* and return with the buffer still on the original queue. The waiting delwri
				2084	* buffer submission infrastructure guarantees transfer of the delwri queue
				2085	* buffer reference to a temporary wait list. We reuse this infrastructure to
				2086	* transfer the buffer back to the original queue.
				2087	*
				2088	* Note the buffer transitions from the queued state, to the submitted and wait
				2089	* listed state and back to the queued state during this call. The buffer
				2090	* locking and queue management logic between _delwri_pushbuf() and
				2091	* _delwri_queue() guarantee that the buffer cannot be queued to another list
				2092	* before returning.
				2093	*/
				2094	int
				2095	xfs_buf_delwri_pushbuf(
				2096	struct xfs_buf *bp,
				2097	struct list_head *buffer_list)
				2098	{
				2099	LIST_HEAD (submit_list);
				2100	int error;
				2101
				2102	ASSERT(bp->b_flags & _XBF_DELWRI_Q);
				2103
				2104	trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
				2105
				2106	/*
				2107	* Isolate the buffer to a new local list so we can submit it for I/O
				2108	* independently from the rest of the original list.
				2109	*/
				2110	xfs_buf_lock(bp);
				2111	list_move(&bp->b_list, &submit_list);
				2112	xfs_buf_unlock(bp);
				2113
				2114	/*
				2115	* Delwri submission clears the DELWRI_Q buffer flag and returns with
				2116	* the buffer on the wait list with the original reference. Rather than
				2117	* bounce the buffer from a local wait list back to the original list
				2118	* after I/O completion, reuse the original list as the wait list.
				2119	*/
				2120	xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
				2121
				2122	/*
				2123	* The buffer is now locked, under I/O and wait listed on the original
				2124	* delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
				2125	* return with the buffer unlocked and on the original queue.
				2126	*/
				2127	error = xfs_buf_iowait(bp);
				2128	bp->b_flags \|= _XBF_DELWRI_Q;
				2129	xfs_buf_unlock(bp);
				2130
				2131	return error;
				2132	}
				2133
				2134	int __init
				2135	xfs_buf_init(void)
				2136	{
				2137	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
				2138	KM_ZONE_HWALIGN, NULL);
				2139	if (!xfs_buf_zone)
				2140	goto out;
				2141
				2142	return 0;
				2143
				2144	out:
				2145	return -ENOMEM;
				2146	}
				2147
				2148	void
				2149	xfs_buf_terminate(void)
				2150	{
				2151	kmem_zone_destroy(xfs_buf_zone);
				2152	}
				2153
				2154	void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
				2155	{
				2156	/*
				2157	* Set the lru reference count to 0 based on the error injection tag.
				2158	* This allows userspace to disrupt buffer caching for debug/testing
				2159	* purposes.
				2160	*/
				2161	if (XFS_TEST_ERROR(false, bp->b_target->bt_mount,
				2162	XFS_ERRTAG_BUF_LRU_REF))
				2163	lru_ref = 0;
				2164
				2165	atomic_set(&bp->b_lru_ref, lru_ref);
				2166	}