/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License version 2.
 */

#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/iomap.h>

#include "gfs2.h"
#include "incore.h"
#include "bmap.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
#include "log.h"
#include "super.h"
#include "trans.h"
#include "dir.h"
#include "util.h"
#include "aops.h"
#include "trace_gfs2.h"

/* This doesn't need to be that large: the maximum number of 64 bit
 * pointers in a 4k block is 512, so __u16 is fine for that. It
 * saves stack space to keep it small.
 */
struct metapath {
	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
	__u16 mp_list[GFS2_MAX_META_HEIGHT];
	int mp_fheight; /* find_metapath height */
	int mp_aheight; /* actual height (lookup height) */
};

static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);

/**
 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
 * @ip: the inode
 * @dibh: the dinode buffer
 * @block: the block number that was allocated
 * @page: The (optional) page. This is looked up if @page is NULL
 *
 * Returns: errno
 */

static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
			       u64 block, struct page *page)
{
	struct inode *inode = &ip->i_inode;
	struct buffer_head *bh;
	int release = 0;

	if (!page || page->index) {
		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
		if (!page)
			return -ENOMEM;
		release = 1;
	}

	if (!PageUptodate(page)) {
		void *kaddr = kmap(page);
		u64 dsize = i_size_read(inode);

		if (dsize > gfs2_max_stuffed_size(ip))
			dsize = gfs2_max_stuffed_size(ip);

		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
		kunmap(page);

		SetPageUptodate(page);
	}

	if (!page_has_buffers(page))
		create_empty_buffers(page, BIT(inode->i_blkbits),
				     BIT(BH_Uptodate));

	bh = page_buffers(page);

	if (!buffer_mapped(bh))
		map_bh(bh, inode->i_sb, block);

	set_buffer_uptodate(bh);
	if (gfs2_is_jdata(ip))
		gfs2_trans_add_data(ip->i_gl, bh);
	else {
		mark_buffer_dirty(bh);
		gfs2_ordered_add_inode(ip);
	}

	if (release) {
		unlock_page(page);
		put_page(page);
	}

	return 0;
}

/**
 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 * @ip: The GFS2 inode to unstuff
 * @page: The (optional) page. This is looked up if the @page is NULL
 *
 * This routine unstuffs a dinode and returns it to a "normal" state such
 * that the height can be grown in the traditional way.
 *
 * Returns: errno
 */

int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
{
	struct buffer_head *bh, *dibh;
	struct gfs2_dinode *di;
	u64 block = 0;
	int isdir = gfs2_is_dir(ip);
	int error;

	down_write(&ip->i_rw_mutex);

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	if (i_size_read(&ip->i_inode)) {
		/* Get a free block, fill it with the stuffed data,
		   and write it out to disk */

		unsigned int n = 1;
		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
		if (error)
			goto out_brelse;
		if (isdir) {
			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
			error = gfs2_dir_get_new_buffer(ip, block, &bh);
			if (error)
				goto out_brelse;
			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
					      dibh, sizeof(struct gfs2_dinode));
			brelse(bh);
		} else {
			error = gfs2_unstuffer_page(ip, dibh, block, page);
			if (error)
				goto out_brelse;
		}
	}

	/* Set up the pointer to the new block */

	gfs2_trans_add_meta(ip->i_gl, dibh);
	di = (struct gfs2_dinode *)dibh->b_data;
	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));

	if (i_size_read(&ip->i_inode)) {
		*(__be64 *)(di + 1) = cpu_to_be64(block);
		gfs2_add_inode_blocks(&ip->i_inode, 1);
		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
	}

	ip->i_height = 1;
	di->di_height = cpu_to_be16(1);

out_brelse:
	brelse(dibh);
out:
	up_write(&ip->i_rw_mutex);
	return error;
}


/**
 * find_metapath - Find path through the metadata tree
 * @sdp: The superblock
 * @block: The disk block to look up
 * @mp: The metapath to return the result in
 * @height: The pre-calculated height of the metadata tree
 *
 * This routine returns a struct metapath structure that defines a path
 * through the metadata of inode "ip" to get to block "block".
 *
 * Example:
 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
 * filesystem with a blocksize of 4096.
 *
 * find_metapath() would return a struct metapath structure set to:
 * mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 *
 * That means that in order to get to the block containing the byte at
 * offset 101342453, we would load the indirect block pointed to by pointer
 * 0 in the dinode. We would then load the indirect block pointed to by
 * pointer 48 in that indirect block. We would then load the data block
 * pointed to by pointer 165 in that indirect block.
 *
 *              ----------------------------------------
 *              | Dinode |                             |
 *              |        |                            4|
 *              |        |0 1 2 3 4 5                 9|
 *              |        |                            6|
 *              ----------------------------------------
 *                       |
 *                       |
 *                       V
 *              ----------------------------------------
 *              | Indirect Block                       |
 *              |                                     5|
 *              |            4 4 4 4 4 5 5            1|
 *              |0           5 6 7 8 9 0 1            2|
 *              ----------------------------------------
 *                                 |
 *                                 |
 *                                 V
 *              ----------------------------------------
 *              | Indirect Block                       |
 *              |                 1 1 1 1 1           5|
 *              |                 6 6 6 6 6           1|
 *              |0                3 4 5 6 7           2|
 *              ----------------------------------------
 *                                           |
 *                                           |
 *                                           V
 *              ----------------------------------------
 *              | Data block containing offset         |
 *              |            101342453                 |
 *              |                                      |
 *              |                                      |
 *              ----------------------------------------
 *
 */

static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
			  struct metapath *mp, unsigned int height)
{
	unsigned int i;

	mp->mp_fheight = height;
	for (i = height; i--;)
		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
}
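
/*
 * Worked example of the arithmetic above, assuming a 4096-byte block
 * size (sd_inptrs == 512): byte offset 101342453 lies in logical block
 * 101342453 >> 12 == 24741, and
 *
 *	24741 == (0 * 512 + 48) * 512 + 165
 *
 * so the do_div() loop peels off mp_list[2] = 165, then mp_list[1] = 48,
 * then mp_list[0] = 0, matching the diagram.
 */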

static inline unsigned int metapath_branch_start(const struct metapath *mp)
{
	if (mp->mp_list[0] == 0)
		return 2;
	return 1;
}

/**
 * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 */
static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
{
	struct buffer_head *bh = mp->mp_bh[height];
	if (height == 0)
		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
}

/**
 * metapointer - Return pointer to start of metadata in a buffer
 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 *
 * Return a pointer to the block number of the next height of the metadata
 * tree given a buffer containing the pointer to the current height of the
 * metadata tree.
 */

static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
{
	__be64 *p = metaptr1(height, mp);
	return p + mp->mp_list[height];
}

static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
{
	const struct buffer_head *bh = mp->mp_bh[height];
	return (const __be64 *)(bh->b_data + bh->b_size);
}

static void clone_metapath(struct metapath *clone, struct metapath *mp)
{
	unsigned int hgt;

	*clone = *mp;
	for (hgt = 0; hgt < mp->mp_aheight; hgt++)
		get_bh(clone->mp_bh[hgt]);
}

static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
{
	const __be64 *t;

	for (t = start; t < end; t++) {
		struct buffer_head *rabh;

		if (!*t)
			continue;

		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
		if (trylock_buffer(rabh)) {
			if (!buffer_uptodate(rabh)) {
				rabh->b_end_io = end_buffer_read_sync;
				submit_bh(REQ_OP_READ,
					  REQ_RAHEAD | REQ_META | REQ_PRIO,
					  rabh);
				continue;
			}
			unlock_buffer(rabh);
		}
		brelse(rabh);
	}
}

static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
			     unsigned int x, unsigned int h)
{
	for (; x < h; x++) {
		__be64 *ptr = metapointer(x, mp);
		u64 dblock = be64_to_cpu(*ptr);
		int ret;

		if (!dblock)
			break;
		ret = gfs2_meta_indirect_buffer(ip, x + 1, dblock, &mp->mp_bh[x + 1]);
		if (ret)
			return ret;
	}
	mp->mp_aheight = x + 1;
	return 0;
}

/**
 * lookup_metapath - Walk the metadata tree to a specific point
 * @ip: The inode
 * @mp: The metapath
 *
 * Assumes that the inode's buffer has already been looked up and
 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 * by find_metapath().
 *
 * If this function encounters part of the tree which has not been
 * allocated, it returns the current height of the tree at the point
 * at which it found the unallocated block. Blocks which are found are
 * added to the mp->mp_bh[] list.
 *
 * Returns: error
 */

static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
{
	return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
}

/**
 * fillup_metapath - fill up buffers for the metadata path to a specific height
 * @ip: The inode
 * @mp: The metapath
 * @h: The height to which it should be mapped
 *
 * Similar to lookup_metapath, but does lookups for a range of heights
 *
 * Returns: error or the number of buffers filled
 */

static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
{
	unsigned int x = 0;
	int ret;

	if (h) {
		/* find the first buffer we need to look up. */
		for (x = h - 1; x > 0; x--) {
			if (mp->mp_bh[x])
				break;
		}
	}
	ret = __fillup_metapath(ip, mp, x, h);
	if (ret)
		return ret;
	return mp->mp_aheight - x - 1;
}

static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
{
	sector_t factor = 1, block = 0;
	int hgt;

	for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
		if (hgt < mp->mp_aheight)
			block += mp->mp_list[hgt] * factor;
		factor *= sdp->sd_inptrs;
	}
	return block;
}
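
/*
 * metapath_to_block() is the inverse of find_metapath(): sticking with
 * the example above (sd_inptrs == 512), mp_list = { 0, 48, 165 } yields
 *
 *	0 * 512 * 512 + 48 * 512 + 165 == 24741
 */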

static void release_metapath(struct metapath *mp)
{
	int i;

	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
		if (mp->mp_bh[i] == NULL)
			break;
		brelse(mp->mp_bh[i]);
		mp->mp_bh[i] = NULL;
	}
}

/**
 * gfs2_extent_length - Returns length of an extent of blocks
 * @bh: The metadata block
 * @ptr: Current position in @bh
 * @limit: Max extent length to return
 * @eob: Set to 1 if we hit "end of block"
 *
 * Returns: The length of the extent (minimum of one block)
 */

static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
{
	const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
	const __be64 *first = ptr;
	u64 d = be64_to_cpu(*ptr);

	*eob = 0;
	do {
		ptr++;
		if (ptr >= end)
			break;
		d++;
	} while(be64_to_cpu(*ptr) == d);
	if (ptr >= end)
		*eob = 1;
	return ptr - first;
}
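
/*
 * Example: if @ptr points at the on-disk pointer sequence 1000, 1001,
 * 1002, 1005, ..., the loop above stops when 1005 != 1003 and the
 * extent length returned is 3: only the first three block numbers are
 * consecutive on disk.
 */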

enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE };

/*
 * gfs2_metadata_walker - walk an indirect block
 * @mp: Metapath to indirect block
 * @ptrs: Number of pointers to look at
 *
 * When returning WALK_FOLLOW, the walker must update @mp to point at the right
 * indirect block to follow.
 */
typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
						   unsigned int ptrs);

/*
 * gfs2_walk_metadata - walk a tree of indirect blocks
 * @inode: The inode
 * @mp: Starting point of walk
 * @max_len: Maximum number of blocks to walk
 * @walker: Called during the walk
 *
 * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
 * past the end of metadata, and a negative error code otherwise.
 */

static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
			      u64 max_len, gfs2_metadata_walker walker)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	u64 factor = 1;
	unsigned int hgt;
	int ret;

	/*
	 * The walk starts in the lowest allocated indirect block, which may be
	 * before the position indicated by @mp. Adjust @max_len accordingly
	 * to avoid a short walk.
	 */
	for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
		max_len += mp->mp_list[hgt] * factor;
		mp->mp_list[hgt] = 0;
		factor *= sdp->sd_inptrs;
	}

	for (;;) {
		u16 start = mp->mp_list[hgt];
		enum walker_status status;
		unsigned int ptrs;
		u64 len;

		/* Walk indirect block. */
		ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
		len = ptrs * factor;
		if (len > max_len)
			ptrs = DIV_ROUND_UP_ULL(max_len, factor);
		status = walker(mp, ptrs);
		switch (status) {
		case WALK_STOP:
			return 1;
		case WALK_FOLLOW:
			BUG_ON(mp->mp_aheight == mp->mp_fheight);
			ptrs = mp->mp_list[hgt] - start;
			len = ptrs * factor;
			break;
		case WALK_CONTINUE:
			break;
		}
		if (len >= max_len)
			break;
		max_len -= len;
		if (status == WALK_FOLLOW)
			goto fill_up_metapath;

lower_metapath:
		/* Decrease height of metapath. */
		brelse(mp->mp_bh[hgt]);
		mp->mp_bh[hgt] = NULL;
		mp->mp_list[hgt] = 0;
		if (!hgt)
			break;
		hgt--;
		factor *= sdp->sd_inptrs;

		/* Advance in metadata tree. */
		(mp->mp_list[hgt])++;
		if (mp->mp_list[hgt] >= sdp->sd_inptrs) {
			if (!hgt)
				break;
			goto lower_metapath;
		}

fill_up_metapath:
		/* Increase height of metapath. */
		ret = fillup_metapath(ip, mp, ip->i_height - 1);
		if (ret < 0)
			return ret;
		hgt += ret;
		for (; ret; ret--)
			do_div(factor, sdp->sd_inptrs);
		mp->mp_aheight = hgt + 1;
	}
	return 0;
}

static enum walker_status gfs2_hole_walker(struct metapath *mp,
					   unsigned int ptrs)
{
	const __be64 *start, *ptr, *end;
	unsigned int hgt;

	hgt = mp->mp_aheight - 1;
	start = metapointer(hgt, mp);
	end = start + ptrs;

	for (ptr = start; ptr < end; ptr++) {
		if (*ptr) {
			mp->mp_list[hgt] += ptr - start;
			if (mp->mp_aheight == mp->mp_fheight)
				return WALK_STOP;
			return WALK_FOLLOW;
		}
	}
	return WALK_CONTINUE;
}

/**
 * gfs2_hole_size - figure out the size of a hole
 * @inode: The inode
 * @lblock: The logical starting block number
 * @len: How far to look (in blocks)
 * @mp: The metapath at lblock
 * @iomap: The iomap to store the hole size in
 *
 * This function modifies @mp.
 *
 * Returns: errno on error
 */
static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
			  struct metapath *mp, struct iomap *iomap)
{
	struct metapath clone;
	u64 hole_size;
	int ret;

	clone_metapath(&clone, mp);
	ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
	if (ret < 0)
		goto out;

	if (ret == 1)
		hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
	else
		hole_size = len;
	iomap->length = hole_size << inode->i_blkbits;
	ret = 0;

out:
	release_metapath(&clone);
	return ret;
}

static inline __be64 *gfs2_indirect_init(struct metapath *mp,
					 struct gfs2_glock *gl, unsigned int i,
					 unsigned offset, u64 bn)
{
	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
				 ((i > 1) ? sizeof(struct gfs2_meta_header) :
					    sizeof(struct gfs2_dinode)));
	BUG_ON(i < 1);
	BUG_ON(mp->mp_bh[i] != NULL);
	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
	ptr += offset;
	*ptr = cpu_to_be64(bn);
	return ptr;
}

enum alloc_state {
	ALLOC_DATA = 0,
	ALLOC_GROW_DEPTH = 1,
	ALLOC_GROW_HEIGHT = 2,
	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
};
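
/*
 * The state machine in gfs2_iomap_alloc() below only ever moves towards
 * ALLOC_DATA:
 *
 *	ALLOC_GROW_HEIGHT -> ALLOC_GROW_DEPTH -> ALLOC_DATA
 *
 * starting at whichever state matches the difference between the height
 * the metapath calls for (mp_fheight) and what is already allocated
 * (mp_aheight).
 */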

/**
 * gfs2_iomap_alloc - Build a metadata tree of the requested height
 * @inode: The GFS2 inode
 * @iomap: The iomap structure
 * @flags: iomap flags
 * @mp: The metapath, with proper height information calculated
 *
 * In this routine we may have to alloc:
 *   i) Indirect blocks to grow the metadata tree height
 *  ii) Indirect blocks to fill in lower part of the metadata tree
 * iii) Data blocks
 *
 * This function is called after gfs2_iomap_get, which works out the
 * total number of blocks which we need via gfs2_alloc_size.
 *
 * We then do the actual allocation, asking for an extent at a time (if
 * enough contiguous free blocks are available, there will only be one
 * allocation request per call) and use the state machine to initialise
 * the blocks in order.
 *
 * Right now, this function will allocate at most one indirect block
 * worth of data -- with a default block size of 4K, that's slightly
 * less than 2M. If this limitation is ever removed to allow huge
 * allocations, we would probably still want to limit the iomap size we
 * return to avoid stalling other tasks during huge writes; the next
 * iomap iteration would then find the blocks already allocated.
 *
 * Returns: errno on error
 */

static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
			    unsigned flags, struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct buffer_head *dibh = mp->mp_bh[0];
	u64 bn;
	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
	size_t dblks = iomap->length >> inode->i_blkbits;
	const unsigned end_of_metadata = mp->mp_fheight - 1;
	int ret;
	enum alloc_state state;
	__be64 *ptr;
	__be64 zero_bn = 0;

	BUG_ON(mp->mp_aheight < 1);
	BUG_ON(dibh == NULL);
	BUG_ON(dblks < 1);

	gfs2_trans_add_meta(ip->i_gl, dibh);

	down_write(&ip->i_rw_mutex);

	if (mp->mp_fheight == mp->mp_aheight) {
		/* Bottom indirect block exists */
		state = ALLOC_DATA;
	} else {
		/* Need to allocate indirect blocks */
		if (mp->mp_fheight == ip->i_height) {
			/* Writing into existing tree, extend tree down */
			iblks = mp->mp_fheight - mp->mp_aheight;
			state = ALLOC_GROW_DEPTH;
		} else {
			/* Building up tree height */
			state = ALLOC_GROW_HEIGHT;
			iblks = mp->mp_fheight - ip->i_height;
			branch_start = metapath_branch_start(mp);
			iblks += (mp->mp_fheight - branch_start);
		}
	}

	/* start of the second part of the function (state machine) */

	blks = dblks + iblks;
	i = mp->mp_aheight;
	do {
		n = blks - alloced;
		ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
		if (ret)
			goto out;
		alloced += n;
		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
			gfs2_trans_add_unrevoke(sdp, bn, n);
		switch (state) {
		/* Growing height of tree */
		case ALLOC_GROW_HEIGHT:
			if (i == 1) {
				ptr = (__be64 *)(dibh->b_data +
						 sizeof(struct gfs2_dinode));
				zero_bn = *ptr;
			}
			for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
			     i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
			if (i - 1 == mp->mp_fheight - ip->i_height) {
				i--;
				gfs2_buffer_copy_tail(mp->mp_bh[i],
						sizeof(struct gfs2_meta_header),
						dibh, sizeof(struct gfs2_dinode));
				gfs2_buffer_clear_tail(dibh,
						sizeof(struct gfs2_dinode) +
						sizeof(__be64));
				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
					sizeof(struct gfs2_meta_header));
				*ptr = zero_bn;
				state = ALLOC_GROW_DEPTH;
				for (i = branch_start; i < mp->mp_fheight; i++) {
					if (mp->mp_bh[i] == NULL)
						break;
					brelse(mp->mp_bh[i]);
					mp->mp_bh[i] = NULL;
				}
				i = branch_start;
			}
			if (n == 0)
				break;
		/* Branching from existing tree */
		case ALLOC_GROW_DEPTH:
			if (i > 1 && i < mp->mp_fheight)
				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
			for (; i < mp->mp_fheight && n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i,
						   mp->mp_list[i-1], bn++);
			if (i == mp->mp_fheight)
				state = ALLOC_DATA;
			if (n == 0)
				break;
		/* Tree complete, adding data blocks */
		case ALLOC_DATA:
			BUG_ON(n > dblks);
			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
			dblks = n;
			ptr = metapointer(end_of_metadata, mp);
			iomap->addr = bn << inode->i_blkbits;
			iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
			while (n-- > 0)
				*ptr++ = cpu_to_be64(bn++);
			break;
		}
	} while (iomap->addr == IOMAP_NULL_ADDR);

	iomap->type = IOMAP_MAPPED;
	iomap->length = (u64)dblks << inode->i_blkbits;
	ip->i_height = mp->mp_fheight;
	gfs2_add_inode_blocks(&ip->i_inode, alloced);
	gfs2_dinode_out(ip, dibh->b_data);
out:
	up_write(&ip->i_rw_mutex);
	return ret;
}

#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE

/**
 * gfs2_alloc_size - Compute the maximum allocation size
 * @inode: The inode
 * @mp: The metapath
 * @size: Requested size in blocks
 *
 * Compute the maximum size of the next allocation at @mp.
 *
 * Returns: size in blocks
 */
static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	const __be64 *first, *ptr, *end;

	/*
	 * For writes to stuffed files, this function is called twice via
	 * gfs2_iomap_get, before and after unstuffing. The size we return the
	 * first time needs to be large enough to get the reservation and
	 * allocation sizes right. The size we return the second time must
	 * be exact or else gfs2_iomap_alloc won't do the right thing.
	 */

	if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
		unsigned int maxsize = mp->mp_fheight > 1 ?
			sdp->sd_inptrs : sdp->sd_diptrs;
		maxsize -= mp->mp_list[mp->mp_fheight - 1];
		if (size > maxsize)
			size = maxsize;
		return size;
	}

	first = metapointer(ip->i_height - 1, mp);
	end = metaend(ip->i_height - 1, mp);
	if (end - first > size)
		end = first + size;
	for (ptr = first; ptr < end; ptr++) {
		if (*ptr)
			break;
	}
	return ptr - first;
}
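
/*
 * Example for the clipped case above: when writing into a hole with
 * mp_list[mp_fheight - 1] == 500 and sd_inptrs == 512, at most
 * 512 - 500 == 12 blocks fit under the current bottom indirect block,
 * so any larger @size is clipped to 12.
 */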

/**
 * gfs2_iomap_get - Map blocks from an inode to disk blocks
 * @inode: The inode
 * @pos: Starting position in bytes
 * @length: Length to map, in bytes
 * @flags: iomap flags
 * @iomap: The iomap structure
 * @mp: The metapath
 *
 * Returns: errno
 */
static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
			  unsigned flags, struct iomap *iomap,
			  struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	loff_t size = i_size_read(inode);
	__be64 *ptr;
	sector_t lblock;
	sector_t lblock_stop;
	int ret;
	int eob;
	u64 len;
	struct buffer_head *dibh = NULL, *bh;
	u8 height;

	if (!length)
		return -EINVAL;

	down_read(&ip->i_rw_mutex);

	ret = gfs2_meta_inode_buffer(ip, &dibh);
	if (ret)
		goto unlock;
	mp->mp_bh[0] = dibh;

	if (gfs2_is_stuffed(ip)) {
		if (flags & IOMAP_WRITE) {
			loff_t max_size = gfs2_max_stuffed_size(ip);

			if (pos + length > max_size)
				goto unstuff;
			iomap->length = max_size;
		} else {
			if (pos >= size) {
				if (flags & IOMAP_REPORT) {
					ret = -ENOENT;
					goto unlock;
				} else {
					/* report a hole */
					iomap->offset = pos;
					iomap->length = length;
					goto do_alloc;
				}
			}
			iomap->length = size;
		}
		iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
			      sizeof(struct gfs2_dinode);
		iomap->type = IOMAP_INLINE;
		iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
		goto out;
	}

unstuff:
	lblock = pos >> inode->i_blkbits;
	iomap->offset = lblock << inode->i_blkbits;
	lblock_stop = (pos + length - 1) >> inode->i_blkbits;
	len = lblock_stop - lblock + 1;
	iomap->length = len << inode->i_blkbits;

	height = ip->i_height;
	while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
		height++;
	find_metapath(sdp, lblock, mp, height);
	if (height > ip->i_height || gfs2_is_stuffed(ip))
		goto do_alloc;

	ret = lookup_metapath(ip, mp);
	if (ret)
		goto unlock;

	if (mp->mp_aheight != ip->i_height)
		goto do_alloc;

	ptr = metapointer(ip->i_height - 1, mp);
	if (*ptr == 0)
		goto do_alloc;

	bh = mp->mp_bh[ip->i_height - 1];
	len = gfs2_extent_length(bh, ptr, len, &eob);

	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
	iomap->length = len << inode->i_blkbits;
	iomap->type = IOMAP_MAPPED;
	iomap->flags |= IOMAP_F_MERGED;
	if (eob)
		iomap->flags |= IOMAP_F_GFS2_BOUNDARY;

out:
	iomap->bdev = inode->i_sb->s_bdev;
unlock:
	up_read(&ip->i_rw_mutex);
	return ret;

do_alloc:
	iomap->addr = IOMAP_NULL_ADDR;
	iomap->type = IOMAP_HOLE;
	if (flags & IOMAP_REPORT) {
		if (pos >= size)
			ret = -ENOENT;
		else if (height == ip->i_height)
			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
		else
			iomap->length = size - pos;
	} else if (flags & IOMAP_WRITE) {
		u64 alloc_size;

		if (flags & IOMAP_DIRECT)
			goto out;	/* (see gfs2_file_direct_write) */

		len = gfs2_alloc_size(inode, mp, len);
		alloc_size = len << inode->i_blkbits;
		if (alloc_size < iomap->length)
			iomap->length = alloc_size;
	} else {
		if (pos < size && height == ip->i_height)
			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
	}
	goto out;
}

static int gfs2_write_lock(struct inode *inode)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	int error;

	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
	error = gfs2_glock_nq(&ip->i_gh);
	if (error)
		goto out_uninit;
	if (&ip->i_inode == sdp->sd_rindex) {
		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);

		error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
					   GL_NOCACHE, &m_ip->i_gh);
		if (error)
			goto out_unlock;
	}
	return 0;

out_unlock:
	gfs2_glock_dq(&ip->i_gh);
out_uninit:
	gfs2_holder_uninit(&ip->i_gh);
	return error;
}

static void gfs2_write_unlock(struct inode *inode)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);

	if (&ip->i_inode == sdp->sd_rindex) {
		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);

		gfs2_glock_dq_uninit(&m_ip->i_gh);
	}
	gfs2_glock_dq_uninit(&ip->i_gh);
}

static void gfs2_iomap_journaled_page_done(struct inode *inode, loff_t pos,
					   unsigned copied, struct page *page,
					   struct iomap *iomap)
{
	struct gfs2_inode *ip = GFS2_I(inode);

	gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
}

static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
				  loff_t length, unsigned flags,
				  struct iomap *iomap,
				  struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
	bool unstuff, alloc_required;
	int ret;

	ret = gfs2_write_lock(inode);
	if (ret)
		return ret;

	unstuff = gfs2_is_stuffed(ip) &&
		  pos + length > gfs2_max_stuffed_size(ip);

	ret = gfs2_iomap_get(inode, pos, length, flags, iomap, mp);
	if (ret)
		goto out_unlock;

	alloc_required = unstuff || iomap->type == IOMAP_HOLE;

	if (alloc_required || gfs2_is_jdata(ip))
		gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
				       &ind_blocks);

	if (alloc_required) {
		struct gfs2_alloc_parms ap = {
			.target = data_blocks + ind_blocks
		};

		ret = gfs2_quota_lock_check(ip, &ap);
		if (ret)
			goto out_unlock;

		ret = gfs2_inplace_reserve(ip, &ap);
		if (ret)
			goto out_qunlock;
	}

	rblocks = RES_DINODE + ind_blocks;
	if (gfs2_is_jdata(ip))
		rblocks += data_blocks;
	if (ind_blocks || data_blocks)
		rblocks += RES_STATFS + RES_QUOTA;
	if (inode == sdp->sd_rindex)
		rblocks += 2 * RES_STATFS;
	if (alloc_required)
		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);

	ret = gfs2_trans_begin(sdp, rblocks, iomap->length >> inode->i_blkbits);
	if (ret)
		goto out_trans_fail;

	if (unstuff) {
		ret = gfs2_unstuff_dinode(ip, NULL);
		if (ret)
			goto out_trans_end;
		release_metapath(mp);
		ret = gfs2_iomap_get(inode, iomap->offset, iomap->length,
				     flags, iomap, mp);
		if (ret)
			goto out_trans_end;
	}

	if (iomap->type == IOMAP_HOLE) {
		ret = gfs2_iomap_alloc(inode, iomap, flags, mp);
		if (ret) {
			gfs2_trans_end(sdp);
			gfs2_inplace_release(ip);
			punch_hole(ip, iomap->offset, iomap->length);
			goto out_qunlock;
		}
	}
	if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip))
		iomap->page_done = gfs2_iomap_journaled_page_done;
	return 0;

out_trans_end:
	gfs2_trans_end(sdp);
out_trans_fail:
	if (alloc_required)
		gfs2_inplace_release(ip);
out_qunlock:
	if (alloc_required)
		gfs2_quota_unlock(ip);
out_unlock:
	gfs2_write_unlock(inode);
	return ret;
}

static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
			    unsigned flags, struct iomap *iomap)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct metapath mp = { .mp_aheight = 1, };
	int ret;

	iomap->flags |= IOMAP_F_BUFFER_HEAD;

	trace_gfs2_iomap_start(ip, pos, length, flags);
	if ((flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)) {
		ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
	} else {
		ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);

		/*
		 * Silently fall back to buffered I/O for stuffed files or if
		 * we've hit a hole (see gfs2_file_direct_write).
		 */
		if ((flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT) &&
		    iomap->type != IOMAP_MAPPED)
			ret = -ENOTBLK;
	}
	if (!ret) {
		get_bh(mp.mp_bh[0]);
		iomap->private = mp.mp_bh[0];
	}
	release_metapath(&mp);
	trace_gfs2_iomap_end(ip, iomap, ret);
	return ret;
}

static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
			  ssize_t written, unsigned flags, struct iomap *iomap)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct gfs2_trans *tr = current->journal_info;
	struct buffer_head *dibh = iomap->private;

	if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE)
		goto out;

	if (iomap->type != IOMAP_INLINE) {
		gfs2_ordered_add_inode(ip);

		if (tr->tr_num_buf_new)
			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
		else
			gfs2_trans_add_meta(ip->i_gl, dibh);
	}

	if (inode == sdp->sd_rindex) {
		adjust_fs_space(inode);
		sdp->sd_rindex_uptodate = 0;
	}

	gfs2_trans_end(sdp);
	gfs2_inplace_release(ip);

	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
		/* Deallocate blocks that were just allocated. */
		loff_t blockmask = i_blocksize(inode) - 1;
		loff_t end = (pos + length) & ~blockmask;

		pos = (pos + written + blockmask) & ~blockmask;
		if (pos < end) {
			truncate_pagecache_range(inode, pos, end - 1);
			punch_hole(ip, pos, end - pos);
		}
	}

	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
		gfs2_quota_unlock(ip);
	gfs2_write_unlock(inode);

out:
	if (dibh)
		brelse(dibh);
	return 0;
}

const struct iomap_ops gfs2_iomap_ops = {
	.iomap_begin = gfs2_iomap_begin,
	.iomap_end = gfs2_iomap_end,
};
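
/*
 * Roughly how these operations get driven, as a minimal sketch (the call
 * below is the generic iomap entry point; the real call sites live in the
 * read/write paths, e.g. file.c):
 *
 *	written = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
 *
 * The iomap core calls .iomap_begin to map (and, for buffered writes,
 * allocate) one extent at a time, performs the I/O against that extent,
 * and then calls .iomap_end, which finishes the transaction started in
 * gfs2_iomap_begin_write.
 */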

/**
 * gfs2_block_map - Map one or more blocks of an inode to a disk block
 * @inode: The inode
 * @lblock: The logical block number
 * @bh_map: The bh to be mapped
 * @create: True if it's ok to alloc blocks to satisfy the request
 *
 * The size of the requested mapping is defined in bh_map->b_size.
 *
 * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
 * when @lblock is not mapped. Sets buffer_mapped(bh_map) and
 * bh_map->b_size to indicate the size of the mapping when @lblock and
 * successive blocks are mapped, up to the requested size.
 *
 * Sets buffer_boundary() if a read of metadata will be required
 * before the next block can be mapped. Sets buffer_new() if new
 * blocks were allocated.
 *
 * Returns: errno
 */

int gfs2_block_map(struct inode *inode, sector_t lblock,
		   struct buffer_head *bh_map, int create)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	loff_t pos = (loff_t)lblock << inode->i_blkbits;
	loff_t length = bh_map->b_size;
	struct metapath mp = { .mp_aheight = 1, };
	struct iomap iomap = { };
	int ret;

	clear_buffer_mapped(bh_map);
	clear_buffer_new(bh_map);
	clear_buffer_boundary(bh_map);
	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);

	if (create) {
		ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp);
		if (!ret && iomap.type == IOMAP_HOLE)
			ret = gfs2_iomap_alloc(inode, &iomap, IOMAP_WRITE, &mp);
		release_metapath(&mp);
	} else {
		ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp);
		release_metapath(&mp);
	}
	if (ret)
		goto out;

	if (iomap.length > bh_map->b_size) {
		iomap.length = bh_map->b_size;
		iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
	}
	if (iomap.addr != IOMAP_NULL_ADDR)
		map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
	bh_map->b_size = iomap.length;
	if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
		set_buffer_boundary(bh_map);
	if (iomap.flags & IOMAP_F_NEW)
		set_buffer_new(bh_map);

out:
	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
	return ret;
}

/*
 * Deprecated: do not use in new code
 */
int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
{
	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
	int ret;
	int create = *new;

	BUG_ON(!extlen);
	BUG_ON(!dblock);
	BUG_ON(!new);

	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
	ret = gfs2_block_map(inode, lblock, &bh, create);
	*extlen = bh.b_size >> inode->i_blkbits;
	*dblock = bh.b_blocknr;
	if (buffer_new(&bh))
		*new = 1;
	else
		*new = 0;
	return ret;
}

/**
 * gfs2_block_zero_range - Deal with zeroing out data
 *
 * This is partly borrowed from ext3.
 */
static int gfs2_block_zero_range(struct inode *inode, loff_t from,
				 unsigned int length)
{
	struct address_space *mapping = inode->i_mapping;
	struct gfs2_inode *ip = GFS2_I(inode);
	unsigned long index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE-1);
	unsigned blocksize, iblock, pos;
	struct buffer_head *bh;
	struct page *page;
	int err;

	page = find_or_create_page(mapping, index, GFP_NOFS);
	if (!page)
		return 0;

	blocksize = inode->i_sb->s_blocksize;
	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}

	err = 0;

	if (!buffer_mapped(bh)) {
		gfs2_block_map(inode, iblock, bh, 0);
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh))
			goto unlock;
	}

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);

	if (!buffer_uptodate(bh)) {
		err = -EIO;
		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
		wait_on_buffer(bh);
		/* Uhhuh. Read error. Complain and punt. */
		if (!buffer_uptodate(bh))
			goto unlock;
		err = 0;
	}

	if (gfs2_is_jdata(ip))
		gfs2_trans_add_data(ip->i_gl, bh);
	else
		gfs2_ordered_add_inode(ip);

	zero_user(page, offset, length);
	mark_buffer_dirty(bh);
unlock:
	unlock_page(page);
	put_page(page);
	return err;
}

#define GFS2_JTRUNC_REVOKES 8192

/**
 * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
 * @inode: The inode being truncated
 * @oldsize: The original (larger) size
 * @newsize: The new smaller size
 *
 * With jdata files, we have to journal a revoke for each block which is
 * truncated. As a result, we need to split this into separate transactions
 * if the number of pages being truncated gets too large.
 */

static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
{
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
	u64 chunk;
	int error;

	while (oldsize != newsize) {
		struct gfs2_trans *tr;
		unsigned int offs;

		chunk = oldsize - newsize;
		if (chunk > max_chunk)
			chunk = max_chunk;

		offs = oldsize & ~PAGE_MASK;
		if (offs && chunk > PAGE_SIZE)
			chunk = offs + ((chunk - offs) & PAGE_MASK);

		truncate_pagecache(inode, oldsize - chunk);
		oldsize -= chunk;

		tr = current->journal_info;
		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
			continue;

		gfs2_trans_end(sdp);
		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
		if (error)
			return error;
	}

	return 0;
}
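
/*
 * Example: with 4096-byte blocks, max_chunk above is
 * 8192 * 4096 bytes == 32MB, so a journaled truncate from 1GB down to
 * zero runs as 32 chunks, each in its own transaction with at most
 * GFS2_JTRUNC_REVOKES revokes.
 */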

static int trunc_start(struct inode *inode, u64 newsize)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct buffer_head *dibh = NULL;
	int journaled = gfs2_is_jdata(ip);
	u64 oldsize = inode->i_size;
	int error;

	if (journaled)
		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
	else
		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	if (error)
		return error;

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	gfs2_trans_add_meta(ip->i_gl, dibh);

	if (gfs2_is_stuffed(ip)) {
		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
	} else {
		unsigned int blocksize = i_blocksize(inode);
		unsigned int offs = newsize & (blocksize - 1);
		if (offs) {
			error = gfs2_block_zero_range(inode, newsize,
						      blocksize - offs);
			if (error)
				goto out;
		}
		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
	}

	i_size_write(inode, newsize);
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	gfs2_dinode_out(ip, dibh->b_data);

	if (journaled)
		error = gfs2_journaled_truncate(inode, oldsize, newsize);
	else
		truncate_pagecache(inode, newsize);

out:
	brelse(dibh);
	if (current->journal_info)
		gfs2_trans_end(sdp);
	return error;
}

int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length,
			 struct iomap *iomap)
{
	struct metapath mp = { .mp_aheight = 1, };
	int ret;

	ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
	if (!ret && iomap->type == IOMAP_HOLE)
		ret = gfs2_iomap_alloc(inode, iomap, IOMAP_WRITE, &mp);
	release_metapath(&mp);
	return ret;
}

/**
 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
 * @ip: inode
 * @rd_gh: holder of resource group glock
 * @bh: buffer head to sweep
 * @start: starting point in bh
 * @end: end point in bh
 * @meta: true if bh points to metadata (rather than data)
 * @btotal: place to keep count of total blocks freed
 *
 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
 * free, and free them all. However, we do it one rgrp at a time. If this
 * block has references to multiple rgrps, we break it into individual
 * transactions. This allows other processes to use the rgrps while we're
 * focused on a single one, for better concurrency / performance.
 * At every transaction boundary, we rewrite the inode into the journal.
 * That way the bitmaps are kept consistent with the inode and we can recover
 * if we're interrupted by power-outages.
 *
 * Returns: 0, or return code if an error occurred.
 *          *btotal has the total number of blocks freed
 */
static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
			      struct buffer_head *bh, __be64 *start, __be64 *end,
			      bool meta, u32 *btotal)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct gfs2_rgrpd *rgd;
	struct gfs2_trans *tr;
	__be64 *p;
	int blks_outside_rgrp;
	u64 bn, bstart, isize_blks;
	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
	int ret = 0;
	bool buf_in_tr = false; /* buffer was added to transaction */

more_rgrps:
	rgd = NULL;
	if (gfs2_holder_initialized(rd_gh)) {
		rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
		gfs2_assert_withdraw(sdp,
			     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
	}
	blks_outside_rgrp = 0;
	bstart = 0;
	blen = 0;

	for (p = start; p < end; p++) {
		if (!*p)
			continue;
		bn = be64_to_cpu(*p);

		if (rgd) {
			if (!rgrp_contains_block(rgd, bn)) {
				blks_outside_rgrp++;
				continue;
			}
		} else {
			rgd = gfs2_blk2rgrpd(sdp, bn, true);
			if (unlikely(!rgd)) {
				ret = -EIO;
				goto out;
			}
			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
						 0, rd_gh);
			if (ret)
				goto out;

			/* Must be done with the rgrp glock held: */
			if (gfs2_rs_active(&ip->i_res) &&
			    rgd == ip->i_res.rs_rbm.rgd)
				gfs2_rs_deltree(&ip->i_res);
		}

		/* The size of our transactions will be unknown until we
		   actually process all the metadata blocks that relate to
		   the rgrp. So we estimate. We know it can't be more than
		   the dinode's i_blocks and we don't want to exceed the
		   journal flush threshold, sd_log_thresh2. */
		if (current->journal_info == NULL) {
			unsigned int jblocks_rqsted, revokes;

			jblocks_rqsted = rgd->rd_length + RES_DINODE +
				RES_INDIRECT;
			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
				jblocks_rqsted +=
					atomic_read(&sdp->sd_log_thresh2);
			else
				jblocks_rqsted += isize_blks;
			revokes = jblocks_rqsted;
			if (meta)
				revokes += end - start;
			else if (ip->i_depth)
				revokes += sdp->sd_inptrs;
			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
			if (ret)
				goto out_unlock;
			down_write(&ip->i_rw_mutex);
		}
		/* check if we will exceed the transaction blocks requested */
		tr = current->journal_info;
		if (tr->tr_num_buf_new + RES_STATFS +
		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
			/* We set blks_outside_rgrp to ensure the loop will
			   be repeated for the same rgrp, but with a new
			   transaction. */
			blks_outside_rgrp++;
			/* This next part is tricky. If the buffer was added
			   to the transaction, we've already set some block
			   pointers to 0, so we better follow through and free
			   them, or we will introduce corruption (so break).
			   This may be impossible, or at least rare, but I
			   decided to cover the case regardless.

			   If the buffer was not added to the transaction
			   (this call), doing so would exceed our transaction
			   size, so we need to end the transaction and start a
			   new one (so goto). */

			if (buf_in_tr)
				break;
			goto out_unlock;
		}

		gfs2_trans_add_meta(ip->i_gl, bh);
		buf_in_tr = true;
		*p = 0;
		if (bstart + blen == bn) {
			blen++;
			continue;
		}
		if (bstart) {
			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
			(*btotal) += blen;
			gfs2_add_inode_blocks(&ip->i_inode, -blen);
		}
		bstart = bn;
		blen = 1;
	}
	if (bstart) {
		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
		(*btotal) += blen;
		gfs2_add_inode_blocks(&ip->i_inode, -blen);
	}
out_unlock:
	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
					    outside the rgrp we just processed,
					    do it all over again. */
		if (current->journal_info) {
			struct buffer_head *dibh;

			ret = gfs2_meta_inode_buffer(ip, &dibh);
			if (ret)
				goto out;

			/* Every transaction boundary, we rewrite the dinode
			   to keep its di_blocks current in case of failure. */
			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
				current_time(&ip->i_inode);
			gfs2_trans_add_meta(ip->i_gl, dibh);
			gfs2_dinode_out(ip, dibh->b_data);
			brelse(dibh);
			up_write(&ip->i_rw_mutex);
			gfs2_trans_end(sdp);
			buf_in_tr = false;
		}
		gfs2_glock_dq_uninit(rd_gh);
		cond_resched();
		goto more_rgrps;
	}
out:
	return ret;
}

static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
{
	if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
		return false;
	return true;
}

/**
 * find_nonnull_ptr - find a non-null pointer given a metapath and height
 * @mp: starting metapath
 * @h: desired height to search
 *
 * Assumes the metapath is valid (with buffers) out to height h.
 * Returns: true if a non-null pointer was found in the metapath buffer
 *          false if all remaining pointers are NULL in the buffer
 */
static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
			     unsigned int h,
			     __u16 *end_list, unsigned int end_aligned)
{
	struct buffer_head *bh = mp->mp_bh[h];
	__be64 *first, *ptr, *end;

	first = metaptr1(h, mp);
	ptr = first + mp->mp_list[h];
	end = (__be64 *)(bh->b_data + bh->b_size);
	if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
		bool keep_end = h < end_aligned;
		end = first + end_list[h] + keep_end;
	}

	while (ptr < end) {
		if (*ptr) { /* if we have a non-null pointer */
			mp->mp_list[h] = ptr - first;
			h++;
			if (h < GFS2_MAX_META_HEIGHT)
				mp->mp_list[h] = 0;
			return true;
		}
		ptr++;
	}
	return false;
}

enum dealloc_states {
	DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
	DEALLOC_MP_LOWER = 1,   /* lower the metapath strip height */
	DEALLOC_FILL_MP = 2,    /* Fill in the metapath to the given height. */
	DEALLOC_DONE = 3,       /* process complete */
};
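
/*
 * Typical state transitions in punch_hole() below:
 *
 *	DEALLOC_FILL_MP  -> DEALLOC_MP_FULL:  buffers read in up to the
 *					      current strip height
 *	DEALLOC_MP_FULL  -> DEALLOC_MP_LOWER: this position swept; advance
 *	DEALLOC_MP_LOWER -> DEALLOC_FILL_MP:  a non-null pointer was found,
 *					      refill buffers there
 *	any state        -> DEALLOC_DONE:     dinode swept, or error
 */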

static inline void
metapointer_range(struct metapath *mp, int height,
		  __u16 *start_list, unsigned int start_aligned,
		  __u16 *end_list, unsigned int end_aligned,
		  __be64 **start, __be64 **end)
{
	struct buffer_head *bh = mp->mp_bh[height];
	__be64 *first;

	first = metaptr1(height, mp);
	*start = first;
	if (mp_eq_to_hgt(mp, start_list, height)) {
		bool keep_start = height < start_aligned;
		*start = first + start_list[height] + keep_start;
	}
	*end = (__be64 *)(bh->b_data + bh->b_size);
	if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
		bool keep_end = height < end_aligned;
		*end = first + end_list[height] + keep_end;
	}
}

static inline bool walk_done(struct gfs2_sbd *sdp,
			     struct metapath *mp, int height,
			     __u16 *end_list, unsigned int end_aligned)
{
	__u16 end;

	if (end_list) {
		bool keep_end = height < end_aligned;
		if (!mp_eq_to_hgt(mp, end_list, height))
			return false;
		end = end_list[height] + keep_end;
	} else
		end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
	return mp->mp_list[height] >= end;
}

/**
 * punch_hole - deallocate blocks in a file
 * @ip: inode to truncate
 * @offset: the start of the hole
 * @length: the size of the hole (or 0 for truncate)
 *
 * Punch a hole into a file or truncate a file at a given position. This
 * function operates in whole blocks (@offset and @length are rounded
 * accordingly); partially filled blocks must be cleared otherwise.
 *
 * This function works from the bottom up, and from the right to the left. In
 * other words, it strips off the highest layer (data) before stripping any of
 * the metadata. Doing it this way is best in case the operation is interrupted
 * by power failure, etc. The dinode is rewritten in every transaction to
 * guarantee integrity.
 */
static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	u64 maxsize = sdp->sd_heightsize[ip->i_height];
	struct metapath mp = {};
	struct buffer_head *dibh, *bh;
	struct gfs2_holder rd_gh;
	unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
	u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
	__u16 start_list[GFS2_MAX_META_HEIGHT];
	__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
	unsigned int start_aligned, uninitialized_var(end_aligned);
	unsigned int strip_h = ip->i_height - 1;
	u32 btotal = 0;
	int ret, state;
	int mp_h; /* metapath buffers are read in to this height */
	u64 prev_bnr = 0;
	__be64 *start, *end;

	if (offset >= maxsize) {
		/*
		 * The starting point lies beyond the allocated metadata;
		 * there are no blocks to deallocate.
		 */
		return 0;
	}

	/*
	 * The start position of the hole is defined by lblock, start_list, and
	 * start_aligned. The end position of the hole is defined by lend,
	 * end_list, and end_aligned.
	 *
	 * start_aligned and end_aligned define down to which height the start
	 * and end positions are aligned to the metadata tree (i.e., the
	 * position is a multiple of the metadata granularity at the height
	 * above). This determines at which heights additional meta pointers
	 * need to be preserved for the remaining data.
1785 */
1786
1787 if (length) {
1788 u64 end_offset = offset + length;
1789 u64 lend;
1790
1791 /*
1792 * Clip the end at the maximum file size for the given height:
1793 * that's how far the metadata goes; files bigger than that
1794 * will have additional layers of indirection.
1795 */
1796 if (end_offset > maxsize)
1797 end_offset = maxsize;
1798 lend = end_offset >> bsize_shift;
1799
1800 if (lblock >= lend)
1801 return 0;
1802
1803 find_metapath(sdp, lend, &mp, ip->i_height);
1804 end_list = __end_list;
1805 memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1806
1807 for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1808 if (end_list[mp_h])
1809 break;
1810 }
1811 end_aligned = mp_h;
1812 }
1813
1814 find_metapath(sdp, lblock, &mp, ip->i_height);
1815 memcpy(start_list, mp.mp_list, sizeof(start_list));
1816
1817 for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1818 if (start_list[mp_h])
1819 break;
1820 }
1821 start_aligned = mp_h;
1822
1823 ret = gfs2_meta_inode_buffer(ip, &dibh);
1824 if (ret)
1825 return ret;
1826
1827 mp.mp_bh[0] = dibh;
1828 ret = lookup_metapath(ip, &mp);
1829 if (ret)
1830 goto out_metapath;
1831
1832 /* issue read-ahead on metadata */
1833 for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1834 metapointer_range(&mp, mp_h, start_list, start_aligned,
1835 end_list, end_aligned, &start, &end);
1836 gfs2_metapath_ra(ip->i_gl, start, end);
1837 }
1838
1839 if (mp.mp_aheight == ip->i_height)
1840 state = DEALLOC_MP_FULL; /* We have a complete metapath */
1841 else
1842 state = DEALLOC_FILL_MP; /* deal with partial metapath */
1843
1844 ret = gfs2_rindex_update(sdp);
1845 if (ret)
1846 goto out_metapath;
1847
1848 ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1849 if (ret)
1850 goto out_metapath;
1851 gfs2_holder_mark_uninitialized(&rd_gh);
1852
1853 mp_h = strip_h;
1854
	while (state != DEALLOC_DONE) {
		switch (state) {
		/* Truncate a full metapath at the given strip height.
		 * Note that strip_h == mp_h in order to be in this state. */
		case DEALLOC_MP_FULL:
			bh = mp.mp_bh[mp_h];
			gfs2_assert_withdraw(sdp, bh);
			if (gfs2_assert_withdraw(sdp,
						 prev_bnr != bh->b_blocknr)) {
				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
				       sdp->sd_fsname,
				       (unsigned long long)ip->i_no_addr,
				       prev_bnr, ip->i_height, strip_h, mp_h);
			}
			prev_bnr = bh->b_blocknr;

			if (gfs2_metatype_check(sdp, bh,
						(mp_h ? GFS2_METATYPE_IN :
							GFS2_METATYPE_DI))) {
				ret = -EIO;
				goto out;
			}

			/*
			 * Below, passing end_aligned as 0 gives us the
			 * metapointer range excluding the end point: the end
			 * point is the first metapath we must not deallocate!
			 */

			metapointer_range(&mp, mp_h, start_list, start_aligned,
					  end_list, 0 /* end_aligned */,
					  &start, &end);
			ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
						 start, end,
						 mp_h != ip->i_height - 1,
						 &btotal);

			/* If we hit an error or just swept dinode buffer,
			   just exit. */
			if (ret || !mp_h) {
				state = DEALLOC_DONE;
				break;
			}
			state = DEALLOC_MP_LOWER;
			break;

		/* lower the metapath strip height */
		case DEALLOC_MP_LOWER:
			/* We're done with the current buffer, so release it,
			   unless it's the dinode buffer. Then back up to the
			   previous pointer. */
			if (mp_h) {
				brelse(mp.mp_bh[mp_h]);
				mp.mp_bh[mp_h] = NULL;
			}
			/* If we can't get any lower in height, we've stripped
			   off all we can. Next step is to back up and start
			   stripping the previous level of metadata. */
			if (mp_h == 0) {
				strip_h--;
				memcpy(mp.mp_list, start_list, sizeof(start_list));
				mp_h = strip_h;
				state = DEALLOC_FILL_MP;
				break;
			}
			mp.mp_list[mp_h] = 0;
			mp_h--; /* search one metadata height down */
			mp.mp_list[mp_h]++;
			if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
				break;
			/* Here we've found a part of the metapath that is not
			 * allocated. We need to search at that height for the
			 * next non-null pointer. */
			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
				state = DEALLOC_FILL_MP;
				mp_h++;
			}
			/* No more non-null pointers at this height. Back up
			   to the previous height and try again. */
			break; /* loop around in the same state */

		/* Fill the metapath with buffers to the given height. */
		case DEALLOC_FILL_MP:
			/* Fill the buffers out to the current height. */
			ret = fillup_metapath(ip, &mp, mp_h);
			if (ret < 0)
				goto out;

			/* On the first pass, issue read-ahead on metadata. */
			if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
				unsigned int height = mp.mp_aheight - 1;

				/* No read-ahead for data blocks. */
				if (mp.mp_aheight - 1 == strip_h)
					height--;

				for (; height >= mp.mp_aheight - ret; height--) {
					metapointer_range(&mp, height,
							  start_list, start_aligned,
							  end_list, end_aligned,
							  &start, &end);
					gfs2_metapath_ra(ip->i_gl, start, end);
				}
			}

			/* If buffers found for the entire strip height */
			if (mp.mp_aheight - 1 == strip_h) {
				state = DEALLOC_MP_FULL;
				break;
			}
			if (mp.mp_aheight < ip->i_height) /* We have a partial height */
				mp_h = mp.mp_aheight - 1;

			/* If we find a non-null block pointer, crawl a bit
			   higher up in the metapath and try again, otherwise
			   we need to look lower for a new starting point. */
			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
				mp_h++;
			else
				state = DEALLOC_MP_LOWER;
			break;
		}
	}

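	/* sweep_bh_for_rgrps() counted every freed block in btotal; settle
	 * the statfs and quota accounting for all of them in one place.
	 * The sweep may have left a transaction open; if not, open one. */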
	if (btotal) {
		if (current->journal_info == NULL) {
			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
					       RES_QUOTA, 0);
			if (ret)
				goto out;
			down_write(&ip->i_rw_mutex);
		}
		gfs2_statfs_change(sdp, 0, +btotal, 0);
		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
				  ip->i_inode.i_gid);
		ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
		gfs2_trans_add_meta(ip->i_gl, dibh);
		gfs2_dinode_out(ip, dibh->b_data);
		up_write(&ip->i_rw_mutex);
		gfs2_trans_end(sdp);
	}

out:
	if (gfs2_holder_initialized(&rd_gh))
		gfs2_glock_dq_uninit(&rd_gh);
	if (current->journal_info) {
		up_write(&ip->i_rw_mutex);
		gfs2_trans_end(sdp);
		cond_resched();
	}
	gfs2_quota_unhold(ip);
out_metapath:
	release_metapath(&mp);
	return ret;
}

static int trunc_end(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct buffer_head *dibh;
	int error;

	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	if (error)
		return error;

	down_write(&ip->i_rw_mutex);

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

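	/* If the file has shrunk to size zero, reset the metadata tree
	 * height and the allocation goal so the inode can be stuffed
	 * again. */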
	if (!i_size_read(&ip->i_inode)) {
		ip->i_height = 0;
		ip->i_goal = ip->i_no_addr;
		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
		gfs2_ordered_del_inode(ip);
	}
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;

	gfs2_trans_add_meta(ip->i_gl, dibh);
	gfs2_dinode_out(ip, dibh->b_data);
	brelse(dibh);

out:
	up_write(&ip->i_rw_mutex);
	gfs2_trans_end(sdp);
	return error;
}

/**
 * do_shrink - make a file smaller
 * @inode: the inode
 * @newsize: the size to make the file
 *
 * Called with an exclusive lock on @inode. The @newsize must
 * be equal to or smaller than the current inode size.
 *
 * Returns: errno
 */

static int do_shrink(struct inode *inode, u64 newsize)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	int error;

	error = trunc_start(inode, newsize);
	if (error < 0)
		return error;
	if (gfs2_is_stuffed(ip))
		return 0;

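	/* The file is unstuffed, so blocks may remain allocated beyond the
	 * new size.  A length of 0 tells punch_hole() to deallocate
	 * everything from @newsize to the end of the file. */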
	error = punch_hole(ip, newsize, 0);
	if (error == 0)
		error = trunc_end(ip);

	return error;
}

void gfs2_trim_blocks(struct inode *inode)
{
	int ret;

	ret = do_shrink(inode, inode->i_size);
	WARN_ON(ret != 0);
}

/**
 * do_grow - Touch and update inode size
 * @inode: The inode
 * @size: The new size
 *
 * This function updates the timestamps on the inode and
 * may also increase the size of the inode. This function
 * must not be called with @size any smaller than the current
 * inode size.
 *
 * Although it is not strictly required to unstuff files here,
 * earlier versions of GFS2 had a bug in the stuffed file reading
 * code which would result in a buffer overrun if the size was larger
 * than the max stuffed file size. In order to prevent this from
 * occurring, such files are unstuffed, but in other cases we can
 * just update the inode size directly.
 *
 * Returns: 0 on success, or -ve on error
 */

static int do_grow(struct inode *inode, u64 size)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct gfs2_alloc_parms ap = { .target = 1, };
	struct buffer_head *dibh;
	int error;
	int unstuff = 0;

	if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
		error = gfs2_quota_lock_check(ip, &ap);
		if (error)
			return error;

		error = gfs2_inplace_reserve(ip, &ap);
		if (error)
			goto do_grow_qunlock;
		unstuff = 1;
	}

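	/* Reserve journal space for the dinode plus, when unstuffing a
	 * jdata file, the journaled data block, and a quota change unless
	 * quotas are switched off entirely. */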
	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
				 (unstuff &&
				  gfs2_is_jdata(ip) ? RES_JDATA : 0) +
				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
				  0 : RES_QUOTA), 0);
	if (error)
		goto do_grow_release;

	if (unstuff) {
		error = gfs2_unstuff_dinode(ip, NULL);
		if (error)
			goto do_end_trans;
	}

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto do_end_trans;

	i_size_write(inode, size);
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	gfs2_trans_add_meta(ip->i_gl, dibh);
	gfs2_dinode_out(ip, dibh->b_data);
	brelse(dibh);

do_end_trans:
	gfs2_trans_end(sdp);
do_grow_release:
	if (unstuff) {
		gfs2_inplace_release(ip);
do_grow_qunlock:
		gfs2_quota_unlock(ip);
	}
	return error;
}

/**
 * gfs2_setattr_size - make a file a given size
 * @inode: the inode
 * @newsize: the size to make the file
 *
 * The file size can grow, shrink, or stay the same size. This
 * is called holding i_rwsem and an exclusive glock on the inode
 * in question.
 *
 * Returns: errno
 */

int gfs2_setattr_size(struct inode *inode, u64 newsize)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	int ret;

	BUG_ON(!S_ISREG(inode->i_mode));

	ret = inode_newsize_ok(inode, newsize);
	if (ret)
		return ret;

	inode_dio_wait(inode);

	ret = gfs2_rsqa_alloc(ip);
	if (ret)
		goto out;

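	/* Growing to the same size still goes through do_grow() so that the
	 * timestamps are updated; only a strictly smaller size shrinks. */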
	if (newsize >= inode->i_size) {
		ret = do_grow(inode, newsize);
		goto out;
	}

	ret = do_shrink(inode, newsize);
out:
	gfs2_rsqa_delete(ip, NULL);
	return ret;
}

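/* Resume a truncate that was interrupted (the dinode still carries the
 * GFS2_DIF_TRUNC_IN_PROG flag, which trunc_end() clears once the tail
 * blocks have been freed). */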
int gfs2_truncatei_resume(struct gfs2_inode *ip)
{
	int error;
	error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
	if (!error)
		error = trunc_end(ip);
	return error;
}

int gfs2_file_dealloc(struct gfs2_inode *ip)
{
	return punch_hole(ip, 0, 0);
}

/**
 * gfs2_free_journal_extents - Free cached journal bmap info
 * @jd: The journal
 *
 */

void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
{
	struct gfs2_journal_extent *jext;

	while (!list_empty(&jd->extent_list)) {
		jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
		list_del(&jext->list);
		kfree(jext);
	}
}

/**
 * gfs2_add_jextent - Add or merge a new extent to extent cache
 * @jd: The journal descriptor
 * @lblock: The logical block at start of new extent
 * @dblock: The physical block at start of new extent
 * @blocks: Size of extent in fs blocks
 *
 * Returns: 0 on success or -ENOMEM
 */

static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
{
	struct gfs2_journal_extent *jext;

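	/* If the new extent is physically contiguous with the last cached
	 * extent, merge it: e.g. (dblock 1000, 8 blocks) followed by a
	 * mapping at dblock 1008 simply grows that extent to 16 blocks. */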
	if (!list_empty(&jd->extent_list)) {
		jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
		if ((jext->dblock + jext->blocks) == dblock) {
			jext->blocks += blocks;
			return 0;
		}
	}

	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
	if (jext == NULL)
		return -ENOMEM;
	jext->dblock = dblock;
	jext->lblock = lblock;
	jext->blocks = blocks;
	list_add_tail(&jext->list, &jd->extent_list);
	jd->nr_extents++;
	return 0;
}

/**
 * gfs2_map_journal_extents - Cache journal bmap info
 * @sdp: The super block
 * @jd: The journal to map
 *
 * Create a reusable "extent" mapping from all logical
 * blocks to all physical blocks for the given journal.  This will save
 * us time when writing journal blocks.  Most journals will have only one
 * extent that maps all their logical blocks.  That's because mkfs.gfs2
 * lays the journal blocks out sequentially to maximize performance, so
 * a single extent maps the first logical block for the entire file
 * length.  However, gfs2_jadd can run while there is other file
 * activity, so the journals it adds may not be sequential.  Less likely
 * is the case where users create their own journals by mounting the
 * metafs and laying them out by hand, but it is still possible.  Such
 * journals might have several extents.
 *
 * Returns: 0 on success, or error on failure
 */

int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
{
	u64 lblock = 0;
	u64 lblock_stop;
	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
	struct buffer_head bh;
	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
	u64 size;
	int rc;

	lblock_stop = i_size_read(jd->jd_inode) >> shift;
	size = (lblock_stop - lblock) << shift;
	jd->nr_extents = 0;
	WARN_ON(!list_empty(&jd->extent_list));

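	/* Map the journal in as few calls as possible: gfs2_block_map() is
	 * offered the full remaining size each time, and on return
	 * bh.b_size says how much of it was mapped contiguously. */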
	do {
		bh.b_state = 0;
		bh.b_blocknr = 0;
		bh.b_size = size;
		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
		if (rc || !buffer_mapped(&bh))
			goto fail;
		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
		if (rc)
			goto fail;
		size -= bh.b_size;
		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
	} while (size > 0);

	fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
		jd->nr_extents);
	return 0;

fail:
	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
		rc, jd->jd_jid,
		(unsigned long long)(i_size_read(jd->jd_inode) - size),
		jd->nr_extents);
	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
		bh.b_state, (unsigned long long)bh.b_size);
	gfs2_free_journal_extents(jd);
	return rc;
}

/**
 * gfs2_write_alloc_required - figure out if a write will require an allocation
 * @ip: the file being written to
 * @offset: the offset to write to
 * @len: the number of bytes being written
 *
 * Returns: 1 if an alloc is required, 0 otherwise
 */

int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
			      unsigned int len)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct buffer_head bh;
	unsigned int shift;
	u64 lblock, lblock_stop, size;
	u64 end_of_file;

	if (!len)
		return 0;

	if (gfs2_is_stuffed(ip)) {
		if (offset + len > gfs2_max_stuffed_size(ip))
			return 1;
		return 0;
	}

	shift = sdp->sd_sb.sb_bsize_shift;
	BUG_ON(gfs2_is_dir(ip));
	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
	lblock = offset >> shift;
	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
	if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
		return 1;

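	/* The range lies within the current end of file; walk the block
	 * mappings and report an allocation as soon as an unmapped (hole)
	 * block is found. */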
	size = (lblock_stop - lblock) << shift;
	do {
		bh.b_state = 0;
		bh.b_size = size;
		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
		if (!buffer_mapped(&bh))
			return 1;
		size -= bh.b_size;
		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
	} while (size > 0);

	return 0;
}

static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct buffer_head *dibh;
	int error;

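	/* Stuffed data lives in the dinode block directly after the
	 * header, so clamp the range to EOF and zero the bytes in place. */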
	if (offset >= inode->i_size)
		return 0;
	if (offset + length > inode->i_size)
		length = inode->i_size - offset;

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		return error;
	gfs2_trans_add_meta(ip->i_gl, dibh);
	memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
	       length);
	brelse(dibh);
	return 0;
}

static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
					 loff_t length)
{
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
	int error;

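	/* Truncating journaled pages generates revokes, and one transaction
	 * covers at most GFS2_JTRUNC_REVOKES of them.  Truncate the page
	 * cache in chunks no larger than that, keeping chunks page-aligned
	 * once any initial partial page has been handled. */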
	while (length) {
		struct gfs2_trans *tr;
		loff_t chunk;
		unsigned int offs;

		chunk = length;
		if (chunk > max_chunk)
			chunk = max_chunk;

		offs = offset & ~PAGE_MASK;
		if (offs && chunk > PAGE_SIZE)
			chunk = offs + ((chunk - offs) & PAGE_MASK);

		/* The last argument of truncate_pagecache_range() is the
		 * offset of the last byte to truncate, not a length. */
		truncate_pagecache_range(inode, offset, offset + chunk - 1);
		offset += chunk;
		length -= chunk;

		tr = current->journal_info;
		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
			continue;

		gfs2_trans_end(sdp);
		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
		if (error)
			return error;
	}
	return 0;
}

int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
{
	struct inode *inode = file_inode(file);
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	int error;

	if (gfs2_is_jdata(ip))
		error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
					 GFS2_JTRUNC_REVOKES);
	else
		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	if (error)
		return error;

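	/* Zero the partial blocks at the edges of the hole first: stuffed
	 * files are zeroed directly in the dinode block, unstuffed files
	 * get their leading and trailing partial blocks zeroed. */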
	if (gfs2_is_stuffed(ip)) {
		error = stuffed_zero_range(inode, offset, length);
		if (error)
			goto out;
	} else {
		unsigned int start_off, end_len, blocksize;

		blocksize = i_blocksize(inode);
		start_off = offset & (blocksize - 1);
		end_len = (offset + length) & (blocksize - 1);
		if (start_off) {
			unsigned int len = length;
			if (length > blocksize - start_off)
				len = blocksize - start_off;
			error = gfs2_block_zero_range(inode, offset, len);
			if (error)
				goto out;
			if (start_off + length < blocksize)
				end_len = 0;
		}
		if (end_len) {
			error = gfs2_block_zero_range(inode,
				offset + length - end_len, end_len);
			if (error)
				goto out;
		}
	}

	if (gfs2_is_jdata(ip)) {
		BUG_ON(!current->journal_info);
		error = gfs2_journaled_truncate_range(inode, offset, length);
		if (error)
			goto out;
	} else
		truncate_pagecache_range(inode, offset, offset + length - 1);

	file_update_time(file);
	mark_inode_dirty(inode);

	if (current->journal_info)
		gfs2_trans_end(sdp);

	if (!gfs2_is_stuffed(ip))
		error = punch_hole(ip, offset, length);

out:
	if (current->journal_info)
		gfs2_trans_end(sdp);
	return error;
}