Blame - src/kernel/linux/v4.14/fs/gfs2/bmap.c - T103

blob: b3a1b16d4e3e38f805ac19ee00df6019e48e5228 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
				3	* Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
				4	*
				5	* This copyrighted material is made available to anyone wishing to use,
				6	* modify, copy, or redistribute it subject to the terms and conditions
				7	* of the GNU General Public License version 2.
				8	*/
				9
				10	#include <linux/spinlock.h>
				11	#include <linux/completion.h>
				12	#include <linux/buffer_head.h>
				13	#include <linux/blkdev.h>
				14	#include <linux/gfs2_ondisk.h>
				15	#include <linux/crc32.h>
				16
				17	#include "gfs2.h"
				18	#include "incore.h"
				19	#include "bmap.h"
				20	#include "glock.h"
				21	#include "inode.h"
				22	#include "meta_io.h"
				23	#include "quota.h"
				24	#include "rgrp.h"
				25	#include "log.h"
				26	#include "super.h"
				27	#include "trans.h"
				28	#include "dir.h"
				29	#include "util.h"
				30	#include "trace_gfs2.h"
				31
				32	/* This doesn't need to be that large as max 64 bit pointers in a 4k
				33	* block is 512, so __u16 is fine for that. It saves stack space to
				34	* keep it small.
				35	*/
				36	struct metapath {
				37	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
				38	__u16 mp_list[GFS2_MAX_META_HEIGHT];
				39	};
				40
				41	/**
				42	* gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
				43	* @ip: the inode
				44	* @dibh: the dinode buffer
				45	* @block: the block number that was allocated
				46	* @page: The (optional) page. This is looked up if @page is NULL
				47	*
				48	* Returns: errno
				49	*/
				50
				51	static int gfs2_unstuffer_page(struct gfs2_inode ip, struct buffer_head dibh,
				52	u64 block, struct page *page)
				53	{
				54	struct inode *inode = &ip->i_inode;
				55	struct buffer_head *bh;
				56	int release = 0;
				57
				58	if (!page \|\| page->index) {
				59	page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
				60	if (!page)
				61	return -ENOMEM;
				62	release = 1;
				63	}
				64
				65	if (!PageUptodate(page)) {
				66	void *kaddr = kmap(page);
				67	u64 dsize = i_size_read(inode);
				68
				69	if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
				70	dsize = dibh->b_size - sizeof(struct gfs2_dinode);
				71
				72	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
				73	memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
				74	kunmap(page);
				75
				76	SetPageUptodate(page);
				77	}
				78
				79	if (!page_has_buffers(page))
				80	create_empty_buffers(page, BIT(inode->i_blkbits),
				81	BIT(BH_Uptodate));
				82
				83	bh = page_buffers(page);
				84
				85	if (!buffer_mapped(bh))
				86	map_bh(bh, inode->i_sb, block);
				87
				88	set_buffer_uptodate(bh);
				89	if (!gfs2_is_jdata(ip))
				90	mark_buffer_dirty(bh);
				91	if (!gfs2_is_writeback(ip))
				92	gfs2_trans_add_data(ip->i_gl, bh);
				93
				94	if (release) {
				95	unlock_page(page);
				96	put_page(page);
				97	}
				98
				99	return 0;
				100	}
				101
				102	/**
				103	* gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
				104	* @ip: The GFS2 inode to unstuff
				105	* @page: The (optional) page. This is looked up if the @page is NULL
				106	*
				107	* This routine unstuffs a dinode and returns it to a "normal" state such
				108	* that the height can be grown in the traditional way.
				109	*
				110	* Returns: errno
				111	*/
				112
				113	int gfs2_unstuff_dinode(struct gfs2_inode ip, struct page page)
				114	{
				115	struct buffer_head bh, dibh;
				116	struct gfs2_dinode *di;
				117	u64 block = 0;
				118	int isdir = gfs2_is_dir(ip);
				119	int error;
				120
				121	down_write(&ip->i_rw_mutex);
				122
				123	error = gfs2_meta_inode_buffer(ip, &dibh);
				124	if (error)
				125	goto out;
				126
				127	if (i_size_read(&ip->i_inode)) {
				128	/* Get a free block, fill it with the stuffed data,
				129	and write it out to disk */
				130
				131	unsigned int n = 1;
				132	error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
				133	if (error)
				134	goto out_brelse;
				135	if (isdir) {
				136	gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
				137	error = gfs2_dir_get_new_buffer(ip, block, &bh);
				138	if (error)
				139	goto out_brelse;
				140	gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
				141	dibh, sizeof(struct gfs2_dinode));
				142	brelse(bh);
				143	} else {
				144	error = gfs2_unstuffer_page(ip, dibh, block, page);
				145	if (error)
				146	goto out_brelse;
				147	}
				148	}
				149
				150	/* Set up the pointer to the new block */
				151
				152	gfs2_trans_add_meta(ip->i_gl, dibh);
				153	di = (struct gfs2_dinode *)dibh->b_data;
				154	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
				155
				156	if (i_size_read(&ip->i_inode)) {
				157	(__be64 )(di + 1) = cpu_to_be64(block);
				158	gfs2_add_inode_blocks(&ip->i_inode, 1);
				159	di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
				160	}
				161
				162	ip->i_height = 1;
				163	di->di_height = cpu_to_be16(1);
				164
				165	out_brelse:
				166	brelse(dibh);
				167	out:
				168	up_write(&ip->i_rw_mutex);
				169	return error;
				170	}
				171
				172
				173	/**
				174	* find_metapath - Find path through the metadata tree
				175	* @sdp: The superblock
				176	* @mp: The metapath to return the result in
				177	* @block: The disk block to look up
				178	* @height: The pre-calculated height of the metadata tree
				179	*
				180	* This routine returns a struct metapath structure that defines a path
				181	* through the metadata of inode "ip" to get to block "block".
				182	*
				183	* Example:
				184	* Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
				185	* filesystem with a blocksize of 4096.
				186	*
				187	* find_metapath() would fill in the struct metapath so that:
				188	* mp_list[0] = 0, mp_list[1] = 48,
				189	* and mp_list[2] = 165.
				190	*
				191	* That means that in order to get to the block containing the byte at
				192	* offset 101342453, we would load the indirect block pointed to by pointer
				193	* 0 in the dinode. We would then load the indirect block pointed to by
				194	* pointer 48 in that indirect block. We would then load the data block
				195	* pointed to by pointer 165 in that indirect block.
				196	*
				197	* ----------------------------------------
				198	* \| Dinode \| \|
				199	* \| \| 4\|
				200	* \| \|0 1 2 3 4 5 9\|
				201	* \| \| 6\|
				202	* ----------------------------------------
				203	* \|
				204	* \|
				205	* V
				206	* ----------------------------------------
				207	* \| Indirect Block \|
				208	* \| 5\|
				209	* \| 4 4 4 4 4 5 5 1\|
				210	* \|0 5 6 7 8 9 0 1 2\|
				211	* ----------------------------------------
				212	* \|
				213	* \|
				214	* V
				215	* ----------------------------------------
				216	* \| Indirect Block \|
				217	* \| 1 1 1 1 1 5\|
				218	* \| 6 6 6 6 6 1\|
				219	* \|0 3 4 5 6 7 2\|
				220	* ----------------------------------------
				221	* \|
				222	* \|
				223	* V
				224	* ----------------------------------------
				225	* \| Data block containing offset \|
				226	* \| 101342453 \|
				227	* \| \|
				228	* \| \|
				229	* ----------------------------------------
				230	*
				231	*/
				232
				233	static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
				234	struct metapath *mp, unsigned int height)
				235	{
				236	unsigned int i;
				237
				238	for (i = height; i--;)
				239	mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
				240
				241	}
				242
				243	static inline unsigned int metapath_branch_start(const struct metapath *mp)
				244	{
				245	if (mp->mp_list[0] == 0)
				246	return 2;
				247	return 1;
				248	}
				249
				250	/**
				251	* metaptr1 - Return the first possible metadata pointer in a metaath buffer
				252	* @height: The metadata height (0 = dinode)
				253	* @mp: The metapath
				254	*/
				255	static inline __be64 metaptr1(unsigned int height, const struct metapath mp)
				256	{
				257	struct buffer_head *bh = mp->mp_bh[height];
				258	if (height == 0)
				259	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
				260	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
				261	}
				262
				263	/**
				264	* metapointer - Return pointer to start of metadata in a buffer
				265	* @height: The metadata height (0 = dinode)
				266	* @mp: The metapath
				267	*
				268	* Return a pointer to the block number of the next height of the metadata
				269	* tree given a buffer containing the pointer to the current height of the
				270	* metadata tree.
				271	*/
				272
				273	static inline __be64 metapointer(unsigned int height, const struct metapath mp)
				274	{
				275	__be64 *p = metaptr1(height, mp);
				276	return p + mp->mp_list[height];
				277	}
				278
				279	static void gfs2_metapath_ra(struct gfs2_glock *gl,
				280	const struct buffer_head bh, const __be64 pos)
				281	{
				282	struct buffer_head *rabh;
				283	const __be64 endp = (const __be64 )(bh->b_data + bh->b_size);
				284	const __be64 *t;
				285
				286	for (t = pos; t < endp; t++) {
				287	if (!*t)
				288	continue;
				289
				290	rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
				291	if (trylock_buffer(rabh)) {
				292	if (!buffer_uptodate(rabh)) {
				293	rabh->b_end_io = end_buffer_read_sync;
				294	submit_bh(REQ_OP_READ,
				295	REQ_RAHEAD \| REQ_META \| REQ_PRIO,
				296	rabh);
				297	continue;
				298	}
				299	unlock_buffer(rabh);
				300	}
				301	brelse(rabh);
				302	}
				303	}
				304
				305	/**
				306	* lookup_mp_height - helper function for lookup_metapath
				307	* @ip: the inode
				308	* @mp: the metapath
				309	* @h: the height which needs looking up
				310	*/
				311	static int lookup_mp_height(struct gfs2_inode ip, struct metapath mp, int h)
				312	{
				313	__be64 *ptr = metapointer(h, mp);
				314	u64 dblock = be64_to_cpu(*ptr);
				315
				316	if (!dblock)
				317	return h + 1;
				318
				319	return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]);
				320	}
				321
				322	/**
				323	* lookup_metapath - Walk the metadata tree to a specific point
				324	* @ip: The inode
				325	* @mp: The metapath
				326	*
				327	* Assumes that the inode's buffer has already been looked up and
				328	* hooked onto mp->mp_bh[0] and that the metapath has been initialised
				329	* by find_metapath().
				330	*
				331	* If this function encounters part of the tree which has not been
				332	* allocated, it returns the current height of the tree at the point
				333	* at which it found the unallocated block. Blocks which are found are
				334	* added to the mp->mp_bh[] list.
				335	*
				336	* Returns: error or height of metadata tree
				337	*/
				338
				339	static int lookup_metapath(struct gfs2_inode ip, struct metapath mp)
				340	{
				341	unsigned int end_of_metadata = ip->i_height - 1;
				342	unsigned int x;
				343	int ret;
				344
				345	for (x = 0; x < end_of_metadata; x++) {
				346	ret = lookup_mp_height(ip, mp, x);
				347	if (ret)
				348	return ret;
				349	}
				350
				351	return ip->i_height;
				352	}
				353
				354	/**
				355	* fillup_metapath - fill up buffers for the metadata path to a specific height
				356	* @ip: The inode
				357	* @mp: The metapath
				358	* @h: The height to which it should be mapped
				359	*
				360	* Similar to lookup_metapath, but does lookups for a range of heights
				361	*
				362	* Returns: error or height of metadata tree
				363	*/
				364
				365	static int fillup_metapath(struct gfs2_inode ip, struct metapath mp, int h)
				366	{
				367	unsigned int start_h = h - 1;
				368	int ret;
				369
				370	if (h) {
				371	/* find the first buffer we need to look up. */
				372	while (start_h > 0 && mp->mp_bh[start_h] == NULL)
				373	start_h--;
				374	for (; start_h < h; start_h++) {
				375	ret = lookup_mp_height(ip, mp, start_h);
				376	if (ret)
				377	return ret;
				378	}
				379	}
				380	return ip->i_height;
				381	}
				382
				383	static inline void release_metapath(struct metapath *mp)
				384	{
				385	int i;
				386
				387	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
				388	if (mp->mp_bh[i] == NULL)
				389	break;
				390	brelse(mp->mp_bh[i]);
				391	}
				392	}
				393
				394	/**
				395	* gfs2_extent_length - Returns length of an extent of blocks
				396	* @start: Start of the buffer
				397	* @len: Length of the buffer in bytes
				398	* @ptr: Current position in the buffer
				399	* @limit: Max extent length to return (0 = unlimited)
				400	* @eob: Set to 1 if we hit "end of block"
				401	*
				402	* If the first block is zero (unallocated) it will return the number of
				403	* unallocated blocks in the extent, otherwise it will return the number
				404	* of contiguous blocks in the extent.
				405	*
				406	* Returns: The length of the extent (minimum of one block)
				407	*/
				408
				409	static inline unsigned int gfs2_extent_length(void start, unsigned int len, __be64 ptr, size_t limit, int *eob)
				410	{
				411	const __be64 *end = (start + len);
				412	const __be64 *first = ptr;
				413	u64 d = be64_to_cpu(*ptr);
				414
				415	*eob = 0;
				416	do {
				417	ptr++;
				418	if (ptr >= end)
				419	break;
				420	if (limit && --limit == 0)
				421	break;
				422	if (d)
				423	d++;
				424	} while(be64_to_cpu(*ptr) == d);
				425	if (ptr >= end)
				426	*eob = 1;
				427	return (ptr - first);
				428	}
				429
				430	static inline void bmap_lock(struct gfs2_inode *ip, int create)
				431	{
				432	if (create)
				433	down_write(&ip->i_rw_mutex);
				434	else
				435	down_read(&ip->i_rw_mutex);
				436	}
				437
				438	static inline void bmap_unlock(struct gfs2_inode *ip, int create)
				439	{
				440	if (create)
				441	up_write(&ip->i_rw_mutex);
				442	else
				443	up_read(&ip->i_rw_mutex);
				444	}
				445
				446	static inline __be64 gfs2_indirect_init(struct metapath mp,
				447	struct gfs2_glock *gl, unsigned int i,
				448	unsigned offset, u64 bn)
				449	{
				450	__be64 ptr = (__be64 )(mp->mp_bh[i - 1]->b_data +
				451	((i > 1) ? sizeof(struct gfs2_meta_header) :
				452	sizeof(struct gfs2_dinode)));
				453	BUG_ON(i < 1);
				454	BUG_ON(mp->mp_bh[i] != NULL);
				455	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
				456	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
				457	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
				458	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
				459	ptr += offset;
				460	*ptr = cpu_to_be64(bn);
				461	return ptr;
				462	}
				463
				/* States of the allocation state machine in gfs2_bmap_alloc() */
				enum alloc_state {
					/* tree is complete, data blocks are being added */
					ALLOC_DATA = 0,
					/* indirect blocks are being added below the existing tree */
					ALLOC_GROW_DEPTH = 1,
					/* the height of the tree is being increased */
					ALLOC_GROW_HEIGHT = 2,
					/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
				};
				470
				471	static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
				472	{
				473	if (hgt)
				474	return sdp->sd_inptrs;
				475	return sdp->sd_diptrs;
				476	}
				477
				478	/**
				479	* gfs2_bmap_alloc - Build a metadata tree of the requested height
				480	* @inode: The GFS2 inode
				481	* @lblock: The logical starting block of the extent
				482	* @bh_map: This is used to return the mapping details
				483	* @mp: The metapath
				484	* @sheight: The starting height (i.e. whats already mapped)
				485	* @height: The height to build to
				486	* @maxlen: The max number of data blocks to alloc
				487	*
				488	* In this routine we may have to alloc:
				489	* i) Indirect blocks to grow the metadata tree height
				490	* ii) Indirect blocks to fill in lower part of the metadata tree
				491	* iii) Data blocks
				492	*
				493	* The function is in two parts. The first part works out the total
				494	* number of blocks which we need. The second part does the actual
				495	* allocation asking for an extent at a time (if enough contiguous free
				496	* blocks are available, there will only be one request per bmap call)
				497	* and uses the state machine to initialise the blocks in order.
				498	*
				499	* Returns: errno on error
				500	*/
				501
				502	static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
				503	struct buffer_head bh_map, struct metapath mp,
				504	const unsigned int sheight,
				505	const unsigned int height,
				506	const size_t maxlen)
				507	{
				508	struct gfs2_inode *ip = GFS2_I(inode);
				509	struct gfs2_sbd *sdp = GFS2_SB(inode);
				510	struct super_block *sb = sdp->sd_vfs;
				511	struct buffer_head *dibh = mp->mp_bh[0];
				512	u64 bn, dblock = 0;
				513	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
				514	unsigned dblks = 0;
				515	unsigned ptrs_per_blk;
				516	const unsigned end_of_metadata = height - 1;
				517	int ret;
				518	int eob = 0;
				519	enum alloc_state state;
				520	__be64 *ptr;
				521	__be64 zero_bn = 0;
				522
				523	BUG_ON(sheight < 1);
				524	BUG_ON(dibh == NULL);
				525
				526	gfs2_trans_add_meta(ip->i_gl, dibh);
				527
				528	if (height == sheight) {
				529	struct buffer_head *bh;
				530	/* Bottom indirect block exists, find unalloced extent size */
				531	ptr = metapointer(end_of_metadata, mp);
				532	bh = mp->mp_bh[end_of_metadata];
				533	dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
				534	&eob);
				535	BUG_ON(dblks < 1);
				536	state = ALLOC_DATA;
				537	} else {
				538	/* Need to allocate indirect blocks */
				539	ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
				540	dblks = min(maxlen, (size_t)(ptrs_per_blk -
				541	mp->mp_list[end_of_metadata]));
				542	if (height == ip->i_height) {
				543	/* Writing into existing tree, extend tree down */
				544	iblks = height - sheight;
				545	state = ALLOC_GROW_DEPTH;
				546	} else {
				547	/* Building up tree height */
				548	state = ALLOC_GROW_HEIGHT;
				549	iblks = height - ip->i_height;
				550	branch_start = metapath_branch_start(mp);
				551	iblks += (height - branch_start);
				552	}
				553	}
				554
				555	/* start of the second part of the function (state machine) */
				556
				557	blks = dblks + iblks;
				558	i = sheight;
				559	do {
				560	int error;
				561	n = blks - alloced;
				562	error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
				563	if (error)
				564	return error;
				565	alloced += n;
				566	if (state != ALLOC_DATA \|\| gfs2_is_jdata(ip))
				567	gfs2_trans_add_unrevoke(sdp, bn, n);
				568	switch (state) {
				569	/* Growing height of tree */
				570	case ALLOC_GROW_HEIGHT:
				571	if (i == 1) {
				572	ptr = (__be64 *)(dibh->b_data +
				573	sizeof(struct gfs2_dinode));
				574	zero_bn = *ptr;
				575	}
				576	for (; i - 1 < height - ip->i_height && n > 0; i++, n--)
				577	gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
				578	if (i - 1 == height - ip->i_height) {
				579	i--;
				580	gfs2_buffer_copy_tail(mp->mp_bh[i],
				581	sizeof(struct gfs2_meta_header),
				582	dibh, sizeof(struct gfs2_dinode));
				583	gfs2_buffer_clear_tail(dibh,
				584	sizeof(struct gfs2_dinode) +
				585	sizeof(__be64));
				586	ptr = (__be64 *)(mp->mp_bh[i]->b_data +
				587	sizeof(struct gfs2_meta_header));
				588	*ptr = zero_bn;
				589	state = ALLOC_GROW_DEPTH;
				590	for(i = branch_start; i < height; i++) {
				591	if (mp->mp_bh[i] == NULL)
				592	break;
				593	brelse(mp->mp_bh[i]);
				594	mp->mp_bh[i] = NULL;
				595	}
				596	i = branch_start;
				597	}
				598	if (n == 0)
				599	break;
				600	/* Branching from existing tree */
				601	case ALLOC_GROW_DEPTH:
				602	if (i > 1 && i < height)
				603	gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
				604	for (; i < height && n > 0; i++, n--)
				605	gfs2_indirect_init(mp, ip->i_gl, i,
				606	mp->mp_list[i-1], bn++);
				607	if (i == height)
				608	state = ALLOC_DATA;
				609	if (n == 0)
				610	break;
				611	/* Tree complete, adding data blocks */
				612	case ALLOC_DATA:
				613	BUG_ON(n > dblks);
				614	BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
				615	gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
				616	dblks = n;
				617	ptr = metapointer(end_of_metadata, mp);
				618	dblock = bn;
				619	while (n-- > 0)
				620	*ptr++ = cpu_to_be64(bn++);
				621	if (buffer_zeronew(bh_map)) {
				622	ret = sb_issue_zeroout(sb, dblock, dblks,
				623	GFP_NOFS);
				624	if (ret) {
				625	fs_err(sdp,
				626	"Failed to zero data buffers\n");
				627	clear_buffer_zeronew(bh_map);
				628	}
				629	}
				630	break;
				631	}
				632	} while ((state != ALLOC_DATA) \|\| !dblock);
				633
				634	ip->i_height = height;
				635	gfs2_add_inode_blocks(&ip->i_inode, alloced);
				636	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
				637	map_bh(bh_map, inode->i_sb, dblock);
				638	bh_map->b_size = dblks << inode->i_blkbits;
				639	set_buffer_new(bh_map);
				640	return 0;
				641	}
				642
				643	/**
				644	* gfs2_block_map - Map a block from an inode to a disk block
				645	* @inode: The inode
				646	* @lblock: The logical block number
				647	* @bh_map: The bh to be mapped
				648	* @create: True if its ok to alloc blocks to satify the request
				649	*
				650	* Sets buffer_mapped() if successful, sets buffer_boundary() if a
				651	* read of metadata will be required before the next block can be
				652	* mapped. Sets buffer_new() if new blocks were allocated.
				653	*
				654	* Returns: errno
				655	*/
				656
				657	int gfs2_block_map(struct inode *inode, sector_t lblock,
				658	struct buffer_head *bh_map, int create)
				659	{
				660	struct gfs2_inode *ip = GFS2_I(inode);
				661	struct gfs2_sbd *sdp = GFS2_SB(inode);
				662	unsigned int bsize = sdp->sd_sb.sb_bsize;
				663	const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
				664	const u64 *arr = sdp->sd_heightsize;
				665	__be64 *ptr;
				666	u64 size;
				667	struct metapath mp;
				668	int ret;
				669	int eob;
				670	unsigned int len;
				671	struct buffer_head *bh;
				672	u8 height;
				673
				674	BUG_ON(maxlen == 0);
				675
				676	memset(&mp, 0, sizeof(mp));
				677	bmap_lock(ip, create);
				678	clear_buffer_mapped(bh_map);
				679	clear_buffer_new(bh_map);
				680	clear_buffer_boundary(bh_map);
				681	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
				682	if (gfs2_is_dir(ip)) {
				683	bsize = sdp->sd_jbsize;
				684	arr = sdp->sd_jheightsize;
				685	}
				686
				687	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
				688	if (ret)
				689	goto out;
				690
				691	height = ip->i_height;
				692	size = (lblock + 1) * bsize;
				693	while (size > arr[height])
				694	height++;
				695	find_metapath(sdp, lblock, &mp, height);
				696	ret = 1;
				697	if (height > ip->i_height \|\| gfs2_is_stuffed(ip))
				698	goto do_alloc;
				699	ret = lookup_metapath(ip, &mp);
				700	if (ret < 0)
				701	goto out;
				702	if (ret != ip->i_height)
				703	goto do_alloc;
				704	ptr = metapointer(ip->i_height - 1, &mp);
				705	if (*ptr == 0)
				706	goto do_alloc;
				707	map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
				708	bh = mp.mp_bh[ip->i_height - 1];
				709	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
				710	bh_map->b_size = (len << inode->i_blkbits);
				711	if (eob)
				712	set_buffer_boundary(bh_map);
				713	ret = 0;
				714	out:
				715	release_metapath(&mp);
				716	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
				717	bmap_unlock(ip, create);
				718	return ret;
				719
				720	do_alloc:
				721	/* All allocations are done here, firstly check create flag */
				722	if (!create) {
				723	BUG_ON(gfs2_is_stuffed(ip));
				724	ret = 0;
				725	goto out;
				726	}
				727
				728	/* At this point ret is the tree depth of already allocated blocks */
				729	ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
				730	goto out;
				731	}
				732
				733	/*
				734	* Deprecated: do not use in new code
				735	*/
				736	int gfs2_extent_map(struct inode inode, u64 lblock, int new, u64 dblock, unsigned extlen)
				737	{
				738	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
				739	int ret;
				740	int create = *new;
				741
				742	BUG_ON(!extlen);
				743	BUG_ON(!dblock);
				744	BUG_ON(!new);
				745
				746	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
				747	ret = gfs2_block_map(inode, lblock, &bh, create);
				748	*extlen = bh.b_size >> inode->i_blkbits;
				749	*dblock = bh.b_blocknr;
				750	if (buffer_new(&bh))
				751	*new = 1;
				752	else
				753	*new = 0;
				754	return ret;
				755	}
				756
				757	/**
				758	* gfs2_block_truncate_page - Deal with zeroing out data for truncate
				759	*
				760	* This is partly borrowed from ext3.
				761	*/
				762	static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
				763	{
				764	struct inode *inode = mapping->host;
				765	struct gfs2_inode *ip = GFS2_I(inode);
				766	unsigned long index = from >> PAGE_SHIFT;
				767	unsigned offset = from & (PAGE_SIZE-1);
				768	unsigned blocksize, iblock, length, pos;
				769	struct buffer_head *bh;
				770	struct page *page;
				771	int err;
				772
				773	page = find_or_create_page(mapping, index, GFP_NOFS);
				774	if (!page)
				775	return 0;
				776
				777	blocksize = inode->i_sb->s_blocksize;
				778	length = blocksize - (offset & (blocksize - 1));
				779	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
				780
				781	if (!page_has_buffers(page))
				782	create_empty_buffers(page, blocksize, 0);
				783
				784	/* Find the buffer that contains "offset" */
				785	bh = page_buffers(page);
				786	pos = blocksize;
				787	while (offset >= pos) {
				788	bh = bh->b_this_page;
				789	iblock++;
				790	pos += blocksize;
				791	}
				792
				793	err = 0;
				794
				795	if (!buffer_mapped(bh)) {
				796	gfs2_block_map(inode, iblock, bh, 0);
				797	/* unmapped? It's a hole - nothing to do */
				798	if (!buffer_mapped(bh))
				799	goto unlock;
				800	}
				801
				802	/* Ok, it's mapped. Make sure it's up-to-date */
				803	if (PageUptodate(page))
				804	set_buffer_uptodate(bh);
				805
				806	if (!buffer_uptodate(bh)) {
				807	err = -EIO;
				808	ll_rw_block(REQ_OP_READ, 0, 1, &bh);
				809	wait_on_buffer(bh);
				810	/* Uhhuh. Read error. Complain and punt. */
				811	if (!buffer_uptodate(bh))
				812	goto unlock;
				813	err = 0;
				814	}
				815
				816	if (!gfs2_is_writeback(ip))
				817	gfs2_trans_add_data(ip->i_gl, bh);
				818
				819	zero_user(page, offset, length);
				820	mark_buffer_dirty(bh);
				821	unlock:
				822	unlock_page(page);
				823	put_page(page);
				824	return err;
				825	}
				826
				827	#define GFS2_JTRUNC_REVOKES 8192
				828
				829	/**
				830	* gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
				831	* @inode: The inode being truncated
				832	* @oldsize: The original (larger) size
				833	* @newsize: The new smaller size
				834	*
				835	* With jdata files, we have to journal a revoke for each block which is
				836	* truncated. As a result, we need to split this into separate transactions
				837	* if the number of pages being truncated gets too large.
				838	*/
				839
				840	static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
				841	{
				842	struct gfs2_sbd *sdp = GFS2_SB(inode);
				843	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
				844	u64 chunk;
				845	int error;
				846
				847	while (oldsize != newsize) {
				848	chunk = oldsize - newsize;
				849	if (chunk > max_chunk)
				850	chunk = max_chunk;
				851	truncate_pagecache(inode, oldsize - chunk);
				852	oldsize -= chunk;
				853	gfs2_trans_end(sdp);
				854	error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
				855	if (error)
				856	return error;
				857	}
				858
				859	return 0;
				860	}
				861
				862	static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
				863	{
				864	struct gfs2_inode *ip = GFS2_I(inode);
				865	struct gfs2_sbd *sdp = GFS2_SB(inode);
				866	struct address_space *mapping = inode->i_mapping;
				867	struct buffer_head *dibh;
				868	int journaled = gfs2_is_jdata(ip);
				869	int error;
				870
				871	if (journaled)
				872	error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
				873	else
				874	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
				875	if (error)
				876	return error;
				877
				878	error = gfs2_meta_inode_buffer(ip, &dibh);
				879	if (error)
				880	goto out;
				881
				882	gfs2_trans_add_meta(ip->i_gl, dibh);
				883
				884	if (gfs2_is_stuffed(ip)) {
				885	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
				886	} else {
				887	if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
				888	error = gfs2_block_truncate_page(mapping, newsize);
				889	if (error)
				890	goto out_brelse;
				891	}
				892	ip->i_diskflags \|= GFS2_DIF_TRUNC_IN_PROG;
				893	}
				894
				895	i_size_write(inode, newsize);
				896	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
				897	gfs2_dinode_out(ip, dibh->b_data);
				898
				899	if (journaled)
				900	error = gfs2_journaled_truncate(inode, oldsize, newsize);
				901	else
				902	truncate_pagecache(inode, newsize);
				903
				904	if (error) {
				905	brelse(dibh);
				906	return error;
				907	}
				908
				909	out_brelse:
				910	brelse(dibh);
				911	out:
				912	gfs2_trans_end(sdp);
				913	return error;
				914	}
				915
				916	/**
				917	* sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
				918	* @ip: inode
				919	* @rg_gh: holder of resource group glock
				920	* @mp: current metapath fully populated with buffers
				921	* @btotal: place to keep count of total blocks freed
				922	* @hgt: height we're processing
				923	* @first: true if this is the first call to this function for this height
				924	*
				925	* We sweep a metadata buffer (provided by the metapath) for blocks we need to
				926	* free, and free them all. However, we do it one rgrp at a time. If this
				927	* block has references to multiple rgrps, we break it into individual
				928	* transactions. This allows other processes to use the rgrps while we're
				929	* focused on a single one, for better concurrency / performance.
				930	* At every transaction boundary, we rewrite the inode into the journal.
				931	* That way the bitmaps are kept consistent with the inode and we can recover
				932	* if we're interrupted by power-outages.
				933	*
				934	* Returns: 0, or return code if an error occurred.
				935	* *btotal has the total number of blocks freed
				936	*/
				937	static int sweep_bh_for_rgrps(struct gfs2_inode ip, struct gfs2_holder rd_gh,
				938	const struct metapath mp, u32 btotal, int hgt,
				939	bool preserve1)
				940	{
				941	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
				942	struct gfs2_rgrpd *rgd;
				943	struct gfs2_trans *tr;
				944	struct buffer_head *bh = mp->mp_bh[hgt];
				945	__be64 top, bottom, *p;
				946	int blks_outside_rgrp;
				947	u64 bn, bstart, isize_blks;
				948	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
				949	int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
				950	int ret = 0;
				951	bool buf_in_tr = false; /* buffer was added to transaction */
				952
				953	if (gfs2_metatype_check(sdp, bh,
				954	(hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
				955	return -EIO;
				956
				957	more_rgrps:
				958	blks_outside_rgrp = 0;
				959	bstart = 0;
				960	blen = 0;
				961	top = metapointer(hgt, mp); /* first ptr from metapath */
				962	/* If we're keeping some data at the truncation point, we've got to
				963	preserve the metadata tree by adding 1 to the starting metapath. */
				964	if (preserve1)
				965	top++;
				966
				967	bottom = (__be64 *)(bh->b_data + bh->b_size);
				968
				969	for (p = top; p < bottom; p++) {
				970	if (!*p)
				971	continue;
				972	bn = be64_to_cpu(*p);
				973	if (gfs2_holder_initialized(rd_gh)) {
				974	rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
				975	gfs2_assert_withdraw(sdp,
				976	gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
				977	} else {
				978	rgd = gfs2_blk2rgrpd(sdp, bn, false);
				979	ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
				980	0, rd_gh);
				981	if (ret)
				982	goto out;
				983
				984	/* Must be done with the rgrp glock held: */
				985	if (gfs2_rs_active(&ip->i_res) &&
				986	rgd == ip->i_res.rs_rbm.rgd)
				987	gfs2_rs_deltree(&ip->i_res);
				988	}
				989
				990	if (!rgrp_contains_block(rgd, bn)) {
				991	blks_outside_rgrp++;
				992	continue;
				993	}
				994
				995	/* The size of our transactions will be unknown until we
				996	actually process all the metadata blocks that relate to
				997	the rgrp. So we estimate. We know it can't be more than
				998	the dinode's i_blocks and we don't want to exceed the
				999	journal flush threshold, sd_log_thresh2. */
				1000	if (current->journal_info == NULL) {
				1001	unsigned int jblocks_rqsted, revokes;
				1002
				1003	jblocks_rqsted = rgd->rd_length + RES_DINODE +
				1004	RES_INDIRECT;
				1005	isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
				1006	if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
				1007	jblocks_rqsted +=
				1008	atomic_read(&sdp->sd_log_thresh2);
				1009	else
				1010	jblocks_rqsted += isize_blks;
				1011	revokes = jblocks_rqsted;
				1012	if (meta)
				1013	revokes += hptrs(sdp, hgt);
				1014	else if (ip->i_depth)
				1015	revokes += sdp->sd_inptrs;
				1016	ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
				1017	if (ret)
				1018	goto out_unlock;
				1019	down_write(&ip->i_rw_mutex);
				1020	}
				1021	/* check if we will exceed the transaction blocks requested */
				1022	tr = current->journal_info;
				1023	if (tr->tr_num_buf_new + RES_STATFS +
				1024	RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
				1025	/* We set blks_outside_rgrp to ensure the loop will
				1026	be repeated for the same rgrp, but with a new
				1027	transaction. */
				1028	blks_outside_rgrp++;
				1029	/* This next part is tricky. If the buffer was added
				1030	to the transaction, we've already set some block
				1031	pointers to 0, so we better follow through and free
				1032	them, or we will introduce corruption (so break).
				1033	This may be impossible, or at least rare, but I
				1034	decided to cover the case regardless.
				1035
				1036	If the buffer was not added to the transaction
				1037	(this call), doing so would exceed our transaction
				1038	size, so we need to end the transaction and start a
				1039	new one (so goto). */
				1040
				1041	if (buf_in_tr)
				1042	break;
				1043	goto out_unlock;
				1044	}
				1045
				1046	gfs2_trans_add_meta(ip->i_gl, bh);
				1047	buf_in_tr = true;
				1048	*p = 0;
				1049	if (bstart + blen == bn) {
				1050	blen++;
				1051	continue;
				1052	}
				1053	if (bstart) {
				1054	__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
				1055	(*btotal) += blen;
				1056	gfs2_add_inode_blocks(&ip->i_inode, -blen);
				1057	}
				1058	bstart = bn;
				1059	blen = 1;
				1060	}
				1061	if (bstart) {
				1062	__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
				1063	(*btotal) += blen;
				1064	gfs2_add_inode_blocks(&ip->i_inode, -blen);
				1065	}
				1066	out_unlock:
				1067	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
				1068	outside the rgrp we just processed,
				1069	do it all over again. */
				1070	if (current->journal_info) {
				1071	struct buffer_head *dibh = mp->mp_bh[0];
				1072
				1073	/* Every transaction boundary, we rewrite the dinode
				1074	to keep its di_blocks current in case of failure. */
				1075	ip->i_inode.i_mtime = ip->i_inode.i_ctime =
				1076	current_time(&ip->i_inode);
				1077	gfs2_trans_add_meta(ip->i_gl, dibh);
				1078	gfs2_dinode_out(ip, dibh->b_data);
				1079	up_write(&ip->i_rw_mutex);
				1080	gfs2_trans_end(sdp);
				1081	buf_in_tr = false;
				1082	}
				1083	gfs2_glock_dq_uninit(rd_gh);
				1084	cond_resched();
				1085	goto more_rgrps;
				1086	}
				1087	out:
				1088	return ret;
				1089	}
				1090
				1091	/**
				1092	* find_nonnull_ptr - find a non-null pointer given a metapath and height
				1093	* assumes the metapath is valid (with buffers) out to height h
				1094	* @mp: starting metapath
				1095	* @h: desired height to search
				1096	*
				1097	* Returns: true if a non-null pointer was found in the metapath buffer
				1098	* false if all remaining pointers are NULL in the buffer
				1099	*/
				1100	static bool find_nonnull_ptr(struct gfs2_sbd sdp, struct metapath mp,
				1101	unsigned int h)
				1102	{
				1103	__be64 *ptr;
				1104	unsigned int ptrs = hptrs(sdp, h) - 1;
				1105
				1106	while (true) {
				1107	ptr = metapointer(h, mp);
				1108	if (ptr) { / if we have a non-null pointer */
				1109	/* Now zero the metapath after the current height. */
				1110	h++;
				1111	if (h < GFS2_MAX_META_HEIGHT)
				1112	memset(&mp->mp_list[h], 0,
				1113	(GFS2_MAX_META_HEIGHT - h) *
				1114	sizeof(mp->mp_list[0]));
				1115	return true;
				1116	}
				1117
				1118	if (mp->mp_list[h] < ptrs)
				1119	mp->mp_list[h]++;
				1120	else
				1121	return false; /* no more pointers in this buffer */
				1122	}
				1123	}
				1124
				1125	enum dealloc_states {
				1126	DEALLOC_MP_FULL = 0, /* Strip a metapath with all buffers read in */
				1127	DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */
				1128	DEALLOC_FILL_MP = 2, /* Fill in the metapath to the given height. */
				1129	DEALLOC_DONE = 3, /* process complete */
				1130	};
				1131
				1132	static bool mp_eq_to_hgt(struct metapath mp, __u16 nbof, unsigned int h)
				1133	{
				1134	if (memcmp(mp->mp_list, nbof, h * sizeof(mp->mp_list[0])))
				1135	return false;
				1136	return true;
				1137	}
				1138
				1139	/**
				1140	* trunc_dealloc - truncate a file down to a desired size
				1141	* @ip: inode to truncate
				1142	* @newsize: The desired size of the file
				1143	*
				1144	* This function truncates a file to newsize. It works from the
				1145	* bottom up, and from the right to the left. In other words, it strips off
				1146	* the highest layer (data) before stripping any of the metadata. Doing it
				1147	* this way is best in case the operation is interrupted by power failure, etc.
				1148	* The dinode is rewritten in every transaction to guarantee integrity.
				1149	*/
				1150	static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
				1151	{
				1152	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
				1153	struct metapath mp;
				1154	struct buffer_head dibh, bh;
				1155	struct gfs2_holder rd_gh;
				1156	u64 lblock;
				1157	__u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
				1158	unsigned int strip_h = ip->i_height - 1;
				1159	u32 btotal = 0;
				1160	int ret, state;
				1161	int mp_h; /* metapath buffers are read in to this height */
				1162	sector_t last_ra = 0;
				1163	u64 prev_bnr = 0;
				1164	bool preserve1; /* need to preserve the first meta pointer? */
				1165
				1166	if (!newsize)
				1167	lblock = 0;
				1168	else
				1169	lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
				1170
				1171	memset(&mp, 0, sizeof(mp));
				1172	find_metapath(sdp, lblock, &mp, ip->i_height);
				1173
				1174	memcpy(&nbof, &mp.mp_list, sizeof(nbof));
				1175
				1176	ret = gfs2_meta_inode_buffer(ip, &dibh);
				1177	if (ret)
				1178	return ret;
				1179
				1180	mp.mp_bh[0] = dibh;
				1181	ret = lookup_metapath(ip, &mp);
				1182	if (ret == ip->i_height)
				1183	state = DEALLOC_MP_FULL; /* We have a complete metapath */
				1184	else
				1185	state = DEALLOC_FILL_MP; /* deal with partial metapath */
				1186
				1187	ret = gfs2_rindex_update(sdp);
				1188	if (ret)
				1189	goto out_metapath;
				1190
				1191	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
				1192	if (ret)
				1193	goto out_metapath;
				1194	gfs2_holder_mark_uninitialized(&rd_gh);
				1195
				1196	mp_h = strip_h;
				1197
				1198	while (state != DEALLOC_DONE) {
				1199	switch (state) {
				1200	/* Truncate a full metapath at the given strip height.
				1201	* Note that strip_h == mp_h in order to be in this state. */
				1202	case DEALLOC_MP_FULL:
				1203	if (mp_h > 0) { /* issue read-ahead on metadata */
				1204	__be64 *top;
				1205
				1206	bh = mp.mp_bh[mp_h - 1];
				1207	if (bh->b_blocknr != last_ra) {
				1208	last_ra = bh->b_blocknr;
				1209	top = metaptr1(mp_h - 1, &mp);
				1210	gfs2_metapath_ra(ip->i_gl, bh, top);
				1211	}
				1212	}
				1213	/* If we're truncating to a non-zero size and the mp is
				1214	at the beginning of file for the strip height, we
				1215	need to preserve the first metadata pointer. */
				1216	preserve1 = (newsize && mp_eq_to_hgt(&mp, nbof, mp_h));
				1217	bh = mp.mp_bh[mp_h];
				1218	gfs2_assert_withdraw(sdp, bh);
				1219	if (gfs2_assert_withdraw(sdp,
				1220	prev_bnr != bh->b_blocknr)) {
				1221	printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
				1222	"block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
				1223	sdp->sd_fsname,
				1224	(unsigned long long)ip->i_no_addr,
				1225	prev_bnr, ip->i_height, strip_h, mp_h);
				1226	}
				1227	prev_bnr = bh->b_blocknr;
				1228	ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
				1229	mp_h, preserve1);
				1230	/* If we hit an error or just swept dinode buffer,
				1231	just exit. */
				1232	if (ret \|\| !mp_h) {
				1233	state = DEALLOC_DONE;
				1234	break;
				1235	}
				1236	state = DEALLOC_MP_LOWER;
				1237	break;
				1238
				1239	/* lower the metapath strip height */
				1240	case DEALLOC_MP_LOWER:
				1241	/* We're done with the current buffer, so release it,
				1242	unless it's the dinode buffer. Then back up to the
				1243	previous pointer. */
				1244	if (mp_h) {
				1245	brelse(mp.mp_bh[mp_h]);
				1246	mp.mp_bh[mp_h] = NULL;
				1247	}
				1248	/* If we can't get any lower in height, we've stripped
				1249	off all we can. Next step is to back up and start
				1250	stripping the previous level of metadata. */
				1251	if (mp_h == 0) {
				1252	strip_h--;
				1253	memcpy(&mp.mp_list, &nbof, sizeof(nbof));
				1254	mp_h = strip_h;
				1255	state = DEALLOC_FILL_MP;
				1256	break;
				1257	}
				1258	mp.mp_list[mp_h] = 0;
				1259	mp_h--; /* search one metadata height down */
				1260	if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
				1261	break; /* loop around in the same state */
				1262	mp.mp_list[mp_h]++;
				1263	/* Here we've found a part of the metapath that is not
				1264	* allocated. We need to search at that height for the
				1265	* next non-null pointer. */
				1266	if (find_nonnull_ptr(sdp, &mp, mp_h)) {
				1267	state = DEALLOC_FILL_MP;
				1268	mp_h++;
				1269	}
				1270	/* No more non-null pointers at this height. Back up
				1271	to the previous height and try again. */
				1272	break; /* loop around in the same state */
				1273
				1274	/* Fill the metapath with buffers to the given height. */
				1275	case DEALLOC_FILL_MP:
				1276	/* Fill the buffers out to the current height. */
				1277	ret = fillup_metapath(ip, &mp, mp_h);
				1278	if (ret < 0)
				1279	goto out;
				1280
				1281	/* If buffers found for the entire strip height */
				1282	if ((ret == ip->i_height) && (mp_h == strip_h)) {
				1283	state = DEALLOC_MP_FULL;
				1284	break;
				1285	}
				1286	if (ret < ip->i_height) /* We have a partial height */
				1287	mp_h = ret - 1;
				1288
				1289	/* If we find a non-null block pointer, crawl a bit
				1290	higher up in the metapath and try again, otherwise
				1291	we need to look lower for a new starting point. */
				1292	if (find_nonnull_ptr(sdp, &mp, mp_h))
				1293	mp_h++;
				1294	else
				1295	state = DEALLOC_MP_LOWER;
				1296	break;
				1297	}
				1298	}
				1299
				1300	if (btotal) {
				1301	if (current->journal_info == NULL) {
				1302	ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
				1303	RES_QUOTA, 0);
				1304	if (ret)
				1305	goto out;
				1306	down_write(&ip->i_rw_mutex);
				1307	}
				1308	gfs2_statfs_change(sdp, 0, +btotal, 0);
				1309	gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
				1310	ip->i_inode.i_gid);
				1311	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
				1312	gfs2_trans_add_meta(ip->i_gl, dibh);
				1313	gfs2_dinode_out(ip, dibh->b_data);
				1314	up_write(&ip->i_rw_mutex);
				1315	gfs2_trans_end(sdp);
				1316	}
				1317
				1318	out:
				1319	if (gfs2_holder_initialized(&rd_gh))
				1320	gfs2_glock_dq_uninit(&rd_gh);
				1321	if (current->journal_info) {
				1322	up_write(&ip->i_rw_mutex);
				1323	gfs2_trans_end(sdp);
				1324	cond_resched();
				1325	}
				1326	gfs2_quota_unhold(ip);
				1327	out_metapath:
				1328	release_metapath(&mp);
				1329	return ret;
				1330	}
				1331
				1332	static int trunc_end(struct gfs2_inode *ip)
				1333	{
				1334	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
				1335	struct buffer_head *dibh;
				1336	int error;
				1337
				1338	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
				1339	if (error)
				1340	return error;
				1341
				1342	down_write(&ip->i_rw_mutex);
				1343
				1344	error = gfs2_meta_inode_buffer(ip, &dibh);
				1345	if (error)
				1346	goto out;
				1347
				1348	if (!i_size_read(&ip->i_inode)) {
				1349	ip->i_height = 0;
				1350	ip->i_goal = ip->i_no_addr;
				1351	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
				1352	gfs2_ordered_del_inode(ip);
				1353	}
				1354	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
				1355	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
				1356
				1357	gfs2_trans_add_meta(ip->i_gl, dibh);
				1358	gfs2_dinode_out(ip, dibh->b_data);
				1359	brelse(dibh);
				1360
				1361	out:
				1362	up_write(&ip->i_rw_mutex);
				1363	gfs2_trans_end(sdp);
				1364	return error;
				1365	}
				1366
				1367	/**
				1368	* do_shrink - make a file smaller
				1369	* @inode: the inode
				1370	* @oldsize: the current inode size
				1371	* @newsize: the size to make the file
				1372	*
				1373	* Called with an exclusive lock on @inode. The @size must
				1374	* be equal to or smaller than the current inode size.
				1375	*
				1376	* Returns: errno
				1377	*/
				1378
				1379	static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
				1380	{
				1381	struct gfs2_inode *ip = GFS2_I(inode);
				1382	int error;
				1383
				1384	error = trunc_start(inode, oldsize, newsize);
				1385	if (error < 0)
				1386	return error;
				1387	if (gfs2_is_stuffed(ip))
				1388	return 0;
				1389
				1390	error = trunc_dealloc(ip, newsize);
				1391	if (error == 0)
				1392	error = trunc_end(ip);
				1393
				1394	return error;
				1395	}
				1396
				1397	void gfs2_trim_blocks(struct inode *inode)
				1398	{
				1399	u64 size = inode->i_size;
				1400	int ret;
				1401
				1402	ret = do_shrink(inode, size, size);
				1403	WARN_ON(ret != 0);
				1404	}
				1405
				1406	/**
				1407	* do_grow - Touch and update inode size
				1408	* @inode: The inode
				1409	* @size: The new size
				1410	*
				1411	* This function updates the timestamps on the inode and
				1412	* may also increase the size of the inode. This function
				1413	* must not be called with @size any smaller than the current
				1414	* inode size.
				1415	*
				1416	* Although it is not strictly required to unstuff files here,
				1417	* earlier versions of GFS2 have a bug in the stuffed file reading
				1418	* code which will result in a buffer overrun if the size is larger
				1419	* than the max stuffed file size. In order to prevent this from
				1420	* occurring, such files are unstuffed, but in other cases we can
				1421	* just update the inode size directly.
				1422	*
				1423	* Returns: 0 on success, or -ve on error
				1424	*/
				1425
				1426	static int do_grow(struct inode *inode, u64 size)
				1427	{
				1428	struct gfs2_inode *ip = GFS2_I(inode);
				1429	struct gfs2_sbd *sdp = GFS2_SB(inode);
				1430	struct gfs2_alloc_parms ap = { .target = 1, };
				1431	struct buffer_head *dibh;
				1432	int error;
				1433	int unstuff = 0;
				1434
				1435	if (gfs2_is_stuffed(ip) &&
				1436	(size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
				1437	error = gfs2_quota_lock_check(ip, &ap);
				1438	if (error)
				1439	return error;
				1440
				1441	error = gfs2_inplace_reserve(ip, &ap);
				1442	if (error)
				1443	goto do_grow_qunlock;
				1444	unstuff = 1;
				1445	}
				1446
				1447	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
				1448	(unstuff &&
				1449	gfs2_is_jdata(ip) ? RES_JDATA : 0) +
				1450	(sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
				1451	0 : RES_QUOTA), 0);
				1452	if (error)
				1453	goto do_grow_release;
				1454
				1455	if (unstuff) {
				1456	error = gfs2_unstuff_dinode(ip, NULL);
				1457	if (error)
				1458	goto do_end_trans;
				1459	}
				1460
				1461	error = gfs2_meta_inode_buffer(ip, &dibh);
				1462	if (error)
				1463	goto do_end_trans;
				1464
				1465	i_size_write(inode, size);
				1466	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
				1467	gfs2_trans_add_meta(ip->i_gl, dibh);
				1468	gfs2_dinode_out(ip, dibh->b_data);
				1469	brelse(dibh);
				1470
				1471	do_end_trans:
				1472	gfs2_trans_end(sdp);
				1473	do_grow_release:
				1474	if (unstuff) {
				1475	gfs2_inplace_release(ip);
				1476	do_grow_qunlock:
				1477	gfs2_quota_unlock(ip);
				1478	}
				1479	return error;
				1480	}
				1481
				1482	/**
				1483	* gfs2_setattr_size - make a file a given size
				1484	* @inode: the inode
				1485	* @newsize: the size to make the file
				1486	*
				1487	* The file size can grow, shrink, or stay the same size. This
				1488	* is called holding i_mutex and an exclusive glock on the inode
				1489	* in question.
				1490	*
				1491	* Returns: errno
				1492	*/
				1493
				1494	int gfs2_setattr_size(struct inode *inode, u64 newsize)
				1495	{
				1496	struct gfs2_inode *ip = GFS2_I(inode);
				1497	int ret;
				1498	u64 oldsize;
				1499
				1500	BUG_ON(!S_ISREG(inode->i_mode));
				1501
				1502	ret = inode_newsize_ok(inode, newsize);
				1503	if (ret)
				1504	return ret;
				1505
				1506	inode_dio_wait(inode);
				1507
				1508	ret = gfs2_rsqa_alloc(ip);
				1509	if (ret)
				1510	goto out;
				1511
				1512	oldsize = inode->i_size;
				1513	if (newsize >= oldsize) {
				1514	ret = do_grow(inode, newsize);
				1515	goto out;
				1516	}
				1517
				1518	ret = do_shrink(inode, oldsize, newsize);
				1519	out:
				1520	gfs2_rsqa_delete(ip, NULL);
				1521	return ret;
				1522	}
				1523
				1524	int gfs2_truncatei_resume(struct gfs2_inode *ip)
				1525	{
				1526	int error;
				1527	error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
				1528	if (!error)
				1529	error = trunc_end(ip);
				1530	return error;
				1531	}
				1532
				1533	int gfs2_file_dealloc(struct gfs2_inode *ip)
				1534	{
				1535	return trunc_dealloc(ip, 0);
				1536	}
				1537
				1538	/**
				1539	* gfs2_free_journal_extents - Free cached journal bmap info
				1540	* @jd: The journal
				1541	*
				1542	*/
				1543
				1544	void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
				1545	{
				1546	struct gfs2_journal_extent *jext;
				1547
				1548	while(!list_empty(&jd->extent_list)) {
				1549	jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
				1550	list_del(&jext->list);
				1551	kfree(jext);
				1552	}
				1553	}
				1554
				1555	/**
				1556	* gfs2_add_jextent - Add or merge a new extent to extent cache
				1557	* @jd: The journal descriptor
				1558	* @lblock: The logical block at start of new extent
				1559	* @dblock: The physical block at start of new extent
				1560	* @blocks: Size of extent in fs blocks
				1561	*
				1562	* Returns: 0 on success or -ENOMEM
				1563	*/
				1564
				1565	static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
				1566	{
				1567	struct gfs2_journal_extent *jext;
				1568
				1569	if (!list_empty(&jd->extent_list)) {
				1570	jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
				1571	if ((jext->dblock + jext->blocks) == dblock) {
				1572	jext->blocks += blocks;
				1573	return 0;
				1574	}
				1575	}
				1576
				1577	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
				1578	if (jext == NULL)
				1579	return -ENOMEM;
				1580	jext->dblock = dblock;
				1581	jext->lblock = lblock;
				1582	jext->blocks = blocks;
				1583	list_add_tail(&jext->list, &jd->extent_list);
				1584	jd->nr_extents++;
				1585	return 0;
				1586	}
				1587
				1588	/**
				1589	* gfs2_map_journal_extents - Cache journal bmap info
				1590	* @sdp: The super block
				1591	* @jd: The journal to map
				1592	*
				1593	* Create a reusable "extent" mapping from all logical
				1594	* blocks to all physical blocks for the given journal. This will save
				1595	* us time when writing journal blocks. Most journals will have only one
				1596	* extent that maps all their logical blocks. That's because gfs2.mkfs
				1597	* arranges the journal blocks sequentially to maximize performance.
				1598	* So the extent would map the first block for the entire file length.
				1599	* However, gfs2_jadd can happen while file activity is happening, so
				1600	* those journals may not be sequential. Less likely is the case where
				1601	* the users created their own journals by mounting the metafs and
				1602	* laying it out. But it's still possible. These journals might have
				1603	* several extents.
				1604	*
				1605	* Returns: 0 on success, or error on failure
				1606	*/
				1607
				1608	int gfs2_map_journal_extents(struct gfs2_sbd sdp, struct gfs2_jdesc jd)
				1609	{
				1610	u64 lblock = 0;
				1611	u64 lblock_stop;
				1612	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
				1613	struct buffer_head bh;
				1614	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
				1615	u64 size;
				1616	int rc;
				1617
				1618	lblock_stop = i_size_read(jd->jd_inode) >> shift;
				1619	size = (lblock_stop - lblock) << shift;
				1620	jd->nr_extents = 0;
				1621	WARN_ON(!list_empty(&jd->extent_list));
				1622
				1623	do {
				1624	bh.b_state = 0;
				1625	bh.b_blocknr = 0;
				1626	bh.b_size = size;
				1627	rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
				1628	if (rc \|\| !buffer_mapped(&bh))
				1629	goto fail;
				1630	rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
				1631	if (rc)
				1632	goto fail;
				1633	size -= bh.b_size;
				1634	lblock += (bh.b_size >> ip->i_inode.i_blkbits);
				1635	} while(size > 0);
				1636
				1637	fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
				1638	jd->nr_extents);
				1639	return 0;
				1640
				1641	fail:
				1642	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
				1643	rc, jd->jd_jid,
				1644	(unsigned long long)(i_size_read(jd->jd_inode) - size),
				1645	jd->nr_extents);
				1646	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
				1647	rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
				1648	bh.b_state, (unsigned long long)bh.b_size);
				1649	gfs2_free_journal_extents(jd);
				1650	return rc;
				1651	}
				1652
				1653	/**
				1654	* gfs2_write_alloc_required - figure out if a write will require an allocation
				1655	* @ip: the file being written to
				1656	* @offset: the offset to write to
				1657	* @len: the number of bytes being written
				1658	*
				1659	* Returns: 1 if an alloc is required, 0 otherwise
				1660	*/
				1661
				1662	int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
				1663	unsigned int len)
				1664	{
				1665	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
				1666	struct buffer_head bh;
				1667	unsigned int shift;
				1668	u64 lblock, lblock_stop, size;
				1669	u64 end_of_file;
				1670
				1671	if (!len)
				1672	return 0;
				1673
				1674	if (gfs2_is_stuffed(ip)) {
				1675	if (offset + len >
				1676	sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
				1677	return 1;
				1678	return 0;
				1679	}
				1680
				1681	shift = sdp->sd_sb.sb_bsize_shift;
				1682	BUG_ON(gfs2_is_dir(ip));
				1683	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
				1684	lblock = offset >> shift;
				1685	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
				1686	if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
				1687	return 1;
				1688
				1689	size = (lblock_stop - lblock) << shift;
				1690	do {
				1691	bh.b_state = 0;
				1692	bh.b_size = size;
				1693	gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
				1694	if (!buffer_mapped(&bh))
				1695	return 1;
				1696	size -= bh.b_size;
				1697	lblock += (bh.b_size >> ip->i_inode.i_blkbits);
				1698	} while(size > 0);
				1699
				1700	return 0;
				1701	}
				1702