Blame - src/kernel/linux/v4.19/fs/nfs/blocklayout/blocklayout.c - T800

blob: 06cb0c1d9aee37dff885f511fa4f8da6f5ec1914 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* linux/fs/nfs/blocklayout/blocklayout.c
				3	*
				4	* Module for the NFSv4.1 pNFS block layout driver.
				5	*
				6	* Copyright (c) 2006 The Regents of the University of Michigan.
				7	* All rights reserved.
				8	*
				9	* Andy Adamson <andros@citi.umich.edu>
				10	* Fred Isaman <iisaman@umich.edu>
				11	*
				12	* permission is granted to use, copy, create derivative works and
				13	* redistribute this software and such derivative works for any purpose,
				14	* so long as the name of the university of michigan is not used in
				15	* any advertising or publicity pertaining to the use or distribution
				16	* of this software without specific, written prior authorization. if
				17	* the above copyright notice or any other identification of the
				18	* university of michigan is included in any copy of any portion of
				19	* this software, then the disclaimer below must also be included.
				20	*
				21	* this software is provided as is, without representation from the
				22	* university of michigan as to its fitness for any purpose, and without
				23	* warranty by the university of michigan of any kind, either express
				24	* or implied, including without limitation the implied warranties of
				25	* merchantability and fitness for a particular purpose. the regents
				26	* of the university of michigan shall not be liable for any damages,
				27	* including special, indirect, incidental, or consequential damages,
				28	* with respect to any claim arising out or in connection with the use
				29	* of the software, even if it has been or is hereafter advised of the
				30	* possibility of such damages.
				31	*/
				32
				33	#include <linux/module.h>
				34	#include <linux/init.h>
				35	#include <linux/mount.h>
				36	#include <linux/namei.h>
				37	#include <linux/bio.h> /* struct bio */
				38	#include <linux/prefetch.h>
				39	#include <linux/pagevec.h>
				40
				41	#include "../pnfs.h"
				42	#include "../nfs4session.h"
				43	#include "../internal.h"
				44	#include "blocklayout.h"
				45
				46	#define NFSDBG_FACILITY NFSDBG_PNFS_LD
				47
				48	MODULE_LICENSE("GPL");
				49	MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
				50	MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
				51
				52	static bool is_hole(struct pnfs_block_extent *be)
				53	{
				54	switch (be->be_state) {
				55	case PNFS_BLOCK_NONE_DATA:
				56	return true;
				57	case PNFS_BLOCK_INVALID_DATA:
				58	return be->be_tag ? false : true;
				59	default:
				60	return false;
				61	}
				62	}
				63
				64	/* The data we are handed might be spread across several bios. We need
				65	* to track when the last one is finished.
				66	*/
				67	struct parallel_io {
				68	struct kref refcnt;
				69	void (pnfs_callback) (void data);
				70	void *data;
				71	};
				72
				73	static inline struct parallel_io alloc_parallel(void data)
				74	{
				75	struct parallel_io *rv;
				76
				77	rv = kmalloc(sizeof(*rv), GFP_NOFS);
				78	if (rv) {
				79	rv->data = data;
				80	kref_init(&rv->refcnt);
				81	}
				82	return rv;
				83	}
				84
				85	static inline void get_parallel(struct parallel_io *p)
				86	{
				87	kref_get(&p->refcnt);
				88	}
				89
				90	static void destroy_parallel(struct kref *kref)
				91	{
				92	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
				93
				94	dprintk("%s enter\n", __func__);
				95	p->pnfs_callback(p->data);
				96	kfree(p);
				97	}
				98
				99	static inline void put_parallel(struct parallel_io *p)
				100	{
				101	kref_put(&p->refcnt, destroy_parallel);
				102	}
				103
				104	static struct bio *
				105	bl_submit_bio(struct bio *bio)
				106	{
				107	if (bio) {
				108	get_parallel(bio->bi_private);
				109	dprintk("%s submitting %s bio %u@%llu\n", __func__,
				110	bio_op(bio) == READ ? "read" : "write",
				111	bio->bi_iter.bi_size,
				112	(unsigned long long)bio->bi_iter.bi_sector);
				113	submit_bio(bio);
				114	}
				115	return NULL;
				116	}
				117
				118	static struct bio *
				119	bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
				120	bio_end_io_t end_io, struct parallel_io *par)
				121	{
				122	struct bio *bio;
				123
				124	npg = min(npg, BIO_MAX_PAGES);
				125	bio = bio_alloc(GFP_NOIO, npg);
				126	if (!bio && (current->flags & PF_MEMALLOC)) {
				127	while (!bio && (npg /= 2))
				128	bio = bio_alloc(GFP_NOIO, npg);
				129	}
				130
				131	if (bio) {
				132	bio->bi_iter.bi_sector = disk_sector;
				133	bio_set_dev(bio, bdev);
				134	bio->bi_end_io = end_io;
				135	bio->bi_private = par;
				136	}
				137	return bio;
				138	}
				139
				140	static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map)
				141	{
				142	return offset >= map->start && offset < map->start + map->len;
				143	}
				144
				145	static struct bio *
				146	do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
				147	struct page page, struct pnfs_block_dev_map map,
				148	struct pnfs_block_extent *be, bio_end_io_t end_io,
				149	struct parallel_io par, unsigned int offset, int len)
				150	{
				151	struct pnfs_block_dev *dev =
				152	container_of(be->be_device, struct pnfs_block_dev, node);
				153	u64 disk_addr, end;
				154
				155	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
				156	npg, rw, (unsigned long long)isect, offset, *len);
				157
				158	/* translate to device offset */
				159	isect += be->be_v_offset;
				160	isect -= be->be_f_offset;
				161
				162	/* translate to physical disk offset */
				163	disk_addr = (u64)isect << SECTOR_SHIFT;
				164	if (!offset_in_map(disk_addr, map)) {
				165	if (!dev->map(dev, disk_addr, map) \|\| !offset_in_map(disk_addr, map))
				166	return ERR_PTR(-EIO);
				167	bio = bl_submit_bio(bio);
				168	}
				169	disk_addr += map->disk_offset;
				170	disk_addr -= map->start;
				171
				172	/* limit length to what the device mapping allows */
				173	end = disk_addr + *len;
				174	if (end >= map->start + map->len)
				175	*len = map->start + map->len - disk_addr;
				176
				177	retry:
				178	if (!bio) {
				179	bio = bl_alloc_init_bio(npg, map->bdev,
				180	disk_addr >> SECTOR_SHIFT, end_io, par);
				181	if (!bio)
				182	return ERR_PTR(-ENOMEM);
				183	bio_set_op_attrs(bio, rw, 0);
				184	}
				185	if (bio_add_page(bio, page, len, offset) < len) {
				186	bio = bl_submit_bio(bio);
				187	goto retry;
				188	}
				189	return bio;
				190	}
				191
				192	static void bl_mark_devices_unavailable(struct nfs_pgio_header *header, bool rw)
				193	{
				194	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
				195	size_t bytes_left = header->args.count;
				196	sector_t isect, extent_length = 0;
				197	struct pnfs_block_extent be;
				198
				199	isect = header->args.offset >> SECTOR_SHIFT;
				200	bytes_left += header->args.offset - (isect << SECTOR_SHIFT);
				201
				202	while (bytes_left > 0) {
				203	if (!ext_tree_lookup(bl, isect, &be, rw))
				204	return;
				205	extent_length = be.be_length - (isect - be.be_f_offset);
				206	nfs4_mark_deviceid_unavailable(be.be_device);
				207	isect += extent_length;
				208	if (bytes_left > extent_length << SECTOR_SHIFT)
				209	bytes_left -= extent_length << SECTOR_SHIFT;
				210	else
				211	bytes_left = 0;
				212	}
				213	}
				214
				215	static void bl_end_io_read(struct bio *bio)
				216	{
				217	struct parallel_io *par = bio->bi_private;
				218
				219	if (bio->bi_status) {
				220	struct nfs_pgio_header *header = par->data;
				221
				222	if (!header->pnfs_error)
				223	header->pnfs_error = -EIO;
				224	pnfs_set_lo_fail(header->lseg);
				225	bl_mark_devices_unavailable(header, false);
				226	}
				227
				228	bio_put(bio);
				229	put_parallel(par);
				230	}
				231
				232	static void bl_read_cleanup(struct work_struct *work)
				233	{
				234	struct rpc_task *task;
				235	struct nfs_pgio_header *hdr;
				236	dprintk("%s enter\n", __func__);
				237	task = container_of(work, struct rpc_task, u.tk_work);
				238	hdr = container_of(task, struct nfs_pgio_header, task);
				239	pnfs_ld_read_done(hdr);
				240	}
				241
				242	static void
				243	bl_end_par_io_read(void *data)
				244	{
				245	struct nfs_pgio_header *hdr = data;
				246
				247	hdr->task.tk_status = hdr->pnfs_error;
				248	INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
				249	schedule_work(&hdr->task.u.tk_work);
				250	}
				251
				252	static enum pnfs_try_status
				253	bl_read_pagelist(struct nfs_pgio_header *header)
				254	{
				255	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
				256	struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
				257	struct bio *bio = NULL;
				258	struct pnfs_block_extent be;
				259	sector_t isect, extent_length = 0;
				260	struct parallel_io *par;
				261	loff_t f_offset = header->args.offset;
				262	size_t bytes_left = header->args.count;
				263	unsigned int pg_offset = header->args.pgbase, pg_len;
				264	struct page **pages = header->args.pages;
				265	int pg_index = header->args.pgbase >> PAGE_SHIFT;
				266	const bool is_dio = (header->dreq != NULL);
				267	struct blk_plug plug;
				268	int i;
				269
				270	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
				271	header->page_array.npages, f_offset,
				272	(unsigned int)header->args.count);
				273
				274	par = alloc_parallel(header);
				275	if (!par)
				276	return PNFS_NOT_ATTEMPTED;
				277	par->pnfs_callback = bl_end_par_io_read;
				278
				279	blk_start_plug(&plug);
				280
				281	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
				282	/* Code assumes extents are page-aligned */
				283	for (i = pg_index; i < header->page_array.npages; i++) {
				284	if (extent_length <= 0) {
				285	/* We've used up the previous extent */
				286	bio = bl_submit_bio(bio);
				287
				288	/* Get the next one */
				289	if (!ext_tree_lookup(bl, isect, &be, false)) {
				290	header->pnfs_error = -EIO;
				291	goto out;
				292	}
				293	extent_length = be.be_length - (isect - be.be_f_offset);
				294	}
				295
				296	if (is_dio) {
				297	if (pg_offset + bytes_left > PAGE_SIZE)
				298	pg_len = PAGE_SIZE - pg_offset;
				299	else
				300	pg_len = bytes_left;
				301	} else {
				302	BUG_ON(pg_offset != 0);
				303	pg_len = PAGE_SIZE;
				304	}
				305
				306	if (is_hole(&be)) {
				307	bio = bl_submit_bio(bio);
				308	/* Fill hole w/ zeroes w/o accessing device */
				309	dprintk("%s Zeroing page for hole\n", __func__);
				310	zero_user_segment(pages[i], pg_offset, pg_len);
				311
				312	/* invalidate map */
				313	map.start = NFS4_MAX_UINT64;
				314	} else {
				315	bio = do_add_page_to_bio(bio,
				316	header->page_array.npages - i,
				317	READ,
				318	isect, pages[i], &map, &be,
				319	bl_end_io_read, par,
				320	pg_offset, &pg_len);
				321	if (IS_ERR(bio)) {
				322	header->pnfs_error = PTR_ERR(bio);
				323	bio = NULL;
				324	goto out;
				325	}
				326	}
				327	isect += (pg_len >> SECTOR_SHIFT);
				328	extent_length -= (pg_len >> SECTOR_SHIFT);
				329	f_offset += pg_len;
				330	bytes_left -= pg_len;
				331	pg_offset = 0;
				332	}
				333	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
				334	header->res.eof = 1;
				335	header->res.count = header->inode->i_size - header->args.offset;
				336	} else {
				337	header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
				338	}
				339	out:
				340	bl_submit_bio(bio);
				341	blk_finish_plug(&plug);
				342	put_parallel(par);
				343	return PNFS_ATTEMPTED;
				344	}
				345
				346	static void bl_end_io_write(struct bio *bio)
				347	{
				348	struct parallel_io *par = bio->bi_private;
				349	struct nfs_pgio_header *header = par->data;
				350
				351	if (bio->bi_status) {
				352	if (!header->pnfs_error)
				353	header->pnfs_error = -EIO;
				354	pnfs_set_lo_fail(header->lseg);
				355	bl_mark_devices_unavailable(header, true);
				356	}
				357	bio_put(bio);
				358	put_parallel(par);
				359	}
				360
				361	/* Function scheduled for call during bl_end_par_io_write,
				362	* it marks sectors as written and extends the commitlist.
				363	*/
				364	static void bl_write_cleanup(struct work_struct *work)
				365	{
				366	struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
				367	struct nfs_pgio_header *hdr =
				368	container_of(task, struct nfs_pgio_header, task);
				369
				370	dprintk("%s enter\n", __func__);
				371
				372	if (likely(!hdr->pnfs_error)) {
				373	struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
				374	u64 start = hdr->args.offset & (loff_t)PAGE_MASK;
				375	u64 end = (hdr->args.offset + hdr->args.count +
				376	PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
				377	u64 lwb = hdr->args.offset + hdr->args.count;
				378
				379	ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
				380	(end - start) >> SECTOR_SHIFT, lwb);
				381	}
				382
				383	pnfs_ld_write_done(hdr);
				384	}
				385
				386	/* Called when last of bios associated with a bl_write_pagelist call finishes */
				387	static void bl_end_par_io_write(void *data)
				388	{
				389	struct nfs_pgio_header *hdr = data;
				390
				391	hdr->task.tk_status = hdr->pnfs_error;
				392	hdr->verf.committed = NFS_FILE_SYNC;
				393	INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
				394	schedule_work(&hdr->task.u.tk_work);
				395	}
				396
				397	static enum pnfs_try_status
				398	bl_write_pagelist(struct nfs_pgio_header *header, int sync)
				399	{
				400	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
				401	struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
				402	struct bio *bio = NULL;
				403	struct pnfs_block_extent be;
				404	sector_t isect, extent_length = 0;
				405	struct parallel_io *par = NULL;
				406	loff_t offset = header->args.offset;
				407	size_t count = header->args.count;
				408	struct page **pages = header->args.pages;
				409	int pg_index = header->args.pgbase >> PAGE_SHIFT;
				410	unsigned int pg_len;
				411	struct blk_plug plug;
				412	int i;
				413
				414	dprintk("%s enter, %zu@%lld\n", __func__, count, offset);
				415
				416	/* At this point, header->page_aray is a (sequential) list of nfs_pages.
				417	* We want to write each, and if there is an error set pnfs_error
				418	* to have it redone using nfs.
				419	*/
				420	par = alloc_parallel(header);
				421	if (!par)
				422	return PNFS_NOT_ATTEMPTED;
				423	par->pnfs_callback = bl_end_par_io_write;
				424
				425	blk_start_plug(&plug);
				426
				427	/* we always write out the whole page */
				428	offset = offset & (loff_t)PAGE_MASK;
				429	isect = offset >> SECTOR_SHIFT;
				430
				431	for (i = pg_index; i < header->page_array.npages; i++) {
				432	if (extent_length <= 0) {
				433	/* We've used up the previous extent */
				434	bio = bl_submit_bio(bio);
				435	/* Get the next one */
				436	if (!ext_tree_lookup(bl, isect, &be, true)) {
				437	header->pnfs_error = -EINVAL;
				438	goto out;
				439	}
				440
				441	extent_length = be.be_length - (isect - be.be_f_offset);
				442	}
				443
				444	pg_len = PAGE_SIZE;
				445	bio = do_add_page_to_bio(bio, header->page_array.npages - i,
				446	WRITE, isect, pages[i], &map, &be,
				447	bl_end_io_write, par,
				448	0, &pg_len);
				449	if (IS_ERR(bio)) {
				450	header->pnfs_error = PTR_ERR(bio);
				451	bio = NULL;
				452	goto out;
				453	}
				454
				455	offset += pg_len;
				456	count -= pg_len;
				457	isect += (pg_len >> SECTOR_SHIFT);
				458	extent_length -= (pg_len >> SECTOR_SHIFT);
				459	}
				460
				461	header->res.count = header->args.count;
				462	out:
				463	bl_submit_bio(bio);
				464	blk_finish_plug(&plug);
				465	put_parallel(par);
				466	return PNFS_ATTEMPTED;
				467	}
				468
				469	static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
				470	{
				471	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
				472	int err;
				473
				474	dprintk("%s enter\n", __func__);
				475
				476	err = ext_tree_remove(bl, true, 0, LLONG_MAX);
				477	WARN_ON(err);
				478
				479	kfree(bl);
				480	}
				481
				482	static struct pnfs_layout_hdr __bl_alloc_layout_hdr(struct inode inode,
				483	gfp_t gfp_flags, bool is_scsi_layout)
				484	{
				485	struct pnfs_block_layout *bl;
				486
				487	dprintk("%s enter\n", __func__);
				488	bl = kzalloc(sizeof(*bl), gfp_flags);
				489	if (!bl)
				490	return NULL;
				491
				492	bl->bl_ext_rw = RB_ROOT;
				493	bl->bl_ext_ro = RB_ROOT;
				494	spin_lock_init(&bl->bl_ext_lock);
				495
				496	bl->bl_scsi_layout = is_scsi_layout;
				497	return &bl->bl_layout;
				498	}
				499
				500	static struct pnfs_layout_hdr bl_alloc_layout_hdr(struct inode inode,
				501	gfp_t gfp_flags)
				502	{
				503	return __bl_alloc_layout_hdr(inode, gfp_flags, false);
				504	}
				505
				506	static struct pnfs_layout_hdr sl_alloc_layout_hdr(struct inode inode,
				507	gfp_t gfp_flags)
				508	{
				509	return __bl_alloc_layout_hdr(inode, gfp_flags, true);
				510	}
				511
				512	static void bl_free_lseg(struct pnfs_layout_segment *lseg)
				513	{
				514	dprintk("%s enter\n", __func__);
				515	kfree(lseg);
				516	}
				517
				518	/* Tracks info needed to ensure extents in layout obey constraints of spec */
				519	struct layout_verification {
				520	u32 mode; /* R or RW */
				521	u64 start; /* Expected start of next non-COW extent */
				522	u64 inval; /* Start of INVAL coverage */
				523	u64 cowread; /* End of COW read coverage */
				524	};
				525
				526	/* Verify the extent meets the layout requirements of the pnfs-block draft,
				527	* section 2.3.1.
				528	*/
				529	static int verify_extent(struct pnfs_block_extent *be,
				530	struct layout_verification *lv)
				531	{
				532	if (lv->mode == IOMODE_READ) {
				533	if (be->be_state == PNFS_BLOCK_READWRITE_DATA \|\|
				534	be->be_state == PNFS_BLOCK_INVALID_DATA)
				535	return -EIO;
				536	if (be->be_f_offset != lv->start)
				537	return -EIO;
				538	lv->start += be->be_length;
				539	return 0;
				540	}
				541	/* lv->mode == IOMODE_RW */
				542	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
				543	if (be->be_f_offset != lv->start)
				544	return -EIO;
				545	if (lv->cowread > lv->start)
				546	return -EIO;
				547	lv->start += be->be_length;
				548	lv->inval = lv->start;
				549	return 0;
				550	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
				551	if (be->be_f_offset != lv->start)
				552	return -EIO;
				553	lv->start += be->be_length;
				554	return 0;
				555	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
				556	if (be->be_f_offset > lv->start)
				557	return -EIO;
				558	if (be->be_f_offset < lv->inval)
				559	return -EIO;
				560	if (be->be_f_offset < lv->cowread)
				561	return -EIO;
				562	/* It looks like you might want to min this with lv->start,
				563	* but you really don't.
				564	*/
				565	lv->inval = lv->inval + be->be_length;
				566	lv->cowread = be->be_f_offset + be->be_length;
				567	return 0;
				568	} else
				569	return -EIO;
				570	}
				571
				572	static int decode_sector_number(__be32 *rp, sector_t sp)
				573	{
				574	uint64_t s;
				575
				576	rp = xdr_decode_hyper(rp, &s);
				577	if (s & 0x1ff) {
				578	printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
				579	return -1;
				580	}
				581	*sp = s >> SECTOR_SHIFT;
				582	return 0;
				583	}
				584
				585	static struct nfs4_deviceid_node *
				586	bl_find_get_deviceid(struct nfs_server *server,
				587	const struct nfs4_deviceid id, struct rpc_cred cred,
				588	gfp_t gfp_mask)
				589	{
				590	struct nfs4_deviceid_node *node;
				591	unsigned long start, end;
				592
				593	retry:
				594	node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
				595	if (!node)
				596	return ERR_PTR(-ENODEV);
				597
				598	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
				599	return node;
				600
				601	end = jiffies;
				602	start = end - PNFS_DEVICE_RETRY_TIMEOUT;
				603	if (!time_in_range(node->timestamp_unavailable, start, end)) {
				604	nfs4_delete_deviceid(node->ld, node->nfs_client, id);
				605	goto retry;
				606	}
				607	return ERR_PTR(-ENODEV);
				608	}
				609
				610	static int
				611	bl_alloc_extent(struct xdr_stream xdr, struct pnfs_layout_hdr lo,
				612	struct layout_verification lv, struct list_head extents,
				613	gfp_t gfp_mask)
				614	{
				615	struct pnfs_block_extent *be;
				616	struct nfs4_deviceid id;
				617	int error;
				618	__be32 *p;
				619
				620	p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
				621	if (!p)
				622	return -EIO;
				623
				624	be = kzalloc(sizeof(*be), GFP_NOFS);
				625	if (!be)
				626	return -ENOMEM;
				627
				628	memcpy(&id, p, NFS4_DEVICEID4_SIZE);
				629	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
				630
				631	be->be_device = bl_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
				632	lo->plh_lc_cred, gfp_mask);
				633	if (IS_ERR(be->be_device)) {
				634	error = PTR_ERR(be->be_device);
				635	goto out_free_be;
				636	}
				637
				638	/*
				639	* The next three values are read in as bytes, but stored in the
				640	* extent structure in 512-byte granularity.
				641	*/
				642	error = -EIO;
				643	if (decode_sector_number(&p, &be->be_f_offset) < 0)
				644	goto out_put_deviceid;
				645	if (decode_sector_number(&p, &be->be_length) < 0)
				646	goto out_put_deviceid;
				647	if (decode_sector_number(&p, &be->be_v_offset) < 0)
				648	goto out_put_deviceid;
				649	be->be_state = be32_to_cpup(p++);
				650
				651	error = verify_extent(be, lv);
				652	if (error) {
				653	dprintk("%s: extent verification failed\n", __func__);
				654	goto out_put_deviceid;
				655	}
				656
				657	list_add_tail(&be->be_list, extents);
				658	return 0;
				659
				660	out_put_deviceid:
				661	nfs4_put_deviceid_node(be->be_device);
				662	out_free_be:
				663	kfree(be);
				664	return error;
				665	}
				666
				667	static struct pnfs_layout_segment *
				668	bl_alloc_lseg(struct pnfs_layout_hdr lo, struct nfs4_layoutget_res lgr,
				669	gfp_t gfp_mask)
				670	{
				671	struct layout_verification lv = {
				672	.mode = lgr->range.iomode,
				673	.start = lgr->range.offset >> SECTOR_SHIFT,
				674	.inval = lgr->range.offset >> SECTOR_SHIFT,
				675	.cowread = lgr->range.offset >> SECTOR_SHIFT,
				676	};
				677	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
				678	struct pnfs_layout_segment *lseg;
				679	struct xdr_buf buf;
				680	struct xdr_stream xdr;
				681	struct page *scratch;
				682	int status, i;
				683	uint32_t count;
				684	__be32 *p;
				685	LIST_HEAD(extents);
				686
				687	dprintk("---> %s\n", __func__);
				688
				689	lseg = kzalloc(sizeof(*lseg), gfp_mask);
				690	if (!lseg)
				691	return ERR_PTR(-ENOMEM);
				692
				693	status = -ENOMEM;
				694	scratch = alloc_page(gfp_mask);
				695	if (!scratch)
				696	goto out;
				697
				698	xdr_init_decode_pages(&xdr, &buf,
				699	lgr->layoutp->pages, lgr->layoutp->len);
				700	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
				701
				702	status = -EIO;
				703	p = xdr_inline_decode(&xdr, 4);
				704	if (unlikely(!p))
				705	goto out_free_scratch;
				706
				707	count = be32_to_cpup(p++);
				708	dprintk("%s: number of extents %d\n", __func__, count);
				709
				710	/*
				711	* Decode individual extents, putting them in temporary staging area
				712	* until whole layout is decoded to make error recovery easier.
				713	*/
				714	for (i = 0; i < count; i++) {
				715	status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
				716	if (status)
				717	goto process_extents;
				718	}
				719
				720	if (lgr->range.offset + lgr->range.length !=
				721	lv.start << SECTOR_SHIFT) {
				722	dprintk("%s Final length mismatch\n", __func__);
				723	status = -EIO;
				724	goto process_extents;
				725	}
				726
				727	if (lv.start < lv.cowread) {
				728	dprintk("%s Final uncovered COW extent\n", __func__);
				729	status = -EIO;
				730	}
				731
				732	process_extents:
				733	while (!list_empty(&extents)) {
				734	struct pnfs_block_extent *be =
				735	list_first_entry(&extents, struct pnfs_block_extent,
				736	be_list);
				737	list_del(&be->be_list);
				738
				739	if (!status)
				740	status = ext_tree_insert(bl, be);
				741
				742	if (status) {
				743	nfs4_put_deviceid_node(be->be_device);
				744	kfree(be);
				745	}
				746	}
				747
				748	out_free_scratch:
				749	__free_page(scratch);
				750	out:
				751	dprintk("%s returns %d\n", __func__, status);
				752	switch (status) {
				753	case -ENODEV:
				754	/* Our extent block devices are unavailable */
				755	set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags);
				756	/* Fall through */
				757	case 0:
				758	return lseg;
				759	default:
				760	kfree(lseg);
				761	return ERR_PTR(status);
				762	}
				763	}
				764
				765	static void
				766	bl_return_range(struct pnfs_layout_hdr *lo,
				767	struct pnfs_layout_range *range)
				768	{
				769	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
				770	sector_t offset = range->offset >> SECTOR_SHIFT, end;
				771
				772	if (range->offset % 8) {
				773	dprintk("%s: offset %lld not block size aligned\n",
				774	__func__, range->offset);
				775	return;
				776	}
				777
				778	if (range->length != NFS4_MAX_UINT64) {
				779	if (range->length % 8) {
				780	dprintk("%s: length %lld not block size aligned\n",
				781	__func__, range->length);
				782	return;
				783	}
				784
				785	end = offset + (range->length >> SECTOR_SHIFT);
				786	} else {
				787	end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
				788	}
				789
				790	ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
				791	}
				792
				793	static int
				794	bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
				795	{
				796	return ext_tree_prepare_commit(arg);
				797	}
				798
				799	static void
				800	bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
				801	{
				802	ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
				803	}
				804
				805	static int
				806	bl_set_layoutdriver(struct nfs_server server, const struct nfs_fh fh)
				807	{
				808	dprintk("%s enter\n", __func__);
				809
				810	if (server->pnfs_blksize == 0) {
				811	dprintk("%s Server did not return blksize\n", __func__);
				812	return -EINVAL;
				813	}
				814	if (server->pnfs_blksize > PAGE_SIZE) {
				815	printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
				816	__func__, server->pnfs_blksize);
				817	return -EINVAL;
				818	}
				819
				820	return 0;
				821	}
				822
				823	static bool
				824	is_aligned_req(struct nfs_pageio_descriptor *pgio,
				825	struct nfs_page *req, unsigned int alignment, bool is_write)
				826	{
				827	/*
				828	* Always accept buffered writes, higher layers take care of the
				829	* right alignment.
				830	*/
				831	if (pgio->pg_dreq == NULL)
				832	return true;
				833
				834	if (!IS_ALIGNED(req->wb_offset, alignment))
				835	return false;
				836
				837	if (IS_ALIGNED(req->wb_bytes, alignment))
				838	return true;
				839
				840	if (is_write &&
				841	(req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode))) {
				842	/*
				843	* If the write goes up to the inode size, just write
				844	* the full page. Data past the inode size is
				845	* guaranteed to be zeroed by the higher level client
				846	* code, and this behaviour is mandated by RFC 5663
				847	* section 2.3.2.
				848	*/
				849	return true;
				850	}
				851
				852	return false;
				853	}
				854
				855	static void
				856	bl_pg_init_read(struct nfs_pageio_descriptor pgio, struct nfs_page req)
				857	{
				858	if (!is_aligned_req(pgio, req, SECTOR_SIZE, false)) {
				859	nfs_pageio_reset_read_mds(pgio);
				860	return;
				861	}
				862
				863	pnfs_generic_pg_init_read(pgio, req);
				864
				865	if (pgio->pg_lseg &&
				866	test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) {
				867	pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg);
				868	pnfs_set_lo_fail(pgio->pg_lseg);
				869	nfs_pageio_reset_read_mds(pgio);
				870	}
				871	}
				872
				873	/*
				874	* Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
				875	* of bytes (maximum @req->wb_bytes) that can be coalesced.
				876	*/
				877	static size_t
				878	bl_pg_test_read(struct nfs_pageio_descriptor pgio, struct nfs_page prev,
				879	struct nfs_page *req)
				880	{
				881	if (!is_aligned_req(pgio, req, SECTOR_SIZE, false))
				882	return 0;
				883	return pnfs_generic_pg_test(pgio, prev, req);
				884	}
				885
				886	/*
				887	* Return the number of contiguous bytes for a given inode
				888	* starting at page frame idx.
				889	*/
				890	static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
				891	{
				892	struct address_space *mapping = inode->i_mapping;
				893	pgoff_t end;
				894
				895	/* Optimize common case that writes from 0 to end of file */
				896	end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
				897	if (end != inode->i_mapping->nrpages) {
				898	rcu_read_lock();
				899	end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
				900	rcu_read_unlock();
				901	}
				902
				903	if (!end)
				904	return i_size_read(inode) - (idx << PAGE_SHIFT);
				905	else
				906	return (end - idx) << PAGE_SHIFT;
				907	}
				908
				909	static void
				910	bl_pg_init_write(struct nfs_pageio_descriptor pgio, struct nfs_page req)
				911	{
				912	u64 wb_size;
				913
				914	if (!is_aligned_req(pgio, req, PAGE_SIZE, true)) {
				915	nfs_pageio_reset_write_mds(pgio);
				916	return;
				917	}
				918
				919	if (pgio->pg_dreq == NULL)
				920	wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
				921	req->wb_index);
				922	else
				923	wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
				924
				925	pnfs_generic_pg_init_write(pgio, req, wb_size);
				926
				927	if (pgio->pg_lseg &&
				928	test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) {
				929
				930	pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg);
				931	pnfs_set_lo_fail(pgio->pg_lseg);
				932	nfs_pageio_reset_write_mds(pgio);
				933	}
				934	}
				935
				936	/*
				937	* Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
				938	* of bytes (maximum @req->wb_bytes) that can be coalesced.
				939	*/
				940	static size_t
				941	bl_pg_test_write(struct nfs_pageio_descriptor pgio, struct nfs_page prev,
				942	struct nfs_page *req)
				943	{
				944	if (!is_aligned_req(pgio, req, PAGE_SIZE, true))
				945	return 0;
				946	return pnfs_generic_pg_test(pgio, prev, req);
				947	}
				948
				949	static const struct nfs_pageio_ops bl_pg_read_ops = {
				950	.pg_init = bl_pg_init_read,
				951	.pg_test = bl_pg_test_read,
				952	.pg_doio = pnfs_generic_pg_readpages,
				953	.pg_cleanup = pnfs_generic_pg_cleanup,
				954	};
				955
				956	static const struct nfs_pageio_ops bl_pg_write_ops = {
				957	.pg_init = bl_pg_init_write,
				958	.pg_test = bl_pg_test_write,
				959	.pg_doio = pnfs_generic_pg_writepages,
				960	.pg_cleanup = pnfs_generic_pg_cleanup,
				961	};
				962
				963	static struct pnfs_layoutdriver_type blocklayout_type = {
				964	.id = LAYOUT_BLOCK_VOLUME,
				965	.name = "LAYOUT_BLOCK_VOLUME",
				966	.owner = THIS_MODULE,
				967	.flags = PNFS_LAYOUTRET_ON_SETATTR \|
				968	PNFS_LAYOUTRET_ON_ERROR \|
				969	PNFS_READ_WHOLE_PAGE,
				970	.read_pagelist = bl_read_pagelist,
				971	.write_pagelist = bl_write_pagelist,
				972	.alloc_layout_hdr = bl_alloc_layout_hdr,
				973	.free_layout_hdr = bl_free_layout_hdr,
				974	.alloc_lseg = bl_alloc_lseg,
				975	.free_lseg = bl_free_lseg,
				976	.return_range = bl_return_range,
				977	.prepare_layoutcommit = bl_prepare_layoutcommit,
				978	.cleanup_layoutcommit = bl_cleanup_layoutcommit,
				979	.set_layoutdriver = bl_set_layoutdriver,
				980	.alloc_deviceid_node = bl_alloc_deviceid_node,
				981	.free_deviceid_node = bl_free_deviceid_node,
				982	.pg_read_ops = &bl_pg_read_ops,
				983	.pg_write_ops = &bl_pg_write_ops,
				984	.sync = pnfs_generic_sync,
				985	};
				986
				987	static struct pnfs_layoutdriver_type scsilayout_type = {
				988	.id = LAYOUT_SCSI,
				989	.name = "LAYOUT_SCSI",
				990	.owner = THIS_MODULE,
				991	.flags = PNFS_LAYOUTRET_ON_SETATTR \|
				992	PNFS_LAYOUTRET_ON_ERROR \|
				993	PNFS_READ_WHOLE_PAGE,
				994	.read_pagelist = bl_read_pagelist,
				995	.write_pagelist = bl_write_pagelist,
				996	.alloc_layout_hdr = sl_alloc_layout_hdr,
				997	.free_layout_hdr = bl_free_layout_hdr,
				998	.alloc_lseg = bl_alloc_lseg,
				999	.free_lseg = bl_free_lseg,
				1000	.return_range = bl_return_range,
				1001	.prepare_layoutcommit = bl_prepare_layoutcommit,
				1002	.cleanup_layoutcommit = bl_cleanup_layoutcommit,
				1003	.set_layoutdriver = bl_set_layoutdriver,
				1004	.alloc_deviceid_node = bl_alloc_deviceid_node,
				1005	.free_deviceid_node = bl_free_deviceid_node,
				1006	.pg_read_ops = &bl_pg_read_ops,
				1007	.pg_write_ops = &bl_pg_write_ops,
				1008	.sync = pnfs_generic_sync,
				1009	};
				1010
				1011
				1012	static int __init nfs4blocklayout_init(void)
				1013	{
				1014	int ret;
				1015
				1016	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
				1017
				1018	ret = bl_init_pipefs();
				1019	if (ret)
				1020	goto out;
				1021
				1022	ret = pnfs_register_layoutdriver(&blocklayout_type);
				1023	if (ret)
				1024	goto out_cleanup_pipe;
				1025
				1026	ret = pnfs_register_layoutdriver(&scsilayout_type);
				1027	if (ret)
				1028	goto out_unregister_block;
				1029	return 0;
				1030
				1031	out_unregister_block:
				1032	pnfs_unregister_layoutdriver(&blocklayout_type);
				1033	out_cleanup_pipe:
				1034	bl_cleanup_pipefs();
				1035	out:
				1036	return ret;
				1037	}
				1038
				1039	static void __exit nfs4blocklayout_exit(void)
				1040	{
				1041	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
				1042	__func__);
				1043
				1044	pnfs_unregister_layoutdriver(&scsilayout_type);
				1045	pnfs_unregister_layoutdriver(&blocklayout_type);
				1046	bl_cleanup_pipefs();
				1047	}
				1048
				1049	MODULE_ALIAS("nfs-layouttype4-3");
				1050	MODULE_ALIAS("nfs-layouttype4-5");
				1051
				1052	module_init(nfs4blocklayout_init);
				1053	module_exit(nfs4blocklayout_exit);