Blame - src/kernel/linux/v4.14/fs/exofs/inode.c - T103

blob: f17715d140b5bd5aaf48a6b716facdc16a2f41b7 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2005, 2006
				3	* Avishay Traeger (avishay@gmail.com)
				4	* Copyright (C) 2008, 2009
				5	* Boaz Harrosh <ooo@electrozaur.com>
				6	*
				7	* Copyrights for code taken from ext2:
				8	* Copyright (C) 1992, 1993, 1994, 1995
				9	* Remy Card (card@masi.ibp.fr)
				10	* Laboratoire MASI - Institut Blaise Pascal
				11	* Universite Pierre et Marie Curie (Paris VI)
				12	* from
				13	* linux/fs/minix/inode.c
				14	* Copyright (C) 1991, 1992 Linus Torvalds
				15	*
				16	* This file is part of exofs.
				17	*
				18	* exofs is free software; you can redistribute it and/or modify
				19	* it under the terms of the GNU General Public License as published by
				20	* the Free Software Foundation. Since it is based on ext2, and the only
				21	* valid version of GPL for the Linux kernel is version 2, the only valid
				22	* version of GPL for exofs is version 2.
				23	*
				24	* exofs is distributed in the hope that it will be useful,
				25	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				26	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				27	* GNU General Public License for more details.
				28	*
				29	* You should have received a copy of the GNU General Public License
				30	* along with exofs; if not, write to the Free Software
				31	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
				32	*/
				33
				34	#include <linux/slab.h>
				35
				36	#include "exofs.h"
				37
				38	#define EXOFS_DBGMSG2(M...) do {} while (0)
				39
				40	unsigned exofs_max_io_pages(struct ore_layout *layout,
				41	unsigned expected_pages)
				42	{
				43	unsigned pages = min_t(unsigned, expected_pages,
				44	layout->max_io_length / PAGE_SIZE);
				45
				46	return pages;
				47	}
				48
				49	struct page_collect {
				50	struct exofs_sb_info *sbi;
				51	struct inode *inode;
				52	unsigned expected_pages;
				53	struct ore_io_state *ios;
				54
				55	struct page **pages;
				56	unsigned alloc_pages;
				57	unsigned nr_pages;
				58	unsigned long length;
				59	loff_t pg_first; /* keep 64bit also in 32-arches */
				60	bool read_4_write; /* This means two things: that the read is sync
				61	* And the pages should not be unlocked.
				62	*/
				63	struct page *that_locked_page;
				64	};
				65
				66	static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
				67	struct inode *inode)
				68	{
				69	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
				70
				71	pcol->sbi = sbi;
				72	pcol->inode = inode;
				73	pcol->expected_pages = expected_pages;
				74
				75	pcol->ios = NULL;
				76	pcol->pages = NULL;
				77	pcol->alloc_pages = 0;
				78	pcol->nr_pages = 0;
				79	pcol->length = 0;
				80	pcol->pg_first = -1;
				81	pcol->read_4_write = false;
				82	pcol->that_locked_page = NULL;
				83	}
				84
				85	static void _pcol_reset(struct page_collect *pcol)
				86	{
				87	pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
				88
				89	pcol->pages = NULL;
				90	pcol->alloc_pages = 0;
				91	pcol->nr_pages = 0;
				92	pcol->length = 0;
				93	pcol->pg_first = -1;
				94	pcol->ios = NULL;
				95	pcol->that_locked_page = NULL;
				96
				97	/* this is probably the end of the loop but in writes
				98	* it might not end here. don't be left with nothing
				99	*/
				100	if (!pcol->expected_pages)
				101	pcol->expected_pages =
				102	exofs_max_io_pages(&pcol->sbi->layout, ~0);
				103	}
				104
				105	static int pcol_try_alloc(struct page_collect *pcol)
				106	{
				107	unsigned pages;
				108
				109	/* TODO: easily support bio chaining */
				110	pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
				111
				112	for (; pages; pages >>= 1) {
				113	pcol->pages = kmalloc(pages * sizeof(struct page *),
				114	GFP_KERNEL);
				115	if (likely(pcol->pages)) {
				116	pcol->alloc_pages = pages;
				117	return 0;
				118	}
				119	}
				120
				121	EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
				122	pcol->expected_pages);
				123	return -ENOMEM;
				124	}
				125
				126	static void pcol_free(struct page_collect *pcol)
				127	{
				128	kfree(pcol->pages);
				129	pcol->pages = NULL;
				130
				131	if (pcol->ios) {
				132	ore_put_io_state(pcol->ios);
				133	pcol->ios = NULL;
				134	}
				135	}
				136
				137	static int pcol_add_page(struct page_collect pcol, struct page page,
				138	unsigned len)
				139	{
				140	if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
				141	return -ENOMEM;
				142
				143	pcol->pages[pcol->nr_pages++] = page;
				144	pcol->length += len;
				145	return 0;
				146	}
				147
				148	enum {PAGE_WAS_NOT_IN_IO = 17};
				149	static int update_read_page(struct page *page, int ret)
				150	{
				151	switch (ret) {
				152	case 0:
				153	/* Everything is OK */
				154	SetPageUptodate(page);
				155	if (PageError(page))
				156	ClearPageError(page);
				157	break;
				158	case -EFAULT:
				159	/* In this case we were trying to read something that wasn't on
				160	* disk yet - return a page full of zeroes. This should be OK,
				161	* because the object should be empty (if there was a write
				162	* before this read, the read would be waiting with the page
				163	* locked */
				164	clear_highpage(page);
				165
				166	SetPageUptodate(page);
				167	if (PageError(page))
				168	ClearPageError(page);
				169	EXOFS_DBGMSG("recovered read error\n");
				170	/* fall through */
				171	case PAGE_WAS_NOT_IN_IO:
				172	ret = 0; /* recovered error */
				173	break;
				174	default:
				175	SetPageError(page);
				176	}
				177	return ret;
				178	}
				179
				180	static void update_write_page(struct page *page, int ret)
				181	{
				182	if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
				183	return; /* don't pass start don't collect $200 */
				184
				185	if (ret) {
				186	mapping_set_error(page->mapping, ret);
				187	SetPageError(page);
				188	}
				189	end_page_writeback(page);
				190	}
				191
				192	/* Called at the end of reads, to optionally unlock pages and update their
				193	* status.
				194	*/
				195	static int __readpages_done(struct page_collect *pcol)
				196	{
				197	int i;
				198	u64 good_bytes;
				199	u64 length = 0;
				200	int ret = ore_check_io(pcol->ios, NULL);
				201
				202	if (likely(!ret)) {
				203	good_bytes = pcol->length;
				204	ret = PAGE_WAS_NOT_IN_IO;
				205	} else {
				206	good_bytes = 0;
				207	}
				208
				209	EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
				210	" length=0x%lx nr_pages=%u\n",
				211	pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
				212	pcol->nr_pages);
				213
				214	for (i = 0; i < pcol->nr_pages; i++) {
				215	struct page *page = pcol->pages[i];
				216	struct inode *inode = page->mapping->host;
				217	int page_stat;
				218
				219	if (inode != pcol->inode)
				220	continue; /* osd might add more pages at end */
				221
				222	if (likely(length < good_bytes))
				223	page_stat = 0;
				224	else
				225	page_stat = ret;
				226
				227	EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n",
				228	inode->i_ino, page->index,
				229	page_stat ? "bad_bytes" : "good_bytes");
				230
				231	ret = update_read_page(page, page_stat);
				232	if (!pcol->read_4_write)
				233	unlock_page(page);
				234	length += PAGE_SIZE;
				235	}
				236
				237	pcol_free(pcol);
				238	EXOFS_DBGMSG2("readpages_done END\n");
				239	return ret;
				240	}
				241
				242	/* callback of async reads */
				243	static void readpages_done(struct ore_io_state ios, void p)
				244	{
				245	struct page_collect *pcol = p;
				246
				247	__readpages_done(pcol);
				248	atomic_dec(&pcol->sbi->s_curr_pending);
				249	kfree(pcol);
				250	}
				251
				252	static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
				253	{
				254	int i;
				255
				256	for (i = 0; i < pcol->nr_pages; i++) {
				257	struct page *page = pcol->pages[i];
				258
				259	if (rw == READ)
				260	update_read_page(page, ret);
				261	else
				262	update_write_page(page, ret);
				263
				264	unlock_page(page);
				265	}
				266	}
				267
				268	static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
				269	struct page_collect pcol_src, struct page_collect pcol)
				270	{
				271	/* length was wrong or offset was not page aligned */
				272	BUG_ON(pcol_src->nr_pages < ios->nr_pages);
				273
				274	if (pcol_src->nr_pages > ios->nr_pages) {
				275	struct page **src_page;
				276	unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
				277	unsigned long len_less = pcol_src->length - ios->length;
				278	unsigned i;
				279	int ret;
				280
				281	/* This IO was trimmed */
				282	pcol_src->nr_pages = ios->nr_pages;
				283	pcol_src->length = ios->length;
				284
				285	/* Left over pages are passed to the next io */
				286	pcol->expected_pages += pages_less;
				287	pcol->nr_pages = pages_less;
				288	pcol->length = len_less;
				289	src_page = pcol_src->pages + pcol_src->nr_pages;
				290	pcol->pg_first = (*src_page)->index;
				291
				292	ret = pcol_try_alloc(pcol);
				293	if (unlikely(ret))
				294	return ret;
				295
				296	for (i = 0; i < pages_less; ++i)
				297	pcol->pages[i] = *src_page++;
				298
				299	EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
				300	"pages_less=0x%x expected_pages=0x%x "
				301	"next_offset=0x%llx next_len=0x%lx\n",
				302	pcol_src->nr_pages, pages_less, pcol->expected_pages,
				303	pcol->pg_first * PAGE_SIZE, pcol->length);
				304	}
				305	return 0;
				306	}
				307
				308	static int read_exec(struct page_collect *pcol)
				309	{
				310	struct exofs_i_info *oi = exofs_i(pcol->inode);
				311	struct ore_io_state *ios;
				312	struct page_collect *pcol_copy = NULL;
				313	int ret;
				314
				315	if (!pcol->pages)
				316	return 0;
				317
				318	if (!pcol->ios) {
				319	int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
				320	pcol->pg_first << PAGE_SHIFT,
				321	pcol->length, &pcol->ios);
				322
				323	if (ret)
				324	return ret;
				325	}
				326
				327	ios = pcol->ios;
				328	ios->pages = pcol->pages;
				329
				330	if (pcol->read_4_write) {
				331	ore_read(pcol->ios);
				332	return __readpages_done(pcol);
				333	}
				334
				335	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
				336	if (!pcol_copy) {
				337	ret = -ENOMEM;
				338	goto err;
				339	}
				340
				341	pcol_copy = pcol;
				342	ios->done = readpages_done;
				343	ios->private = pcol_copy;
				344
				345	/* pages ownership was passed to pcol_copy */
				346	_pcol_reset(pcol);
				347
				348	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
				349	if (unlikely(ret))
				350	goto err;
				351
				352	EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
				353	pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
				354
				355	ret = ore_read(ios);
				356	if (unlikely(ret))
				357	goto err;
				358
				359	atomic_inc(&pcol->sbi->s_curr_pending);
				360
				361	return 0;
				362
				363	err:
				364	if (!pcol_copy) /* Failed before ownership transfer */
				365	pcol_copy = pcol;
				366	_unlock_pcol_pages(pcol_copy, ret, READ);
				367	pcol_free(pcol_copy);
				368	kfree(pcol_copy);
				369
				370	return ret;
				371	}
				372
				373	/* readpage_strip is called either directly from readpage() or by the VFS from
				374	* within read_cache_pages(), to add one more page to be read. It will try to
				375	* collect as many contiguous pages as posible. If a discontinuity is
				376	* encountered, or it runs out of resources, it will submit the previous segment
				377	* and will start a new collection. Eventually caller must submit the last
				378	* segment if present.
				379	*/
				380	static int __readpage_strip(struct page_collect pcol, struct page page)
				381	{
				382	struct inode *inode = pcol->inode;
				383	struct exofs_i_info *oi = exofs_i(inode);
				384	loff_t i_size = i_size_read(inode);
				385	pgoff_t end_index = i_size >> PAGE_SHIFT;
				386	size_t len;
				387	int ret;
				388
				389	BUG_ON(!PageLocked(page));
				390
				391	/* FIXME: Just for debugging, will be removed */
				392	if (PageUptodate(page))
				393	EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
				394	page->index);
				395
				396	pcol->that_locked_page = page;
				397
				398	if (page->index < end_index)
				399	len = PAGE_SIZE;
				400	else if (page->index == end_index)
				401	len = i_size & ~PAGE_MASK;
				402	else
				403	len = 0;
				404
				405	if (!len \|\| !obj_created(oi)) {
				406	/* this will be out of bounds, or doesn't exist yet.
				407	* Current page is cleared and the request is split
				408	*/
				409	clear_highpage(page);
				410
				411	SetPageUptodate(page);
				412	if (PageError(page))
				413	ClearPageError(page);
				414
				415	if (!pcol->read_4_write)
				416	unlock_page(page);
				417	EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx "
				418	"read_4_write=%d index=0x%lx end_index=0x%lx "
				419	"splitting\n", inode->i_ino, len,
				420	pcol->read_4_write, page->index, end_index);
				421
				422	return read_exec(pcol);
				423	}
				424
				425	try_again:
				426
				427	if (unlikely(pcol->pg_first == -1)) {
				428	pcol->pg_first = page->index;
				429	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
				430	page->index)) {
				431	/* Discontinuity detected, split the request */
				432	ret = read_exec(pcol);
				433	if (unlikely(ret))
				434	goto fail;
				435	goto try_again;
				436	}
				437
				438	if (!pcol->pages) {
				439	ret = pcol_try_alloc(pcol);
				440	if (unlikely(ret))
				441	goto fail;
				442	}
				443
				444	if (len != PAGE_SIZE)
				445	zero_user(page, len, PAGE_SIZE - len);
				446
				447	EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
				448	inode->i_ino, page->index, len);
				449
				450	ret = pcol_add_page(pcol, page, len);
				451	if (ret) {
				452	EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
				453	"this_len=0x%zx nr_pages=%u length=0x%lx\n",
				454	page, len, pcol->nr_pages, pcol->length);
				455
				456	/* split the request, and start again with current page */
				457	ret = read_exec(pcol);
				458	if (unlikely(ret))
				459	goto fail;
				460
				461	goto try_again;
				462	}
				463
				464	return 0;
				465
				466	fail:
				467	/* SetPageError(page); ??? */
				468	unlock_page(page);
				469	return ret;
				470	}
				471
				472	static int readpage_strip(struct file data, struct page page)
				473	{
				474	struct page_collect pcol = (struct page_collect )data;
				475
				476	return __readpage_strip(pcol, page);
				477	}
				478
				479	static int exofs_readpages(struct file file, struct address_space mapping,
				480	struct list_head *pages, unsigned nr_pages)
				481	{
				482	struct page_collect pcol;
				483	int ret;
				484
				485	_pcol_init(&pcol, nr_pages, mapping->host);
				486
				487	ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
				488	if (ret) {
				489	EXOFS_ERR("read_cache_pages => %d\n", ret);
				490	return ret;
				491	}
				492
				493	ret = read_exec(&pcol);
				494	if (unlikely(ret))
				495	return ret;
				496
				497	return read_exec(&pcol);
				498	}
				499
				500	static int _readpage(struct page *page, bool read_4_write)
				501	{
				502	struct page_collect pcol;
				503	int ret;
				504
				505	_pcol_init(&pcol, 1, page->mapping->host);
				506
				507	pcol.read_4_write = read_4_write;
				508	ret = __readpage_strip(&pcol, page);
				509	if (ret) {
				510	EXOFS_ERR("_readpage => %d\n", ret);
				511	return ret;
				512	}
				513
				514	return read_exec(&pcol);
				515	}
				516
				517	/*
				518	* We don't need the file
				519	*/
				520	static int exofs_readpage(struct file file, struct page page)
				521	{
				522	return _readpage(page, false);
				523	}
				524
				525	/* Callback for osd_write. All writes are asynchronous */
				526	static void writepages_done(struct ore_io_state ios, void p)
				527	{
				528	struct page_collect *pcol = p;
				529	int i;
				530	u64 good_bytes;
				531	u64 length = 0;
				532	int ret = ore_check_io(ios, NULL);
				533
				534	atomic_dec(&pcol->sbi->s_curr_pending);
				535
				536	if (likely(!ret)) {
				537	good_bytes = pcol->length;
				538	ret = PAGE_WAS_NOT_IN_IO;
				539	} else {
				540	good_bytes = 0;
				541	}
				542
				543	EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
				544	" length=0x%lx nr_pages=%u\n",
				545	pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
				546	pcol->nr_pages);
				547
				548	for (i = 0; i < pcol->nr_pages; i++) {
				549	struct page *page = pcol->pages[i];
				550	struct inode *inode = page->mapping->host;
				551	int page_stat;
				552
				553	if (inode != pcol->inode)
				554	continue; /* osd might add more pages to a bio */
				555
				556	if (likely(length < good_bytes))
				557	page_stat = 0;
				558	else
				559	page_stat = ret;
				560
				561	update_write_page(page, page_stat);
				562	unlock_page(page);
				563	EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n",
				564	inode->i_ino, page->index, page_stat);
				565
				566	length += PAGE_SIZE;
				567	}
				568
				569	pcol_free(pcol);
				570	kfree(pcol);
				571	EXOFS_DBGMSG2("writepages_done END\n");
				572	}
				573
				574	static struct page __r4w_get_page(void priv, u64 offset, bool *uptodate)
				575	{
				576	struct page_collect *pcol = priv;
				577	pgoff_t index = offset / PAGE_SIZE;
				578
				579	if (!pcol->that_locked_page \|\|
				580	(pcol->that_locked_page->index != index)) {
				581	struct page *page;
				582	loff_t i_size = i_size_read(pcol->inode);
				583
				584	if (offset >= i_size) {
				585	*uptodate = true;
				586	EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index);
				587	return ZERO_PAGE(0);
				588	}
				589
				590	page = find_get_page(pcol->inode->i_mapping, index);
				591	if (!page) {
				592	page = find_or_create_page(pcol->inode->i_mapping,
				593	index, GFP_NOFS);
				594	if (unlikely(!page)) {
				595	EXOFS_DBGMSG("grab_cache_page Failed "
				596	"index=0x%llx\n", _LLU(index));
				597	return NULL;
				598	}
				599	unlock_page(page);
				600	}
				601	*uptodate = PageUptodate(page);
				602	EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
				603	return page;
				604	} else {
				605	EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n",
				606	pcol->that_locked_page->index);
				607	*uptodate = true;
				608	return pcol->that_locked_page;
				609	}
				610	}
				611
				612	static void __r4w_put_page(void priv, struct page page)
				613	{
				614	struct page_collect *pcol = priv;
				615
				616	if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
				617	EXOFS_DBGMSG2("index=0x%lx\n", page->index);
				618	put_page(page);
				619	return;
				620	}
				621	EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
				622	ZERO_PAGE(0) == page ? -1 : page->index);
				623	}
				624
				625	static const struct _ore_r4w_op _r4w_op = {
				626	.get_page = &__r4w_get_page,
				627	.put_page = &__r4w_put_page,
				628	};
				629
				630	static int write_exec(struct page_collect *pcol)
				631	{
				632	struct exofs_i_info *oi = exofs_i(pcol->inode);
				633	struct ore_io_state *ios;
				634	struct page_collect *pcol_copy = NULL;
				635	int ret;
				636
				637	if (!pcol->pages)
				638	return 0;
				639
				640	BUG_ON(pcol->ios);
				641	ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
				642	pcol->pg_first << PAGE_SHIFT,
				643	pcol->length, &pcol->ios);
				644	if (unlikely(ret))
				645	goto err;
				646
				647	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
				648	if (!pcol_copy) {
				649	EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
				650	ret = -ENOMEM;
				651	goto err;
				652	}
				653
				654	pcol_copy = pcol;
				655
				656	ios = pcol->ios;
				657	ios->pages = pcol_copy->pages;
				658	ios->done = writepages_done;
				659	ios->r4w = &_r4w_op;
				660	ios->private = pcol_copy;
				661
				662	/* pages ownership was passed to pcol_copy */
				663	_pcol_reset(pcol);
				664
				665	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
				666	if (unlikely(ret))
				667	goto err;
				668
				669	EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
				670	pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
				671
				672	ret = ore_write(ios);
				673	if (unlikely(ret)) {
				674	EXOFS_ERR("write_exec: ore_write() Failed\n");
				675	goto err;
				676	}
				677
				678	atomic_inc(&pcol->sbi->s_curr_pending);
				679	return 0;
				680
				681	err:
				682	if (!pcol_copy) /* Failed before ownership transfer */
				683	pcol_copy = pcol;
				684	_unlock_pcol_pages(pcol_copy, ret, WRITE);
				685	pcol_free(pcol_copy);
				686	kfree(pcol_copy);
				687
				688	return ret;
				689	}
				690
				691	/* writepage_strip is called either directly from writepage() or by the VFS from
				692	* within write_cache_pages(), to add one more page to be written to storage.
				693	* It will try to collect as many contiguous pages as possible. If a
				694	* discontinuity is encountered or it runs out of resources it will submit the
				695	* previous segment and will start a new collection.
				696	* Eventually caller must submit the last segment if present.
				697	*/
				698	static int writepage_strip(struct page *page,
				699	struct writeback_control wbc_unused, void data)
				700	{
				701	struct page_collect *pcol = data;
				702	struct inode *inode = pcol->inode;
				703	struct exofs_i_info *oi = exofs_i(inode);
				704	loff_t i_size = i_size_read(inode);
				705	pgoff_t end_index = i_size >> PAGE_SHIFT;
				706	size_t len;
				707	int ret;
				708
				709	BUG_ON(!PageLocked(page));
				710
				711	ret = wait_obj_created(oi);
				712	if (unlikely(ret))
				713	goto fail;
				714
				715	if (page->index < end_index)
				716	/* in this case, the page is within the limits of the file */
				717	len = PAGE_SIZE;
				718	else {
				719	len = i_size & ~PAGE_MASK;
				720
				721	if (page->index > end_index \|\| !len) {
				722	/* in this case, the page is outside the limits
				723	* (truncate in progress)
				724	*/
				725	ret = write_exec(pcol);
				726	if (unlikely(ret))
				727	goto fail;
				728	if (PageError(page))
				729	ClearPageError(page);
				730	unlock_page(page);
				731	EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
				732	"outside the limits\n",
				733	inode->i_ino, page->index);
				734	return 0;
				735	}
				736	}
				737
				738	try_again:
				739
				740	if (unlikely(pcol->pg_first == -1)) {
				741	pcol->pg_first = page->index;
				742	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
				743	page->index)) {
				744	/* Discontinuity detected, split the request */
				745	ret = write_exec(pcol);
				746	if (unlikely(ret))
				747	goto fail;
				748
				749	EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
				750	inode->i_ino, page->index);
				751	goto try_again;
				752	}
				753
				754	if (!pcol->pages) {
				755	ret = pcol_try_alloc(pcol);
				756	if (unlikely(ret))
				757	goto fail;
				758	}
				759
				760	EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
				761	inode->i_ino, page->index, len);
				762
				763	ret = pcol_add_page(pcol, page, len);
				764	if (unlikely(ret)) {
				765	EXOFS_DBGMSG2("Failed pcol_add_page "
				766	"nr_pages=%u total_length=0x%lx\n",
				767	pcol->nr_pages, pcol->length);
				768
				769	/* split the request, next loop will start again */
				770	ret = write_exec(pcol);
				771	if (unlikely(ret)) {
				772	EXOFS_DBGMSG("write_exec failed => %d", ret);
				773	goto fail;
				774	}
				775
				776	goto try_again;
				777	}
				778
				779	BUG_ON(PageWriteback(page));
				780	set_page_writeback(page);
				781
				782	return 0;
				783
				784	fail:
				785	EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
				786	inode->i_ino, page->index, ret);
				787	mapping_set_error(page->mapping, -EIO);
				788	unlock_page(page);
				789	return ret;
				790	}
				791
				792	static int exofs_writepages(struct address_space *mapping,
				793	struct writeback_control *wbc)
				794	{
				795	struct page_collect pcol;
				796	long start, end, expected_pages;
				797	int ret;
				798
				799	start = wbc->range_start >> PAGE_SHIFT;
				800	end = (wbc->range_end == LLONG_MAX) ?
				801	start + mapping->nrpages :
				802	wbc->range_end >> PAGE_SHIFT;
				803
				804	if (start \|\| end)
				805	expected_pages = end - start + 1;
				806	else
				807	expected_pages = mapping->nrpages;
				808
				809	if (expected_pages < 32L)
				810	expected_pages = 32L;
				811
				812	EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
				813	"nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
				814	mapping->host->i_ino, wbc->range_start, wbc->range_end,
				815	mapping->nrpages, start, end, expected_pages);
				816
				817	_pcol_init(&pcol, expected_pages, mapping->host);
				818
				819	ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
				820	if (unlikely(ret)) {
				821	EXOFS_ERR("write_cache_pages => %d\n", ret);
				822	return ret;
				823	}
				824
				825	ret = write_exec(&pcol);
				826	if (unlikely(ret))
				827	return ret;
				828
				829	if (wbc->sync_mode == WB_SYNC_ALL) {
				830	return write_exec(&pcol); /* pump the last reminder */
				831	} else if (pcol.nr_pages) {
				832	/* not SYNC let the reminder join the next writeout */
				833	unsigned i;
				834
				835	for (i = 0; i < pcol.nr_pages; i++) {
				836	struct page *page = pcol.pages[i];
				837
				838	end_page_writeback(page);
				839	set_page_dirty(page);
				840	unlock_page(page);
				841	}
				842	}
				843	return 0;
				844	}
				845
				846	/*
				847	static int exofs_writepage(struct page page, struct writeback_control wbc)
				848	{
				849	struct page_collect pcol;
				850	int ret;
				851
				852	_pcol_init(&pcol, 1, page->mapping->host);
				853
				854	ret = writepage_strip(page, NULL, &pcol);
				855	if (ret) {
				856	EXOFS_ERR("exofs_writepage => %d\n", ret);
				857	return ret;
				858	}
				859
				860	return write_exec(&pcol);
				861	}
				862	*/
				863	/* i_mutex held using inode->i_size directly */
				864	static void _write_failed(struct inode *inode, loff_t to)
				865	{
				866	if (to > inode->i_size)
				867	truncate_pagecache(inode, inode->i_size);
				868	}
				869
				870	int exofs_write_begin(struct file file, struct address_space mapping,
				871	loff_t pos, unsigned len, unsigned flags,
				872	struct page pagep, void fsdata)
				873	{
				874	int ret = 0;
				875	struct page *page;
				876
				877	page = *pagep;
				878	if (page == NULL) {
				879	page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT,
				880	flags);
				881	if (!page) {
				882	EXOFS_DBGMSG("grab_cache_page_write_begin failed\n");
				883	return -ENOMEM;
				884	}
				885	*pagep = page;
				886	}
				887
				888	/* read modify write */
				889	if (!PageUptodate(page) && (len != PAGE_SIZE)) {
				890	loff_t i_size = i_size_read(mapping->host);
				891	pgoff_t end_index = i_size >> PAGE_SHIFT;
				892
				893	if (page->index > end_index) {
				894	clear_highpage(page);
				895	SetPageUptodate(page);
				896	} else {
				897	ret = _readpage(page, true);
				898	if (ret) {
				899	unlock_page(page);
				900	EXOFS_DBGMSG("__readpage failed\n");
				901	}
				902	}
				903	}
				904	return ret;
				905	}
				906
				907	static int exofs_write_begin_export(struct file *file,
				908	struct address_space *mapping,
				909	loff_t pos, unsigned len, unsigned flags,
				910	struct page pagep, void fsdata)
				911	{
				912	*pagep = NULL;
				913
				914	return exofs_write_begin(file, mapping, pos, len, flags, pagep,
				915	fsdata);
				916	}
				917
				918	static int exofs_write_end(struct file file, struct address_space mapping,
				919	loff_t pos, unsigned len, unsigned copied,
				920	struct page page, void fsdata)
				921	{
				922	struct inode *inode = mapping->host;
				923	loff_t last_pos = pos + copied;
				924
				925	if (!PageUptodate(page)) {
				926	if (copied < len) {
				927	_write_failed(inode, pos + len);
				928	copied = 0;
				929	goto out;
				930	}
				931	SetPageUptodate(page);
				932	}
				933	if (last_pos > inode->i_size) {
				934	i_size_write(inode, last_pos);
				935	mark_inode_dirty(inode);
				936	}
				937	set_page_dirty(page);
				938	out:
				939	unlock_page(page);
				940	put_page(page);
				941	return copied;
				942	}
				943
				944	static int exofs_releasepage(struct page *page, gfp_t gfp)
				945	{
				946	EXOFS_DBGMSG("page 0x%lx\n", page->index);
				947	WARN_ON(1);
				948	return 0;
				949	}
				950
				951	static void exofs_invalidatepage(struct page *page, unsigned int offset,
				952	unsigned int length)
				953	{
				954	EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
				955	page->index, offset, length);
				956	WARN_ON(1);
				957	}
				958
				959
				960	/* TODO: Should be easy enough to do proprly */
				961	static ssize_t exofs_direct_IO(struct kiocb iocb, struct iov_iter iter)
				962	{
				963	return 0;
				964	}
				965
				966	const struct address_space_operations exofs_aops = {
				967	.readpage = exofs_readpage,
				968	.readpages = exofs_readpages,
				969	.writepage = NULL,
				970	.writepages = exofs_writepages,
				971	.write_begin = exofs_write_begin_export,
				972	.write_end = exofs_write_end,
				973	.releasepage = exofs_releasepage,
				974	.set_page_dirty = __set_page_dirty_nobuffers,
				975	.invalidatepage = exofs_invalidatepage,
				976
				977	/* Not implemented Yet */
				978	.bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
				979	.direct_IO = exofs_direct_IO,
				980
				981	/* With these NULL has special meaning or default is not exported */
				982	.migratepage = NULL,
				983	.launder_page = NULL,
				984	.is_partially_uptodate = NULL,
				985	.error_remove_page = NULL,
				986	};
				987
				988	/******************************************************************************
				989	* INODE OPERATIONS
				990	*****************************************************************************/
				991
				992	/*
				993	* Test whether an inode is a fast symlink.
				994	*/
				995	static inline int exofs_inode_is_fast_symlink(struct inode *inode)
				996	{
				997	struct exofs_i_info *oi = exofs_i(inode);
				998
				999	return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
				1000	}
				1001
				1002	static int _do_truncate(struct inode *inode, loff_t newsize)
				1003	{
				1004	struct exofs_i_info *oi = exofs_i(inode);
				1005	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
				1006	int ret;
				1007
				1008	inode->i_mtime = inode->i_ctime = current_time(inode);
				1009
				1010	ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
				1011	if (likely(!ret))
				1012	truncate_setsize(inode, newsize);
				1013
				1014	EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n",
				1015	inode->i_ino, newsize, ret);
				1016	return ret;
				1017	}
				1018
				1019	/*
				1020	* Set inode attributes - update size attribute on OSD if needed,
				1021	* otherwise just call generic functions.
					*
					* Steps: wait until the backing object exists, validate the request
					* with setattr_prepare(), truncate the object when ATTR_SIZE asks for a
					* size different from the current i_size, then copy the attributes
					* into the inode and mark it dirty.
					*
					* Returns 0 on success, or the first error reported by
					* wait_obj_created(), setattr_prepare() or _do_truncate(). On any error
					* the inode attributes are left uncopied.
				1022	*/
				1023	int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
				1024	{
				1025	struct inode *inode = d_inode(dentry);
				1026	int error;
				1027
				1028	/* if we are about to modify an object, and it hasn't been
				1029	* created yet, wait
				1030	*/
				1031	error = wait_obj_created(exofs_i(inode));
				1032	if (unlikely(error))
				1033	return error;
				1034
				1035	error = setattr_prepare(dentry, iattr);
				1036	if (unlikely(error))
				1037	return error;
				1038
					/* only touch the OSD when the size really changes */
				1039	if ((iattr->ia_valid & ATTR_SIZE) &&
				1040	iattr->ia_size != i_size_read(inode)) {
				1041	error = _do_truncate(inode, iattr->ia_size);
				1042	if (unlikely(error))
				1043	return error;
				1044	}
				1045
				1046	setattr_copy(inode, iattr);
				1047	mark_inode_dirty(inode);
				1048	return 0;
				1049	}
				1050
					/*
					* Attribute descriptors for the per-inode file and directory layout,
					* both living in the EXOFS_APAGE_FS_DATA attribute page. They are
					* declared with length 0 here; exofs_get_inode() copies them and fills
					* in the length before issuing the read.
					*/
				1051	static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
				1052	EXOFS_APAGE_FS_DATA,
				1053	EXOFS_ATTR_INODE_FILE_LAYOUT,
				1054	0);
				1055	static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
				1056	EXOFS_APAGE_FS_DATA,
				1057	EXOFS_ATTR_INODE_DIR_LAYOUT,
				1058	0);
				1059
				1060	/*
				1061	* Read the Linux inode info from the OSD, and return it as is. In exofs the
				1062	* inode info is in an application specific page/attribute of the osd-object.
					*
					* A single ore_read() requests three attributes: the inode data and the
					* file and directory layout attributes. The inode data is copied into
					* *inode unchanged (still in on-disk byte order). The two layout
					* attributes are only checked: when present, their gen_func must be
					* LAYOUT_MOVING_WINDOW.
					*
					* Returns 0 on success. Also returns 0 when ore_read() fails; in that
					* case *inode is zeroed and given a directory mode so that the caller
					* sees an empty object it is able to delete. Returns the error from
					* ore_get_io_state() or extract_attr_from_ios(), or -ENOTSUPP for an
					* unsupported layout.
				1063	*/
				1064	static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
				1065	struct exofs_fcb *inode)
				1066	{
				1067	struct exofs_sb_info *sbi = sb->s_fs_info;
				1068	struct osd_attr attrs[] = {
				1069	[0] = g_attr_inode_data,
				1070	[1] = g_attr_inode_file_layout,
				1071	[2] = g_attr_inode_dir_layout,
				1072	};
				1073	struct ore_io_state *ios;
				1074	struct exofs_on_disk_inode_layout *layout;
				1075	int ret;
				1076
				1077	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
				1078	if (unlikely(ret)) {
				1079	EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
				1080	return ret;
				1081	}
				1082
					/* the layout attributes are sized by the number of devices */
				1083	attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
				1084	attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
				1085
				1086	ios->in_attr = attrs;
				1087	ios->in_attr_len = ARRAY_SIZE(attrs);
				1088
				1089	ret = ore_read(ios);
				1090	if (unlikely(ret)) {
				1091	EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
				1092	_LLU(oi->one_comp.obj.id), ret);
				1093	memset(inode, 0, sizeof(*inode));
					/* fcb fields are little-endian on disk; the reader applies le16_to_cpu() */
				1094	inode->i_mode = cpu_to_le16(0040000 | (0777 & ~022));
				1095	/* If the object is lost on the target we might as well allow its
				1096	* deletion.
				1097	*/
				1098	ret = 0;
				1099	goto out;
				1100	}
				1101
				1102	ret = extract_attr_from_ios(ios, &attrs[0]);
				1103	if (ret) {
				1104	EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__);
				1105	goto out;
				1106	}
					/* NOTE(review): WARN_ON only reports a size mismatch; the copy below
					* still reads EXOFS_INO_ATTR_SIZE bytes - confirm that a shorter
					* attribute cannot be returned here.
					*/
				1107	WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
				1108	memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
				1109
				1110	ret = extract_attr_from_ios(ios, &attrs[1]);
				1111	if (ret) {
				1112	EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__);
				1113	goto out;
				1114	}
				1115	if (attrs[1].len) {
				1116	layout = attrs[1].val_ptr;
				1117	if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
				1118	EXOFS_ERR("%s: unsupported files layout %d\n",
				1119	__func__, le16_to_cpu(layout->gen_func));
				1120	ret = -ENOTSUPP;
				1121	goto out;
				1122	}
				1123	}
				1124
				1125	ret = extract_attr_from_ios(ios, &attrs[2]);
				1126	if (ret) {
				1127	EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__);
				1128	goto out;
				1129	}
				1130	if (attrs[2].len) {
				1131	layout = attrs[2].val_ptr;
				1132	if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
				1133	EXOFS_ERR("%s: unsupported meta-data layout %d\n",
				1134	__func__, le16_to_cpu(layout->gen_func));
				1135	ret = -ENOTSUPP;
				1136	goto out;
				1137	}
				1138	}
				1139
				1140	out:
				1141	ore_put_io_state(ios);
				1142	return ret;
				1143	}
				1144
					/*
					* Reset the exofs-private part of an inode: initialise the wait queue
					* that object-creation waiters sleep on and clear all i_flags.
					*/
				1145	static void __oi_init(struct exofs_i_info *oi)
				1146	{
				1147	init_waitqueue_head(&oi->i_wq);
				1148	oi->i_flags = 0;
				1149	}
				1150	/*
				1151	* Fill in an inode read from the OSD and set it up for use
					*
					* Looks the inode up with iget_locked(). An inode that is not I_NEW is
					* returned as is. A new one is initialised, its on-disk fcb is read
					* with exofs_get_inode(), the fields are converted from on-disk byte
					* order, and the operation tables are chosen from the file type.
					*
					* Returns the inode, ERR_PTR(-ENOMEM) when iget_locked() fails,
					* ERR_PTR(-ESTALE) when both link count and mode are zero, or
					* ERR_PTR() of the exofs_get_inode() error. The failure paths that
					* follow a successful iget_locked() release the inode with
					* iget_failed().
				1152	*/
				1153	struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
				1154	{
				1155	struct exofs_i_info *oi;
				1156	struct exofs_fcb fcb;
				1157	struct inode *inode;
				1158	int ret;
				1159
				1160	inode = iget_locked(sb, ino);
				1161	if (!inode)
				1162	return ERR_PTR(-ENOMEM);
				1163	if (!(inode->i_state & I_NEW))
				1164	return inode;
				1165	oi = exofs_i(inode);
				1166	__oi_init(oi);
				1167	exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
				1168	exofs_oi_objno(oi));
				1169
				1170	/* read the inode from the osd */
				1171	ret = exofs_get_inode(sb, oi, &fcb);
				1172	if (ret)
				1173	goto bad_inode;
				1174
					/* the object was just read from the OSD, so it is marked created */
				1175	set_obj_created(oi);
				1176
				1177	/* copy stuff from on-disk struct to in-memory struct */
				1178	inode->i_mode = le16_to_cpu(fcb.i_mode);
				1179	i_uid_write(inode, le32_to_cpu(fcb.i_uid));
				1180	i_gid_write(inode, le32_to_cpu(fcb.i_gid));
				1181	set_nlink(inode, le16_to_cpu(fcb.i_links_count));
				1182	inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
				1183	inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
				1184	inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
					/* only whole seconds are taken from the fcb */
				1185	inode->i_ctime.tv_nsec =
				1186	inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
				1187	oi->i_commit_size = le64_to_cpu(fcb.i_size);
				1188	i_size_write(inode, oi->i_commit_size);
				1189	inode->i_blkbits = EXOFS_BLKSHIFT;
				1190	inode->i_generation = le32_to_cpu(fcb.i_generation);
				1191
				1192	oi->i_dir_start_lookup = 0;
				1193
				1194	if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
				1195	ret = -ESTALE;
				1196	goto bad_inode;
				1197	}
				1198
					/* device nodes keep the device number in i_data[0] (old encoding)
					* or i_data[1] (new encoding); everything else keeps i_data verbatim
					*/
				1199	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
				1200	if (fcb.i_data[0])
				1201	inode->i_rdev =
				1202	old_decode_dev(le32_to_cpu(fcb.i_data[0]));
				1203	else
				1204	inode->i_rdev =
				1205	new_decode_dev(le32_to_cpu(fcb.i_data[1]));
				1206	} else {
				1207	memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
				1208	}
				1209
				1210	if (S_ISREG(inode->i_mode)) {
				1211	inode->i_op = &exofs_file_inode_operations;
				1212	inode->i_fop = &exofs_file_operations;
				1213	inode->i_mapping->a_ops = &exofs_aops;
				1214	} else if (S_ISDIR(inode->i_mode)) {
				1215	inode->i_op = &exofs_dir_inode_operations;
				1216	inode->i_fop = &exofs_dir_operations;
				1217	inode->i_mapping->a_ops = &exofs_aops;
				1218	} else if (S_ISLNK(inode->i_mode)) {
				1219	if (exofs_inode_is_fast_symlink(inode)) {
					/* fast symlink: the link target is served straight from i_data */
				1220	inode->i_op = &simple_symlink_inode_operations;
				1221	inode->i_link = (char *)oi->i_data;
				1222	} else {
				1223	inode->i_op = &page_symlink_inode_operations;
				1224	inode_nohighmem(inode);
				1225	inode->i_mapping->a_ops = &exofs_aops;
				1226	}
				1227	} else {
					/* every other file type goes through init_special_inode() */
				1228	inode->i_op = &exofs_special_inode_operations;
				1229	if (fcb.i_data[0])
				1230	init_special_inode(inode, inode->i_mode,
				1231	old_decode_dev(le32_to_cpu(fcb.i_data[0])));
				1232	else
				1233	init_special_inode(inode, inode->i_mode,
				1234	new_decode_dev(le32_to_cpu(fcb.i_data[1])));
				1235	}
				1236
				1237	unlock_new_inode(inode);
				1238	return inode;
				1239
				1240	bad_inode:
				1241	iget_failed(inode);
				1242	return ERR_PTR(ret);
				1243	}
				1244
					/*
					* Block until the object behind @oi has been flagged as created.
					*
					* If the created flag is not yet set, the to-be-created flag must be
					* set (BUG otherwise) and the caller sleeps on oi->i_wq until the
					* created flag appears.
					*
					* Returns -EIO when the VFS inode is marked bad, 0 otherwise.
					*/
				1245	int __exofs_wait_obj_created(struct exofs_i_info *oi)
				1246	{
				1247	if (!obj_created(oi)) {
				1248	EXOFS_DBGMSG("!obj_created\n");
				1249	BUG_ON(!obj_2bcreated(oi));
				1250	wait_event(oi->i_wq, obj_created(oi));
				1251	EXOFS_DBGMSG("wait_event done\n");
				1252	}
				1253	return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
				1254	}
				1255
				1256	/*
				1257	* Callback function from exofs_new_inode(). The important thing is that we
				1258	* set the obj_created flag so that other methods know that the object exists on
				1259	* the OSD.
					*
					* @ios is the io state of the create request and is released here;
					* @p is the inode that exofs_new_inode() stored in ios->private.
					* The created flag is set and waiters are woken even when the create
					* request reported an error; the error is only logged.
				1260	*/
				1261	static void create_done(struct ore_io_state *ios, void *p)
				1262	{
				1263	struct inode *inode = p;
				1264	struct exofs_i_info *oi = exofs_i(inode);
				1265	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
				1266	int ret;
				1267
				1268	ret = ore_check_io(ios, NULL);
				1269	ore_put_io_state(ios);
				1270
					/* balances the atomic_inc() done after ore_create() was issued */
				1271	atomic_dec(&sbi->s_curr_pending);
				1272
				1273	if (unlikely(ret)) {
				1274	EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
				1275	_LLU(exofs_oi_objno(oi)),
				1276	_LLU(oi->one_comp.obj.partition));
				1277	/* TODO: When the FS is corrupted, creation can fail because the
				1278	* object already exists. Get rid of this asynchronous creation; if
				1279	* the object exists, increment the obj counter and try the next
				1280	* object until we succeed. All these dangling objects will be made
				1281	* into lost files by chkfs.exofs
				1282	*/
				1283	}
				1284
				1285	set_obj_created(oi);
				1286
				1287	wake_up(&oi->i_wq);
				1288	}
				1289
				1290	/*
				1291	* Set up a new inode and create an object for it on the OSD
					*
					* The inode gets the next object id from sbi->s_nextid, is flagged as
					* "to be created", hashed and marked dirty, and an ore_create()
					* request is issued whose completion callback is create_done().
					*
					* Returns the new inode, ERR_PTR(-ENOMEM) when new_inode() fails, or
					* ERR_PTR() of the ore_get_io_state() / ore_create() error.
					*
					* NOTE(review): the two late error returns hand back an ERR_PTR while
					* the inode is already hashed and dirty and is not released here -
					* confirm whether callers or a later cleanup account for it.
				1292	*/
				1293	struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
				1294	{
				1295	struct super_block *sb = dir->i_sb;
				1296	struct exofs_sb_info *sbi = sb->s_fs_info;
				1297	struct inode *inode;
				1298	struct exofs_i_info *oi;
				1299	struct ore_io_state *ios;
				1300	int ret;
				1301
				1302	inode = new_inode(sb);
				1303	if (!inode)
				1304	return ERR_PTR(-ENOMEM);
				1305
				1306	oi = exofs_i(inode);
				1307	__oi_init(oi);
				1308
					/* waiters on this object check this flag before sleeping */
				1309	set_obj_2bcreated(oi);
				1310
				1311	inode_init_owner(inode, dir, mode);
				1312	inode->i_ino = sbi->s_nextid++;
				1313	inode->i_blkbits = EXOFS_BLKSHIFT;
				1314	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
				1315	oi->i_commit_size = inode->i_size = 0;
				1316	spin_lock(&sbi->s_next_gen_lock);
				1317	inode->i_generation = sbi->s_next_generation++;
				1318	spin_unlock(&sbi->s_next_gen_lock);
				1319	insert_inode_hash(inode);
				1320
				1321	exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
				1322	exofs_oi_objno(oi));
				1323	exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
				1324
				1325	mark_inode_dirty(inode);
				1326
				1327	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
				1328	if (unlikely(ret)) {
				1329	EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
				1330	return ERR_PTR(ret);
				1331	}
				1332
				1333	ios->done = create_done;
				1334	ios->private = inode;
				1335
				1336	ret = ore_create(ios);
				1337	if (ret) {
				1338	ore_put_io_state(ios);
				1339	return ERR_PTR(ret);
				1340	}
					/* dropped again in create_done() */
				1341	atomic_inc(&sbi->s_curr_pending);
				1342
				1343	return inode;
				1344	}
				1345
				1346	/*
				1347	* struct to pass two arguments to update_inode's callback
					*
					* sbi: superblock info whose s_curr_pending counter the callback drops.
					* fcb: the on-disk inode image being written; exofs_update_inode()
					*      points the outgoing attribute at it, and the whole struct is
					*      freed by the callback (async) or by exofs_update_inode() itself.
				1348	*/
				1349	struct updatei_args {
				1350	struct exofs_sb_info *sbi;
				1351	struct exofs_fcb fcb;
				1352	};
				1353
				1354	/*
				1355	* Callback function from exofs_update_inode().
					*
					* Releases the io state, drops the pending-request counter that was
					* raised when the asynchronous write was issued, and frees the
					* updatei_args passed in @p.
				1356	*/
				1357	static void updatei_done(struct ore_io_state *ios, void *p)
				1358	{
				1359	struct updatei_args *args = p;
				1360
				1361	ore_put_io_state(ios);
				1362
				1363	atomic_dec(&args->sbi->s_curr_pending);
				1364
				1365	kfree(args);
				1366	}
				1367
				1368	/*
				1369	* Write the inode to the OSD. Just fill up the struct, and set the attribute
				1370	* synchronously or asynchronously depending on the do_sync flag.
					*
					* The in-memory inode is converted to an on-disk fcb (little-endian
					* fields) inside a heap-allocated updatei_args and written as the
					* inode-data attribute of the object.
					*
					* do_sync != 0: no completion callback is installed; the io state and
					*               args are released here before returning.
					* do_sync == 0: updatei_done() is installed; if ore_write() returns 0
					*               it owns the io state and args from then on.
					*
					* Returns 0 on success, -ENOMEM when args cannot be allocated, or the
					* error from ore_get_io_state() / ore_write().
				1371	*/
				1372	static int exofs_update_inode(struct inode *inode, int do_sync)
				1373	{
				1374	struct exofs_i_info *oi = exofs_i(inode);
				1375	struct super_block *sb = inode->i_sb;
				1376	struct exofs_sb_info *sbi = sb->s_fs_info;
				1377	struct ore_io_state *ios;
				1378	struct osd_attr attr;
				1379	struct exofs_fcb *fcb;
				1380	struct updatei_args *args;
				1381	int ret;
				1382
				1383	args = kzalloc(sizeof(*args), GFP_KERNEL);
				1384	if (!args) {
				1385	EXOFS_DBGMSG("Failed kzalloc of args\n");
				1386	return -ENOMEM;
				1387	}
				1388
				1389	fcb = &args->fcb;
				1390
				1391	fcb->i_mode = cpu_to_le16(inode->i_mode);
				1392	fcb->i_uid = cpu_to_le32(i_uid_read(inode));
				1393	fcb->i_gid = cpu_to_le32(i_gid_read(inode));
				1394	fcb->i_links_count = cpu_to_le16(inode->i_nlink);
				1395	fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
				1396	fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
				1397	fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
					/* the size written now becomes the committed size */
				1398	oi->i_commit_size = i_size_read(inode);
				1399	fcb->i_size = cpu_to_le64(oi->i_commit_size);
				1400	fcb->i_generation = cpu_to_le32(inode->i_generation);
				1401
					/* device nodes: old encoding goes to i_data[0], new to i_data[1] */
				1402	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
				1403	if (old_valid_dev(inode->i_rdev)) {
				1404	fcb->i_data[0] =
				1405	cpu_to_le32(old_encode_dev(inode->i_rdev));
				1406	fcb->i_data[1] = 0;
				1407	} else {
				1408	fcb->i_data[0] = 0;
				1409	fcb->i_data[1] =
				1410	cpu_to_le32(new_encode_dev(inode->i_rdev));
				1411	fcb->i_data[2] = 0;
				1412	}
				1413	} else
				1414	memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
				1415
				1416	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
				1417	if (unlikely(ret)) {
				1418	EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
				1419	goto free_args;
				1420	}
				1421
				1422	attr = g_attr_inode_data;
				1423	attr.val_ptr = fcb;
				1424	ios->out_attr_len = 1;
				1425	ios->out_attr = &attr;
				1426
					/* the return value is not checked; the write is attempted anyway */
				1427	wait_obj_created(oi);
				1428
				1429	if (!do_sync) {
				1430	args->sbi = sbi;
				1431	ios->done = updatei_done;
				1432	ios->private = args;
				1433	}
				1434
				1435	ret = ore_write(ios);
				1436	if (!do_sync && !ret) {
				1437	atomic_inc(&sbi->s_curr_pending);
				1438	goto out; /* deallocation in updatei_done */
				1439	}
				1440
				1441	ore_put_io_state(ios);
				1442	free_args:
				1443	kfree(args);
				1444	out:
				1445	EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
				1446	inode->i_ino, do_sync, ret);
				1447	return ret;
				1448	}
				1449
				1450	int exofs_write_inode(struct inode inode, struct writeback_control wbc)
				1451	{
				1452	/* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */
				1453	return exofs_update_inode(inode, 1);
				1454	}
				1455
				1456	/*
				1457	* Callback function from exofs_delete_inode() - don't have much cleaning up to
				1458	* do.
					*
					* Releases the io state of the remove request and drops the
					* pending-request counter of the exofs_sb_info passed in @p.
				1459	*/
				1460	static void delete_done(struct ore_io_state *ios, void *p)
				1461	{
				1462	struct exofs_sb_info *sbi = p;
				1463
				1464	ore_put_io_state(ios);
				1465
				1466	atomic_dec(&sbi->s_curr_pending);
				1467	}
				1468
				1469	/*
				1470	* Called when the refcount of an inode reaches zero. We remove the object
				1471	* from the OSD here. We make sure the object was created before we try and
				1472	* delete it.
					*
					* The page cache of the inode is always truncated. The object is only
					* removed from the OSD when the inode has no links left and is not a
					* bad inode; otherwise the inode is just cleared. The remove request
					* is asynchronous and completes in delete_done(). Failures to start
					* it are logged and otherwise ignored.
				1473	*/
				1474	void exofs_evict_inode(struct inode *inode)
				1475	{
				1476	struct exofs_i_info *oi = exofs_i(inode);
				1477	struct super_block *sb = inode->i_sb;
				1478	struct exofs_sb_info *sbi = sb->s_fs_info;
				1479	struct ore_io_state *ios;
				1480	int ret;
				1481
				1482	truncate_inode_pages_final(&inode->i_data);
				1483
				1484	/* TODO: should do better here */
				1485	if (inode->i_nlink || is_bad_inode(inode))
				1486	goto no_delete;
				1487
				1488	inode->i_size = 0;
				1489	clear_inode(inode);
				1490
				1491	/* if we are deleting an obj that hasn't been created yet, wait.
				1492	* This also makes sure that create_done cannot be called with an
				1493	* already evicted inode.
				1494	*/
				1495	wait_obj_created(oi);
				1496	/* ignore the error, attempt a remove anyway */
				1497
				1498	/* Now Remove the OSD objects */
				1499	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
				1500	if (unlikely(ret)) {
				1501	EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
				1502	return;
				1503	}
				1504
				1505	ios->done = delete_done;
				1506	ios->private = sbi;
				1507
				1508	ret = ore_remove(ios);
				1509	if (ret) {
				1510	EXOFS_ERR("%s: ore_remove failed\n", __func__);
				1511	ore_put_io_state(ios);
				1512	return;
				1513	}
					/* dropped again in delete_done() */
				1514	atomic_inc(&sbi->s_curr_pending);
				1515
				1516	return;
				1517
				1518	no_delete:
				1519	clear_inode(inode);
				1520	}