Blame - src/kernel/linux/v4.14/drivers/block/loop.c - T103

blob: f81e329d71bfc0969f8054f6bf6c2875e9c230e2 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* linux/drivers/block/loop.c
				3	*
				4	* Written by Theodore Ts'o, 3/29/93
				5	*
				6	* Copyright 1993 by Theodore Ts'o. Redistribution of this file is
				7	* permitted under the GNU General Public License.
				8	*
				9	* DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993
				10	* more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996
				11	*
				12	* Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
				13	* Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
				14	*
				15	* Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
				16	*
				17	* Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
				18	*
				19	* Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
				20	*
				21	* Loadable modules and other fixes by AK, 1998
				22	*
				23	* Make real block number available to downstream transfer functions, enables
				24	* CBC (and relatives) mode encryption requiring unique IVs per data block.
				25	* Reed H. Petty, rhp@draper.net
				26	*
				27	* Maximum number of loop devices now dynamic via max_loop module parameter.
				28	* Russell Kroll <rkroll@exploits.org> 19990701
				29	*
				30	* Maximum number of loop devices when compiled-in now selectable by passing
				31	* max_loop=<1-255> to the kernel on boot.
				32	* Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
				33	*
				34	* Completely rewrite request handling to be make_request_fn style and
				35	* non blocking, pushing work to a helper thread. Lots of fixes from
				36	* Al Viro too.
				37	* Jens Axboe <axboe@suse.de>, Nov 2000
				38	*
				39	* Support up to 256 loop devices
				40	* Heinz Mauelshagen <mge@sistina.com>, Feb 2002
				41	*
				42	* Support for falling back on the write file operation when the address space
				43	* operations write_begin is not available on the backing filesystem.
				44	* Anton Altaparmakov, 16 Feb 2005
				45	*
				46	* Still To Fix:
				47	* - Advisory locking is ignored here.
				48	* - Should use an own CAP_* category instead of CAP_SYS_ADMIN
				49	*
				50	*/
				51
				52	#include <linux/module.h>
				53	#include <linux/moduleparam.h>
				54	#include <linux/sched.h>
				55	#include <linux/fs.h>
				56	#include <linux/file.h>
				57	#include <linux/stat.h>
				58	#include <linux/errno.h>
				59	#include <linux/major.h>
				60	#include <linux/wait.h>
				61	#include <linux/blkdev.h>
				62	#include <linux/blkpg.h>
				63	#include <linux/init.h>
				64	#include <linux/swap.h>
				65	#include <linux/slab.h>
				66	#include <linux/compat.h>
				67	#include <linux/suspend.h>
				68	#include <linux/freezer.h>
				69	#include <linux/mutex.h>
				70	#include <linux/writeback.h>
				71	#include <linux/completion.h>
				72	#include <linux/highmem.h>
				73	#include <linux/kthread.h>
				74	#include <linux/splice.h>
				75	#include <linux/sysfs.h>
				76	#include <linux/miscdevice.h>
				77	#include <linux/falloc.h>
				78	#include <linux/uio.h>
				79	#include "loop.h"
				80
				81	#include <linux/uaccess.h>
				82
/* File-scope IDR and the mutex declared alongside it. */
/* NOTE(review): presumably the mutex guards the IDR — users are outside this view, confirm. */
static DEFINE_IDR(loop_index_idr);
static DEFINE_MUTEX(loop_index_mutex);

/*
 * Partition-related settings. A non-zero part_shift makes loop_set_fd()
 * force LO_FLAGS_PARTSCAN on newly bound devices.
 */
static int max_part;
static int part_shift;
				88
				89	static int transfer_xor(struct loop_device *lo, int cmd,
				90	struct page *raw_page, unsigned raw_off,
				91	struct page *loop_page, unsigned loop_off,
				92	int size, sector_t real_block)
				93	{
				94	char *raw_buf = kmap_atomic(raw_page) + raw_off;
				95	char *loop_buf = kmap_atomic(loop_page) + loop_off;
				96	char in, out, *key;
				97	int i, keysize;
				98
				99	if (cmd == READ) {
				100	in = raw_buf;
				101	out = loop_buf;
				102	} else {
				103	in = loop_buf;
				104	out = raw_buf;
				105	}
				106
				107	key = lo->lo_encrypt_key;
				108	keysize = lo->lo_encrypt_key_size;
				109	for (i = 0; i < size; i++)
				110	out++ = in++ ^ key[(i & 511) % keysize];
				111
				112	kunmap_atomic(loop_buf);
				113	kunmap_atomic(raw_buf);
				114	cond_resched();
				115	return 0;
				116	}
				117
				118	static int xor_init(struct loop_device lo, const struct loop_info64 info)
				119	{
				120	if (unlikely(info->lo_encrypt_key_size <= 0))
				121	return -EINVAL;
				122	return 0;
				123	}
				124
/* Transfer table entry for LO_CRYPT_NONE: no transfer/init callbacks are set. */
static struct loop_func_table none_funcs = {
	.number = LO_CRYPT_NONE,
};
				128
/* Transfer table entry for LO_CRYPT_XOR, backed by transfer_xor()/xor_init(). */
static struct loop_func_table xor_funcs = {
	.number = LO_CRYPT_XOR,
	.transfer = transfer_xor,
	.init = xor_init
};
				134
/*
 * Table of known transfer implementations.
 * xfer_funcs[0] is special - its release function is never called.
 */
static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
	&none_funcs,
	&xor_funcs
};
				140
				141	static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
				142	{
				143	loff_t loopsize;
				144
				145	/* Compute loopsize in bytes */
				146	loopsize = i_size_read(file->f_mapping->host);
				147	if (offset > 0)
				148	loopsize -= offset;
				149	/* offset is beyond i_size, weird but possible */
				150	if (loopsize < 0)
				151	return 0;
				152
				153	if (sizelimit > 0 && sizelimit < loopsize)
				154	loopsize = sizelimit;
				155	/*
				156	* Unfortunately, if we want to do I/O on the device,
				157	* the number of 512-byte sectors has to fit into a sector_t.
				158	*/
				159	return loopsize >> 9;
				160	}
				161
				162	static loff_t get_loop_size(struct loop_device lo, struct file file)
				163	{
				164	return get_size(lo->lo_offset, lo->lo_sizelimit, file);
				165	}
				166
				167	static void __loop_update_dio(struct loop_device *lo, bool dio)
				168	{
				169	struct file *file = lo->lo_backing_file;
				170	struct address_space *mapping = file->f_mapping;
				171	struct inode *inode = mapping->host;
				172	unsigned short sb_bsize = 0;
				173	unsigned dio_align = 0;
				174	bool use_dio;
				175
				176	if (inode->i_sb->s_bdev) {
				177	sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
				178	dio_align = sb_bsize - 1;
				179	}
				180
				181	/*
				182	* We support direct I/O only if lo_offset is aligned with the
				183	* logical I/O size of backing device, and the logical block
				184	* size of loop is bigger than the backing device's and the loop
				185	* needn't transform transfer.
				186	*
				187	* TODO: the above condition may be loosed in the future, and
				188	* direct I/O may be switched runtime at that time because most
				189	* of requests in sane applications should be PAGE_SIZE aligned
				190	*/
				191	if (dio) {
				192	if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
				193	!(lo->lo_offset & dio_align) &&
				194	mapping->a_ops->direct_IO &&
				195	!lo->transfer)
				196	use_dio = true;
				197	else
				198	use_dio = false;
				199	} else {
				200	use_dio = false;
				201	}
				202
				203	if (lo->use_dio == use_dio)
				204	return;
				205
				206	/* flush dirty pages before changing direct IO */
				207	vfs_fsync(file, 0);
				208
				209	/*
				210	* The flag of LO_FLAGS_DIRECT_IO is handled similarly with
				211	* LO_FLAGS_READ_ONLY, both are set from kernel, and losetup
				212	* will get updated by ioctl(LOOP_GET_STATUS)
				213	*/
				214	blk_mq_freeze_queue(lo->lo_queue);
				215	lo->use_dio = use_dio;
				216	if (use_dio) {
				217	queue_flag_clear_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
				218	lo->lo_flags \|= LO_FLAGS_DIRECT_IO;
				219	} else {
				220	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
				221	lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
				222	}
				223	blk_mq_unfreeze_queue(lo->lo_queue);
				224	}
				225
				226	static int
				227	figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
				228	{
				229	loff_t size = get_size(offset, sizelimit, lo->lo_backing_file);
				230	sector_t x = (sector_t)size;
				231	struct block_device *bdev = lo->lo_device;
				232
				233	if (unlikely((loff_t)x != size))
				234	return -EFBIG;
				235	if (lo->lo_offset != offset)
				236	lo->lo_offset = offset;
				237	if (lo->lo_sizelimit != sizelimit)
				238	lo->lo_sizelimit = sizelimit;
				239	set_capacity(lo->lo_disk, x);
				240	bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9);
				241	/* let user-space know about the new size */
				242	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
				243	return 0;
				244	}
				245
				246	static inline int
				247	lo_do_transfer(struct loop_device *lo, int cmd,
				248	struct page *rpage, unsigned roffs,
				249	struct page *lpage, unsigned loffs,
				250	int size, sector_t rblock)
				251	{
				252	int ret;
				253
				254	ret = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
				255	if (likely(!ret))
				256	return 0;
				257
				258	printk_ratelimited(KERN_ERR
				259	"loop: Transfer error at byte offset %llu, length %i.\n",
				260	(unsigned long long)rblock << 9, size);
				261	return ret;
				262	}
				263
				264	static int lo_write_bvec(struct file file, struct bio_vec bvec, loff_t *ppos)
				265	{
				266	struct iov_iter i;
				267	ssize_t bw;
				268
				269	iov_iter_bvec(&i, ITER_BVEC \| WRITE, bvec, 1, bvec->bv_len);
				270
				271	file_start_write(file);
				272	bw = vfs_iter_write(file, &i, ppos, 0);
				273	file_end_write(file);
				274
				275	if (likely(bw == bvec->bv_len))
				276	return 0;
				277
				278	printk_ratelimited(KERN_ERR
				279	"loop: Write error at byte offset %llu, length %i.\n",
				280	(unsigned long long)*ppos, bvec->bv_len);
				281	if (bw >= 0)
				282	bw = -EIO;
				283	return bw;
				284	}
				285
				286	static int lo_write_simple(struct loop_device lo, struct request rq,
				287	loff_t pos)
				288	{
				289	struct bio_vec bvec;
				290	struct req_iterator iter;
				291	int ret = 0;
				292
				293	rq_for_each_segment(bvec, rq, iter) {
				294	ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos);
				295	if (ret < 0)
				296	break;
				297	cond_resched();
				298	}
				299
				300	return ret;
				301	}
				302
				303	/*
				304	* This is the slow, transforming version that needs to double buffer the
				305	* data as it cannot do the transformations in place without having direct
				306	* access to the destination pages of the backing file.
				307	*/
				308	static int lo_write_transfer(struct loop_device lo, struct request rq,
				309	loff_t pos)
				310	{
				311	struct bio_vec bvec, b;
				312	struct req_iterator iter;
				313	struct page *page;
				314	int ret = 0;
				315
				316	page = alloc_page(GFP_NOIO);
				317	if (unlikely(!page))
				318	return -ENOMEM;
				319
				320	rq_for_each_segment(bvec, rq, iter) {
				321	ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page,
				322	bvec.bv_offset, bvec.bv_len, pos >> 9);
				323	if (unlikely(ret))
				324	break;
				325
				326	b.bv_page = page;
				327	b.bv_offset = 0;
				328	b.bv_len = bvec.bv_len;
				329	ret = lo_write_bvec(lo->lo_backing_file, &b, &pos);
				330	if (ret < 0)
				331	break;
				332	}
				333
				334	__free_page(page);
				335	return ret;
				336	}
				337
				338	static int lo_read_simple(struct loop_device lo, struct request rq,
				339	loff_t pos)
				340	{
				341	struct bio_vec bvec;
				342	struct req_iterator iter;
				343	struct iov_iter i;
				344	ssize_t len;
				345
				346	rq_for_each_segment(bvec, rq, iter) {
				347	iov_iter_bvec(&i, ITER_BVEC, &bvec, 1, bvec.bv_len);
				348	len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
				349	if (len < 0)
				350	return len;
				351
				352	flush_dcache_page(bvec.bv_page);
				353
				354	if (len != bvec.bv_len) {
				355	struct bio *bio;
				356
				357	__rq_for_each_bio(bio, rq)
				358	zero_fill_bio(bio);
				359	break;
				360	}
				361	cond_resched();
				362	}
				363
				364	return 0;
				365	}
				366
				367	static int lo_read_transfer(struct loop_device lo, struct request rq,
				368	loff_t pos)
				369	{
				370	struct bio_vec bvec, b;
				371	struct req_iterator iter;
				372	struct iov_iter i;
				373	struct page *page;
				374	ssize_t len;
				375	int ret = 0;
				376
				377	page = alloc_page(GFP_NOIO);
				378	if (unlikely(!page))
				379	return -ENOMEM;
				380
				381	rq_for_each_segment(bvec, rq, iter) {
				382	loff_t offset = pos;
				383
				384	b.bv_page = page;
				385	b.bv_offset = 0;
				386	b.bv_len = bvec.bv_len;
				387
				388	iov_iter_bvec(&i, ITER_BVEC, &b, 1, b.bv_len);
				389	len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
				390	if (len < 0) {
				391	ret = len;
				392	goto out_free_page;
				393	}
				394
				395	ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page,
				396	bvec.bv_offset, len, offset >> 9);
				397	if (ret)
				398	goto out_free_page;
				399
				400	flush_dcache_page(bvec.bv_page);
				401
				402	if (len != bvec.bv_len) {
				403	struct bio *bio;
				404
				405	__rq_for_each_bio(bio, rq)
				406	zero_fill_bio(bio);
				407	break;
				408	}
				409	}
				410
				411	ret = 0;
				412	out_free_page:
				413	__free_page(page);
				414	return ret;
				415	}
				416
				417	static int lo_fallocate(struct loop_device lo, struct request rq, loff_t pos,
				418	int mode)
				419	{
				420	/*
				421	* We use fallocate to manipulate the space mappings used by the image
				422	* a.k.a. discard/zerorange. However we do not support this if
				423	* encryption is enabled, because it may give an attacker useful
				424	* information.
				425	*/
				426	struct file *file = lo->lo_backing_file;
				427	int ret;
				428
				429	mode \|= FALLOC_FL_KEEP_SIZE;
				430
				431	if ((!file->f_op->fallocate) \|\| lo->lo_encrypt_key_size) {
				432	ret = -EOPNOTSUPP;
				433	goto out;
				434	}
				435
				436	ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
				437	if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
				438	ret = -EIO;
				439	out:
				440	return ret;
				441	}
				442
				443	static int lo_req_flush(struct loop_device lo, struct request rq)
				444	{
				445	struct file *file = lo->lo_backing_file;
				446	int ret = vfs_fsync(file, 0);
				447	if (unlikely(ret && ret != -EINVAL))
				448	ret = -EIO;
				449
				450	return ret;
				451	}
				452
				453	static void lo_complete_rq(struct request *rq)
				454	{
				455	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
				456
				457	if (unlikely(req_op(cmd->rq) == REQ_OP_READ && cmd->use_aio &&
				458	cmd->ret >= 0 && cmd->ret < blk_rq_bytes(cmd->rq))) {
				459	struct bio *bio = cmd->rq->bio;
				460
				461	bio_advance(bio, cmd->ret);
				462	zero_fill_bio(bio);
				463	}
				464
				465	blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK);
				466	}
				467
				468	static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
				469	{
				470	if (!atomic_dec_and_test(&cmd->ref))
				471	return;
				472	kfree(cmd->bvec);
				473	cmd->bvec = NULL;
				474	blk_mq_complete_request(cmd->rq);
				475	}
				476
				477	static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
				478	{
				479	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
				480
				481	cmd->ret = ret;
				482	lo_rw_aio_do_completion(cmd);
				483	}
				484
				485	static int lo_rw_aio(struct loop_device lo, struct loop_cmd cmd,
				486	loff_t pos, bool rw)
				487	{
				488	struct iov_iter iter;
				489	struct bio_vec *bvec;
				490	struct request *rq = cmd->rq;
				491	struct bio *bio = rq->bio;
				492	struct file *file = lo->lo_backing_file;
				493	unsigned int offset;
				494	int segments = 0;
				495	int ret;
				496
				497	if (rq->bio != rq->biotail) {
				498	struct req_iterator iter;
				499	struct bio_vec tmp;
				500
				501	__rq_for_each_bio(bio, rq)
				502	segments += bio_segments(bio);
				503	bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_NOIO);
				504	if (!bvec)
				505	return -EIO;
				506	cmd->bvec = bvec;
				507
				508	/*
				509	* The bios of the request may be started from the middle of
				510	* the 'bvec' because of bio splitting, so we can't directly
				511	* copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
				512	* API will take care of all details for us.
				513	*/
				514	rq_for_each_segment(tmp, rq, iter) {
				515	*bvec = tmp;
				516	bvec++;
				517	}
				518	bvec = cmd->bvec;
				519	offset = 0;
				520	} else {
				521	/*
				522	* Same here, this bio may be started from the middle of the
				523	* 'bvec' because of bio splitting, so offset from the bvec
				524	* must be passed to iov iterator
				525	*/
				526	offset = bio->bi_iter.bi_bvec_done;
				527	bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
				528	segments = bio_segments(bio);
				529	}
				530	atomic_set(&cmd->ref, 2);
				531
				532	iov_iter_bvec(&iter, ITER_BVEC \| rw, bvec,
				533	segments, blk_rq_bytes(rq));
				534	iter.iov_offset = offset;
				535
				536	cmd->iocb.ki_pos = pos;
				537	cmd->iocb.ki_filp = file;
				538	cmd->iocb.ki_complete = lo_rw_aio_complete;
				539	cmd->iocb.ki_flags = IOCB_DIRECT;
				540
				541	if (rw == WRITE)
				542	ret = call_write_iter(file, &cmd->iocb, &iter);
				543	else
				544	ret = call_read_iter(file, &cmd->iocb, &iter);
				545
				546	lo_rw_aio_do_completion(cmd);
				547
				548	if (ret != -EIOCBQUEUED)
				549	cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
				550	return 0;
				551	}
				552
				553	static int do_req_filebacked(struct loop_device lo, struct request rq)
				554	{
				555	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
				556	loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
				557
				558	/*
				559	* lo_write_simple and lo_read_simple should have been covered
				560	* by io submit style function like lo_rw_aio(), one blocker
				561	* is that lo_read_simple() need to call flush_dcache_page after
				562	* the page is written from kernel, and it isn't easy to handle
				563	* this in io submit style function which submits all segments
				564	* of the req at one time. And direct read IO doesn't need to
				565	* run flush_dcache_page().
				566	*/
				567	switch (req_op(rq)) {
				568	case REQ_OP_FLUSH:
				569	return lo_req_flush(lo, rq);
				570	case REQ_OP_WRITE_ZEROES:
				571	/*
				572	* If the caller doesn't want deallocation, call zeroout to
				573	* write zeroes the range. Otherwise, punch them out.
				574	*/
				575	return lo_fallocate(lo, rq, pos,
				576	(rq->cmd_flags & REQ_NOUNMAP) ?
				577	FALLOC_FL_ZERO_RANGE :
				578	FALLOC_FL_PUNCH_HOLE);
				579	case REQ_OP_DISCARD:
				580	return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
				581	case REQ_OP_WRITE:
				582	if (lo->transfer)
				583	return lo_write_transfer(lo, rq, pos);
				584	else if (cmd->use_aio)
				585	return lo_rw_aio(lo, cmd, pos, WRITE);
				586	else
				587	return lo_write_simple(lo, rq, pos);
				588	case REQ_OP_READ:
				589	if (lo->transfer)
				590	return lo_read_transfer(lo, rq, pos);
				591	else if (cmd->use_aio)
				592	return lo_rw_aio(lo, cmd, pos, READ);
				593	else
				594	return lo_read_simple(lo, rq, pos);
				595	default:
				596	WARN_ON_ONCE(1);
				597	return -EIO;
				598	break;
				599	}
				600	}
				601
				602	static inline void loop_update_dio(struct loop_device *lo)
				603	{
				604	__loop_update_dio(lo, io_is_direct(lo->lo_backing_file) \|
				605	lo->use_dio);
				606	}
				607
				608	static void loop_reread_partitions(struct loop_device *lo,
				609	struct block_device *bdev)
				610	{
				611	int rc;
				612
				613	/*
				614	* bd_mutex has been held already in release path, so don't
				615	* acquire it if this function is called in such case.
				616	*
				617	* If the reread partition isn't from release path, lo_refcnt
				618	* must be at least one and it can only become zero when the
				619	* current holder is released.
				620	*/
				621	if (!atomic_read(&lo->lo_refcnt))
				622	rc = __blkdev_reread_part(bdev);
				623	else
				624	rc = blkdev_reread_part(bdev);
				625	if (rc)
				626	pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
				627	__func__, lo->lo_number, lo->lo_file_name, rc);
				628	}
				629
				630	static inline int is_loop_device(struct file *file)
				631	{
				632	struct inode *i = file->f_mapping->host;
				633
				634	return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
				635	}
				636
				637	static int loop_validate_file(struct file file, struct block_device bdev)
				638	{
				639	struct inode *inode = file->f_mapping->host;
				640	struct file *f = file;
				641
				642	/* Avoid recursion */
				643	while (is_loop_device(f)) {
				644	struct loop_device *l;
				645
				646	if (f->f_mapping->host->i_bdev == bdev)
				647	return -EBADF;
				648
				649	l = f->f_mapping->host->i_bdev->bd_disk->private_data;
				650	if (l->lo_state == Lo_unbound) {
				651	return -EINVAL;
				652	}
				653	f = l->lo_backing_file;
				654	}
				655	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
				656	return -EINVAL;
				657	return 0;
				658	}
				659
				660	/*
				661	* loop_change_fd switched the backing store of a loopback device to
				662	* a new file. This is useful for operating system installers to free up
				663	* the original file and in High Availability environments to switch to
				664	* an alternative location for the content in case of server meltdown.
				665	* This can only work if the loop device is used read-only, and if the
				666	* new backing store is the same size and type as the old backing store.
				667	*/
				668	static int loop_change_fd(struct loop_device lo, struct block_device bdev,
				669	unsigned int arg)
				670	{
				671	struct file file, old_file;
				672	struct inode *inode;
				673	int error;
				674
				675	error = -ENXIO;
				676	if (lo->lo_state != Lo_bound)
				677	goto out;
				678
				679	/* the loop device has to be read-only */
				680	error = -EINVAL;
				681	if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
				682	goto out;
				683
				684	error = -EBADF;
				685	file = fget(arg);
				686	if (!file)
				687	goto out;
				688
				689	error = loop_validate_file(file, bdev);
				690	if (error)
				691	goto out_putf;
				692
				693	inode = file->f_mapping->host;
				694	old_file = lo->lo_backing_file;
				695
				696	error = -EINVAL;
				697
				698	/* size of the new backing store needs to be the same */
				699	if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
				700	goto out_putf;
				701
				702	/* and ... switch */
				703	blk_mq_freeze_queue(lo->lo_queue);
				704	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
				705	lo->lo_backing_file = file;
				706	lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping);
				707	mapping_set_gfp_mask(file->f_mapping,
				708	lo->old_gfp_mask & ~(__GFP_IO\|__GFP_FS));
				709	loop_update_dio(lo);
				710	blk_mq_unfreeze_queue(lo->lo_queue);
				711
				712	fput(old_file);
				713	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
				714	loop_reread_partitions(lo, bdev);
				715	return 0;
				716
				717	out_putf:
				718	fput(file);
				719	out:
				720	return error;
				721	}
				722
				723	/* loop sysfs attributes */
				724
				725	static ssize_t loop_attr_show(struct device dev, char page,
				726	ssize_t (callback)(struct loop_device , char *))
				727	{
				728	struct gendisk *disk = dev_to_disk(dev);
				729	struct loop_device *lo = disk->private_data;
				730
				731	return callback(lo, page);
				732	}
				733
				734	#define LOOP_ATTR_RO(_name) \
				735	static ssize_t loop_attr_##_name##_show(struct loop_device , char ); \
				736	static ssize_t loop_attr_do_show_##_name(struct device *d, \
				737	struct device_attribute attr, char b) \
				738	{ \
				739	return loop_attr_show(d, b, loop_attr_##_name##_show); \
				740	} \
				741	static struct device_attribute loop_attr_##_name = \
				742	__ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
				743
				744	static ssize_t loop_attr_backing_file_show(struct loop_device lo, char buf)
				745	{
				746	ssize_t ret;
				747	char *p = NULL;
				748
				749	spin_lock_irq(&lo->lo_lock);
				750	if (lo->lo_backing_file)
				751	p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1);
				752	spin_unlock_irq(&lo->lo_lock);
				753
				754	if (IS_ERR_OR_NULL(p))
				755	ret = PTR_ERR(p);
				756	else {
				757	ret = strlen(p);
				758	memmove(buf, p, ret);
				759	buf[ret++] = '\n';
				760	buf[ret] = 0;
				761	}
				762
				763	return ret;
				764	}
				765
				766	static ssize_t loop_attr_offset_show(struct loop_device lo, char buf)
				767	{
				768	return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset);
				769	}
				770
				771	static ssize_t loop_attr_sizelimit_show(struct loop_device lo, char buf)
				772	{
				773	return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
				774	}
				775
				776	static ssize_t loop_attr_autoclear_show(struct loop_device lo, char buf)
				777	{
				778	int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);
				779
				780	return sprintf(buf, "%s\n", autoclear ? "1" : "0");
				781	}
				782
				783	static ssize_t loop_attr_partscan_show(struct loop_device lo, char buf)
				784	{
				785	int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN);
				786
				787	return sprintf(buf, "%s\n", partscan ? "1" : "0");
				788	}
				789
				790	static ssize_t loop_attr_dio_show(struct loop_device lo, char buf)
				791	{
				792	int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO);
				793
				794	return sprintf(buf, "%s\n", dio ? "1" : "0");
				795	}
				796
/* Instantiate one read-only device attribute per loop_attr_*_show() handler. */
LOOP_ATTR_RO(backing_file);
LOOP_ATTR_RO(offset);
LOOP_ATTR_RO(sizelimit);
LOOP_ATTR_RO(autoclear);
LOOP_ATTR_RO(partscan);
LOOP_ATTR_RO(dio);
				803
/* NULL-terminated list of the attributes exposed in the "loop" sysfs group. */
static struct attribute *loop_attrs[] = {
	&loop_attr_backing_file.attr,
	&loop_attr_offset.attr,
	&loop_attr_sizelimit.attr,
	&loop_attr_autoclear.attr,
	&loop_attr_partscan.attr,
	&loop_attr_dio.attr,
	NULL,
};
				813
/* Attribute group named "loop", created and removed by loop_sysfs_init()/loop_sysfs_exit(). */
static struct attribute_group loop_attribute_group = {
	.name = "loop",
	.attrs= loop_attrs,
};
				818
				819	static void loop_sysfs_init(struct loop_device *lo)
				820	{
				821	lo->sysfs_inited = !sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
				822	&loop_attribute_group);
				823	}
				824
				825	static void loop_sysfs_exit(struct loop_device *lo)
				826	{
				827	if (lo->sysfs_inited)
				828	sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
				829	&loop_attribute_group);
				830	}
				831
				832	static void loop_config_discard(struct loop_device *lo)
				833	{
				834	struct file *file = lo->lo_backing_file;
				835	struct inode *inode = file->f_mapping->host;
				836	struct request_queue *q = lo->lo_queue;
				837
				838	/*
				839	* We use punch hole to reclaim the free space used by the
				840	* image a.k.a. discard. However we do not support discard if
				841	* encryption is enabled, because it may give an attacker
				842	* useful information.
				843	*/
				844	if ((!file->f_op->fallocate) \|\|
				845	lo->lo_encrypt_key_size) {
				846	q->limits.discard_granularity = 0;
				847	q->limits.discard_alignment = 0;
				848	blk_queue_max_discard_sectors(q, 0);
				849	blk_queue_max_write_zeroes_sectors(q, 0);
				850	queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
				851	return;
				852	}
				853
				854	q->limits.discard_granularity = inode->i_sb->s_blocksize;
				855	q->limits.discard_alignment = 0;
				856
				857	blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
				858	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
				859	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
				860	}
				861
/*
 * loop_unprepare_queue - tear down the worker set up by
 * loop_prepare_queue(): flush the kthread worker, then stop its task.
 */
static void loop_unprepare_queue(struct loop_device *lo)
{
	kthread_flush_worker(&lo->worker);
	kthread_stop(lo->worker_task);
}
				867
				868	static int loop_kthread_worker_fn(void *worker_ptr)
				869	{
				870	current->flags \|= PF_LESS_THROTTLE \| PF_MEMALLOC_NOIO;
				871	return kthread_worker_fn(worker_ptr);
				872	}
				873
				874	static int loop_prepare_queue(struct loop_device *lo)
				875	{
				876	kthread_init_worker(&lo->worker);
				877	lo->worker_task = kthread_run(loop_kthread_worker_fn,
				878	&lo->worker, "loop%d", lo->lo_number);
				879	if (IS_ERR(lo->worker_task))
				880	return -ENOMEM;
				881	set_user_nice(lo->worker_task, MIN_NICE);
				882	return 0;
				883	}
				884
				885	static int loop_set_fd(struct loop_device *lo, fmode_t mode,
				886	struct block_device *bdev, unsigned int arg)
				887	{
				888	struct file *file;
				889	struct inode *inode;
				890	struct address_space *mapping;
				891	int lo_flags = 0;
				892	int error;
				893	loff_t size;
				894
				895	/* This is safe, since we have a reference from open(). */
				896	__module_get(THIS_MODULE);
				897
				898	error = -EBADF;
				899	file = fget(arg);
				900	if (!file)
				901	goto out;
				902
				903	error = -EBUSY;
				904	if (lo->lo_state != Lo_unbound)
				905	goto out_putf;
				906
				907	error = loop_validate_file(file, bdev);
				908	if (error)
				909	goto out_putf;
				910
				911	mapping = file->f_mapping;
				912	inode = mapping->host;
				913
				914	if (!(file->f_mode & FMODE_WRITE) \|\| !(mode & FMODE_WRITE) \|\|
				915	!file->f_op->write_iter)
				916	lo_flags \|= LO_FLAGS_READ_ONLY;
				917
				918	error = -EFBIG;
				919	size = get_loop_size(lo, file);
				920	if ((loff_t)(sector_t)size != size)
				921	goto out_putf;
				922	error = loop_prepare_queue(lo);
				923	if (error)
				924	goto out_putf;
				925
				926	error = 0;
				927
				928	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
				929
				930	lo->use_dio = false;
				931	lo->lo_device = bdev;
				932	lo->lo_flags = lo_flags;
				933	lo->lo_backing_file = file;
				934	lo->transfer = NULL;
				935	lo->ioctl = NULL;
				936	lo->lo_sizelimit = 0;
				937	lo->old_gfp_mask = mapping_gfp_mask(mapping);
				938	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO\|__GFP_FS));
				939
				940	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
				941	blk_queue_write_cache(lo->lo_queue, true, false);
				942
				943	loop_update_dio(lo);
				944	set_capacity(lo->lo_disk, size);
				945	bd_set_size(bdev, size << 9);
				946	loop_sysfs_init(lo);
				947	/* let user-space know about the new size */
				948	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
				949
				950	set_blocksize(bdev, S_ISBLK(inode->i_mode) ?
				951	block_size(inode->i_bdev) : PAGE_SIZE);
				952
				953	lo->lo_state = Lo_bound;
				954	if (part_shift)
				955	lo->lo_flags \|= LO_FLAGS_PARTSCAN;
				956	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
				957	loop_reread_partitions(lo, bdev);
				958
				959	/* Grab the block_device to prevent its destruction after we
				960	* put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
				961	*/
				962	bdgrab(bdev);
				963	return 0;
				964
				965	out_putf:
				966	fput(file);
				967	out:
				968	/* This is safe: open() is still holding a reference. */
				969	module_put(THIS_MODULE);
				970	return error;
				971	}
				972
				973	static int
				974	loop_release_xfer(struct loop_device *lo)
				975	{
				976	int err = 0;
				977	struct loop_func_table *xfer = lo->lo_encryption;
				978
				979	if (xfer) {
				980	if (xfer->release)
				981	err = xfer->release(lo);
				982	lo->transfer = NULL;
				983	lo->lo_encryption = NULL;
				984	module_put(xfer->owner);
				985	}
				986	return err;
				987	}
				988
				989	static int
				990	loop_init_xfer(struct loop_device lo, struct loop_func_table xfer,
				991	const struct loop_info64 *i)
				992	{
				993	int err = 0;
				994
				995	if (xfer) {
				996	struct module *owner = xfer->owner;
				997
				998	if (!try_module_get(owner))
				999	return -EINVAL;
				1000	if (xfer->init)
				1001	err = xfer->init(lo, i);
				1002	if (err)
				1003	module_put(owner);
				1004	else
				1005	lo->lo_encryption = xfer;
				1006	}
				1007	return err;
				1008	}
				1009
				1010	static int loop_clr_fd(struct loop_device *lo)
				1011	{
				1012	struct file *filp = lo->lo_backing_file;
				1013	gfp_t gfp = lo->old_gfp_mask;
				1014	struct block_device *bdev = lo->lo_device;
				1015
				1016	if (lo->lo_state != Lo_bound)
				1017	return -ENXIO;
				1018
				1019	/*
				1020	* If we've explicitly asked to tear down the loop device,
				1021	* and it has an elevated reference count, set it for auto-teardown when
				1022	* the last reference goes away. This stops $!~#$@ udev from
				1023	* preventing teardown because it decided that it needs to run blkid on
				1024	* the loopback device whenever they appear. xfstests is notorious for
				1025	* failing tests because blkid via udev races with a losetup
				1026	* <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
				1027	* command to fail with EBUSY.
				1028	*/
				1029	if (atomic_read(&lo->lo_refcnt) > 1) {
				1030	lo->lo_flags \|= LO_FLAGS_AUTOCLEAR;
				1031	mutex_unlock(&lo->lo_ctl_mutex);
				1032	return 0;
				1033	}
				1034
				1035	if (filp == NULL)
				1036	return -EINVAL;
				1037
				1038	/* freeze request queue during the transition */
				1039	blk_mq_freeze_queue(lo->lo_queue);
				1040
				1041	spin_lock_irq(&lo->lo_lock);
				1042	lo->lo_state = Lo_rundown;
				1043	lo->lo_backing_file = NULL;
				1044	spin_unlock_irq(&lo->lo_lock);
				1045
				1046	loop_release_xfer(lo);
				1047	lo->transfer = NULL;
				1048	lo->ioctl = NULL;
				1049	lo->lo_device = NULL;
				1050	lo->lo_encryption = NULL;
				1051	lo->lo_offset = 0;
				1052	lo->lo_sizelimit = 0;
				1053	lo->lo_encrypt_key_size = 0;
				1054	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
				1055	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
				1056	memset(lo->lo_file_name, 0, LO_NAME_SIZE);
				1057	blk_queue_logical_block_size(lo->lo_queue, 512);
				1058	blk_queue_physical_block_size(lo->lo_queue, 512);
				1059	blk_queue_io_min(lo->lo_queue, 512);
				1060	if (bdev) {
				1061	bdput(bdev);
				1062	invalidate_bdev(bdev);
				1063	}
				1064	set_capacity(lo->lo_disk, 0);
				1065	loop_sysfs_exit(lo);
				1066	if (bdev) {
				1067	bd_set_size(bdev, 0);
				1068	/* let user-space know about this change */
				1069	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
				1070	}
				1071	mapping_set_gfp_mask(filp->f_mapping, gfp);
				1072	lo->lo_state = Lo_unbound;
				1073	/* This is safe: open() is still holding a reference. */
				1074	module_put(THIS_MODULE);
				1075	blk_mq_unfreeze_queue(lo->lo_queue);
				1076
				1077	if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
				1078	loop_reread_partitions(lo, bdev);
				1079	lo->lo_flags = 0;
				1080	if (!part_shift)
				1081	lo->lo_disk->flags \|= GENHD_FL_NO_PART_SCAN;
				1082	loop_unprepare_queue(lo);
				1083	mutex_unlock(&lo->lo_ctl_mutex);
				1084	/*
				1085	* Need not hold lo_ctl_mutex to fput backing file.
				1086	* Calling fput holding lo_ctl_mutex triggers a circular
				1087	* lock dependency possibility warning as fput can take
				1088	* bd_mutex which is usually taken before lo_ctl_mutex.
				1089	*/
				1090	fput(filp);
				1091	return 0;
				1092	}
				1093
				1094	static int
				1095	loop_set_status(struct loop_device lo, const struct loop_info64 info)
				1096	{
				1097	int err;
				1098	struct loop_func_table *xfer;
				1099	kuid_t uid = current_uid();
				1100
				1101	if (lo->lo_encrypt_key_size &&
				1102	!uid_eq(lo->lo_key_owner, uid) &&
				1103	!capable(CAP_SYS_ADMIN))
				1104	return -EPERM;
				1105	if (lo->lo_state != Lo_bound)
				1106	return -ENXIO;
				1107	if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
				1108	return -EINVAL;
				1109
				1110	if (lo->lo_offset != info->lo_offset \|\|
				1111	lo->lo_sizelimit != info->lo_sizelimit) {
				1112	sync_blockdev(lo->lo_device);
				1113	invalidate_bdev(lo->lo_device);
				1114	}
				1115
				1116	/* I/O need to be drained during transfer transition */
				1117	blk_mq_freeze_queue(lo->lo_queue);
				1118
				1119	err = loop_release_xfer(lo);
				1120	if (err)
				1121	goto exit;
				1122
				1123	if (info->lo_encrypt_type) {
				1124	unsigned int type = info->lo_encrypt_type;
				1125
				1126	if (type >= MAX_LO_CRYPT) {
				1127	err = -EINVAL;
				1128	goto exit;
				1129	}
				1130	xfer = xfer_funcs[type];
				1131	if (xfer == NULL) {
				1132	err = -EINVAL;
				1133	goto exit;
				1134	}
				1135	} else
				1136	xfer = NULL;
				1137
				1138	err = loop_init_xfer(lo, xfer, info);
				1139	if (err)
				1140	goto exit;
				1141
				1142	if (lo->lo_offset != info->lo_offset \|\|
				1143	lo->lo_sizelimit != info->lo_sizelimit) {
				1144	/* kill_bdev should have truncated all the pages */
				1145	if (lo->lo_device->bd_inode->i_mapping->nrpages) {
				1146	err = -EAGAIN;
				1147	pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
				1148	__func__, lo->lo_number, lo->lo_file_name,
				1149	lo->lo_device->bd_inode->i_mapping->nrpages);
				1150	goto exit;
				1151	}
				1152	if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
				1153	err = -EFBIG;
				1154	goto exit;
				1155	}
				1156	}
				1157
				1158	loop_config_discard(lo);
				1159
				1160	memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
				1161	memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
				1162	lo->lo_file_name[LO_NAME_SIZE-1] = 0;
				1163	lo->lo_crypt_name[LO_NAME_SIZE-1] = 0;
				1164
				1165	if (!xfer)
				1166	xfer = &none_funcs;
				1167	lo->transfer = xfer->transfer;
				1168	lo->ioctl = xfer->ioctl;
				1169
				1170	if ((lo->lo_flags & LO_FLAGS_AUTOCLEAR) !=
				1171	(info->lo_flags & LO_FLAGS_AUTOCLEAR))
				1172	lo->lo_flags ^= LO_FLAGS_AUTOCLEAR;
				1173
				1174	lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
				1175	lo->lo_init[0] = info->lo_init[0];
				1176	lo->lo_init[1] = info->lo_init[1];
				1177	if (info->lo_encrypt_key_size) {
				1178	memcpy(lo->lo_encrypt_key, info->lo_encrypt_key,
				1179	info->lo_encrypt_key_size);
				1180	lo->lo_key_owner = uid;
				1181	}
				1182
				1183	/* update dio if lo_offset or transfer is changed */
				1184	__loop_update_dio(lo, lo->use_dio);
				1185
				1186	exit:
				1187	blk_mq_unfreeze_queue(lo->lo_queue);
				1188
				1189	if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) &&
				1190	!(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
				1191	lo->lo_flags \|= LO_FLAGS_PARTSCAN;
				1192	lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
				1193	loop_reread_partitions(lo, lo->lo_device);
				1194	}
				1195
				1196	return err;
				1197	}
				1198
				1199	static int
				1200	loop_get_status(struct loop_device lo, struct loop_info64 info)
				1201	{
				1202	struct path path;
				1203	struct kstat stat;
				1204	int ret;
				1205
				1206	if (lo->lo_state != Lo_bound) {
				1207	mutex_unlock(&lo->lo_ctl_mutex);
				1208	return -ENXIO;
				1209	}
				1210
				1211	memset(info, 0, sizeof(*info));
				1212	info->lo_number = lo->lo_number;
				1213	info->lo_offset = lo->lo_offset;
				1214	info->lo_sizelimit = lo->lo_sizelimit;
				1215	info->lo_flags = lo->lo_flags;
				1216	memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);
				1217	memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE);
				1218	info->lo_encrypt_type =
				1219	lo->lo_encryption ? lo->lo_encryption->number : 0;
				1220	if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) {
				1221	info->lo_encrypt_key_size = lo->lo_encrypt_key_size;
				1222	memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
				1223	lo->lo_encrypt_key_size);
				1224	}
				1225
				1226	/* Drop lo_ctl_mutex while we call into the filesystem. */
				1227	path = lo->lo_backing_file->f_path;
				1228	path_get(&path);
				1229	mutex_unlock(&lo->lo_ctl_mutex);
				1230	ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
				1231	if (!ret) {
				1232	info->lo_device = huge_encode_dev(stat.dev);
				1233	info->lo_inode = stat.ino;
				1234	info->lo_rdevice = huge_encode_dev(stat.rdev);
				1235	}
				1236	path_put(&path);
				1237	return ret;
				1238	}
				1239
				1240	static void
				1241	loop_info64_from_old(const struct loop_info info, struct loop_info64 info64)
				1242	{
				1243	memset(info64, 0, sizeof(*info64));
				1244	info64->lo_number = info->lo_number;
				1245	info64->lo_device = info->lo_device;
				1246	info64->lo_inode = info->lo_inode;
				1247	info64->lo_rdevice = info->lo_rdevice;
				1248	info64->lo_offset = info->lo_offset;
				1249	info64->lo_sizelimit = 0;
				1250	info64->lo_encrypt_type = info->lo_encrypt_type;
				1251	info64->lo_encrypt_key_size = info->lo_encrypt_key_size;
				1252	info64->lo_flags = info->lo_flags;
				1253	info64->lo_init[0] = info->lo_init[0];
				1254	info64->lo_init[1] = info->lo_init[1];
				1255	if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
				1256	memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE);
				1257	else
				1258	memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
				1259	memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE);
				1260	}
				1261
				1262	static int
				1263	loop_info64_to_old(const struct loop_info64 info64, struct loop_info info)
				1264	{
				1265	memset(info, 0, sizeof(*info));
				1266	info->lo_number = info64->lo_number;
				1267	info->lo_device = info64->lo_device;
				1268	info->lo_inode = info64->lo_inode;
				1269	info->lo_rdevice = info64->lo_rdevice;
				1270	info->lo_offset = info64->lo_offset;
				1271	info->lo_encrypt_type = info64->lo_encrypt_type;
				1272	info->lo_encrypt_key_size = info64->lo_encrypt_key_size;
				1273	info->lo_flags = info64->lo_flags;
				1274	info->lo_init[0] = info64->lo_init[0];
				1275	info->lo_init[1] = info64->lo_init[1];
				1276	if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
				1277	memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
				1278	else
				1279	memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
				1280	memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
				1281
				1282	/* error in case values were truncated */
				1283	if (info->lo_device != info64->lo_device \|\|
				1284	info->lo_rdevice != info64->lo_rdevice \|\|
				1285	info->lo_inode != info64->lo_inode \|\|
				1286	info->lo_offset != info64->lo_offset)
				1287	return -EOVERFLOW;
				1288
				1289	return 0;
				1290	}
				1291
				1292	static int
				1293	loop_set_status_old(struct loop_device lo, const struct loop_info __user arg)
				1294	{
				1295	struct loop_info info;
				1296	struct loop_info64 info64;
				1297
				1298	if (copy_from_user(&info, arg, sizeof (struct loop_info)))
				1299	return -EFAULT;
				1300	loop_info64_from_old(&info, &info64);
				1301	return loop_set_status(lo, &info64);
				1302	}
				1303
				1304	static int
				1305	loop_set_status64(struct loop_device lo, const struct loop_info64 __user arg)
				1306	{
				1307	struct loop_info64 info64;
				1308
				1309	if (copy_from_user(&info64, arg, sizeof (struct loop_info64)))
				1310	return -EFAULT;
				1311	return loop_set_status(lo, &info64);
				1312	}
				1313
				1314	static int
				1315	loop_get_status_old(struct loop_device lo, struct loop_info __user arg) {
				1316	struct loop_info info;
				1317	struct loop_info64 info64;
				1318	int err;
				1319
				1320	if (!arg) {
				1321	mutex_unlock(&lo->lo_ctl_mutex);
				1322	return -EINVAL;
				1323	}
				1324	err = loop_get_status(lo, &info64);
				1325	if (!err)
				1326	err = loop_info64_to_old(&info64, &info);
				1327	if (!err && copy_to_user(arg, &info, sizeof(info)))
				1328	err = -EFAULT;
				1329
				1330	return err;
				1331	}
				1332
				1333	static int
				1334	loop_get_status64(struct loop_device lo, struct loop_info64 __user arg) {
				1335	struct loop_info64 info64;
				1336	int err;
				1337
				1338	if (!arg) {
				1339	mutex_unlock(&lo->lo_ctl_mutex);
				1340	return -EINVAL;
				1341	}
				1342	err = loop_get_status(lo, &info64);
				1343	if (!err && copy_to_user(arg, &info64, sizeof(info64)))
				1344	err = -EFAULT;
				1345
				1346	return err;
				1347	}
				1348
				1349	static int loop_set_capacity(struct loop_device *lo)
				1350	{
				1351	if (unlikely(lo->lo_state != Lo_bound))
				1352	return -ENXIO;
				1353
				1354	return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
				1355	}
				1356
				1357	static int loop_set_dio(struct loop_device *lo, unsigned long arg)
				1358	{
				1359	int error = -ENXIO;
				1360	if (lo->lo_state != Lo_bound)
				1361	goto out;
				1362
				1363	__loop_update_dio(lo, !!arg);
				1364	if (lo->use_dio == !!arg)
				1365	return 0;
				1366	error = -EINVAL;
				1367	out:
				1368	return error;
				1369	}
				1370
				1371	static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
				1372	{
				1373	int err = 0;
				1374
				1375	if (lo->lo_state != Lo_bound)
				1376	return -ENXIO;
				1377
				1378	if (arg < 512 \|\| arg > PAGE_SIZE \|\| !is_power_of_2(arg))
				1379	return -EINVAL;
				1380
				1381	if (lo->lo_queue->limits.logical_block_size != arg) {
				1382	sync_blockdev(lo->lo_device);
				1383	invalidate_bdev(lo->lo_device);
				1384	}
				1385
				1386	blk_mq_freeze_queue(lo->lo_queue);
				1387
				1388	/* invalidate_bdev should have truncated all the pages */
				1389	if (lo->lo_queue->limits.logical_block_size != arg &&
				1390	lo->lo_device->bd_inode->i_mapping->nrpages) {
				1391	err = -EAGAIN;
				1392	pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
				1393	__func__, lo->lo_number, lo->lo_file_name,
				1394	lo->lo_device->bd_inode->i_mapping->nrpages);
				1395	goto out_unfreeze;
				1396	}
				1397
				1398	blk_queue_logical_block_size(lo->lo_queue, arg);
				1399	blk_queue_physical_block_size(lo->lo_queue, arg);
				1400	blk_queue_io_min(lo->lo_queue, arg);
				1401	loop_update_dio(lo);
				1402	out_unfreeze:
				1403	blk_mq_unfreeze_queue(lo->lo_queue);
				1404
				1405	return err;
				1406	}
				1407
				1408	static int lo_ioctl(struct block_device *bdev, fmode_t mode,
				1409	unsigned int cmd, unsigned long arg)
				1410	{
				1411	struct loop_device *lo = bdev->bd_disk->private_data;
				1412	int err;
				1413
				1414	mutex_lock_nested(&lo->lo_ctl_mutex, 1);
				1415	switch (cmd) {
				1416	case LOOP_SET_FD:
				1417	err = loop_set_fd(lo, mode, bdev, arg);
				1418	break;
				1419	case LOOP_CHANGE_FD:
				1420	err = loop_change_fd(lo, bdev, arg);
				1421	break;
				1422	case LOOP_CLR_FD:
				1423	/* loop_clr_fd would have unlocked lo_ctl_mutex on success */
				1424	err = loop_clr_fd(lo);
				1425	if (!err)
				1426	goto out_unlocked;
				1427	break;
				1428	case LOOP_SET_STATUS:
				1429	err = -EPERM;
				1430	if ((mode & FMODE_WRITE) \|\| capable(CAP_SYS_ADMIN))
				1431	err = loop_set_status_old(lo,
				1432	(struct loop_info __user *)arg);
				1433	break;
				1434	case LOOP_GET_STATUS:
				1435	err = loop_get_status_old(lo, (struct loop_info __user *) arg);
				1436	/* loop_get_status() unlocks lo_ctl_mutex */
				1437	goto out_unlocked;
				1438	case LOOP_SET_STATUS64:
				1439	err = -EPERM;
				1440	if ((mode & FMODE_WRITE) \|\| capable(CAP_SYS_ADMIN))
				1441	err = loop_set_status64(lo,
				1442	(struct loop_info64 __user *) arg);
				1443	break;
				1444	case LOOP_GET_STATUS64:
				1445	err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
				1446	/* loop_get_status() unlocks lo_ctl_mutex */
				1447	goto out_unlocked;
				1448	case LOOP_SET_CAPACITY:
				1449	err = -EPERM;
				1450	if ((mode & FMODE_WRITE) \|\| capable(CAP_SYS_ADMIN))
				1451	err = loop_set_capacity(lo);
				1452	break;
				1453	case LOOP_SET_DIRECT_IO:
				1454	err = -EPERM;
				1455	if ((mode & FMODE_WRITE) \|\| capable(CAP_SYS_ADMIN))
				1456	err = loop_set_dio(lo, arg);
				1457	break;
				1458	case LOOP_SET_BLOCK_SIZE:
				1459	err = -EPERM;
				1460	if ((mode & FMODE_WRITE) \|\| capable(CAP_SYS_ADMIN))
				1461	err = loop_set_block_size(lo, arg);
				1462	break;
				1463	default:
				1464	err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
				1465	}
				1466	mutex_unlock(&lo->lo_ctl_mutex);
				1467
				1468	out_unlocked:
				1469	return err;
				1470	}
				1471
				1472	#ifdef CONFIG_COMPAT
				1473	struct compat_loop_info {
				1474	compat_int_t lo_number; /* ioctl r/o */
				1475	compat_dev_t lo_device; /* ioctl r/o */
				1476	compat_ulong_t lo_inode; /* ioctl r/o */
				1477	compat_dev_t lo_rdevice; /* ioctl r/o */
				1478	compat_int_t lo_offset;
				1479	compat_int_t lo_encrypt_type;
				1480	compat_int_t lo_encrypt_key_size; /* ioctl w/o */
				1481	compat_int_t lo_flags; /* ioctl r/o */
				1482	char lo_name[LO_NAME_SIZE];
				1483	unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */
				1484	compat_ulong_t lo_init[2];
				1485	char reserved[4];
				1486	};
				1487
				1488	/*
				1489	* Transfer 32-bit compatibility structure in userspace to 64-bit loop info
				1490	* - noinlined to reduce stack space usage in main part of driver
				1491	*/
				1492	static noinline int
				1493	loop_info64_from_compat(const struct compat_loop_info __user *arg,
				1494	struct loop_info64 *info64)
				1495	{
				1496	struct compat_loop_info info;
				1497
				1498	if (copy_from_user(&info, arg, sizeof(info)))
				1499	return -EFAULT;
				1500
				1501	memset(info64, 0, sizeof(*info64));
				1502	info64->lo_number = info.lo_number;
				1503	info64->lo_device = info.lo_device;
				1504	info64->lo_inode = info.lo_inode;
				1505	info64->lo_rdevice = info.lo_rdevice;
				1506	info64->lo_offset = info.lo_offset;
				1507	info64->lo_sizelimit = 0;
				1508	info64->lo_encrypt_type = info.lo_encrypt_type;
				1509	info64->lo_encrypt_key_size = info.lo_encrypt_key_size;
				1510	info64->lo_flags = info.lo_flags;
				1511	info64->lo_init[0] = info.lo_init[0];
				1512	info64->lo_init[1] = info.lo_init[1];
				1513	if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
				1514	memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE);
				1515	else
				1516	memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
				1517	memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE);
				1518	return 0;
				1519	}
				1520
				1521	/*
				1522	* Transfer 64-bit loop info to 32-bit compatibility structure in userspace
				1523	* - noinlined to reduce stack space usage in main part of driver
				1524	*/
				1525	static noinline int
				1526	loop_info64_to_compat(const struct loop_info64 *info64,
				1527	struct compat_loop_info __user *arg)
				1528	{
				1529	struct compat_loop_info info;
				1530
				1531	memset(&info, 0, sizeof(info));
				1532	info.lo_number = info64->lo_number;
				1533	info.lo_device = info64->lo_device;
				1534	info.lo_inode = info64->lo_inode;
				1535	info.lo_rdevice = info64->lo_rdevice;
				1536	info.lo_offset = info64->lo_offset;
				1537	info.lo_encrypt_type = info64->lo_encrypt_type;
				1538	info.lo_encrypt_key_size = info64->lo_encrypt_key_size;
				1539	info.lo_flags = info64->lo_flags;
				1540	info.lo_init[0] = info64->lo_init[0];
				1541	info.lo_init[1] = info64->lo_init[1];
				1542	if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
				1543	memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
				1544	else
				1545	memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);
				1546	memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
				1547
				1548	/* error in case values were truncated */
				1549	if (info.lo_device != info64->lo_device \|\|
				1550	info.lo_rdevice != info64->lo_rdevice \|\|
				1551	info.lo_inode != info64->lo_inode \|\|
				1552	info.lo_offset != info64->lo_offset \|\|
				1553	info.lo_init[0] != info64->lo_init[0] \|\|
				1554	info.lo_init[1] != info64->lo_init[1])
				1555	return -EOVERFLOW;
				1556
				1557	if (copy_to_user(arg, &info, sizeof(info)))
				1558	return -EFAULT;
				1559	return 0;
				1560	}
				1561
				1562	static int
				1563	loop_set_status_compat(struct loop_device *lo,
				1564	const struct compat_loop_info __user *arg)
				1565	{
				1566	struct loop_info64 info64;
				1567	int ret;
				1568
				1569	ret = loop_info64_from_compat(arg, &info64);
				1570	if (ret < 0)
				1571	return ret;
				1572	return loop_set_status(lo, &info64);
				1573	}
				1574
				1575	static int
				1576	loop_get_status_compat(struct loop_device *lo,
				1577	struct compat_loop_info __user *arg)
				1578	{
				1579	struct loop_info64 info64;
				1580	int err;
				1581
				1582	if (!arg) {
				1583	mutex_unlock(&lo->lo_ctl_mutex);
				1584	return -EINVAL;
				1585	}
				1586	err = loop_get_status(lo, &info64);
				1587	if (!err)
				1588	err = loop_info64_to_compat(&info64, arg);
				1589	return err;
				1590	}
				1591
				1592	static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
				1593	unsigned int cmd, unsigned long arg)
				1594	{
				1595	struct loop_device *lo = bdev->bd_disk->private_data;
				1596	int err;
				1597
				1598	switch(cmd) {
				1599	case LOOP_SET_STATUS:
				1600	mutex_lock(&lo->lo_ctl_mutex);
				1601	err = loop_set_status_compat(
				1602	lo, (const struct compat_loop_info __user *) arg);
				1603	mutex_unlock(&lo->lo_ctl_mutex);
				1604	break;
				1605	case LOOP_GET_STATUS:
				1606	mutex_lock(&lo->lo_ctl_mutex);
				1607	err = loop_get_status_compat(
				1608	lo, (struct compat_loop_info __user *) arg);
				1609	/* loop_get_status() unlocks lo_ctl_mutex */
				1610	break;
				1611	case LOOP_SET_CAPACITY:
				1612	case LOOP_CLR_FD:
				1613	case LOOP_GET_STATUS64:
				1614	case LOOP_SET_STATUS64:
				1615	arg = (unsigned long) compat_ptr(arg);
				1616	case LOOP_SET_FD:
				1617	case LOOP_CHANGE_FD:
				1618	case LOOP_SET_BLOCK_SIZE:
				1619	case LOOP_SET_DIRECT_IO:
				1620	err = lo_ioctl(bdev, mode, cmd, arg);
				1621	break;
				1622	default:
				1623	err = -ENOIOCTLCMD;
				1624	break;
				1625	}
				1626	return err;
				1627	}
				1628	#endif
				1629
				1630	static int lo_open(struct block_device *bdev, fmode_t mode)
				1631	{
				1632	struct loop_device *lo;
				1633	int err = 0;
				1634
				1635	mutex_lock(&loop_index_mutex);
				1636	lo = bdev->bd_disk->private_data;
				1637	if (!lo) {
				1638	err = -ENXIO;
				1639	goto out;
				1640	}
				1641
				1642	atomic_inc(&lo->lo_refcnt);
				1643	out:
				1644	mutex_unlock(&loop_index_mutex);
				1645	return err;
				1646	}
				1647
				1648	static void __lo_release(struct loop_device *lo)
				1649	{
				1650	int err;
				1651
				1652	if (atomic_dec_return(&lo->lo_refcnt))
				1653	return;
				1654
				1655	mutex_lock(&lo->lo_ctl_mutex);
				1656	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
				1657	/*
				1658	* In autoclear mode, stop the loop thread
				1659	* and remove configuration after last close.
				1660	*/
				1661	err = loop_clr_fd(lo);
				1662	if (!err)
				1663	return;
				1664	} else if (lo->lo_state == Lo_bound) {
				1665	/*
				1666	* Otherwise keep thread (if running) and config,
				1667	* but flush possible ongoing bios in thread.
				1668	*/
				1669	blk_mq_freeze_queue(lo->lo_queue);
				1670	blk_mq_unfreeze_queue(lo->lo_queue);
				1671	}
				1672
				1673	mutex_unlock(&lo->lo_ctl_mutex);
				1674	}
				1675
				1676	static void lo_release(struct gendisk *disk, fmode_t mode)
				1677	{
				1678	mutex_lock(&loop_index_mutex);
				1679	__lo_release(disk->private_data);
				1680	mutex_unlock(&loop_index_mutex);
				1681	}
				1682
				1683	static const struct block_device_operations lo_fops = {
				1684	.owner = THIS_MODULE,
				1685	.open = lo_open,
				1686	.release = lo_release,
				1687	.ioctl = lo_ioctl,
				1688	#ifdef CONFIG_COMPAT
				1689	.compat_ioctl = lo_compat_ioctl,
				1690	#endif
				1691	};
				1692
				1693	/*
				1694	* And now the modules code and kernel interface.
				1695	*/
				1696	static int max_loop;
				1697	module_param(max_loop, int, S_IRUGO);
				1698	MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
				1699	module_param(max_part, int, S_IRUGO);
				1700	MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
				1701	MODULE_LICENSE("GPL");
				1702	MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
				1703
				1704	int loop_register_transfer(struct loop_func_table *funcs)
				1705	{
				1706	unsigned int n = funcs->number;
				1707
				1708	if (n >= MAX_LO_CRYPT \|\| xfer_funcs[n])
				1709	return -EINVAL;
				1710	xfer_funcs[n] = funcs;
				1711	return 0;
				1712	}
				1713
				1714	static int unregister_transfer_cb(int id, void ptr, void data)
				1715	{
				1716	struct loop_device *lo = ptr;
				1717	struct loop_func_table *xfer = data;
				1718
				1719	mutex_lock(&lo->lo_ctl_mutex);
				1720	if (lo->lo_encryption == xfer)
				1721	loop_release_xfer(lo);
				1722	mutex_unlock(&lo->lo_ctl_mutex);
				1723	return 0;
				1724	}
				1725
				1726	int loop_unregister_transfer(int number)
				1727	{
				1728	unsigned int n = number;
				1729	struct loop_func_table *xfer;
				1730
				1731	if (n == 0 \|\| n >= MAX_LO_CRYPT \|\| (xfer = xfer_funcs[n]) == NULL)
				1732	return -EINVAL;
				1733
				1734	xfer_funcs[n] = NULL;
				1735	idr_for_each(&loop_index_idr, &unregister_transfer_cb, xfer);
				1736	return 0;
				1737	}
				1738
				1739	EXPORT_SYMBOL(loop_register_transfer);
				1740	EXPORT_SYMBOL(loop_unregister_transfer);
				1741
				1742	static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
				1743	const struct blk_mq_queue_data *bd)
				1744	{
				1745	struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
				1746	struct loop_device *lo = cmd->rq->q->queuedata;
				1747
				1748	blk_mq_start_request(bd->rq);
				1749
				1750	if (lo->lo_state != Lo_bound)
				1751	return BLK_STS_IOERR;
				1752
				1753	switch (req_op(cmd->rq)) {
				1754	case REQ_OP_FLUSH:
				1755	case REQ_OP_DISCARD:
				1756	case REQ_OP_WRITE_ZEROES:
				1757	cmd->use_aio = false;
				1758	break;
				1759	default:
				1760	cmd->use_aio = lo->use_dio;
				1761	break;
				1762	}
				1763
				1764	kthread_queue_work(&lo->worker, &cmd->work);
				1765
				1766	return BLK_STS_OK;
				1767	}
				1768
				1769	static void loop_handle_cmd(struct loop_cmd *cmd)
				1770	{
				1771	const bool write = op_is_write(req_op(cmd->rq));
				1772	struct loop_device *lo = cmd->rq->q->queuedata;
				1773	int ret = 0;
				1774
				1775	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
				1776	ret = -EIO;
				1777	goto failed;
				1778	}
				1779
				1780	ret = do_req_filebacked(lo, cmd->rq);
				1781	failed:
				1782	/* complete non-aio request */
				1783	if (!cmd->use_aio \|\| ret) {
				1784	cmd->ret = ret ? -EIO : 0;
				1785	blk_mq_complete_request(cmd->rq);
				1786	}
				1787	}
				1788
				1789	static void loop_queue_work(struct kthread_work *work)
				1790	{
				1791	struct loop_cmd *cmd =
				1792	container_of(work, struct loop_cmd, work);
				1793
				1794	loop_handle_cmd(cmd);
				1795	}
				1796
				1797	static int loop_init_request(struct blk_mq_tag_set set, struct request rq,
				1798	unsigned int hctx_idx, unsigned int numa_node)
				1799	{
				1800	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
				1801
				1802	cmd->rq = rq;
				1803	kthread_init_work(&cmd->work, loop_queue_work);
				1804
				1805	return 0;
				1806	}
				1807
				1808	static const struct blk_mq_ops loop_mq_ops = {
				1809	.queue_rq = loop_queue_rq,
				1810	.init_request = loop_init_request,
				1811	.complete = lo_complete_rq,
				1812	};
				1813
				1814	static int loop_add(struct loop_device **l, int i)
				1815	{
				1816	struct loop_device *lo;
				1817	struct gendisk *disk;
				1818	int err;
				1819
				1820	err = -ENOMEM;
				1821	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
				1822	if (!lo)
				1823	goto out;
				1824
				1825	lo->lo_state = Lo_unbound;
				1826
				1827	/* allocate id, if @id >= 0, we're requesting that specific id */
				1828	if (i >= 0) {
				1829	err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
				1830	if (err == -ENOSPC)
				1831	err = -EEXIST;
				1832	} else {
				1833	err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL);
				1834	}
				1835	if (err < 0)
				1836	goto out_free_dev;
				1837	i = err;
				1838
				1839	err = -ENOMEM;
				1840	lo->tag_set.ops = &loop_mq_ops;
				1841	lo->tag_set.nr_hw_queues = 1;
				1842	lo->tag_set.queue_depth = 128;
				1843	lo->tag_set.numa_node = NUMA_NO_NODE;
				1844	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
				1845	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE \| BLK_MQ_F_SG_MERGE;
				1846	lo->tag_set.driver_data = lo;
				1847
				1848	err = blk_mq_alloc_tag_set(&lo->tag_set);
				1849	if (err)
				1850	goto out_free_idr;
				1851
				1852	lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
				1853	if (IS_ERR_OR_NULL(lo->lo_queue)) {
				1854	err = PTR_ERR(lo->lo_queue);
				1855	goto out_cleanup_tags;
				1856	}
				1857	lo->lo_queue->queuedata = lo;
				1858
				1859	blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS);
				1860
				1861	/*
				1862	* By default, we do buffer IO, so it doesn't make sense to enable
				1863	* merge because the I/O submitted to backing file is handled page by
				1864	* page. For directio mode, merge does help to dispatch bigger request
				1865	* to underlayer disk. We will enable merge once directio is enabled.
				1866	*/
				1867	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
				1868
				1869	err = -ENOMEM;
				1870	disk = lo->lo_disk = alloc_disk(1 << part_shift);
				1871	if (!disk)
				1872	goto out_free_queue;
				1873
				1874	/*
				1875	* Disable partition scanning by default. The in-kernel partition
				1876	* scanning can be requested individually per-device during its
				1877	* setup. Userspace can always add and remove partitions from all
				1878	* devices. The needed partition minors are allocated from the
				1879	* extended minor space, the main loop device numbers will continue
				1880	* to match the loop minors, regardless of the number of partitions
				1881	* used.
				1882	*
				1883	* If max_part is given, partition scanning is globally enabled for
				1884	* all loop devices. The minors for the main loop devices will be
				1885	* multiples of max_part.
				1886	*
				1887	* Note: Global-for-all-devices, set-only-at-init, read-only module
				1888	* parameteters like 'max_loop' and 'max_part' make things needlessly
				1889	* complicated, are too static, inflexible and may surprise
				1890	* userspace tools. Parameters like this in general should be avoided.
				1891	*/
				1892	if (!part_shift)
				1893	disk->flags \|= GENHD_FL_NO_PART_SCAN;
				1894	disk->flags \|= GENHD_FL_EXT_DEVT;
				1895	mutex_init(&lo->lo_ctl_mutex);
				1896	atomic_set(&lo->lo_refcnt, 0);
				1897	lo->lo_number = i;
				1898	spin_lock_init(&lo->lo_lock);
				1899	disk->major = LOOP_MAJOR;
				1900	disk->first_minor = i << part_shift;
				1901	disk->fops = &lo_fops;
				1902	disk->private_data = lo;
				1903	disk->queue = lo->lo_queue;
				1904	sprintf(disk->disk_name, "loop%d", i);
				1905	add_disk(disk);
				1906	*l = lo;
				1907	return lo->lo_number;
				1908
				1909	out_free_queue:
				1910	blk_cleanup_queue(lo->lo_queue);
				1911	out_cleanup_tags:
				1912	blk_mq_free_tag_set(&lo->tag_set);
				1913	out_free_idr:
				1914	idr_remove(&loop_index_idr, i);
				1915	out_free_dev:
				1916	kfree(lo);
				1917	out:
				1918	return err;
				1919	}
				1920
				1921	static void loop_remove(struct loop_device *lo)
				1922	{
				1923	blk_cleanup_queue(lo->lo_queue);
				1924	del_gendisk(lo->lo_disk);
				1925	blk_mq_free_tag_set(&lo->tag_set);
				1926	put_disk(lo->lo_disk);
				1927	kfree(lo);
				1928	}
				1929
				1930	static int find_free_cb(int id, void ptr, void data)
				1931	{
				1932	struct loop_device *lo = ptr;
				1933	struct loop_device **l = data;
				1934
				1935	if (lo->lo_state == Lo_unbound) {
				1936	*l = lo;
				1937	return 1;
				1938	}
				1939	return 0;
				1940	}
				1941
				1942	static int loop_lookup(struct loop_device **l, int i)
				1943	{
				1944	struct loop_device *lo;
				1945	int ret = -ENODEV;
				1946
				1947	if (i < 0) {
				1948	int err;
				1949
				1950	err = idr_for_each(&loop_index_idr, &find_free_cb, &lo);
				1951	if (err == 1) {
				1952	*l = lo;
				1953	ret = lo->lo_number;
				1954	}
				1955	goto out;
				1956	}
				1957
				1958	/* lookup and return a specific i */
				1959	lo = idr_find(&loop_index_idr, i);
				1960	if (lo) {
				1961	*l = lo;
				1962	ret = lo->lo_number;
				1963	}
				1964	out:
				1965	return ret;
				1966	}
				1967
				1968	static struct kobject loop_probe(dev_t dev, int part, void *data)
				1969	{
				1970	struct loop_device *lo;
				1971	struct kobject *kobj;
				1972	int err;
				1973
				1974	mutex_lock(&loop_index_mutex);
				1975	err = loop_lookup(&lo, MINOR(dev) >> part_shift);
				1976	if (err < 0)
				1977	err = loop_add(&lo, MINOR(dev) >> part_shift);
				1978	if (err < 0)
				1979	kobj = NULL;
				1980	else
				1981	kobj = get_disk(lo->lo_disk);
				1982	mutex_unlock(&loop_index_mutex);
				1983
				1984	*part = 0;
				1985	return kobj;
				1986	}
				1987
				1988	static long loop_control_ioctl(struct file *file, unsigned int cmd,
				1989	unsigned long parm)
				1990	{
				1991	struct loop_device *lo;
				1992	int ret = -ENOSYS;
				1993
				1994	mutex_lock(&loop_index_mutex);
				1995	switch (cmd) {
				1996	case LOOP_CTL_ADD:
				1997	ret = loop_lookup(&lo, parm);
				1998	if (ret >= 0) {
				1999	ret = -EEXIST;
				2000	break;
				2001	}
				2002	ret = loop_add(&lo, parm);
				2003	break;
				2004	case LOOP_CTL_REMOVE:
				2005	ret = loop_lookup(&lo, parm);
				2006	if (ret < 0)
				2007	break;
				2008	mutex_lock(&lo->lo_ctl_mutex);
				2009	if (lo->lo_state != Lo_unbound) {
				2010	ret = -EBUSY;
				2011	mutex_unlock(&lo->lo_ctl_mutex);
				2012	break;
				2013	}
				2014	if (atomic_read(&lo->lo_refcnt) > 0) {
				2015	ret = -EBUSY;
				2016	mutex_unlock(&lo->lo_ctl_mutex);
				2017	break;
				2018	}
				2019	lo->lo_disk->private_data = NULL;
				2020	mutex_unlock(&lo->lo_ctl_mutex);
				2021	idr_remove(&loop_index_idr, lo->lo_number);
				2022	loop_remove(lo);
				2023	break;
				2024	case LOOP_CTL_GET_FREE:
				2025	ret = loop_lookup(&lo, -1);
				2026	if (ret >= 0)
				2027	break;
				2028	ret = loop_add(&lo, -1);
				2029	}
				2030	mutex_unlock(&loop_index_mutex);
				2031
				2032	return ret;
				2033	}
				2034
				2035	static const struct file_operations loop_ctl_fops = {
				2036	.open = nonseekable_open,
				2037	.unlocked_ioctl = loop_control_ioctl,
				2038	.compat_ioctl = loop_control_ioctl,
				2039	.owner = THIS_MODULE,
				2040	.llseek = noop_llseek,
				2041	};
				2042
				2043	static struct miscdevice loop_misc = {
				2044	.minor = LOOP_CTRL_MINOR,
				2045	.name = "loop-control",
				2046	.fops = &loop_ctl_fops,
				2047	};
				2048
				2049	MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR);
				2050	MODULE_ALIAS("devname:loop-control");
				2051
				2052	static int __init loop_init(void)
				2053	{
				2054	int i, nr;
				2055	unsigned long range;
				2056	struct loop_device *lo;
				2057	int err;
				2058
				2059	part_shift = 0;
				2060	if (max_part > 0) {
				2061	part_shift = fls(max_part);
				2062
				2063	/*
				2064	* Adjust max_part according to part_shift as it is exported
				2065	* to user space so that user can decide correct minor number
				2066	* if [s]he want to create more devices.
				2067	*
				2068	* Note that -1 is required because partition 0 is reserved
				2069	* for the whole disk.
				2070	*/
				2071	max_part = (1UL << part_shift) - 1;
				2072	}
				2073
				2074	if ((1UL << part_shift) > DISK_MAX_PARTS) {
				2075	err = -EINVAL;
				2076	goto err_out;
				2077	}
				2078
				2079	if (max_loop > 1UL << (MINORBITS - part_shift)) {
				2080	err = -EINVAL;
				2081	goto err_out;
				2082	}
				2083
				2084	/*
				2085	* If max_loop is specified, create that many devices upfront.
				2086	* This also becomes a hard limit. If max_loop is not specified,
				2087	* create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
				2088	* init time. Loop devices can be requested on-demand with the
				2089	* /dev/loop-control interface, or be instantiated by accessing
				2090	* a 'dead' device node.
				2091	*/
				2092	if (max_loop) {
				2093	nr = max_loop;
				2094	range = max_loop << part_shift;
				2095	} else {
				2096	nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
				2097	range = 1UL << MINORBITS;
				2098	}
				2099
				2100	err = misc_register(&loop_misc);
				2101	if (err < 0)
				2102	goto err_out;
				2103
				2104
				2105	if (register_blkdev(LOOP_MAJOR, "loop")) {
				2106	err = -EIO;
				2107	goto misc_out;
				2108	}
				2109
				2110	blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
				2111	THIS_MODULE, loop_probe, NULL, NULL);
				2112
				2113	/* pre-create number of devices given by config or max_loop */
				2114	mutex_lock(&loop_index_mutex);
				2115	for (i = 0; i < nr; i++)
				2116	loop_add(&lo, i);
				2117	mutex_unlock(&loop_index_mutex);
				2118
				2119	printk(KERN_INFO "loop: module loaded\n");
				2120	return 0;
				2121
				2122	misc_out:
				2123	misc_deregister(&loop_misc);
				2124	err_out:
				2125	return err;
				2126	}
				2127
				/*
				 * idr_for_each() callback used by loop_exit(): removes one loop device.
				 *
				 * @id:   idr index of the entry (unused)
				 * @ptr:  the struct loop_device stored at that index
				 * @data: unused (loop_exit() passes NULL)
				 *
				 * Always returns 0.
				 */
				static int loop_exit_cb(int id, void *ptr, void *data)
				{
					struct loop_device *lo = ptr;

					loop_remove(lo);
					return 0;
				}
				2135
				2136	static void __exit loop_exit(void)
				2137	{
				2138	unsigned long range;
				2139
				2140	range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;
				2141
				2142	idr_for_each(&loop_index_idr, &loop_exit_cb, NULL);
				2143	idr_destroy(&loop_index_idr);
				2144
				2145	blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
				2146	unregister_blkdev(LOOP_MAJOR, "loop");
				2147
				2148	misc_deregister(&loop_misc);
				2149	}
				2150
				2151	module_init(loop_init);
				2152	module_exit(loop_exit);
				2153
				2154	#ifndef MODULE
				2155	static int __init max_loop_setup(char *str)
				2156	{
				2157	max_loop = simple_strtol(str, NULL, 0);
				2158	return 1;
				2159	}
				2160
				2161	__setup("max_loop=", max_loop_setup);
				2162	#endif